PereLluis13 committed
Commit 913f6f3
1 Parent(s): bc465bc

Model save

README.md ADDED
@@ -0,0 +1,68 @@
+ ---
+ library_name: transformers
+ license: apache-2.0
+ base_model: HuggingFaceTB/SmolLM-360M-Instruct
+ tags:
+ - trl
+ - sft
+ - generated_from_trainer
+ datasets:
+ - generator
+ model-index:
+ - name: smollm-360M-aliases-5-plus
+   results: []
+ ---
+
+ <!-- This model card has been generated automatically according to the information the Trainer had access to. You
+ should probably proofread and complete it, then remove this comment. -->
+
+ # smollm-360M-aliases-5-plus
+
+ This model is a fine-tuned version of [HuggingFaceTB/SmolLM-360M-Instruct](https://huggingface.co/HuggingFaceTB/SmolLM-360M-Instruct) on the generator dataset.
+ It achieves the following results on the evaluation set:
+ - Loss: 2.2459
+
+ ## Model description
+
+ More information needed
+
+ ## Intended uses & limitations
+
+ More information needed
+
+ ## Training and evaluation data
+
+ More information needed
+
+ ## Training procedure
+
+ ### Training hyperparameters
+
+ The following hyperparameters were used during training:
+ - learning_rate: 0.001
+ - train_batch_size: 4
+ - eval_batch_size: 4
+ - seed: 42
+ - distributed_type: multi-GPU
+ - num_devices: 2
+ - gradient_accumulation_steps: 4
+ - total_train_batch_size: 32
+ - total_eval_batch_size: 8
+ - optimizer: Adam with betas=(0.9,0.999) and epsilon=1e-08
+ - lr_scheduler_type: cosine
+ - lr_scheduler_warmup_ratio: 0.1
+ - num_epochs: 1
+
+ ### Training results
+
+ | Training Loss | Epoch | Step | Validation Loss |
+ |:-------------:|:------:|:----:|:---------------:|
+ | 0.956 | 0.9997 | 1576 | 2.2459 |
+
+
+ ### Framework versions
+
+ - Transformers 4.44.2
+ - Pytorch 2.4.0+cu121
+ - Datasets 2.21.0
+ - Tokenizers 0.19.1
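
The auto-generated card above leaves usage unspecified, so here is a minimal, hedged sketch of loading this checkpoint and chatting with it; the repo id `PereLluis13/smollm-360M-aliases-5-plus` is an assumption inferred from the committer and model name, and the prompt is illustrative only.

```python
# Hypothetical usage sketch; the repo id is assumed, not confirmed by this commit.
from transformers import AutoModelForCausalLM, AutoTokenizer

repo = "PereLluis13/smollm-360M-aliases-5-plus"  # assumed repo id
tokenizer = AutoTokenizer.from_pretrained(repo)
model = AutoModelForCausalLM.from_pretrained(repo)

messages = [{"role": "user", "content": "What is Bob Dylan also known as?"}]
input_ids = tokenizer.apply_chat_template(
    messages, add_generation_prompt=True, return_tensors="pt"
)
# max_new_tokens=40 mirrors the default in generation_config.json below.
output = model.generate(input_ids, max_new_tokens=40)
print(tokenizer.decode(output[0][input_ids.shape[-1]:], skip_special_tokens=True))
```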
all_results.json ADDED
@@ -0,0 +1,9 @@
+ {
+   "epoch": 0.9996828417380272,
+   "total_flos": 38663670988800.0,
+   "train_loss": 1.4483577938854393,
+   "train_runtime": 3409.163,
+   "train_samples": 750000,
+   "train_samples_per_second": 14.797,
+   "train_steps_per_second": 0.462
+ }
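
A quick sanity check on these numbers (field meanings assumed from the Hugging Face Trainer's usual conventions): the sequences actually consumed in one epoch follow from the global step count and total batch size, and they match the reported throughput; "train_samples" is far larger, so it presumably counts raw examples before the generator dataset packs them into 2048-token sequences.

```python
# Sketch: consistency check of the training metrics above.
steps, total_batch = 1576, 32     # global_step and total_train_batch_size
print(steps * total_batch)        # 50432 packed sequences seen in one epoch
print(round(3409.163 * 14.797))   # ~50445 = train_runtime * samples/sec, consistent
# "train_samples": 750000 >> 50432, so it is presumably the raw example count
# before packing (an inference, not stated in the files).
```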
config.json ADDED
@@ -0,0 +1,30 @@
+ {
+   "_name_or_path": "HuggingFaceTB/SmolLM-360M-Instruct",
+   "architectures": [
+     "LlamaForCausalLM"
+   ],
+   "attention_bias": false,
+   "attention_dropout": 0.0,
+   "bos_token_id": 1,
+   "eos_token_id": 2,
+   "hidden_act": "silu",
+   "hidden_size": 960,
+   "initializer_range": 0.02,
+   "intermediate_size": 2560,
+   "max_position_embeddings": 2048,
+   "mlp_bias": false,
+   "model_type": "llama",
+   "num_attention_heads": 15,
+   "num_hidden_layers": 32,
+   "num_key_value_heads": 5,
+   "pad_token_id": 2,
+   "pretraining_tp": 1,
+   "rms_norm_eps": 1e-05,
+   "rope_scaling": null,
+   "rope_theta": 10000.0,
+   "tie_word_embeddings": true,
+   "torch_dtype": "bfloat16",
+   "transformers_version": "4.44.2",
+   "use_cache": false,
+   "vocab_size": 49152
+ }
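
The attention settings above imply grouped-query attention; a small sketch making the derived shapes explicit (the values are copied from config.json, the arithmetic is the only addition):

```python
# Sketch: derived attention shapes for this config.
from transformers import LlamaConfig

cfg = LlamaConfig(
    hidden_size=960,
    intermediate_size=2560,
    num_hidden_layers=32,
    num_attention_heads=15,
    num_key_value_heads=5,   # grouped-query attention
    vocab_size=49152,
    max_position_embeddings=2048,
    tie_word_embeddings=True,
)
print(cfg.hidden_size // cfg.num_attention_heads)          # head_dim = 64
print(cfg.num_attention_heads // cfg.num_key_value_heads)  # 3 query heads per KV head
```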
generation_config.json ADDED
@@ -0,0 +1,8 @@
+ {
+   "_from_model_config": true,
+   "bos_token_id": 1,
+   "eos_token_id": 2,
+   "max_new_tokens": 40,
+   "pad_token_id": 2,
+   "transformers_version": "4.44.2"
+ }
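
For reference, a hedged sketch of what these defaults mean in practice: generation stops after at most 40 new tokens or at token id 2 (<|im_end|>, which doubles as the pad token).

```python
# Sketch: the generation defaults above, rebuilt as a GenerationConfig object.
from transformers import GenerationConfig

gen_cfg = GenerationConfig(
    bos_token_id=1,
    eos_token_id=2,     # <|im_end|> per the tokenizer files below
    pad_token_id=2,
    max_new_tokens=40,  # hard cap on newly generated tokens
)
print(gen_cfg)
```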
merges.txt ADDED
The diff for this file is too large to render. See raw diff
 
model.safetensors ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:74c45941ffa5c383e215ed07c0e441952403629fb3f16bdd5359f0703fea7c2f
+ size 723674912
runs/Aug27_09-54-50_GPU-4090s7/events.out.tfevents.1724745293.GPU-4090s7.3323851.0 ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:cbedf05e00632c8e5bddfd447ad8e72f111b8667f5bee993580e479ea3ffdea8
+ size 72668
special_tokens_map.json ADDED
@@ -0,0 +1,28 @@
+ {
+   "additional_special_tokens": [
+     {
+       "content": "<|im_start|>",
+       "lstrip": false,
+       "normalized": false,
+       "rstrip": false,
+       "single_word": false
+     },
+     {
+       "content": "<|im_end|>",
+       "lstrip": false,
+       "normalized": false,
+       "rstrip": false,
+       "single_word": false
+     }
+   ],
+   "bos_token": "<|im_start|>",
+   "eos_token": "<|im_end|>",
+   "pad_token": "<|im_end|>",
+   "unk_token": {
+     "content": "<|endoftext|>",
+     "lstrip": false,
+     "normalized": false,
+     "rstrip": false,
+     "single_word": false
+   }
+ }
tokenizer.json ADDED
The diff for this file is too large to render. See raw diff
 
tokenizer_config.json ADDED
@@ -0,0 +1,154 @@
+ {
+   "add_prefix_space": false,
+   "added_tokens_decoder": {
+     "0": {
+       "content": "<|endoftext|>",
+       "lstrip": false,
+       "normalized": false,
+       "rstrip": false,
+       "single_word": false,
+       "special": true
+     },
+     "1": {
+       "content": "<|im_start|>",
+       "lstrip": false,
+       "normalized": false,
+       "rstrip": false,
+       "single_word": false,
+       "special": true
+     },
+     "2": {
+       "content": "<|im_end|>",
+       "lstrip": false,
+       "normalized": false,
+       "rstrip": false,
+       "single_word": false,
+       "special": true
+     },
+     "3": {
+       "content": "<repo_name>",
+       "lstrip": false,
+       "normalized": false,
+       "rstrip": false,
+       "single_word": false,
+       "special": true
+     },
+     "4": {
+       "content": "<reponame>",
+       "lstrip": false,
+       "normalized": false,
+       "rstrip": false,
+       "single_word": false,
+       "special": true
+     },
+     "5": {
+       "content": "<file_sep>",
+       "lstrip": false,
+       "normalized": false,
+       "rstrip": false,
+       "single_word": false,
+       "special": true
+     },
+     "6": {
+       "content": "<filename>",
+       "lstrip": false,
+       "normalized": false,
+       "rstrip": false,
+       "single_word": false,
+       "special": true
+     },
+     "7": {
+       "content": "<gh_stars>",
+       "lstrip": false,
+       "normalized": false,
+       "rstrip": false,
+       "single_word": false,
+       "special": true
+     },
+     "8": {
+       "content": "<issue_start>",
+       "lstrip": false,
+       "normalized": false,
+       "rstrip": false,
+       "single_word": false,
+       "special": true
+     },
+     "9": {
+       "content": "<issue_comment>",
+       "lstrip": false,
+       "normalized": false,
+       "rstrip": false,
+       "single_word": false,
+       "special": true
+     },
+     "10": {
+       "content": "<issue_closed>",
+       "lstrip": false,
+       "normalized": false,
+       "rstrip": false,
+       "single_word": false,
+       "special": true
+     },
+     "11": {
+       "content": "<jupyter_start>",
+       "lstrip": false,
+       "normalized": false,
+       "rstrip": false,
+       "single_word": false,
+       "special": true
+     },
+     "12": {
+       "content": "<jupyter_text>",
+       "lstrip": false,
+       "normalized": false,
+       "rstrip": false,
+       "single_word": false,
+       "special": true
+     },
+     "13": {
+       "content": "<jupyter_code>",
+       "lstrip": false,
+       "normalized": false,
+       "rstrip": false,
+       "single_word": false,
+       "special": true
+     },
+     "14": {
+       "content": "<jupyter_output>",
+       "lstrip": false,
+       "normalized": false,
+       "rstrip": false,
+       "single_word": false,
+       "special": true
+     },
+     "15": {
+       "content": "<jupyter_script>",
+       "lstrip": false,
+       "normalized": false,
+       "rstrip": false,
+       "single_word": false,
+       "special": true
+     },
+     "16": {
+       "content": "<empty_output>",
+       "lstrip": false,
+       "normalized": false,
+       "rstrip": false,
+       "single_word": false,
+       "special": true
+     }
+   },
+   "additional_special_tokens": [
+     "<|im_start|>",
+     "<|im_end|>"
+   ],
+   "bos_token": "<|im_start|>",
+   "chat_template": "{% for message in messages %}{{'<|im_start|>' + message['role'] + '\n' + message['content'] + '<|im_end|>' + '\n'}}{% endfor %}{% if add_generation_prompt %}{{ '<|im_start|>assistant\n' }}{% endif %}",
+   "clean_up_tokenization_spaces": false,
+   "eos_token": "<|im_end|>",
+   "model_max_length": 2048,
+   "pad_token": "<|im_end|>",
+   "tokenizer_class": "GPT2Tokenizer",
+   "unk_token": "<|endoftext|>",
+   "vocab_size": 49152
+ }
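
The chat_template above is ChatML-style; a self-contained sketch of what it expands to, rendered with plain jinja2 (the template string is copied verbatim from the file):

```python
# Sketch: rendering the chat_template above without hub access.
from jinja2 import Template

chat_template = (
    "{% for message in messages %}"
    "{{'<|im_start|>' + message['role'] + '\n' + message['content'] + '<|im_end|>' + '\n'}}"
    "{% endfor %}"
    "{% if add_generation_prompt %}{{ '<|im_start|>assistant\n' }}{% endif %}"
)
print(Template(chat_template).render(
    messages=[{"role": "user", "content": "Hello"}],
    add_generation_prompt=True,
))
# <|im_start|>user
# Hello<|im_end|>
# <|im_start|>assistant
```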
train_results.json ADDED
@@ -0,0 +1,9 @@
+ {
+   "epoch": 0.9996828417380272,
+   "total_flos": 38663670988800.0,
+   "train_loss": 1.4483577938854393,
+   "train_runtime": 3409.163,
+   "train_samples": 750000,
+   "train_samples_per_second": 14.797,
+   "train_steps_per_second": 0.462
+ }
trainer_state.json ADDED
@@ -0,0 +1,2262 @@
+ {
+   "best_metric": null,
+   "best_model_checkpoint": null,
+   "epoch": 0.9996828417380272,
+   "eval_steps": 500,
+   "global_step": 1576,
+   "is_hyper_param_search": false,
+   "is_local_process_zero": true,
+   "is_world_process_zero": true,
+   "log_history": [
+     {
+       "epoch": 0.0006343165239454488,
+       "grad_norm": 4.072216572643748,
+       "learning_rate": 6.329113924050633e-06,
+       "loss": 3.2618,
+       "step": 1
+     },
+     {
+       "epoch": 0.003171582619727244,
+       "grad_norm": 3.6569951400637444,
+       "learning_rate": 3.1645569620253167e-05,
+       "loss": 3.3026,
+       "step": 5
+     },
+     {
+       "epoch": 0.006343165239454488,
+       "grad_norm": 2.7101736291338634,
+       "learning_rate": 6.329113924050633e-05,
+       "loss": 3.1879,
+       "step": 10
+     },
+     {
+       "epoch": 0.009514747859181731,
+       "grad_norm": 0.8292602783930105,
+       "learning_rate": 9.49367088607595e-05,
+       "loss": 2.9431,
+       "step": 15
+     },
+     {
+       "epoch": 0.012686330478908976,
+       "grad_norm": 0.4014812685185947,
+       "learning_rate": 0.00012658227848101267,
+       "loss": 2.7946,
+       "step": 20
+     },
+     {
+       "epoch": 0.01585791309863622,
+       "grad_norm": 0.3318705203128404,
+       "learning_rate": 0.00015822784810126583,
+       "loss": 2.6345,
+       "step": 25
+     },
+     {
+       "epoch": 0.019029495718363463,
+       "grad_norm": 0.4987722149702972,
+       "learning_rate": 0.000189873417721519,
+       "loss": 2.547,
+       "step": 30
+     },
+     {
+       "epoch": 0.022201078338090707,
+       "grad_norm": 0.3868479554032603,
+       "learning_rate": 0.00022151898734177215,
+       "loss": 2.462,
+       "step": 35
+     },
+     {
+       "epoch": 0.02537266095781795,
+       "grad_norm": 0.3440357488186946,
+       "learning_rate": 0.00025316455696202533,
+       "loss": 2.4119,
+       "step": 40
+     },
+     {
+       "epoch": 0.028544243577545196,
+       "grad_norm": 0.2542091158949379,
+       "learning_rate": 0.0002848101265822785,
+       "loss": 2.3337,
+       "step": 45
+     },
+     {
+       "epoch": 0.03171582619727244,
+       "grad_norm": 0.5028112177822042,
+       "learning_rate": 0.00031645569620253165,
+       "loss": 2.3079,
+       "step": 50
+     },
+     {
+       "epoch": 0.034887408816999685,
+       "grad_norm": 0.3647071858886548,
+       "learning_rate": 0.00034810126582278487,
+       "loss": 2.2772,
+       "step": 55
+     },
+     {
+       "epoch": 0.038058991436726926,
+       "grad_norm": 0.3365279113242041,
+       "learning_rate": 0.000379746835443038,
+       "loss": 2.2643,
+       "step": 60
+     },
+     {
+       "epoch": 0.041230574056454174,
+       "grad_norm": 0.3437920074360608,
+       "learning_rate": 0.0004113924050632912,
+       "loss": 2.2467,
+       "step": 65
+     },
+     {
+       "epoch": 0.044402156676181415,
+       "grad_norm": 0.2180989181512006,
+       "learning_rate": 0.0004430379746835443,
+       "loss": 2.2192,
+       "step": 70
+     },
+     {
+       "epoch": 0.047573739295908656,
+       "grad_norm": 0.3740832044447792,
+       "learning_rate": 0.00047468354430379745,
+       "loss": 2.2202,
+       "step": 75
+     },
+     {
+       "epoch": 0.0507453219156359,
+       "grad_norm": 0.4042428788012064,
+       "learning_rate": 0.0005063291139240507,
+       "loss": 2.206,
+       "step": 80
+     },
+     {
+       "epoch": 0.053916904535363144,
+       "grad_norm": 0.5502810429404877,
+       "learning_rate": 0.0005379746835443038,
+       "loss": 2.1792,
+       "step": 85
+     },
+     {
+       "epoch": 0.05708848715509039,
+       "grad_norm": 0.9725610364599878,
+       "learning_rate": 0.000569620253164557,
+       "loss": 2.1717,
+       "step": 90
+     },
+     {
+       "epoch": 0.06026006977481763,
+       "grad_norm": 0.4008236462318082,
+       "learning_rate": 0.0006012658227848101,
+       "loss": 2.1581,
+       "step": 95
+     },
+     {
+       "epoch": 0.06343165239454487,
+       "grad_norm": 1.1094243654374898,
+       "learning_rate": 0.0006329113924050633,
+       "loss": 2.1497,
+       "step": 100
+     },
+     {
+       "epoch": 0.06660323501427212,
+       "grad_norm": 0.257873202758346,
+       "learning_rate": 0.0006645569620253165,
+       "loss": 2.1357,
+       "step": 105
+     },
+     {
+       "epoch": 0.06977481763399937,
+       "grad_norm": 0.4530053303085577,
+       "learning_rate": 0.0006962025316455697,
+       "loss": 2.1319,
+       "step": 110
+     },
+     {
+       "epoch": 0.0729464002537266,
+       "grad_norm": 0.237617906265262,
+       "learning_rate": 0.0007278481012658228,
+       "loss": 2.114,
+       "step": 115
+     },
+     {
+       "epoch": 0.07611798287345385,
+       "grad_norm": 1.309988153168323,
+       "learning_rate": 0.000759493670886076,
+       "loss": 2.101,
+       "step": 120
+     },
+     {
+       "epoch": 0.0792895654931811,
+       "grad_norm": 0.33147972360135136,
+       "learning_rate": 0.0007911392405063291,
+       "loss": 2.0983,
+       "step": 125
+     },
+     {
+       "epoch": 0.08246114811290835,
+       "grad_norm": 0.6923953247791184,
+       "learning_rate": 0.0008227848101265824,
+       "loss": 2.0775,
+       "step": 130
+     },
+     {
+       "epoch": 0.08563273073263558,
+       "grad_norm": 0.6108397955993198,
+       "learning_rate": 0.0008544303797468354,
+       "loss": 2.076,
+       "step": 135
+     },
+     {
+       "epoch": 0.08880431335236283,
+       "grad_norm": 0.25451367249745316,
+       "learning_rate": 0.0008860759493670886,
+       "loss": 2.0613,
+       "step": 140
+     },
+     {
+       "epoch": 0.09197589597209008,
+       "grad_norm": 0.5696237036165042,
+       "learning_rate": 0.0009177215189873418,
+       "loss": 2.0571,
+       "step": 145
+     },
+     {
+       "epoch": 0.09514747859181731,
+       "grad_norm": 0.47600183429795934,
+       "learning_rate": 0.0009493670886075949,
+       "loss": 2.0597,
+       "step": 150
+     },
+     {
+       "epoch": 0.09831906121154456,
+       "grad_norm": 0.49360558045014563,
+       "learning_rate": 0.0009810126582278482,
+       "loss": 2.0276,
+       "step": 155
+     },
+     {
+       "epoch": 0.1014906438312718,
+       "grad_norm": 0.2619769061768294,
+       "learning_rate": 0.0009999950915251159,
+       "loss": 2.049,
+       "step": 160
+     },
+     {
+       "epoch": 0.10466222645099905,
+       "grad_norm": 0.25933552218187667,
+       "learning_rate": 0.0009999398722894419,
+       "loss": 2.0304,
+       "step": 165
+     },
+     {
+       "epoch": 0.10783380907072629,
+       "grad_norm": 0.334897031747453,
+       "learning_rate": 0.0009998233050230736,
+       "loss": 2.0144,
+       "step": 170
+     },
+     {
+       "epoch": 0.11100539169045354,
+       "grad_norm": 0.31275160877263003,
+       "learning_rate": 0.0009996454040300758,
+       "loss": 1.9773,
+       "step": 175
+     },
+     {
+       "epoch": 0.11417697431018078,
+       "grad_norm": 0.5307747748580993,
+       "learning_rate": 0.0009994061911408245,
+       "loss": 1.9863,
+       "step": 180
+     },
+     {
+       "epoch": 0.11734855692990802,
+       "grad_norm": 0.4066591897491239,
+       "learning_rate": 0.0009991056957093295,
+       "loss": 1.9812,
+       "step": 185
+     },
+     {
+       "epoch": 0.12052013954963527,
+       "grad_norm": 0.40898061678871916,
+       "learning_rate": 0.0009987439546096308,
+       "loss": 1.9983,
+       "step": 190
+     },
+     {
+       "epoch": 0.12369172216936251,
+       "grad_norm": 0.6281156992876311,
+       "learning_rate": 0.0009983210122312745,
+       "loss": 1.9663,
+       "step": 195
+     },
+     {
+       "epoch": 0.12686330478908975,
+       "grad_norm": 0.418957878268899,
+       "learning_rate": 0.000997836920473866,
+       "loss": 1.9443,
+       "step": 200
+     },
+     {
+       "epoch": 0.130034887408817,
+       "grad_norm": 0.2967784254205414,
+       "learning_rate": 0.000997291738740701,
+       "loss": 1.9496,
+       "step": 205
+     },
+     {
+       "epoch": 0.13320647002854424,
+       "grad_norm": 0.32659030796363414,
+       "learning_rate": 0.0009966855339314756,
+       "loss": 1.9394,
+       "step": 210
+     },
+     {
+       "epoch": 0.1363780526482715,
+       "grad_norm": 0.44336971944165454,
+       "learning_rate": 0.0009960183804340781,
+       "loss": 1.9274,
+       "step": 215
+     },
+     {
+       "epoch": 0.13954963526799874,
+       "grad_norm": 0.7513841493295751,
+       "learning_rate": 0.0009952903601154596,
+       "loss": 1.937,
+       "step": 220
+     },
+     {
+       "epoch": 0.142721217887726,
+       "grad_norm": 0.8840919710762163,
+       "learning_rate": 0.0009945015623115897,
+       "loss": 1.9222,
+       "step": 225
+     },
+     {
+       "epoch": 0.1458928005074532,
+       "grad_norm": 0.3088103069995084,
+       "learning_rate": 0.000993652083816491,
+       "loss": 1.9272,
+       "step": 230
+     },
+     {
+       "epoch": 0.14906438312718046,
+       "grad_norm": 0.3205509173648325,
+       "learning_rate": 0.0009927420288703658,
+       "loss": 1.9282,
+       "step": 235
+     },
+     {
+       "epoch": 0.1522359657469077,
+       "grad_norm": 0.5725328480987164,
+       "learning_rate": 0.0009917715091467998,
+       "loss": 1.9092,
+       "step": 240
+     },
+     {
+       "epoch": 0.15540754836663495,
+       "grad_norm": 0.6881811737021382,
+       "learning_rate": 0.000990740643739063,
+       "loss": 1.9257,
+       "step": 245
+     },
+     {
+       "epoch": 0.1585791309863622,
+       "grad_norm": 0.3928160747689014,
+       "learning_rate": 0.000989649559145493,
+       "loss": 1.9075,
+       "step": 250
+     },
+     {
+       "epoch": 0.16175071360608945,
+       "grad_norm": 0.3457762887590973,
+       "learning_rate": 0.000988498389253972,
+       "loss": 1.8954,
+       "step": 255
+     },
+     {
+       "epoch": 0.1649222962258167,
+       "grad_norm": 0.6130106535523941,
+       "learning_rate": 0.0009872872753254995,
+       "loss": 1.8869,
+       "step": 260
+     },
+     {
+       "epoch": 0.16809387884554391,
+       "grad_norm": 0.2006935276789736,
+       "learning_rate": 0.0009860163659768566,
+       "loss": 1.8764,
+       "step": 265
+     },
+     {
+       "epoch": 0.17126546146527116,
+       "grad_norm": 0.2519600708036085,
+       "learning_rate": 0.0009846858171623687,
+       "loss": 1.8592,
+       "step": 270
+     },
+     {
+       "epoch": 0.1744370440849984,
+       "grad_norm": 0.21430434382675823,
+       "learning_rate": 0.0009832957921547696,
+       "loss": 1.8588,
+       "step": 275
+     },
+     {
+       "epoch": 0.17760862670472566,
+       "grad_norm": 0.7316176065198735,
+       "learning_rate": 0.000981846461525165,
+       "loss": 1.8442,
+       "step": 280
+     },
+     {
+       "epoch": 0.1807802093244529,
+       "grad_norm": 0.5438158046657656,
+       "learning_rate": 0.0009803380031221018,
+       "loss": 1.8681,
+       "step": 285
+     },
+     {
+       "epoch": 0.18395179194418015,
+       "grad_norm": 0.22290789589006946,
+       "learning_rate": 0.000978770602049745,
+       "loss": 1.8342,
+       "step": 290
+     },
+     {
+       "epoch": 0.1871233745639074,
+       "grad_norm": 0.2561355818352734,
+       "learning_rate": 0.0009771444506451621,
+       "loss": 1.8408,
+       "step": 295
+     },
+     {
+       "epoch": 0.19029495718363462,
+       "grad_norm": 0.3381776052738623,
+       "learning_rate": 0.0009754597484547223,
+       "loss": 1.829,
+       "step": 300
+     },
+     {
+       "epoch": 0.19346653980336187,
+       "grad_norm": 0.2267569653346989,
+       "learning_rate": 0.0009737167022096094,
+       "loss": 1.8283,
+       "step": 305
+     },
+     {
+       "epoch": 0.19663812242308912,
+       "grad_norm": 0.23165580428938548,
+       "learning_rate": 0.0009719155258004541,
+       "loss": 1.8071,
+       "step": 310
+     },
+     {
+       "epoch": 0.19980970504281637,
+       "grad_norm": 0.25586494282771377,
+       "learning_rate": 0.0009700564402510871,
+       "loss": 1.8145,
+       "step": 315
+     },
+     {
+       "epoch": 0.2029812876625436,
+       "grad_norm": 0.2540371949506308,
+       "learning_rate": 0.0009681396736914168,
+       "loss": 1.8015,
+       "step": 320
+     },
+     {
+       "epoch": 0.20615287028227086,
+       "grad_norm": 0.6388348558478815,
+       "learning_rate": 0.0009661654613294355,
+       "loss": 1.8127,
+       "step": 325
+     },
+     {
+       "epoch": 0.2093244529019981,
+       "grad_norm": 0.3864015903258655,
+       "learning_rate": 0.0009641340454223575,
+       "loss": 1.7935,
+       "step": 330
+     },
+     {
+       "epoch": 0.21249603552172533,
+       "grad_norm": 0.2751489116810319,
+       "learning_rate": 0.0009620456752468903,
+       "loss": 1.8058,
+       "step": 335
+     },
+     {
+       "epoch": 0.21566761814145258,
+       "grad_norm": 0.7422837235361598,
+       "learning_rate": 0.0009599006070686467,
+       "loss": 1.7927,
+       "step": 340
+     },
+     {
+       "epoch": 0.21883920076117983,
+       "grad_norm": 0.4534291854575538,
+       "learning_rate": 0.0009576991041106973,
+       "loss": 1.7927,
+       "step": 345
+     },
+     {
+       "epoch": 0.22201078338090707,
+       "grad_norm": 0.3583139746684532,
+       "learning_rate": 0.0009554414365212709,
+       "loss": 1.7883,
+       "step": 350
+     },
+     {
+       "epoch": 0.22518236600063432,
+       "grad_norm": 0.20634161577455162,
+       "learning_rate": 0.0009531278813406046,
+       "loss": 1.7637,
+       "step": 355
+     },
+     {
+       "epoch": 0.22835394862036157,
+       "grad_norm": 0.5462716024749192,
+       "learning_rate": 0.000950758722466947,
+       "loss": 1.7823,
+       "step": 360
+     },
+     {
+       "epoch": 0.23152553124008882,
+       "grad_norm": 0.20847302955466993,
+       "learning_rate": 0.0009483342506217214,
+       "loss": 1.7736,
+       "step": 365
+     },
+     {
+       "epoch": 0.23469711385981604,
+       "grad_norm": 0.21809684764751344,
+       "learning_rate": 0.0009458547633138515,
+       "loss": 1.7636,
+       "step": 370
+     },
+     {
+       "epoch": 0.23786869647954328,
+       "grad_norm": 0.19220401784144317,
+       "learning_rate": 0.0009433205648032528,
+       "loss": 1.7509,
+       "step": 375
+     },
+     {
+       "epoch": 0.24104027909927053,
+       "grad_norm": 0.273271874095809,
+       "learning_rate": 0.0009407319660634979,
+       "loss": 1.7488,
+       "step": 380
+     },
+     {
+       "epoch": 0.24421186171899778,
+       "grad_norm": 0.31458786826276625,
+       "learning_rate": 0.0009380892847436555,
+       "loss": 1.7342,
+       "step": 385
+     },
+     {
+       "epoch": 0.24738344433872503,
+       "grad_norm": 0.19789392284188642,
+       "learning_rate": 0.0009353928451293121,
+       "loss": 1.743,
+       "step": 390
+     },
+     {
+       "epoch": 0.2505550269584523,
+       "grad_norm": 0.24499888411428472,
+       "learning_rate": 0.0009326429781027789,
+       "loss": 1.7193,
+       "step": 395
+     },
+     {
+       "epoch": 0.2537266095781795,
+       "grad_norm": 0.33173879702411524,
+       "learning_rate": 0.0009298400211024877,
+       "loss": 1.729,
+       "step": 400
+     },
+     {
+       "epoch": 0.25689819219790677,
+       "grad_norm": 0.34879621136110506,
+       "learning_rate": 0.0009269843180815853,
+       "loss": 1.7241,
+       "step": 405
+     },
+     {
+       "epoch": 0.260069774817634,
+       "grad_norm": 0.20572899222991778,
+       "learning_rate": 0.0009240762194657253,
+       "loss": 1.7229,
+       "step": 410
+     },
+     {
+       "epoch": 0.26324135743736127,
+       "grad_norm": 0.21561079654661294,
+       "learning_rate": 0.0009211160821100679,
+       "loss": 1.7155,
+       "step": 415
+     },
+     {
+       "epoch": 0.2664129400570885,
+       "grad_norm": 0.443393553505543,
+       "learning_rate": 0.0009181042692554893,
+       "loss": 1.7111,
+       "step": 420
+     },
+     {
+       "epoch": 0.2695845226768157,
+       "grad_norm": 0.2266683853424827,
+       "learning_rate": 0.0009150411504840086,
+       "loss": 1.7009,
+       "step": 425
+     },
+     {
+       "epoch": 0.272756105296543,
+       "grad_norm": 0.3582735361142464,
+       "learning_rate": 0.000911927101673436,
+       "loss": 1.7016,
+       "step": 430
+     },
+     {
+       "epoch": 0.2759276879162702,
+       "grad_norm": 0.43342116834945776,
+       "learning_rate": 0.0009087625049512488,
+       "loss": 1.7037,
+       "step": 435
+     },
+     {
+       "epoch": 0.2790992705359975,
+       "grad_norm": 0.3295875571024751,
+       "learning_rate": 0.0009055477486476991,
+       "loss": 1.682,
+       "step": 440
+     },
+     {
+       "epoch": 0.2822708531557247,
+       "grad_norm": 0.1891978276034803,
+       "learning_rate": 0.0009022832272481627,
+       "loss": 1.6899,
+       "step": 445
+     },
+     {
+       "epoch": 0.285442435775452,
+       "grad_norm": 0.26615608448970285,
+       "learning_rate": 0.000898969341344731,
+       "loss": 1.6909,
+       "step": 450
+     },
+     {
+       "epoch": 0.2886140183951792,
+       "grad_norm": 0.26554406802462666,
+       "learning_rate": 0.0008956064975870544,
+       "loss": 1.6764,
+       "step": 455
+     },
+     {
+       "epoch": 0.2917856010149064,
+       "grad_norm": 0.20008546513645153,
+       "learning_rate": 0.0008921951086324411,
+       "loss": 1.6571,
+       "step": 460
+     },
+     {
+       "epoch": 0.2949571836346337,
+       "grad_norm": 0.25575390463894654,
+       "learning_rate": 0.0008887355930952202,
+       "loss": 1.6636,
+       "step": 465
+     },
+     {
+       "epoch": 0.2981287662543609,
+       "grad_norm": 0.3501161922386378,
+       "learning_rate": 0.0008852283754953732,
+       "loss": 1.657,
+       "step": 470
+     },
+     {
+       "epoch": 0.3013003488740882,
+       "grad_norm": 0.20707308621635875,
+       "learning_rate": 0.0008816738862064412,
+       "loss": 1.6659,
+       "step": 475
+     },
+     {
+       "epoch": 0.3044719314938154,
+       "grad_norm": 0.2572060719794171,
+       "learning_rate": 0.0008780725614027123,
+       "loss": 1.6521,
+       "step": 480
+     },
+     {
+       "epoch": 0.3076435141135427,
+       "grad_norm": 0.2773641851176988,
+       "learning_rate": 0.000874424843005699,
+       "loss": 1.6545,
+       "step": 485
+     },
+     {
+       "epoch": 0.3108150967332699,
+       "grad_norm": 0.5151669199508683,
+       "learning_rate": 0.0008707311786299099,
+       "loss": 1.6512,
+       "step": 490
+     },
+     {
+       "epoch": 0.3139866793529971,
+       "grad_norm": 0.35976330322294225,
+       "learning_rate": 0.0008669920215279222,
+       "loss": 1.6489,
+       "step": 495
+     },
+     {
+       "epoch": 0.3171582619727244,
+       "grad_norm": 0.18626964833018503,
+       "learning_rate": 0.0008632078305347623,
+       "loss": 1.6292,
+       "step": 500
+     },
+     {
+       "epoch": 0.3203298445924516,
+       "grad_norm": 0.26834931489718644,
+       "learning_rate": 0.0008593790700116029,
+       "loss": 1.6244,
+       "step": 505
+     },
+     {
+       "epoch": 0.3235014272121789,
+       "grad_norm": 0.24263398553664595,
+       "learning_rate": 0.0008555062097887796,
+       "loss": 1.6173,
+       "step": 510
+     },
+     {
+       "epoch": 0.3266730098319061,
+       "grad_norm": 0.20406088273168801,
+       "learning_rate": 0.0008515897251081384,
+       "loss": 1.6273,
+       "step": 515
+     },
+     {
+       "epoch": 0.3298445924516334,
+       "grad_norm": 0.18611145873310309,
+       "learning_rate": 0.0008476300965647186,
+       "loss": 1.5954,
+       "step": 520
+     },
+     {
+       "epoch": 0.3330161750713606,
+       "grad_norm": 0.2455194958653317,
+       "learning_rate": 0.0008436278100477775,
+       "loss": 1.6284,
+       "step": 525
+     },
+     {
+       "epoch": 0.33618775769108783,
+       "grad_norm": 0.3089123130431731,
+       "learning_rate": 0.0008395833566811676,
+       "loss": 1.6043,
+       "step": 530
+     },
+     {
+       "epoch": 0.3393593403108151,
+       "grad_norm": 0.278226103442385,
+       "learning_rate": 0.0008354972327630705,
+       "loss": 1.5991,
+       "step": 535
+     },
+     {
+       "epoch": 0.3425309229305423,
+       "grad_norm": 0.40283564646452896,
+       "learning_rate": 0.000831369939705094,
+       "loss": 1.5942,
+       "step": 540
+     },
+     {
+       "epoch": 0.3457025055502696,
+       "grad_norm": 0.667879925495927,
+       "learning_rate": 0.0008272019839707461,
+       "loss": 1.5968,
+       "step": 545
+     },
+     {
+       "epoch": 0.3488740881699968,
+       "grad_norm": 0.34787949832129605,
+       "learning_rate": 0.0008229938770132843,
+       "loss": 1.5815,
+       "step": 550
+     },
+     {
+       "epoch": 0.3520456707897241,
+       "grad_norm": 0.2584482438633063,
+       "learning_rate": 0.0008187461352129555,
+       "loss": 1.5884,
+       "step": 555
+     },
+     {
+       "epoch": 0.3552172534094513,
+       "grad_norm": 0.22742176340374293,
+       "learning_rate": 0.0008144592798136309,
+       "loss": 1.5919,
+       "step": 560
+     },
+     {
+       "epoch": 0.35838883602917854,
+       "grad_norm": 0.42255520990843093,
+       "learning_rate": 0.0008101338368588436,
+       "loss": 1.5913,
+       "step": 565
+     },
+     {
+       "epoch": 0.3615604186489058,
+       "grad_norm": 0.3824113474293145,
+       "learning_rate": 0.0008057703371272366,
+       "loss": 1.5611,
+       "step": 570
+     },
+     {
+       "epoch": 0.36473200126863303,
+       "grad_norm": 0.2091017723989841,
+       "learning_rate": 0.0008013693160674316,
+       "loss": 1.5626,
+       "step": 575
+     },
+     {
+       "epoch": 0.3679035838883603,
+       "grad_norm": 0.23814734243563393,
+       "learning_rate": 0.0007969313137323229,
+       "loss": 1.5656,
+       "step": 580
+     },
+     {
+       "epoch": 0.37107516650808753,
+       "grad_norm": 0.2597004458168679,
+       "learning_rate": 0.0007924568747128076,
+       "loss": 1.5624,
+       "step": 585
+     },
+     {
+       "epoch": 0.3742467491278148,
+       "grad_norm": 0.2949069544402481,
+       "learning_rate": 0.0007879465480709576,
+       "loss": 1.5516,
+       "step": 590
+     },
+     {
+       "epoch": 0.377418331747542,
+       "grad_norm": 0.21263382790898516,
+       "learning_rate": 0.0007834008872726453,
+       "loss": 1.5409,
+       "step": 595
+     },
+     {
+       "epoch": 0.38058991436726924,
+       "grad_norm": 0.27681275720229476,
+       "learning_rate": 0.0007788204501196254,
+       "loss": 1.5507,
+       "step": 600
+     },
+     {
+       "epoch": 0.3837614969869965,
+       "grad_norm": 0.5196324383707882,
+       "learning_rate": 0.000774205798681088,
+       "loss": 1.5435,
+       "step": 605
+     },
+     {
+       "epoch": 0.38693307960672374,
+       "grad_norm": 0.3397151418636398,
+       "learning_rate": 0.000769557499224686,
+       "loss": 1.5292,
+       "step": 610
+     },
+     {
+       "epoch": 0.390104662226451,
+       "grad_norm": 0.21757261564984298,
+       "learning_rate": 0.0007648761221470481,
+       "loss": 1.5342,
+       "step": 615
+     },
+     {
+       "epoch": 0.39327624484617824,
+       "grad_norm": 0.23799713493080946,
+       "learning_rate": 0.000760162241903785,
+       "loss": 1.5314,
+       "step": 620
+     },
+     {
+       "epoch": 0.3964478274659055,
+       "grad_norm": 0.20955913102047505,
+       "learning_rate": 0.0007554164369389975,
+       "loss": 1.5149,
+       "step": 625
+     },
+     {
+       "epoch": 0.39961941008563273,
+       "grad_norm": 0.19465193626198848,
+       "learning_rate": 0.0007506392896142951,
+       "loss": 1.514,
+       "step": 630
+     },
+     {
+       "epoch": 0.40279099270535995,
+       "grad_norm": 0.37370015455345407,
+       "learning_rate": 0.0007458313861373336,
+       "loss": 1.5138,
+       "step": 635
+     },
+     {
+       "epoch": 0.4059625753250872,
+       "grad_norm": 0.2112845859224254,
+       "learning_rate": 0.0007409933164898818,
+       "loss": 1.5024,
+       "step": 640
+     },
+     {
+       "epoch": 0.40913415794481445,
+       "grad_norm": 0.24626397881644146,
+       "learning_rate": 0.0007361256743554241,
+       "loss": 1.519,
+       "step": 645
+     },
+     {
+       "epoch": 0.4123057405645417,
+       "grad_norm": 0.3216374157044185,
+       "learning_rate": 0.0007312290570463083,
+       "loss": 1.5039,
+       "step": 650
+     },
+     {
+       "epoch": 0.41547732318426894,
+       "grad_norm": 0.22302629969432056,
+       "learning_rate": 0.0007263040654304502,
+       "loss": 1.494,
+       "step": 655
+     },
+     {
+       "epoch": 0.4186489058039962,
+       "grad_norm": 0.2675317557830398,
+       "learning_rate": 0.0007213513038575998,
+       "loss": 1.4884,
+       "step": 660
+     },
+     {
+       "epoch": 0.42182048842372344,
+       "grad_norm": 0.2905992631967741,
+       "learning_rate": 0.0007163713800851811,
+       "loss": 1.4851,
+       "step": 665
+     },
+     {
+       "epoch": 0.42499207104345066,
+       "grad_norm": 0.20033257450058217,
+       "learning_rate": 0.0007113649052037139,
+       "loss": 1.475,
+       "step": 670
+     },
+     {
+       "epoch": 0.42816365366317793,
+       "grad_norm": 0.24204591478150614,
+       "learning_rate": 0.0007063324935618264,
+       "loss": 1.4854,
+       "step": 675
+     },
+     {
+       "epoch": 0.43133523628290515,
+       "grad_norm": 0.2121430223132248,
+       "learning_rate": 0.0007012747626908679,
+       "loss": 1.4867,
+       "step": 680
+     },
+     {
+       "epoch": 0.43450681890263243,
+       "grad_norm": 0.22539040426730952,
+       "learning_rate": 0.0006961923332291309,
+       "loss": 1.467,
+       "step": 685
+     },
+     {
+       "epoch": 0.43767840152235965,
+       "grad_norm": 0.22695514383581045,
+       "learning_rate": 0.0006910858288456921,
+       "loss": 1.4657,
+       "step": 690
+     },
+     {
+       "epoch": 0.4408499841420869,
+       "grad_norm": 0.22184474318285244,
+       "learning_rate": 0.0006859558761638819,
+       "loss": 1.4423,
+       "step": 695
+     },
+     {
+       "epoch": 0.44402156676181415,
+       "grad_norm": 0.29575466467956124,
+       "learning_rate": 0.0006808031046843901,
+       "loss": 1.4485,
+       "step": 700
+     },
+     {
+       "epoch": 0.44719314938154137,
+       "grad_norm": 0.21488007270980458,
+       "learning_rate": 0.0006756281467080205,
+       "loss": 1.4508,
+       "step": 705
+     },
+     {
+       "epoch": 0.45036473200126864,
+       "grad_norm": 0.38667469007287536,
+       "learning_rate": 0.0006704316372580989,
+       "loss": 1.4459,
+       "step": 710
+     },
+     {
+       "epoch": 0.45353631462099586,
+       "grad_norm": 0.5234661249173684,
+       "learning_rate": 0.0006652142140025517,
+       "loss": 1.435,
+       "step": 715
+     },
+     {
+       "epoch": 0.45670789724072314,
+       "grad_norm": 0.37462414488518325,
+       "learning_rate": 0.0006599765171756538,
+       "loss": 1.4379,
+       "step": 720
+     },
+     {
+       "epoch": 0.45987947986045036,
+       "grad_norm": 0.3040640640559466,
+       "learning_rate": 0.0006547191894994679,
+       "loss": 1.4341,
+       "step": 725
+     },
+     {
+       "epoch": 0.46305106248017763,
+       "grad_norm": 0.28687145037107376,
+       "learning_rate": 0.0006494428761049736,
+       "loss": 1.4297,
+       "step": 730
+     },
+     {
+       "epoch": 0.46622264509990485,
+       "grad_norm": 0.20728566940658133,
+       "learning_rate": 0.0006441482244529037,
+       "loss": 1.4124,
+       "step": 735
+     },
+     {
+       "epoch": 0.4693942277196321,
+       "grad_norm": 0.21956352378213645,
+       "learning_rate": 0.0006388358842542938,
+       "loss": 1.4162,
+       "step": 740
+     },
+     {
+       "epoch": 0.47256581033935935,
+       "grad_norm": 0.20961137895482168,
+       "learning_rate": 0.0006335065073907551,
+       "loss": 1.4055,
+       "step": 745
+     },
+     {
+       "epoch": 0.47573739295908657,
+       "grad_norm": 0.21979161995613117,
+       "learning_rate": 0.0006281607478344823,
+       "loss": 1.4112,
+       "step": 750
+     },
+     {
+       "epoch": 0.47890897557881384,
+       "grad_norm": 0.2707420648256881,
+       "learning_rate": 0.0006227992615680033,
+       "loss": 1.4127,
+       "step": 755
+     },
+     {
+       "epoch": 0.48208055819854106,
+       "grad_norm": 0.2829526420993808,
+       "learning_rate": 0.000617422706503684,
+       "loss": 1.3905,
+       "step": 760
+     },
+     {
+       "epoch": 0.48525214081826834,
+       "grad_norm": 0.2988909342739172,
+       "learning_rate": 0.0006120317424029943,
+       "loss": 1.3941,
+       "step": 765
+     },
+     {
+       "epoch": 0.48842372343799556,
+       "grad_norm": 0.2787477024270155,
+       "learning_rate": 0.0006066270307955492,
+       "loss": 1.404,
+       "step": 770
+     },
+     {
+       "epoch": 0.4915953060577228,
+       "grad_norm": 0.22142539860110755,
+       "learning_rate": 0.000601209234897931,
+       "loss": 1.3886,
+       "step": 775
+     },
+     {
+       "epoch": 0.49476688867745006,
+       "grad_norm": 0.2777507592841434,
+       "learning_rate": 0.0005957790195323064,
+       "loss": 1.3896,
+       "step": 780
+     },
+     {
+       "epoch": 0.4979384712971773,
+       "grad_norm": 0.2552429702914411,
+       "learning_rate": 0.0005903370510448447,
+       "loss": 1.3779,
+       "step": 785
+     },
+     {
+       "epoch": 0.5011100539169046,
+       "grad_norm": 0.268362490404369,
+       "learning_rate": 0.0005848839972239511,
+       "loss": 1.3732,
+       "step": 790
+     },
+     {
+       "epoch": 0.5042816365366318,
+       "grad_norm": 0.24326700144489616,
+       "learning_rate": 0.0005794205272183205,
+       "loss": 1.3748,
+       "step": 795
+     },
+     {
+       "epoch": 0.507453219156359,
+       "grad_norm": 0.36345823855975784,
+       "learning_rate": 0.0005739473114548266,
+       "loss": 1.3755,
+       "step": 800
+     },
+     {
+       "epoch": 0.5106248017760863,
+       "grad_norm": 0.4843512449452226,
+       "learning_rate": 0.000568465021556253,
+       "loss": 1.3638,
+       "step": 805
+     },
+     {
+       "epoch": 0.5137963843958135,
+       "grad_norm": 0.28627979209509896,
+       "learning_rate": 0.0005629743302588779,
+       "loss": 1.3514,
+       "step": 810
+     },
+     {
+       "epoch": 0.5169679670155407,
+       "grad_norm": 0.25100043970170133,
+       "learning_rate": 0.0005574759113299217,
+       "loss": 1.341,
+       "step": 815
+     },
+     {
+       "epoch": 0.520139549635268,
+       "grad_norm": 0.294501664896367,
+       "learning_rate": 0.0005519704394848692,
+       "loss": 1.3323,
+       "step": 820
+     },
+     {
+       "epoch": 0.5233111322549953,
+       "grad_norm": 0.2382026692611784,
+       "learning_rate": 0.0005464585903046744,
+       "loss": 1.3483,
+       "step": 825
+     },
+     {
+       "epoch": 0.5264827148747225,
+       "grad_norm": 0.3233277006590301,
+       "learning_rate": 0.0005409410401528587,
+       "loss": 1.3275,
+       "step": 830
+     },
+     {
+       "epoch": 0.5296542974944497,
+       "grad_norm": 0.27343633383605254,
+       "learning_rate": 0.0005354184660925148,
+       "loss": 1.3379,
+       "step": 835
+     },
+     {
+       "epoch": 0.532825880114177,
+       "grad_norm": 0.23005655087591242,
+       "learning_rate": 0.0005298915458032233,
+       "loss": 1.3213,
+       "step": 840
+     },
+     {
+       "epoch": 0.5359974627339043,
+       "grad_norm": 0.22247227894179622,
+       "learning_rate": 0.0005243609574978941,
+       "loss": 1.3295,
+       "step": 845
+     },
+     {
+       "epoch": 0.5391690453536314,
+       "grad_norm": 0.30014284045451645,
+       "learning_rate": 0.0005188273798395424,
+       "loss": 1.3214,
+       "step": 850
+     },
+     {
+       "epoch": 0.5423406279733587,
+       "grad_norm": 0.3132385853038301,
+       "learning_rate": 0.0005132914918580093,
+       "loss": 1.3172,
+       "step": 855
+     },
+     {
+       "epoch": 0.545512210593086,
+       "grad_norm": 0.33728113255378034,
+       "learning_rate": 0.0005077539728666374,
+       "loss": 1.3218,
+       "step": 860
+     },
+     {
+       "epoch": 0.5486837932128132,
+       "grad_norm": 0.25874007616270794,
+       "learning_rate": 0.0005022155023789121,
+       "loss": 1.2957,
+       "step": 865
+     },
+     {
+       "epoch": 0.5518553758325404,
+       "grad_norm": 0.24203527103405384,
+       "learning_rate": 0.0004966767600250775,
+       "loss": 1.3035,
+       "step": 870
+     },
+     {
+       "epoch": 0.5550269584522677,
+       "grad_norm": 0.21381303917505012,
+       "learning_rate": 0.0004911384254687388,
+       "loss": 1.2995,
+       "step": 875
+     },
+     {
+       "epoch": 0.558198541071995,
+       "grad_norm": 0.24304896972645837,
+       "learning_rate": 0.00048560117832345984,
+       "loss": 1.2824,
+       "step": 880
+     },
+     {
+       "epoch": 0.5613701236917221,
+       "grad_norm": 0.3080366389357483,
+       "learning_rate": 0.0004800656980693674,
+       "loss": 1.2898,
+       "step": 885
+     },
+     {
+       "epoch": 0.5645417063114494,
+       "grad_norm": 0.262557897437375,
+       "learning_rate": 0.00047453266396977174,
+       "loss": 1.2779,
+       "step": 890
+     },
+     {
+       "epoch": 0.5677132889311767,
+       "grad_norm": 0.31083455423034645,
+       "learning_rate": 0.00046900275498781347,
+       "loss": 1.2806,
+       "step": 895
+     },
+     {
+       "epoch": 0.570884871550904,
+       "grad_norm": 0.21597926838814396,
+       "learning_rate": 0.00046347664970314723,
+       "loss": 1.274,
+       "step": 900
+     },
+     {
+       "epoch": 0.5740564541706311,
+       "grad_norm": 0.22596235970597578,
+       "learning_rate": 0.0004579550262286731,
+       "loss": 1.2666,
+       "step": 905
+     },
+     {
+       "epoch": 0.5772280367903584,
+       "grad_norm": 0.22827094422158484,
+       "learning_rate": 0.0004524385621273246,
+       "loss": 1.2583,
+       "step": 910
+     },
+     {
+       "epoch": 0.5803996194100857,
+       "grad_norm": 0.24853325436526866,
+       "learning_rate": 0.00044692793432892387,
+       "loss": 1.2693,
+       "step": 915
+     },
+     {
+       "epoch": 0.5835712020298128,
+       "grad_norm": 0.2765479869012326,
+       "learning_rate": 0.00044142381904711624,
+       "loss": 1.26,
+       "step": 920
+     },
+     {
+       "epoch": 0.5867427846495401,
+       "grad_norm": 0.27285996236330706,
+       "learning_rate": 0.00043592689169639034,
+       "loss": 1.246,
+       "step": 925
+     },
+     {
+       "epoch": 0.5899143672692674,
+       "grad_norm": 0.28781941328826144,
+       "learning_rate": 0.0004304378268091982,
+       "loss": 1.249,
+       "step": 930
+     },
+     {
+       "epoch": 0.5930859498889947,
+       "grad_norm": 0.240504157977766,
+       "learning_rate": 0.0004249572979531822,
+       "loss": 1.2534,
+       "step": 935
+     },
+     {
+       "epoch": 0.5962575325087218,
+       "grad_norm": 0.341483100362183,
+       "learning_rate": 0.0004194859776485216,
+       "loss": 1.2376,
+       "step": 940
+     },
+     {
+       "epoch": 0.5994291151284491,
+       "grad_norm": 0.27130765824409686,
+       "learning_rate": 0.0004140245372854065,
+       "loss": 1.2426,
+       "step": 945
+     },
+     {
+       "epoch": 0.6026006977481764,
+       "grad_norm": 0.28496801994375115,
+       "learning_rate": 0.0004085736470416516,
+       "loss": 1.2347,
+       "step": 950
+     },
+     {
+       "epoch": 0.6057722803679035,
+       "grad_norm": 0.33820479660283104,
+       "learning_rate": 0.00040313397580045765,
+       "loss": 1.2397,
+       "step": 955
+     },
+     {
+       "epoch": 0.6089438629876308,
+       "grad_norm": 0.2537502852561033,
+       "learning_rate": 0.0003977061910683325,
+       "loss": 1.2319,
+       "step": 960
+     },
+     {
+       "epoch": 0.6121154456073581,
+       "grad_norm": 0.2543562572422921,
+       "learning_rate": 0.0003922909588931808,
+       "loss": 1.2221,
+       "step": 965
+     },
+     {
+       "epoch": 0.6152870282270854,
+       "grad_norm": 0.28194628415561285,
+       "learning_rate": 0.0003868889437825724,
+       "loss": 1.2213,
+       "step": 970
+     },
+     {
+       "epoch": 0.6184586108468125,
+       "grad_norm": 0.26751445743912233,
+       "learning_rate": 0.0003815008086222007,
+       "loss": 1.211,
+       "step": 975
+     },
+     {
+       "epoch": 0.6216301934665398,
+       "grad_norm": 0.22966413613029274,
+       "learning_rate": 0.0003761272145945388,
+       "loss": 1.2058,
+       "step": 980
+     },
+     {
+       "epoch": 0.6248017760862671,
+       "grad_norm": 0.24668142278345986,
+       "learning_rate": 0.0003707688210977055,
+       "loss": 1.2223,
+       "step": 985
+     },
+     {
+       "epoch": 0.6279733587059942,
+       "grad_norm": 0.23811743937781157,
+       "learning_rate": 0.00036542628566455025,
+       "loss": 1.2024,
+       "step": 990
+     },
+     {
+       "epoch": 0.6311449413257215,
+       "grad_norm": 0.2901121774163334,
+       "learning_rate": 0.0003601002638819665,
+       "loss": 1.2036,
+       "step": 995
+     },
+     {
+       "epoch": 0.6343165239454488,
+       "grad_norm": 0.2600410825499236,
+       "learning_rate": 0.0003547914093104439,
+       "loss": 1.2012,
+       "step": 1000
+     },
+     {
+       "epoch": 0.6374881065651761,
+       "grad_norm": 0.352563938838776,
+       "learning_rate": 0.0003495003734038697,
+       "loss": 1.1751,
+       "step": 1005
+     },
+     {
+       "epoch": 0.6406596891849032,
+       "grad_norm": 0.26125000772161344,
+       "learning_rate": 0.00034422780542958827,
+       "loss": 1.1919,
+       "step": 1010
+     },
+     {
+       "epoch": 0.6438312718046305,
+       "grad_norm": 0.2640437019043301,
+       "learning_rate": 0.00033897435238872874,
+       "loss": 1.1781,
+       "step": 1015
+     },
+     {
+       "epoch": 0.6470028544243578,
+       "grad_norm": 0.2782272386225361,
+       "learning_rate": 0.00033374065893681127,
+       "loss": 1.1821,
+       "step": 1020
+     },
+     {
+       "epoch": 0.650174437044085,
+       "grad_norm": 0.24555576527657738,
+       "learning_rate": 0.0003285273673046409,
+       "loss": 1.1721,
+       "step": 1025
+     },
+     {
+       "epoch": 0.6533460196638122,
+       "grad_norm": 0.40556770599075914,
+       "learning_rate": 0.00032333511721949817,
+       "loss": 1.1679,
+       "step": 1030
+     },
+     {
+       "epoch": 0.6565176022835395,
+       "grad_norm": 0.25906084663363754,
+       "learning_rate": 0.00031816454582663856,
+       "loss": 1.1567,
+       "step": 1035
+     },
+     {
+       "epoch": 0.6596891849032668,
+       "grad_norm": 0.27183164743159954,
+       "learning_rate": 0.0003130162876111074,
+       "loss": 1.1596,
+       "step": 1040
+     },
+     {
+       "epoch": 0.6628607675229939,
+       "grad_norm": 0.24394077297020256,
+       "learning_rate": 0.0003078909743198817,
+       "loss": 1.1487,
+       "step": 1045
+     },
+     {
+       "epoch": 0.6660323501427212,
+       "grad_norm": 0.23339037702881532,
+       "learning_rate": 0.000302789234884348,
+       "loss": 1.1636,
+       "step": 1050
+     },
+     {
+       "epoch": 0.6692039327624485,
+       "grad_norm": 0.2651227300122355,
+       "learning_rate": 0.00029771169534312583,
+       "loss": 1.1475,
+       "step": 1055
+     },
+     {
+       "epoch": 0.6723755153821757,
+       "grad_norm": 0.23719809453094406,
+       "learning_rate": 0.000292658978765246,
+       "loss": 1.1496,
+       "step": 1060
+     },
+     {
+       "epoch": 0.6755470980019029,
+       "grad_norm": 0.31466276172538943,
+       "learning_rate": 0.000287631705173693,
+       "loss": 1.1404,
+       "step": 1065
+     },
+     {
+       "epoch": 0.6787186806216302,
+       "grad_norm": 0.2657362830496313,
+       "learning_rate": 0.00028263049146932153,
+       "loss": 1.156,
+       "step": 1070
+     },
+     {
+       "epoch": 0.6818902632413575,
+       "grad_norm": 0.2353420135393821,
+       "learning_rate": 0.00027765595135515673,
+       "loss": 1.1382,
+       "step": 1075
+     },
+     {
+       "epoch": 0.6850618458610847,
+       "grad_norm": 0.29180017450918116,
+       "learning_rate": 0.00027270869526108506,
+       "loss": 1.1403,
+       "step": 1080
+     },
+     {
+       "epoch": 0.6882334284808119,
+       "grad_norm": 0.28381426741820764,
+       "learning_rate": 0.000267789330268949,
+       "loss": 1.1351,
+       "step": 1085
+     },
+     {
+       "epoch": 0.6914050111005392,
+       "grad_norm": 0.2368326399732858,
+       "learning_rate": 0.00026289846003805075,
+       "loss": 1.1264,
+       "step": 1090
+     },
+     {
+       "epoch": 0.6945765937202664,
+       "grad_norm": 0.24260754741892487,
+       "learning_rate": 0.0002580366847310774,
+       "loss": 1.1318,
+       "step": 1095
+     },
+     {
+       "epoch": 0.6977481763399936,
+       "grad_norm": 0.33032504483698477,
+       "learning_rate": 0.0002532046009404537,
+       "loss": 1.123,
+       "step": 1100
+     },
+     {
+       "epoch": 0.7009197589597209,
+       "grad_norm": 0.2626626593890248,
+       "learning_rate": 0.00024840280161513446,
+       "loss": 1.1147,
+       "step": 1105
+     },
+     {
+       "epoch": 0.7040913415794482,
+       "grad_norm": 0.24734490639888912,
+       "learning_rate": 0.0002436318759878432,
+       "loss": 1.1141,
+       "step": 1110
+     },
+     {
+       "epoch": 0.7072629241991754,
+       "grad_norm": 0.25777344330608626,
+       "learning_rate": 0.00023889240950276602,
+       "loss": 1.1069,
+       "step": 1115
+     },
+     {
+       "epoch": 0.7104345068189026,
+       "grad_norm": 0.24965316567346824,
+       "learning_rate": 0.00023418498374371268,
+       "loss": 1.0961,
+       "step": 1120
+     },
+     {
+       "epoch": 0.7136060894386299,
+       "grad_norm": 0.2588175173420704,
+       "learning_rate": 0.0002295101763627483,
+       "loss": 1.1062,
+       "step": 1125
+     },
+     {
+       "epoch": 0.7167776720583571,
+       "grad_norm": 0.2617691894820057,
+       "learning_rate": 0.00022486856100931146,
+       "loss": 1.0949,
+       "step": 1130
+     },
+     {
+       "epoch": 0.7199492546780843,
+       "grad_norm": 0.24640261475787326,
+       "learning_rate": 0.00022026070725981867,
+       "loss": 1.1024,
+       "step": 1135
+     },
+     {
+       "epoch": 0.7231208372978116,
+       "grad_norm": 0.25512789636052857,
+       "learning_rate": 0.0002156871805477732,
+       "loss": 1.0981,
+       "step": 1140
+     },
+     {
+       "epoch": 0.7262924199175389,
+       "grad_norm": 0.2380744076277497,
+       "learning_rate": 0.00021114854209437889,
+       "loss": 1.0803,
+       "step": 1145
+     },
+     {
+       "epoch": 0.7294640025372661,
+       "grad_norm": 0.26280691701304987,
+       "learning_rate": 0.00020664534883967311,
+       "loss": 1.0851,
+       "step": 1150
+     },
+     {
+       "epoch": 0.7326355851569933,
+       "grad_norm": 0.2565430586115982,
+       "learning_rate": 0.00020217815337418427,
+       "loss": 1.076,
+       "step": 1155
+     },
+     {
+       "epoch": 0.7358071677767206,
+       "grad_norm": 0.2533752282987412,
+       "learning_rate": 0.00019774750387112174,
+       "loss": 1.0826,
+       "step": 1160
+     },
+     {
+       "epoch": 0.7389787503964478,
+       "grad_norm": 0.28126577459530283,
+       "learning_rate": 0.00019335394401911082,
+       "loss": 1.0719,
+       "step": 1165
+     },
+     {
+       "epoch": 0.7421503330161751,
+       "grad_norm": 0.2650849332335025,
+       "learning_rate": 0.00018899801295547476,
+       "loss": 1.0742,
+       "step": 1170
+     },
+     {
+       "epoch": 0.7453219156359023,
+       "grad_norm": 0.2603829852111257,
+       "learning_rate": 0.00018468024520007764,
+       "loss": 1.0772,
+       "step": 1175
+     },
+     {
+       "epoch": 0.7484934982556296,
+       "grad_norm": 0.2527087543783394,
+       "learning_rate": 0.00018040117058973316,
+       "loss": 1.0595,
+       "step": 1180
+     },
+     {
+       "epoch": 0.7516650808753568,
+       "grad_norm": 0.24678722431639855,
+       "learning_rate": 0.0001761613142131867,
+       "loss": 1.0469,
+       "step": 1185
+     },
+     {
+       "epoch": 0.754836663495084,
+       "grad_norm": 0.25910814410326344,
+       "learning_rate": 0.00017196119634668293,
+       "loss": 1.0627,
+       "step": 1190
+     },
+     {
+       "epoch": 0.7580082461148113,
+       "grad_norm": 0.26173306347429054,
+       "learning_rate": 0.00016780133239012075,
+       "loss": 1.0607,
+       "step": 1195
+     },
+     {
+       "epoch": 0.7611798287345385,
+       "grad_norm": 0.24651016867032868,
+       "learning_rate": 0.0001636822328038095,
+       "loss": 1.0546,
+       "step": 1200
+     },
+     {
+       "epoch": 0.7643514113542658,
+       "grad_norm": 0.28020957707447064,
+       "learning_rate": 0.00015960440304582858,
+       "loss": 1.0579,
+       "step": 1205
+     },
+     {
+       "epoch": 0.767522993973993,
+       "grad_norm": 0.26371476098524943,
+       "learning_rate": 0.00015556834351000354,
+       "loss": 1.0537,
+       "step": 1210
+     },
+     {
+       "epoch": 0.7706945765937203,
+       "grad_norm": 0.24623457163199874,
+       "learning_rate": 0.0001515745494645019,
+       "loss": 1.045,
+       "step": 1215
+     },
+     {
+       "epoch": 0.7738661592134475,
+       "grad_norm": 0.3155558400199454,
+       "learning_rate": 0.0001476235109910576,
+       "loss": 1.0405,
+       "step": 1220
+     },
+     {
+       "epoch": 0.7770377418331748,
+       "grad_norm": 0.2789622570986826,
+       "learning_rate": 0.00014371571292483393,
+       "loss": 1.0381,
+       "step": 1225
+     },
+     {
+       "epoch": 0.780209324452902,
+       "grad_norm": 0.2409114498230053,
+       "learning_rate": 0.0001398516347949284,
+       "loss": 1.0394,
+       "step": 1230
+     },
+     {
+       "epoch": 0.7833809070726292,
+       "grad_norm": 0.267235707838601,
+       "learning_rate": 0.0001360317507655293,
+       "loss": 1.0278,
+       "step": 1235
+     },
+     {
+       "epoch": 0.7865524896923565,
+       "grad_norm": 0.28458374786381546,
+       "learning_rate": 0.00013225652957773044,
+       "loss": 1.0326,
+       "step": 1240
+     },
+     {
+       "epoch": 0.7897240723120837,
+       "grad_norm": 0.25695712786415686,
+       "learning_rate": 0.00012852643449201212,
+       "loss": 1.023,
+       "step": 1245
+     },
+     {
+       "epoch": 0.792895654931811,
+       "grad_norm": 0.2590457354954553,
+       "learning_rate": 0.0001248419232313938,
+       "loss": 1.0232,
+       "step": 1250
+     },
+     {
+       "epoch": 0.7960672375515382,
+       "grad_norm": 0.2715843775728456,
+       "learning_rate": 0.000121203447925266,
+       "loss": 1.0287,
+       "step": 1255
+     },
+     {
+       "epoch": 0.7992388201712655,
+       "grad_norm": 0.2398511137856279,
+       "learning_rate": 0.00011761145505391024,
+       "loss": 1.0186,
+       "step": 1260
+     },
+     {
+       "epoch": 0.8024104027909927,
+       "grad_norm": 0.27281371167245233,
+       "learning_rate": 0.00011406638539370979,
+       "loss": 1.0224,
+       "step": 1265
+     },
+     {
+       "epoch": 0.8055819854107199,
+       "grad_norm": 0.3188098317762095,
+       "learning_rate": 0.00011056867396306292,
+       "loss": 1.0092,
+       "step": 1270
+     },
+     {
+       "epoch": 0.8087535680304472,
+       "grad_norm": 0.3265540754130617,
+       "learning_rate": 0.00010711874996900023,
+       "loss": 1.0104,
+       "step": 1275
+     },
+     {
+       "epoch": 0.8119251506501745,
+       "grad_norm": 0.2607401452606644,
+       "learning_rate": 0.00010371703675451733,
+       "loss": 1.0114,
+ "step": 1280
1809
+ },
1810
+ {
1811
+ "epoch": 0.8150967332699017,
1812
+ "grad_norm": 0.2883090335939891,
1813
+ "learning_rate": 0.0001003639517466256,
1814
+ "loss": 1.0093,
1815
+ "step": 1285
1816
+ },
1817
+ {
1818
+ "epoch": 0.8182683158896289,
1819
+ "grad_norm": 0.25562259108760305,
1820
+ "learning_rate": 9.705990640512907e-05,
1821
+ "loss": 0.9938,
1822
+ "step": 1290
1823
+ },
1824
+ {
1825
+ "epoch": 0.8214398985093562,
1826
+ "grad_norm": 0.2753573095600564,
1827
+ "learning_rate": 9.380530617213456e-05,
1828
+ "loss": 1.0114,
1829
+ "step": 1295
1830
+ },
1831
+ {
1832
+ "epoch": 0.8246114811290834,
1833
+ "grad_norm": 0.23998112779723507,
1834
+ "learning_rate": 9.060055042229881e-05,
1835
+ "loss": 1.0089,
1836
+ "step": 1300
1837
+ },
1838
+ {
1839
+ "epoch": 0.8277830637488106,
1840
+ "grad_norm": 0.2524204007518801,
1841
+ "learning_rate": 8.74460324138216e-05,
1842
+ "loss": 1.007,
1843
+ "step": 1305
1844
+ },
1845
+ {
1846
+ "epoch": 0.8309546463685379,
1847
+ "grad_norm": 0.2526736715480949,
1848
+ "learning_rate": 8.434213924018835e-05,
1849
+ "loss": 1.0,
1850
+ "step": 1310
1851
+ },
1852
+ {
1853
+ "epoch": 0.8341262289882652,
1854
+ "grad_norm": 0.2503643121035892,
1855
+ "learning_rate": 8.128925178266927e-05,
1856
+ "loss": 0.9965,
1857
+ "step": 1315
1858
+ },
1859
+ {
1860
+ "epoch": 0.8372978116079924,
1861
+ "grad_norm": 0.23244697445873563,
1862
+ "learning_rate": 7.828774466358179e-05,
1863
+ "loss": 0.9988,
1864
+ "step": 1320
1865
+ },
1866
+ {
1867
+ "epoch": 0.8404693942277196,
1868
+ "grad_norm": 0.2560633353876119,
1869
+ "learning_rate": 7.53379862003195e-05,
1870
+ "loss": 1.0048,
1871
+ "step": 1325
1872
+ },
1873
+ {
1874
+ "epoch": 0.8436409768474469,
1875
+ "grad_norm": 0.2421969390657872,
1876
+ "learning_rate": 7.244033836015695e-05,
1877
+ "loss": 0.9844,
1878
+ "step": 1330
1879
+ },
1880
+ {
1881
+ "epoch": 0.8468125594671742,
1882
+ "grad_norm": 0.2607793031238756,
1883
+ "learning_rate": 6.95951567158305e-05,
1884
+ "loss": 0.9778,
1885
+ "step": 1335
1886
+ },
1887
+ {
1888
+ "epoch": 0.8499841420869013,
1889
+ "grad_norm": 0.2698408656759126,
1890
+ "learning_rate": 6.680279040190746e-05,
1891
+ "loss": 0.9828,
1892
+ "step": 1340
1893
+ },
1894
+ {
1895
+ "epoch": 0.8531557247066286,
1896
+ "grad_norm": 0.23841421968692497,
1897
+ "learning_rate": 6.406358207194224e-05,
1898
+ "loss": 0.9991,
1899
+ "step": 1345
1900
+ },
1901
+ {
1902
+ "epoch": 0.8563273073263559,
1903
+ "grad_norm": 0.28084531889088754,
1904
+ "learning_rate": 6.137786785642985e-05,
1905
+ "loss": 0.9855,
1906
+ "step": 1350
1907
+ },
1908
+ {
1909
+ "epoch": 0.8594988899460831,
1910
+ "grad_norm": 0.24806562901660065,
1911
+ "learning_rate": 5.8745977321558786e-05,
1912
+ "loss": 0.9747,
1913
+ "step": 1355
1914
+ },
1915
+ {
1916
+ "epoch": 0.8626704725658103,
1917
+ "grad_norm": 0.2511271472454086,
1918
+ "learning_rate": 5.616823342876931e-05,
1919
+ "loss": 0.9758,
1920
+ "step": 1360
1921
+ },
1922
+ {
1923
+ "epoch": 0.8658420551855376,
1924
+ "grad_norm": 0.24241834259427655,
1925
+ "learning_rate": 5.364495249512336e-05,
1926
+ "loss": 0.9765,
1927
+ "step": 1365
1928
+ },
1929
+ {
1930
+ "epoch": 0.8690136378052649,
1931
+ "grad_norm": 0.23883983545121304,
1932
+ "learning_rate": 5.11764441544883e-05,
1933
+ "loss": 0.9808,
1934
+ "step": 1370
1935
+ },
1936
+ {
1937
+ "epoch": 0.872185220424992,
1938
+ "grad_norm": 0.24435079120648112,
1939
+ "learning_rate": 4.8763011319542025e-05,
1940
+ "loss": 0.9777,
1941
+ "step": 1375
1942
+ },
1943
+ {
1944
+ "epoch": 0.8753568030447193,
1945
+ "grad_norm": 0.2629272614210174,
1946
+ "learning_rate": 4.6404950144602e-05,
1947
+ "loss": 0.9819,
1948
+ "step": 1380
1949
+ },
1950
+ {
1951
+ "epoch": 0.8785283856644466,
1952
+ "grad_norm": 0.2663926565440222,
1953
+ "learning_rate": 4.4102549989283756e-05,
1954
+ "loss": 0.9675,
1955
+ "step": 1385
1956
+ },
1957
+ {
1958
+ "epoch": 0.8816999682841739,
1959
+ "grad_norm": 0.2431315966754426,
1960
+ "learning_rate": 4.1856093382994067e-05,
1961
+ "loss": 0.9617,
1962
+ "step": 1390
1963
+ },
1964
+ {
1965
+ "epoch": 0.884871550903901,
1966
+ "grad_norm": 0.2615985135082817,
1967
+ "learning_rate": 3.966585599026051e-05,
1968
+ "loss": 0.9705,
1969
+ "step": 1395
1970
+ },
1971
+ {
1972
+ "epoch": 0.8880431335236283,
1973
+ "grad_norm": 0.2437571495377507,
1974
+ "learning_rate": 3.753210657690537e-05,
1975
+ "loss": 0.9637,
1976
+ "step": 1400
1977
+ },
1978
+ {
1979
+ "epoch": 0.8912147161433556,
1980
+ "grad_norm": 0.2867683320851404,
1981
+ "learning_rate": 3.5455106977064555e-05,
1982
+ "loss": 0.9813,
1983
+ "step": 1405
1984
+ },
1985
+ {
1986
+ "epoch": 0.8943862987630827,
1987
+ "grad_norm": 0.23596873266775822,
1988
+ "learning_rate": 3.343511206105804e-05,
1989
+ "loss": 0.9654,
1990
+ "step": 1410
1991
+ },
1992
+ {
1993
+ "epoch": 0.89755788138281,
1994
+ "grad_norm": 0.2691994741352151,
1995
+ "learning_rate": 3.147236970411449e-05,
1996
+ "loss": 0.9559,
1997
+ "step": 1415
1998
+ },
1999
+ {
2000
+ "epoch": 0.9007294640025373,
2001
+ "grad_norm": 0.25848494724563204,
2002
+ "learning_rate": 2.9567120755953858e-05,
2003
+ "loss": 0.9631,
2004
+ "step": 1420
2005
+ },
2006
+ {
2007
+ "epoch": 0.9039010466222646,
2008
+ "grad_norm": 0.2329147999513356,
2009
+ "learning_rate": 2.7719599011233333e-05,
2010
+ "loss": 0.9654,
2011
+ "step": 1425
2012
+ },
2013
+ {
2014
+ "epoch": 0.9070726292419917,
2015
+ "grad_norm": 0.24897956262619586,
2016
+ "learning_rate": 2.593003118085746e-05,
2017
+ "loss": 0.9686,
2018
+ "step": 1430
2019
+ },
2020
+ {
2021
+ "epoch": 0.910244211861719,
2022
+ "grad_norm": 0.24140747398754628,
2023
+ "learning_rate": 2.4198636864158684e-05,
2024
+ "loss": 0.9709,
2025
+ "step": 1435
2026
+ },
2027
+ {
2028
+ "epoch": 0.9134157944814463,
2029
+ "grad_norm": 0.23859470642150962,
2030
+ "learning_rate": 2.2525628521949837e-05,
2031
+ "loss": 0.9723,
2032
+ "step": 1440
2033
+ },
2034
+ {
2035
+ "epoch": 0.9165873771011734,
2036
+ "grad_norm": 0.23614657553272203,
2037
+ "learning_rate": 2.091121145045327e-05,
2038
+ "loss": 0.96,
2039
+ "step": 1445
2040
+ },
2041
+ {
2042
+ "epoch": 0.9197589597209007,
2043
+ "grad_norm": 0.2305582486421353,
2044
+ "learning_rate": 1.9355583756108407e-05,
2045
+ "loss": 0.9622,
2046
+ "step": 1450
2047
+ },
2048
+ {
2049
+ "epoch": 0.922930542340628,
2050
+ "grad_norm": 0.23818404259591164,
2051
+ "learning_rate": 1.7858936331262122e-05,
2052
+ "loss": 0.9612,
2053
+ "step": 1455
2054
+ },
2055
+ {
2056
+ "epoch": 0.9261021249603553,
2057
+ "grad_norm": 0.2378653986328734,
2058
+ "learning_rate": 1.6421452830744365e-05,
2059
+ "loss": 0.9716,
2060
+ "step": 1460
2061
+ },
2062
+ {
2063
+ "epoch": 0.9292737075800824,
2064
+ "grad_norm": 0.2329586634857264,
2065
+ "learning_rate": 1.5043309649331205e-05,
2066
+ "loss": 0.9558,
2067
+ "step": 1465
2068
+ },
2069
+ {
2070
+ "epoch": 0.9324452901998097,
2071
+ "grad_norm": 0.4063284757979328,
2072
+ "learning_rate": 1.3724675900099959e-05,
2073
+ "loss": 0.9654,
2074
+ "step": 1470
2075
+ },
2076
+ {
2077
+ "epoch": 0.935616872819537,
2078
+ "grad_norm": 0.24883789905797735,
2079
+ "learning_rate": 1.246571339367658e-05,
2080
+ "loss": 0.9603,
2081
+ "step": 1475
2082
+ },
2083
+ {
2084
+ "epoch": 0.9387884554392641,
2085
+ "grad_norm": 0.2484198570399255,
2086
+ "learning_rate": 1.1266576618380097e-05,
2087
+ "loss": 0.9579,
2088
+ "step": 1480
2089
+ },
2090
+ {
2091
+ "epoch": 0.9419600380589914,
2092
+ "grad_norm": 0.24729636213521072,
2093
+ "learning_rate": 1.0127412721265218e-05,
2094
+ "loss": 0.9675,
2095
+ "step": 1485
2096
+ },
2097
+ {
2098
+ "epoch": 0.9451316206787187,
2099
+ "grad_norm": 0.2505398676306425,
2100
+ "learning_rate": 9.048361490065548e-06,
2101
+ "loss": 0.9526,
2102
+ "step": 1490
2103
+ },
2104
+ {
2105
+ "epoch": 0.948303203298446,
2106
+ "grad_norm": 0.26466516239326676,
2107
+ "learning_rate": 8.029555336040383e-06,
2108
+ "loss": 0.9661,
2109
+ "step": 1495
2110
+ },
2111
+ {
2112
+ "epoch": 0.9514747859181731,
2113
+ "grad_norm": 0.24164934386910944,
2114
+ "learning_rate": 7.071119277726301e-06,
2115
+ "loss": 0.9577,
2116
+ "step": 1500
2117
+ },
2118
+ {
2119
+ "epoch": 0.9546463685379004,
2120
+ "grad_norm": 0.23732953651039135,
2121
+ "learning_rate": 6.17317092559605e-06,
2122
+ "loss": 0.9562,
2123
+ "step": 1505
2124
+ },
2125
+ {
2126
+ "epoch": 0.9578179511576277,
2127
+ "grad_norm": 0.23388339171880254,
2128
+ "learning_rate": 5.335820467626485e-06,
2129
+ "loss": 0.973,
2130
+ "step": 1510
2131
+ },
2132
+ {
2133
+ "epoch": 0.9609895337773549,
2134
+ "grad_norm": 0.2318159543809297,
2135
+ "learning_rate": 4.559170655777267e-06,
2136
+ "loss": 0.9478,
2137
+ "step": 1515
2138
+ },
2139
+ {
2140
+ "epoch": 0.9641611163970821,
2141
+ "grad_norm": 0.24710746850544488,
2142
+ "learning_rate": 3.843316793382123e-06,
2143
+ "loss": 0.9707,
2144
+ "step": 1520
2145
+ },
2146
+ {
2147
+ "epoch": 0.9673326990168094,
2148
+ "grad_norm": 0.2862292656525582,
2149
+ "learning_rate": 3.188346723454083e-06,
2150
+ "loss": 0.9643,
2151
+ "step": 1525
2152
+ },
2153
+ {
2154
+ "epoch": 0.9705042816365367,
2155
+ "grad_norm": 0.2380821089662019,
2156
+ "learning_rate": 2.594340817906271e-06,
2157
+ "loss": 0.9624,
2158
+ "step": 1530
2159
+ },
2160
+ {
2161
+ "epoch": 0.9736758642562638,
2162
+ "grad_norm": 0.23933390384190942,
2163
+ "learning_rate": 2.0613719676891853e-06,
2164
+ "loss": 0.9599,
2165
+ "step": 1535
2166
+ },
2167
+ {
2168
+ "epoch": 0.9768474468759911,
2169
+ "grad_norm": 0.24854365220185978,
2170
+ "learning_rate": 1.5895055738465169e-06,
2171
+ "loss": 0.9592,
2172
+ "step": 1540
2173
+ },
2174
+ {
2175
+ "epoch": 0.9800190294957184,
2176
+ "grad_norm": 0.24336558825284596,
2177
+ "learning_rate": 1.1787995394893502e-06,
2178
+ "loss": 0.962,
2179
+ "step": 1545
2180
+ },
2181
+ {
2182
+ "epoch": 0.9831906121154456,
2183
+ "grad_norm": 0.22749162744166387,
2184
+ "learning_rate": 8.293042626912328e-07,
2185
+ "loss": 0.9573,
2186
+ "step": 1550
2187
+ },
2188
+ {
2189
+ "epoch": 0.9863621947351728,
2190
+ "grad_norm": 0.2390082283281969,
2191
+ "learning_rate": 5.410626303034017e-07,
2192
+ "loss": 0.9556,
2193
+ "step": 1555
2194
+ },
2195
+ {
2196
+ "epoch": 0.9895337773549001,
2197
+ "grad_norm": 0.23360184137608606,
2198
+ "learning_rate": 3.141100126923813e-07,
2199
+ "loss": 0.9571,
2200
+ "step": 1560
2201
+ },
2202
+ {
2203
+ "epoch": 0.9927053599746274,
2204
+ "grad_norm": 0.2367998742996814,
2205
+ "learning_rate": 1.4847425939956693e-07,
2206
+ "loss": 0.9495,
2207
+ "step": 1565
2208
+ },
2209
+ {
2210
+ "epoch": 0.9958769425943546,
2211
+ "grad_norm": 0.2269345672235601,
2212
+ "learning_rate": 4.417569572368052e-08,
2213
+ "loss": 0.9499,
2214
+ "step": 1570
2215
+ },
2216
+ {
2217
+ "epoch": 0.9990485252140818,
2218
+ "grad_norm": 0.2484799432231265,
2219
+ "learning_rate": 1.2271202268210324e-09,
2220
+ "loss": 0.956,
2221
+ "step": 1575
2222
+ },
2223
+ {
2224
+ "epoch": 0.9996828417380272,
2225
+ "eval_loss": 2.245941638946533,
2226
+ "eval_runtime": 8.442,
2227
+ "eval_samples_per_second": 46.316,
2228
+ "eval_steps_per_second": 5.804,
2229
+ "step": 1576
2230
+ },
2231
+ {
2232
+ "epoch": 0.9996828417380272,
2233
+ "step": 1576,
2234
+ "total_flos": 38663670988800.0,
2235
+ "train_loss": 1.4483577938854393,
2236
+ "train_runtime": 3409.163,
2237
+ "train_samples_per_second": 14.797,
2238
+ "train_steps_per_second": 0.462
2239
+ }
2240
+ ],
2241
+ "logging_steps": 5,
2242
+ "max_steps": 1576,
2243
+ "num_input_tokens_seen": 0,
2244
+ "num_train_epochs": 1,
2245
+ "save_steps": 500,
2246
+ "stateful_callbacks": {
2247
+ "TrainerControl": {
2248
+ "args": {
2249
+ "should_epoch_stop": false,
2250
+ "should_evaluate": false,
2251
+ "should_log": false,
2252
+ "should_save": false,
2253
+ "should_training_stop": false
2254
+ },
2255
+ "attributes": {}
2256
+ }
2257
+ },
2258
+ "total_flos": 38663670988800.0,
2259
+ "train_batch_size": 4,
2260
+ "trial_name": null,
2261
+ "trial_params": null
2262
+ }
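The trailing entries above close out `trainer_state.json`: the eval record carries the 2.2459 validation loss reported in the model card, and the final summary record repeats the aggregate figures from `all_results.json`. As a convenience for readers of this state file, here is a minimal sketch for plotting the loss curve from `log_history` (the file path and plotting choices are illustrative assumptions, not part of this commit); it relies on training records having a `loss` key, the eval record an `eval_loss` key, and the summary record neither:

```python
import json

import matplotlib.pyplot as plt

# trainer_state.json is written by transformers' Trainer alongside checkpoints.
with open("trainer_state.json") as f:
    state = json.load(f)

# log_history mixes training records (keyed by "loss") with eval records
# (keyed by "eval_loss") and one final summary record (keyed by "train_loss").
train_logs = [e for e in state["log_history"] if "loss" in e]
eval_logs = [e for e in state["log_history"] if "eval_loss" in e]

plt.plot([e["step"] for e in train_logs],
         [e["loss"] for e in train_logs],
         label="training loss")
plt.scatter([e["step"] for e in eval_logs],
            [e["eval_loss"] for e in eval_logs],
            color="red", label="eval loss")
plt.xlabel("step")
plt.ylabel("loss")
plt.legend()
plt.savefig("loss_curve.png")
```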
training_args.bin ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:d7298d47d0acd3a88973e85468ff24e30141d19bdc3a91d7485f2b0cd219f18f
+ size 6840
vocab.json ADDED
The diff for this file is too large to render. See raw diff