dminhvu02 committed on Aug 25, 2024

Commit

b165130

verified ·

1 Parent(s): 0ff6012

Upload folder using huggingface_hub

Browse files

Files changed (17) hide show

README.md +10 -0
adapter_config.json +26 -0
adapter_model.bin +3 -0
adapter_model.safetensors +3 -0
config.json +43 -0
global_step525/bf16_zero_pp_rank_0_mp_rank_00_optim_states.pt +3 -0
global_step525/mp_rank_00_model_states.pt +3 -0
latest +1 -0
non_lora_trainables.bin +3 -0
rng_state.pth +3 -0
scheduler.pt +3 -0
special_tokens_map.json +24 -0
tokenizer.model +3 -0
tokenizer_config.json +45 -0
trainer_state.json +3696 -0
training_args.bin +3 -0
zero_to_fp32.py +592 -0

README.md ADDED Viewed

	@@ -0,0 +1,10 @@

+---
+library_name: peft
+---
+## Training procedure
+### Framework versions
+- PEFT 0.4.0
+- PEFT 0.4.0

adapter_config.json ADDED Viewed

	@@ -0,0 +1,26 @@

+{
+  "auto_mapping": null,
+  "base_model_name_or_path": "/model_zoo/Vivid-7B-base",
+  "bias": "none",
+  "fan_in_fan_out": false,
+  "inference_mode": true,
+  "init_lora_weights": true,
+  "layers_pattern": null,
+  "layers_to_transform": null,
+  "lora_alpha": 64,
+  "lora_dropout": 0.05,
+  "modules_to_save": null,
+  "peft_type": "LORA",
+  "r": 32,
+  "revision": null,
+  "target_modules": [
+    "v_proj",
+    "k_proj",
+    "down_proj",
+    "gate_proj",
+    "q_proj",
+    "up_proj",
+    "o_proj"
+  ],
+  "task_type": "CAUSAL_LM"
+}

adapter_model.bin ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:cf246c435bb0761c494e9584c22a75d4f6f39822bf1de50757ea16deecd32a0b
+size 167927754

adapter_model.safetensors ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:460329d78814996d0825afd1e6be5faa13feb1966e1b749d471469532246268d
+size 167832688

config.json ADDED Viewed

	@@ -0,0 +1,43 @@

+{
+  "_name_or_path": "/model_zoo/Vivid-7B-base",
+  "architectures": [
+    "MistralForCausalLM"
+  ],
+  "attention_dropout": 0.0,
+  "bert_type": "qformer_layer:12",
+  "bos_token_id": 1,
+  "compress_type": "mean",
+  "eos_token_id": 2,
+  "freeze_mm_mlp_adapter": false,
+  "hidden_act": "silu",
+  "hidden_size": 4096,
+  "image_aspect_ratio": "pad",
+  "image_grid_pinpoints": null,
+  "image_processor": "./llamavid/processor/intern-vit",
+  "initializer_range": 0.02,
+  "intermediate_size": 14336,
+  "max_position_embeddings": 32768,
+  "max_token": 4096,
+  "mm_hidden_size": 1024,
+  "mm_projector_type": "mlp2x_gelu",
+  "mm_use_im_patch_token": false,
+  "mm_use_im_start_end": false,
+  "mm_vision_select_feature": "patch",
+  "mm_vision_select_layer": -1,
+  "mm_vision_tower": "/model_zoo/intern-vit",
+  "model_type": "mistral",
+  "num_attention_heads": 32,
+  "num_hidden_layers": 32,
+  "num_key_value_heads": 8,
+  "num_query": 32,
+  "rms_norm_eps": 1e-05,
+  "rope_theta": 10000.0,
+  "sliding_window": 4096,
+  "tie_word_embeddings": false,
+  "torch_dtype": "bfloat16",
+  "transformers_version": "4.38.2",
+  "tune_mm_mlp_adapter": false,
+  "use_cache": false,
+  "use_mm_proj": true,
+  "vocab_size": 48384
+}

global_step525/bf16_zero_pp_rank_0_mp_rank_00_optim_states.pt ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:30ad26a2b4aeadcd8f57d8eb5f3a21c4f9b791fe59ed5cba807543b60444df8c
+size 7471050560

global_step525/mp_rank_00_model_states.pt ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:4742a87baf94651655803534b535661ac260b2b4f1b8e1868633ee07a9c41394
+size 1245738500

latest ADDED Viewed

	@@ -0,0 +1 @@


1	+ global_step525

non_lora_trainables.bin ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:3ebdec47215a9bf60f43652cbfb5c4d3dc784812049117f1cb636133c313dfe4
+size 1077631656

rng_state.pth ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:ab079d3977b35da0c70c2f86c7b434635b4e725a1b585958f017e16d6008b9c8
+size 14244

scheduler.pt ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:3eb458428fb4d710cc6c3e0aba863084014be047e8d6d62df77ae7d341858cd2
+size 1064

special_tokens_map.json ADDED Viewed

	@@ -0,0 +1,24 @@

+{
+  "bos_token": {
+    "content": "<s>",
+    "lstrip": false,
+    "normalized": false,
+    "rstrip": false,
+    "single_word": false
+  },
+  "eos_token": {
+    "content": "</s>",
+    "lstrip": false,
+    "normalized": false,
+    "rstrip": false,
+    "single_word": false
+  },
+  "pad_token": "<unk>",
+  "unk_token": {
+    "content": "<unk>",
+    "lstrip": false,
+    "normalized": false,
+    "rstrip": false,
+    "single_word": false
+  }
+}

tokenizer.model ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:4d88bdadaa2a065aa7c6e18a4b5999ce4c76cec14d9fea882102e7b4931d7ef0
+size 779539

tokenizer_config.json ADDED Viewed

	@@ -0,0 +1,45 @@

+{
+  "add_bos_token": true,
+  "add_eos_token": false,
+  "add_prefix_space": true,
+  "added_tokens_decoder": {
+    "0": {
+      "content": "<unk>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "1": {
+      "content": "<s>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "2": {
+      "content": "</s>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    }
+  },
+  "additional_special_tokens": [],
+  "bos_token": "<s>",
+  "chat_template": "{% if not add_generation_prompt is defined %}{% set add_generation_prompt = false %}{% endif %}{{ bos_token }}{% for message in messages %}{{'<|im_start|>' + message['role'] + '\n' + message['content'] + '</s>'}}{% endfor %}{% if add_generation_prompt %}{{ '<|im_start|>assistant\n' }}{% endif %}",
+  "clean_up_tokenization_spaces": false,
+  "eos_token": "</s>",
+  "legacy": true,
+  "model_max_length": 4096,
+  "pad_token": "<unk>",
+  "padding_side": "right",
+  "sp_model_kwargs": {},
+  "spaces_between_special_tokens": false,
+  "tokenizer_class": "LlamaTokenizer",
+  "unk_token": "<unk>",
+  "use_default_system_prompt": false
+}

trainer_state.json ADDED Viewed

	@@ -0,0 +1,3696 @@

+{
+  "best_metric": null,
+  "best_model_checkpoint": null,
+  "epoch": 0.3879728419010669,
+  "eval_steps": 500,
+  "global_step": 525,
+  "is_hyper_param_search": false,
+  "is_local_process_zero": true,
+  "is_world_process_zero": true,
+  "log_history": [
+    {
+      "epoch": 0.0,
+      "grad_norm": 1.9372913499539586,
+      "learning_rate": 2.4390243902439027e-06,
+      "loss": 1.6191,
+      "step": 1
+    },
+    {
+      "epoch": 0.0,
+      "grad_norm": 1.8879048490473127,
+      "learning_rate": 4.8780487804878055e-06,
+      "loss": 1.6982,
+      "step": 2
+    },
+    {
+      "epoch": 0.0,
+      "grad_norm": 1.857307355920545,
+      "learning_rate": 7.317073170731707e-06,
+      "loss": 1.6724,
+      "step": 3
+    },
+    {
+      "epoch": 0.0,
+      "grad_norm": 1.808250401091683,
+      "learning_rate": 9.756097560975611e-06,
+      "loss": 1.647,
+      "step": 4
+    },
+    {
+      "epoch": 0.0,
+      "grad_norm": 2.5133500505596453,
+      "learning_rate": 1.2195121951219513e-05,
+      "loss": 1.6079,
+      "step": 5
+    },
+    {
+      "epoch": 0.0,
+      "grad_norm": 1.2734146289947597,
+      "learning_rate": 1.4634146341463415e-05,
+      "loss": 1.5908,
+      "step": 6
+    },
+    {
+      "epoch": 0.01,
+      "grad_norm": 1.1812917040861377,
+      "learning_rate": 1.707317073170732e-05,
+      "loss": 1.5518,
+      "step": 7
+    },
+    {
+      "epoch": 0.01,
+      "grad_norm": 1.293637431287248,
+      "learning_rate": 1.9512195121951222e-05,
+      "loss": 1.5952,
+      "step": 8
+    },
+    {
+      "epoch": 0.01,
+      "grad_norm": 1.1620676440097686,
+      "learning_rate": 2.1951219512195124e-05,
+      "loss": 1.5493,
+      "step": 9
+    },
+    {
+      "epoch": 0.01,
+      "grad_norm": 1.3191260666446372,
+      "learning_rate": 2.4390243902439026e-05,
+      "loss": 1.5625,
+      "step": 10
+    },
+    {
+      "epoch": 0.01,
+      "grad_norm": 1.182981202097366,
+      "learning_rate": 2.682926829268293e-05,
+      "loss": 1.5498,
+      "step": 11
+    },
+    {
+      "epoch": 0.01,
+      "grad_norm": 1.0724491677903074,
+      "learning_rate": 2.926829268292683e-05,
+      "loss": 1.5547,
+      "step": 12
+    },
+    {
+      "epoch": 0.01,
+      "grad_norm": 0.9434780094091623,
+      "learning_rate": 3.170731707317073e-05,
+      "loss": 1.5327,
+      "step": 13
+    },
+    {
+      "epoch": 0.01,
+      "grad_norm": 1.0202543546064133,
+      "learning_rate": 3.414634146341464e-05,
+      "loss": 1.5933,
+      "step": 14
+    },
+    {
+      "epoch": 0.01,
+      "grad_norm": 0.996865818341891,
+      "learning_rate": 3.6585365853658535e-05,
+      "loss": 1.5796,
+      "step": 15
+    },
+    {
+      "epoch": 0.01,
+      "grad_norm": 0.9288105887086908,
+      "learning_rate": 3.9024390243902444e-05,
+      "loss": 1.4609,
+      "step": 16
+    },
+    {
+      "epoch": 0.01,
+      "grad_norm": 0.9726608694986103,
+      "learning_rate": 4.146341463414634e-05,
+      "loss": 1.5161,
+      "step": 17
+    },
+    {
+      "epoch": 0.01,
+      "grad_norm": 0.8619245275928736,
+      "learning_rate": 4.390243902439025e-05,
+      "loss": 1.5122,
+      "step": 18
+    },
+    {
+      "epoch": 0.01,
+      "grad_norm": 0.9215398746800475,
+      "learning_rate": 4.634146341463415e-05,
+      "loss": 1.5078,
+      "step": 19
+    },
+    {
+      "epoch": 0.01,
+      "grad_norm": 0.903097203515963,
+      "learning_rate": 4.878048780487805e-05,
+      "loss": 1.4502,
+      "step": 20
+    },
+    {
+      "epoch": 0.02,
+      "grad_norm": 0.8761498232482394,
+      "learning_rate": 5.121951219512195e-05,
+      "loss": 1.4893,
+      "step": 21
+    },
+    {
+      "epoch": 0.02,
+      "grad_norm": 0.8353385747464918,
+      "learning_rate": 5.365853658536586e-05,
+      "loss": 1.4717,
+      "step": 22
+    },
+    {
+      "epoch": 0.02,
+      "grad_norm": 0.8000291372477917,
+      "learning_rate": 5.6097560975609764e-05,
+      "loss": 1.481,
+      "step": 23
+    },
+    {
+      "epoch": 0.02,
+      "grad_norm": 0.8452088500727898,
+      "learning_rate": 5.853658536585366e-05,
+      "loss": 1.4644,
+      "step": 24
+    },
+    {
+      "epoch": 0.02,
+      "grad_norm": 0.8829309199222577,
+      "learning_rate": 6.097560975609756e-05,
+      "loss": 1.4868,
+      "step": 25
+    },
+    {
+      "epoch": 0.02,
+      "grad_norm": 0.8162327363449975,
+      "learning_rate": 6.341463414634146e-05,
+      "loss": 1.4883,
+      "step": 26
+    },
+    {
+      "epoch": 0.02,
+      "grad_norm": 0.7987925882960866,
+      "learning_rate": 6.585365853658538e-05,
+      "loss": 1.4268,
+      "step": 27
+    },
+    {
+      "epoch": 0.02,
+      "grad_norm": 0.7909140922467949,
+      "learning_rate": 6.829268292682928e-05,
+      "loss": 1.4873,
+      "step": 28
+    },
+    {
+      "epoch": 0.02,
+      "grad_norm": 0.7560592825415925,
+      "learning_rate": 7.073170731707317e-05,
+      "loss": 1.4116,
+      "step": 29
+    },
+    {
+      "epoch": 0.02,
+      "grad_norm": 0.7058796878894483,
+      "learning_rate": 7.317073170731707e-05,
+      "loss": 1.4023,
+      "step": 30
+    },
+    {
+      "epoch": 0.02,
+      "grad_norm": 0.7614550996113684,
+      "learning_rate": 7.560975609756099e-05,
+      "loss": 1.4312,
+      "step": 31
+    },
+    {
+      "epoch": 0.02,
+      "grad_norm": 0.7531993296256376,
+      "learning_rate": 7.804878048780489e-05,
+      "loss": 1.5024,
+      "step": 32
+    },
+    {
+      "epoch": 0.02,
+      "grad_norm": 0.7475795582718757,
+      "learning_rate": 8.048780487804879e-05,
+      "loss": 1.4363,
+      "step": 33
+    },
+    {
+      "epoch": 0.03,
+      "grad_norm": 0.7561530704205457,
+      "learning_rate": 8.292682926829268e-05,
+      "loss": 1.4873,
+      "step": 34
+    },
+    {
+      "epoch": 0.03,
+      "grad_norm": 0.7606234092420118,
+      "learning_rate": 8.53658536585366e-05,
+      "loss": 1.4204,
+      "step": 35
+    },
+    {
+      "epoch": 0.03,
+      "grad_norm": 0.7078849092381325,
+      "learning_rate": 8.78048780487805e-05,
+      "loss": 1.418,
+      "step": 36
+    },
+    {
+      "epoch": 0.03,
+      "grad_norm": 0.7583459620401868,
+      "learning_rate": 9.02439024390244e-05,
+      "loss": 1.4365,
+      "step": 37
+    },
+    {
+      "epoch": 0.03,
+      "grad_norm": 0.6479336734201823,
+      "learning_rate": 9.26829268292683e-05,
+      "loss": 1.3911,
+      "step": 38
+    },
+    {
+      "epoch": 0.03,
+      "grad_norm": 0.7138445522030739,
+      "learning_rate": 9.51219512195122e-05,
+      "loss": 1.4287,
+      "step": 39
+    },
+    {
+      "epoch": 0.03,
+      "grad_norm": 0.6772243082870256,
+      "learning_rate": 9.75609756097561e-05,
+      "loss": 1.3779,
+      "step": 40
+    },
+    {
+      "epoch": 0.03,
+      "grad_norm": 0.7001769060106223,
+      "learning_rate": 0.0001,
+      "loss": 1.3623,
+      "step": 41
+    },
+    {
+      "epoch": 0.03,
+      "grad_norm": 0.6593306891481673,
+      "learning_rate": 9.999985665852258e-05,
+      "loss": 1.3745,
+      "step": 42
+    },
+    {
+      "epoch": 0.03,
+      "grad_norm": 0.7111159325021309,
+      "learning_rate": 9.999942663491213e-05,
+      "loss": 1.3799,
+      "step": 43
+    },
+    {
+      "epoch": 0.03,
+      "grad_norm": 0.7023696510759943,
+      "learning_rate": 9.999870993163431e-05,
+      "loss": 1.4399,
+      "step": 44
+    },
+    {
+      "epoch": 0.03,
+      "grad_norm": 0.6736689337950041,
+      "learning_rate": 9.999770655279843e-05,
+      "loss": 1.4106,
+      "step": 45
+    },
+    {
+      "epoch": 0.03,
+      "grad_norm": 0.6746379997849087,
+      "learning_rate": 9.999641650415752e-05,
+      "loss": 1.4409,
+      "step": 46
+    },
+    {
+      "epoch": 0.03,
+      "grad_norm": 0.6615592598917496,
+      "learning_rate": 9.99948397931083e-05,
+      "loss": 1.3984,
+      "step": 47
+    },
+    {
+      "epoch": 0.04,
+      "grad_norm": 0.6538222984665192,
+      "learning_rate": 9.999297642869105e-05,
+      "loss": 1.4031,
+      "step": 48
+    },
+    {
+      "epoch": 0.04,
+      "grad_norm": 0.6129031974400467,
+      "learning_rate": 9.999082642158973e-05,
+      "loss": 1.396,
+      "step": 49
+    },
+    {
+      "epoch": 0.04,
+      "grad_norm": 0.6148818612628825,
+      "learning_rate": 9.998838978413168e-05,
+      "loss": 1.3574,
+      "step": 50
+    },
+    {
+      "epoch": 0.04,
+      "grad_norm": 0.6869612852614861,
+      "learning_rate": 9.99856665302878e-05,
+      "loss": 1.3762,
+      "step": 51
+    },
+    {
+      "epoch": 0.04,
+      "grad_norm": 0.7410178778694718,
+      "learning_rate": 9.998265667567226e-05,
+      "loss": 1.3481,
+      "step": 52
+    },
+    {
+      "epoch": 0.04,
+      "grad_norm": 0.6380516168920353,
+      "learning_rate": 9.997936023754257e-05,
+      "loss": 1.3513,
+      "step": 53
+    },
+    {
+      "epoch": 0.04,
+      "grad_norm": 0.6192351492724488,
+      "learning_rate": 9.997577723479938e-05,
+      "loss": 1.3662,
+      "step": 54
+    },
+    {
+      "epoch": 0.04,
+      "grad_norm": 0.633774941417789,
+      "learning_rate": 9.997190768798639e-05,
+      "loss": 1.3457,
+      "step": 55
+    },
+    {
+      "epoch": 0.04,
+      "grad_norm": 0.6016840416873676,
+      "learning_rate": 9.996775161929027e-05,
+      "loss": 1.3877,
+      "step": 56
+    },
+    {
+      "epoch": 0.04,
+      "grad_norm": 0.638026596140304,
+      "learning_rate": 9.99633090525405e-05,
+      "loss": 1.3892,
+      "step": 57
+    },
+    {
+      "epoch": 0.04,
+      "grad_norm": 0.5934027179170136,
+      "learning_rate": 9.995858001320926e-05,
+      "loss": 1.3223,
+      "step": 58
+    },
+    {
+      "epoch": 0.04,
+      "grad_norm": 0.6143195436309025,
+      "learning_rate": 9.995356452841122e-05,
+      "loss": 1.3862,
+      "step": 59
+    },
+    {
+      "epoch": 0.04,
+      "grad_norm": 0.6076935190423259,
+      "learning_rate": 9.994826262690347e-05,
+      "loss": 1.3584,
+      "step": 60
+    },
+    {
+      "epoch": 0.05,
+      "grad_norm": 0.6239965555110781,
+      "learning_rate": 9.994267433908533e-05,
+      "loss": 1.2771,
+      "step": 61
+    },
+    {
+      "epoch": 0.05,
+      "grad_norm": 0.5469871219286494,
+      "learning_rate": 9.99367996969981e-05,
+      "loss": 1.3579,
+      "step": 62
+    },
+    {
+      "epoch": 0.05,
+      "grad_norm": 0.5975500231663011,
+      "learning_rate": 9.9930638734325e-05,
+      "loss": 1.3872,
+      "step": 63
+    },
+    {
+      "epoch": 0.05,
+      "grad_norm": 0.6160102854784424,
+      "learning_rate": 9.992419148639087e-05,
+      "loss": 1.3831,
+      "step": 64
+    },
+    {
+      "epoch": 0.05,
+      "grad_norm": 0.5815474376554662,
+      "learning_rate": 9.991745799016206e-05,
+      "loss": 1.3745,
+      "step": 65
+    },
+    {
+      "epoch": 0.05,
+      "grad_norm": 0.5994591436721235,
+      "learning_rate": 9.991043828424612e-05,
+      "loss": 1.396,
+      "step": 66
+    },
+    {
+      "epoch": 0.05,
+      "grad_norm": 0.5896523240727669,
+      "learning_rate": 9.990313240889167e-05,
+      "loss": 1.3608,
+      "step": 67
+    },
+    {
+      "epoch": 0.05,
+      "grad_norm": 0.6062100949214702,
+      "learning_rate": 9.989554040598807e-05,
+      "loss": 1.2996,
+      "step": 68
+    },
+    {
+      "epoch": 0.05,
+      "grad_norm": 0.5941049216825265,
+      "learning_rate": 9.988766231906533e-05,
+      "loss": 1.4106,
+      "step": 69
+    },
+    {
+      "epoch": 0.05,
+      "grad_norm": 0.5604128113953568,
+      "learning_rate": 9.987949819329365e-05,
+      "loss": 1.3931,
+      "step": 70
+    },
+    {
+      "epoch": 0.05,
+      "grad_norm": 0.5519277490096212,
+      "learning_rate": 9.98710480754834e-05,
+      "loss": 1.3691,
+      "step": 71
+    },
+    {
+      "epoch": 0.05,
+      "grad_norm": 0.5900021330626725,
+      "learning_rate": 9.986231201408467e-05,
+      "loss": 1.4058,
+      "step": 72
+    },
+    {
+      "epoch": 0.05,
+      "grad_norm": 0.5699754681306506,
+      "learning_rate": 9.985329005918702e-05,
+      "loss": 1.355,
+      "step": 73
+    },
+    {
+      "epoch": 0.05,
+      "grad_norm": 0.593149750992695,
+      "learning_rate": 9.98439822625193e-05,
+      "loss": 1.3545,
+      "step": 74
+    },
+    {
+      "epoch": 0.06,
+      "grad_norm": 0.5824626045065218,
+      "learning_rate": 9.983438867744923e-05,
+      "loss": 1.3896,
+      "step": 75
+    },
+    {
+      "epoch": 0.06,
+      "grad_norm": 0.5900786393120402,
+      "learning_rate": 9.982450935898316e-05,
+      "loss": 1.3716,
+      "step": 76
+    },
+    {
+      "epoch": 0.06,
+      "grad_norm": 0.5688141367114475,
+      "learning_rate": 9.981434436376572e-05,
+      "loss": 1.3921,
+      "step": 77
+    },
+    {
+      "epoch": 0.06,
+      "grad_norm": 0.557565379686218,
+      "learning_rate": 9.980389375007955e-05,
+      "loss": 1.3506,
+      "step": 78
+    },
+    {
+      "epoch": 0.06,
+      "grad_norm": 0.5740715320740841,
+      "learning_rate": 9.979315757784488e-05,
+      "loss": 1.2917,
+      "step": 79
+    },
+    {
+      "epoch": 0.06,
+      "grad_norm": 0.5717745274109229,
+      "learning_rate": 9.97821359086193e-05,
+      "loss": 1.3154,
+      "step": 80
+    },
+    {
+      "epoch": 0.06,
+      "grad_norm": 0.609615875256831,
+      "learning_rate": 9.977082880559725e-05,
+      "loss": 1.3328,
+      "step": 81
+    },
+    {
+      "epoch": 0.06,
+      "grad_norm": 0.5777864702702744,
+      "learning_rate": 9.975923633360985e-05,
+      "loss": 1.3599,
+      "step": 82
+    },
+    {
+      "epoch": 0.06,
+      "grad_norm": 0.575948499045498,
+      "learning_rate": 9.974735855912436e-05,
+      "loss": 1.4038,
+      "step": 83
+    },
+    {
+      "epoch": 0.06,
+      "grad_norm": 0.550693122074238,
+      "learning_rate": 9.97351955502439e-05,
+      "loss": 1.3203,
+      "step": 84
+    },
+    {
+      "epoch": 0.06,
+      "grad_norm": 0.5561601283605949,
+      "learning_rate": 9.972274737670701e-05,
+      "loss": 1.3477,
+      "step": 85
+    },
+    {
+      "epoch": 0.06,
+      "grad_norm": 0.5601251180421914,
+      "learning_rate": 9.971001410988728e-05,
+      "loss": 1.333,
+      "step": 86
+    },
+    {
+      "epoch": 0.06,
+      "grad_norm": 0.6207004745075507,
+      "learning_rate": 9.969699582279292e-05,
+      "loss": 1.4048,
+      "step": 87
+    },
+    {
+      "epoch": 0.07,
+      "grad_norm": 0.5475040554880181,
+      "learning_rate": 9.968369259006634e-05,
+      "loss": 1.3208,
+      "step": 88
+    },
+    {
+      "epoch": 0.07,
+      "grad_norm": 0.6054670378552847,
+      "learning_rate": 9.967010448798375e-05,
+      "loss": 1.4131,
+      "step": 89
+    },
+    {
+      "epoch": 0.07,
+      "grad_norm": 0.5486336748948858,
+      "learning_rate": 9.965623159445471e-05,
+      "loss": 1.3843,
+      "step": 90
+    },
+    {
+      "epoch": 0.07,
+      "grad_norm": 0.585603864758025,
+      "learning_rate": 9.964207398902163e-05,
+      "loss": 1.3186,
+      "step": 91
+    },
+    {
+      "epoch": 0.07,
+      "grad_norm": 0.5412960874208915,
+      "learning_rate": 9.96276317528594e-05,
+      "loss": 1.2861,
+      "step": 92
+    },
+    {
+      "epoch": 0.07,
+      "grad_norm": 0.5442105369162202,
+      "learning_rate": 9.96129049687749e-05,
+      "loss": 1.3262,
+      "step": 93
+    },
+    {
+      "epoch": 0.07,
+      "grad_norm": 0.5816978676309428,
+      "learning_rate": 9.959789372120649e-05,
+      "loss": 1.3279,
+      "step": 94
+    },
+    {
+      "epoch": 0.07,
+      "grad_norm": 0.5557519862862452,
+      "learning_rate": 9.958259809622352e-05,
+      "loss": 1.3672,
+      "step": 95
+    },
+    {
+      "epoch": 0.07,
+      "grad_norm": 0.5666965195077155,
+      "learning_rate": 9.956701818152591e-05,
+      "loss": 1.3203,
+      "step": 96
+    },
+    {
+      "epoch": 0.07,
+      "grad_norm": 0.5354511291609182,
+      "learning_rate": 9.955115406644356e-05,
+      "loss": 1.3081,
+      "step": 97
+    },
+    {
+      "epoch": 0.07,
+      "grad_norm": 0.5685729288533676,
+      "learning_rate": 9.953500584193592e-05,
+      "loss": 1.3452,
+      "step": 98
+    },
+    {
+      "epoch": 0.07,
+      "grad_norm": 0.5922446508548838,
+      "learning_rate": 9.95185736005914e-05,
+      "loss": 1.3682,
+      "step": 99
+    },
+    {
+      "epoch": 0.07,
+      "grad_norm": 0.5837642463681222,
+      "learning_rate": 9.950185743662685e-05,
+      "loss": 1.3691,
+      "step": 100
+    },
+    {
+      "epoch": 0.07,
+      "grad_norm": 0.5761448966076219,
+      "learning_rate": 9.948485744588709e-05,
+      "loss": 1.3281,
+      "step": 101
+    },
+    {
+      "epoch": 0.08,
+      "grad_norm": 0.553490008569796,
+      "learning_rate": 9.946757372584423e-05,
+      "loss": 1.292,
+      "step": 102
+    },
+    {
+      "epoch": 0.08,
+      "grad_norm": 0.5686477341821499,
+      "learning_rate": 9.945000637559727e-05,
+      "loss": 1.3486,
+      "step": 103
+    },
+    {
+      "epoch": 0.08,
+      "grad_norm": 0.5772487636958804,
+      "learning_rate": 9.943215549587138e-05,
+      "loss": 1.3425,
+      "step": 104
+    },
+    {
+      "epoch": 0.08,
+      "grad_norm": 0.5758207849461601,
+      "learning_rate": 9.941402118901744e-05,
+      "loss": 1.3701,
+      "step": 105
+    },
+    {
+      "epoch": 0.08,
+      "grad_norm": 0.5582099537521159,
+      "learning_rate": 9.939560355901136e-05,
+      "loss": 1.3794,
+      "step": 106
+    },
+    {
+      "epoch": 0.08,
+      "grad_norm": 0.5336197399728324,
+      "learning_rate": 9.937690271145354e-05,
+      "loss": 1.3179,
+      "step": 107
+    },
+    {
+      "epoch": 0.08,
+      "grad_norm": 0.539037656457371,
+      "learning_rate": 9.935791875356832e-05,
+      "loss": 1.3071,
+      "step": 108
+    },
+    {
+      "epoch": 0.08,
+      "grad_norm": 0.5584770963502244,
+      "learning_rate": 9.933865179420321e-05,
+      "loss": 1.3945,
+      "step": 109
+    },
+    {
+      "epoch": 0.08,
+      "grad_norm": 0.5364047388288558,
+      "learning_rate": 9.931910194382837e-05,
+      "loss": 1.3462,
+      "step": 110
+    },
+    {
+      "epoch": 0.08,
+      "grad_norm": 0.5956933567804931,
+      "learning_rate": 9.929926931453599e-05,
+      "loss": 1.2585,
+      "step": 111
+    },
+    {
+      "epoch": 0.08,
+      "grad_norm": 0.5548298244830802,
+      "learning_rate": 9.927915402003964e-05,
+      "loss": 1.3765,
+      "step": 112
+    },
+    {
+      "epoch": 0.08,
+      "grad_norm": 0.5528131728204222,
+      "learning_rate": 9.92587561756735e-05,
+      "loss": 1.3452,
+      "step": 113
+    },
+    {
+      "epoch": 0.08,
+      "grad_norm": 0.5181397205586854,
+      "learning_rate": 9.92380758983919e-05,
+      "loss": 1.2671,
+      "step": 114
+    },
+    {
+      "epoch": 0.08,
+      "grad_norm": 0.5429954425262675,
+      "learning_rate": 9.921711330676848e-05,
+      "loss": 1.3574,
+      "step": 115
+    },
+    {
+      "epoch": 0.09,
+      "grad_norm": 0.5523231773869766,
+      "learning_rate": 9.919586852099562e-05,
+      "loss": 1.3184,
+      "step": 116
+    },
+    {
+      "epoch": 0.09,
+      "grad_norm": 0.5583959107787768,
+      "learning_rate": 9.917434166288364e-05,
+      "loss": 1.3442,
+      "step": 117
+    },
+    {
+      "epoch": 0.09,
+      "grad_norm": 0.5850081526075311,
+      "learning_rate": 9.915253285586024e-05,
+      "loss": 1.3477,
+      "step": 118
+    },
+    {
+      "epoch": 0.09,
+      "grad_norm": 0.5498743192645993,
+      "learning_rate": 9.913044222496966e-05,
+      "loss": 1.3398,
+      "step": 119
+    },
+    {
+      "epoch": 0.09,
+      "grad_norm": 0.5853233345937257,
+      "learning_rate": 9.910806989687206e-05,
+      "loss": 1.3276,
+      "step": 120
+    },
+    {
+      "epoch": 0.09,
+      "grad_norm": 0.559389561256856,
+      "learning_rate": 9.908541599984276e-05,
+      "loss": 1.3462,
+      "step": 121
+    },
+    {
+      "epoch": 0.09,
+      "grad_norm": 0.5298088621667728,
+      "learning_rate": 9.906248066377143e-05,
+      "loss": 1.2568,
+      "step": 122
+    },
+    {
+      "epoch": 0.09,
+      "grad_norm": 0.5731884986496186,
+      "learning_rate": 9.903926402016153e-05,
+      "loss": 1.3394,
+      "step": 123
+    },
+    {
+      "epoch": 0.09,
+      "grad_norm": 0.5549155957971303,
+      "learning_rate": 9.901576620212933e-05,
+      "loss": 1.311,
+      "step": 124
+    },
+    {
+      "epoch": 0.09,
+      "grad_norm": 0.5620092141236146,
+      "learning_rate": 9.899198734440335e-05,
+      "loss": 1.291,
+      "step": 125
+    },
+    {
+      "epoch": 0.09,
+      "grad_norm": 0.5405164924320079,
+      "learning_rate": 9.896792758332341e-05,
+      "loss": 1.248,
+      "step": 126
+    },
+    {
+      "epoch": 0.09,
+      "grad_norm": 0.5602202105737174,
+      "learning_rate": 9.894358705684002e-05,
+      "loss": 1.3115,
+      "step": 127
+    },
+    {
+      "epoch": 0.09,
+      "grad_norm": 0.5580296998093701,
+      "learning_rate": 9.891896590451344e-05,
+      "loss": 1.2947,
+      "step": 128
+    },
+    {
+      "epoch": 0.1,
+      "grad_norm": 0.5755635897570144,
+      "learning_rate": 9.889406426751296e-05,
+      "loss": 1.3086,
+      "step": 129
+    },
+    {
+      "epoch": 0.1,
+      "grad_norm": 0.6025851962917577,
+      "learning_rate": 9.886888228861608e-05,
+      "loss": 1.3447,
+      "step": 130
+    },
+    {
+      "epoch": 0.1,
+      "grad_norm": 0.5660419268974345,
+      "learning_rate": 9.88434201122077e-05,
+      "loss": 1.3232,
+      "step": 131
+    },
+    {
+      "epoch": 0.1,
+      "grad_norm": 0.5495648120402916,
+      "learning_rate": 9.881767788427925e-05,
+      "loss": 1.3096,
+      "step": 132
+    },
+    {
+      "epoch": 0.1,
+      "grad_norm": 0.5577872798163368,
+      "learning_rate": 9.879165575242787e-05,
+      "loss": 1.291,
+      "step": 133
+    },
+    {
+      "epoch": 0.1,
+      "grad_norm": 0.5540620803629338,
+      "learning_rate": 9.876535386585561e-05,
+      "loss": 1.335,
+      "step": 134
+    },
+    {
+      "epoch": 0.1,
+      "grad_norm": 0.5573425731012122,
+      "learning_rate": 9.873877237536853e-05,
+      "loss": 1.2327,
+      "step": 135
+    },
+    {
+      "epoch": 0.1,
+      "grad_norm": 0.5827857038389533,
+      "learning_rate": 9.871191143337582e-05,
+      "loss": 1.3333,
+      "step": 136
+    },
+    {
+      "epoch": 0.1,
+      "grad_norm": 0.5897883061496167,
+      "learning_rate": 9.868477119388896e-05,
+      "loss": 1.3076,
+      "step": 137
+    },
+    {
+      "epoch": 0.1,
+      "grad_norm": 0.5800275384221499,
+      "learning_rate": 9.865735181252085e-05,
+      "loss": 1.3188,
+      "step": 138
+    },
+    {
+      "epoch": 0.1,
+      "grad_norm": 0.5605765677262206,
+      "learning_rate": 9.862965344648485e-05,
+      "loss": 1.3086,
+      "step": 139
+    },
+    {
+      "epoch": 0.1,
+      "grad_norm": 0.5432447170586258,
+      "learning_rate": 9.860167625459398e-05,
+      "loss": 1.2861,
+      "step": 140
+    },
+    {
+      "epoch": 0.1,
+      "grad_norm": 0.5687257803544524,
+      "learning_rate": 9.85734203972599e-05,
+      "loss": 1.2839,
+      "step": 141
+    },
+    {
+      "epoch": 0.1,
+      "grad_norm": 0.5475328993701518,
+      "learning_rate": 9.854488603649206e-05,
+      "loss": 1.3169,
+      "step": 142
+    },
+    {
+      "epoch": 0.11,
+      "grad_norm": 0.5408143803639806,
+      "learning_rate": 9.851607333589677e-05,
+      "loss": 1.3374,
+      "step": 143
+    },
+    {
+      "epoch": 0.11,
+      "grad_norm": 0.5350053494827027,
+      "learning_rate": 9.848698246067623e-05,
+      "loss": 1.2888,
+      "step": 144
+    },
+    {
+      "epoch": 0.11,
+      "grad_norm": 0.5642075781884446,
+      "learning_rate": 9.84576135776276e-05,
+      "loss": 1.3105,
+      "step": 145
+    },
+    {
+      "epoch": 0.11,
+      "grad_norm": 0.5725161088840623,
+      "learning_rate": 9.842796685514203e-05,
+      "loss": 1.3516,
+      "step": 146
+    },
+    {
+      "epoch": 0.11,
+      "grad_norm": 0.5837888943455876,
+      "learning_rate": 9.839804246320375e-05,
+      "loss": 1.2871,
+      "step": 147
+    },
+    {
+      "epoch": 0.11,
+      "grad_norm": 0.5833329842842448,
+      "learning_rate": 9.836784057338899e-05,
+      "loss": 1.3232,
+      "step": 148
+    },
+    {
+      "epoch": 0.11,
+      "grad_norm": 0.5244172538585695,
+      "learning_rate": 9.833736135886512e-05,
+      "loss": 1.2568,
+      "step": 149
+    },
+    {
+      "epoch": 0.11,
+      "grad_norm": 0.5163576076330887,
+      "learning_rate": 9.830660499438955e-05,
+      "loss": 1.2759,
+      "step": 150
+    },
+    {
+      "epoch": 0.11,
+      "grad_norm": 0.5617840717093857,
+      "learning_rate": 9.827557165630879e-05,
+      "loss": 1.2524,
+      "step": 151
+    },
+    {
+      "epoch": 0.11,
+      "grad_norm": 0.547220410155329,
+      "learning_rate": 9.824426152255741e-05,
+      "loss": 1.312,
+      "step": 152
+    },
+    {
+      "epoch": 0.11,
+      "grad_norm": 0.5715922980351898,
+      "learning_rate": 9.821267477265705e-05,
+      "loss": 1.335,
+      "step": 153
+    },
+    {
+      "epoch": 0.11,
+      "grad_norm": 0.5626236612178414,
+      "learning_rate": 9.818081158771538e-05,
+      "loss": 1.3633,
+      "step": 154
+    },
+    {
+      "epoch": 0.11,
+      "grad_norm": 0.556817713740677,
+      "learning_rate": 9.814867215042502e-05,
+      "loss": 1.3345,
+      "step": 155
+    },
+    {
+      "epoch": 0.12,
+      "grad_norm": 0.5658424328358594,
+      "learning_rate": 9.811625664506259e-05,
+      "loss": 1.3325,
+      "step": 156
+    },
+    {
+      "epoch": 0.12,
+      "grad_norm": 0.5518987143292007,
+      "learning_rate": 9.808356525748748e-05,
+      "loss": 1.3179,
+      "step": 157
+    },
+    {
+      "epoch": 0.12,
+      "grad_norm": 0.5509045139485853,
+      "learning_rate": 9.805059817514101e-05,
+      "loss": 1.3276,
+      "step": 158
+    },
+    {
+      "epoch": 0.12,
+      "grad_norm": 0.5612999607711056,
+      "learning_rate": 9.801735558704517e-05,
+      "loss": 1.2192,
+      "step": 159
+    },
+    {
+      "epoch": 0.12,
+      "grad_norm": 0.530326353544212,
+      "learning_rate": 9.798383768380164e-05,
+      "loss": 1.2988,
+      "step": 160
+    },
+    {
+      "epoch": 0.12,
+      "grad_norm": 0.5524425336112486,
+      "learning_rate": 9.795004465759065e-05,
+      "loss": 1.2622,
+      "step": 161
+    },
+    {
+      "epoch": 0.12,
+      "grad_norm": 0.5121240819278214,
+      "learning_rate": 9.791597670216989e-05,
+      "loss": 1.2603,
+      "step": 162
+    },
+    {
+      "epoch": 0.12,
+      "grad_norm": 0.5262701595678754,
+      "learning_rate": 9.78816340128734e-05,
+      "loss": 1.22,
+      "step": 163
+    },
+    {
+      "epoch": 0.12,
+      "grad_norm": 0.5866254674193113,
+      "learning_rate": 9.784701678661045e-05,
+      "loss": 1.311,
+      "step": 164
+    },
+    {
+      "epoch": 0.12,
+      "grad_norm": 0.567120419528464,
+      "learning_rate": 9.781212522186443e-05,
+      "loss": 1.3145,
+      "step": 165
+    },
+    {
+      "epoch": 0.12,
+      "grad_norm": 0.5704512174009239,
+      "learning_rate": 9.777695951869164e-05,
+      "loss": 1.2612,
+      "step": 166
+    },
+    {
+      "epoch": 0.12,
+      "grad_norm": 0.5359884622353506,
+      "learning_rate": 9.774151987872027e-05,
+      "loss": 1.2117,
+      "step": 167
+    },
+    {
+      "epoch": 0.12,
+      "grad_norm": 0.5772321074843504,
+      "learning_rate": 9.770580650514914e-05,
+      "loss": 1.3525,
+      "step": 168
+    },
+    {
+      "epoch": 0.12,
+      "grad_norm": 0.5316876920831217,
+      "learning_rate": 9.766981960274653e-05,
+      "loss": 1.3442,
+      "step": 169
+    },
+    {
+      "epoch": 0.13,
+      "grad_norm": 0.5622203218145027,
+      "learning_rate": 9.763355937784909e-05,
+      "loss": 1.2964,
+      "step": 170
+    },
+    {
+      "epoch": 0.13,
+      "grad_norm": 0.5614932814360857,
+      "learning_rate": 9.759702603836059e-05,
+      "loss": 1.3389,
+      "step": 171
+    },
+    {
+      "epoch": 0.13,
+      "grad_norm": 0.568962837143467,
+      "learning_rate": 9.756021979375071e-05,
+      "loss": 1.3174,
+      "step": 172
+    },
+    {
+      "epoch": 0.13,
+      "grad_norm": 0.5382419139994956,
+      "learning_rate": 9.752314085505395e-05,
+      "loss": 1.3125,
+      "step": 173
+    },
+    {
+      "epoch": 0.13,
+      "grad_norm": 0.5677837729549118,
+      "learning_rate": 9.748578943486828e-05,
+      "loss": 1.2871,
+      "step": 174
+    },
+    {
+      "epoch": 0.13,
+      "grad_norm": 0.5602612877442024,
+      "learning_rate": 9.744816574735405e-05,
+      "loss": 1.3438,
+      "step": 175
+    },
+    {
+      "epoch": 0.13,
+      "grad_norm": 0.5735194400650546,
+      "learning_rate": 9.74102700082326e-05,
+      "loss": 1.3208,
+      "step": 176
+    },
+    {
+      "epoch": 0.13,
+      "grad_norm": 0.5670876099448275,
+      "learning_rate": 9.737210243478521e-05,
+      "loss": 1.2969,
+      "step": 177
+    },
+    {
+      "epoch": 0.13,
+      "grad_norm": 0.5450536272385241,
+      "learning_rate": 9.733366324585175e-05,
+      "loss": 1.2673,
+      "step": 178
+    },
+    {
+      "epoch": 0.13,
+      "grad_norm": 0.5340701964695135,
+      "learning_rate": 9.72949526618294e-05,
+      "loss": 1.3403,
+      "step": 179
+    },
+    {
+      "epoch": 0.13,
+      "grad_norm": 0.5422933717116616,
+      "learning_rate": 9.725597090467144e-05,
+      "loss": 1.2539,
+      "step": 180
+    },
+    {
+      "epoch": 0.13,
+      "grad_norm": 0.5680150103490264,
+      "learning_rate": 9.721671819788602e-05,
+      "loss": 1.3149,
+      "step": 181
+    },
+    {
+      "epoch": 0.13,
+      "grad_norm": 0.560101859043945,
+      "learning_rate": 9.717719476653475e-05,
+      "loss": 1.321,
+      "step": 182
+    },
+    {
+      "epoch": 0.14,
+      "grad_norm": 0.5267278121510764,
+      "learning_rate": 9.71374008372315e-05,
+      "loss": 1.2227,
+      "step": 183
+    },
+    {
+      "epoch": 0.14,
+      "grad_norm": 0.5687530339596342,
+      "learning_rate": 9.709733663814113e-05,
+      "loss": 1.3159,
+      "step": 184
+    },
+    {
+      "epoch": 0.14,
+      "grad_norm": 0.5321503974993333,
+      "learning_rate": 9.705700239897809e-05,
+      "loss": 1.3188,
+      "step": 185
+    },
+    {
+      "epoch": 0.14,
+      "grad_norm": 0.5593956329311583,
+      "learning_rate": 9.701639835100513e-05,
+      "loss": 1.249,
+      "step": 186
+    },
+    {
+      "epoch": 0.14,
+      "grad_norm": 0.5591047172889141,
+      "learning_rate": 9.697552472703205e-05,
+      "loss": 1.2756,
+      "step": 187
+    },
+    {
+      "epoch": 0.14,
+      "grad_norm": 0.5543029039316694,
+      "learning_rate": 9.693438176141425e-05,
+      "loss": 1.2915,
+      "step": 188
+    },
+    {
+      "epoch": 0.14,
+      "grad_norm": 0.5494961227055172,
+      "learning_rate": 9.68929696900515e-05,
+      "loss": 1.313,
+      "step": 189
+    },
+    {
+      "epoch": 0.14,
+      "grad_norm": 0.5541252042617403,
+      "learning_rate": 9.685128875038647e-05,
+      "loss": 1.2754,
+      "step": 190
+    },
+    {
+      "epoch": 0.14,
+      "grad_norm": 0.5163534781462605,
+      "learning_rate": 9.680933918140348e-05,
+      "loss": 1.2681,
+      "step": 191
+    },
+    {
+      "epoch": 0.14,
+      "grad_norm": 0.537157272716453,
+      "learning_rate": 9.676712122362706e-05,
+      "loss": 1.2551,
+      "step": 192
+    },
+    {
+      "epoch": 0.14,
+      "grad_norm": 0.5397175193183968,
+      "learning_rate": 9.672463511912055e-05,
+      "loss": 1.2822,
+      "step": 193
+    },
+    {
+      "epoch": 0.14,
+      "grad_norm": 0.5488691397441863,
+      "learning_rate": 9.668188111148484e-05,
+      "loss": 1.283,
+      "step": 194
+    },
+    {
+      "epoch": 0.14,
+      "grad_norm": 0.5905761212464122,
+      "learning_rate": 9.66388594458568e-05,
+      "loss": 1.2896,
+      "step": 195
+    },
+    {
+      "epoch": 0.14,
+      "grad_norm": 0.580369444338734,
+      "learning_rate": 9.659557036890801e-05,
+      "loss": 1.3416,
+      "step": 196
+    },
+    {
+      "epoch": 0.15,
+      "grad_norm": 0.5262728809847318,
+      "learning_rate": 9.655201412884327e-05,
+      "loss": 1.2554,
+      "step": 197
+    },
+    {
+      "epoch": 0.15,
+      "grad_norm": 0.5375550652008795,
+      "learning_rate": 9.650819097539922e-05,
+      "loss": 1.2612,
+      "step": 198
+    },
+    {
+      "epoch": 0.15,
+      "grad_norm": 0.5208197207069616,
+      "learning_rate": 9.646410115984289e-05,
+      "loss": 1.2358,
+      "step": 199
+    },
+    {
+      "epoch": 0.15,
+      "grad_norm": 0.5409371788748774,
+      "learning_rate": 9.641974493497024e-05,
+      "loss": 1.3262,
+      "step": 200
+    },
+    {
+      "epoch": 0.15,
+      "grad_norm": 0.5389211233425135,
+      "learning_rate": 9.637512255510475e-05,
+      "loss": 1.2729,
+      "step": 201
+    },
+    {
+      "epoch": 0.15,
+      "grad_norm": 0.5501782779153785,
+      "learning_rate": 9.633023427609591e-05,
+      "loss": 1.2322,
+      "step": 202
+    },
+    {
+      "epoch": 0.15,
+      "grad_norm": 0.5678681105856288,
+      "learning_rate": 9.628508035531785e-05,
+      "loss": 1.3721,
+      "step": 203
+    },
+    {
+      "epoch": 0.15,
+      "grad_norm": 0.5559621306210715,
+      "learning_rate": 9.623966105166772e-05,
+      "loss": 1.3267,
+      "step": 204
+    },
+    {
+      "epoch": 0.15,
+      "grad_norm": 0.5417687907113425,
+      "learning_rate": 9.619397662556435e-05,
+      "loss": 1.2666,
+      "step": 205
+    },
+    {
+      "epoch": 0.15,
+      "grad_norm": 0.5546614199696198,
+      "learning_rate": 9.614802733894665e-05,
+      "loss": 1.3389,
+      "step": 206
+    },
+    {
+      "epoch": 0.15,
+      "grad_norm": 0.5594799442475286,
+      "learning_rate": 9.610181345527217e-05,
+      "loss": 1.2671,
+      "step": 207
+    },
+    {
+      "epoch": 0.15,
+      "grad_norm": 0.5852167375394156,
+      "learning_rate": 9.605533523951558e-05,
+      "loss": 1.3335,
+      "step": 208
+    },
+    {
+      "epoch": 0.15,
+      "grad_norm": 0.5465110917787175,
+      "learning_rate": 9.600859295816708e-05,
+      "loss": 1.3096,
+      "step": 209
+    },
+    {
+      "epoch": 0.16,
+      "grad_norm": 0.5704616015169348,
+      "learning_rate": 9.596158687923104e-05,
+      "loss": 1.3022,
+      "step": 210
+    },
+    {
+      "epoch": 0.16,
+      "grad_norm": 0.5617616139462727,
+      "learning_rate": 9.591431727222424e-05,
+      "loss": 1.3159,
+      "step": 211
+    },
+    {
+      "epoch": 0.16,
+      "grad_norm": 0.5465602681324426,
+      "learning_rate": 9.586678440817453e-05,
+      "loss": 1.2708,
+      "step": 212
+    },
+    {
+      "epoch": 0.16,
+      "grad_norm": 0.5864421378413351,
+      "learning_rate": 9.581898855961912e-05,
+      "loss": 1.2607,
+      "step": 213
+    },
+    {
+      "epoch": 0.16,
+      "grad_norm": 0.556548001041405,
+      "learning_rate": 9.577093000060312e-05,
+      "loss": 1.3081,
+      "step": 214
+    },
+    {
+      "epoch": 0.16,
+      "grad_norm": 0.5642842704902283,
+      "learning_rate": 9.572260900667794e-05,
+      "loss": 1.2759,
+      "step": 215
+    },
+    {
+      "epoch": 0.16,
+      "grad_norm": 0.5486665255067006,
+      "learning_rate": 9.567402585489963e-05,
+      "loss": 1.2104,
+      "step": 216
+    },
+    {
+      "epoch": 0.16,
+      "grad_norm": 0.5361207508020517,
+      "learning_rate": 9.56251808238275e-05,
+      "loss": 1.2451,
+      "step": 217
+    },
+    {
+      "epoch": 0.16,
+      "grad_norm": 0.5149380805556683,
+      "learning_rate": 9.557607419352226e-05,
+      "loss": 1.2778,
+      "step": 218
+    },
+    {
+      "epoch": 0.16,
+      "grad_norm": 0.5469266902951428,
+      "learning_rate": 9.552670624554461e-05,
+      "loss": 1.2617,
+      "step": 219
+    },
+    {
+      "epoch": 0.16,
+      "grad_norm": 0.5430295319416,
+      "learning_rate": 9.54770772629535e-05,
+      "loss": 1.2915,
+      "step": 220
+    },
+    {
+      "epoch": 0.16,
+      "grad_norm": 0.5744217791056692,
+      "learning_rate": 9.542718753030463e-05,
+      "loss": 1.3281,
+      "step": 221
+    },
+    {
+      "epoch": 0.16,
+      "grad_norm": 0.5587545969611539,
+      "learning_rate": 9.537703733364871e-05,
+      "loss": 1.2837,
+      "step": 222
+    },
+    {
+      "epoch": 0.16,
+      "grad_norm": 0.5288053303373643,
+      "learning_rate": 9.532662696052985e-05,
+      "loss": 1.2949,
+      "step": 223
+    },
+    {
+      "epoch": 0.17,
+      "grad_norm": 0.5791175310063906,
+      "learning_rate": 9.527595669998399e-05,
+      "loss": 1.2917,
+      "step": 224
+    },
+    {
+      "epoch": 0.17,
+      "grad_norm": 0.5250029719207272,
+      "learning_rate": 9.522502684253709e-05,
+      "loss": 1.2375,
+      "step": 225
+    },
+    {
+      "epoch": 0.17,
+      "grad_norm": 0.5177601049436101,
+      "learning_rate": 9.517383768020361e-05,
+      "loss": 1.2695,
+      "step": 226
+    },
+    {
+      "epoch": 0.17,
+      "grad_norm": 0.5554993860583297,
+      "learning_rate": 9.512238950648474e-05,
+      "loss": 1.2917,
+      "step": 227
+    },
+    {
+      "epoch": 0.17,
+      "grad_norm": 0.5738329488665082,
+      "learning_rate": 9.507068261636679e-05,
+      "loss": 1.2944,
+      "step": 228
+    },
+    {
+      "epoch": 0.17,
+      "grad_norm": 0.5562896023700302,
+      "learning_rate": 9.501871730631942e-05,
+      "loss": 1.3296,
+      "step": 229
+    },
+    {
+      "epoch": 0.17,
+      "grad_norm": 0.5416347008024398,
+      "learning_rate": 9.496649387429404e-05,
+      "loss": 1.2437,
+      "step": 230
+    },
+    {
+      "epoch": 0.17,
+      "grad_norm": 0.5699356753997783,
+      "learning_rate": 9.491401261972195e-05,
+      "loss": 1.2705,
+      "step": 231
+    },
+    {
+      "epoch": 0.17,
+      "grad_norm": 0.5481624625613764,
+      "learning_rate": 9.486127384351282e-05,
+      "loss": 1.3779,
+      "step": 232
+    },
+    {
+      "epoch": 0.17,
+      "grad_norm": 0.5688206917165098,
+      "learning_rate": 9.480827784805278e-05,
+      "loss": 1.2754,
+      "step": 233
+    },
+    {
+      "epoch": 0.17,
+      "grad_norm": 0.5490377714658476,
+      "learning_rate": 9.475502493720283e-05,
+      "loss": 1.3125,
+      "step": 234
+    },
+    {
+      "epoch": 0.17,
+      "grad_norm": 0.5355672804730123,
+      "learning_rate": 9.470151541629699e-05,
+      "loss": 1.2627,
+      "step": 235
+    },
+    {
+      "epoch": 0.17,
+      "grad_norm": 0.5905840590902287,
+      "learning_rate": 9.464774959214063e-05,
+      "loss": 1.3027,
+      "step": 236
+    },
+    {
+      "epoch": 0.18,
+      "grad_norm": 0.56064622426517,
+      "learning_rate": 9.459372777300864e-05,
+      "loss": 1.2065,
+      "step": 237
+    },
+    {
+      "epoch": 0.18,
+      "grad_norm": 0.5568610691565873,
+      "learning_rate": 9.45394502686437e-05,
+      "loss": 1.3223,
+      "step": 238
+    },
+    {
+      "epoch": 0.18,
+      "grad_norm": 0.5300725401389981,
+      "learning_rate": 9.448491739025454e-05,
+      "loss": 1.2805,
+      "step": 239
+    },
+    {
+      "epoch": 0.18,
+      "grad_norm": 0.5519662242216672,
+      "learning_rate": 9.44301294505141e-05,
+      "loss": 1.2371,
+      "step": 240
+    },
+    {
+      "epoch": 0.18,
+      "grad_norm": 0.5402101018249572,
+      "learning_rate": 9.437508676355773e-05,
+      "loss": 1.2749,
+      "step": 241
+    },
+    {
+      "epoch": 0.18,
+      "grad_norm": 0.5389383005608104,
+      "learning_rate": 9.431978964498143e-05,
+      "loss": 1.2876,
+      "step": 242
+    },
+    {
+      "epoch": 0.18,
+      "grad_norm": 0.5310718244911751,
+      "learning_rate": 9.426423841184005e-05,
+      "loss": 1.3057,
+      "step": 243
+    },
+    {
+      "epoch": 0.18,
+      "grad_norm": 0.5454082533825911,
+      "learning_rate": 9.420843338264542e-05,
+      "loss": 1.2578,
+      "step": 244
+    },
+    {
+      "epoch": 0.18,
+      "grad_norm": 0.565349361879851,
+      "learning_rate": 9.415237487736452e-05,
+      "loss": 1.3306,
+      "step": 245
+    },
+    {
+      "epoch": 0.18,
+      "grad_norm": 0.5224746893789486,
+      "learning_rate": 9.409606321741775e-05,
+      "loss": 1.2598,
+      "step": 246
+    },
+    {
+      "epoch": 0.18,
+      "grad_norm": 0.5440997273729092,
+      "learning_rate": 9.403949872567695e-05,
+      "loss": 1.2749,
+      "step": 247
+    },
+    {
+      "epoch": 0.18,
+      "grad_norm": 0.5668696203741111,
+      "learning_rate": 9.398268172646365e-05,
+      "loss": 1.2739,
+      "step": 248
+    },
+    {
+      "epoch": 0.18,
+      "grad_norm": 0.538410569856225,
+      "learning_rate": 9.392561254554713e-05,
+      "loss": 1.2734,
+      "step": 249
+    },
+    {
+      "epoch": 0.18,
+      "grad_norm": 0.5458663263053075,
+      "learning_rate": 9.386829151014262e-05,
+      "loss": 1.3101,
+      "step": 250
+    },
+    {
+      "epoch": 0.19,
+      "grad_norm": 0.537905713825921,
+      "learning_rate": 9.381071894890941e-05,
+      "loss": 1.2666,
+      "step": 251
+    },
+    {
+      "epoch": 0.19,
+      "grad_norm": 0.5288916095430457,
+      "learning_rate": 9.375289519194894e-05,
+      "loss": 1.2666,
+      "step": 252
+    },
+    {
+      "epoch": 0.19,
+      "grad_norm": 0.5335913282729025,
+      "learning_rate": 9.369482057080292e-05,
+      "loss": 1.2886,
+      "step": 253
+    },
+    {
+      "epoch": 0.19,
+      "grad_norm": 0.5523824410197196,
+      "learning_rate": 9.363649541845142e-05,
+      "loss": 1.2571,
+      "step": 254
+    },
+    {
+      "epoch": 0.19,
+      "grad_norm": 0.5912264857528259,
+      "learning_rate": 9.357792006931098e-05,
+      "loss": 1.261,
+      "step": 255
+    },
+    {
+      "epoch": 0.19,
+      "grad_norm": 0.5594499774840426,
+      "learning_rate": 9.35190948592327e-05,
+      "loss": 1.3027,
+      "step": 256
+    },
+    {
+      "epoch": 0.19,
+      "grad_norm": 0.5379207919206825,
+      "learning_rate": 9.346002012550027e-05,
+      "loss": 1.2983,
+      "step": 257
+    },
+    {
+      "epoch": 0.19,
+      "grad_norm": 0.5455629199690059,
+      "learning_rate": 9.340069620682806e-05,
+      "loss": 1.2695,
+      "step": 258
+    },
+    {
+      "epoch": 0.19,
+      "grad_norm": 0.5471737544580354,
+      "learning_rate": 9.334112344335924e-05,
+      "loss": 1.3047,
+      "step": 259
+    },
+    {
+      "epoch": 0.19,
+      "grad_norm": 0.5397100655209365,
+      "learning_rate": 9.328130217666366e-05,
+      "loss": 1.2896,
+      "step": 260
+    },
+    {
+      "epoch": 0.19,
+      "grad_norm": 0.5636004509867364,
+      "learning_rate": 9.322123274973613e-05,
+      "loss": 1.3501,
+      "step": 261
+    },
+    {
+      "epoch": 0.19,
+      "grad_norm": 0.5605154015144495,
+      "learning_rate": 9.316091550699424e-05,
+      "loss": 1.2983,
+      "step": 262
+    },
+    {
+      "epoch": 0.19,
+      "grad_norm": 0.5461515781521593,
+      "learning_rate": 9.310035079427651e-05,
+      "loss": 1.269,
+      "step": 263
+    },
+    {
+      "epoch": 0.2,
+      "grad_norm": 0.5175024878789147,
+      "learning_rate": 9.303953895884033e-05,
+      "loss": 1.1653,
+      "step": 264
+    },
+    {
+      "epoch": 0.2,
+      "grad_norm": 0.5224669601631107,
+      "learning_rate": 9.297848034936006e-05,
+      "loss": 1.2554,
+      "step": 265
+    },
+    {
+      "epoch": 0.2,
+      "grad_norm": 0.5444106809363777,
+      "learning_rate": 9.291717531592494e-05,
+      "loss": 1.293,
+      "step": 266
+    },
+    {
+      "epoch": 0.2,
+      "grad_norm": 0.5287552712313793,
+      "learning_rate": 9.285562421003715e-05,
+      "loss": 1.2651,
+      "step": 267
+    },
+    {
+      "epoch": 0.2,
+      "grad_norm": 0.5381309609110954,
+      "learning_rate": 9.279382738460971e-05,
+      "loss": 1.2812,
+      "step": 268
+    },
+    {
+      "epoch": 0.2,
+      "grad_norm": 0.5528803396804242,
+      "learning_rate": 9.273178519396459e-05,
+      "loss": 1.3149,
+      "step": 269
+    },
+    {
+      "epoch": 0.2,
+      "grad_norm": 0.5270531797880375,
+      "learning_rate": 9.266949799383053e-05,
+      "loss": 1.2615,
+      "step": 270
+    },
+    {
+      "epoch": 0.2,
+      "grad_norm": 0.5488129774725259,
+      "learning_rate": 9.260696614134114e-05,
+      "loss": 1.2837,
+      "step": 271
+    },
+    {
+      "epoch": 0.2,
+      "grad_norm": 0.5335083589116082,
+      "learning_rate": 9.254418999503271e-05,
+      "loss": 1.2339,
+      "step": 272
+    },
+    {
+      "epoch": 0.2,
+      "grad_norm": 0.5974061497388541,
+      "learning_rate": 9.248116991484229e-05,
+      "loss": 1.2825,
+      "step": 273
+    },
+    {
+      "epoch": 0.2,
+      "grad_norm": 0.5381713380415607,
+      "learning_rate": 9.241790626210549e-05,
+      "loss": 1.1895,
+      "step": 274
+    },
+    {
+      "epoch": 0.2,
+      "grad_norm": 0.5384430847504001,
+      "learning_rate": 9.235439939955457e-05,
+      "loss": 1.2358,
+      "step": 275
+    },
+    {
+      "epoch": 0.2,
+      "grad_norm": 0.5256588888016233,
+      "learning_rate": 9.229064969131621e-05,
+      "loss": 1.2407,
+      "step": 276
+    },
+    {
+      "epoch": 0.2,
+      "grad_norm": 0.5242296953154587,
+      "learning_rate": 9.222665750290953e-05,
+      "loss": 1.2832,
+      "step": 277
+    },
+    {
+      "epoch": 0.21,
+      "grad_norm": 0.5224106607183625,
+      "learning_rate": 9.216242320124388e-05,
+      "loss": 1.2388,
+      "step": 278
+    },
+    {
+      "epoch": 0.21,
+      "grad_norm": 0.540400861953043,
+      "learning_rate": 9.20979471546169e-05,
+      "loss": 1.2695,
+      "step": 279
+    },
+    {
+      "epoch": 0.21,
+      "grad_norm": 0.5289483661482471,
+      "learning_rate": 9.203322973271223e-05,
+      "loss": 1.2832,
+      "step": 280
+    },
+    {
+      "epoch": 0.21,
+      "grad_norm": 0.5376637104674151,
+      "learning_rate": 9.19682713065975e-05,
+      "loss": 1.2783,
+      "step": 281
+    },
+    {
+      "epoch": 0.21,
+      "grad_norm": 0.5547766359095799,
+      "learning_rate": 9.19030722487222e-05,
+      "loss": 1.2515,
+      "step": 282
+    },
+    {
+      "epoch": 0.21,
+      "grad_norm": 0.5431030883095361,
+      "learning_rate": 9.183763293291549e-05,
+      "loss": 1.2346,
+      "step": 283
+    },
+    {
+      "epoch": 0.21,
+      "grad_norm": 0.5767856753870191,
+      "learning_rate": 9.17719537343841e-05,
+      "loss": 1.2974,
+      "step": 284
+    },
+    {
+      "epoch": 0.21,
+      "grad_norm": 0.5356401648893151,
+      "learning_rate": 9.170603502971016e-05,
+      "loss": 1.2532,
+      "step": 285
+    },
+    {
+      "epoch": 0.21,
+      "grad_norm": 0.5528695803408737,
+      "learning_rate": 9.163987719684907e-05,
+      "loss": 1.3442,
+      "step": 286
+    },
+    {
+      "epoch": 0.21,
+      "grad_norm": 0.5356080125920785,
+      "learning_rate": 9.157348061512727e-05,
+      "loss": 1.2686,
+      "step": 287
+    },
+    {
+      "epoch": 0.21,
+      "grad_norm": 0.5778656916381988,
+      "learning_rate": 9.150684566524012e-05,
+      "loss": 1.2041,
+      "step": 288
+    },
+    {
+      "epoch": 0.21,
+      "grad_norm": 0.5328749801157324,
+      "learning_rate": 9.143997272924973e-05,
+      "loss": 1.2437,
+      "step": 289
+    },
+    {
+      "epoch": 0.21,
+      "grad_norm": 0.5656275076768376,
+      "learning_rate": 9.13728621905827e-05,
+      "loss": 1.2886,
+      "step": 290
+    },
+    {
+      "epoch": 0.22,
+      "grad_norm": 0.5655646337419664,
+      "learning_rate": 9.130551443402799e-05,
+      "loss": 1.2783,
+      "step": 291
+    },
+    {
+      "epoch": 0.22,
+      "grad_norm": 0.567975953014803,
+      "learning_rate": 9.123792984573466e-05,
+      "loss": 1.3223,
+      "step": 292
+    },
+    {
+      "epoch": 0.22,
+      "grad_norm": 0.5361585380833186,
+      "learning_rate": 9.117010881320973e-05,
+      "loss": 1.2231,
+      "step": 293
+    },
+    {
+      "epoch": 0.22,
+      "grad_norm": 0.5527612532950269,
+      "learning_rate": 9.110205172531585e-05,
+      "loss": 1.3506,
+      "step": 294
+    },
+    {
+      "epoch": 0.22,
+      "grad_norm": 0.5330323483779986,
+      "learning_rate": 9.103375897226918e-05,
+      "loss": 1.2974,
+      "step": 295
+    },
+    {
+      "epoch": 0.22,
+      "grad_norm": 0.541076058179259,
+      "learning_rate": 9.096523094563708e-05,
+      "loss": 1.2617,
+      "step": 296
+    },
+    {
+      "epoch": 0.22,
+      "grad_norm": 0.5340836977689315,
+      "learning_rate": 9.089646803833589e-05,
+      "loss": 1.2603,
+      "step": 297
+    },
+    {
+      "epoch": 0.22,
+      "grad_norm": 0.5383753245320845,
+      "learning_rate": 9.082747064462867e-05,
+      "loss": 1.2583,
+      "step": 298
+    },
+    {
+      "epoch": 0.22,
+      "grad_norm": 0.5192836861689345,
+      "learning_rate": 9.075823916012298e-05,
+      "loss": 1.2568,
+      "step": 299
+    },
+    {
+      "epoch": 0.22,
+      "grad_norm": 0.5744817919271316,
+      "learning_rate": 9.068877398176852e-05,
+      "loss": 1.2131,
+      "step": 300
+    },
+    {
+      "epoch": 0.22,
+      "grad_norm": 0.5323047093147705,
+      "learning_rate": 9.061907550785498e-05,
+      "loss": 1.2783,
+      "step": 301
+    },
+    {
+      "epoch": 0.22,
+      "grad_norm": 0.5607328564400242,
+      "learning_rate": 9.054914413800961e-05,
+      "loss": 1.3398,
+      "step": 302
+    },
+    {
+      "epoch": 0.22,
+      "grad_norm": 0.5782257895199574,
+      "learning_rate": 9.047898027319507e-05,
+      "loss": 1.2759,
+      "step": 303
+    },
+    {
+      "epoch": 0.22,
+      "grad_norm": 0.546644793451931,
+      "learning_rate": 9.040858431570702e-05,
+      "loss": 1.2632,
+      "step": 304
+    },
+    {
+      "epoch": 0.23,
+      "grad_norm": 0.5535852227341702,
+      "learning_rate": 9.033795666917191e-05,
+      "loss": 1.312,
+      "step": 305
+    },
+    {
+      "epoch": 0.23,
+      "grad_norm": 0.5371002551511538,
+      "learning_rate": 9.026709773854457e-05,
+      "loss": 1.2593,
+      "step": 306
+    },
+    {
+      "epoch": 0.23,
+      "grad_norm": 0.5394441228369942,
+      "learning_rate": 9.019600793010597e-05,
+      "loss": 1.269,
+      "step": 307
+    },
+    {
+      "epoch": 0.23,
+      "grad_norm": 0.5512445550522174,
+      "learning_rate": 9.012468765146079e-05,
+      "loss": 1.2686,
+      "step": 308
+    },
+    {
+      "epoch": 0.23,
+      "grad_norm": 0.5043850111181398,
+      "learning_rate": 9.005313731153524e-05,
+      "loss": 1.2363,
+      "step": 309
+    },
+    {
+      "epoch": 0.23,
+      "grad_norm": 0.5294693808157453,
+      "learning_rate": 8.998135732057458e-05,
+      "loss": 1.2725,
+      "step": 310
+    },
+    {
+      "epoch": 0.23,
+      "grad_norm": 0.5235449664008548,
+      "learning_rate": 8.990934809014077e-05,
+      "loss": 1.249,
+      "step": 311
+    },
+    {
+      "epoch": 0.23,
+      "grad_norm": 0.5228082226582549,
+      "learning_rate": 8.983711003311024e-05,
+      "loss": 1.2153,
+      "step": 312
+    },
+    {
+      "epoch": 0.23,
+      "grad_norm": 0.5525620828249341,
+      "learning_rate": 8.976464356367134e-05,
+      "loss": 1.2136,
+      "step": 313
+    },
+    {
+      "epoch": 0.23,
+      "grad_norm": 0.5605215996168639,
+      "learning_rate": 8.96919490973221e-05,
+      "loss": 1.271,
+      "step": 314
+    },
+    {
+      "epoch": 0.23,
+      "grad_norm": 0.5277359930208506,
+      "learning_rate": 8.961902705086785e-05,
+      "loss": 1.1836,
+      "step": 315
+    },
+    {
+      "epoch": 0.23,
+      "grad_norm": 0.5405930304733125,
+      "learning_rate": 8.954587784241871e-05,
+      "loss": 1.2705,
+      "step": 316
+    },
+    {
+      "epoch": 0.23,
+      "grad_norm": 0.5248476194932483,
+      "learning_rate": 8.947250189138731e-05,
+      "loss": 1.2607,
+      "step": 317
+    },
+    {
+      "epoch": 0.24,
+      "grad_norm": 0.573678896783169,
+      "learning_rate": 8.939889961848634e-05,
+      "loss": 1.2727,
+      "step": 318
+    },
+    {
+      "epoch": 0.24,
+      "grad_norm": 0.5773485095137408,
+      "learning_rate": 8.932507144572616e-05,
+      "loss": 1.2607,
+      "step": 319
+    },
+    {
+      "epoch": 0.24,
+      "grad_norm": 0.5633980526681968,
+      "learning_rate": 8.925101779641232e-05,
+      "loss": 1.1917,
+      "step": 320
+    },
+    {
+      "epoch": 0.24,
+      "grad_norm": 0.5300371631849218,
+      "learning_rate": 8.917673909514322e-05,
+      "loss": 1.3105,
+      "step": 321
+    },
+    {
+      "epoch": 0.24,
+      "grad_norm": 0.5310192196200603,
+      "learning_rate": 8.910223576780758e-05,
+      "loss": 1.2808,
+      "step": 322
+    },
+    {
+      "epoch": 0.24,
+      "grad_norm": 0.5234569464366723,
+      "learning_rate": 8.902750824158212e-05,
+      "loss": 1.2468,
+      "step": 323
+    },
+    {
+      "epoch": 0.24,
+      "grad_norm": 0.5473770126434013,
+      "learning_rate": 8.895255694492896e-05,
+      "loss": 1.2676,
+      "step": 324
+    },
+    {
+      "epoch": 0.24,
+      "grad_norm": 0.5670393642092653,
+      "learning_rate": 8.887738230759333e-05,
+      "loss": 1.2456,
+      "step": 325
+    },
+    {
+      "epoch": 0.24,
+      "grad_norm": 0.5484650752546845,
+      "learning_rate": 8.880198476060095e-05,
+      "loss": 1.251,
+      "step": 326
+    },
+    {
+      "epoch": 0.24,
+      "grad_norm": 0.5569076336735002,
+      "learning_rate": 8.872636473625565e-05,
+      "loss": 1.272,
+      "step": 327
+    },
+    {
+      "epoch": 0.24,
+      "grad_norm": 0.5237290090420638,
+      "learning_rate": 8.865052266813685e-05,
+      "loss": 1.2822,
+      "step": 328
+    },
+    {
+      "epoch": 0.24,
+      "grad_norm": 0.5507489271814671,
+      "learning_rate": 8.857445899109715e-05,
+      "loss": 1.2783,
+      "step": 329
+    },
+    {
+      "epoch": 0.24,
+      "grad_norm": 0.5527246685898635,
+      "learning_rate": 8.849817414125973e-05,
+      "loss": 1.2705,
+      "step": 330
+    },
+    {
+      "epoch": 0.24,
+      "grad_norm": 0.5544016696123183,
+      "learning_rate": 8.84216685560159e-05,
+      "loss": 1.2856,
+      "step": 331
+    },
+    {
+      "epoch": 0.25,
+      "grad_norm": 0.5424146088216879,
+      "learning_rate": 8.834494267402263e-05,
+      "loss": 1.2202,
+      "step": 332
+    },
+    {
+      "epoch": 0.25,
+      "grad_norm": 0.5323806898987287,
+      "learning_rate": 8.826799693519996e-05,
+      "loss": 1.248,
+      "step": 333
+    },
+    {
+      "epoch": 0.25,
+      "grad_norm": 0.5595146324987165,
+      "learning_rate": 8.819083178072852e-05,
+      "loss": 1.1672,
+      "step": 334
+    },
+    {
+      "epoch": 0.25,
+      "grad_norm": 0.5854406580169095,
+      "learning_rate": 8.811344765304698e-05,
+      "loss": 1.2146,
+      "step": 335
+    },
+    {
+      "epoch": 0.25,
+      "grad_norm": 0.5697562446019094,
+      "learning_rate": 8.80358449958496e-05,
+      "loss": 1.2568,
+      "step": 336
+    },
+    {
+      "epoch": 0.25,
+      "grad_norm": 0.5538906977604374,
+      "learning_rate": 8.795802425408352e-05,
+      "loss": 1.2544,
+      "step": 337
+    },
+    {
+      "epoch": 0.25,
+      "grad_norm": 0.5211793067308176,
+      "learning_rate": 8.787998587394637e-05,
+      "loss": 1.2183,
+      "step": 338
+    },
+    {
+      "epoch": 0.25,
+      "grad_norm": 0.5732446722628473,
+      "learning_rate": 8.780173030288359e-05,
+      "loss": 1.3057,
+      "step": 339
+    },
+    {
+      "epoch": 0.25,
+      "grad_norm": 0.5352980539739127,
+      "learning_rate": 8.772325798958597e-05,
+      "loss": 1.2598,
+      "step": 340
+    },
+    {
+      "epoch": 0.25,
+      "grad_norm": 0.5234917926015726,
+      "learning_rate": 8.7644569383987e-05,
+      "loss": 1.1982,
+      "step": 341
+    },
+    {
+      "epoch": 0.25,
+      "grad_norm": 0.5844314852721842,
+      "learning_rate": 8.75656649372603e-05,
+      "loss": 1.2656,
+      "step": 342
+    },
+    {
+      "epoch": 0.25,
+      "grad_norm": 0.5646854448914282,
+      "learning_rate": 8.748654510181709e-05,
+      "loss": 1.21,
+      "step": 343
+    },
+    {
+      "epoch": 0.25,
+      "grad_norm": 0.5216723813831847,
+      "learning_rate": 8.740721033130352e-05,
+      "loss": 1.2329,
+      "step": 344
+    },
+    {
+      "epoch": 0.25,
+      "grad_norm": 0.5099027314874095,
+      "learning_rate": 8.732766108059813e-05,
+      "loss": 1.2236,
+      "step": 345
+    },
+    {
+      "epoch": 0.26,
+      "grad_norm": 0.5188769999186538,
+      "learning_rate": 8.72478978058092e-05,
+      "loss": 1.2905,
+      "step": 346
+    },
+    {
+      "epoch": 0.26,
+      "grad_norm": 0.5245157404984339,
+      "learning_rate": 8.716792096427217e-05,
+      "loss": 1.2339,
+      "step": 347
+    },
+    {
+      "epoch": 0.26,
+      "grad_norm": 0.5160205485678449,
+      "learning_rate": 8.708773101454697e-05,
+      "loss": 1.2524,
+      "step": 348
+    },
+    {
+      "epoch": 0.26,
+      "grad_norm": 0.510633107323387,
+      "learning_rate": 8.700732841641542e-05,
+      "loss": 1.2756,
+      "step": 349
+    },
+    {
+      "epoch": 0.26,
+      "grad_norm": 0.5097028901140956,
+      "learning_rate": 8.692671363087863e-05,
+      "loss": 1.2539,
+      "step": 350
+    },
+    {
+      "epoch": 0.26,
+      "grad_norm": 0.5506040438253419,
+      "learning_rate": 8.68458871201543e-05,
+      "loss": 1.1733,
+      "step": 351
+    },
+    {
+      "epoch": 0.26,
+      "grad_norm": 0.5339837805003954,
+      "learning_rate": 8.676484934767409e-05,
+      "loss": 1.1919,
+      "step": 352
+    },
+    {
+      "epoch": 0.26,
+      "grad_norm": 0.5243053855032012,
+      "learning_rate": 8.668360077808093e-05,
+      "loss": 1.2637,
+      "step": 353
+    },
+    {
+      "epoch": 0.26,
+      "grad_norm": 0.5475923045103417,
+      "learning_rate": 8.660214187722646e-05,
+      "loss": 1.2583,
+      "step": 354
+    },
+    {
+      "epoch": 0.26,
+      "grad_norm": 0.5139607250185231,
+      "learning_rate": 8.652047311216822e-05,
+      "loss": 1.2939,
+      "step": 355
+    },
+    {
+      "epoch": 0.26,
+      "grad_norm": 0.5310090229071474,
+      "learning_rate": 8.64385949511671e-05,
+      "loss": 1.2788,
+      "step": 356
+    },
+    {
+      "epoch": 0.26,
+      "grad_norm": 0.5531120494965365,
+      "learning_rate": 8.635650786368452e-05,
+      "loss": 1.25,
+      "step": 357
+    },
+    {
+      "epoch": 0.26,
+      "grad_norm": 0.5315969577054235,
+      "learning_rate": 8.627421232037989e-05,
+      "loss": 1.2357,
+      "step": 358
+    },
+    {
+      "epoch": 0.27,
+      "grad_norm": 0.5266216921573422,
+      "learning_rate": 8.619170879310779e-05,
+      "loss": 1.2729,
+      "step": 359
+    },
+    {
+      "epoch": 0.27,
+      "grad_norm": 0.5593055072800345,
+      "learning_rate": 8.61089977549153e-05,
+      "loss": 1.2529,
+      "step": 360
+    },
+    {
+      "epoch": 0.27,
+      "grad_norm": 0.5596710951308123,
+      "learning_rate": 8.602607968003935e-05,
+      "loss": 1.2725,
+      "step": 361
+    },
+    {
+      "epoch": 0.27,
+      "grad_norm": 0.5433552854623133,
+      "learning_rate": 8.59429550439039e-05,
+      "loss": 1.2446,
+      "step": 362
+    },
+    {
+      "epoch": 0.27,
+      "grad_norm": 0.5818949631250041,
+      "learning_rate": 8.585962432311727e-05,
+      "loss": 1.2998,
+      "step": 363
+    },
+    {
+      "epoch": 0.27,
+      "grad_norm": 0.514243535892493,
+      "learning_rate": 8.577608799546942e-05,
+      "loss": 1.23,
+      "step": 364
+    },
+    {
+      "epoch": 0.27,
+      "grad_norm": 0.5465838481685172,
+      "learning_rate": 8.569234653992916e-05,
+      "loss": 1.2532,
+      "step": 365
+    },
+    {
+      "epoch": 0.27,
+      "grad_norm": 0.519563471824199,
+      "learning_rate": 8.560840043664144e-05,
+      "loss": 1.2607,
+      "step": 366
+    },
+    {
+      "epoch": 0.27,
+      "grad_norm": 0.5334398982863738,
+      "learning_rate": 8.552425016692464e-05,
+      "loss": 1.2363,
+      "step": 367
+    },
+    {
+      "epoch": 0.27,
+      "grad_norm": 0.5530652812053678,
+      "learning_rate": 8.543989621326768e-05,
+      "loss": 1.2681,
+      "step": 368
+    },
+    {
+      "epoch": 0.27,
+      "grad_norm": 0.5502954863671434,
+      "learning_rate": 8.535533905932738e-05,
+      "loss": 1.1721,
+      "step": 369
+    },
+    {
+      "epoch": 0.27,
+      "grad_norm": 0.5180001078920966,
+      "learning_rate": 8.527057918992565e-05,
+      "loss": 1.2139,
+      "step": 370
+    },
+    {
+      "epoch": 0.27,
+      "grad_norm": 0.5333180911534254,
+      "learning_rate": 8.518561709104667e-05,
+      "loss": 1.2461,
+      "step": 371
+    },
+    {
+      "epoch": 0.27,
+      "grad_norm": 0.5479350107655593,
+      "learning_rate": 8.510045324983417e-05,
+      "loss": 1.2512,
+      "step": 372
+    },
+    {
+      "epoch": 0.28,
+      "grad_norm": 0.5246093324411485,
+      "learning_rate": 8.501508815458855e-05,
+      "loss": 1.1787,
+      "step": 373
+    },
+    {
+      "epoch": 0.28,
+      "grad_norm": 0.50033135264865,
+      "learning_rate": 8.492952229476421e-05,
+      "loss": 1.2271,
+      "step": 374
+    },
+    {
+      "epoch": 0.28,
+      "grad_norm": 0.5418162221365314,
+      "learning_rate": 8.484375616096658e-05,
+      "loss": 1.2383,
+      "step": 375
+    },
+    {
+      "epoch": 0.28,
+      "grad_norm": 0.516783670359288,
+      "learning_rate": 8.475779024494945e-05,
+      "loss": 1.2681,
+      "step": 376
+    },
+    {
+      "epoch": 0.28,
+      "grad_norm": 0.5298750460233759,
+      "learning_rate": 8.467162503961208e-05,
+      "loss": 1.2451,
+      "step": 377
+    },
+    {
+      "epoch": 0.28,
+      "grad_norm": 0.5149476400550106,
+      "learning_rate": 8.45852610389964e-05,
+      "loss": 1.23,
+      "step": 378
+    },
+    {
+      "epoch": 0.28,
+      "grad_norm": 0.5268563601419046,
+      "learning_rate": 8.449869873828411e-05,
+      "loss": 1.2129,
+      "step": 379
+    },
+    {
+      "epoch": 0.28,
+      "grad_norm": 0.5357435202461692,
+      "learning_rate": 8.441193863379396e-05,
+      "loss": 1.2881,
+      "step": 380
+    },
+    {
+      "epoch": 0.28,
+      "grad_norm": 0.5407114377511073,
+      "learning_rate": 8.432498122297878e-05,
+      "loss": 1.2559,
+      "step": 381
+    },
+    {
+      "epoch": 0.28,
+      "grad_norm": 0.5376253272809564,
+      "learning_rate": 8.423782700442277e-05,
+      "loss": 1.2346,
+      "step": 382
+    },
+    {
+      "epoch": 0.28,
+      "grad_norm": 0.5378153063595059,
+      "learning_rate": 8.415047647783847e-05,
+      "loss": 1.2031,
+      "step": 383
+    },
+    {
+      "epoch": 0.28,
+      "grad_norm": 0.514779002563088,
+      "learning_rate": 8.406293014406403e-05,
+      "loss": 1.2056,
+      "step": 384
+    },
+    {
+      "epoch": 0.28,
+      "grad_norm": 0.5659231392943161,
+      "learning_rate": 8.397518850506028e-05,
+      "loss": 1.2346,
+      "step": 385
+    },
+    {
+      "epoch": 0.29,
+      "grad_norm": 0.5483974446090379,
+      "learning_rate": 8.388725206390788e-05,
+      "loss": 1.2974,
+      "step": 386
+    },
+    {
+      "epoch": 0.29,
+      "grad_norm": 0.5297423113703096,
+      "learning_rate": 8.379912132480441e-05,
+      "loss": 1.2427,
+      "step": 387
+    },
+    {
+      "epoch": 0.29,
+      "grad_norm": 0.5339239833592698,
+      "learning_rate": 8.371079679306146e-05,
+      "loss": 1.2788,
+      "step": 388
+    },
+    {
+      "epoch": 0.29,
+      "grad_norm": 0.5346762752364651,
+      "learning_rate": 8.36222789751018e-05,
+      "loss": 1.2329,
+      "step": 389
+    },
+    {
+      "epoch": 0.29,
+      "grad_norm": 0.5267945253503268,
+      "learning_rate": 8.353356837845642e-05,
+      "loss": 1.3101,
+      "step": 390
+    },
+    {
+      "epoch": 0.29,
+      "grad_norm": 0.5227678407329124,
+      "learning_rate": 8.344466551176164e-05,
+      "loss": 1.2544,
+      "step": 391
+    },
+    {
+      "epoch": 0.29,
+      "grad_norm": 0.5351886972585579,
+      "learning_rate": 8.335557088475618e-05,
+      "loss": 1.2036,
+      "step": 392
+    },
+    {
+      "epoch": 0.29,
+      "grad_norm": 0.547855768363372,
+      "learning_rate": 8.326628500827826e-05,
+      "loss": 1.2256,
+      "step": 393
+    },
+    {
+      "epoch": 0.29,
+      "grad_norm": 0.5232912428703006,
+      "learning_rate": 8.31768083942627e-05,
+      "loss": 1.2524,
+      "step": 394
+    },
+    {
+      "epoch": 0.29,
+      "grad_norm": 0.5355407135538937,
+      "learning_rate": 8.308714155573785e-05,
+      "loss": 1.1904,
+      "step": 395
+    },
+    {
+      "epoch": 0.29,
+      "grad_norm": 0.5398818834520477,
+      "learning_rate": 8.29972850068228e-05,
+      "loss": 1.2544,
+      "step": 396
+    },
+    {
+      "epoch": 0.29,
+      "grad_norm": 0.5365767973671521,
+      "learning_rate": 8.290723926272439e-05,
+      "loss": 1.2378,
+      "step": 397
+    },
+    {
+      "epoch": 0.29,
+      "grad_norm": 0.5505960932890972,
+      "learning_rate": 8.281700483973421e-05,
+      "loss": 1.2471,
+      "step": 398
+    },
+    {
+      "epoch": 0.29,
+      "grad_norm": 0.5479428166637395,
+      "learning_rate": 8.272658225522569e-05,
+      "loss": 1.2607,
+      "step": 399
+    },
+    {
+      "epoch": 0.3,
+      "grad_norm": 0.5764125413085645,
+      "learning_rate": 8.263597202765109e-05,
+      "loss": 1.2888,
+      "step": 400
+    },
+    {
+      "epoch": 0.3,
+      "grad_norm": 0.5193462362673806,
+      "learning_rate": 8.254517467653858e-05,
+      "loss": 1.1882,
+      "step": 401
+    },
+    {
+      "epoch": 0.3,
+      "grad_norm": 0.5374168368793678,
+      "learning_rate": 8.245419072248919e-05,
+      "loss": 1.2358,
+      "step": 402
+    },
+    {
+      "epoch": 0.3,
+      "grad_norm": 0.5560345573494497,
+      "learning_rate": 8.236302068717392e-05,
+      "loss": 1.3,
+      "step": 403
+    },
+    {
+      "epoch": 0.3,
+      "grad_norm": 0.5223138605512301,
+      "learning_rate": 8.227166509333068e-05,
+      "loss": 1.2559,
+      "step": 404
+    },
+    {
+      "epoch": 0.3,
+      "grad_norm": 0.5009208364979428,
+      "learning_rate": 8.218012446476128e-05,
+      "loss": 1.2617,
+      "step": 405
+    },
+    {
+      "epoch": 0.3,
+      "grad_norm": 0.509867725986647,
+      "learning_rate": 8.208839932632849e-05,
+      "loss": 1.2715,
+      "step": 406
+    },
+    {
+      "epoch": 0.3,
+      "grad_norm": 0.5190782935920448,
+      "learning_rate": 8.199649020395298e-05,
+      "loss": 1.2183,
+      "step": 407
+    },
+    {
+      "epoch": 0.3,
+      "grad_norm": 0.551317848502644,
+      "learning_rate": 8.190439762461033e-05,
+      "loss": 1.2241,
+      "step": 408
+    },
+    {
+      "epoch": 0.3,
+      "grad_norm": 0.5299140869699253,
+      "learning_rate": 8.181212211632799e-05,
+      "loss": 1.1746,
+      "step": 409
+    },
+    {
+      "epoch": 0.3,
+      "grad_norm": 0.5161200175965883,
+      "learning_rate": 8.171966420818228e-05,
+      "loss": 1.2544,
+      "step": 410
+    },
+    {
+      "epoch": 0.3,
+      "grad_norm": 0.5368310977870265,
+      "learning_rate": 8.162702443029531e-05,
+      "loss": 1.2505,
+      "step": 411
+    },
+    {
+      "epoch": 0.3,
+      "grad_norm": 0.5392135585371384,
+      "learning_rate": 8.153420331383199e-05,
+      "loss": 1.2378,
+      "step": 412
+    },
+    {
+      "epoch": 0.31,
+      "grad_norm": 0.5652426070182841,
+      "learning_rate": 8.144120139099697e-05,
+      "loss": 1.2788,
+      "step": 413
+    },
+    {
+      "epoch": 0.31,
+      "grad_norm": 0.5264883521440279,
+      "learning_rate": 8.134801919503154e-05,
+      "loss": 1.2432,
+      "step": 414
+    },
+    {
+      "epoch": 0.31,
+      "grad_norm": 0.5391198787958846,
+      "learning_rate": 8.125465726021069e-05,
+      "loss": 1.2642,
+      "step": 415
+    },
+    {
+      "epoch": 0.31,
+      "grad_norm": 0.5447234901673647,
+      "learning_rate": 8.116111612183989e-05,
+      "loss": 1.2598,
+      "step": 416
+    },
+    {
+      "epoch": 0.31,
+      "grad_norm": 0.5239448356746366,
+      "learning_rate": 8.106739631625217e-05,
+      "loss": 1.2383,
+      "step": 417
+    },
+    {
+      "epoch": 0.31,
+      "grad_norm": 0.522466994953917,
+      "learning_rate": 8.09734983808049e-05,
+      "loss": 1.21,
+      "step": 418
+    },
+    {
+      "epoch": 0.31,
+      "grad_norm": 0.49320728726020635,
+      "learning_rate": 8.087942285387688e-05,
+      "loss": 1.1643,
+      "step": 419
+    },
+    {
+      "epoch": 0.31,
+      "grad_norm": 0.538615135680076,
+      "learning_rate": 8.07851702748651e-05,
+      "loss": 1.2485,
+      "step": 420
+    },
+    {
+      "epoch": 0.31,
+      "grad_norm": 0.5546864636999657,
+      "learning_rate": 8.06907411841817e-05,
+      "loss": 1.1887,
+      "step": 421
+    },
+    {
+      "epoch": 0.31,
+      "grad_norm": 0.5337150121699967,
+      "learning_rate": 8.05961361232509e-05,
+      "loss": 1.2378,
+      "step": 422
+    },
+    {
+      "epoch": 0.31,
+      "grad_norm": 0.5548120199862732,
+      "learning_rate": 8.050135563450587e-05,
+      "loss": 1.2129,
+      "step": 423
+    },
+    {
+      "epoch": 0.31,
+      "grad_norm": 0.5491477319207145,
+      "learning_rate": 8.040640026138562e-05,
+      "loss": 1.2615,
+      "step": 424
+    },
+    {
+      "epoch": 0.31,
+      "grad_norm": 0.5292609791678348,
+      "learning_rate": 8.03112705483319e-05,
+      "loss": 1.1963,
+      "step": 425
+    },
+    {
+      "epoch": 0.31,
+      "grad_norm": 0.5386073890465884,
+      "learning_rate": 8.021596704078605e-05,
+      "loss": 1.2822,
+      "step": 426
+    },
+    {
+      "epoch": 0.32,
+      "grad_norm": 0.5208877771953219,
+      "learning_rate": 8.012049028518589e-05,
+      "loss": 1.2468,
+      "step": 427
+    },
+    {
+      "epoch": 0.32,
+      "grad_norm": 0.5300893442105213,
+      "learning_rate": 8.002484082896257e-05,
+      "loss": 1.2141,
+      "step": 428
+    },
+    {
+      "epoch": 0.32,
+      "grad_norm": 0.5426660622332912,
+      "learning_rate": 7.992901922053752e-05,
+      "loss": 1.2083,
+      "step": 429
+    },
+    {
+      "epoch": 0.32,
+      "grad_norm": 0.5280778314237736,
+      "learning_rate": 7.983302600931911e-05,
+      "loss": 1.2556,
+      "step": 430
+    },
+    {
+      "epoch": 0.32,
+      "grad_norm": 0.5303015472910759,
+      "learning_rate": 7.973686174569972e-05,
+      "loss": 1.2246,
+      "step": 431
+    },
+    {
+      "epoch": 0.32,
+      "grad_norm": 0.5385117857553907,
+      "learning_rate": 7.964052698105247e-05,
+      "loss": 1.2544,
+      "step": 432
+    },
+    {
+      "epoch": 0.32,
+      "grad_norm": 0.5175160927509813,
+      "learning_rate": 7.954402226772804e-05,
+      "loss": 1.1724,
+      "step": 433
+    },
+    {
+      "epoch": 0.32,
+      "grad_norm": 0.5167307050244405,
+      "learning_rate": 7.944734815905154e-05,
+      "loss": 1.228,
+      "step": 434
+    },
+    {
+      "epoch": 0.32,
+      "grad_norm": 0.533666702216578,
+      "learning_rate": 7.93505052093194e-05,
+      "loss": 1.2349,
+      "step": 435
+    },
+    {
+      "epoch": 0.32,
+      "grad_norm": 0.5259498652131873,
+      "learning_rate": 7.925349397379604e-05,
+      "loss": 1.2415,
+      "step": 436
+    },
+    {
+      "epoch": 0.32,
+      "grad_norm": 0.5445977576017799,
+      "learning_rate": 7.915631500871083e-05,
+      "loss": 1.2065,
+      "step": 437
+    },
+    {
+      "epoch": 0.32,
+      "grad_norm": 0.5649990455410109,
+      "learning_rate": 7.905896887125482e-05,
+      "loss": 1.2417,
+      "step": 438
+    },
+    {
+      "epoch": 0.32,
+      "grad_norm": 0.5260513948557283,
+      "learning_rate": 7.896145611957759e-05,
+      "loss": 1.1918,
+      "step": 439
+    },
+    {
+      "epoch": 0.33,
+      "grad_norm": 0.5258410063287358,
+      "learning_rate": 7.8863777312784e-05,
+      "loss": 1.2124,
+      "step": 440
+    },
+    {
+      "epoch": 0.33,
+      "grad_norm": 0.5434644442116746,
+      "learning_rate": 7.876593301093104e-05,
+      "loss": 1.2349,
+      "step": 441
+    },
+    {
+      "epoch": 0.33,
+      "grad_norm": 0.5462561748612222,
+      "learning_rate": 7.866792377502457e-05,
+      "loss": 1.2373,
+      "step": 442
+    },
+    {
+      "epoch": 0.33,
+      "grad_norm": 0.5661256454549024,
+      "learning_rate": 7.856975016701615e-05,
+      "loss": 1.2334,
+      "step": 443
+    },
+    {
+      "epoch": 0.33,
+      "grad_norm": 0.5517524055311237,
+      "learning_rate": 7.847141274979977e-05,
+      "loss": 1.2549,
+      "step": 444
+    },
+    {
+      "epoch": 0.33,
+      "grad_norm": 0.5588533911643465,
+      "learning_rate": 7.837291208720866e-05,
+      "loss": 1.248,
+      "step": 445
+    },
+    {
+      "epoch": 0.33,
+      "grad_norm": 0.5432341108696274,
+      "learning_rate": 7.827424874401203e-05,
+      "loss": 1.207,
+      "step": 446
+    },
+    {
+      "epoch": 0.33,
+      "grad_norm": 0.5185655878803792,
+      "learning_rate": 7.81754232859119e-05,
+      "loss": 1.2087,
+      "step": 447
+    },
+    {
+      "epoch": 0.33,
+      "grad_norm": 0.546989000271988,
+      "learning_rate": 7.807643627953969e-05,
+      "loss": 1.2852,
+      "step": 448
+    },
+    {
+      "epoch": 0.33,
+      "grad_norm": 0.5609807732483688,
+      "learning_rate": 7.797728829245321e-05,
+      "loss": 1.23,
+      "step": 449
+    },
+    {
+      "epoch": 0.33,
+      "grad_norm": 0.5290536891546959,
+      "learning_rate": 7.787797989313317e-05,
+      "loss": 1.1687,
+      "step": 450
+    },
+    {
+      "epoch": 0.33,
+      "grad_norm": 0.527486366572943,
+      "learning_rate": 7.777851165098012e-05,
+      "loss": 1.2349,
+      "step": 451
+    },
+    {
+      "epoch": 0.33,
+      "grad_norm": 0.5444668761845415,
+      "learning_rate": 7.767888413631101e-05,
+      "loss": 1.248,
+      "step": 452
+    },
+    {
+      "epoch": 0.33,
+      "grad_norm": 0.5194113505588946,
+      "learning_rate": 7.757909792035608e-05,
+      "loss": 1.3081,
+      "step": 453
+    },
+    {
+      "epoch": 0.34,
+      "grad_norm": 0.5174613130879753,
+      "learning_rate": 7.747915357525545e-05,
+      "loss": 1.2046,
+      "step": 454
+    },
+    {
+      "epoch": 0.34,
+      "grad_norm": 0.5535670191712191,
+      "learning_rate": 7.737905167405595e-05,
+      "loss": 1.2136,
+      "step": 455
+    },
+    {
+      "epoch": 0.34,
+      "grad_norm": 0.546209627520353,
+      "learning_rate": 7.727879279070773e-05,
+      "loss": 1.2097,
+      "step": 456
+    },
+    {
+      "epoch": 0.34,
+      "grad_norm": 0.5221397456131871,
+      "learning_rate": 7.717837750006106e-05,
+      "loss": 1.2832,
+      "step": 457
+    },
+    {
+      "epoch": 0.34,
+      "grad_norm": 0.5380906003507856,
+      "learning_rate": 7.7077806377863e-05,
+      "loss": 1.1807,
+      "step": 458
+    },
+    {
+      "epoch": 0.34,
+      "grad_norm": 0.546159089637007,
+      "learning_rate": 7.697708000075403e-05,
+      "loss": 1.262,
+      "step": 459
+    },
+    {
+      "epoch": 0.34,
+      "grad_norm": 0.5378903447286532,
+      "learning_rate": 7.687619894626493e-05,
+      "loss": 1.2639,
+      "step": 460
+    },
+    {
+      "epoch": 0.34,
+      "grad_norm": 0.5183593724417229,
+      "learning_rate": 7.677516379281321e-05,
+      "loss": 1.2344,
+      "step": 461
+    },
+    {
+      "epoch": 0.34,
+      "grad_norm": 0.5110004203240966,
+      "learning_rate": 7.667397511970005e-05,
+      "loss": 1.2144,
+      "step": 462
+    },
+    {
+      "epoch": 0.34,
+      "grad_norm": 0.5237401648978784,
+      "learning_rate": 7.657263350710676e-05,
+      "loss": 1.1992,
+      "step": 463
+    },
+    {
+      "epoch": 0.34,
+      "grad_norm": 0.5458624581753624,
+      "learning_rate": 7.647113953609163e-05,
+      "loss": 1.252,
+      "step": 464
+    },
+    {
+      "epoch": 0.34,
+      "grad_norm": 0.55612272064723,
+      "learning_rate": 7.636949378858646e-05,
+      "loss": 1.188,
+      "step": 465
+    },
+    {
+      "epoch": 0.34,
+      "grad_norm": 0.5578526299155908,
+      "learning_rate": 7.626769684739337e-05,
+      "loss": 1.1951,
+      "step": 466
+    },
+    {
+      "epoch": 0.35,
+      "grad_norm": 0.5092511020982519,
+      "learning_rate": 7.616574929618125e-05,
+      "loss": 1.1543,
+      "step": 467
+    },
+    {
+      "epoch": 0.35,
+      "grad_norm": 0.5348616024567703,
+      "learning_rate": 7.606365171948267e-05,
+      "loss": 1.2368,
+      "step": 468
+    },
+    {
+      "epoch": 0.35,
+      "grad_norm": 0.532298079012496,
+      "learning_rate": 7.596140470269029e-05,
+      "loss": 1.2107,
+      "step": 469
+    },
+    {
+      "epoch": 0.35,
+      "grad_norm": 0.5514395726265122,
+      "learning_rate": 7.585900883205364e-05,
+      "loss": 1.241,
+      "step": 470
+    },
+    {
+      "epoch": 0.35,
+      "grad_norm": 0.5539874834294591,
+      "learning_rate": 7.575646469467575e-05,
+      "loss": 1.2249,
+      "step": 471
+    },
+    {
+      "epoch": 0.35,
+      "grad_norm": 0.5141238427544136,
+      "learning_rate": 7.565377287850977e-05,
+      "loss": 1.21,
+      "step": 472
+    },
+    {
+      "epoch": 0.35,
+      "grad_norm": 0.526119772429715,
+      "learning_rate": 7.555093397235552e-05,
+      "loss": 1.2141,
+      "step": 473
+    },
+    {
+      "epoch": 0.35,
+      "grad_norm": 0.5239544155150679,
+      "learning_rate": 7.544794856585626e-05,
+      "loss": 1.2446,
+      "step": 474
+    },
+    {
+      "epoch": 0.35,
+      "grad_norm": 0.5116743183638587,
+      "learning_rate": 7.53448172494952e-05,
+      "loss": 1.2251,
+      "step": 475
+    },
+    {
+      "epoch": 0.35,
+      "grad_norm": 0.5465278452905271,
+      "learning_rate": 7.524154061459215e-05,
+      "loss": 1.1744,
+      "step": 476
+    },
+    {
+      "epoch": 0.35,
+      "grad_norm": 0.5242898434746838,
+      "learning_rate": 7.51381192533001e-05,
+      "loss": 1.2305,
+      "step": 477
+    },
+    {
+      "epoch": 0.35,
+      "grad_norm": 0.5524906450650563,
+      "learning_rate": 7.503455375860192e-05,
+      "loss": 1.271,
+      "step": 478
+    },
+    {
+      "epoch": 0.35,
+      "grad_norm": 0.5422094091125237,
+      "learning_rate": 7.493084472430682e-05,
+      "loss": 1.2983,
+      "step": 479
+    },
+    {
+      "epoch": 0.35,
+      "grad_norm": 0.5100606069460412,
+      "learning_rate": 7.482699274504708e-05,
+      "loss": 1.1914,
+      "step": 480
+    },
+    {
+      "epoch": 0.36,
+      "grad_norm": 0.5258246755815246,
+      "learning_rate": 7.472299841627451e-05,
+      "loss": 1.1948,
+      "step": 481
+    },
+    {
+      "epoch": 0.36,
+      "grad_norm": 0.5183104456102203,
+      "learning_rate": 7.461886233425717e-05,
+      "loss": 1.1658,
+      "step": 482
+    },
+    {
+      "epoch": 0.36,
+      "grad_norm": 0.5283305385961874,
+      "learning_rate": 7.451458509607582e-05,
+      "loss": 1.2378,
+      "step": 483
+    },
+    {
+      "epoch": 0.36,
+      "grad_norm": 0.5552677702446687,
+      "learning_rate": 7.441016729962064e-05,
+      "loss": 1.1938,
+      "step": 484
+    },
+    {
+      "epoch": 0.36,
+      "grad_norm": 0.5198625616185957,
+      "learning_rate": 7.430560954358764e-05,
+      "loss": 1.2515,
+      "step": 485
+    },
+    {
+      "epoch": 0.36,
+      "grad_norm": 0.524907115545136,
+      "learning_rate": 7.420091242747536e-05,
+      "loss": 1.2437,
+      "step": 486
+    },
+    {
+      "epoch": 0.36,
+      "grad_norm": 0.520819742542826,
+      "learning_rate": 7.409607655158139e-05,
+      "loss": 1.2764,
+      "step": 487
+    },
+    {
+      "epoch": 0.36,
+      "grad_norm": 0.5297968503831433,
+      "learning_rate": 7.399110251699887e-05,
+      "loss": 1.2529,
+      "step": 488
+    },
+    {
+      "epoch": 0.36,
+      "grad_norm": 0.5214545833543685,
+      "learning_rate": 7.388599092561315e-05,
+      "loss": 1.2979,
+      "step": 489
+    },
+    {
+      "epoch": 0.36,
+      "grad_norm": 0.5158994351772959,
+      "learning_rate": 7.378074238009826e-05,
+      "loss": 1.2363,
+      "step": 490
+    },
+    {
+      "epoch": 0.36,
+      "grad_norm": 0.49265767229951024,
+      "learning_rate": 7.367535748391349e-05,
+      "loss": 1.228,
+      "step": 491
+    },
+    {
+      "epoch": 0.36,
+      "grad_norm": 0.5308141896787576,
+      "learning_rate": 7.35698368412999e-05,
+      "loss": 1.2527,
+      "step": 492
+    },
+    {
+      "epoch": 0.36,
+      "grad_norm": 0.5185543266636785,
+      "learning_rate": 7.346418105727686e-05,
+      "loss": 1.2192,
+      "step": 493
+    },
+    {
+      "epoch": 0.37,
+      "grad_norm": 0.5231300605729964,
+      "learning_rate": 7.335839073763865e-05,
+      "loss": 1.2065,
+      "step": 494
+    },
+    {
+      "epoch": 0.37,
+      "grad_norm": 0.5399567824066669,
+      "learning_rate": 7.325246648895088e-05,
+      "loss": 1.2563,
+      "step": 495
+    },
+    {
+      "epoch": 0.37,
+      "grad_norm": 0.5239942836551379,
+      "learning_rate": 7.31464089185471e-05,
+      "loss": 1.2549,
+      "step": 496
+    },
+    {
+      "epoch": 0.37,
+      "grad_norm": 0.5367247940798874,
+      "learning_rate": 7.304021863452524e-05,
+      "loss": 1.2061,
+      "step": 497
+    },
+    {
+      "epoch": 0.37,
+      "grad_norm": 0.5404506218621764,
+      "learning_rate": 7.293389624574422e-05,
+      "loss": 1.2142,
+      "step": 498
+    },
+    {
+      "epoch": 0.37,
+      "grad_norm": 0.5055969660442964,
+      "learning_rate": 7.282744236182034e-05,
+      "loss": 1.2451,
+      "step": 499
+    },
+    {
+      "epoch": 0.37,
+      "grad_norm": 0.5423433133756662,
+      "learning_rate": 7.27208575931239e-05,
+      "loss": 1.2012,
+      "step": 500
+    },
+    {
+      "epoch": 0.37,
+      "grad_norm": 0.5291351969461193,
+      "learning_rate": 7.26141425507756e-05,
+      "loss": 1.1768,
+      "step": 501
+    },
+    {
+      "epoch": 0.37,
+      "grad_norm": 0.5217703642849318,
+      "learning_rate": 7.250729784664316e-05,
+      "loss": 1.209,
+      "step": 502
+    },
+    {
+      "epoch": 0.37,
+      "grad_norm": 0.5201622197991884,
+      "learning_rate": 7.240032409333764e-05,
+      "loss": 1.2031,
+      "step": 503
+    },
+    {
+      "epoch": 0.37,
+      "grad_norm": 0.5281271991799672,
+      "learning_rate": 7.22932219042101e-05,
+      "loss": 1.1987,
+      "step": 504
+    },
+    {
+      "epoch": 0.37,
+      "grad_norm": 0.5573441678253518,
+      "learning_rate": 7.218599189334799e-05,
+      "loss": 1.2739,
+      "step": 505
+    },
+    {
+      "epoch": 0.37,
+      "grad_norm": 0.5665017191299871,
+      "learning_rate": 7.207863467557162e-05,
+      "loss": 1.2773,
+      "step": 506
+    },
+    {
+      "epoch": 0.37,
+      "grad_norm": 0.5325104774494102,
+      "learning_rate": 7.19711508664307e-05,
+      "loss": 1.209,
+      "step": 507
+    },
+    {
+      "epoch": 0.38,
+      "grad_norm": 0.518792873366363,
+      "learning_rate": 7.186354108220072e-05,
+      "loss": 1.2173,
+      "step": 508
+    },
+    {
+      "epoch": 0.38,
+      "grad_norm": 0.530762745727063,
+      "learning_rate": 7.175580593987951e-05,
+      "loss": 1.2466,
+      "step": 509
+    },
+    {
+      "epoch": 0.38,
+      "grad_norm": 0.5140061528285057,
+      "learning_rate": 7.164794605718366e-05,
+      "loss": 1.2139,
+      "step": 510
+    },
+    {
+      "epoch": 0.38,
+      "grad_norm": 0.5194168189274216,
+      "learning_rate": 7.153996205254495e-05,
+      "loss": 1.2476,
+      "step": 511
+    },
+    {
+      "epoch": 0.38,
+      "grad_norm": 0.5487088087238914,
+      "learning_rate": 7.143185454510686e-05,
+      "loss": 1.2251,
+      "step": 512
+    },
+    {
+      "epoch": 0.38,
+      "grad_norm": 0.49449833617368844,
+      "learning_rate": 7.1323624154721e-05,
+      "loss": 1.2021,
+      "step": 513
+    },
+    {
+      "epoch": 0.38,
+      "grad_norm": 0.5209680110441622,
+      "learning_rate": 7.121527150194349e-05,
+      "loss": 1.229,
+      "step": 514
+    },
+    {
+      "epoch": 0.38,
+      "grad_norm": 0.5179658980514732,
+      "learning_rate": 7.110679720803156e-05,
+      "loss": 1.2324,
+      "step": 515
+    },
+    {
+      "epoch": 0.38,
+      "grad_norm": 0.5237224991500224,
+      "learning_rate": 7.099820189493977e-05,
+      "loss": 1.269,
+      "step": 516
+    },
+    {
+      "epoch": 0.38,
+      "grad_norm": 0.5302189416292129,
+      "learning_rate": 7.088948618531667e-05,
+      "loss": 1.2041,
+      "step": 517
+    },
+    {
+      "epoch": 0.38,
+      "grad_norm": 0.5384341108312423,
+      "learning_rate": 7.078065070250106e-05,
+      "loss": 1.1746,
+      "step": 518
+    },
+    {
+      "epoch": 0.38,
+      "grad_norm": 0.5521437637462966,
+      "learning_rate": 7.067169607051851e-05,
+      "loss": 1.2886,
+      "step": 519
+    },
+    {
+      "epoch": 0.38,
+      "grad_norm": 0.5328288678743964,
+      "learning_rate": 7.056262291407772e-05,
+      "loss": 1.1877,
+      "step": 520
+    },
+    {
+      "epoch": 0.39,
+      "grad_norm": 0.5359494830051162,
+      "learning_rate": 7.045343185856701e-05,
+      "loss": 1.2202,
+      "step": 521
+    },
+    {
+      "epoch": 0.39,
+      "grad_norm": 0.5288532232218185,
+      "learning_rate": 7.034412353005063e-05,
+      "loss": 1.21,
+      "step": 522
+    },
+    {
+      "epoch": 0.39,
+      "grad_norm": 0.5512085122241619,
+      "learning_rate": 7.02346985552653e-05,
+      "loss": 1.2798,
+      "step": 523
+    },
+    {
+      "epoch": 0.39,
+      "grad_norm": 0.533944460040126,
+      "learning_rate": 7.01251575616165e-05,
+      "loss": 1.2539,
+      "step": 524
+    },
+    {
+      "epoch": 0.39,
+      "grad_norm": 0.5837632563221825,
+      "learning_rate": 7.0015501177175e-05,
+      "loss": 1.1335,
+      "step": 525
+    }
+  ],
+  "logging_steps": 1.0,
+  "max_steps": 1353,
+  "num_input_tokens_seen": 0,
+  "num_train_epochs": 1,
+  "save_steps": 25,
+  "total_flos": 4.4085090777437307e+18,
+  "train_batch_size": 16,
+  "trial_name": null,
+  "trial_params": null
+}

training_args.bin ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:91a5b0aeacf520d00ebefbd60b0266b0ab146e9ad9ce5cb558fac2a023344a6d
+size 6584

zero_to_fp32.py ADDED Viewed

	@@ -0,0 +1,592 @@

+#!/usr/bin/env python
+# Copyright (c) Microsoft Corporation.
+# SPDX-License-Identifier: Apache-2.0
+# DeepSpeed Team
+# This script extracts consolidated fp32 weights from ZeRO stage 1, 2 and 3 DeepSpeed checkpoints. It gets
+# copied into the top level checkpoint dir, so the user can easily do the conversion at any point in
+# the future. Once extracted, the weights don't require DeepSpeed and can be used in any
+# application.
+#
+# example: python zero_to_fp32.py . pytorch_model.bin
+import argparse
+import torch
+import glob
+import math
+import os
+import re
+from collections import OrderedDict
+from dataclasses import dataclass
+# while this script doesn't use deepspeed to recover data, since the checkpoints are pickled with
+# DeepSpeed data structures it has to be available in the current python environment.
+from deepspeed.utils import logger
+from deepspeed.checkpoint.constants import (DS_VERSION, OPTIMIZER_STATE_DICT, SINGLE_PARTITION_OF_FP32_GROUPS,
+                                            FP32_FLAT_GROUPS, ZERO_STAGE, PARTITION_COUNT, PARAM_SHAPES, BUFFER_NAMES,
+                                            FROZEN_PARAM_SHAPES, FROZEN_PARAM_FRAGMENTS)
@dataclass
class zero_model_state:
    """Record of the values extracted from one ``*_model_states.pt`` checkpoint file.

    The annotations name the types themselves (``dict``), not instances
    (``dict()``), so introspection tools see a real type.

    NOTE(review): the per-field notes below are inferred from the locals of the
    same names computed in ``parse_model_states`` - confirm against the place
    where this class is instantiated.
    """
    # buffer name -> tensor
    buffers: dict
    # NOTE(review): parse_model_states iterates this as a sequence of mappings
    # keyed by parameter name, so `dict` may not describe it exactly - confirm.
    param_shapes: dict
    # list of [key, value] pairs taken from the checkpoint's "shared_params" entry
    shared_params: list
    # NOTE(review): read with `.get(DS_VERSION, None)`, so this may be None - confirm
    ds_version: int
    # NOTE(review): read with `.get(FROZEN_PARAM_SHAPES, None)`, so this may be None - confirm
    frozen_param_shapes: dict
    frozen_param_fragments: dict
# set to a truthy value to make the parsing helpers print what they find in the checkpoint
debug = 0
# load to cpu: this is passed as `map_location` when checkpoint files are read with torch.load
device = torch.device('cpu')
def atoi(text):
    """Convert ``text`` to an ``int`` when it consists only of decimal digits.

    Args:
        text: the string chunk to convert.

    Returns:
        ``int(text)`` if every character is a decimal digit, otherwise ``text``
        unchanged (this includes the empty string).
    """
    # isdecimal() rather than isdigit(): isdigit() also accepts characters such
    # as superscript digits ("\u00b2") that int() rejects with ValueError.
    return int(text) if text.isdecimal() else text
def natural_keys(text):
    '''
    Build a sort key that orders strings the way a human would,
    e.g. ``alist.sort(key=natural_keys)``.

    The text is split into alternating non-digit and digit runs and every
    run is passed through ``atoi``.

    Reference: http://nedbatchelder.com/blog/200712/human_sorting.html
    (See Toothy's implementation in the comments)
    '''
    chunks = re.split(r'(\d+)', text)
    return list(map(atoi, chunks))
def get_model_state_file(checkpoint_dir, zero_stage):
    """Return the path of the model-states file for ``zero_stage`` inside ``checkpoint_dir``.

    Args:
        checkpoint_dir: directory that holds the checkpoint files.
        zero_stage: ZeRO stage number; stages up to 2 and stage 3 use different file names.

    Returns:
        The path of the existing model-states file.

    Raises:
        FileNotFoundError: if ``checkpoint_dir`` is not a directory or the expected file is missing.
        ValueError: if ``zero_stage`` is neither ``<= 2`` nor ``3``.
    """
    if not os.path.isdir(checkpoint_dir):
        raise FileNotFoundError(f"Directory '{checkpoint_dir}' doesn't exist")

    # there should be only one file
    if zero_stage <= 2:
        file = os.path.join(checkpoint_dir, "mp_rank_00_model_states.pt")
    elif zero_stage == 3:
        file = os.path.join(checkpoint_dir, "zero_pp_rank_0_mp_rank_00_model_states.pt")
    else:
        # without this branch `file` stays unbound and the check below fails
        # with an unhelpful UnboundLocalError
        raise ValueError(f"unknown zero stage {zero_stage}")

    if not os.path.exists(file):
        raise FileNotFoundError(f"can't find model states file at '{file}'")

    return file
+def get_checkpoint_files(checkpoint_dir, glob_pattern):
+    # XXX: need to test that this simple glob rule works for multi-node setup too
+    ckpt_files = sorted(glob.glob(os.path.join(checkpoint_dir, glob_pattern)), key=natural_keys)
+    if len(ckpt_files) == 0:
+        raise FileNotFoundError(f"can't find {glob_pattern} files in directory '{checkpoint_dir}'")
+    return ckpt_files
+def get_optim_files(checkpoint_dir):
+    return get_checkpoint_files(checkpoint_dir, "*_optim_states.pt")
+def get_model_state_files(checkpoint_dir):
+    return get_checkpoint_files(checkpoint_dir, "*_model_states.pt")
+def parse_model_states(files):
+    zero_model_states = []
+    for file in files:
+        state_dict = torch.load(file, map_location=device)
+        if BUFFER_NAMES not in state_dict:
+            raise ValueError(f"{file} is not a model state checkpoint")
+        buffer_names = state_dict[BUFFER_NAMES]
+        if debug:
+            print("Found buffers:", buffer_names)
+        # recover just the buffers while restoring them to fp32 if they were saved in fp16
+        buffers = {k: v.float() for k, v in state_dict["module"].items() if k in buffer_names}
+        param_shapes = state_dict[PARAM_SHAPES]
+        # collect parameters that are included in param_shapes
+        param_names = []
+        for s in param_shapes:
+            for name in s.keys():
+                param_names.append(name)
+        # update with frozen parameters
+        frozen_param_shapes = state_dict.get(FROZEN_PARAM_SHAPES, None)
+        if frozen_param_shapes is not None:
+            if debug:
+                print(f"Found frozen_param_shapes: {frozen_param_shapes}")
+            param_names += list(frozen_param_shapes.keys())
+        # handle shared params
+        shared_params = [[k, v] for k, v in state_dict["shared_params"].items()]
+        ds_version = state_dict.get(DS_VERSION, None)
+        frozen_param_fragments = state_dict.get(FROZEN_PARAM_FRAGMENTS, None)
+        z_model_state = zero_model_state(buffers=buffers,
+                                         param_shapes=param_shapes,
+                                         shared_params=shared_params,
+                                         ds_version=ds_version,
+                                         frozen_param_shapes=frozen_param_shapes,
+                                         frozen_param_fragments=frozen_param_fragments)
+        zero_model_states.append(z_model_state)
+    return zero_model_states
+def parse_optim_states(files, ds_checkpoint_dir):
+    total_files = len(files)
+    state_dicts = []
+    for f in files:
+        state_dict = torch.load(f, map_location=device)
+        # immediately discard the potentially huge 2 optimizer states as we only care for fp32 master weights
+        # and also handle the case where it was already removed by another helper script
+        state_dict["optimizer_state_dict"].pop("optimizer_state_dict", None)
+        state_dicts.append(state_dict)
+    if not ZERO_STAGE in state_dicts[0][OPTIMIZER_STATE_DICT]:
+        raise ValueError(f"{files[0]} is not a zero checkpoint")
+    zero_stage = state_dicts[0][OPTIMIZER_STATE_DICT][ZERO_STAGE]
+    world_size = state_dicts[0][OPTIMIZER_STATE_DICT][PARTITION_COUNT]
+    # For ZeRO-2 each param group can have different partition_count as data parallelism for expert
+    # parameters can be different from data parallelism for non-expert parameters. So we can just
+    # use the max of the partition_count to get the dp world_size.
+    if type(world_size) is list:
+        world_size = max(world_size)
+    if world_size != total_files:
+        raise ValueError(
+            f"Expected {world_size} of '*_optim_states.pt' under '{ds_checkpoint_dir}' but found {total_files} files. "
+            "Possibly due to an overwrite of an old checkpoint, or a checkpoint didn't get saved by one or more processes."
+        )
+    # the groups are named differently in each stage
+    if zero_stage <= 2:
+        fp32_groups_key = SINGLE_PARTITION_OF_FP32_GROUPS
+    elif zero_stage == 3:
+        fp32_groups_key = FP32_FLAT_GROUPS
+    else:
+        raise ValueError(f"unknown zero stage {zero_stage}")
+    if zero_stage <= 2:
+        fp32_flat_groups = [state_dicts[i][OPTIMIZER_STATE_DICT][fp32_groups_key] for i in range(len(state_dicts))]
+    elif zero_stage == 3:
+        # if there is more than one param group, there will be multiple flattened tensors - one
+        # flattened tensor per group - for simplicity merge them into a single tensor
+        #
+        # XXX: could make the script more memory efficient for when there are multiple groups - it
+        # will require matching the sub-lists of param_shapes for each param group flattened tensor
+        fp32_flat_groups = [
+            torch.cat(state_dicts[i][OPTIMIZER_STATE_DICT][fp32_groups_key], 0) for i in range(len(state_dicts))
+        ]
+    return zero_stage, world_size, fp32_flat_groups
+def _get_fp32_state_dict_from_zero_checkpoint(ds_checkpoint_dir):
+    """
+    Returns fp32 state_dict reconstructed from ds checkpoint
+    Args:
+        - ``ds_checkpoint_dir``: path to the deepspeed checkpoint folder (where the optimizer files are)
+    """
+    print(f"Processing zero checkpoint '{ds_checkpoint_dir}'")
+    optim_files = get_optim_files(ds_checkpoint_dir)
+    zero_stage, world_size, fp32_flat_groups = parse_optim_states(optim_files, ds_checkpoint_dir)
+    print(f"Detected checkpoint of type zero stage {zero_stage}, world_size: {world_size}")
+    model_files = get_model_state_files(ds_checkpoint_dir)
+    zero_model_states = parse_model_states(model_files)
+    print(f'Parsing checkpoint created by deepspeed=={zero_model_states[0].ds_version}')
+    if zero_stage <= 2:
+        return _get_fp32_state_dict_from_zero2_checkpoint(world_size, fp32_flat_groups, zero_model_states)
+    elif zero_stage == 3:
+        return _get_fp32_state_dict_from_zero3_checkpoint(world_size, fp32_flat_groups, zero_model_states)
+def _zero2_merge_frozen_params(state_dict, zero_model_states):
+    if zero_model_states[0].frozen_param_shapes is None or len(zero_model_states[0].frozen_param_shapes) == 0:
+        return
+    frozen_param_shapes = zero_model_states[0].frozen_param_shapes
+    frozen_param_fragments = zero_model_states[0].frozen_param_fragments
+    if debug:
+        num_elem = sum(s.numel() for s in frozen_param_shapes.values())
+        print(f'rank 0: {FROZEN_PARAM_SHAPES}.numel = {num_elem}')
+        wanted_params = len(frozen_param_shapes)
+        wanted_numel = sum(s.numel() for s in frozen_param_shapes.values())
+        avail_numel = sum([p.numel() for p in frozen_param_fragments.values()])
+        print(f'Frozen params: Have {avail_numel} numels to process.')
+        print(f'Frozen params: Need {wanted_numel} numels in {wanted_params} params')
+    total_params = 0
+    total_numel = 0
+    for name, shape in frozen_param_shapes.items():
+        total_params += 1
+        unpartitioned_numel = shape.numel()
+        total_numel += unpartitioned_numel
+        state_dict[name] = frozen_param_fragments[name]
+        if debug:
+            print(f"{name} full shape: {shape} unpartitioned numel {unpartitioned_numel} ")
+    print(f"Reconstructed Frozen fp32 state dict with {total_params} params {total_numel} elements")
+def _has_callable(obj, fn):
+    attr = getattr(obj, fn, None)
+    return callable(attr)
+def _zero2_merge_trainable_params(state_dict, world_size, fp32_flat_groups, zero_model_states):
+    """
+    Rebuild the trainable parameters from the fp32 partitions and store them in ``state_dict``.
+    Args:
+        - ``state_dict``: dict that receives one entry per parameter (updated in place)
+        - ``world_size``: number of partitions, also used to compute the alignment of the sanity check
+        - ``fp32_flat_groups``: one entry per partition, each entry indexable by param group
+        - ``zero_model_states``: parsed model states, only ``param_shapes`` of the first one is read
+    Raises:
+        - ``ValueError``: if, after alignment, the consumed numel of a group differs from its available numel
+    """
+    param_shapes = zero_model_states[0].param_shapes
+    # Reconstruction protocol:
+    #
+    # 1. for each param group, concatenate that group's partition from every entry of
+    #    fp32_flat_groups into one flat fp32 vector
+    # 2. walk the group's param_shapes in order and carve out numel(shape) elements per param
+    # 3. check that the consumed and the available numel agree once both are aligned
+    if debug:
+        for i in range(world_size):
+            for j in range(len(fp32_flat_groups[0])):
+                print(f"{FP32_FLAT_GROUPS}[{i}][{j}].shape={fp32_flat_groups[i][j].shape}")
+    # XXX: memory usage doubles here (zero2)
+    num_param_groups = len(fp32_flat_groups[0])
+    merged_single_partition_of_fp32_groups = []
+    for i in range(num_param_groups):
+        # gather group i from every partition, in the order of fp32_flat_groups
+        merged_partitions = [sd[i] for sd in fp32_flat_groups]
+        full_single_fp32_vector = torch.cat(merged_partitions, 0)
+        merged_single_partition_of_fp32_groups.append(full_single_fp32_vector)
+    # total over all groups, only reported in debug mode (the loop below rebinds it per group)
+    avail_numel = sum(
+        [full_single_fp32_vector.numel() for full_single_fp32_vector in merged_single_partition_of_fp32_groups])
+    if debug:
+        wanted_params = sum([len(shapes) for shapes in param_shapes])
+        wanted_numel = sum([sum(shape.numel() for shape in shapes.values()) for shapes in param_shapes])
+        # not asserting if there is a mismatch due to possible padding
+        print(f"Have {avail_numel} numels to process.")
+        print(f"Need {wanted_numel} numels in {wanted_params} params.")
+    # params
+    # XXX: for huge models that can't fit into the host's RAM we will have to recode this to support
+    # out-of-core computing solution
+    total_numel = 0
+    total_params = 0
+    for shapes, full_single_fp32_vector in zip(param_shapes, merged_single_partition_of_fp32_groups):
+        # offset restarts at 0 for every param group because each group has its own flat vector
+        offset = 0
+        avail_numel = full_single_fp32_vector.numel()
+        for name, shape in shapes.items():
+            # shape may or may not expose numel(); fall back to the product of its dimensions
+            unpartitioned_numel = shape.numel() if _has_callable(shape, 'numel') else math.prod(shape)
+            total_numel += unpartitioned_numel
+            total_params += 1
+            if debug:
+                print(f"{name} full shape: {shape} unpartitioned numel {unpartitioned_numel} ")
+            # take this param's slice of the flat vector and give it its full shape
+            state_dict[name] = full_single_fp32_vector.narrow(0, offset, unpartitioned_numel).view(shape)
+            offset += unpartitioned_numel
+        # Z2 started to align to 2*world_size to improve nccl performance. Therefore both offset and
+        # avail_numel can differ by anywhere between 0..2*world_size. Due to two unrelated complex
+        # paddings performed in the code it's almost impossible to predict the exact numbers w/o the
+        # live optimizer object, so we are checking that the numbers are within the right range
+        align_to = 2 * world_size
+        # round x up to the next multiple of align_to
+        def zero2_align(x):
+            return align_to * math.ceil(x / align_to)
+        if debug:
+            print(f"original offset={offset}, avail_numel={avail_numel}")
+        offset = zero2_align(offset)
+        avail_numel = zero2_align(avail_numel)
+        if debug:
+            print(f"aligned  offset={offset}, avail_numel={avail_numel}")
+        # Sanity check
+        if offset != avail_numel:
+            raise ValueError(f"consumed {offset} numels out of {avail_numel} - something is wrong")
+    print(f"Reconstructed fp32 state dict with {total_params} params {total_numel} elements")
+def _get_fp32_state_dict_from_zero2_checkpoint(world_size, fp32_flat_groups, zero_model_states):
+    state_dict = OrderedDict()
+    # buffers
+    buffers = zero_model_states[0].buffers
+    state_dict.update(buffers)
+    if debug:
+        print(f"added {len(buffers)} buffers")
+    _zero2_merge_frozen_params(state_dict, zero_model_states)
+    _zero2_merge_trainable_params(state_dict, world_size, fp32_flat_groups, zero_model_states)
+    # recover shared parameters
+    for pair in zero_model_states[0].shared_params:
+        if pair[1] in state_dict:
+            state_dict[pair[0]] = state_dict[pair[1]]
+    return state_dict
+def zero3_partitioned_param_info(unpartitioned_numel, world_size):
+    remainder = unpartitioned_numel % world_size
+    padding_numel = (world_size - remainder) if remainder else 0
+    partitioned_numel = math.ceil(unpartitioned_numel / world_size)
+    return partitioned_numel, padding_numel
+def _zero3_merge_frozen_params(state_dict, world_size, zero_model_states):
+    if zero_model_states[0].frozen_param_shapes is None or len(zero_model_states[0].frozen_param_shapes) == 0:
+        return
+    if debug:
+        for i in range(world_size):
+            num_elem = sum(s.numel() for s in zero_model_states[i].frozen_param_fragments.values())
+            print(f'rank {i}: {FROZEN_PARAM_SHAPES}.numel = {num_elem}')
+        frozen_param_shapes = zero_model_states[0].frozen_param_shapes
+        wanted_params = len(frozen_param_shapes)
+        wanted_numel = sum(s.numel() for s in frozen_param_shapes.values())
+        avail_numel = sum([p.numel() for p in zero_model_states[0].frozen_param_fragments.values()]) * world_size
+        print(f'Frozen params: Have {avail_numel} numels to process.')
+        print(f'Frozen params: Need {wanted_numel} numels in {wanted_params} params')
+    total_params = 0
+    total_numel = 0
+    for name, shape in zero_model_states[0].frozen_param_shapes.items():
+        total_params += 1
+        unpartitioned_numel = shape.numel()
+        total_numel += unpartitioned_numel
+        param_frags = tuple(model_state.frozen_param_fragments[name] for model_state in zero_model_states)
+        state_dict[name] = torch.cat(param_frags, 0).narrow(0, 0, unpartitioned_numel).view(shape)
+        partitioned_numel, partitioned_padding_numel = zero3_partitioned_param_info(unpartitioned_numel, world_size)
+        if debug:
+            print(
+                f"Frozen params: {total_params} {name} full shape: {shape} partition0 numel={partitioned_numel} partitioned_padding_numel={partitioned_padding_numel}"
+            )
+    print(f"Reconstructed Frozen fp32 state dict with {total_params} params {total_numel} elements")
+def _zero3_merge_trainable_params(state_dict, world_size, fp32_flat_groups, zero_model_states):
+    param_shapes = zero_model_states[0].param_shapes
+    avail_numel = fp32_flat_groups[0].numel() * world_size
+    # Reconstruction protocol: For zero3 we need to zip the partitions together at boundary of each
+    # param, re-consolidating each param, while dealing with padding if any
+    # merge list of dicts, preserving order
+    param_shapes = {k: v for d in param_shapes for k, v in d.items()}
+    if debug:
+        for i in range(world_size):
+            print(f"{FP32_FLAT_GROUPS}[{i}].shape={fp32_flat_groups[i].shape}")
+        wanted_params = len(param_shapes)
+        wanted_numel = sum(shape.numel() for shape in param_shapes.values())
+        # not asserting if there is a mismatch due to possible padding
+        avail_numel = fp32_flat_groups[0].numel() * world_size
+        print(f"Trainable params: Have {avail_numel} numels to process.")
+        print(f"Trainable params: Need {wanted_numel} numels in {wanted_params} params.")
+    # params
+    # XXX: for huge models that can't fit into the host's RAM we will have to recode this to support
+    # out-of-core computing solution
+    offset = 0
+    total_numel = 0
+    total_params = 0
+    for name, shape in param_shapes.items():
+        unpartitioned_numel = shape.numel()
+        total_numel += unpartitioned_numel
+        total_params += 1
+        partitioned_numel, partitioned_padding_numel = zero3_partitioned_param_info(unpartitioned_numel, world_size)
+        if debug:
+            print(
+                f"Trainable params: {total_params} {name} full shape: {shape} partition0 numel={partitioned_numel} partitioned_padding_numel={partitioned_padding_numel}"
+            )
+        # XXX: memory usage doubles here
+        state_dict[name] = torch.cat(
+            tuple(fp32_flat_groups[i].narrow(0, offset, partitioned_numel) for i in range(world_size)),
+            0).narrow(0, 0, unpartitioned_numel).view(shape)
+        offset += partitioned_numel
+    offset *= world_size
+    # Sanity check
+    if offset != avail_numel:
+        raise ValueError(f"consumed {offset} numels out of {avail_numel} - something is wrong")
+    print(f"Reconstructed Trainable fp32 state dict with {total_params} params {total_numel} elements")
+def _get_fp32_state_dict_from_zero3_checkpoint(world_size, fp32_flat_groups, zero_model_states):
+    state_dict = OrderedDict()
+    # buffers
+    buffers = zero_model_states[0].buffers
+    state_dict.update(buffers)
+    if debug:
+        print(f"added {len(buffers)} buffers")
+    _zero3_merge_frozen_params(state_dict, world_size, zero_model_states)
+    _zero3_merge_trainable_params(state_dict, world_size, fp32_flat_groups, zero_model_states)
+    # recover shared parameters
+    for pair in zero_model_states[0].shared_params:
+        if pair[1] in state_dict:
+            state_dict[pair[0]] = state_dict[pair[1]]
+    return state_dict
+def get_fp32_state_dict_from_zero_checkpoint(checkpoint_dir, tag=None):
+    """
+    Convert ZeRO 2 or 3 checkpoint into a single fp32 consolidated state_dict that can be loaded with
+    ``load_state_dict()`` and used for training without DeepSpeed or shared with others, for example
+    via a model hub.
+    Args:
+        - ``checkpoint_dir``: path to the desired checkpoint folder
+        - ``tag``: checkpoint tag used as a unique identifier for checkpoint. If not provided will attempt to load tag in 'latest' file. e.g., ``global_step14``
+    Returns:
+        - pytorch ``state_dict``
+    Note: this approach may not work if your application doesn't have sufficient free CPU memory and
+    you may need to use the offline approach using the ``zero_to_fp32.py`` script that is saved with
+    the checkpoint.
+    A typical usage might be ::
+        from deepspeed.utils.zero_to_fp32 import get_fp32_state_dict_from_zero_checkpoint
+        # do the training and checkpoint saving
+        state_dict = get_fp32_state_dict_from_zero_checkpoint(checkpoint_dir) # already on cpu
+        model = model.cpu() # move to cpu
+        model.load_state_dict(state_dict)
+        # submit to model hub or save the model to share with others
+    In this example the ``model`` will no longer be usable in the deepspeed context of the same
+    application. i.e. you will need to re-initialize the deepspeed engine, since
+    ``model.load_state_dict(state_dict)`` will remove all the deepspeed magic from it.
+    If you want it all done for you, use ``load_state_dict_from_zero_checkpoint`` instead.
+    """
+    if tag is None:
+        latest_path = os.path.join(checkpoint_dir, 'latest')
+        if os.path.isfile(latest_path):
+            with open(latest_path, 'r') as fd:
+                tag = fd.read().strip()
+        else:
+            raise ValueError(f"Unable to find 'latest' file at {latest_path}")
+    ds_checkpoint_dir = os.path.join(checkpoint_dir, tag)
+    if not os.path.isdir(ds_checkpoint_dir):
+        raise FileNotFoundError(f"Directory '{ds_checkpoint_dir}' doesn't exist")
+    return _get_fp32_state_dict_from_zero_checkpoint(ds_checkpoint_dir)
+def convert_zero_checkpoint_to_fp32_state_dict(checkpoint_dir, output_file, tag=None):
+    """
+    Convert ZeRO 2 or 3 checkpoint into a single fp32 consolidated ``state_dict`` file that can be
+    loaded with ``torch.load(file)`` + ``load_state_dict()`` and used for training without DeepSpeed.
+    Args:
+        - ``checkpoint_dir``: path to the desired checkpoint folder. (one that contains the tag-folder, like ``global_step14``)
+        - ``output_file``: path to the pytorch fp32 state_dict output file (e.g. path/pytorch_model.bin)
+        - ``tag``: checkpoint tag used as a unique identifier for checkpoint. If not provided will attempt to load tag in the file named ``latest`` in the checkpoint folder, e.g., ``global_step14``
+    """
+    state_dict = get_fp32_state_dict_from_zero_checkpoint(checkpoint_dir, tag)
+    print(f"Saving fp32 state dict to {output_file}")
+    torch.save(state_dict, output_file)
+def load_state_dict_from_zero_checkpoint(model, checkpoint_dir, tag=None):
+    """
+    1. Put the provided model to cpu
+    2. Convert ZeRO 2 or 3 checkpoint into a single fp32 consolidated ``state_dict``
+    3. Load it into the provided model
+    Args:
+        - ``model``: the model object to update
+        - ``checkpoint_dir``: path to the desired checkpoint folder. (one that contains the tag-folder, like ``global_step14``)
+        - ``tag``: checkpoint tag used as a unique identifier for checkpoint. If not provided will attempt to load tag in the file named ``latest`` in the checkpoint folder, e.g., ``global_step14``
+    Returns:
+        - ``model`: modified model
+    Make sure you have plenty of CPU memory available before you call this function. If you don't
+    have enough use the ``zero_to_fp32.py`` utility to do the conversion. You will find it
+    conveniently placed for you in the checkpoint folder.
+    A typical usage might be ::
+        from deepspeed.utils.zero_to_fp32 import load_state_dict_from_zero_checkpoint
+        model = load_state_dict_from_zero_checkpoint(trainer.model, checkpoint_dir)
+        # submit to model hub or save the model to share with others
+    Note, that once this was run, the ``model`` will no longer be usable in the deepspeed context
+    of the same application. i.e. you will need to re-initialize the deepspeed engine, since
+    ``model.load_state_dict(state_dict)`` will remove all the deepspeed magic from it.
+    """
+    logger.info(f"Extracting fp32 weights")
+    state_dict = get_fp32_state_dict_from_zero_checkpoint(checkpoint_dir, tag)
+    logger.info(f"Overwriting model with fp32 weights")
+    model = model.cpu()
+    model.load_state_dict(state_dict, strict=False)
+    return model
+if __name__ == "__main__":
+    parser = argparse.ArgumentParser()
+    parser.add_argument("checkpoint_dir",
+                        type=str,
+                        help="path to the desired checkpoint folder, e.g., path/checkpoint-12")
+    parser.add_argument(
+        "output_file",
+        type=str,
+        help="path to the pytorch fp32 state_dict output file (e.g. path/checkpoint-12/pytorch_model.bin)")
+    parser.add_argument("-t",
+                        "--tag",
+                        type=str,
+                        default=None,
+                        help="checkpoint tag used as a unique identifier for checkpoint. e.g., global_step1")
+    parser.add_argument("-d", "--debug", action='store_true', help="enable debug")
+    args = parser.parse_args()
+    debug = args.debug
+    convert_zero_checkpoint_to_fp32_state_dict(args.checkpoint_dir, args.output_file, tag=args.tag)