diff --git a/llavaqwen2-google_siglip-so400m-patch14-384-Qwen_Qwen2-7B-Instruct-AG_v5_3_split00_all_mm_tune_olora256_512_llm/README.md b/llavaqwen2-google_siglip-so400m-patch14-384-Qwen_Qwen2-7B-Instruct-AG_v5_3_split00_all_mm_tune_olora256_512_llm/README.md new file mode 100644 index 0000000000000000000000000000000000000000..d4576fe074287232d3836bf69c21d3f2593290d9 --- /dev/null +++ b/llavaqwen2-google_siglip-so400m-patch14-384-Qwen_Qwen2-7B-Instruct-AG_v5_3_split00_all_mm_tune_olora256_512_llm/README.md @@ -0,0 +1,9 @@ +--- +library_name: peft +--- +## Training procedure + +### Framework versions + + +- PEFT 0.4.0 diff --git a/llavaqwen2-google_siglip-so400m-patch14-384-Qwen_Qwen2-7B-Instruct-AG_v5_3_split00_all_mm_tune_olora256_512_llm/adapter_config.json b/llavaqwen2-google_siglip-so400m-patch14-384-Qwen_Qwen2-7B-Instruct-AG_v5_3_split00_all_mm_tune_olora256_512_llm/adapter_config.json new file mode 100644 index 0000000000000000000000000000000000000000..e1cb137dcd67002024395b7d0ec5f27ba099057b --- /dev/null +++ b/llavaqwen2-google_siglip-so400m-patch14-384-Qwen_Qwen2-7B-Instruct-AG_v5_3_split00_all_mm_tune_olora256_512_llm/adapter_config.json @@ -0,0 +1,26 @@ +{ + "auto_mapping": null, + "base_model_name_or_path": "lmms-lab/LLaVA-Video-7B-Qwen2", + "bias": "none", + "fan_in_fan_out": false, + "inference_mode": true, + "init_lora_weights": "olora", + "layers_pattern": null, + "layers_to_transform": null, + "lora_alpha": 512, + "lora_dropout": 0.05, + "modules_to_save": null, + "peft_type": "LORA", + "r": 256, + "revision": null, + "target_modules": [ + "v_proj", + "down_proj", + "k_proj", + "o_proj", + "q_proj", + "up_proj", + "gate_proj" + ], + "task_type": "CAUSAL_LM" +} \ No newline at end of file diff --git a/llavaqwen2-google_siglip-so400m-patch14-384-Qwen_Qwen2-7B-Instruct-AG_v5_3_split00_all_mm_tune_olora256_512_llm/adapter_model.bin b/llavaqwen2-google_siglip-so400m-patch14-384-Qwen_Qwen2-7B-Instruct-AG_v5_3_split00_all_mm_tune_olora256_512_llm/adapter_model.bin new file mode 100644 index 0000000000000000000000000000000000000000..a819e9f89f8f640aca9dae9b62ccaec1c53d904c --- /dev/null +++ b/llavaqwen2-google_siglip-so400m-patch14-384-Qwen_Qwen2-7B-Instruct-AG_v5_3_split00_all_mm_tune_olora256_512_llm/adapter_model.bin @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:c9ed0c732db686c9cc96998774e6c637420d6e91dd9fa322165ad0fb7df98a35 +size 1384057050 diff --git a/llavaqwen2-google_siglip-so400m-patch14-384-Qwen_Qwen2-7B-Instruct-AG_v5_3_split00_all_mm_tune_olora256_512_llm/config.json b/llavaqwen2-google_siglip-so400m-patch14-384-Qwen_Qwen2-7B-Instruct-AG_v5_3_split00_all_mm_tune_olora256_512_llm/config.json new file mode 100644 index 0000000000000000000000000000000000000000..1a67f02d0063c3de7740207b9ab2a3eb7be1cbe3 --- /dev/null +++ b/llavaqwen2-google_siglip-so400m-patch14-384-Qwen_Qwen2-7B-Instruct-AG_v5_3_split00_all_mm_tune_olora256_512_llm/config.json @@ -0,0 +1,221 @@ +{ + "_name_or_path": "lmms-lab/LLaVA-Video-7B-Qwen2", + "add_faster_video": false, + "add_time_instruction": true, + "architectures": [ + "LlavaQwenForCausalLM" + ], + "attention_dropout": 0.0, + "bos_token_id": 151643, + "eos_token_id": 151645, + "faster_token_stride": 10, + "force_sample": true, + "hidden_act": "silu", + "hidden_size": 3584, + "ignore_index": -100, + "image_aspect_ratio": "anyres_max_9", + "image_crop_resolution": null, + "image_grid_pinpoints": [ + [ + 384, + 384 + ], + [ + 384, + 768 + ], + [ + 384, + 1152 + ], + [ + 384, + 1536 + ], + [ + 384, + 1920 + ], + [ + 384, + 2304 + ], + [ + 768, + 384 + ], + [ + 768, + 768 + ], + [ + 768, + 1152 + ], + [ + 768, + 1536 + ], + [ + 768, + 1920 + ], + [ + 768, + 2304 + ], + [ + 1152, + 384 + ], + [ + 1152, + 768 + ], + [ + 1152, + 1152 + ], + [ + 1152, + 1536 + ], + [ + 1152, + 1920 + ], + [ + 1152, + 2304 + ], + [ + 1536, + 384 + ], + [ + 1536, + 768 + ], + [ + 1536, + 1152 + ], + [ + 1536, + 1536 + ], + [ + 1536, + 1920 + ], + [ + 1536, + 2304 + ], + [ + 1920, + 384 + ], + [ + 1920, + 768 + ], + [ + 1920, + 1152 + ], + [ + 1920, + 1536 + ], + [ + 1920, + 1920 + ], + [ + 1920, + 2304 + ], + [ + 2304, + 384 + ], + [ + 2304, + 768 + ], + [ + 2304, + 1152 + ], + [ + 2304, + 1536 + ], + [ + 2304, + 1920 + ], + [ + 2304, + 2304 + ] + ], + "image_split_resolution": null, + "image_token_index": 151646, + "initializer_range": 0.02, + "intermediate_size": 18944, + "max_position_embeddings": 32768, + "max_window_layers": 28, + "mm_hidden_size": 1152, + "mm_newline_position": "grid", + "mm_patch_merge_type": "spatial_unpad", + "mm_projector_lr": 2e-05, + "mm_projector_type": "mlp2x_gelu", + "mm_resampler_type": null, + "mm_spatial_pool_mode": "bilinear", + "mm_spatial_pool_stride": 2, + "mm_tunable_parts": "mm_vision_tower,mm_mlp_adapter,mm_language_model", + "mm_use_im_patch_token": false, + "mm_use_im_start_end": false, + "mm_vision_select_feature": "patch", + "mm_vision_select_layer": -2, + "mm_vision_tower": "google/siglip-so400m-patch14-384", + "mm_vision_tower_lr": null, + "model_type": "llava", + "num_attention_heads": 28, + "num_hidden_layers": 28, + "num_key_value_heads": 4, + "pos_skipping_range": 4096, + "projector_hidden_act": "gelu", + "rms_norm_eps": 1e-06, + "rope_scaling": null, + "rope_theta": 1000000.0, + "sliding_window": 131072, + "text_config": { + "model_type": "llama" + }, + "tie_word_embeddings": false, + "tokenizer_model_max_length": 32768, + "tokenizer_padding_side": "right", + "torch_dtype": "bfloat16", + "transformers_version": "4.40.0.dev0", + "use_cache": true, + "use_mm_proj": true, + "use_pos_skipping": false, + "use_sliding_window": false, + "vision_config": { + "hidden_size": 1024, + "image_size": 336, + "intermediate_size": 4096, + "model_type": "clip_vision_model", + "num_attention_heads": 16, + "num_hidden_layers": 24, + "patch_size": 14, + "projection_dim": 768, + "vocab_size": 32000 + }, + "vision_feature_layer": -2, + "vision_feature_select_strategy": "default", + "vision_tower_pretrained": null +} diff --git a/llavaqwen2-google_siglip-so400m-patch14-384-Qwen_Qwen2-7B-Instruct-AG_v5_3_split00_all_mm_tune_olora256_512_llm/generation_config.json b/llavaqwen2-google_siglip-so400m-patch14-384-Qwen_Qwen2-7B-Instruct-AG_v5_3_split00_all_mm_tune_olora256_512_llm/generation_config.json new file mode 100644 index 0000000000000000000000000000000000000000..19a297221acb87418d4388a3decef2282c6d7316 --- /dev/null +++ b/llavaqwen2-google_siglip-so400m-patch14-384-Qwen_Qwen2-7B-Instruct-AG_v5_3_split00_all_mm_tune_olora256_512_llm/generation_config.json @@ -0,0 +1,14 @@ +{ + "bos_token_id": 151643, + "do_sample": true, + "eos_token_id": [ + 151645, + 151643 + ], + "pad_token_id": 151643, + "repetition_penalty": 1.05, + "temperature": 0.7, + "top_k": 20, + "top_p": 0.8, + "transformers_version": "4.40.0.dev0" +} diff --git a/llavaqwen2-google_siglip-so400m-patch14-384-Qwen_Qwen2-7B-Instruct-AG_v5_3_split00_all_mm_tune_olora256_512_llm/non_lora_trainables.bin b/llavaqwen2-google_siglip-so400m-patch14-384-Qwen_Qwen2-7B-Instruct-AG_v5_3_split00_all_mm_tune_olora256_512_llm/non_lora_trainables.bin new file mode 100644 index 0000000000000000000000000000000000000000..7a032beeac98ae79c66c8b1418b282f65f052797 --- /dev/null +++ b/llavaqwen2-google_siglip-so400m-patch14-384-Qwen_Qwen2-7B-Instruct-AG_v5_3_split00_all_mm_tune_olora256_512_llm/non_lora_trainables.bin @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:0afc643d6c17a6e89d0a2528bebd61a99d7f47b7518d57a6787f55cfb1cc67fb +size 33964208 diff --git a/llavaqwen2-google_siglip-so400m-patch14-384-Qwen_Qwen2-7B-Instruct-AG_v5_3_split00_all_mm_tune_olora256_512_llm/trainer_state.json b/llavaqwen2-google_siglip-so400m-patch14-384-Qwen_Qwen2-7B-Instruct-AG_v5_3_split00_all_mm_tune_olora256_512_llm/trainer_state.json new file mode 100644 index 0000000000000000000000000000000000000000..aa11d31093310a3cd11eb7634f7927e53c8a4e18 --- /dev/null +++ b/llavaqwen2-google_siglip-so400m-patch14-384-Qwen_Qwen2-7B-Instruct-AG_v5_3_split00_all_mm_tune_olora256_512_llm/trainer_state.json @@ -0,0 +1,3530 @@ +{ + "best_metric": null, + "best_model_checkpoint": null, + "epoch": 1.0, + "eval_steps": 500, + "global_step": 500, + "is_hyper_param_search": false, + "is_local_process_zero": true, + "is_world_process_zero": true, + "log_history": [ + { + "epoch": 0.0, + "grad_norm": 1.928880050484433, + "learning_rate": 6.666666666666667e-07, + "loss": 0.4907, + "step": 1 + }, + { + "epoch": 0.0, + "grad_norm": 1.588300236671549, + "learning_rate": 1.3333333333333334e-06, + "loss": 0.4211, + "step": 2 + }, + { + "epoch": 0.01, + "grad_norm": 3.339991984294469, + "learning_rate": 2.0000000000000003e-06, + "loss": 0.7389, + "step": 3 + }, + { + "epoch": 0.01, + "grad_norm": 5.970866801586073, + "learning_rate": 2.666666666666667e-06, + "loss": 1.5158, + "step": 4 + }, + { + "epoch": 0.01, + "grad_norm": 1.847088910892997, + "learning_rate": 3.3333333333333333e-06, + "loss": 0.5582, + "step": 5 + }, + { + "epoch": 0.01, + "grad_norm": 1.902184023132924, + "learning_rate": 4.000000000000001e-06, + "loss": 0.5615, + "step": 6 + }, + { + "epoch": 0.01, + "grad_norm": 2.1103083869089794, + "learning_rate": 4.666666666666667e-06, + "loss": 0.5533, + "step": 7 + }, + { + "epoch": 0.02, + "grad_norm": 1.3753084815529804, + "learning_rate": 5.333333333333334e-06, + "loss": 0.3678, + "step": 8 + }, + { + "epoch": 0.02, + "grad_norm": 1.6008535027371964, + "learning_rate": 6e-06, + "loss": 0.4722, + "step": 9 + }, + { + "epoch": 0.02, + "grad_norm": 1.4125509496168551, + "learning_rate": 6.666666666666667e-06, + "loss": 0.4045, + "step": 10 + }, + { + "epoch": 0.02, + "grad_norm": 2.7161654242431323, + "learning_rate": 7.333333333333333e-06, + "loss": 0.7498, + "step": 11 + }, + { + "epoch": 0.02, + "grad_norm": 1.402522894370017, + "learning_rate": 8.000000000000001e-06, + "loss": 0.4185, + "step": 12 + }, + { + "epoch": 0.03, + "grad_norm": 1.513671206638328, + "learning_rate": 8.666666666666668e-06, + "loss": 0.309, + "step": 13 + }, + { + "epoch": 0.03, + "grad_norm": 0.9819893353895838, + "learning_rate": 9.333333333333334e-06, + "loss": 0.2349, + "step": 14 + }, + { + "epoch": 0.03, + "grad_norm": 1.3027073556008346, + "learning_rate": 1e-05, + "loss": 0.3234, + "step": 15 + }, + { + "epoch": 0.03, + "grad_norm": 1.4381709175193402, + "learning_rate": 9.999895105006995e-06, + "loss": 0.2945, + "step": 16 + }, + { + "epoch": 0.03, + "grad_norm": 2.052815018764843, + "learning_rate": 9.99958042442916e-06, + "loss": 0.3493, + "step": 17 + }, + { + "epoch": 0.04, + "grad_norm": 1.6196920005213762, + "learning_rate": 9.999055971469864e-06, + "loss": 0.2761, + "step": 18 + }, + { + "epoch": 0.04, + "grad_norm": 1.4712226066536305, + "learning_rate": 9.998321768134101e-06, + "loss": 0.2012, + "step": 19 + }, + { + "epoch": 0.04, + "grad_norm": 0.8568837598037423, + "learning_rate": 9.997377845227577e-06, + "loss": 0.1712, + "step": 20 + }, + { + "epoch": 0.04, + "grad_norm": 0.9693693191969218, + "learning_rate": 9.9962242423554e-06, + "loss": 0.2447, + "step": 21 + }, + { + "epoch": 0.04, + "grad_norm": 0.9895575629794001, + "learning_rate": 9.99486100792044e-06, + "loss": 0.1854, + "step": 22 + }, + { + "epoch": 0.05, + "grad_norm": 1.7594217480912717, + "learning_rate": 9.993288199121283e-06, + "loss": 0.1996, + "step": 23 + }, + { + "epoch": 0.05, + "grad_norm": 2.0192540791922693, + "learning_rate": 9.991505881949837e-06, + "loss": 0.3051, + "step": 24 + }, + { + "epoch": 0.05, + "grad_norm": 1.3919378979755264, + "learning_rate": 9.98951413118856e-06, + "loss": 0.1417, + "step": 25 + }, + { + "epoch": 0.05, + "grad_norm": 0.7508302307462176, + "learning_rate": 9.987313030407325e-06, + "loss": 0.1757, + "step": 26 + }, + { + "epoch": 0.05, + "grad_norm": 1.0172169036173535, + "learning_rate": 9.984902671959911e-06, + "loss": 0.2013, + "step": 27 + }, + { + "epoch": 0.06, + "grad_norm": 1.3428007732278717, + "learning_rate": 9.982283156980133e-06, + "loss": 0.1983, + "step": 28 + }, + { + "epoch": 0.06, + "grad_norm": 0.7539923799727971, + "learning_rate": 9.979454595377594e-06, + "loss": 0.1219, + "step": 29 + }, + { + "epoch": 0.06, + "grad_norm": 0.8659715135984464, + "learning_rate": 9.97641710583307e-06, + "loss": 0.1899, + "step": 30 + }, + { + "epoch": 0.06, + "grad_norm": 0.7710723460098894, + "learning_rate": 9.973170815793543e-06, + "loss": 0.134, + "step": 31 + }, + { + "epoch": 0.06, + "grad_norm": 0.7935213054318757, + "learning_rate": 9.969715861466839e-06, + "loss": 0.1215, + "step": 32 + }, + { + "epoch": 0.07, + "grad_norm": 1.0478419132907202, + "learning_rate": 9.966052387815923e-06, + "loss": 0.1752, + "step": 33 + }, + { + "epoch": 0.07, + "grad_norm": 1.056628710250906, + "learning_rate": 9.962180548552812e-06, + "loss": 0.1691, + "step": 34 + }, + { + "epoch": 0.07, + "grad_norm": 0.6599623594758851, + "learning_rate": 9.958100506132127e-06, + "loss": 0.11, + "step": 35 + }, + { + "epoch": 0.07, + "grad_norm": 0.8740425578465124, + "learning_rate": 9.953812431744274e-06, + "loss": 0.1382, + "step": 36 + }, + { + "epoch": 0.07, + "grad_norm": 0.9871589222528848, + "learning_rate": 9.94931650530827e-06, + "loss": 0.1378, + "step": 37 + }, + { + "epoch": 0.08, + "grad_norm": 1.0965428723520145, + "learning_rate": 9.944612915464183e-06, + "loss": 0.1911, + "step": 38 + }, + { + "epoch": 0.08, + "grad_norm": 0.8288408100645241, + "learning_rate": 9.93970185956522e-06, + "loss": 0.1526, + "step": 39 + }, + { + "epoch": 0.08, + "grad_norm": 0.8489303067819424, + "learning_rate": 9.934583543669454e-06, + "loss": 0.1406, + "step": 40 + }, + { + "epoch": 0.08, + "grad_norm": 1.4514902977171549, + "learning_rate": 9.929258182531167e-06, + "loss": 0.1848, + "step": 41 + }, + { + "epoch": 0.08, + "grad_norm": 0.8064037496605132, + "learning_rate": 9.923725999591846e-06, + "loss": 0.108, + "step": 42 + }, + { + "epoch": 0.09, + "grad_norm": 0.7308253552869113, + "learning_rate": 9.917987226970811e-06, + "loss": 0.13, + "step": 43 + }, + { + "epoch": 0.09, + "grad_norm": 0.641530457475072, + "learning_rate": 9.912042105455462e-06, + "loss": 0.1276, + "step": 44 + }, + { + "epoch": 0.09, + "grad_norm": 0.8300848785648857, + "learning_rate": 9.905890884491196e-06, + "loss": 0.125, + "step": 45 + }, + { + "epoch": 0.09, + "grad_norm": 0.6144652207379732, + "learning_rate": 9.899533822170922e-06, + "loss": 0.117, + "step": 46 + }, + { + "epoch": 0.09, + "grad_norm": 0.8180138707863049, + "learning_rate": 9.892971185224244e-06, + "loss": 0.1293, + "step": 47 + }, + { + "epoch": 0.1, + "grad_norm": 0.9351068141307641, + "learning_rate": 9.886203249006265e-06, + "loss": 0.1322, + "step": 48 + }, + { + "epoch": 0.1, + "grad_norm": 0.7711243311834907, + "learning_rate": 9.879230297486034e-06, + "loss": 0.1266, + "step": 49 + }, + { + "epoch": 0.1, + "grad_norm": 1.2380701876377125, + "learning_rate": 9.872052623234632e-06, + "loss": 0.1255, + "step": 50 + }, + { + "epoch": 0.1, + "grad_norm": 1.203808913144882, + "learning_rate": 9.864670527412891e-06, + "loss": 0.1921, + "step": 51 + }, + { + "epoch": 0.1, + "grad_norm": 1.2896863465751387, + "learning_rate": 9.857084319758772e-06, + "loss": 0.1581, + "step": 52 + }, + { + "epoch": 0.11, + "grad_norm": 0.7548162165103374, + "learning_rate": 9.849294318574353e-06, + "loss": 0.1463, + "step": 53 + }, + { + "epoch": 0.11, + "grad_norm": 1.0334367656162968, + "learning_rate": 9.841300850712479e-06, + "loss": 0.1332, + "step": 54 + }, + { + "epoch": 0.11, + "grad_norm": 0.6592483676317302, + "learning_rate": 9.833104251563058e-06, + "loss": 0.1155, + "step": 55 + }, + { + "epoch": 0.11, + "grad_norm": 0.7856988079219412, + "learning_rate": 9.824704865038967e-06, + "loss": 0.1223, + "step": 56 + }, + { + "epoch": 0.11, + "grad_norm": 0.9841576867137853, + "learning_rate": 9.816103043561648e-06, + "loss": 0.1457, + "step": 57 + }, + { + "epoch": 0.12, + "grad_norm": 0.6806196475936801, + "learning_rate": 9.807299148046301e-06, + "loss": 0.1189, + "step": 58 + }, + { + "epoch": 0.12, + "grad_norm": 0.8763554375141681, + "learning_rate": 9.798293547886748e-06, + "loss": 0.0842, + "step": 59 + }, + { + "epoch": 0.12, + "grad_norm": 1.4191117216965232, + "learning_rate": 9.789086620939936e-06, + "loss": 0.1708, + "step": 60 + }, + { + "epoch": 0.12, + "grad_norm": 1.0979618364115356, + "learning_rate": 9.779678753510082e-06, + "loss": 0.1723, + "step": 61 + }, + { + "epoch": 0.12, + "grad_norm": 0.7699797592772706, + "learning_rate": 9.770070340332457e-06, + "loss": 0.1287, + "step": 62 + }, + { + "epoch": 0.13, + "grad_norm": 0.8877764874268894, + "learning_rate": 9.76026178455684e-06, + "loss": 0.1876, + "step": 63 + }, + { + "epoch": 0.13, + "grad_norm": 0.6994853026271824, + "learning_rate": 9.75025349773058e-06, + "loss": 0.136, + "step": 64 + }, + { + "epoch": 0.13, + "grad_norm": 0.9216021155237354, + "learning_rate": 9.740045899781353e-06, + "loss": 0.1292, + "step": 65 + }, + { + "epoch": 0.13, + "grad_norm": 0.7453943118920977, + "learning_rate": 9.729639418999524e-06, + "loss": 0.155, + "step": 66 + }, + { + "epoch": 0.13, + "grad_norm": 0.79773477245724, + "learning_rate": 9.719034492020183e-06, + "loss": 0.1769, + "step": 67 + }, + { + "epoch": 0.14, + "grad_norm": 0.6847427502446188, + "learning_rate": 9.708231563804828e-06, + "loss": 0.0942, + "step": 68 + }, + { + "epoch": 0.14, + "grad_norm": 1.2119073354718934, + "learning_rate": 9.697231087622691e-06, + "loss": 0.1372, + "step": 69 + }, + { + "epoch": 0.14, + "grad_norm": 0.5694650059673866, + "learning_rate": 9.68603352503172e-06, + "loss": 0.0993, + "step": 70 + }, + { + "epoch": 0.14, + "grad_norm": 0.39819093816684803, + "learning_rate": 9.674639345859213e-06, + "loss": 0.0916, + "step": 71 + }, + { + "epoch": 0.14, + "grad_norm": 0.7362844200692608, + "learning_rate": 9.663049028182112e-06, + "loss": 0.1288, + "step": 72 + }, + { + "epoch": 0.15, + "grad_norm": 0.48385101980537876, + "learning_rate": 9.651263058306932e-06, + "loss": 0.1037, + "step": 73 + }, + { + "epoch": 0.15, + "grad_norm": 0.7838597210641812, + "learning_rate": 9.639281930749363e-06, + "loss": 0.1471, + "step": 74 + }, + { + "epoch": 0.15, + "grad_norm": 0.9663173755107086, + "learning_rate": 9.627106148213521e-06, + "loss": 0.1437, + "step": 75 + }, + { + "epoch": 0.15, + "grad_norm": 0.4796822883173506, + "learning_rate": 9.61473622157086e-06, + "loss": 0.0926, + "step": 76 + }, + { + "epoch": 0.15, + "grad_norm": 4.155197024481699, + "learning_rate": 9.602172669838721e-06, + "loss": 0.2146, + "step": 77 + }, + { + "epoch": 0.16, + "grad_norm": 1.1639055084652334, + "learning_rate": 9.589416020158577e-06, + "loss": 0.1724, + "step": 78 + }, + { + "epoch": 0.16, + "grad_norm": 3.547966633302887, + "learning_rate": 9.5764668077739e-06, + "loss": 0.1084, + "step": 79 + }, + { + "epoch": 0.16, + "grad_norm": 0.8814626046410395, + "learning_rate": 9.563325576007702e-06, + "loss": 0.1372, + "step": 80 + }, + { + "epoch": 0.16, + "grad_norm": 0.6210765018596596, + "learning_rate": 9.549992876239753e-06, + "loss": 0.0694, + "step": 81 + }, + { + "epoch": 0.16, + "grad_norm": 1.110500549491553, + "learning_rate": 9.536469267883432e-06, + "loss": 0.1839, + "step": 82 + }, + { + "epoch": 0.17, + "grad_norm": 1.2471118804751122, + "learning_rate": 9.52275531836226e-06, + "loss": 0.1467, + "step": 83 + }, + { + "epoch": 0.17, + "grad_norm": 0.9123055530527371, + "learning_rate": 9.508851603086094e-06, + "loss": 0.1337, + "step": 84 + }, + { + "epoch": 0.17, + "grad_norm": 0.9805374180275667, + "learning_rate": 9.494758705426978e-06, + "loss": 0.1071, + "step": 85 + }, + { + "epoch": 0.17, + "grad_norm": 0.5789811708800969, + "learning_rate": 9.480477216694674e-06, + "loss": 0.1159, + "step": 86 + }, + { + "epoch": 0.17, + "grad_norm": 0.7790018124634032, + "learning_rate": 9.466007736111846e-06, + "loss": 0.1393, + "step": 87 + }, + { + "epoch": 0.18, + "grad_norm": 0.7230480461407474, + "learning_rate": 9.451350870788922e-06, + "loss": 0.1321, + "step": 88 + }, + { + "epoch": 0.18, + "grad_norm": 0.82321440490818, + "learning_rate": 9.436507235698613e-06, + "loss": 0.1687, + "step": 89 + }, + { + "epoch": 0.18, + "grad_norm": 1.1456530878281221, + "learning_rate": 9.421477453650118e-06, + "loss": 0.1707, + "step": 90 + }, + { + "epoch": 0.18, + "grad_norm": 0.49974163825682083, + "learning_rate": 9.406262155262995e-06, + "loss": 0.0922, + "step": 91 + }, + { + "epoch": 0.18, + "grad_norm": 1.0660136843265906, + "learning_rate": 9.390861978940687e-06, + "loss": 0.1935, + "step": 92 + }, + { + "epoch": 0.19, + "grad_norm": 0.7010294818727274, + "learning_rate": 9.37527757084375e-06, + "loss": 0.1286, + "step": 93 + }, + { + "epoch": 0.19, + "grad_norm": 0.8687260383951586, + "learning_rate": 9.359509584862735e-06, + "loss": 0.1232, + "step": 94 + }, + { + "epoch": 0.19, + "grad_norm": 0.7431869719716694, + "learning_rate": 9.343558682590757e-06, + "loss": 0.1313, + "step": 95 + }, + { + "epoch": 0.19, + "grad_norm": 0.7125664703001464, + "learning_rate": 9.327425533295725e-06, + "loss": 0.1075, + "step": 96 + }, + { + "epoch": 0.19, + "grad_norm": 1.1341306934510087, + "learning_rate": 9.31111081389227e-06, + "loss": 0.1856, + "step": 97 + }, + { + "epoch": 0.2, + "grad_norm": 0.5920305603497104, + "learning_rate": 9.29461520891335e-06, + "loss": 0.0908, + "step": 98 + }, + { + "epoch": 0.2, + "grad_norm": 1.115166027446719, + "learning_rate": 9.277939410481507e-06, + "loss": 0.1228, + "step": 99 + }, + { + "epoch": 0.2, + "grad_norm": 0.9285199941811209, + "learning_rate": 9.261084118279846e-06, + "loss": 0.1757, + "step": 100 + }, + { + "epoch": 0.2, + "grad_norm": 1.3218391768689233, + "learning_rate": 9.244050039522673e-06, + "loss": 0.1563, + "step": 101 + }, + { + "epoch": 0.2, + "grad_norm": 0.8936925872658409, + "learning_rate": 9.226837888925813e-06, + "loss": 0.1381, + "step": 102 + }, + { + "epoch": 0.21, + "grad_norm": 0.616864481202159, + "learning_rate": 9.209448388676636e-06, + "loss": 0.0965, + "step": 103 + }, + { + "epoch": 0.21, + "grad_norm": 0.5886766076497468, + "learning_rate": 9.191882268403743e-06, + "loss": 0.0995, + "step": 104 + }, + { + "epoch": 0.21, + "grad_norm": 0.7418873901004167, + "learning_rate": 9.174140265146356e-06, + "loss": 0.1268, + "step": 105 + }, + { + "epoch": 0.21, + "grad_norm": 0.6683463268452202, + "learning_rate": 9.156223123323405e-06, + "loss": 0.0945, + "step": 106 + }, + { + "epoch": 0.21, + "grad_norm": 0.8100037014458216, + "learning_rate": 9.13813159470227e-06, + "loss": 0.0965, + "step": 107 + }, + { + "epoch": 0.22, + "grad_norm": 0.5813603939289472, + "learning_rate": 9.119866438367263e-06, + "loss": 0.0625, + "step": 108 + }, + { + "epoch": 0.22, + "grad_norm": 0.6823981550941903, + "learning_rate": 9.101428420687759e-06, + "loss": 0.1, + "step": 109 + }, + { + "epoch": 0.22, + "grad_norm": 1.0685791474330528, + "learning_rate": 9.082818315286054e-06, + "loss": 0.1305, + "step": 110 + }, + { + "epoch": 0.22, + "grad_norm": 1.423832608761244, + "learning_rate": 9.0640369030049e-06, + "loss": 0.2443, + "step": 111 + }, + { + "epoch": 0.22, + "grad_norm": 0.7077886634021346, + "learning_rate": 9.045084971874738e-06, + "loss": 0.1361, + "step": 112 + }, + { + "epoch": 0.23, + "grad_norm": 0.8833141546052482, + "learning_rate": 9.025963317080641e-06, + "loss": 0.1054, + "step": 113 + }, + { + "epoch": 0.23, + "grad_norm": 0.7609054854017556, + "learning_rate": 9.006672740928952e-06, + "loss": 0.0669, + "step": 114 + }, + { + "epoch": 0.23, + "grad_norm": 0.8351944085373199, + "learning_rate": 8.987214052813605e-06, + "loss": 0.1025, + "step": 115 + }, + { + "epoch": 0.23, + "grad_norm": 0.5319790825694221, + "learning_rate": 8.967588069182184e-06, + "loss": 0.1023, + "step": 116 + }, + { + "epoch": 0.23, + "grad_norm": 0.6636541086727228, + "learning_rate": 8.947795613501658e-06, + "loss": 0.1215, + "step": 117 + }, + { + "epoch": 0.24, + "grad_norm": 0.5052715765394535, + "learning_rate": 8.927837516223824e-06, + "loss": 0.1127, + "step": 118 + }, + { + "epoch": 0.24, + "grad_norm": 0.8072207817927785, + "learning_rate": 8.907714614750473e-06, + "loss": 0.1361, + "step": 119 + }, + { + "epoch": 0.24, + "grad_norm": 0.9690886908335553, + "learning_rate": 8.887427753398249e-06, + "loss": 0.1899, + "step": 120 + }, + { + "epoch": 0.24, + "grad_norm": 0.7903552707748308, + "learning_rate": 8.866977783363219e-06, + "loss": 0.1321, + "step": 121 + }, + { + "epoch": 0.24, + "grad_norm": 1.0268300546563336, + "learning_rate": 8.846365562685178e-06, + "loss": 0.1555, + "step": 122 + }, + { + "epoch": 0.25, + "grad_norm": 0.6839614945715745, + "learning_rate": 8.825591956211614e-06, + "loss": 0.1055, + "step": 123 + }, + { + "epoch": 0.25, + "grad_norm": 0.8393856778492443, + "learning_rate": 8.804657835561456e-06, + "loss": 0.1667, + "step": 124 + }, + { + "epoch": 0.25, + "grad_norm": 0.7391743261975082, + "learning_rate": 8.783564079088478e-06, + "loss": 0.1233, + "step": 125 + }, + { + "epoch": 0.25, + "grad_norm": 0.8375549159216054, + "learning_rate": 8.762311571844453e-06, + "loss": 0.0841, + "step": 126 + }, + { + "epoch": 0.25, + "grad_norm": 0.578947985021812, + "learning_rate": 8.74090120554202e-06, + "loss": 0.0695, + "step": 127 + }, + { + "epoch": 0.26, + "grad_norm": 1.5941668469144197, + "learning_rate": 8.719333878517274e-06, + "loss": 0.181, + "step": 128 + }, + { + "epoch": 0.26, + "grad_norm": 0.5770276591153891, + "learning_rate": 8.697610495692055e-06, + "loss": 0.1127, + "step": 129 + }, + { + "epoch": 0.26, + "grad_norm": 0.6171453740689381, + "learning_rate": 8.675731968536004e-06, + "loss": 0.1386, + "step": 130 + }, + { + "epoch": 0.26, + "grad_norm": 1.138825520481436, + "learning_rate": 8.653699215028298e-06, + "loss": 0.15, + "step": 131 + }, + { + "epoch": 0.26, + "grad_norm": 1.0865950831257416, + "learning_rate": 8.63151315961915e-06, + "loss": 0.1578, + "step": 132 + }, + { + "epoch": 0.27, + "grad_norm": 0.9354551880186306, + "learning_rate": 8.609174733191012e-06, + "loss": 0.1444, + "step": 133 + }, + { + "epoch": 0.27, + "grad_norm": 0.521471359027822, + "learning_rate": 8.586684873019513e-06, + "loss": 0.0994, + "step": 134 + }, + { + "epoch": 0.27, + "grad_norm": 0.6629304093571784, + "learning_rate": 8.564044522734147e-06, + "loss": 0.0978, + "step": 135 + }, + { + "epoch": 0.27, + "grad_norm": 0.6831151476298498, + "learning_rate": 8.541254632278667e-06, + "loss": 0.0779, + "step": 136 + }, + { + "epoch": 0.27, + "grad_norm": 0.7367583952734872, + "learning_rate": 8.518316157871232e-06, + "loss": 0.0911, + "step": 137 + }, + { + "epoch": 0.28, + "grad_norm": 0.6590736711583746, + "learning_rate": 8.495230061964289e-06, + "loss": 0.1356, + "step": 138 + }, + { + "epoch": 0.28, + "grad_norm": 1.153818794835716, + "learning_rate": 8.471997313204183e-06, + "loss": 0.146, + "step": 139 + }, + { + "epoch": 0.28, + "grad_norm": 0.6008053964463435, + "learning_rate": 8.448618886390523e-06, + "loss": 0.0967, + "step": 140 + }, + { + "epoch": 0.28, + "grad_norm": 0.7825428576530492, + "learning_rate": 8.425095762435274e-06, + "loss": 0.1575, + "step": 141 + }, + { + "epoch": 0.28, + "grad_norm": 0.4077617053377632, + "learning_rate": 8.401428928321607e-06, + "loss": 0.082, + "step": 142 + }, + { + "epoch": 0.29, + "grad_norm": 0.6497568265720193, + "learning_rate": 8.377619377062483e-06, + "loss": 0.1463, + "step": 143 + }, + { + "epoch": 0.29, + "grad_norm": 0.6962679958043572, + "learning_rate": 8.353668107658984e-06, + "loss": 0.1304, + "step": 144 + }, + { + "epoch": 0.29, + "grad_norm": 0.5834749099786907, + "learning_rate": 8.329576125058406e-06, + "loss": 0.107, + "step": 145 + }, + { + "epoch": 0.29, + "grad_norm": 0.5503783852935425, + "learning_rate": 8.305344440112089e-06, + "loss": 0.124, + "step": 146 + }, + { + "epoch": 0.29, + "grad_norm": 0.7365506208146866, + "learning_rate": 8.280974069532999e-06, + "loss": 0.147, + "step": 147 + }, + { + "epoch": 0.3, + "grad_norm": 0.6978628022776826, + "learning_rate": 8.256466035853077e-06, + "loss": 0.102, + "step": 148 + }, + { + "epoch": 0.3, + "grad_norm": 0.7758804192060227, + "learning_rate": 8.231821367380335e-06, + "loss": 0.0845, + "step": 149 + }, + { + "epoch": 0.3, + "grad_norm": 0.5477123756526534, + "learning_rate": 8.207041098155701e-06, + "loss": 0.1259, + "step": 150 + }, + { + "epoch": 0.3, + "grad_norm": 0.6017501035266978, + "learning_rate": 8.182126267909642e-06, + "loss": 0.0587, + "step": 151 + }, + { + "epoch": 0.3, + "grad_norm": 0.7687312103332277, + "learning_rate": 8.157077922018537e-06, + "loss": 0.1451, + "step": 152 + }, + { + "epoch": 0.31, + "grad_norm": 1.0955298515155794, + "learning_rate": 8.13189711146081e-06, + "loss": 0.1656, + "step": 153 + }, + { + "epoch": 0.31, + "grad_norm": 0.5866736794773539, + "learning_rate": 8.106584892772844e-06, + "loss": 0.0905, + "step": 154 + }, + { + "epoch": 0.31, + "grad_norm": 0.7155438066725858, + "learning_rate": 8.081142328004638e-06, + "loss": 0.1528, + "step": 155 + }, + { + "epoch": 0.31, + "grad_norm": 0.6565395168851769, + "learning_rate": 8.055570484675252e-06, + "loss": 0.1251, + "step": 156 + }, + { + "epoch": 0.31, + "grad_norm": 1.2875502388618136, + "learning_rate": 8.029870435728018e-06, + "loss": 0.0913, + "step": 157 + }, + { + "epoch": 0.32, + "grad_norm": 0.5982546707253796, + "learning_rate": 8.004043259485519e-06, + "loss": 0.1115, + "step": 158 + }, + { + "epoch": 0.32, + "grad_norm": 0.46008235561169636, + "learning_rate": 7.978090039604342e-06, + "loss": 0.087, + "step": 159 + }, + { + "epoch": 0.32, + "grad_norm": 0.43711300108945894, + "learning_rate": 7.952011865029614e-06, + "loss": 0.1184, + "step": 160 + }, + { + "epoch": 0.32, + "grad_norm": 1.918267884953489, + "learning_rate": 7.925809829949312e-06, + "loss": 0.1293, + "step": 161 + }, + { + "epoch": 0.32, + "grad_norm": 0.7265832047624626, + "learning_rate": 7.89948503374835e-06, + "loss": 0.1569, + "step": 162 + }, + { + "epoch": 0.33, + "grad_norm": 0.5409486213897078, + "learning_rate": 7.873038580962453e-06, + "loss": 0.0966, + "step": 163 + }, + { + "epoch": 0.33, + "grad_norm": 0.41989818162281206, + "learning_rate": 7.846471581231814e-06, + "loss": 0.0985, + "step": 164 + }, + { + "epoch": 0.33, + "grad_norm": 0.8624237182282338, + "learning_rate": 7.819785149254534e-06, + "loss": 0.1495, + "step": 165 + }, + { + "epoch": 0.33, + "grad_norm": 1.3427024418979756, + "learning_rate": 7.792980404739849e-06, + "loss": 0.1465, + "step": 166 + }, + { + "epoch": 0.33, + "grad_norm": 0.6747523799681433, + "learning_rate": 7.766058472361154e-06, + "loss": 0.1482, + "step": 167 + }, + { + "epoch": 0.34, + "grad_norm": 0.583861344914313, + "learning_rate": 7.739020481708816e-06, + "loss": 0.1241, + "step": 168 + }, + { + "epoch": 0.34, + "grad_norm": 0.39792159088639656, + "learning_rate": 7.711867567242769e-06, + "loss": 0.078, + "step": 169 + }, + { + "epoch": 0.34, + "grad_norm": 0.988133766491529, + "learning_rate": 7.68460086824492e-06, + "loss": 0.144, + "step": 170 + }, + { + "epoch": 0.34, + "grad_norm": 0.5438192691474706, + "learning_rate": 7.657221528771352e-06, + "loss": 0.1085, + "step": 171 + }, + { + "epoch": 0.34, + "grad_norm": 0.7255348589603168, + "learning_rate": 7.629730697604314e-06, + "loss": 0.095, + "step": 172 + }, + { + "epoch": 0.35, + "grad_norm": 0.6567578642812082, + "learning_rate": 7.602129528204023e-06, + "loss": 0.1258, + "step": 173 + }, + { + "epoch": 0.35, + "grad_norm": 0.6610744479736924, + "learning_rate": 7.574419178660269e-06, + "loss": 0.1004, + "step": 174 + }, + { + "epoch": 0.35, + "grad_norm": 2.069879470565798, + "learning_rate": 7.546600811643816e-06, + "loss": 0.1755, + "step": 175 + }, + { + "epoch": 0.35, + "grad_norm": 0.9032056389882546, + "learning_rate": 7.5186755943576324e-06, + "loss": 0.1846, + "step": 176 + }, + { + "epoch": 0.35, + "grad_norm": 0.8292236391024451, + "learning_rate": 7.490644698487909e-06, + "loss": 0.1637, + "step": 177 + }, + { + "epoch": 0.36, + "grad_norm": 0.513027858704895, + "learning_rate": 7.462509300154892e-06, + "loss": 0.1069, + "step": 178 + }, + { + "epoch": 0.36, + "grad_norm": 0.4333926139924782, + "learning_rate": 7.434270579863549e-06, + "loss": 0.1035, + "step": 179 + }, + { + "epoch": 0.36, + "grad_norm": 1.0347242731404998, + "learning_rate": 7.405929722454026e-06, + "loss": 0.145, + "step": 180 + }, + { + "epoch": 0.36, + "grad_norm": 0.5121541327494573, + "learning_rate": 7.3774879170519386e-06, + "loss": 0.119, + "step": 181 + }, + { + "epoch": 0.36, + "grad_norm": 0.4107383137309751, + "learning_rate": 7.348946357018479e-06, + "loss": 0.103, + "step": 182 + }, + { + "epoch": 0.37, + "grad_norm": 1.6738373632141648, + "learning_rate": 7.320306239900343e-06, + "loss": 0.1263, + "step": 183 + }, + { + "epoch": 0.37, + "grad_norm": 0.47232430178292983, + "learning_rate": 7.291568767379484e-06, + "loss": 0.0802, + "step": 184 + }, + { + "epoch": 0.37, + "grad_norm": 0.6445442067197152, + "learning_rate": 7.262735145222696e-06, + "loss": 0.0718, + "step": 185 + }, + { + "epoch": 0.37, + "grad_norm": 0.925502100538713, + "learning_rate": 7.233806583231012e-06, + "loss": 0.1209, + "step": 186 + }, + { + "epoch": 0.37, + "grad_norm": 0.45358994344115217, + "learning_rate": 7.204784295188959e-06, + "loss": 0.11, + "step": 187 + }, + { + "epoch": 0.38, + "grad_norm": 1.592074593684059, + "learning_rate": 7.1756694988136165e-06, + "loss": 0.1587, + "step": 188 + }, + { + "epoch": 0.38, + "grad_norm": 1.1993290449955363, + "learning_rate": 7.14646341570353e-06, + "loss": 0.1702, + "step": 189 + }, + { + "epoch": 0.38, + "grad_norm": 0.4325803651617202, + "learning_rate": 7.117167271287453e-06, + "loss": 0.079, + "step": 190 + }, + { + "epoch": 0.38, + "grad_norm": 0.8384022277203674, + "learning_rate": 7.0877822947729265e-06, + "loss": 0.1362, + "step": 191 + }, + { + "epoch": 0.38, + "grad_norm": 0.7303306509391421, + "learning_rate": 7.05830971909472e-06, + "loss": 0.1186, + "step": 192 + }, + { + "epoch": 0.39, + "grad_norm": 0.7516368588709544, + "learning_rate": 7.028750780863078e-06, + "loss": 0.0887, + "step": 193 + }, + { + "epoch": 0.39, + "grad_norm": 0.938542188821057, + "learning_rate": 6.999106720311846e-06, + "loss": 0.1444, + "step": 194 + }, + { + "epoch": 0.39, + "grad_norm": 0.8717472174419715, + "learning_rate": 6.969378781246436e-06, + "loss": 0.1435, + "step": 195 + }, + { + "epoch": 0.39, + "grad_norm": 1.67495945265085, + "learning_rate": 6.939568210991633e-06, + "loss": 0.1626, + "step": 196 + }, + { + "epoch": 0.39, + "grad_norm": 4.342119658716737, + "learning_rate": 6.9096762603392595e-06, + "loss": 0.1953, + "step": 197 + }, + { + "epoch": 0.4, + "grad_norm": 0.6263126368661642, + "learning_rate": 6.8797041834956955e-06, + "loss": 0.1336, + "step": 198 + }, + { + "epoch": 0.4, + "grad_norm": 0.5941737014361337, + "learning_rate": 6.849653238029261e-06, + "loss": 0.153, + "step": 199 + }, + { + "epoch": 0.4, + "grad_norm": 0.7886180708514274, + "learning_rate": 6.819524684817439e-06, + "loss": 0.1223, + "step": 200 + }, + { + "epoch": 0.4, + "grad_norm": 0.7039612451739138, + "learning_rate": 6.78931978799398e-06, + "loss": 0.1335, + "step": 201 + }, + { + "epoch": 0.4, + "grad_norm": 0.8084406855671586, + "learning_rate": 6.7590398148958625e-06, + "loss": 0.1678, + "step": 202 + }, + { + "epoch": 0.41, + "grad_norm": 0.9860973166784854, + "learning_rate": 6.728686036010115e-06, + "loss": 0.153, + "step": 203 + }, + { + "epoch": 0.41, + "grad_norm": 1.5916914892453895, + "learning_rate": 6.698259724920503e-06, + "loss": 0.1385, + "step": 204 + }, + { + "epoch": 0.41, + "grad_norm": 0.6092574184981456, + "learning_rate": 6.667762158254104e-06, + "loss": 0.11, + "step": 205 + }, + { + "epoch": 0.41, + "grad_norm": 0.7140207170863097, + "learning_rate": 6.637194615627733e-06, + "loss": 0.1186, + "step": 206 + }, + { + "epoch": 0.41, + "grad_norm": 0.5017767013539293, + "learning_rate": 6.6065583795942625e-06, + "loss": 0.1103, + "step": 207 + }, + { + "epoch": 0.42, + "grad_norm": 0.7894584955176154, + "learning_rate": 6.5758547355887944e-06, + "loss": 0.1462, + "step": 208 + }, + { + "epoch": 0.42, + "grad_norm": 0.8876268352844191, + "learning_rate": 6.545084971874738e-06, + "loss": 0.1354, + "step": 209 + }, + { + "epoch": 0.42, + "grad_norm": 0.5673653854310852, + "learning_rate": 6.514250379489754e-06, + "loss": 0.0658, + "step": 210 + }, + { + "epoch": 0.42, + "grad_norm": 0.36727540507660567, + "learning_rate": 6.483352252191585e-06, + "loss": 0.0771, + "step": 211 + }, + { + "epoch": 0.42, + "grad_norm": 0.8054289615117477, + "learning_rate": 6.452391886403767e-06, + "loss": 0.169, + "step": 212 + }, + { + "epoch": 0.43, + "grad_norm": 0.5441797169347325, + "learning_rate": 6.421370581161244e-06, + "loss": 0.1048, + "step": 213 + }, + { + "epoch": 0.43, + "grad_norm": 0.89874238353916, + "learning_rate": 6.390289638055851e-06, + "loss": 0.1357, + "step": 214 + }, + { + "epoch": 0.43, + "grad_norm": 0.6949933563598905, + "learning_rate": 6.3591503611817155e-06, + "loss": 0.1054, + "step": 215 + }, + { + "epoch": 0.43, + "grad_norm": 0.3751201335684607, + "learning_rate": 6.3279540570805265e-06, + "loss": 0.1042, + "step": 216 + }, + { + "epoch": 0.43, + "grad_norm": 0.47528910333657093, + "learning_rate": 6.296702034686726e-06, + "loss": 0.0769, + "step": 217 + }, + { + "epoch": 0.44, + "grad_norm": 0.8443699711520065, + "learning_rate": 6.265395605272581e-06, + "loss": 0.0964, + "step": 218 + }, + { + "epoch": 0.44, + "grad_norm": 0.3965451318034568, + "learning_rate": 6.234036082393171e-06, + "loss": 0.0664, + "step": 219 + }, + { + "epoch": 0.44, + "grad_norm": 0.8268942968030871, + "learning_rate": 6.202624781831269e-06, + "loss": 0.119, + "step": 220 + }, + { + "epoch": 0.44, + "grad_norm": 0.41911373347383746, + "learning_rate": 6.171163021542134e-06, + "loss": 0.0799, + "step": 221 + }, + { + "epoch": 0.44, + "grad_norm": 0.9614476837400521, + "learning_rate": 6.139652121598219e-06, + "loss": 0.1709, + "step": 222 + }, + { + "epoch": 0.45, + "grad_norm": 0.9056929475746165, + "learning_rate": 6.108093404133772e-06, + "loss": 0.1738, + "step": 223 + }, + { + "epoch": 0.45, + "grad_norm": 0.44336861675183054, + "learning_rate": 6.076488193289375e-06, + "loss": 0.0897, + "step": 224 + }, + { + "epoch": 0.45, + "grad_norm": 0.49980277000157225, + "learning_rate": 6.044837815156377e-06, + "loss": 0.1065, + "step": 225 + }, + { + "epoch": 0.45, + "grad_norm": 0.8501092910803394, + "learning_rate": 6.013143597721252e-06, + "loss": 0.1185, + "step": 226 + }, + { + "epoch": 0.45, + "grad_norm": 2.0820255027100942, + "learning_rate": 5.981406870809889e-06, + "loss": 0.1042, + "step": 227 + }, + { + "epoch": 0.46, + "grad_norm": 0.5020359740609432, + "learning_rate": 5.949628966031785e-06, + "loss": 0.1051, + "step": 228 + }, + { + "epoch": 0.46, + "grad_norm": 0.5128920826197394, + "learning_rate": 5.9178112167241805e-06, + "loss": 0.0827, + "step": 229 + }, + { + "epoch": 0.46, + "grad_norm": 0.45253579722167236, + "learning_rate": 5.885954957896115e-06, + "loss": 0.1069, + "step": 230 + }, + { + "epoch": 0.46, + "grad_norm": 0.872506236560932, + "learning_rate": 5.854061526172402e-06, + "loss": 0.16, + "step": 231 + }, + { + "epoch": 0.46, + "grad_norm": 0.42871376495620556, + "learning_rate": 5.822132259737565e-06, + "loss": 0.0795, + "step": 232 + }, + { + "epoch": 0.47, + "grad_norm": 0.524971430598943, + "learning_rate": 5.7901684982796716e-06, + "loss": 0.0714, + "step": 233 + }, + { + "epoch": 0.47, + "grad_norm": 1.5362145472199313, + "learning_rate": 5.75817158293414e-06, + "loss": 0.1626, + "step": 234 + }, + { + "epoch": 0.47, + "grad_norm": 0.6267996709551327, + "learning_rate": 5.726142856227453e-06, + "loss": 0.1052, + "step": 235 + }, + { + "epoch": 0.47, + "grad_norm": 0.661716060278592, + "learning_rate": 5.694083662020835e-06, + "loss": 0.1013, + "step": 236 + }, + { + "epoch": 0.47, + "grad_norm": 0.6582684822727328, + "learning_rate": 5.661995345453867e-06, + "loss": 0.1363, + "step": 237 + }, + { + "epoch": 0.48, + "grad_norm": 0.6930382608343246, + "learning_rate": 5.629879252888046e-06, + "loss": 0.1155, + "step": 238 + }, + { + "epoch": 0.48, + "grad_norm": 0.9097902075987594, + "learning_rate": 5.597736731850295e-06, + "loss": 0.1544, + "step": 239 + }, + { + "epoch": 0.48, + "grad_norm": 0.4492863280580798, + "learning_rate": 5.5655691309764225e-06, + "loss": 0.0981, + "step": 240 + }, + { + "epoch": 0.48, + "grad_norm": 0.36242889591820937, + "learning_rate": 5.533377799954532e-06, + "loss": 0.0699, + "step": 241 + }, + { + "epoch": 0.48, + "grad_norm": 0.8398730038170705, + "learning_rate": 5.501164089468406e-06, + "loss": 0.133, + "step": 242 + }, + { + "epoch": 0.49, + "grad_norm": 0.9695085003069862, + "learning_rate": 5.4689293511408155e-06, + "loss": 0.1703, + "step": 243 + }, + { + "epoch": 0.49, + "grad_norm": 0.838488275294773, + "learning_rate": 5.43667493747682e-06, + "loss": 0.1294, + "step": 244 + }, + { + "epoch": 0.49, + "grad_norm": 0.45164214007362696, + "learning_rate": 5.404402201807022e-06, + "loss": 0.0822, + "step": 245 + }, + { + "epoch": 0.49, + "grad_norm": 0.7344032699831387, + "learning_rate": 5.372112498230771e-06, + "loss": 0.135, + "step": 246 + }, + { + "epoch": 0.49, + "grad_norm": 0.7068287460456814, + "learning_rate": 5.339807181559359e-06, + "loss": 0.1137, + "step": 247 + }, + { + "epoch": 0.5, + "grad_norm": 0.43221336760669393, + "learning_rate": 5.307487607259175e-06, + "loss": 0.0663, + "step": 248 + }, + { + "epoch": 0.5, + "grad_norm": 0.6101399921737543, + "learning_rate": 5.275155131394825e-06, + "loss": 0.1045, + "step": 249 + }, + { + "epoch": 0.5, + "grad_norm": 0.6482460605334966, + "learning_rate": 5.242811110572243e-06, + "loss": 0.1245, + "step": 250 + }, + { + "epoch": 0.5, + "grad_norm": 0.5398676773671225, + "learning_rate": 5.210456901881761e-06, + "loss": 0.111, + "step": 251 + }, + { + "epoch": 0.5, + "grad_norm": 0.7645673370818892, + "learning_rate": 5.1780938628411795e-06, + "loss": 0.1363, + "step": 252 + }, + { + "epoch": 0.51, + "grad_norm": 0.5169589806284677, + "learning_rate": 5.145723351338799e-06, + "loss": 0.0993, + "step": 253 + }, + { + "epoch": 0.51, + "grad_norm": 0.7053804089435339, + "learning_rate": 5.11334672557645e-06, + "loss": 0.1194, + "step": 254 + }, + { + "epoch": 0.51, + "grad_norm": 0.5975589404773662, + "learning_rate": 5.080965344012509e-06, + "loss": 0.0934, + "step": 255 + }, + { + "epoch": 0.51, + "grad_norm": 0.41552303091522763, + "learning_rate": 5.048580565304887e-06, + "loss": 0.0972, + "step": 256 + }, + { + "epoch": 0.51, + "grad_norm": 0.5489918448360888, + "learning_rate": 5.016193748254045e-06, + "loss": 0.0751, + "step": 257 + }, + { + "epoch": 0.52, + "grad_norm": 2.22308617581959, + "learning_rate": 4.983806251745958e-06, + "loss": 0.1535, + "step": 258 + }, + { + "epoch": 0.52, + "grad_norm": 0.43444777406659896, + "learning_rate": 4.951419434695115e-06, + "loss": 0.0886, + "step": 259 + }, + { + "epoch": 0.52, + "grad_norm": 2.660673409335142, + "learning_rate": 4.919034655987493e-06, + "loss": 0.1149, + "step": 260 + }, + { + "epoch": 0.52, + "grad_norm": 0.3283375001173057, + "learning_rate": 4.886653274423551e-06, + "loss": 0.093, + "step": 261 + }, + { + "epoch": 0.52, + "grad_norm": 1.280151105288537, + "learning_rate": 4.8542766486612035e-06, + "loss": 0.1663, + "step": 262 + }, + { + "epoch": 0.53, + "grad_norm": 0.3123335199204402, + "learning_rate": 4.821906137158822e-06, + "loss": 0.0724, + "step": 263 + }, + { + "epoch": 0.53, + "grad_norm": 0.42192359918134575, + "learning_rate": 4.7895430981182415e-06, + "loss": 0.0708, + "step": 264 + }, + { + "epoch": 0.53, + "grad_norm": 0.5967015598287172, + "learning_rate": 4.757188889427761e-06, + "loss": 0.1091, + "step": 265 + }, + { + "epoch": 0.53, + "grad_norm": 0.8761618680748017, + "learning_rate": 4.724844868605176e-06, + "loss": 0.0922, + "step": 266 + }, + { + "epoch": 0.53, + "grad_norm": 0.5960199337090709, + "learning_rate": 4.6925123927408265e-06, + "loss": 0.119, + "step": 267 + }, + { + "epoch": 0.54, + "grad_norm": 0.8037891258649469, + "learning_rate": 4.660192818440642e-06, + "loss": 0.1221, + "step": 268 + }, + { + "epoch": 0.54, + "grad_norm": 0.5556173800063137, + "learning_rate": 4.627887501769231e-06, + "loss": 0.1177, + "step": 269 + }, + { + "epoch": 0.54, + "grad_norm": 0.9049550817206606, + "learning_rate": 4.59559779819298e-06, + "loss": 0.1591, + "step": 270 + }, + { + "epoch": 0.54, + "grad_norm": 1.1702651051641604, + "learning_rate": 4.5633250625231806e-06, + "loss": 0.182, + "step": 271 + }, + { + "epoch": 0.54, + "grad_norm": 0.8483471329393831, + "learning_rate": 4.531070648859186e-06, + "loss": 0.1109, + "step": 272 + }, + { + "epoch": 0.55, + "grad_norm": 0.32161415640173774, + "learning_rate": 4.498835910531595e-06, + "loss": 0.0721, + "step": 273 + }, + { + "epoch": 0.55, + "grad_norm": 0.43434061341999136, + "learning_rate": 4.4666222000454685e-06, + "loss": 0.1053, + "step": 274 + }, + { + "epoch": 0.55, + "grad_norm": 0.8637865606289452, + "learning_rate": 4.434430869023579e-06, + "loss": 0.0861, + "step": 275 + }, + { + "epoch": 0.55, + "grad_norm": 0.628646931876877, + "learning_rate": 4.402263268149707e-06, + "loss": 0.1135, + "step": 276 + }, + { + "epoch": 0.55, + "grad_norm": 0.34970147815123154, + "learning_rate": 4.370120747111956e-06, + "loss": 0.0723, + "step": 277 + }, + { + "epoch": 0.56, + "grad_norm": 0.4765585692957405, + "learning_rate": 4.338004654546136e-06, + "loss": 0.1145, + "step": 278 + }, + { + "epoch": 0.56, + "grad_norm": 0.7803990928349243, + "learning_rate": 4.3059163379791676e-06, + "loss": 0.1156, + "step": 279 + }, + { + "epoch": 0.56, + "grad_norm": 1.7671333748360498, + "learning_rate": 4.27385714377255e-06, + "loss": 0.1053, + "step": 280 + }, + { + "epoch": 0.56, + "grad_norm": 0.7121038442085383, + "learning_rate": 4.24182841706586e-06, + "loss": 0.1289, + "step": 281 + }, + { + "epoch": 0.56, + "grad_norm": 0.8800209303666127, + "learning_rate": 4.209831501720328e-06, + "loss": 0.1778, + "step": 282 + }, + { + "epoch": 0.57, + "grad_norm": 0.665862686003406, + "learning_rate": 4.177867740262437e-06, + "loss": 0.1501, + "step": 283 + }, + { + "epoch": 0.57, + "grad_norm": 1.121777185384995, + "learning_rate": 4.145938473827598e-06, + "loss": 0.1149, + "step": 284 + }, + { + "epoch": 0.57, + "grad_norm": 0.40686853519037686, + "learning_rate": 4.1140450421038865e-06, + "loss": 0.085, + "step": 285 + }, + { + "epoch": 0.57, + "grad_norm": 0.4216094226014865, + "learning_rate": 4.08218878327582e-06, + "loss": 0.1087, + "step": 286 + }, + { + "epoch": 0.57, + "grad_norm": 0.8226728277647426, + "learning_rate": 4.050371033968216e-06, + "loss": 0.17, + "step": 287 + }, + { + "epoch": 0.58, + "grad_norm": 0.48187584984948256, + "learning_rate": 4.018593129190113e-06, + "loss": 0.0978, + "step": 288 + }, + { + "epoch": 0.58, + "grad_norm": 0.4526743619684988, + "learning_rate": 3.98685640227875e-06, + "loss": 0.0858, + "step": 289 + }, + { + "epoch": 0.58, + "grad_norm": 1.2046001007142648, + "learning_rate": 3.955162184843625e-06, + "loss": 0.1467, + "step": 290 + }, + { + "epoch": 0.58, + "grad_norm": 0.5177528518475938, + "learning_rate": 3.9235118067106255e-06, + "loss": 0.0904, + "step": 291 + }, + { + "epoch": 0.58, + "grad_norm": 0.5460427300809364, + "learning_rate": 3.89190659586623e-06, + "loss": 0.0746, + "step": 292 + }, + { + "epoch": 0.59, + "grad_norm": 0.5241932527164748, + "learning_rate": 3.8603478784017845e-06, + "loss": 0.1297, + "step": 293 + }, + { + "epoch": 0.59, + "grad_norm": 0.39990316884451654, + "learning_rate": 3.828836978457868e-06, + "loss": 0.0857, + "step": 294 + }, + { + "epoch": 0.59, + "grad_norm": 0.5332681583136303, + "learning_rate": 3.7973752181687336e-06, + "loss": 0.1067, + "step": 295 + }, + { + "epoch": 0.59, + "grad_norm": 0.7158724785835472, + "learning_rate": 3.7659639176068287e-06, + "loss": 0.1198, + "step": 296 + }, + { + "epoch": 0.59, + "grad_norm": 0.7926331004862835, + "learning_rate": 3.734604394727419e-06, + "loss": 0.1302, + "step": 297 + }, + { + "epoch": 0.6, + "grad_norm": 0.4726616087570671, + "learning_rate": 3.703297965313275e-06, + "loss": 0.0943, + "step": 298 + }, + { + "epoch": 0.6, + "grad_norm": 0.3365001206743048, + "learning_rate": 3.6720459429194743e-06, + "loss": 0.0761, + "step": 299 + }, + { + "epoch": 0.6, + "grad_norm": 0.5041980032450573, + "learning_rate": 3.6408496388182857e-06, + "loss": 0.1348, + "step": 300 + }, + { + "epoch": 0.6, + "grad_norm": 0.3980953006694637, + "learning_rate": 3.6097103619441505e-06, + "loss": 0.0745, + "step": 301 + }, + { + "epoch": 0.6, + "grad_norm": 0.38002146091571526, + "learning_rate": 3.578629418838757e-06, + "loss": 0.0705, + "step": 302 + }, + { + "epoch": 0.61, + "grad_norm": 0.6150028421766434, + "learning_rate": 3.5476081135962335e-06, + "loss": 0.1014, + "step": 303 + }, + { + "epoch": 0.61, + "grad_norm": 0.47726586681317523, + "learning_rate": 3.516647747808417e-06, + "loss": 0.0873, + "step": 304 + }, + { + "epoch": 0.61, + "grad_norm": 0.7303160554132148, + "learning_rate": 3.4857496205102475e-06, + "loss": 0.1218, + "step": 305 + }, + { + "epoch": 0.61, + "grad_norm": 0.5063263339961966, + "learning_rate": 3.4549150281252635e-06, + "loss": 0.1003, + "step": 306 + }, + { + "epoch": 0.61, + "grad_norm": 0.5173752740699328, + "learning_rate": 3.4241452644112085e-06, + "loss": 0.1441, + "step": 307 + }, + { + "epoch": 0.62, + "grad_norm": 0.578168860033146, + "learning_rate": 3.3934416204057396e-06, + "loss": 0.1384, + "step": 308 + }, + { + "epoch": 0.62, + "grad_norm": 0.256424858150544, + "learning_rate": 3.3628053843722674e-06, + "loss": 0.0444, + "step": 309 + }, + { + "epoch": 0.62, + "grad_norm": 0.4025835046318359, + "learning_rate": 3.3322378417458985e-06, + "loss": 0.0658, + "step": 310 + }, + { + "epoch": 0.62, + "grad_norm": 0.6353664899766872, + "learning_rate": 3.3017402750794976e-06, + "loss": 0.1037, + "step": 311 + }, + { + "epoch": 0.62, + "grad_norm": 1.065012599456067, + "learning_rate": 3.271313963989886e-06, + "loss": 0.1453, + "step": 312 + }, + { + "epoch": 0.63, + "grad_norm": 0.8933738678604561, + "learning_rate": 3.240960185104137e-06, + "loss": 0.1155, + "step": 313 + }, + { + "epoch": 0.63, + "grad_norm": 0.6295023999749918, + "learning_rate": 3.2106802120060197e-06, + "loss": 0.0817, + "step": 314 + }, + { + "epoch": 0.63, + "grad_norm": 0.6825569076251282, + "learning_rate": 3.180475315182563e-06, + "loss": 0.1074, + "step": 315 + }, + { + "epoch": 0.63, + "grad_norm": 1.003002066232109, + "learning_rate": 3.1503467619707407e-06, + "loss": 0.1333, + "step": 316 + }, + { + "epoch": 0.63, + "grad_norm": 0.5539936219856008, + "learning_rate": 3.1202958165043053e-06, + "loss": 0.0892, + "step": 317 + }, + { + "epoch": 0.64, + "grad_norm": 0.5499033259681437, + "learning_rate": 3.090323739660742e-06, + "loss": 0.1171, + "step": 318 + }, + { + "epoch": 0.64, + "grad_norm": 0.7295234710326629, + "learning_rate": 3.060431789008368e-06, + "loss": 0.1339, + "step": 319 + }, + { + "epoch": 0.64, + "grad_norm": 0.7039705596052677, + "learning_rate": 3.0306212187535653e-06, + "loss": 0.1735, + "step": 320 + }, + { + "epoch": 0.64, + "grad_norm": 0.7237951210549803, + "learning_rate": 3.000893279688155e-06, + "loss": 0.1272, + "step": 321 + }, + { + "epoch": 0.64, + "grad_norm": 0.5977750840343024, + "learning_rate": 2.9712492191369245e-06, + "loss": 0.1161, + "step": 322 + }, + { + "epoch": 0.65, + "grad_norm": 0.9137575856242299, + "learning_rate": 2.9416902809052817e-06, + "loss": 0.1631, + "step": 323 + }, + { + "epoch": 0.65, + "grad_norm": 0.5604895386564465, + "learning_rate": 2.912217705227075e-06, + "loss": 0.0935, + "step": 324 + }, + { + "epoch": 0.65, + "grad_norm": 2.312594001816466, + "learning_rate": 2.882832728712551e-06, + "loss": 0.2357, + "step": 325 + }, + { + "epoch": 0.65, + "grad_norm": 0.8125476078354773, + "learning_rate": 2.8535365842964713e-06, + "loss": 0.0856, + "step": 326 + }, + { + "epoch": 0.65, + "grad_norm": 0.6050490227647147, + "learning_rate": 2.8243305011863843e-06, + "loss": 0.0858, + "step": 327 + }, + { + "epoch": 0.66, + "grad_norm": 0.5686503856565632, + "learning_rate": 2.7952157048110406e-06, + "loss": 0.0786, + "step": 328 + }, + { + "epoch": 0.66, + "grad_norm": 0.3341595489320209, + "learning_rate": 2.7661934167689887e-06, + "loss": 0.0765, + "step": 329 + }, + { + "epoch": 0.66, + "grad_norm": 0.48523257788152796, + "learning_rate": 2.7372648547773063e-06, + "loss": 0.0818, + "step": 330 + }, + { + "epoch": 0.66, + "grad_norm": 0.4353651802388437, + "learning_rate": 2.7084312326205164e-06, + "loss": 0.1027, + "step": 331 + }, + { + "epoch": 0.66, + "grad_norm": 0.5804653250200166, + "learning_rate": 2.6796937600996587e-06, + "loss": 0.1287, + "step": 332 + }, + { + "epoch": 0.67, + "grad_norm": 0.7258700154480898, + "learning_rate": 2.6510536429815224e-06, + "loss": 0.1477, + "step": 333 + }, + { + "epoch": 0.67, + "grad_norm": 0.5569984089795129, + "learning_rate": 2.622512082948063e-06, + "loss": 0.1006, + "step": 334 + }, + { + "epoch": 0.67, + "grad_norm": 0.5618985392420127, + "learning_rate": 2.594070277545975e-06, + "loss": 0.1022, + "step": 335 + }, + { + "epoch": 0.67, + "grad_norm": 0.4057866106940638, + "learning_rate": 2.5657294201364526e-06, + "loss": 0.1026, + "step": 336 + }, + { + "epoch": 0.67, + "grad_norm": 0.3949073034443962, + "learning_rate": 2.5374906998451094e-06, + "loss": 0.0701, + "step": 337 + }, + { + "epoch": 0.68, + "grad_norm": 0.5821479003733443, + "learning_rate": 2.5093553015120937e-06, + "loss": 0.1141, + "step": 338 + }, + { + "epoch": 0.68, + "grad_norm": 0.37657783683758206, + "learning_rate": 2.4813244056423692e-06, + "loss": 0.0807, + "step": 339 + }, + { + "epoch": 0.68, + "grad_norm": 0.8996794097563738, + "learning_rate": 2.4533991883561868e-06, + "loss": 0.1392, + "step": 340 + }, + { + "epoch": 0.68, + "grad_norm": 0.7862777875907287, + "learning_rate": 2.425580821339733e-06, + "loss": 0.1427, + "step": 341 + }, + { + "epoch": 0.68, + "grad_norm": 0.4158720785594425, + "learning_rate": 2.3978704717959777e-06, + "loss": 0.057, + "step": 342 + }, + { + "epoch": 0.69, + "grad_norm": 0.445461113749062, + "learning_rate": 2.3702693023956853e-06, + "loss": 0.0982, + "step": 343 + }, + { + "epoch": 0.69, + "grad_norm": 0.3380111458364013, + "learning_rate": 2.342778471228648e-06, + "loss": 0.074, + "step": 344 + }, + { + "epoch": 0.69, + "grad_norm": 0.442065574751732, + "learning_rate": 2.315399131755081e-06, + "loss": 0.0836, + "step": 345 + }, + { + "epoch": 0.69, + "grad_norm": 0.3959460207104249, + "learning_rate": 2.2881324327572336e-06, + "loss": 0.0796, + "step": 346 + }, + { + "epoch": 0.69, + "grad_norm": 0.5811148185563871, + "learning_rate": 2.260979518291186e-06, + "loss": 0.1085, + "step": 347 + }, + { + "epoch": 0.7, + "grad_norm": 0.5786190629593744, + "learning_rate": 2.233941527638848e-06, + "loss": 0.1267, + "step": 348 + }, + { + "epoch": 0.7, + "grad_norm": 1.0171032000489182, + "learning_rate": 2.207019595260154e-06, + "loss": 0.1309, + "step": 349 + }, + { + "epoch": 0.7, + "grad_norm": 0.48160306249010754, + "learning_rate": 2.1802148507454675e-06, + "loss": 0.0878, + "step": 350 + }, + { + "epoch": 0.7, + "grad_norm": 0.3280483511586015, + "learning_rate": 2.1535284187681866e-06, + "loss": 0.0693, + "step": 351 + }, + { + "epoch": 0.7, + "grad_norm": 0.34096842916481573, + "learning_rate": 2.1269614190375477e-06, + "loss": 0.0499, + "step": 352 + }, + { + "epoch": 0.71, + "grad_norm": 0.47978813814877763, + "learning_rate": 2.1005149662516517e-06, + "loss": 0.0625, + "step": 353 + }, + { + "epoch": 0.71, + "grad_norm": 0.7425879124863387, + "learning_rate": 2.07419017005069e-06, + "loss": 0.1186, + "step": 354 + }, + { + "epoch": 0.71, + "grad_norm": 0.6228364584187716, + "learning_rate": 2.0479881349703885e-06, + "loss": 0.1052, + "step": 355 + }, + { + "epoch": 0.71, + "grad_norm": 0.4806977117902373, + "learning_rate": 2.021909960395661e-06, + "loss": 0.1054, + "step": 356 + }, + { + "epoch": 0.71, + "grad_norm": 0.609915927744076, + "learning_rate": 1.9959567405144825e-06, + "loss": 0.1285, + "step": 357 + }, + { + "epoch": 0.72, + "grad_norm": 0.3594194411476996, + "learning_rate": 1.9701295642719836e-06, + "loss": 0.0644, + "step": 358 + }, + { + "epoch": 0.72, + "grad_norm": 0.6102620773665237, + "learning_rate": 1.944429515324749e-06, + "loss": 0.0967, + "step": 359 + }, + { + "epoch": 0.72, + "grad_norm": 0.49370723009062395, + "learning_rate": 1.9188576719953635e-06, + "loss": 0.1382, + "step": 360 + }, + { + "epoch": 0.72, + "grad_norm": 0.9307415830568554, + "learning_rate": 1.8934151072271573e-06, + "loss": 0.1114, + "step": 361 + }, + { + "epoch": 0.72, + "grad_norm": 0.5873584645988698, + "learning_rate": 1.8681028885391905e-06, + "loss": 0.1111, + "step": 362 + }, + { + "epoch": 0.73, + "grad_norm": 0.5086410983198352, + "learning_rate": 1.8429220779814654e-06, + "loss": 0.0813, + "step": 363 + }, + { + "epoch": 0.73, + "grad_norm": 0.7494157766725494, + "learning_rate": 1.81787373209036e-06, + "loss": 0.143, + "step": 364 + }, + { + "epoch": 0.73, + "grad_norm": 0.59457257382502, + "learning_rate": 1.7929589018443016e-06, + "loss": 0.0995, + "step": 365 + }, + { + "epoch": 0.73, + "grad_norm": 0.7171481785607706, + "learning_rate": 1.7681786326196665e-06, + "loss": 0.135, + "step": 366 + }, + { + "epoch": 0.73, + "grad_norm": 0.6544054085500117, + "learning_rate": 1.743533964146924e-06, + "loss": 0.1084, + "step": 367 + }, + { + "epoch": 0.74, + "grad_norm": 0.9507053821842658, + "learning_rate": 1.7190259304670038e-06, + "loss": 0.1547, + "step": 368 + }, + { + "epoch": 0.74, + "grad_norm": 0.5798816104551191, + "learning_rate": 1.6946555598879138e-06, + "loss": 0.124, + "step": 369 + }, + { + "epoch": 0.74, + "grad_norm": 0.7567771885229603, + "learning_rate": 1.6704238749415958e-06, + "loss": 0.1481, + "step": 370 + }, + { + "epoch": 0.74, + "grad_norm": 1.3037738050212244, + "learning_rate": 1.6463318923410183e-06, + "loss": 0.1183, + "step": 371 + }, + { + "epoch": 0.74, + "grad_norm": 0.6640220162663745, + "learning_rate": 1.6223806229375182e-06, + "loss": 0.122, + "step": 372 + }, + { + "epoch": 0.75, + "grad_norm": 0.477033527760463, + "learning_rate": 1.5985710716783936e-06, + "loss": 0.0744, + "step": 373 + }, + { + "epoch": 0.75, + "grad_norm": 0.5646325992453681, + "learning_rate": 1.5749042375647261e-06, + "loss": 0.1229, + "step": 374 + }, + { + "epoch": 0.75, + "grad_norm": 0.497343124798125, + "learning_rate": 1.5513811136094786e-06, + "loss": 0.0914, + "step": 375 + }, + { + "epoch": 0.75, + "grad_norm": 0.4787217350165173, + "learning_rate": 1.5280026867958186e-06, + "loss": 0.1012, + "step": 376 + }, + { + "epoch": 0.75, + "grad_norm": 2.7864702413137152, + "learning_rate": 1.5047699380357134e-06, + "loss": 0.1432, + "step": 377 + }, + { + "epoch": 0.76, + "grad_norm": 0.3630059899374488, + "learning_rate": 1.4816838421287693e-06, + "loss": 0.0499, + "step": 378 + }, + { + "epoch": 0.76, + "grad_norm": 0.9135185471337351, + "learning_rate": 1.4587453677213348e-06, + "loss": 0.1565, + "step": 379 + }, + { + "epoch": 0.76, + "grad_norm": 0.4025025771883063, + "learning_rate": 1.4359554772658551e-06, + "loss": 0.0813, + "step": 380 + }, + { + "epoch": 0.76, + "grad_norm": 0.8944633586248318, + "learning_rate": 1.4133151269804873e-06, + "loss": 0.1194, + "step": 381 + }, + { + "epoch": 0.76, + "grad_norm": 0.4963838870293355, + "learning_rate": 1.39082526680899e-06, + "loss": 0.1533, + "step": 382 + }, + { + "epoch": 0.77, + "grad_norm": 0.6233327379521121, + "learning_rate": 1.368486840380851e-06, + "loss": 0.1167, + "step": 383 + }, + { + "epoch": 0.77, + "grad_norm": 0.3567685987225913, + "learning_rate": 1.3463007849717035e-06, + "loss": 0.0699, + "step": 384 + }, + { + "epoch": 0.77, + "grad_norm": 0.4907523234293947, + "learning_rate": 1.3242680314639995e-06, + "loss": 0.0638, + "step": 385 + }, + { + "epoch": 0.77, + "grad_norm": 0.8918356254984783, + "learning_rate": 1.3023895043079476e-06, + "loss": 0.1236, + "step": 386 + }, + { + "epoch": 0.77, + "grad_norm": 0.4932187447287198, + "learning_rate": 1.2806661214827286e-06, + "loss": 0.0744, + "step": 387 + }, + { + "epoch": 0.78, + "grad_norm": 0.7425082873706774, + "learning_rate": 1.2590987944579808e-06, + "loss": 0.1324, + "step": 388 + }, + { + "epoch": 0.78, + "grad_norm": 0.773838140527864, + "learning_rate": 1.2376884281555485e-06, + "loss": 0.133, + "step": 389 + }, + { + "epoch": 0.78, + "grad_norm": 0.5173817236431654, + "learning_rate": 1.2164359209115235e-06, + "loss": 0.0962, + "step": 390 + }, + { + "epoch": 0.78, + "grad_norm": 0.5641305515634265, + "learning_rate": 1.1953421644385444e-06, + "loss": 0.1363, + "step": 391 + }, + { + "epoch": 0.78, + "grad_norm": 0.9415313279746156, + "learning_rate": 1.1744080437883859e-06, + "loss": 0.1398, + "step": 392 + }, + { + "epoch": 0.79, + "grad_norm": 0.506483121069416, + "learning_rate": 1.1536344373148245e-06, + "loss": 0.0848, + "step": 393 + }, + { + "epoch": 0.79, + "grad_norm": 0.9255572933453069, + "learning_rate": 1.133022216636781e-06, + "loss": 0.1123, + "step": 394 + }, + { + "epoch": 0.79, + "grad_norm": 0.6029862576440691, + "learning_rate": 1.1125722466017547e-06, + "loss": 0.1182, + "step": 395 + }, + { + "epoch": 0.79, + "grad_norm": 0.6376670932752512, + "learning_rate": 1.092285385249528e-06, + "loss": 0.1217, + "step": 396 + }, + { + "epoch": 0.79, + "grad_norm": 0.6945037548136007, + "learning_rate": 1.0721624837761768e-06, + "loss": 0.1146, + "step": 397 + }, + { + "epoch": 0.8, + "grad_norm": 0.3933583908849212, + "learning_rate": 1.0522043864983428e-06, + "loss": 0.0851, + "step": 398 + }, + { + "epoch": 0.8, + "grad_norm": 0.8528313429730696, + "learning_rate": 1.0324119308178166e-06, + "loss": 0.1346, + "step": 399 + }, + { + "epoch": 0.8, + "grad_norm": 0.7032753581261906, + "learning_rate": 1.012785947186397e-06, + "loss": 0.1213, + "step": 400 + }, + { + "epoch": 0.8, + "grad_norm": 0.700851792321212, + "learning_rate": 9.933272590710508e-07, + "loss": 0.0983, + "step": 401 + }, + { + "epoch": 0.8, + "grad_norm": 0.6810211763792263, + "learning_rate": 9.740366829193587e-07, + "loss": 0.1227, + "step": 402 + }, + { + "epoch": 0.81, + "grad_norm": 0.7204524580947844, + "learning_rate": 9.549150281252633e-07, + "loss": 0.0993, + "step": 403 + }, + { + "epoch": 0.81, + "grad_norm": 1.0497265670010762, + "learning_rate": 9.359630969951012e-07, + "loss": 0.0656, + "step": 404 + }, + { + "epoch": 0.81, + "grad_norm": 0.4493415113338161, + "learning_rate": 9.171816847139447e-07, + "loss": 0.1023, + "step": 405 + }, + { + "epoch": 0.81, + "grad_norm": 0.6197686610517851, + "learning_rate": 8.985715793122407e-07, + "loss": 0.0944, + "step": 406 + }, + { + "epoch": 0.81, + "grad_norm": 0.6454561325542718, + "learning_rate": 8.801335616327378e-07, + "loss": 0.1135, + "step": 407 + }, + { + "epoch": 0.82, + "grad_norm": 0.8188263044708098, + "learning_rate": 8.618684052977305e-07, + "loss": 0.1069, + "step": 408 + }, + { + "epoch": 0.82, + "grad_norm": 0.3682638853893349, + "learning_rate": 8.437768766765975e-07, + "loss": 0.091, + "step": 409 + }, + { + "epoch": 0.82, + "grad_norm": 0.6929729652828701, + "learning_rate": 8.258597348536452e-07, + "loss": 0.1138, + "step": 410 + }, + { + "epoch": 0.82, + "grad_norm": 0.7147551524391744, + "learning_rate": 8.081177315962601e-07, + "loss": 0.1293, + "step": 411 + }, + { + "epoch": 0.82, + "grad_norm": 0.7024450782828899, + "learning_rate": 7.905516113233652e-07, + "loss": 0.1297, + "step": 412 + }, + { + "epoch": 0.83, + "grad_norm": 0.30317714532691387, + "learning_rate": 7.731621110741871e-07, + "loss": 0.075, + "step": 413 + }, + { + "epoch": 0.83, + "grad_norm": 0.3697260307910134, + "learning_rate": 7.55949960477328e-07, + "loss": 0.096, + "step": 414 + }, + { + "epoch": 0.83, + "grad_norm": 0.4428959709714789, + "learning_rate": 7.389158817201541e-07, + "loss": 0.0748, + "step": 415 + }, + { + "epoch": 0.83, + "grad_norm": 0.8004523089375409, + "learning_rate": 7.220605895184946e-07, + "loss": 0.0782, + "step": 416 + }, + { + "epoch": 0.83, + "grad_norm": 1.1444774950837089, + "learning_rate": 7.053847910866513e-07, + "loss": 0.1297, + "step": 417 + }, + { + "epoch": 0.84, + "grad_norm": 0.7898964558677061, + "learning_rate": 6.888891861077301e-07, + "loss": 0.095, + "step": 418 + }, + { + "epoch": 0.84, + "grad_norm": 2.033195499917111, + "learning_rate": 6.725744667042778e-07, + "loss": 0.1317, + "step": 419 + }, + { + "epoch": 0.84, + "grad_norm": 0.47792986539711907, + "learning_rate": 6.564413174092443e-07, + "loss": 0.1169, + "step": 420 + }, + { + "epoch": 0.84, + "grad_norm": 0.6550032710177911, + "learning_rate": 6.404904151372649e-07, + "loss": 0.0896, + "step": 421 + }, + { + "epoch": 0.84, + "grad_norm": 0.8084734832162704, + "learning_rate": 6.24722429156251e-07, + "loss": 0.0998, + "step": 422 + }, + { + "epoch": 0.85, + "grad_norm": 0.5357925216101086, + "learning_rate": 6.091380210593145e-07, + "loss": 0.0814, + "step": 423 + }, + { + "epoch": 0.85, + "grad_norm": 0.4881261803935285, + "learning_rate": 5.937378447370068e-07, + "loss": 0.0929, + "step": 424 + }, + { + "epoch": 0.85, + "grad_norm": 0.6924211795087191, + "learning_rate": 5.785225463498828e-07, + "loss": 0.0986, + "step": 425 + }, + { + "epoch": 0.85, + "grad_norm": 0.7707349079438567, + "learning_rate": 5.634927643013899e-07, + "loss": 0.1393, + "step": 426 + }, + { + "epoch": 0.85, + "grad_norm": 2.5765244494687627, + "learning_rate": 5.486491292110796e-07, + "loss": 0.097, + "step": 427 + }, + { + "epoch": 0.86, + "grad_norm": 0.4969093516786432, + "learning_rate": 5.339922638881545e-07, + "loss": 0.0848, + "step": 428 + }, + { + "epoch": 0.86, + "grad_norm": 0.9387402710009252, + "learning_rate": 5.195227833053273e-07, + "loss": 0.126, + "step": 429 + }, + { + "epoch": 0.86, + "grad_norm": 0.8802367999231513, + "learning_rate": 5.05241294573024e-07, + "loss": 0.1259, + "step": 430 + }, + { + "epoch": 0.86, + "grad_norm": 0.46639751531285945, + "learning_rate": 4.911483969139086e-07, + "loss": 0.1235, + "step": 431 + }, + { + "epoch": 0.86, + "grad_norm": 0.872214566697276, + "learning_rate": 4.772446816377408e-07, + "loss": 0.1188, + "step": 432 + }, + { + "epoch": 0.87, + "grad_norm": 0.5081856819309839, + "learning_rate": 4.6353073211656886e-07, + "loss": 0.0933, + "step": 433 + }, + { + "epoch": 0.87, + "grad_norm": 0.6697265799715091, + "learning_rate": 4.5000712376024826e-07, + "loss": 0.1028, + "step": 434 + }, + { + "epoch": 0.87, + "grad_norm": 0.5417303256208303, + "learning_rate": 4.3667442399229985e-07, + "loss": 0.0853, + "step": 435 + }, + { + "epoch": 0.87, + "grad_norm": 0.4919464736847065, + "learning_rate": 4.2353319222610265e-07, + "loss": 0.1185, + "step": 436 + }, + { + "epoch": 0.87, + "grad_norm": 0.7312868401103222, + "learning_rate": 4.1058397984142405e-07, + "loss": 0.1091, + "step": 437 + }, + { + "epoch": 0.88, + "grad_norm": 0.8929518208862638, + "learning_rate": 3.9782733016128006e-07, + "loss": 0.1014, + "step": 438 + }, + { + "epoch": 0.88, + "grad_norm": 0.5064559227512172, + "learning_rate": 3.852637784291424e-07, + "loss": 0.0812, + "step": 439 + }, + { + "epoch": 0.88, + "grad_norm": 0.7003756083181368, + "learning_rate": 3.728938517864794e-07, + "loss": 0.0914, + "step": 440 + }, + { + "epoch": 0.88, + "grad_norm": 0.9754357254280254, + "learning_rate": 3.60718069250639e-07, + "loss": 0.0963, + "step": 441 + }, + { + "epoch": 0.88, + "grad_norm": 1.4166659852770702, + "learning_rate": 3.4873694169306915e-07, + "loss": 0.0752, + "step": 442 + }, + { + "epoch": 0.89, + "grad_norm": 0.5994165255977032, + "learning_rate": 3.369509718178887e-07, + "loss": 0.0914, + "step": 443 + }, + { + "epoch": 0.89, + "grad_norm": 0.5532696609253416, + "learning_rate": 3.2536065414078724e-07, + "loss": 0.1189, + "step": 444 + }, + { + "epoch": 0.89, + "grad_norm": 0.4898561361634835, + "learning_rate": 3.1396647496828245e-07, + "loss": 0.1086, + "step": 445 + }, + { + "epoch": 0.89, + "grad_norm": 0.5204425966121, + "learning_rate": 3.0276891237731085e-07, + "loss": 0.091, + "step": 446 + }, + { + "epoch": 0.89, + "grad_norm": 0.6907546342225368, + "learning_rate": 2.917684361951728e-07, + "loss": 0.1217, + "step": 447 + }, + { + "epoch": 0.9, + "grad_norm": 0.4176504020041913, + "learning_rate": 2.809655079798179e-07, + "loss": 0.0508, + "step": 448 + }, + { + "epoch": 0.9, + "grad_norm": 0.43581953108596455, + "learning_rate": 2.7036058100047723e-07, + "loss": 0.0787, + "step": 449 + }, + { + "epoch": 0.9, + "grad_norm": 0.8249172088818233, + "learning_rate": 2.599541002186479e-07, + "loss": 0.1025, + "step": 450 + }, + { + "epoch": 0.9, + "grad_norm": 0.7051733307894276, + "learning_rate": 2.497465022694207e-07, + "loss": 0.1129, + "step": 451 + }, + { + "epoch": 0.9, + "grad_norm": 0.3685818395664113, + "learning_rate": 2.397382154431621e-07, + "loss": 0.0623, + "step": 452 + }, + { + "epoch": 0.91, + "grad_norm": 0.7701828038636435, + "learning_rate": 2.2992965966754378e-07, + "loss": 0.1578, + "step": 453 + }, + { + "epoch": 0.91, + "grad_norm": 0.47951540454898006, + "learning_rate": 2.2032124648992015e-07, + "loss": 0.1065, + "step": 454 + }, + { + "epoch": 0.91, + "grad_norm": 0.967219792124932, + "learning_rate": 2.109133790600648e-07, + "loss": 0.1079, + "step": 455 + }, + { + "epoch": 0.91, + "grad_norm": 0.7740093949626732, + "learning_rate": 2.0170645211325335e-07, + "loss": 0.1417, + "step": 456 + }, + { + "epoch": 0.91, + "grad_norm": 0.5131423955626542, + "learning_rate": 1.9270085195370048e-07, + "loss": 0.1095, + "step": 457 + }, + { + "epoch": 0.92, + "grad_norm": 0.4475424388277173, + "learning_rate": 1.838969564383525e-07, + "loss": 0.1004, + "step": 458 + }, + { + "epoch": 0.92, + "grad_norm": 0.7253496435886067, + "learning_rate": 1.7529513496103322e-07, + "loss": 0.1099, + "step": 459 + }, + { + "epoch": 0.92, + "grad_norm": 0.6110013706554089, + "learning_rate": 1.6689574843694433e-07, + "loss": 0.085, + "step": 460 + }, + { + "epoch": 0.92, + "grad_norm": 0.5522128545402384, + "learning_rate": 1.5869914928752117e-07, + "loss": 0.1208, + "step": 461 + }, + { + "epoch": 0.92, + "grad_norm": 0.7719280457464354, + "learning_rate": 1.5070568142564912e-07, + "loss": 0.1264, + "step": 462 + }, + { + "epoch": 0.93, + "grad_norm": 1.1812476336742652, + "learning_rate": 1.4291568024122848e-07, + "loss": 0.186, + "step": 463 + }, + { + "epoch": 0.93, + "grad_norm": 0.43320873919423475, + "learning_rate": 1.3532947258710905e-07, + "loss": 0.0833, + "step": 464 + }, + { + "epoch": 0.93, + "grad_norm": 0.8054906254618007, + "learning_rate": 1.2794737676536993e-07, + "loss": 0.146, + "step": 465 + }, + { + "epoch": 0.93, + "grad_norm": 0.50148856743542, + "learning_rate": 1.2076970251396593e-07, + "loss": 0.0987, + "step": 466 + }, + { + "epoch": 0.93, + "grad_norm": 0.47531633403256274, + "learning_rate": 1.1379675099373489e-07, + "loss": 0.0853, + "step": 467 + }, + { + "epoch": 0.94, + "grad_norm": 1.9020478089825394, + "learning_rate": 1.0702881477575589e-07, + "loss": 0.0893, + "step": 468 + }, + { + "epoch": 0.94, + "grad_norm": 0.3989134945661557, + "learning_rate": 1.004661778290783e-07, + "loss": 0.0847, + "step": 469 + }, + { + "epoch": 0.94, + "grad_norm": 0.5858494168173927, + "learning_rate": 9.410911550880474e-08, + "loss": 0.0947, + "step": 470 + }, + { + "epoch": 0.94, + "grad_norm": 1.0751517292115917, + "learning_rate": 8.795789454453862e-08, + "loss": 0.1158, + "step": 471 + }, + { + "epoch": 0.94, + "grad_norm": 0.7065663950146411, + "learning_rate": 8.201277302919086e-08, + "loss": 0.0955, + "step": 472 + }, + { + "epoch": 0.95, + "grad_norm": 0.5260547199750213, + "learning_rate": 7.627400040815414e-08, + "loss": 0.0816, + "step": 473 + }, + { + "epoch": 0.95, + "grad_norm": 2.1865371271179677, + "learning_rate": 7.074181746883402e-08, + "loss": 0.0977, + "step": 474 + }, + { + "epoch": 0.95, + "grad_norm": 0.728326472998703, + "learning_rate": 6.54164563305465e-08, + "loss": 0.1569, + "step": 475 + }, + { + "epoch": 0.95, + "grad_norm": 0.5759374898440636, + "learning_rate": 6.029814043478022e-08, + "loss": 0.1095, + "step": 476 + }, + { + "epoch": 0.95, + "grad_norm": 0.5159800707502684, + "learning_rate": 5.538708453581787e-08, + "loss": 0.1186, + "step": 477 + }, + { + "epoch": 0.96, + "grad_norm": 0.5912916700717128, + "learning_rate": 5.068349469173006e-08, + "loss": 0.1295, + "step": 478 + }, + { + "epoch": 0.96, + "grad_norm": 0.6628394938861257, + "learning_rate": 4.618756825572612e-08, + "loss": 0.138, + "step": 479 + }, + { + "epoch": 0.96, + "grad_norm": 0.6669753700150592, + "learning_rate": 4.189949386787462e-08, + "loss": 0.1209, + "step": 480 + }, + { + "epoch": 0.96, + "grad_norm": 0.7521903097769528, + "learning_rate": 3.781945144718912e-08, + "loss": 0.1393, + "step": 481 + }, + { + "epoch": 0.96, + "grad_norm": 0.8721516844944895, + "learning_rate": 3.394761218407705e-08, + "loss": 0.1097, + "step": 482 + }, + { + "epoch": 0.97, + "grad_norm": 0.5187003748899651, + "learning_rate": 3.0284138533160924e-08, + "loss": 0.0901, + "step": 483 + }, + { + "epoch": 0.97, + "grad_norm": 0.6504180266917741, + "learning_rate": 2.6829184206457194e-08, + "loss": 0.1314, + "step": 484 + }, + { + "epoch": 0.97, + "grad_norm": 0.5560843324253453, + "learning_rate": 2.358289416693027e-08, + "loss": 0.0801, + "step": 485 + }, + { + "epoch": 0.97, + "grad_norm": 0.5857924563014624, + "learning_rate": 2.0545404622407396e-08, + "loss": 0.129, + "step": 486 + }, + { + "epoch": 0.97, + "grad_norm": 0.7418705435308932, + "learning_rate": 1.7716843019867646e-08, + "loss": 0.0845, + "step": 487 + }, + { + "epoch": 0.98, + "grad_norm": 0.6368611012787012, + "learning_rate": 1.509732804009012e-08, + "loss": 0.1266, + "step": 488 + }, + { + "epoch": 0.98, + "grad_norm": 0.7396925851613066, + "learning_rate": 1.268696959267679e-08, + "loss": 0.1313, + "step": 489 + }, + { + "epoch": 0.98, + "grad_norm": 0.5753378740087396, + "learning_rate": 1.0485868811441757e-08, + "loss": 0.0977, + "step": 490 + }, + { + "epoch": 0.98, + "grad_norm": 0.511791927295535, + "learning_rate": 8.494118050164646e-09, + "loss": 0.118, + "step": 491 + }, + { + "epoch": 0.98, + "grad_norm": 0.4569556080132059, + "learning_rate": 6.711800878718144e-09, + "loss": 0.0674, + "step": 492 + }, + { + "epoch": 0.99, + "grad_norm": 0.57891326152388, + "learning_rate": 5.138992079561367e-09, + "loss": 0.1116, + "step": 493 + }, + { + "epoch": 0.99, + "grad_norm": 0.8006661841027647, + "learning_rate": 3.775757644601808e-09, + "loss": 0.1464, + "step": 494 + }, + { + "epoch": 0.99, + "grad_norm": 0.5278677382991025, + "learning_rate": 2.6221547724253337e-09, + "loss": 0.0883, + "step": 495 + }, + { + "epoch": 0.99, + "grad_norm": 0.43730996329037447, + "learning_rate": 1.6782318658992159e-09, + "loss": 0.0847, + "step": 496 + }, + { + "epoch": 0.99, + "grad_norm": 0.42980308851818955, + "learning_rate": 9.440285301370865e-10, + "loss": 0.0876, + "step": 497 + }, + { + "epoch": 1.0, + "grad_norm": 0.5362614035022275, + "learning_rate": 4.1957557084082447e-10, + "loss": 0.0951, + "step": 498 + }, + { + "epoch": 1.0, + "grad_norm": 0.420427750356471, + "learning_rate": 1.0489499300603279e-10, + "loss": 0.0816, + "step": 499 + }, + { + "epoch": 1.0, + "grad_norm": 0.724803929021113, + "learning_rate": 0.0, + "loss": 0.1418, + "step": 500 + }, + { + "epoch": 1.0, + "step": 500, + "total_flos": 19125432246272.0, + "train_loss": 0.13232657791674138, + "train_runtime": 1203.9784, + "train_samples_per_second": 0.831, + "train_steps_per_second": 0.415 + } + ], + "logging_steps": 1.0, + "max_steps": 500, + "num_input_tokens_seen": 0, + "num_train_epochs": 1, + "save_steps": 50000, + "total_flos": 19125432246272.0, + "train_batch_size": 1, + "trial_name": null, + "trial_params": null +} diff --git a/llavaqwen2-google_siglip-so400m-patch14-384-Qwen_Qwen2-7B-Instruct-AG_v5_3_split01_all_mm_tune_olora256_512_llm/README.md b/llavaqwen2-google_siglip-so400m-patch14-384-Qwen_Qwen2-7B-Instruct-AG_v5_3_split01_all_mm_tune_olora256_512_llm/README.md new file mode 100644 index 0000000000000000000000000000000000000000..d4576fe074287232d3836bf69c21d3f2593290d9 --- /dev/null +++ b/llavaqwen2-google_siglip-so400m-patch14-384-Qwen_Qwen2-7B-Instruct-AG_v5_3_split01_all_mm_tune_olora256_512_llm/README.md @@ -0,0 +1,9 @@ +--- +library_name: peft +--- +## Training procedure + +### Framework versions + + +- PEFT 0.4.0 diff --git a/llavaqwen2-google_siglip-so400m-patch14-384-Qwen_Qwen2-7B-Instruct-AG_v5_3_split01_all_mm_tune_olora256_512_llm/adapter_config.json b/llavaqwen2-google_siglip-so400m-patch14-384-Qwen_Qwen2-7B-Instruct-AG_v5_3_split01_all_mm_tune_olora256_512_llm/adapter_config.json new file mode 100644 index 0000000000000000000000000000000000000000..0ff40233217d5959c9549b350d75ed1d93daffb1 --- /dev/null +++ b/llavaqwen2-google_siglip-so400m-patch14-384-Qwen_Qwen2-7B-Instruct-AG_v5_3_split01_all_mm_tune_olora256_512_llm/adapter_config.json @@ -0,0 +1,26 @@ +{ + "auto_mapping": null, + "base_model_name_or_path": "lmms-lab/LLaVA-Video-7B-Qwen2", + "bias": "none", + "fan_in_fan_out": false, + "inference_mode": true, + "init_lora_weights": "olora", + "layers_pattern": null, + "layers_to_transform": null, + "lora_alpha": 512, + "lora_dropout": 0.05, + "modules_to_save": null, + "peft_type": "LORA", + "r": 256, + "revision": null, + "target_modules": [ + "up_proj", + "o_proj", + "down_proj", + "gate_proj", + "v_proj", + "q_proj", + "k_proj" + ], + "task_type": "CAUSAL_LM" +} \ No newline at end of file diff --git a/llavaqwen2-google_siglip-so400m-patch14-384-Qwen_Qwen2-7B-Instruct-AG_v5_3_split01_all_mm_tune_olora256_512_llm/adapter_model.bin b/llavaqwen2-google_siglip-so400m-patch14-384-Qwen_Qwen2-7B-Instruct-AG_v5_3_split01_all_mm_tune_olora256_512_llm/adapter_model.bin new file mode 100644 index 0000000000000000000000000000000000000000..3c06f029efd6c3c56a60377177ad0ac54de45e91 --- /dev/null +++ b/llavaqwen2-google_siglip-so400m-patch14-384-Qwen_Qwen2-7B-Instruct-AG_v5_3_split01_all_mm_tune_olora256_512_llm/adapter_model.bin @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:c710d9a1b1f16680adbbb33421b57d61d4baefbf94c194ec80720c97d9bf489b +size 1384057050 diff --git a/llavaqwen2-google_siglip-so400m-patch14-384-Qwen_Qwen2-7B-Instruct-AG_v5_3_split01_all_mm_tune_olora256_512_llm/config.json b/llavaqwen2-google_siglip-so400m-patch14-384-Qwen_Qwen2-7B-Instruct-AG_v5_3_split01_all_mm_tune_olora256_512_llm/config.json new file mode 100644 index 0000000000000000000000000000000000000000..1a67f02d0063c3de7740207b9ab2a3eb7be1cbe3 --- /dev/null +++ b/llavaqwen2-google_siglip-so400m-patch14-384-Qwen_Qwen2-7B-Instruct-AG_v5_3_split01_all_mm_tune_olora256_512_llm/config.json @@ -0,0 +1,221 @@ +{ + "_name_or_path": "lmms-lab/LLaVA-Video-7B-Qwen2", + "add_faster_video": false, + "add_time_instruction": true, + "architectures": [ + "LlavaQwenForCausalLM" + ], + "attention_dropout": 0.0, + "bos_token_id": 151643, + "eos_token_id": 151645, + "faster_token_stride": 10, + "force_sample": true, + "hidden_act": "silu", + "hidden_size": 3584, + "ignore_index": -100, + "image_aspect_ratio": "anyres_max_9", + "image_crop_resolution": null, + "image_grid_pinpoints": [ + [ + 384, + 384 + ], + [ + 384, + 768 + ], + [ + 384, + 1152 + ], + [ + 384, + 1536 + ], + [ + 384, + 1920 + ], + [ + 384, + 2304 + ], + [ + 768, + 384 + ], + [ + 768, + 768 + ], + [ + 768, + 1152 + ], + [ + 768, + 1536 + ], + [ + 768, + 1920 + ], + [ + 768, + 2304 + ], + [ + 1152, + 384 + ], + [ + 1152, + 768 + ], + [ + 1152, + 1152 + ], + [ + 1152, + 1536 + ], + [ + 1152, + 1920 + ], + [ + 1152, + 2304 + ], + [ + 1536, + 384 + ], + [ + 1536, + 768 + ], + [ + 1536, + 1152 + ], + [ + 1536, + 1536 + ], + [ + 1536, + 1920 + ], + [ + 1536, + 2304 + ], + [ + 1920, + 384 + ], + [ + 1920, + 768 + ], + [ + 1920, + 1152 + ], + [ + 1920, + 1536 + ], + [ + 1920, + 1920 + ], + [ + 1920, + 2304 + ], + [ + 2304, + 384 + ], + [ + 2304, + 768 + ], + [ + 2304, + 1152 + ], + [ + 2304, + 1536 + ], + [ + 2304, + 1920 + ], + [ + 2304, + 2304 + ] + ], + "image_split_resolution": null, + "image_token_index": 151646, + "initializer_range": 0.02, + "intermediate_size": 18944, + "max_position_embeddings": 32768, + "max_window_layers": 28, + "mm_hidden_size": 1152, + "mm_newline_position": "grid", + "mm_patch_merge_type": "spatial_unpad", + "mm_projector_lr": 2e-05, + "mm_projector_type": "mlp2x_gelu", + "mm_resampler_type": null, + "mm_spatial_pool_mode": "bilinear", + "mm_spatial_pool_stride": 2, + "mm_tunable_parts": "mm_vision_tower,mm_mlp_adapter,mm_language_model", + "mm_use_im_patch_token": false, + "mm_use_im_start_end": false, + "mm_vision_select_feature": "patch", + "mm_vision_select_layer": -2, + "mm_vision_tower": "google/siglip-so400m-patch14-384", + "mm_vision_tower_lr": null, + "model_type": "llava", + "num_attention_heads": 28, + "num_hidden_layers": 28, + "num_key_value_heads": 4, + "pos_skipping_range": 4096, + "projector_hidden_act": "gelu", + "rms_norm_eps": 1e-06, + "rope_scaling": null, + "rope_theta": 1000000.0, + "sliding_window": 131072, + "text_config": { + "model_type": "llama" + }, + "tie_word_embeddings": false, + "tokenizer_model_max_length": 32768, + "tokenizer_padding_side": "right", + "torch_dtype": "bfloat16", + "transformers_version": "4.40.0.dev0", + "use_cache": true, + "use_mm_proj": true, + "use_pos_skipping": false, + "use_sliding_window": false, + "vision_config": { + "hidden_size": 1024, + "image_size": 336, + "intermediate_size": 4096, + "model_type": "clip_vision_model", + "num_attention_heads": 16, + "num_hidden_layers": 24, + "patch_size": 14, + "projection_dim": 768, + "vocab_size": 32000 + }, + "vision_feature_layer": -2, + "vision_feature_select_strategy": "default", + "vision_tower_pretrained": null +} diff --git a/llavaqwen2-google_siglip-so400m-patch14-384-Qwen_Qwen2-7B-Instruct-AG_v5_3_split01_all_mm_tune_olora256_512_llm/generation_config.json b/llavaqwen2-google_siglip-so400m-patch14-384-Qwen_Qwen2-7B-Instruct-AG_v5_3_split01_all_mm_tune_olora256_512_llm/generation_config.json new file mode 100644 index 0000000000000000000000000000000000000000..19a297221acb87418d4388a3decef2282c6d7316 --- /dev/null +++ b/llavaqwen2-google_siglip-so400m-patch14-384-Qwen_Qwen2-7B-Instruct-AG_v5_3_split01_all_mm_tune_olora256_512_llm/generation_config.json @@ -0,0 +1,14 @@ +{ + "bos_token_id": 151643, + "do_sample": true, + "eos_token_id": [ + 151645, + 151643 + ], + "pad_token_id": 151643, + "repetition_penalty": 1.05, + "temperature": 0.7, + "top_k": 20, + "top_p": 0.8, + "transformers_version": "4.40.0.dev0" +} diff --git a/llavaqwen2-google_siglip-so400m-patch14-384-Qwen_Qwen2-7B-Instruct-AG_v5_3_split01_all_mm_tune_olora256_512_llm/non_lora_trainables.bin b/llavaqwen2-google_siglip-so400m-patch14-384-Qwen_Qwen2-7B-Instruct-AG_v5_3_split01_all_mm_tune_olora256_512_llm/non_lora_trainables.bin new file mode 100644 index 0000000000000000000000000000000000000000..6f6f801daba98ef297d1415e4afa0b069b5ac103 --- /dev/null +++ b/llavaqwen2-google_siglip-so400m-patch14-384-Qwen_Qwen2-7B-Instruct-AG_v5_3_split01_all_mm_tune_olora256_512_llm/non_lora_trainables.bin @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:6e6b67548d1e0c12a0ba9bc1f7f3fe44a54a6e2601ea37d771260fd162be7419 +size 33964208 diff --git a/llavaqwen2-google_siglip-so400m-patch14-384-Qwen_Qwen2-7B-Instruct-AG_v5_3_split01_all_mm_tune_olora256_512_llm/trainer_state.json b/llavaqwen2-google_siglip-so400m-patch14-384-Qwen_Qwen2-7B-Instruct-AG_v5_3_split01_all_mm_tune_olora256_512_llm/trainer_state.json new file mode 100644 index 0000000000000000000000000000000000000000..1820211b23cfcd9734df8819c9c7a72e7ccc5c45 --- /dev/null +++ b/llavaqwen2-google_siglip-so400m-patch14-384-Qwen_Qwen2-7B-Instruct-AG_v5_3_split01_all_mm_tune_olora256_512_llm/trainer_state.json @@ -0,0 +1,7030 @@ +{ + "best_metric": null, + "best_model_checkpoint": null, + "epoch": 1.0, + "eval_steps": 500, + "global_step": 1000, + "is_hyper_param_search": false, + "is_local_process_zero": true, + "is_world_process_zero": true, + "log_history": [ + { + "epoch": 0.0, + "grad_norm": 2.3607246746171, + "learning_rate": 3.3333333333333335e-07, + "loss": 0.5894, + "step": 1 + }, + { + "epoch": 0.0, + "grad_norm": 1.6015062312526702, + "learning_rate": 6.666666666666667e-07, + "loss": 0.4503, + "step": 2 + }, + { + "epoch": 0.0, + "grad_norm": 1.7883022890023197, + "learning_rate": 1.0000000000000002e-06, + "loss": 0.564, + "step": 3 + }, + { + "epoch": 0.0, + "grad_norm": 1.446709129426947, + "learning_rate": 1.3333333333333334e-06, + "loss": 0.4052, + "step": 4 + }, + { + "epoch": 0.01, + "grad_norm": 1.950005793454983, + "learning_rate": 1.6666666666666667e-06, + "loss": 0.5694, + "step": 5 + }, + { + "epoch": 0.01, + "grad_norm": 2.0315383895224093, + "learning_rate": 2.0000000000000003e-06, + "loss": 0.6325, + "step": 6 + }, + { + "epoch": 0.01, + "grad_norm": 1.4380940429454823, + "learning_rate": 2.3333333333333336e-06, + "loss": 0.4073, + "step": 7 + }, + { + "epoch": 0.01, + "grad_norm": 1.7902410169543705, + "learning_rate": 2.666666666666667e-06, + "loss": 0.496, + "step": 8 + }, + { + "epoch": 0.01, + "grad_norm": 1.019886784152221, + "learning_rate": 3e-06, + "loss": 0.2978, + "step": 9 + }, + { + "epoch": 0.01, + "grad_norm": 2.305448906172369, + "learning_rate": 3.3333333333333333e-06, + "loss": 0.6514, + "step": 10 + }, + { + "epoch": 0.01, + "grad_norm": 1.7461464618031717, + "learning_rate": 3.6666666666666666e-06, + "loss": 0.4841, + "step": 11 + }, + { + "epoch": 0.01, + "grad_norm": 2.133559780473121, + "learning_rate": 4.000000000000001e-06, + "loss": 0.5899, + "step": 12 + }, + { + "epoch": 0.01, + "grad_norm": 1.9577956061913435, + "learning_rate": 4.333333333333334e-06, + "loss": 0.5176, + "step": 13 + }, + { + "epoch": 0.01, + "grad_norm": 1.3215929577477967, + "learning_rate": 4.666666666666667e-06, + "loss": 0.3868, + "step": 14 + }, + { + "epoch": 0.01, + "grad_norm": 1.2001009666150546, + "learning_rate": 5e-06, + "loss": 0.3958, + "step": 15 + }, + { + "epoch": 0.02, + "grad_norm": 0.9304714590374654, + "learning_rate": 5.333333333333334e-06, + "loss": 0.306, + "step": 16 + }, + { + "epoch": 0.02, + "grad_norm": 1.7767444580668679, + "learning_rate": 5.666666666666667e-06, + "loss": 0.408, + "step": 17 + }, + { + "epoch": 0.02, + "grad_norm": 1.3287907694675276, + "learning_rate": 6e-06, + "loss": 0.3444, + "step": 18 + }, + { + "epoch": 0.02, + "grad_norm": 2.597203933659128, + "learning_rate": 6.333333333333333e-06, + "loss": 0.632, + "step": 19 + }, + { + "epoch": 0.02, + "grad_norm": 1.2620348074780825, + "learning_rate": 6.666666666666667e-06, + "loss": 0.294, + "step": 20 + }, + { + "epoch": 0.02, + "grad_norm": 1.5328912209345755, + "learning_rate": 7e-06, + "loss": 0.3178, + "step": 21 + }, + { + "epoch": 0.02, + "grad_norm": 1.2419857209516763, + "learning_rate": 7.333333333333333e-06, + "loss": 0.2809, + "step": 22 + }, + { + "epoch": 0.02, + "grad_norm": 1.0122248180630153, + "learning_rate": 7.666666666666667e-06, + "loss": 0.1807, + "step": 23 + }, + { + "epoch": 0.02, + "grad_norm": 1.3320938267594107, + "learning_rate": 8.000000000000001e-06, + "loss": 0.2579, + "step": 24 + }, + { + "epoch": 0.03, + "grad_norm": 1.2756210972480526, + "learning_rate": 8.333333333333334e-06, + "loss": 0.2214, + "step": 25 + }, + { + "epoch": 0.03, + "grad_norm": 0.9440953818070815, + "learning_rate": 8.666666666666668e-06, + "loss": 0.1571, + "step": 26 + }, + { + "epoch": 0.03, + "grad_norm": 1.0654633495505794, + "learning_rate": 9e-06, + "loss": 0.1946, + "step": 27 + }, + { + "epoch": 0.03, + "grad_norm": 1.0307271012692505, + "learning_rate": 9.333333333333334e-06, + "loss": 0.2028, + "step": 28 + }, + { + "epoch": 0.03, + "grad_norm": 1.1525380505511964, + "learning_rate": 9.666666666666667e-06, + "loss": 0.2617, + "step": 29 + }, + { + "epoch": 0.03, + "grad_norm": 1.7953701897420127, + "learning_rate": 1e-05, + "loss": 0.1606, + "step": 30 + }, + { + "epoch": 0.03, + "grad_norm": 0.7658330397139902, + "learning_rate": 9.99997377618298e-06, + "loss": 0.1623, + "step": 31 + }, + { + "epoch": 0.03, + "grad_norm": 1.3688769608235227, + "learning_rate": 9.999895105006995e-06, + "loss": 0.232, + "step": 32 + }, + { + "epoch": 0.03, + "grad_norm": 1.0710880606725555, + "learning_rate": 9.999763987297266e-06, + "loss": 0.199, + "step": 33 + }, + { + "epoch": 0.03, + "grad_norm": 0.8922477359097567, + "learning_rate": 9.99958042442916e-06, + "loss": 0.0938, + "step": 34 + }, + { + "epoch": 0.04, + "grad_norm": 1.0519325757175217, + "learning_rate": 9.999344418328161e-06, + "loss": 0.1081, + "step": 35 + }, + { + "epoch": 0.04, + "grad_norm": 1.42008932273011, + "learning_rate": 9.999055971469864e-06, + "loss": 0.2147, + "step": 36 + }, + { + "epoch": 0.04, + "grad_norm": 1.5090612769262195, + "learning_rate": 9.998715086879938e-06, + "loss": 0.2645, + "step": 37 + }, + { + "epoch": 0.04, + "grad_norm": 1.6386571363100515, + "learning_rate": 9.998321768134101e-06, + "loss": 0.1775, + "step": 38 + }, + { + "epoch": 0.04, + "grad_norm": 1.177034629976266, + "learning_rate": 9.997876019358083e-06, + "loss": 0.2036, + "step": 39 + }, + { + "epoch": 0.04, + "grad_norm": 1.0024062591496823, + "learning_rate": 9.997377845227577e-06, + "loss": 0.1728, + "step": 40 + }, + { + "epoch": 0.04, + "grad_norm": 1.0929263562861968, + "learning_rate": 9.99682725096819e-06, + "loss": 0.1565, + "step": 41 + }, + { + "epoch": 0.04, + "grad_norm": 1.3187393755711911, + "learning_rate": 9.9962242423554e-06, + "loss": 0.1816, + "step": 42 + }, + { + "epoch": 0.04, + "grad_norm": 0.8213087137487407, + "learning_rate": 9.995568825714479e-06, + "loss": 0.1353, + "step": 43 + }, + { + "epoch": 0.04, + "grad_norm": 0.649253335581187, + "learning_rate": 9.99486100792044e-06, + "loss": 0.1324, + "step": 44 + }, + { + "epoch": 0.04, + "grad_norm": 0.5717233913418078, + "learning_rate": 9.994100796397954e-06, + "loss": 0.1286, + "step": 45 + }, + { + "epoch": 0.05, + "grad_norm": 0.737297938600789, + "learning_rate": 9.993288199121283e-06, + "loss": 0.1902, + "step": 46 + }, + { + "epoch": 0.05, + "grad_norm": 1.0028142866905823, + "learning_rate": 9.992423224614185e-06, + "loss": 0.1567, + "step": 47 + }, + { + "epoch": 0.05, + "grad_norm": 1.0374643268503156, + "learning_rate": 9.991505881949837e-06, + "loss": 0.1834, + "step": 48 + }, + { + "epoch": 0.05, + "grad_norm": 1.2828809631329192, + "learning_rate": 9.990536180750724e-06, + "loss": 0.1352, + "step": 49 + }, + { + "epoch": 0.05, + "grad_norm": 0.807626588306248, + "learning_rate": 9.98951413118856e-06, + "loss": 0.1295, + "step": 50 + }, + { + "epoch": 0.05, + "grad_norm": 1.6402633907444848, + "learning_rate": 9.988439743984155e-06, + "loss": 0.2704, + "step": 51 + }, + { + "epoch": 0.05, + "grad_norm": 0.9909565607487354, + "learning_rate": 9.987313030407325e-06, + "loss": 0.1682, + "step": 52 + }, + { + "epoch": 0.05, + "grad_norm": 1.1824740893896895, + "learning_rate": 9.98613400227676e-06, + "loss": 0.1557, + "step": 53 + }, + { + "epoch": 0.05, + "grad_norm": 0.5358200613295229, + "learning_rate": 9.984902671959911e-06, + "loss": 0.1026, + "step": 54 + }, + { + "epoch": 0.06, + "grad_norm": 0.9805793717317114, + "learning_rate": 9.983619052372847e-06, + "loss": 0.1293, + "step": 55 + }, + { + "epoch": 0.06, + "grad_norm": 0.6228000314436585, + "learning_rate": 9.982283156980133e-06, + "loss": 0.1249, + "step": 56 + }, + { + "epoch": 0.06, + "grad_norm": 0.6112410766798437, + "learning_rate": 9.980894999794678e-06, + "loss": 0.1226, + "step": 57 + }, + { + "epoch": 0.06, + "grad_norm": 0.5350250183893485, + "learning_rate": 9.979454595377594e-06, + "loss": 0.0583, + "step": 58 + }, + { + "epoch": 0.06, + "grad_norm": 0.5595000396004797, + "learning_rate": 9.97796195883804e-06, + "loss": 0.1351, + "step": 59 + }, + { + "epoch": 0.06, + "grad_norm": 0.7974050724404418, + "learning_rate": 9.97641710583307e-06, + "loss": 0.163, + "step": 60 + }, + { + "epoch": 0.06, + "grad_norm": 0.7866162745931496, + "learning_rate": 9.97482005256746e-06, + "loss": 0.1425, + "step": 61 + }, + { + "epoch": 0.06, + "grad_norm": 0.6241895011307711, + "learning_rate": 9.973170815793543e-06, + "loss": 0.064, + "step": 62 + }, + { + "epoch": 0.06, + "grad_norm": 0.913102515262216, + "learning_rate": 9.971469412811032e-06, + "loss": 0.1483, + "step": 63 + }, + { + "epoch": 0.06, + "grad_norm": 1.3697825079458334, + "learning_rate": 9.969715861466839e-06, + "loss": 0.1984, + "step": 64 + }, + { + "epoch": 0.07, + "grad_norm": 2.187974862551584, + "learning_rate": 9.96791018015489e-06, + "loss": 0.1756, + "step": 65 + }, + { + "epoch": 0.07, + "grad_norm": 0.6520756310162947, + "learning_rate": 9.966052387815923e-06, + "loss": 0.1027, + "step": 66 + }, + { + "epoch": 0.07, + "grad_norm": 0.9336791633038164, + "learning_rate": 9.964142503937305e-06, + "loss": 0.1663, + "step": 67 + }, + { + "epoch": 0.07, + "grad_norm": 0.7459608541470574, + "learning_rate": 9.962180548552812e-06, + "loss": 0.1269, + "step": 68 + }, + { + "epoch": 0.07, + "grad_norm": 0.5369458077637503, + "learning_rate": 9.96016654224243e-06, + "loss": 0.1168, + "step": 69 + }, + { + "epoch": 0.07, + "grad_norm": 2.3099380290630775, + "learning_rate": 9.958100506132127e-06, + "loss": 0.1417, + "step": 70 + }, + { + "epoch": 0.07, + "grad_norm": 0.7088466229007135, + "learning_rate": 9.955982461893648e-06, + "loss": 0.1128, + "step": 71 + }, + { + "epoch": 0.07, + "grad_norm": 0.6273327117991097, + "learning_rate": 9.953812431744274e-06, + "loss": 0.1235, + "step": 72 + }, + { + "epoch": 0.07, + "grad_norm": 0.5693900280941977, + "learning_rate": 9.951590438446597e-06, + "loss": 0.0968, + "step": 73 + }, + { + "epoch": 0.07, + "grad_norm": 0.5719038668639742, + "learning_rate": 9.94931650530827e-06, + "loss": 0.0969, + "step": 74 + }, + { + "epoch": 0.07, + "grad_norm": 0.8622476614437252, + "learning_rate": 9.946990656181782e-06, + "loss": 0.1384, + "step": 75 + }, + { + "epoch": 0.08, + "grad_norm": 0.6993792887886608, + "learning_rate": 9.944612915464183e-06, + "loss": 0.1265, + "step": 76 + }, + { + "epoch": 0.08, + "grad_norm": 0.7640565569055416, + "learning_rate": 9.942183308096853e-06, + "loss": 0.143, + "step": 77 + }, + { + "epoch": 0.08, + "grad_norm": 1.1258360637887, + "learning_rate": 9.93970185956522e-06, + "loss": 0.1361, + "step": 78 + }, + { + "epoch": 0.08, + "grad_norm": 0.5445199443767731, + "learning_rate": 9.93716859589851e-06, + "loss": 0.093, + "step": 79 + }, + { + "epoch": 0.08, + "grad_norm": 1.040171743521041, + "learning_rate": 9.934583543669454e-06, + "loss": 0.124, + "step": 80 + }, + { + "epoch": 0.08, + "grad_norm": 1.9839597034019067, + "learning_rate": 9.93194672999403e-06, + "loss": 0.202, + "step": 81 + }, + { + "epoch": 0.08, + "grad_norm": 0.9301812327121488, + "learning_rate": 9.929258182531167e-06, + "loss": 0.1327, + "step": 82 + }, + { + "epoch": 0.08, + "grad_norm": 1.1345207127377692, + "learning_rate": 9.926517929482454e-06, + "loss": 0.1526, + "step": 83 + }, + { + "epoch": 0.08, + "grad_norm": 0.6622530400858483, + "learning_rate": 9.923725999591846e-06, + "loss": 0.1047, + "step": 84 + }, + { + "epoch": 0.09, + "grad_norm": 0.6711658591110805, + "learning_rate": 9.920882422145372e-06, + "loss": 0.1242, + "step": 85 + }, + { + "epoch": 0.09, + "grad_norm": 0.5879483840752662, + "learning_rate": 9.917987226970811e-06, + "loss": 0.1041, + "step": 86 + }, + { + "epoch": 0.09, + "grad_norm": 0.8511584440885696, + "learning_rate": 9.91504044443739e-06, + "loss": 0.1283, + "step": 87 + }, + { + "epoch": 0.09, + "grad_norm": 0.6782669907972337, + "learning_rate": 9.912042105455462e-06, + "loss": 0.0576, + "step": 88 + }, + { + "epoch": 0.09, + "grad_norm": 0.6457585382953173, + "learning_rate": 9.908992241476189e-06, + "loss": 0.1026, + "step": 89 + }, + { + "epoch": 0.09, + "grad_norm": 0.8833122772627231, + "learning_rate": 9.905890884491196e-06, + "loss": 0.1507, + "step": 90 + }, + { + "epoch": 0.09, + "grad_norm": 1.1980970686962742, + "learning_rate": 9.902738067032254e-06, + "loss": 0.1062, + "step": 91 + }, + { + "epoch": 0.09, + "grad_norm": 1.1334701103868552, + "learning_rate": 9.899533822170922e-06, + "loss": 0.164, + "step": 92 + }, + { + "epoch": 0.09, + "grad_norm": 0.9011291147381055, + "learning_rate": 9.896278183518216e-06, + "loss": 0.1399, + "step": 93 + }, + { + "epoch": 0.09, + "grad_norm": 0.7569316599830422, + "learning_rate": 9.892971185224244e-06, + "loss": 0.1326, + "step": 94 + }, + { + "epoch": 0.1, + "grad_norm": 1.2328121035048378, + "learning_rate": 9.889612861977855e-06, + "loss": 0.1503, + "step": 95 + }, + { + "epoch": 0.1, + "grad_norm": 0.596300915381594, + "learning_rate": 9.886203249006265e-06, + "loss": 0.1142, + "step": 96 + }, + { + "epoch": 0.1, + "grad_norm": 0.6168275952114758, + "learning_rate": 9.882742382074707e-06, + "loss": 0.1048, + "step": 97 + }, + { + "epoch": 0.1, + "grad_norm": 0.6646867123233097, + "learning_rate": 9.879230297486034e-06, + "loss": 0.1268, + "step": 98 + }, + { + "epoch": 0.1, + "grad_norm": 1.4624576438541248, + "learning_rate": 9.875667032080354e-06, + "loss": 0.1636, + "step": 99 + }, + { + "epoch": 0.1, + "grad_norm": 0.696569485211007, + "learning_rate": 9.872052623234632e-06, + "loss": 0.0904, + "step": 100 + }, + { + "epoch": 0.1, + "grad_norm": 0.6876476738040996, + "learning_rate": 9.868387108862307e-06, + "loss": 0.1326, + "step": 101 + }, + { + "epoch": 0.1, + "grad_norm": 1.3128848330139118, + "learning_rate": 9.864670527412891e-06, + "loss": 0.1819, + "step": 102 + }, + { + "epoch": 0.1, + "grad_norm": 0.9664864009368302, + "learning_rate": 9.860902917871566e-06, + "loss": 0.0792, + "step": 103 + }, + { + "epoch": 0.1, + "grad_norm": 0.6920287290973034, + "learning_rate": 9.857084319758772e-06, + "loss": 0.1043, + "step": 104 + }, + { + "epoch": 0.1, + "grad_norm": 0.6092944325732855, + "learning_rate": 9.853214773129796e-06, + "loss": 0.0876, + "step": 105 + }, + { + "epoch": 0.11, + "grad_norm": 1.0643719636119542, + "learning_rate": 9.849294318574353e-06, + "loss": 0.1577, + "step": 106 + }, + { + "epoch": 0.11, + "grad_norm": 0.7729367078232964, + "learning_rate": 9.845322997216153e-06, + "loss": 0.1009, + "step": 107 + }, + { + "epoch": 0.11, + "grad_norm": 1.0307500009173252, + "learning_rate": 9.841300850712479e-06, + "loss": 0.1418, + "step": 108 + }, + { + "epoch": 0.11, + "grad_norm": 0.6172660361836317, + "learning_rate": 9.837227921253747e-06, + "loss": 0.1078, + "step": 109 + }, + { + "epoch": 0.11, + "grad_norm": 0.6053934199519216, + "learning_rate": 9.833104251563058e-06, + "loss": 0.1046, + "step": 110 + }, + { + "epoch": 0.11, + "grad_norm": 0.8492583798544472, + "learning_rate": 9.828929884895753e-06, + "loss": 0.1615, + "step": 111 + }, + { + "epoch": 0.11, + "grad_norm": 0.7898012278460997, + "learning_rate": 9.824704865038967e-06, + "loss": 0.1504, + "step": 112 + }, + { + "epoch": 0.11, + "grad_norm": 0.4423348640674584, + "learning_rate": 9.820429236311158e-06, + "loss": 0.0746, + "step": 113 + }, + { + "epoch": 0.11, + "grad_norm": 0.43000135393988737, + "learning_rate": 9.816103043561648e-06, + "loss": 0.0938, + "step": 114 + }, + { + "epoch": 0.12, + "grad_norm": 1.370505236760701, + "learning_rate": 9.811726332170153e-06, + "loss": 0.1902, + "step": 115 + }, + { + "epoch": 0.12, + "grad_norm": 0.5419284871881185, + "learning_rate": 9.807299148046301e-06, + "loss": 0.0816, + "step": 116 + }, + { + "epoch": 0.12, + "grad_norm": 0.582272008730522, + "learning_rate": 9.802821537629162e-06, + "loss": 0.093, + "step": 117 + }, + { + "epoch": 0.12, + "grad_norm": 0.5038565292887868, + "learning_rate": 9.798293547886748e-06, + "loss": 0.1, + "step": 118 + }, + { + "epoch": 0.12, + "grad_norm": 0.509865304312992, + "learning_rate": 9.79371522631553e-06, + "loss": 0.0955, + "step": 119 + }, + { + "epoch": 0.12, + "grad_norm": 0.4957796733902061, + "learning_rate": 9.789086620939936e-06, + "loss": 0.0876, + "step": 120 + }, + { + "epoch": 0.12, + "grad_norm": 1.0857429677099277, + "learning_rate": 9.784407780311845e-06, + "loss": 0.1366, + "step": 121 + }, + { + "epoch": 0.12, + "grad_norm": 1.3891390182343122, + "learning_rate": 9.779678753510082e-06, + "loss": 0.1722, + "step": 122 + }, + { + "epoch": 0.12, + "grad_norm": 0.6338060483690361, + "learning_rate": 9.774899590139897e-06, + "loss": 0.1151, + "step": 123 + }, + { + "epoch": 0.12, + "grad_norm": 0.38917086192216094, + "learning_rate": 9.770070340332457e-06, + "loss": 0.0861, + "step": 124 + }, + { + "epoch": 0.12, + "grad_norm": 0.7336168996432986, + "learning_rate": 9.765191054744305e-06, + "loss": 0.134, + "step": 125 + }, + { + "epoch": 0.13, + "grad_norm": 1.0534518367367267, + "learning_rate": 9.76026178455684e-06, + "loss": 0.1289, + "step": 126 + }, + { + "epoch": 0.13, + "grad_norm": 0.8079918912060179, + "learning_rate": 9.755282581475769e-06, + "loss": 0.1913, + "step": 127 + }, + { + "epoch": 0.13, + "grad_norm": 0.4763900468548068, + "learning_rate": 9.75025349773058e-06, + "loss": 0.0803, + "step": 128 + }, + { + "epoch": 0.13, + "grad_norm": 0.7681534439869117, + "learning_rate": 9.745174586073982e-06, + "loss": 0.1378, + "step": 129 + }, + { + "epoch": 0.13, + "grad_norm": 0.750963291621297, + "learning_rate": 9.740045899781353e-06, + "loss": 0.1259, + "step": 130 + }, + { + "epoch": 0.13, + "grad_norm": 0.4344226462757411, + "learning_rate": 9.734867492650187e-06, + "loss": 0.0995, + "step": 131 + }, + { + "epoch": 0.13, + "grad_norm": 0.539997704930958, + "learning_rate": 9.729639418999524e-06, + "loss": 0.1098, + "step": 132 + }, + { + "epoch": 0.13, + "grad_norm": 0.6762346368086923, + "learning_rate": 9.724361733669383e-06, + "loss": 0.125, + "step": 133 + }, + { + "epoch": 0.13, + "grad_norm": 0.5485194569881519, + "learning_rate": 9.719034492020183e-06, + "loss": 0.0991, + "step": 134 + }, + { + "epoch": 0.14, + "grad_norm": 1.3110924332122729, + "learning_rate": 9.713657749932172e-06, + "loss": 0.1883, + "step": 135 + }, + { + "epoch": 0.14, + "grad_norm": 0.8191799283206096, + "learning_rate": 9.708231563804828e-06, + "loss": 0.1505, + "step": 136 + }, + { + "epoch": 0.14, + "grad_norm": 0.6577131240580671, + "learning_rate": 9.702755990556277e-06, + "loss": 0.1412, + "step": 137 + }, + { + "epoch": 0.14, + "grad_norm": 0.432630262046169, + "learning_rate": 9.697231087622691e-06, + "loss": 0.1068, + "step": 138 + }, + { + "epoch": 0.14, + "grad_norm": 0.6304900358752927, + "learning_rate": 9.691656912957686e-06, + "loss": 0.104, + "step": 139 + }, + { + "epoch": 0.14, + "grad_norm": 0.5559743037433238, + "learning_rate": 9.68603352503172e-06, + "loss": 0.1034, + "step": 140 + }, + { + "epoch": 0.14, + "grad_norm": 0.4505737091122359, + "learning_rate": 9.680360982831467e-06, + "loss": 0.1156, + "step": 141 + }, + { + "epoch": 0.14, + "grad_norm": 1.0503414847004549, + "learning_rate": 9.674639345859213e-06, + "loss": 0.1081, + "step": 142 + }, + { + "epoch": 0.14, + "grad_norm": 0.953092212431232, + "learning_rate": 9.668868674132224e-06, + "loss": 0.142, + "step": 143 + }, + { + "epoch": 0.14, + "grad_norm": 2.5773173116053294, + "learning_rate": 9.663049028182112e-06, + "loss": 0.1665, + "step": 144 + }, + { + "epoch": 0.14, + "grad_norm": 0.5211093890918267, + "learning_rate": 9.657180469054213e-06, + "loss": 0.1083, + "step": 145 + }, + { + "epoch": 0.15, + "grad_norm": 0.6772333518808913, + "learning_rate": 9.651263058306932e-06, + "loss": 0.1402, + "step": 146 + }, + { + "epoch": 0.15, + "grad_norm": 0.4728466109803736, + "learning_rate": 9.645296858011109e-06, + "loss": 0.1, + "step": 147 + }, + { + "epoch": 0.15, + "grad_norm": 0.5686134641449667, + "learning_rate": 9.639281930749363e-06, + "loss": 0.0981, + "step": 148 + }, + { + "epoch": 0.15, + "grad_norm": 0.5150381464311025, + "learning_rate": 9.633218339615433e-06, + "loss": 0.0768, + "step": 149 + }, + { + "epoch": 0.15, + "grad_norm": 0.7829238992726227, + "learning_rate": 9.627106148213521e-06, + "loss": 0.1199, + "step": 150 + }, + { + "epoch": 0.15, + "grad_norm": 0.5634299468247613, + "learning_rate": 9.620945420657625e-06, + "loss": 0.1284, + "step": 151 + }, + { + "epoch": 0.15, + "grad_norm": 0.7739328263970474, + "learning_rate": 9.61473622157086e-06, + "loss": 0.0756, + "step": 152 + }, + { + "epoch": 0.15, + "grad_norm": 0.4929993829829465, + "learning_rate": 9.608478616084784e-06, + "loss": 0.0835, + "step": 153 + }, + { + "epoch": 0.15, + "grad_norm": 0.6627675519129016, + "learning_rate": 9.602172669838721e-06, + "loss": 0.0857, + "step": 154 + }, + { + "epoch": 0.15, + "grad_norm": 0.43333238348562697, + "learning_rate": 9.595818448979061e-06, + "loss": 0.0629, + "step": 155 + }, + { + "epoch": 0.16, + "grad_norm": 0.4318823799173638, + "learning_rate": 9.589416020158577e-06, + "loss": 0.0726, + "step": 156 + }, + { + "epoch": 0.16, + "grad_norm": 0.3223767412425636, + "learning_rate": 9.582965450535716e-06, + "loss": 0.0711, + "step": 157 + }, + { + "epoch": 0.16, + "grad_norm": 0.8479275013069608, + "learning_rate": 9.5764668077739e-06, + "loss": 0.177, + "step": 158 + }, + { + "epoch": 0.16, + "grad_norm": 0.6033706807381652, + "learning_rate": 9.569920160040815e-06, + "loss": 0.1022, + "step": 159 + }, + { + "epoch": 0.16, + "grad_norm": 0.5960746908872542, + "learning_rate": 9.563325576007702e-06, + "loss": 0.0917, + "step": 160 + }, + { + "epoch": 0.16, + "grad_norm": 0.5989400193286061, + "learning_rate": 9.556683124848624e-06, + "loss": 0.1499, + "step": 161 + }, + { + "epoch": 0.16, + "grad_norm": 0.6321761364280136, + "learning_rate": 9.549992876239753e-06, + "loss": 0.1021, + "step": 162 + }, + { + "epoch": 0.16, + "grad_norm": 0.9456659052294284, + "learning_rate": 9.54325490035863e-06, + "loss": 0.1732, + "step": 163 + }, + { + "epoch": 0.16, + "grad_norm": 1.2369762916556193, + "learning_rate": 9.536469267883432e-06, + "loss": 0.1457, + "step": 164 + }, + { + "epoch": 0.17, + "grad_norm": 0.5776155361505679, + "learning_rate": 9.529636049992235e-06, + "loss": 0.108, + "step": 165 + }, + { + "epoch": 0.17, + "grad_norm": 0.5861181662355707, + "learning_rate": 9.52275531836226e-06, + "loss": 0.0885, + "step": 166 + }, + { + "epoch": 0.17, + "grad_norm": 1.466704353066058, + "learning_rate": 9.515827145169128e-06, + "loss": 0.1213, + "step": 167 + }, + { + "epoch": 0.17, + "grad_norm": 0.9732726445645538, + "learning_rate": 9.508851603086094e-06, + "loss": 0.1545, + "step": 168 + }, + { + "epoch": 0.17, + "grad_norm": 0.8489210921256994, + "learning_rate": 9.501828765283295e-06, + "loss": 0.1307, + "step": 169 + }, + { + "epoch": 0.17, + "grad_norm": 0.7185761664234415, + "learning_rate": 9.494758705426978e-06, + "loss": 0.1384, + "step": 170 + }, + { + "epoch": 0.17, + "grad_norm": 0.7416125172072018, + "learning_rate": 9.487641497678724e-06, + "loss": 0.1252, + "step": 171 + }, + { + "epoch": 0.17, + "grad_norm": 1.1794527596283202, + "learning_rate": 9.480477216694674e-06, + "loss": 0.1259, + "step": 172 + }, + { + "epoch": 0.17, + "grad_norm": 0.7402358672676869, + "learning_rate": 9.473265937624748e-06, + "loss": 0.1198, + "step": 173 + }, + { + "epoch": 0.17, + "grad_norm": 0.6312235522978472, + "learning_rate": 9.466007736111846e-06, + "loss": 0.1148, + "step": 174 + }, + { + "epoch": 0.17, + "grad_norm": 0.7196515773415135, + "learning_rate": 9.458702688291072e-06, + "loss": 0.1319, + "step": 175 + }, + { + "epoch": 0.18, + "grad_norm": 0.6850486515308531, + "learning_rate": 9.451350870788922e-06, + "loss": 0.1245, + "step": 176 + }, + { + "epoch": 0.18, + "grad_norm": 0.5696445336187979, + "learning_rate": 9.443952360722477e-06, + "loss": 0.1393, + "step": 177 + }, + { + "epoch": 0.18, + "grad_norm": 0.5404192629395442, + "learning_rate": 9.436507235698613e-06, + "loss": 0.1339, + "step": 178 + }, + { + "epoch": 0.18, + "grad_norm": 0.8471694217332177, + "learning_rate": 9.429015573813163e-06, + "loss": 0.1648, + "step": 179 + }, + { + "epoch": 0.18, + "grad_norm": 2.2583760623452127, + "learning_rate": 9.421477453650118e-06, + "loss": 0.1804, + "step": 180 + }, + { + "epoch": 0.18, + "grad_norm": 0.9438170221888703, + "learning_rate": 9.413892954280793e-06, + "loss": 0.1331, + "step": 181 + }, + { + "epoch": 0.18, + "grad_norm": 0.5315911820285912, + "learning_rate": 9.406262155262995e-06, + "loss": 0.1131, + "step": 182 + }, + { + "epoch": 0.18, + "grad_norm": 0.519208078747679, + "learning_rate": 9.398585136640195e-06, + "loss": 0.1156, + "step": 183 + }, + { + "epoch": 0.18, + "grad_norm": 0.5286389605503108, + "learning_rate": 9.390861978940687e-06, + "loss": 0.0938, + "step": 184 + }, + { + "epoch": 0.18, + "grad_norm": 0.56762669333044, + "learning_rate": 9.38309276317674e-06, + "loss": 0.1242, + "step": 185 + }, + { + "epoch": 0.19, + "grad_norm": 0.573116084711049, + "learning_rate": 9.37527757084375e-06, + "loss": 0.1163, + "step": 186 + }, + { + "epoch": 0.19, + "grad_norm": 1.1280550535208913, + "learning_rate": 9.367416483919387e-06, + "loss": 0.1394, + "step": 187 + }, + { + "epoch": 0.19, + "grad_norm": 0.7361759351819016, + "learning_rate": 9.359509584862735e-06, + "loss": 0.0944, + "step": 188 + }, + { + "epoch": 0.19, + "grad_norm": 0.6086432318944858, + "learning_rate": 9.351556956613423e-06, + "loss": 0.0966, + "step": 189 + }, + { + "epoch": 0.19, + "grad_norm": 1.5310758967896163, + "learning_rate": 9.343558682590757e-06, + "loss": 0.1755, + "step": 190 + }, + { + "epoch": 0.19, + "grad_norm": 0.5161731418528083, + "learning_rate": 9.335514846692846e-06, + "loss": 0.1072, + "step": 191 + }, + { + "epoch": 0.19, + "grad_norm": 1.6043224377669838, + "learning_rate": 9.327425533295725e-06, + "loss": 0.1523, + "step": 192 + }, + { + "epoch": 0.19, + "grad_norm": 0.5596996008010169, + "learning_rate": 9.31929082725246e-06, + "loss": 0.0894, + "step": 193 + }, + { + "epoch": 0.19, + "grad_norm": 0.4741364703546338, + "learning_rate": 9.31111081389227e-06, + "loss": 0.1253, + "step": 194 + }, + { + "epoch": 0.2, + "grad_norm": 0.4319905138436581, + "learning_rate": 9.302885579019626e-06, + "loss": 0.107, + "step": 195 + }, + { + "epoch": 0.2, + "grad_norm": 0.627710521584554, + "learning_rate": 9.29461520891335e-06, + "loss": 0.1123, + "step": 196 + }, + { + "epoch": 0.2, + "grad_norm": 0.36311863871574673, + "learning_rate": 9.286299790325708e-06, + "loss": 0.0525, + "step": 197 + }, + { + "epoch": 0.2, + "grad_norm": 0.7891128732253475, + "learning_rate": 9.277939410481507e-06, + "loss": 0.1446, + "step": 198 + }, + { + "epoch": 0.2, + "grad_norm": 1.2654640878518608, + "learning_rate": 9.269534157077177e-06, + "loss": 0.1481, + "step": 199 + }, + { + "epoch": 0.2, + "grad_norm": 0.5008647483406797, + "learning_rate": 9.261084118279846e-06, + "loss": 0.0961, + "step": 200 + }, + { + "epoch": 0.2, + "grad_norm": 0.5492177287320504, + "learning_rate": 9.252589382726426e-06, + "loss": 0.0914, + "step": 201 + }, + { + "epoch": 0.2, + "grad_norm": 0.8402239998184694, + "learning_rate": 9.244050039522673e-06, + "loss": 0.1617, + "step": 202 + }, + { + "epoch": 0.2, + "grad_norm": 0.5559678389424397, + "learning_rate": 9.235466178242255e-06, + "loss": 0.0805, + "step": 203 + }, + { + "epoch": 0.2, + "grad_norm": 0.34769819923378315, + "learning_rate": 9.226837888925813e-06, + "loss": 0.0627, + "step": 204 + }, + { + "epoch": 0.2, + "grad_norm": 0.5464670400325233, + "learning_rate": 9.218165262080024e-06, + "loss": 0.0956, + "step": 205 + }, + { + "epoch": 0.21, + "grad_norm": 0.8564112615923611, + "learning_rate": 9.209448388676636e-06, + "loss": 0.1793, + "step": 206 + }, + { + "epoch": 0.21, + "grad_norm": 0.4637904050413668, + "learning_rate": 9.200687360151527e-06, + "loss": 0.0837, + "step": 207 + }, + { + "epoch": 0.21, + "grad_norm": 0.5601509840385756, + "learning_rate": 9.191882268403743e-06, + "loss": 0.1054, + "step": 208 + }, + { + "epoch": 0.21, + "grad_norm": 0.5539453679241664, + "learning_rate": 9.183033205794525e-06, + "loss": 0.1133, + "step": 209 + }, + { + "epoch": 0.21, + "grad_norm": 0.5577119185386734, + "learning_rate": 9.174140265146356e-06, + "loss": 0.1092, + "step": 210 + }, + { + "epoch": 0.21, + "grad_norm": 0.7982965587061583, + "learning_rate": 9.165203539741976e-06, + "loss": 0.0945, + "step": 211 + }, + { + "epoch": 0.21, + "grad_norm": 0.7005488888369035, + "learning_rate": 9.156223123323405e-06, + "loss": 0.1175, + "step": 212 + }, + { + "epoch": 0.21, + "grad_norm": 0.49775597774931946, + "learning_rate": 9.14719911009096e-06, + "loss": 0.1005, + "step": 213 + }, + { + "epoch": 0.21, + "grad_norm": 1.2939083285022925, + "learning_rate": 9.13813159470227e-06, + "loss": 0.1264, + "step": 214 + }, + { + "epoch": 0.21, + "grad_norm": 0.4817398733660537, + "learning_rate": 9.129020672271283e-06, + "loss": 0.104, + "step": 215 + }, + { + "epoch": 0.22, + "grad_norm": 1.6234597598880522, + "learning_rate": 9.119866438367263e-06, + "loss": 0.1971, + "step": 216 + }, + { + "epoch": 0.22, + "grad_norm": 0.44266381304483315, + "learning_rate": 9.11066898901379e-06, + "loss": 0.0781, + "step": 217 + }, + { + "epoch": 0.22, + "grad_norm": 1.1016366481857078, + "learning_rate": 9.101428420687759e-06, + "loss": 0.1109, + "step": 218 + }, + { + "epoch": 0.22, + "grad_norm": 0.7277807395614646, + "learning_rate": 9.092144830318357e-06, + "loss": 0.111, + "step": 219 + }, + { + "epoch": 0.22, + "grad_norm": 0.39280208312630055, + "learning_rate": 9.082818315286054e-06, + "loss": 0.0812, + "step": 220 + }, + { + "epoch": 0.22, + "grad_norm": 0.6050461775054878, + "learning_rate": 9.073448973421581e-06, + "loss": 0.1104, + "step": 221 + }, + { + "epoch": 0.22, + "grad_norm": 0.7444788675747808, + "learning_rate": 9.0640369030049e-06, + "loss": 0.1285, + "step": 222 + }, + { + "epoch": 0.22, + "grad_norm": 0.8148118998555378, + "learning_rate": 9.054582202764175e-06, + "loss": 0.165, + "step": 223 + }, + { + "epoch": 0.22, + "grad_norm": 0.5611328780760997, + "learning_rate": 9.045084971874738e-06, + "loss": 0.0996, + "step": 224 + }, + { + "epoch": 0.23, + "grad_norm": 0.5613399147559089, + "learning_rate": 9.035545309958048e-06, + "loss": 0.1338, + "step": 225 + }, + { + "epoch": 0.23, + "grad_norm": 0.35377812380473267, + "learning_rate": 9.025963317080641e-06, + "loss": 0.0574, + "step": 226 + }, + { + "epoch": 0.23, + "grad_norm": 1.124413626677449, + "learning_rate": 9.016339093753093e-06, + "loss": 0.111, + "step": 227 + }, + { + "epoch": 0.23, + "grad_norm": 0.681070569595969, + "learning_rate": 9.006672740928952e-06, + "loss": 0.0982, + "step": 228 + }, + { + "epoch": 0.23, + "grad_norm": 0.5759668179153918, + "learning_rate": 8.99696436000368e-06, + "loss": 0.0726, + "step": 229 + }, + { + "epoch": 0.23, + "grad_norm": 1.1282044565957114, + "learning_rate": 8.987214052813605e-06, + "loss": 0.1048, + "step": 230 + }, + { + "epoch": 0.23, + "grad_norm": 0.45035923851895315, + "learning_rate": 8.977421921634833e-06, + "loss": 0.0972, + "step": 231 + }, + { + "epoch": 0.23, + "grad_norm": 0.679954755110806, + "learning_rate": 8.967588069182184e-06, + "loss": 0.0965, + "step": 232 + }, + { + "epoch": 0.23, + "grad_norm": 0.4958838697150068, + "learning_rate": 8.957712598608123e-06, + "loss": 0.1084, + "step": 233 + }, + { + "epoch": 0.23, + "grad_norm": 0.946219199103302, + "learning_rate": 8.947795613501658e-06, + "loss": 0.1722, + "step": 234 + }, + { + "epoch": 0.23, + "grad_norm": 0.5869393928381269, + "learning_rate": 8.937837217887273e-06, + "loss": 0.1092, + "step": 235 + }, + { + "epoch": 0.24, + "grad_norm": 0.4229221773571093, + "learning_rate": 8.927837516223824e-06, + "loss": 0.1151, + "step": 236 + }, + { + "epoch": 0.24, + "grad_norm": 0.4559163753035949, + "learning_rate": 8.917796613403451e-06, + "loss": 0.0691, + "step": 237 + }, + { + "epoch": 0.24, + "grad_norm": 0.670297330053043, + "learning_rate": 8.907714614750473e-06, + "loss": 0.13, + "step": 238 + }, + { + "epoch": 0.24, + "grad_norm": 0.43386300447049314, + "learning_rate": 8.897591626020284e-06, + "loss": 0.0955, + "step": 239 + }, + { + "epoch": 0.24, + "grad_norm": 0.6785813412177598, + "learning_rate": 8.887427753398249e-06, + "loss": 0.1308, + "step": 240 + }, + { + "epoch": 0.24, + "grad_norm": 0.6197078038796717, + "learning_rate": 8.877223103498576e-06, + "loss": 0.0958, + "step": 241 + }, + { + "epoch": 0.24, + "grad_norm": 0.7076683957063388, + "learning_rate": 8.866977783363219e-06, + "loss": 0.1035, + "step": 242 + }, + { + "epoch": 0.24, + "grad_norm": 0.6210521343309058, + "learning_rate": 8.85669190046074e-06, + "loss": 0.0873, + "step": 243 + }, + { + "epoch": 0.24, + "grad_norm": 0.3961276294508894, + "learning_rate": 8.846365562685178e-06, + "loss": 0.0856, + "step": 244 + }, + { + "epoch": 0.24, + "grad_norm": 0.7986412285531543, + "learning_rate": 8.83599887835493e-06, + "loss": 0.1862, + "step": 245 + }, + { + "epoch": 0.25, + "grad_norm": 0.4381098035523295, + "learning_rate": 8.825591956211614e-06, + "loss": 0.0909, + "step": 246 + }, + { + "epoch": 0.25, + "grad_norm": 0.7819035176829826, + "learning_rate": 8.815144905418918e-06, + "loss": 0.0882, + "step": 247 + }, + { + "epoch": 0.25, + "grad_norm": 0.8340751878964306, + "learning_rate": 8.804657835561456e-06, + "loss": 0.0708, + "step": 248 + }, + { + "epoch": 0.25, + "grad_norm": 0.4759107242077579, + "learning_rate": 8.794130856643635e-06, + "loss": 0.1174, + "step": 249 + }, + { + "epoch": 0.25, + "grad_norm": 0.4419037098541667, + "learning_rate": 8.783564079088478e-06, + "loss": 0.0611, + "step": 250 + }, + { + "epoch": 0.25, + "grad_norm": 0.6838098339826392, + "learning_rate": 8.772957613736483e-06, + "loss": 0.1318, + "step": 251 + }, + { + "epoch": 0.25, + "grad_norm": 0.6805585810191564, + "learning_rate": 8.762311571844453e-06, + "loss": 0.1144, + "step": 252 + }, + { + "epoch": 0.25, + "grad_norm": 0.880891649880843, + "learning_rate": 8.751626065084328e-06, + "loss": 0.1613, + "step": 253 + }, + { + "epoch": 0.25, + "grad_norm": 0.6484061037081844, + "learning_rate": 8.74090120554202e-06, + "loss": 0.0721, + "step": 254 + }, + { + "epoch": 0.26, + "grad_norm": 0.5481113998080442, + "learning_rate": 8.730137105716231e-06, + "loss": 0.09, + "step": 255 + }, + { + "epoch": 0.26, + "grad_norm": 0.8729016284845439, + "learning_rate": 8.719333878517274e-06, + "loss": 0.1313, + "step": 256 + }, + { + "epoch": 0.26, + "grad_norm": 0.8361974654852907, + "learning_rate": 8.708491637265888e-06, + "loss": 0.1075, + "step": 257 + }, + { + "epoch": 0.26, + "grad_norm": 1.5774979943719853, + "learning_rate": 8.697610495692055e-06, + "loss": 0.107, + "step": 258 + }, + { + "epoch": 0.26, + "grad_norm": 0.6649869661098885, + "learning_rate": 8.686690567933803e-06, + "loss": 0.1118, + "step": 259 + }, + { + "epoch": 0.26, + "grad_norm": 0.7000964181249009, + "learning_rate": 8.675731968536004e-06, + "loss": 0.1259, + "step": 260 + }, + { + "epoch": 0.26, + "grad_norm": 0.662137270152306, + "learning_rate": 8.66473481244918e-06, + "loss": 0.0957, + "step": 261 + }, + { + "epoch": 0.26, + "grad_norm": 0.8378809743130473, + "learning_rate": 8.653699215028298e-06, + "loss": 0.1436, + "step": 262 + }, + { + "epoch": 0.26, + "grad_norm": 0.6933424859787171, + "learning_rate": 8.64262529203155e-06, + "loss": 0.1198, + "step": 263 + }, + { + "epoch": 0.26, + "grad_norm": 1.5675466712230994, + "learning_rate": 8.63151315961915e-06, + "loss": 0.1583, + "step": 264 + }, + { + "epoch": 0.27, + "grad_norm": 0.6733269469920226, + "learning_rate": 8.620362934352109e-06, + "loss": 0.1049, + "step": 265 + }, + { + "epoch": 0.27, + "grad_norm": 1.086757466314047, + "learning_rate": 8.609174733191012e-06, + "loss": 0.1396, + "step": 266 + }, + { + "epoch": 0.27, + "grad_norm": 0.7256776861439452, + "learning_rate": 8.597948673494794e-06, + "loss": 0.162, + "step": 267 + }, + { + "epoch": 0.27, + "grad_norm": 0.47995856922745955, + "learning_rate": 8.586684873019513e-06, + "loss": 0.0983, + "step": 268 + }, + { + "epoch": 0.27, + "grad_norm": 0.8221525546080067, + "learning_rate": 8.575383449917103e-06, + "loss": 0.1, + "step": 269 + }, + { + "epoch": 0.27, + "grad_norm": 0.6149586237967274, + "learning_rate": 8.564044522734147e-06, + "loss": 0.1187, + "step": 270 + }, + { + "epoch": 0.27, + "grad_norm": 5.290261899072581, + "learning_rate": 8.552668210410624e-06, + "loss": 0.1215, + "step": 271 + }, + { + "epoch": 0.27, + "grad_norm": 0.7103356271751441, + "learning_rate": 8.541254632278667e-06, + "loss": 0.1128, + "step": 272 + }, + { + "epoch": 0.27, + "grad_norm": 0.8183102626323511, + "learning_rate": 8.52980390806131e-06, + "loss": 0.0652, + "step": 273 + }, + { + "epoch": 0.27, + "grad_norm": 0.4924364063295098, + "learning_rate": 8.518316157871232e-06, + "loss": 0.0811, + "step": 274 + }, + { + "epoch": 0.28, + "grad_norm": 0.6379863736427677, + "learning_rate": 8.506791502209497e-06, + "loss": 0.1421, + "step": 275 + }, + { + "epoch": 0.28, + "grad_norm": 0.5365549756136844, + "learning_rate": 8.495230061964289e-06, + "loss": 0.1037, + "step": 276 + }, + { + "epoch": 0.28, + "grad_norm": 0.5918213588332102, + "learning_rate": 8.483631958409644e-06, + "loss": 0.1091, + "step": 277 + }, + { + "epoch": 0.28, + "grad_norm": 0.5910185718944498, + "learning_rate": 8.471997313204183e-06, + "loss": 0.1008, + "step": 278 + }, + { + "epoch": 0.28, + "grad_norm": 0.8434865299141647, + "learning_rate": 8.460326248389825e-06, + "loss": 0.1261, + "step": 279 + }, + { + "epoch": 0.28, + "grad_norm": 0.8084790273022816, + "learning_rate": 8.448618886390523e-06, + "loss": 0.1472, + "step": 280 + }, + { + "epoch": 0.28, + "grad_norm": 0.6064244109977189, + "learning_rate": 8.436875350010958e-06, + "loss": 0.1378, + "step": 281 + }, + { + "epoch": 0.28, + "grad_norm": 0.8099455428698011, + "learning_rate": 8.425095762435274e-06, + "loss": 0.1488, + "step": 282 + }, + { + "epoch": 0.28, + "grad_norm": 0.854892262931246, + "learning_rate": 8.41328024722577e-06, + "loss": 0.1498, + "step": 283 + }, + { + "epoch": 0.28, + "grad_norm": 0.5185575941538992, + "learning_rate": 8.401428928321607e-06, + "loss": 0.0902, + "step": 284 + }, + { + "epoch": 0.28, + "grad_norm": 0.5965587314972228, + "learning_rate": 8.389541930037516e-06, + "loss": 0.106, + "step": 285 + }, + { + "epoch": 0.29, + "grad_norm": 0.459582914012201, + "learning_rate": 8.377619377062483e-06, + "loss": 0.0982, + "step": 286 + }, + { + "epoch": 0.29, + "grad_norm": 0.6673096852048638, + "learning_rate": 8.365661394458446e-06, + "loss": 0.1108, + "step": 287 + }, + { + "epoch": 0.29, + "grad_norm": 0.6698950014998947, + "learning_rate": 8.353668107658984e-06, + "loss": 0.1296, + "step": 288 + }, + { + "epoch": 0.29, + "grad_norm": 0.9163329174066326, + "learning_rate": 8.341639642468002e-06, + "loss": 0.1499, + "step": 289 + }, + { + "epoch": 0.29, + "grad_norm": 0.6835073933135426, + "learning_rate": 8.329576125058406e-06, + "loss": 0.0651, + "step": 290 + }, + { + "epoch": 0.29, + "grad_norm": 0.47586344778493134, + "learning_rate": 8.317477681970786e-06, + "loss": 0.0982, + "step": 291 + }, + { + "epoch": 0.29, + "grad_norm": 1.027887942457216, + "learning_rate": 8.305344440112089e-06, + "loss": 0.1193, + "step": 292 + }, + { + "epoch": 0.29, + "grad_norm": 0.6883146738724482, + "learning_rate": 8.293176526754274e-06, + "loss": 0.1636, + "step": 293 + }, + { + "epoch": 0.29, + "grad_norm": 1.308519284995934, + "learning_rate": 8.280974069532999e-06, + "loss": 0.1081, + "step": 294 + }, + { + "epoch": 0.29, + "grad_norm": 0.5575442309211142, + "learning_rate": 8.268737196446264e-06, + "loss": 0.1049, + "step": 295 + }, + { + "epoch": 0.3, + "grad_norm": 0.4295260773986241, + "learning_rate": 8.256466035853077e-06, + "loss": 0.0751, + "step": 296 + }, + { + "epoch": 0.3, + "grad_norm": 1.6141976073277213, + "learning_rate": 8.244160716472109e-06, + "loss": 0.1336, + "step": 297 + }, + { + "epoch": 0.3, + "grad_norm": 1.1909104778355368, + "learning_rate": 8.231821367380335e-06, + "loss": 0.1719, + "step": 298 + }, + { + "epoch": 0.3, + "grad_norm": 0.5340253751607172, + "learning_rate": 8.219448118011687e-06, + "loss": 0.0899, + "step": 299 + }, + { + "epoch": 0.3, + "grad_norm": 1.029527908281168, + "learning_rate": 8.207041098155701e-06, + "loss": 0.1151, + "step": 300 + }, + { + "epoch": 0.3, + "grad_norm": 0.8985930248766892, + "learning_rate": 8.19460043795614e-06, + "loss": 0.1225, + "step": 301 + }, + { + "epoch": 0.3, + "grad_norm": 0.5206416280199094, + "learning_rate": 8.182126267909642e-06, + "loss": 0.0987, + "step": 302 + }, + { + "epoch": 0.3, + "grad_norm": 0.9381246435874875, + "learning_rate": 8.16961871886435e-06, + "loss": 0.2008, + "step": 303 + }, + { + "epoch": 0.3, + "grad_norm": 1.0766922252901279, + "learning_rate": 8.157077922018537e-06, + "loss": 0.1593, + "step": 304 + }, + { + "epoch": 0.3, + "grad_norm": 0.5950079693273216, + "learning_rate": 8.144504008919224e-06, + "loss": 0.099, + "step": 305 + }, + { + "epoch": 0.31, + "grad_norm": 0.5071063988827308, + "learning_rate": 8.13189711146081e-06, + "loss": 0.0884, + "step": 306 + }, + { + "epoch": 0.31, + "grad_norm": 0.6170093430029727, + "learning_rate": 8.119257361883686e-06, + "loss": 0.0949, + "step": 307 + }, + { + "epoch": 0.31, + "grad_norm": 0.6216963302278578, + "learning_rate": 8.106584892772844e-06, + "loss": 0.135, + "step": 308 + }, + { + "epoch": 0.31, + "grad_norm": 0.7515361895060123, + "learning_rate": 8.093879837056486e-06, + "loss": 0.1424, + "step": 309 + }, + { + "epoch": 0.31, + "grad_norm": 0.5301845393435228, + "learning_rate": 8.081142328004638e-06, + "loss": 0.0842, + "step": 310 + }, + { + "epoch": 0.31, + "grad_norm": 0.36401321561122435, + "learning_rate": 8.068372499227738e-06, + "loss": 0.0941, + "step": 311 + }, + { + "epoch": 0.31, + "grad_norm": 0.43388092832373126, + "learning_rate": 8.055570484675252e-06, + "loss": 0.0812, + "step": 312 + }, + { + "epoch": 0.31, + "grad_norm": 0.3111677501066997, + "learning_rate": 8.042736418634252e-06, + "loss": 0.071, + "step": 313 + }, + { + "epoch": 0.31, + "grad_norm": 0.2711927555540456, + "learning_rate": 8.029870435728018e-06, + "loss": 0.0665, + "step": 314 + }, + { + "epoch": 0.32, + "grad_norm": 0.6158192375387915, + "learning_rate": 8.016972670914624e-06, + "loss": 0.1366, + "step": 315 + }, + { + "epoch": 0.32, + "grad_norm": 0.5393832711474328, + "learning_rate": 8.004043259485519e-06, + "loss": 0.1256, + "step": 316 + }, + { + "epoch": 0.32, + "grad_norm": 0.819073710300546, + "learning_rate": 7.99108233706411e-06, + "loss": 0.1342, + "step": 317 + }, + { + "epoch": 0.32, + "grad_norm": 0.4881682283012466, + "learning_rate": 7.978090039604342e-06, + "loss": 0.1098, + "step": 318 + }, + { + "epoch": 0.32, + "grad_norm": 1.425625759706754, + "learning_rate": 7.965066503389264e-06, + "loss": 0.1157, + "step": 319 + }, + { + "epoch": 0.32, + "grad_norm": 0.8417583243303938, + "learning_rate": 7.952011865029614e-06, + "loss": 0.1504, + "step": 320 + }, + { + "epoch": 0.32, + "grad_norm": 0.5241553526731788, + "learning_rate": 7.938926261462366e-06, + "loss": 0.1047, + "step": 321 + }, + { + "epoch": 0.32, + "grad_norm": 1.092665792919607, + "learning_rate": 7.925809829949312e-06, + "loss": 0.1766, + "step": 322 + }, + { + "epoch": 0.32, + "grad_norm": 0.48982963314229716, + "learning_rate": 7.91266270807561e-06, + "loss": 0.1012, + "step": 323 + }, + { + "epoch": 0.32, + "grad_norm": 0.5045779559864191, + "learning_rate": 7.89948503374835e-06, + "loss": 0.0838, + "step": 324 + }, + { + "epoch": 0.33, + "grad_norm": 1.0147095478683965, + "learning_rate": 7.886276945195098e-06, + "loss": 0.1543, + "step": 325 + }, + { + "epoch": 0.33, + "grad_norm": 0.6721766277581208, + "learning_rate": 7.873038580962453e-06, + "loss": 0.0797, + "step": 326 + }, + { + "epoch": 0.33, + "grad_norm": 1.2897950063397245, + "learning_rate": 7.859770079914592e-06, + "loss": 0.1268, + "step": 327 + }, + { + "epoch": 0.33, + "grad_norm": 0.6630020038299413, + "learning_rate": 7.846471581231814e-06, + "loss": 0.0768, + "step": 328 + }, + { + "epoch": 0.33, + "grad_norm": 0.7728033471228617, + "learning_rate": 7.833143224409076e-06, + "loss": 0.1191, + "step": 329 + }, + { + "epoch": 0.33, + "grad_norm": 0.4554370312043164, + "learning_rate": 7.819785149254534e-06, + "loss": 0.0948, + "step": 330 + }, + { + "epoch": 0.33, + "grad_norm": 0.4023913241784609, + "learning_rate": 7.806397495888074e-06, + "loss": 0.1015, + "step": 331 + }, + { + "epoch": 0.33, + "grad_norm": 0.5534162223569474, + "learning_rate": 7.792980404739849e-06, + "loss": 0.0938, + "step": 332 + }, + { + "epoch": 0.33, + "grad_norm": 0.46691388495842806, + "learning_rate": 7.779534016548791e-06, + "loss": 0.0601, + "step": 333 + }, + { + "epoch": 0.33, + "grad_norm": 0.7957525089538556, + "learning_rate": 7.766058472361154e-06, + "loss": 0.0786, + "step": 334 + }, + { + "epoch": 0.34, + "grad_norm": 0.6171254836650297, + "learning_rate": 7.752553913529019e-06, + "loss": 0.1148, + "step": 335 + }, + { + "epoch": 0.34, + "grad_norm": 0.39913106915745733, + "learning_rate": 7.739020481708816e-06, + "loss": 0.1068, + "step": 336 + }, + { + "epoch": 0.34, + "grad_norm": 0.8911333363777895, + "learning_rate": 7.725458318859842e-06, + "loss": 0.102, + "step": 337 + }, + { + "epoch": 0.34, + "grad_norm": 0.4629156522652224, + "learning_rate": 7.711867567242769e-06, + "loss": 0.0943, + "step": 338 + }, + { + "epoch": 0.34, + "grad_norm": 0.6427640885980827, + "learning_rate": 7.698248369418146e-06, + "loss": 0.1326, + "step": 339 + }, + { + "epoch": 0.34, + "grad_norm": 0.5659495632220061, + "learning_rate": 7.68460086824492e-06, + "loss": 0.0995, + "step": 340 + }, + { + "epoch": 0.34, + "grad_norm": 0.6325174096409139, + "learning_rate": 7.670925206878917e-06, + "loss": 0.1138, + "step": 341 + }, + { + "epoch": 0.34, + "grad_norm": 0.7936200068990925, + "learning_rate": 7.657221528771352e-06, + "loss": 0.1654, + "step": 342 + }, + { + "epoch": 0.34, + "grad_norm": 0.8356850463342996, + "learning_rate": 7.643489977667327e-06, + "loss": 0.1486, + "step": 343 + }, + { + "epoch": 0.34, + "grad_norm": 0.7685126012757484, + "learning_rate": 7.629730697604314e-06, + "loss": 0.1103, + "step": 344 + }, + { + "epoch": 0.34, + "grad_norm": 0.4838660469726114, + "learning_rate": 7.61594383291065e-06, + "loss": 0.1066, + "step": 345 + }, + { + "epoch": 0.35, + "grad_norm": 0.3483440899453769, + "learning_rate": 7.602129528204023e-06, + "loss": 0.0903, + "step": 346 + }, + { + "epoch": 0.35, + "grad_norm": 0.8548146759546644, + "learning_rate": 7.588287928389952e-06, + "loss": 0.1636, + "step": 347 + }, + { + "epoch": 0.35, + "grad_norm": 0.4294305083473748, + "learning_rate": 7.574419178660269e-06, + "loss": 0.1158, + "step": 348 + }, + { + "epoch": 0.35, + "grad_norm": 0.4115732988087181, + "learning_rate": 7.560523424491595e-06, + "loss": 0.1021, + "step": 349 + }, + { + "epoch": 0.35, + "grad_norm": 1.0097008417071993, + "learning_rate": 7.546600811643816e-06, + "loss": 0.1783, + "step": 350 + }, + { + "epoch": 0.35, + "grad_norm": 0.6259202083211147, + "learning_rate": 7.532651486158554e-06, + "loss": 0.1021, + "step": 351 + }, + { + "epoch": 0.35, + "grad_norm": 0.7320795596317787, + "learning_rate": 7.5186755943576324e-06, + "loss": 0.1083, + "step": 352 + }, + { + "epoch": 0.35, + "grad_norm": 0.39660777221329047, + "learning_rate": 7.504673282841544e-06, + "loss": 0.0782, + "step": 353 + }, + { + "epoch": 0.35, + "grad_norm": 0.5037925010178963, + "learning_rate": 7.490644698487909e-06, + "loss": 0.1066, + "step": 354 + }, + { + "epoch": 0.35, + "grad_norm": 0.3920215273167089, + "learning_rate": 7.476589988449939e-06, + "loss": 0.073, + "step": 355 + }, + { + "epoch": 0.36, + "grad_norm": 0.6877910161758389, + "learning_rate": 7.462509300154892e-06, + "loss": 0.1165, + "step": 356 + }, + { + "epoch": 0.36, + "grad_norm": 0.6193936684890975, + "learning_rate": 7.448402781302526e-06, + "loss": 0.0897, + "step": 357 + }, + { + "epoch": 0.36, + "grad_norm": 0.6544150672463809, + "learning_rate": 7.434270579863549e-06, + "loss": 0.0911, + "step": 358 + }, + { + "epoch": 0.36, + "grad_norm": 1.314171970891291, + "learning_rate": 7.420112844078066e-06, + "loss": 0.1305, + "step": 359 + }, + { + "epoch": 0.36, + "grad_norm": 0.8041982462278258, + "learning_rate": 7.405929722454026e-06, + "loss": 0.1192, + "step": 360 + }, + { + "epoch": 0.36, + "grad_norm": 0.5637989804443223, + "learning_rate": 7.391721363765664e-06, + "loss": 0.1563, + "step": 361 + }, + { + "epoch": 0.36, + "grad_norm": 0.7521330412268286, + "learning_rate": 7.3774879170519386e-06, + "loss": 0.1577, + "step": 362 + }, + { + "epoch": 0.36, + "grad_norm": 0.39846411449313535, + "learning_rate": 7.363229531614973e-06, + "loss": 0.1052, + "step": 363 + }, + { + "epoch": 0.36, + "grad_norm": 0.46842312605388364, + "learning_rate": 7.348946357018479e-06, + "loss": 0.0967, + "step": 364 + }, + { + "epoch": 0.36, + "grad_norm": 0.5908090831853243, + "learning_rate": 7.334638543086203e-06, + "loss": 0.0759, + "step": 365 + }, + { + "epoch": 0.37, + "grad_norm": 0.3214400708242895, + "learning_rate": 7.320306239900343e-06, + "loss": 0.0385, + "step": 366 + }, + { + "epoch": 0.37, + "grad_norm": 0.722132793266274, + "learning_rate": 7.305949597799976e-06, + "loss": 0.1114, + "step": 367 + }, + { + "epoch": 0.37, + "grad_norm": 3.3269546028744137, + "learning_rate": 7.291568767379484e-06, + "loss": 0.1547, + "step": 368 + }, + { + "epoch": 0.37, + "grad_norm": 0.43869990888711935, + "learning_rate": 7.277163899486975e-06, + "loss": 0.0412, + "step": 369 + }, + { + "epoch": 0.37, + "grad_norm": 2.0948960403311223, + "learning_rate": 7.262735145222696e-06, + "loss": 0.117, + "step": 370 + }, + { + "epoch": 0.37, + "grad_norm": 0.6680283933724516, + "learning_rate": 7.248282655937451e-06, + "loss": 0.0918, + "step": 371 + }, + { + "epoch": 0.37, + "grad_norm": 0.5454331336315559, + "learning_rate": 7.233806583231012e-06, + "loss": 0.0615, + "step": 372 + }, + { + "epoch": 0.37, + "grad_norm": 0.5088438263092467, + "learning_rate": 7.219307078950536e-06, + "loss": 0.0816, + "step": 373 + }, + { + "epoch": 0.37, + "grad_norm": 0.6588995635030355, + "learning_rate": 7.204784295188959e-06, + "loss": 0.1221, + "step": 374 + }, + { + "epoch": 0.38, + "grad_norm": 0.9800509058632336, + "learning_rate": 7.190238384283413e-06, + "loss": 0.1574, + "step": 375 + }, + { + "epoch": 0.38, + "grad_norm": 1.2052741828186413, + "learning_rate": 7.1756694988136165e-06, + "loss": 0.1793, + "step": 376 + }, + { + "epoch": 0.38, + "grad_norm": 0.7092844962709588, + "learning_rate": 7.161077791600288e-06, + "loss": 0.1434, + "step": 377 + }, + { + "epoch": 0.38, + "grad_norm": 0.8103864711082105, + "learning_rate": 7.14646341570353e-06, + "loss": 0.1735, + "step": 378 + }, + { + "epoch": 0.38, + "grad_norm": 0.5186224805906783, + "learning_rate": 7.1318265244212305e-06, + "loss": 0.1008, + "step": 379 + }, + { + "epoch": 0.38, + "grad_norm": 0.5046095769937358, + "learning_rate": 7.117167271287453e-06, + "loss": 0.1266, + "step": 380 + }, + { + "epoch": 0.38, + "grad_norm": 0.6291806702745448, + "learning_rate": 7.102485810070824e-06, + "loss": 0.129, + "step": 381 + }, + { + "epoch": 0.38, + "grad_norm": 0.8706680878449085, + "learning_rate": 7.0877822947729265e-06, + "loss": 0.1425, + "step": 382 + }, + { + "epoch": 0.38, + "grad_norm": 0.7517343416434489, + "learning_rate": 7.073056879626681e-06, + "loss": 0.1381, + "step": 383 + }, + { + "epoch": 0.38, + "grad_norm": 0.4850521528617002, + "learning_rate": 7.05830971909472e-06, + "loss": 0.0837, + "step": 384 + }, + { + "epoch": 0.39, + "grad_norm": 0.467190916228377, + "learning_rate": 7.043540967867782e-06, + "loss": 0.1229, + "step": 385 + }, + { + "epoch": 0.39, + "grad_norm": 0.35442767259956226, + "learning_rate": 7.028750780863078e-06, + "loss": 0.0604, + "step": 386 + }, + { + "epoch": 0.39, + "grad_norm": 0.632531620020242, + "learning_rate": 7.013939313222669e-06, + "loss": 0.0992, + "step": 387 + }, + { + "epoch": 0.39, + "grad_norm": 0.7721017686992716, + "learning_rate": 6.999106720311846e-06, + "loss": 0.1562, + "step": 388 + }, + { + "epoch": 0.39, + "grad_norm": 0.6265392153084421, + "learning_rate": 6.9842531577174865e-06, + "loss": 0.138, + "step": 389 + }, + { + "epoch": 0.39, + "grad_norm": 0.8619646556127373, + "learning_rate": 6.969378781246436e-06, + "loss": 0.1323, + "step": 390 + }, + { + "epoch": 0.39, + "grad_norm": 0.6973949993687323, + "learning_rate": 6.954483746923865e-06, + "loss": 0.1082, + "step": 391 + }, + { + "epoch": 0.39, + "grad_norm": 0.48241252511918464, + "learning_rate": 6.939568210991633e-06, + "loss": 0.1345, + "step": 392 + }, + { + "epoch": 0.39, + "grad_norm": 0.39274471270934264, + "learning_rate": 6.924632329906657e-06, + "loss": 0.1055, + "step": 393 + }, + { + "epoch": 0.39, + "grad_norm": 0.5781946342673107, + "learning_rate": 6.9096762603392595e-06, + "loss": 0.1409, + "step": 394 + }, + { + "epoch": 0.4, + "grad_norm": 0.3525825420240066, + "learning_rate": 6.894700159171535e-06, + "loss": 0.0883, + "step": 395 + }, + { + "epoch": 0.4, + "grad_norm": 0.5246273586743376, + "learning_rate": 6.8797041834956955e-06, + "loss": 0.081, + "step": 396 + }, + { + "epoch": 0.4, + "grad_norm": 0.6518585669331103, + "learning_rate": 6.8646884906124345e-06, + "loss": 0.1007, + "step": 397 + }, + { + "epoch": 0.4, + "grad_norm": 0.3508427750003166, + "learning_rate": 6.849653238029261e-06, + "loss": 0.0641, + "step": 398 + }, + { + "epoch": 0.4, + "grad_norm": 0.6551363944925317, + "learning_rate": 6.834598583458862e-06, + "loss": 0.1292, + "step": 399 + }, + { + "epoch": 0.4, + "grad_norm": 0.7451482471986087, + "learning_rate": 6.819524684817439e-06, + "loss": 0.1621, + "step": 400 + }, + { + "epoch": 0.4, + "grad_norm": 0.5304490064747236, + "learning_rate": 6.804431700223057e-06, + "loss": 0.0912, + "step": 401 + }, + { + "epoch": 0.4, + "grad_norm": 0.767445319430965, + "learning_rate": 6.78931978799398e-06, + "loss": 0.1184, + "step": 402 + }, + { + "epoch": 0.4, + "grad_norm": 0.5097226662887627, + "learning_rate": 6.774189106647021e-06, + "loss": 0.0819, + "step": 403 + }, + { + "epoch": 0.4, + "grad_norm": 0.40426010526771866, + "learning_rate": 6.7590398148958625e-06, + "loss": 0.0621, + "step": 404 + }, + { + "epoch": 0.41, + "grad_norm": 0.6698796219963394, + "learning_rate": 6.743872071649411e-06, + "loss": 0.0607, + "step": 405 + }, + { + "epoch": 0.41, + "grad_norm": 0.7051411305657742, + "learning_rate": 6.728686036010115e-06, + "loss": 0.1194, + "step": 406 + }, + { + "epoch": 0.41, + "grad_norm": 0.5192947281652533, + "learning_rate": 6.7134818672723005e-06, + "loss": 0.1401, + "step": 407 + }, + { + "epoch": 0.41, + "grad_norm": 0.5375041644147377, + "learning_rate": 6.698259724920503e-06, + "loss": 0.114, + "step": 408 + }, + { + "epoch": 0.41, + "grad_norm": 0.5586400961711949, + "learning_rate": 6.6830197686277945e-06, + "loss": 0.1078, + "step": 409 + }, + { + "epoch": 0.41, + "grad_norm": 1.1282069492368068, + "learning_rate": 6.667762158254104e-06, + "loss": 0.1872, + "step": 410 + }, + { + "epoch": 0.41, + "grad_norm": 0.4389746508236176, + "learning_rate": 6.652487053844544e-06, + "loss": 0.0831, + "step": 411 + }, + { + "epoch": 0.41, + "grad_norm": 0.5030133333814518, + "learning_rate": 6.637194615627733e-06, + "loss": 0.1168, + "step": 412 + }, + { + "epoch": 0.41, + "grad_norm": 0.5244578006283286, + "learning_rate": 6.621885004014113e-06, + "loss": 0.1154, + "step": 413 + }, + { + "epoch": 0.41, + "grad_norm": 0.3319122797122553, + "learning_rate": 6.6065583795942625e-06, + "loss": 0.0897, + "step": 414 + }, + { + "epoch": 0.41, + "grad_norm": 0.8670940267704824, + "learning_rate": 6.591214903137221e-06, + "loss": 0.1614, + "step": 415 + }, + { + "epoch": 0.42, + "grad_norm": 0.3790472396407466, + "learning_rate": 6.5758547355887944e-06, + "loss": 0.0799, + "step": 416 + }, + { + "epoch": 0.42, + "grad_norm": 0.6708691225198117, + "learning_rate": 6.560478038069873e-06, + "loss": 0.1179, + "step": 417 + }, + { + "epoch": 0.42, + "grad_norm": 0.5613038123983146, + "learning_rate": 6.545084971874738e-06, + "loss": 0.0733, + "step": 418 + }, + { + "epoch": 0.42, + "grad_norm": 0.3911040134070448, + "learning_rate": 6.52967569846937e-06, + "loss": 0.0664, + "step": 419 + }, + { + "epoch": 0.42, + "grad_norm": 0.7275454662991591, + "learning_rate": 6.514250379489754e-06, + "loss": 0.1519, + "step": 420 + }, + { + "epoch": 0.42, + "grad_norm": 0.5393692947961118, + "learning_rate": 6.49880917674019e-06, + "loss": 0.0969, + "step": 421 + }, + { + "epoch": 0.42, + "grad_norm": 0.44961915653679846, + "learning_rate": 6.483352252191585e-06, + "loss": 0.0591, + "step": 422 + }, + { + "epoch": 0.42, + "grad_norm": 0.4186506337439106, + "learning_rate": 6.467879767979764e-06, + "loss": 0.062, + "step": 423 + }, + { + "epoch": 0.42, + "grad_norm": 0.3253164042363115, + "learning_rate": 6.452391886403767e-06, + "loss": 0.0544, + "step": 424 + }, + { + "epoch": 0.42, + "grad_norm": 0.8390011982461838, + "learning_rate": 6.436888769924142e-06, + "loss": 0.1382, + "step": 425 + }, + { + "epoch": 0.43, + "grad_norm": 0.795830650877867, + "learning_rate": 6.421370581161244e-06, + "loss": 0.119, + "step": 426 + }, + { + "epoch": 0.43, + "grad_norm": 1.3216213141007211, + "learning_rate": 6.405837482893529e-06, + "loss": 0.2146, + "step": 427 + }, + { + "epoch": 0.43, + "grad_norm": 0.7353417059435452, + "learning_rate": 6.390289638055851e-06, + "loss": 0.16, + "step": 428 + }, + { + "epoch": 0.43, + "grad_norm": 0.4257100364828846, + "learning_rate": 6.374727209737743e-06, + "loss": 0.0681, + "step": 429 + }, + { + "epoch": 0.43, + "grad_norm": 0.9308011184791537, + "learning_rate": 6.3591503611817155e-06, + "loss": 0.1233, + "step": 430 + }, + { + "epoch": 0.43, + "grad_norm": 0.46600768468553494, + "learning_rate": 6.343559255781538e-06, + "loss": 0.0968, + "step": 431 + }, + { + "epoch": 0.43, + "grad_norm": 0.45217837773837133, + "learning_rate": 6.3279540570805265e-06, + "loss": 0.1168, + "step": 432 + }, + { + "epoch": 0.43, + "grad_norm": 0.8122425030745498, + "learning_rate": 6.3123349287698345e-06, + "loss": 0.1554, + "step": 433 + }, + { + "epoch": 0.43, + "grad_norm": 0.5831822627226863, + "learning_rate": 6.296702034686726e-06, + "loss": 0.1024, + "step": 434 + }, + { + "epoch": 0.43, + "grad_norm": 0.6510368385175214, + "learning_rate": 6.281055538812861e-06, + "loss": 0.1557, + "step": 435 + }, + { + "epoch": 0.44, + "grad_norm": 0.5908036903679669, + "learning_rate": 6.265395605272581e-06, + "loss": 0.1302, + "step": 436 + }, + { + "epoch": 0.44, + "grad_norm": 1.083073206243553, + "learning_rate": 6.249722398331177e-06, + "loss": 0.1211, + "step": 437 + }, + { + "epoch": 0.44, + "grad_norm": 0.5113900815850158, + "learning_rate": 6.234036082393171e-06, + "loss": 0.1038, + "step": 438 + }, + { + "epoch": 0.44, + "grad_norm": 0.4695176010179583, + "learning_rate": 6.218336822000598e-06, + "loss": 0.0823, + "step": 439 + }, + { + "epoch": 0.44, + "grad_norm": 0.5152907538075204, + "learning_rate": 6.202624781831269e-06, + "loss": 0.0954, + "step": 440 + }, + { + "epoch": 0.44, + "grad_norm": 0.31267507550158435, + "learning_rate": 6.18690012669705e-06, + "loss": 0.0657, + "step": 441 + }, + { + "epoch": 0.44, + "grad_norm": 0.5493800886906555, + "learning_rate": 6.171163021542134e-06, + "loss": 0.1007, + "step": 442 + }, + { + "epoch": 0.44, + "grad_norm": 1.056816520305224, + "learning_rate": 6.155413631441307e-06, + "loss": 0.148, + "step": 443 + }, + { + "epoch": 0.44, + "grad_norm": 0.6045608645180944, + "learning_rate": 6.139652121598219e-06, + "loss": 0.1264, + "step": 444 + }, + { + "epoch": 0.45, + "grad_norm": 0.5505192451579597, + "learning_rate": 6.123878657343648e-06, + "loss": 0.1074, + "step": 445 + }, + { + "epoch": 0.45, + "grad_norm": 0.7797223186850629, + "learning_rate": 6.108093404133772e-06, + "loss": 0.1064, + "step": 446 + }, + { + "epoch": 0.45, + "grad_norm": 0.7391319365614268, + "learning_rate": 6.092296527548427e-06, + "loss": 0.1364, + "step": 447 + }, + { + "epoch": 0.45, + "grad_norm": 0.3580599720523544, + "learning_rate": 6.076488193289375e-06, + "loss": 0.0802, + "step": 448 + }, + { + "epoch": 0.45, + "grad_norm": 0.7448422709657808, + "learning_rate": 6.060668567178561e-06, + "loss": 0.1341, + "step": 449 + }, + { + "epoch": 0.45, + "grad_norm": 0.6370753832372978, + "learning_rate": 6.044837815156377e-06, + "loss": 0.0976, + "step": 450 + }, + { + "epoch": 0.45, + "grad_norm": 0.34856966727165745, + "learning_rate": 6.028996103279918e-06, + "loss": 0.0905, + "step": 451 + }, + { + "epoch": 0.45, + "grad_norm": 0.5308458686033565, + "learning_rate": 6.013143597721252e-06, + "loss": 0.1205, + "step": 452 + }, + { + "epoch": 0.45, + "grad_norm": 0.69040451724238, + "learning_rate": 5.997280464765655e-06, + "loss": 0.1336, + "step": 453 + }, + { + "epoch": 0.45, + "grad_norm": 0.48804773229436027, + "learning_rate": 5.981406870809889e-06, + "loss": 0.1038, + "step": 454 + }, + { + "epoch": 0.46, + "grad_norm": 0.5309662342793907, + "learning_rate": 5.965522982360441e-06, + "loss": 0.1165, + "step": 455 + }, + { + "epoch": 0.46, + "grad_norm": 0.45807565342272794, + "learning_rate": 5.949628966031785e-06, + "loss": 0.1204, + "step": 456 + }, + { + "epoch": 0.46, + "grad_norm": 2.053483997823371, + "learning_rate": 5.933724988544632e-06, + "loss": 0.0992, + "step": 457 + }, + { + "epoch": 0.46, + "grad_norm": 0.628667312080432, + "learning_rate": 5.9178112167241805e-06, + "loss": 0.0693, + "step": 458 + }, + { + "epoch": 0.46, + "grad_norm": 0.3146694384985098, + "learning_rate": 5.9018878174983674e-06, + "loss": 0.0792, + "step": 459 + }, + { + "epoch": 0.46, + "grad_norm": 0.774760113025305, + "learning_rate": 5.885954957896115e-06, + "loss": 0.1119, + "step": 460 + }, + { + "epoch": 0.46, + "grad_norm": 0.7246436390810435, + "learning_rate": 5.87001280504558e-06, + "loss": 0.1165, + "step": 461 + }, + { + "epoch": 0.46, + "grad_norm": 0.8854769979402363, + "learning_rate": 5.854061526172402e-06, + "loss": 0.0715, + "step": 462 + }, + { + "epoch": 0.46, + "grad_norm": 0.6484126267384271, + "learning_rate": 5.838101288597951e-06, + "loss": 0.112, + "step": 463 + }, + { + "epoch": 0.46, + "grad_norm": 0.8365157565396312, + "learning_rate": 5.822132259737565e-06, + "loss": 0.139, + "step": 464 + }, + { + "epoch": 0.47, + "grad_norm": 0.3293273384065748, + "learning_rate": 5.806154607098799e-06, + "loss": 0.0763, + "step": 465 + }, + { + "epoch": 0.47, + "grad_norm": 0.6777090984134903, + "learning_rate": 5.7901684982796716e-06, + "loss": 0.0652, + "step": 466 + }, + { + "epoch": 0.47, + "grad_norm": 0.3589078794021504, + "learning_rate": 5.774174100966899e-06, + "loss": 0.0743, + "step": 467 + }, + { + "epoch": 0.47, + "grad_norm": 0.3797961215560621, + "learning_rate": 5.75817158293414e-06, + "loss": 0.0999, + "step": 468 + }, + { + "epoch": 0.47, + "grad_norm": 0.6213340240753514, + "learning_rate": 5.742161112040237e-06, + "loss": 0.1444, + "step": 469 + }, + { + "epoch": 0.47, + "grad_norm": 0.5452388616304445, + "learning_rate": 5.726142856227453e-06, + "loss": 0.1048, + "step": 470 + }, + { + "epoch": 0.47, + "grad_norm": 0.2808905141821018, + "learning_rate": 5.7101169835197115e-06, + "loss": 0.0678, + "step": 471 + }, + { + "epoch": 0.47, + "grad_norm": 0.3891114334553897, + "learning_rate": 5.694083662020835e-06, + "loss": 0.0782, + "step": 472 + }, + { + "epoch": 0.47, + "grad_norm": 0.8848874048446539, + "learning_rate": 5.678043059912776e-06, + "loss": 0.179, + "step": 473 + }, + { + "epoch": 0.47, + "grad_norm": 0.6590686572962577, + "learning_rate": 5.661995345453867e-06, + "loss": 0.0686, + "step": 474 + }, + { + "epoch": 0.47, + "grad_norm": 0.3629048994912652, + "learning_rate": 5.645940686977033e-06, + "loss": 0.0751, + "step": 475 + }, + { + "epoch": 0.48, + "grad_norm": 0.35568250345038716, + "learning_rate": 5.629879252888046e-06, + "loss": 0.0757, + "step": 476 + }, + { + "epoch": 0.48, + "grad_norm": 0.5521471050077407, + "learning_rate": 5.613811211663751e-06, + "loss": 0.086, + "step": 477 + }, + { + "epoch": 0.48, + "grad_norm": 0.5984229410801276, + "learning_rate": 5.597736731850295e-06, + "loss": 0.1275, + "step": 478 + }, + { + "epoch": 0.48, + "grad_norm": 0.6060209143197848, + "learning_rate": 5.581655982061367e-06, + "loss": 0.1195, + "step": 479 + }, + { + "epoch": 0.48, + "grad_norm": 0.3324520039486348, + "learning_rate": 5.5655691309764225e-06, + "loss": 0.0839, + "step": 480 + }, + { + "epoch": 0.48, + "grad_norm": 0.5127937592468951, + "learning_rate": 5.549476347338915e-06, + "loss": 0.0996, + "step": 481 + }, + { + "epoch": 0.48, + "grad_norm": 0.31282947055083743, + "learning_rate": 5.533377799954532e-06, + "loss": 0.0699, + "step": 482 + }, + { + "epoch": 0.48, + "grad_norm": 0.5965805392065356, + "learning_rate": 5.517273657689419e-06, + "loss": 0.1073, + "step": 483 + }, + { + "epoch": 0.48, + "grad_norm": 0.679873280620491, + "learning_rate": 5.501164089468406e-06, + "loss": 0.1126, + "step": 484 + }, + { + "epoch": 0.48, + "grad_norm": 0.23598676421594147, + "learning_rate": 5.485049264273241e-06, + "loss": 0.0546, + "step": 485 + }, + { + "epoch": 0.49, + "grad_norm": 0.34477926408670995, + "learning_rate": 5.4689293511408155e-06, + "loss": 0.0724, + "step": 486 + }, + { + "epoch": 0.49, + "grad_norm": 0.489492463045434, + "learning_rate": 5.45280451916139e-06, + "loss": 0.0986, + "step": 487 + }, + { + "epoch": 0.49, + "grad_norm": 0.4499802484193189, + "learning_rate": 5.43667493747682e-06, + "loss": 0.1027, + "step": 488 + }, + { + "epoch": 0.49, + "grad_norm": 0.5272285944848393, + "learning_rate": 5.4205407752787884e-06, + "loss": 0.0978, + "step": 489 + }, + { + "epoch": 0.49, + "grad_norm": 0.8062238664270278, + "learning_rate": 5.404402201807022e-06, + "loss": 0.1298, + "step": 490 + }, + { + "epoch": 0.49, + "grad_norm": 0.5706207633505472, + "learning_rate": 5.388259386347518e-06, + "loss": 0.1054, + "step": 491 + }, + { + "epoch": 0.49, + "grad_norm": 1.6137644564369118, + "learning_rate": 5.372112498230771e-06, + "loss": 0.0861, + "step": 492 + }, + { + "epoch": 0.49, + "grad_norm": 0.5348038604556673, + "learning_rate": 5.355961706829997e-06, + "loss": 0.111, + "step": 493 + }, + { + "epoch": 0.49, + "grad_norm": 0.726680269615648, + "learning_rate": 5.339807181559359e-06, + "loss": 0.1462, + "step": 494 + }, + { + "epoch": 0.49, + "grad_norm": 0.8348663322329462, + "learning_rate": 5.323649091872179e-06, + "loss": 0.1371, + "step": 495 + }, + { + "epoch": 0.5, + "grad_norm": 0.7891876906540254, + "learning_rate": 5.307487607259175e-06, + "loss": 0.1428, + "step": 496 + }, + { + "epoch": 0.5, + "grad_norm": 0.9819820558100029, + "learning_rate": 5.291322897246669e-06, + "loss": 0.1564, + "step": 497 + }, + { + "epoch": 0.5, + "grad_norm": 0.6935458072615769, + "learning_rate": 5.275155131394825e-06, + "loss": 0.12, + "step": 498 + }, + { + "epoch": 0.5, + "grad_norm": 0.6661975848806015, + "learning_rate": 5.258984479295853e-06, + "loss": 0.1184, + "step": 499 + }, + { + "epoch": 0.5, + "grad_norm": 0.5423519400422018, + "learning_rate": 5.242811110572243e-06, + "loss": 0.0722, + "step": 500 + }, + { + "epoch": 0.5, + "grad_norm": 0.7463419132209343, + "learning_rate": 5.226635194874978e-06, + "loss": 0.0835, + "step": 501 + }, + { + "epoch": 0.5, + "grad_norm": 1.3914366947605155, + "learning_rate": 5.210456901881761e-06, + "loss": 0.1508, + "step": 502 + }, + { + "epoch": 0.5, + "grad_norm": 0.4734523496979647, + "learning_rate": 5.194276401295231e-06, + "loss": 0.0897, + "step": 503 + }, + { + "epoch": 0.5, + "grad_norm": 0.6083711854607663, + "learning_rate": 5.1780938628411795e-06, + "loss": 0.0839, + "step": 504 + }, + { + "epoch": 0.51, + "grad_norm": 0.4548664811607439, + "learning_rate": 5.161909456266781e-06, + "loss": 0.0977, + "step": 505 + }, + { + "epoch": 0.51, + "grad_norm": 0.5423324275876441, + "learning_rate": 5.145723351338799e-06, + "loss": 0.0907, + "step": 506 + }, + { + "epoch": 0.51, + "grad_norm": 0.8035750104285685, + "learning_rate": 5.129535717841818e-06, + "loss": 0.1195, + "step": 507 + }, + { + "epoch": 0.51, + "grad_norm": 0.40132851504371736, + "learning_rate": 5.11334672557645e-06, + "loss": 0.1208, + "step": 508 + }, + { + "epoch": 0.51, + "grad_norm": 0.6114501978559828, + "learning_rate": 5.097156544357567e-06, + "loss": 0.1301, + "step": 509 + }, + { + "epoch": 0.51, + "grad_norm": 0.49583161784800067, + "learning_rate": 5.080965344012509e-06, + "loss": 0.1156, + "step": 510 + }, + { + "epoch": 0.51, + "grad_norm": 0.36787842036529156, + "learning_rate": 5.064773294379302e-06, + "loss": 0.0778, + "step": 511 + }, + { + "epoch": 0.51, + "grad_norm": 0.3817167517079446, + "learning_rate": 5.048580565304887e-06, + "loss": 0.1222, + "step": 512 + }, + { + "epoch": 0.51, + "grad_norm": 0.22305406444966328, + "learning_rate": 5.032387326643331e-06, + "loss": 0.0432, + "step": 513 + }, + { + "epoch": 0.51, + "grad_norm": 0.3409084631457501, + "learning_rate": 5.016193748254045e-06, + "loss": 0.1031, + "step": 514 + }, + { + "epoch": 0.52, + "grad_norm": 0.29297434360276947, + "learning_rate": 5e-06, + "loss": 0.0648, + "step": 515 + }, + { + "epoch": 0.52, + "grad_norm": 0.326977097196938, + "learning_rate": 4.983806251745958e-06, + "loss": 0.0799, + "step": 516 + }, + { + "epoch": 0.52, + "grad_norm": 0.5782706967667893, + "learning_rate": 4.9676126733566705e-06, + "loss": 0.1153, + "step": 517 + }, + { + "epoch": 0.52, + "grad_norm": 0.6163249310474644, + "learning_rate": 4.951419434695115e-06, + "loss": 0.1516, + "step": 518 + }, + { + "epoch": 0.52, + "grad_norm": 0.35774650461873464, + "learning_rate": 4.935226705620699e-06, + "loss": 0.1215, + "step": 519 + }, + { + "epoch": 0.52, + "grad_norm": 0.38764313060720784, + "learning_rate": 4.919034655987493e-06, + "loss": 0.0817, + "step": 520 + }, + { + "epoch": 0.52, + "grad_norm": 0.3948929977017619, + "learning_rate": 4.9028434556424335e-06, + "loss": 0.1035, + "step": 521 + }, + { + "epoch": 0.52, + "grad_norm": 0.6165264593739969, + "learning_rate": 4.886653274423551e-06, + "loss": 0.1222, + "step": 522 + }, + { + "epoch": 0.52, + "grad_norm": 0.6155385207318133, + "learning_rate": 4.870464282158184e-06, + "loss": 0.1023, + "step": 523 + }, + { + "epoch": 0.52, + "grad_norm": 0.30092895692862326, + "learning_rate": 4.8542766486612035e-06, + "loss": 0.08, + "step": 524 + }, + { + "epoch": 0.53, + "grad_norm": 0.7494847442096868, + "learning_rate": 4.838090543733222e-06, + "loss": 0.1681, + "step": 525 + }, + { + "epoch": 0.53, + "grad_norm": 0.3984866922106728, + "learning_rate": 4.821906137158822e-06, + "loss": 0.131, + "step": 526 + }, + { + "epoch": 0.53, + "grad_norm": 0.5422271752011945, + "learning_rate": 4.805723598704772e-06, + "loss": 0.0787, + "step": 527 + }, + { + "epoch": 0.53, + "grad_norm": 0.5069148743800129, + "learning_rate": 4.7895430981182415e-06, + "loss": 0.0845, + "step": 528 + }, + { + "epoch": 0.53, + "grad_norm": 0.325494130964241, + "learning_rate": 4.773364805125025e-06, + "loss": 0.0803, + "step": 529 + }, + { + "epoch": 0.53, + "grad_norm": 0.271987833994552, + "learning_rate": 4.757188889427761e-06, + "loss": 0.0659, + "step": 530 + }, + { + "epoch": 0.53, + "grad_norm": 0.5398440656890574, + "learning_rate": 4.741015520704148e-06, + "loss": 0.1236, + "step": 531 + }, + { + "epoch": 0.53, + "grad_norm": 0.41418520065692305, + "learning_rate": 4.724844868605176e-06, + "loss": 0.0673, + "step": 532 + }, + { + "epoch": 0.53, + "grad_norm": 0.43921876895037953, + "learning_rate": 4.708677102753331e-06, + "loss": 0.12, + "step": 533 + }, + { + "epoch": 0.53, + "grad_norm": 0.3679446962756676, + "learning_rate": 4.6925123927408265e-06, + "loss": 0.0894, + "step": 534 + }, + { + "epoch": 0.54, + "grad_norm": 0.40484965677896917, + "learning_rate": 4.6763509081278215e-06, + "loss": 0.0787, + "step": 535 + }, + { + "epoch": 0.54, + "grad_norm": 0.3540453646142033, + "learning_rate": 4.660192818440642e-06, + "loss": 0.0601, + "step": 536 + }, + { + "epoch": 0.54, + "grad_norm": 0.4955006571394118, + "learning_rate": 4.644038293170003e-06, + "loss": 0.1223, + "step": 537 + }, + { + "epoch": 0.54, + "grad_norm": 0.3990714576406284, + "learning_rate": 4.627887501769231e-06, + "loss": 0.0932, + "step": 538 + }, + { + "epoch": 0.54, + "grad_norm": 0.3389236780913822, + "learning_rate": 4.611740613652485e-06, + "loss": 0.0603, + "step": 539 + }, + { + "epoch": 0.54, + "grad_norm": 0.4683123239462747, + "learning_rate": 4.59559779819298e-06, + "loss": 0.1154, + "step": 540 + }, + { + "epoch": 0.54, + "grad_norm": 0.5513301800384146, + "learning_rate": 4.579459224721212e-06, + "loss": 0.0788, + "step": 541 + }, + { + "epoch": 0.54, + "grad_norm": 0.6503069347315288, + "learning_rate": 4.5633250625231806e-06, + "loss": 0.0957, + "step": 542 + }, + { + "epoch": 0.54, + "grad_norm": 0.866742745789913, + "learning_rate": 4.547195480838612e-06, + "loss": 0.0933, + "step": 543 + }, + { + "epoch": 0.54, + "grad_norm": 0.693403208304385, + "learning_rate": 4.531070648859186e-06, + "loss": 0.1292, + "step": 544 + }, + { + "epoch": 0.55, + "grad_norm": 0.3536703836603506, + "learning_rate": 4.51495073572676e-06, + "loss": 0.0801, + "step": 545 + }, + { + "epoch": 0.55, + "grad_norm": 0.4008054007992528, + "learning_rate": 4.498835910531595e-06, + "loss": 0.0848, + "step": 546 + }, + { + "epoch": 0.55, + "grad_norm": 0.47284322872739853, + "learning_rate": 4.482726342310582e-06, + "loss": 0.06, + "step": 547 + }, + { + "epoch": 0.55, + "grad_norm": 0.4175256051238674, + "learning_rate": 4.4666222000454685e-06, + "loss": 0.1091, + "step": 548 + }, + { + "epoch": 0.55, + "grad_norm": 0.4586154908814644, + "learning_rate": 4.450523652661086e-06, + "loss": 0.0782, + "step": 549 + }, + { + "epoch": 0.55, + "grad_norm": 0.9556460754355709, + "learning_rate": 4.434430869023579e-06, + "loss": 0.1668, + "step": 550 + }, + { + "epoch": 0.55, + "grad_norm": 0.3707445337609915, + "learning_rate": 4.418344017938634e-06, + "loss": 0.0892, + "step": 551 + }, + { + "epoch": 0.55, + "grad_norm": 0.4089291338259481, + "learning_rate": 4.402263268149707e-06, + "loss": 0.083, + "step": 552 + }, + { + "epoch": 0.55, + "grad_norm": 0.4465819266048553, + "learning_rate": 4.386188788336251e-06, + "loss": 0.0772, + "step": 553 + }, + { + "epoch": 0.55, + "grad_norm": 0.534441190046031, + "learning_rate": 4.370120747111956e-06, + "loss": 0.1126, + "step": 554 + }, + { + "epoch": 0.56, + "grad_norm": 0.3109373606701782, + "learning_rate": 4.3540593130229695e-06, + "loss": 0.081, + "step": 555 + }, + { + "epoch": 0.56, + "grad_norm": 0.3506086444658416, + "learning_rate": 4.338004654546136e-06, + "loss": 0.0702, + "step": 556 + }, + { + "epoch": 0.56, + "grad_norm": 0.4009813003806628, + "learning_rate": 4.3219569400872244e-06, + "loss": 0.1124, + "step": 557 + }, + { + "epoch": 0.56, + "grad_norm": 0.3757141177945834, + "learning_rate": 4.3059163379791676e-06, + "loss": 0.0705, + "step": 558 + }, + { + "epoch": 0.56, + "grad_norm": 0.40190096513745965, + "learning_rate": 4.289883016480291e-06, + "loss": 0.0875, + "step": 559 + }, + { + "epoch": 0.56, + "grad_norm": 0.5700004266893263, + "learning_rate": 4.27385714377255e-06, + "loss": 0.1128, + "step": 560 + }, + { + "epoch": 0.56, + "grad_norm": 0.8271321860711406, + "learning_rate": 4.257838887959764e-06, + "loss": 0.2003, + "step": 561 + }, + { + "epoch": 0.56, + "grad_norm": 0.5976969809739466, + "learning_rate": 4.24182841706586e-06, + "loss": 0.0856, + "step": 562 + }, + { + "epoch": 0.56, + "grad_norm": 0.44157772328502304, + "learning_rate": 4.2258258990331015e-06, + "loss": 0.0876, + "step": 563 + }, + { + "epoch": 0.56, + "grad_norm": 0.4450141985659714, + "learning_rate": 4.209831501720328e-06, + "loss": 0.0731, + "step": 564 + }, + { + "epoch": 0.56, + "grad_norm": 0.6582841589746702, + "learning_rate": 4.1938453929012014e-06, + "loss": 0.1316, + "step": 565 + }, + { + "epoch": 0.57, + "grad_norm": 0.3533324598426827, + "learning_rate": 4.177867740262437e-06, + "loss": 0.0644, + "step": 566 + }, + { + "epoch": 0.57, + "grad_norm": 0.3307438638772335, + "learning_rate": 4.16189871140205e-06, + "loss": 0.0681, + "step": 567 + }, + { + "epoch": 0.57, + "grad_norm": 0.620825674856218, + "learning_rate": 4.145938473827598e-06, + "loss": 0.1064, + "step": 568 + }, + { + "epoch": 0.57, + "grad_norm": 0.43238197956559593, + "learning_rate": 4.129987194954421e-06, + "loss": 0.078, + "step": 569 + }, + { + "epoch": 0.57, + "grad_norm": 0.7632901889924528, + "learning_rate": 4.1140450421038865e-06, + "loss": 0.1456, + "step": 570 + }, + { + "epoch": 0.57, + "grad_norm": 0.5004370967842584, + "learning_rate": 4.098112182501633e-06, + "loss": 0.0879, + "step": 571 + }, + { + "epoch": 0.57, + "grad_norm": 0.7096444469082916, + "learning_rate": 4.08218878327582e-06, + "loss": 0.1555, + "step": 572 + }, + { + "epoch": 0.57, + "grad_norm": 0.5698177130717139, + "learning_rate": 4.066275011455369e-06, + "loss": 0.126, + "step": 573 + }, + { + "epoch": 0.57, + "grad_norm": 0.5760084319378798, + "learning_rate": 4.050371033968216e-06, + "loss": 0.0682, + "step": 574 + }, + { + "epoch": 0.57, + "grad_norm": 0.8730902060488447, + "learning_rate": 4.034477017639561e-06, + "loss": 0.1257, + "step": 575 + }, + { + "epoch": 0.58, + "grad_norm": 0.8184188635015923, + "learning_rate": 4.018593129190113e-06, + "loss": 0.1016, + "step": 576 + }, + { + "epoch": 0.58, + "grad_norm": 0.6838885473379398, + "learning_rate": 4.002719535234346e-06, + "loss": 0.0851, + "step": 577 + }, + { + "epoch": 0.58, + "grad_norm": 0.6176165420319377, + "learning_rate": 3.98685640227875e-06, + "loss": 0.068, + "step": 578 + }, + { + "epoch": 0.58, + "grad_norm": 0.41160404509320364, + "learning_rate": 3.9710038967200825e-06, + "loss": 0.1078, + "step": 579 + }, + { + "epoch": 0.58, + "grad_norm": 0.7288198845829278, + "learning_rate": 3.955162184843625e-06, + "loss": 0.1327, + "step": 580 + }, + { + "epoch": 0.58, + "grad_norm": 0.41488840926339043, + "learning_rate": 3.93933143282144e-06, + "loss": 0.1052, + "step": 581 + }, + { + "epoch": 0.58, + "grad_norm": 0.42115462528877484, + "learning_rate": 3.9235118067106255e-06, + "loss": 0.0838, + "step": 582 + }, + { + "epoch": 0.58, + "grad_norm": 0.5249101217958358, + "learning_rate": 3.907703472451574e-06, + "loss": 0.0598, + "step": 583 + }, + { + "epoch": 0.58, + "grad_norm": 0.5453484651752785, + "learning_rate": 3.89190659586623e-06, + "loss": 0.1203, + "step": 584 + }, + { + "epoch": 0.58, + "grad_norm": 0.41905508315746437, + "learning_rate": 3.8761213426563546e-06, + "loss": 0.0749, + "step": 585 + }, + { + "epoch": 0.59, + "grad_norm": 0.7890185726165332, + "learning_rate": 3.8603478784017845e-06, + "loss": 0.1391, + "step": 586 + }, + { + "epoch": 0.59, + "grad_norm": 0.7715916565540125, + "learning_rate": 3.8445863685586946e-06, + "loss": 0.1475, + "step": 587 + }, + { + "epoch": 0.59, + "grad_norm": 0.5767704289056564, + "learning_rate": 3.828836978457868e-06, + "loss": 0.093, + "step": 588 + }, + { + "epoch": 0.59, + "grad_norm": 0.5642051557086234, + "learning_rate": 3.8130998733029517e-06, + "loss": 0.0927, + "step": 589 + }, + { + "epoch": 0.59, + "grad_norm": 0.8153865148499294, + "learning_rate": 3.7973752181687336e-06, + "loss": 0.132, + "step": 590 + }, + { + "epoch": 0.59, + "grad_norm": 0.5126391487727465, + "learning_rate": 3.7816631779994018e-06, + "loss": 0.0971, + "step": 591 + }, + { + "epoch": 0.59, + "grad_norm": 0.33503983096330675, + "learning_rate": 3.7659639176068287e-06, + "loss": 0.043, + "step": 592 + }, + { + "epoch": 0.59, + "grad_norm": 0.7429372469335324, + "learning_rate": 3.7502776016688234e-06, + "loss": 0.1171, + "step": 593 + }, + { + "epoch": 0.59, + "grad_norm": 0.8137940244667058, + "learning_rate": 3.734604394727419e-06, + "loss": 0.0901, + "step": 594 + }, + { + "epoch": 0.59, + "grad_norm": 0.6816129739354029, + "learning_rate": 3.7189444611871383e-06, + "loss": 0.1312, + "step": 595 + }, + { + "epoch": 0.6, + "grad_norm": 0.3915209329433682, + "learning_rate": 3.703297965313275e-06, + "loss": 0.0829, + "step": 596 + }, + { + "epoch": 0.6, + "grad_norm": 0.6138621407262453, + "learning_rate": 3.6876650712301654e-06, + "loss": 0.1041, + "step": 597 + }, + { + "epoch": 0.6, + "grad_norm": 0.8133074874470426, + "learning_rate": 3.6720459429194743e-06, + "loss": 0.1473, + "step": 598 + }, + { + "epoch": 0.6, + "grad_norm": 0.5357114819689096, + "learning_rate": 3.656440744218464e-06, + "loss": 0.1138, + "step": 599 + }, + { + "epoch": 0.6, + "grad_norm": 0.48803614493782865, + "learning_rate": 3.6408496388182857e-06, + "loss": 0.0743, + "step": 600 + }, + { + "epoch": 0.6, + "grad_norm": 0.7808333607409286, + "learning_rate": 3.6252727902622575e-06, + "loss": 0.1627, + "step": 601 + }, + { + "epoch": 0.6, + "grad_norm": 0.8907522302041608, + "learning_rate": 3.6097103619441505e-06, + "loss": 0.1136, + "step": 602 + }, + { + "epoch": 0.6, + "grad_norm": 0.6361251500086073, + "learning_rate": 3.594162517106472e-06, + "loss": 0.1272, + "step": 603 + }, + { + "epoch": 0.6, + "grad_norm": 0.8430236560862056, + "learning_rate": 3.578629418838757e-06, + "loss": 0.1464, + "step": 604 + }, + { + "epoch": 0.6, + "grad_norm": 0.9563954698231266, + "learning_rate": 3.5631112300758595e-06, + "loss": 0.126, + "step": 605 + }, + { + "epoch": 0.61, + "grad_norm": 0.5546794613455583, + "learning_rate": 3.5476081135962335e-06, + "loss": 0.0922, + "step": 606 + }, + { + "epoch": 0.61, + "grad_norm": 0.6877638381128309, + "learning_rate": 3.532120232020236e-06, + "loss": 0.0916, + "step": 607 + }, + { + "epoch": 0.61, + "grad_norm": 0.5036152867163292, + "learning_rate": 3.516647747808417e-06, + "loss": 0.1354, + "step": 608 + }, + { + "epoch": 0.61, + "grad_norm": 0.33842916353091324, + "learning_rate": 3.5011908232598124e-06, + "loss": 0.0646, + "step": 609 + }, + { + "epoch": 0.61, + "grad_norm": 0.9320829595951963, + "learning_rate": 3.4857496205102475e-06, + "loss": 0.1311, + "step": 610 + }, + { + "epoch": 0.61, + "grad_norm": 0.8024408164610765, + "learning_rate": 3.4703243015306314e-06, + "loss": 0.1109, + "step": 611 + }, + { + "epoch": 0.61, + "grad_norm": 0.5114684978236648, + "learning_rate": 3.4549150281252635e-06, + "loss": 0.0941, + "step": 612 + }, + { + "epoch": 0.61, + "grad_norm": 0.4830183987334865, + "learning_rate": 3.4395219619301288e-06, + "loss": 0.0973, + "step": 613 + }, + { + "epoch": 0.61, + "grad_norm": 0.7966028580680918, + "learning_rate": 3.4241452644112085e-06, + "loss": 0.116, + "step": 614 + }, + { + "epoch": 0.61, + "grad_norm": 0.7095415092901032, + "learning_rate": 3.4087850968627823e-06, + "loss": 0.1453, + "step": 615 + }, + { + "epoch": 0.62, + "grad_norm": 0.7133196430221032, + "learning_rate": 3.3934416204057396e-06, + "loss": 0.1217, + "step": 616 + }, + { + "epoch": 0.62, + "grad_norm": 0.42557164050608987, + "learning_rate": 3.3781149959858894e-06, + "loss": 0.0838, + "step": 617 + }, + { + "epoch": 0.62, + "grad_norm": 0.5936729599711326, + "learning_rate": 3.3628053843722674e-06, + "loss": 0.1305, + "step": 618 + }, + { + "epoch": 0.62, + "grad_norm": 0.6447716869921588, + "learning_rate": 3.3475129461554567e-06, + "loss": 0.1263, + "step": 619 + }, + { + "epoch": 0.62, + "grad_norm": 0.5007167642691519, + "learning_rate": 3.3322378417458985e-06, + "loss": 0.0926, + "step": 620 + }, + { + "epoch": 0.62, + "grad_norm": 0.587047058888338, + "learning_rate": 3.3169802313722076e-06, + "loss": 0.111, + "step": 621 + }, + { + "epoch": 0.62, + "grad_norm": 0.8108969289508441, + "learning_rate": 3.3017402750794976e-06, + "loss": 0.1494, + "step": 622 + }, + { + "epoch": 0.62, + "grad_norm": 0.3957587554261609, + "learning_rate": 3.2865181327277007e-06, + "loss": 0.0785, + "step": 623 + }, + { + "epoch": 0.62, + "grad_norm": 0.5710083551683404, + "learning_rate": 3.271313963989886e-06, + "loss": 0.1493, + "step": 624 + }, + { + "epoch": 0.62, + "grad_norm": 0.388057071644006, + "learning_rate": 3.2561279283505888e-06, + "loss": 0.0846, + "step": 625 + }, + { + "epoch": 0.63, + "grad_norm": 0.9746132297403017, + "learning_rate": 3.240960185104137e-06, + "loss": 0.1204, + "step": 626 + }, + { + "epoch": 0.63, + "grad_norm": 0.4718093731794653, + "learning_rate": 3.2258108933529808e-06, + "loss": 0.1121, + "step": 627 + }, + { + "epoch": 0.63, + "grad_norm": 0.5259558631591478, + "learning_rate": 3.2106802120060197e-06, + "loss": 0.0992, + "step": 628 + }, + { + "epoch": 0.63, + "grad_norm": 0.4221936820795589, + "learning_rate": 3.195568299776945e-06, + "loss": 0.0966, + "step": 629 + }, + { + "epoch": 0.63, + "grad_norm": 0.7976862815632301, + "learning_rate": 3.180475315182563e-06, + "loss": 0.122, + "step": 630 + }, + { + "epoch": 0.63, + "grad_norm": 0.7384611149356484, + "learning_rate": 3.16540141654114e-06, + "loss": 0.0992, + "step": 631 + }, + { + "epoch": 0.63, + "grad_norm": 0.6931742137365803, + "learning_rate": 3.1503467619707407e-06, + "loss": 0.128, + "step": 632 + }, + { + "epoch": 0.63, + "grad_norm": 0.5620189111954943, + "learning_rate": 3.1353115093875676e-06, + "loss": 0.0722, + "step": 633 + }, + { + "epoch": 0.63, + "grad_norm": 0.37304979034092023, + "learning_rate": 3.1202958165043053e-06, + "loss": 0.0743, + "step": 634 + }, + { + "epoch": 0.64, + "grad_norm": 0.7052205794973762, + "learning_rate": 3.1052998408284664e-06, + "loss": 0.1205, + "step": 635 + }, + { + "epoch": 0.64, + "grad_norm": 0.5027489633022588, + "learning_rate": 3.090323739660742e-06, + "loss": 0.0826, + "step": 636 + }, + { + "epoch": 0.64, + "grad_norm": 0.46204912764624495, + "learning_rate": 3.0753676700933448e-06, + "loss": 0.1099, + "step": 637 + }, + { + "epoch": 0.64, + "grad_norm": 0.45300681594685277, + "learning_rate": 3.060431789008368e-06, + "loss": 0.0837, + "step": 638 + }, + { + "epoch": 0.64, + "grad_norm": 0.47759829311418994, + "learning_rate": 3.045516253076137e-06, + "loss": 0.1125, + "step": 639 + }, + { + "epoch": 0.64, + "grad_norm": 0.33787287232114194, + "learning_rate": 3.0306212187535653e-06, + "loss": 0.0608, + "step": 640 + }, + { + "epoch": 0.64, + "grad_norm": 0.46425912363765914, + "learning_rate": 3.0157468422825148e-06, + "loss": 0.0816, + "step": 641 + }, + { + "epoch": 0.64, + "grad_norm": 1.4900976922127227, + "learning_rate": 3.000893279688155e-06, + "loss": 0.1401, + "step": 642 + }, + { + "epoch": 0.64, + "grad_norm": 0.8104387688541158, + "learning_rate": 2.9860606867773323e-06, + "loss": 0.1081, + "step": 643 + }, + { + "epoch": 0.64, + "grad_norm": 0.3535704802647087, + "learning_rate": 2.9712492191369245e-06, + "loss": 0.065, + "step": 644 + }, + { + "epoch": 0.65, + "grad_norm": 1.9991322797628877, + "learning_rate": 2.9564590321322206e-06, + "loss": 0.0665, + "step": 645 + }, + { + "epoch": 0.65, + "grad_norm": 1.0422825768722481, + "learning_rate": 2.9416902809052817e-06, + "loss": 0.1375, + "step": 646 + }, + { + "epoch": 0.65, + "grad_norm": 0.5299253822155083, + "learning_rate": 2.9269431203733213e-06, + "loss": 0.0961, + "step": 647 + }, + { + "epoch": 0.65, + "grad_norm": 0.4834405310049794, + "learning_rate": 2.912217705227075e-06, + "loss": 0.0769, + "step": 648 + }, + { + "epoch": 0.65, + "grad_norm": 0.8731749363014457, + "learning_rate": 2.8975141899291777e-06, + "loss": 0.1258, + "step": 649 + }, + { + "epoch": 0.65, + "grad_norm": 0.537391607246855, + "learning_rate": 2.882832728712551e-06, + "loss": 0.0982, + "step": 650 + }, + { + "epoch": 0.65, + "grad_norm": 0.5282659039608084, + "learning_rate": 2.868173475578772e-06, + "loss": 0.1295, + "step": 651 + }, + { + "epoch": 0.65, + "grad_norm": 1.0433353406984989, + "learning_rate": 2.8535365842964713e-06, + "loss": 0.1705, + "step": 652 + }, + { + "epoch": 0.65, + "grad_norm": 0.7453860522113966, + "learning_rate": 2.838922208399712e-06, + "loss": 0.1035, + "step": 653 + }, + { + "epoch": 0.65, + "grad_norm": 0.6212394757546471, + "learning_rate": 2.8243305011863843e-06, + "loss": 0.1055, + "step": 654 + }, + { + "epoch": 0.66, + "grad_norm": 0.5694658461876471, + "learning_rate": 2.8097616157165886e-06, + "loss": 0.1018, + "step": 655 + }, + { + "epoch": 0.66, + "grad_norm": 0.32345558779100947, + "learning_rate": 2.7952157048110406e-06, + "loss": 0.0489, + "step": 656 + }, + { + "epoch": 0.66, + "grad_norm": 0.5575135094907753, + "learning_rate": 2.780692921049465e-06, + "loss": 0.1246, + "step": 657 + }, + { + "epoch": 0.66, + "grad_norm": 0.38517134000721004, + "learning_rate": 2.7661934167689887e-06, + "loss": 0.0742, + "step": 658 + }, + { + "epoch": 0.66, + "grad_norm": 0.6099408082762448, + "learning_rate": 2.751717344062552e-06, + "loss": 0.092, + "step": 659 + }, + { + "epoch": 0.66, + "grad_norm": 0.6631133502618448, + "learning_rate": 2.7372648547773063e-06, + "loss": 0.0935, + "step": 660 + }, + { + "epoch": 0.66, + "grad_norm": 0.7921567001033962, + "learning_rate": 2.722836100513027e-06, + "loss": 0.1173, + "step": 661 + }, + { + "epoch": 0.66, + "grad_norm": 0.3708726071175215, + "learning_rate": 2.7084312326205164e-06, + "loss": 0.0586, + "step": 662 + }, + { + "epoch": 0.66, + "grad_norm": 0.8014839693489464, + "learning_rate": 2.6940504022000248e-06, + "loss": 0.1147, + "step": 663 + }, + { + "epoch": 0.66, + "grad_norm": 0.3666430283803144, + "learning_rate": 2.6796937600996587e-06, + "loss": 0.0961, + "step": 664 + }, + { + "epoch": 0.67, + "grad_norm": 0.2623498823649957, + "learning_rate": 2.665361456913797e-06, + "loss": 0.0542, + "step": 665 + }, + { + "epoch": 0.67, + "grad_norm": 0.44296477230391784, + "learning_rate": 2.6510536429815224e-06, + "loss": 0.0861, + "step": 666 + }, + { + "epoch": 0.67, + "grad_norm": 0.8430283865858859, + "learning_rate": 2.6367704683850293e-06, + "loss": 0.1427, + "step": 667 + }, + { + "epoch": 0.67, + "grad_norm": 0.5750145406212099, + "learning_rate": 2.622512082948063e-06, + "loss": 0.1356, + "step": 668 + }, + { + "epoch": 0.67, + "grad_norm": 0.5605681913755481, + "learning_rate": 2.6082786362343377e-06, + "loss": 0.0903, + "step": 669 + }, + { + "epoch": 0.67, + "grad_norm": 0.5671797648361183, + "learning_rate": 2.594070277545975e-06, + "loss": 0.0829, + "step": 670 + }, + { + "epoch": 0.67, + "grad_norm": 0.453226327889582, + "learning_rate": 2.5798871559219362e-06, + "loss": 0.0805, + "step": 671 + }, + { + "epoch": 0.67, + "grad_norm": 0.6601329345109225, + "learning_rate": 2.5657294201364526e-06, + "loss": 0.0955, + "step": 672 + }, + { + "epoch": 0.67, + "grad_norm": 0.4707117892944517, + "learning_rate": 2.551597218697476e-06, + "loss": 0.1027, + "step": 673 + }, + { + "epoch": 0.67, + "grad_norm": 0.5998457328358333, + "learning_rate": 2.5374906998451094e-06, + "loss": 0.1241, + "step": 674 + }, + { + "epoch": 0.68, + "grad_norm": 0.6802417736597872, + "learning_rate": 2.5234100115500643e-06, + "loss": 0.1153, + "step": 675 + }, + { + "epoch": 0.68, + "grad_norm": 0.4283427944295628, + "learning_rate": 2.5093553015120937e-06, + "loss": 0.0893, + "step": 676 + }, + { + "epoch": 0.68, + "grad_norm": 0.8127336103022881, + "learning_rate": 2.4953267171584573e-06, + "loss": 0.1255, + "step": 677 + }, + { + "epoch": 0.68, + "grad_norm": 0.45310730028913915, + "learning_rate": 2.4813244056423692e-06, + "loss": 0.1255, + "step": 678 + }, + { + "epoch": 0.68, + "grad_norm": 0.42178118278585036, + "learning_rate": 2.467348513841447e-06, + "loss": 0.0759, + "step": 679 + }, + { + "epoch": 0.68, + "grad_norm": 0.38756636994650595, + "learning_rate": 2.4533991883561868e-06, + "loss": 0.0895, + "step": 680 + }, + { + "epoch": 0.68, + "grad_norm": 0.47015869594707566, + "learning_rate": 2.439476575508408e-06, + "loss": 0.0948, + "step": 681 + }, + { + "epoch": 0.68, + "grad_norm": 0.7040022486529203, + "learning_rate": 2.425580821339733e-06, + "loss": 0.1352, + "step": 682 + }, + { + "epoch": 0.68, + "grad_norm": 0.6922280792965334, + "learning_rate": 2.4117120716100484e-06, + "loss": 0.1328, + "step": 683 + }, + { + "epoch": 0.68, + "grad_norm": 0.5427720510108438, + "learning_rate": 2.3978704717959777e-06, + "loss": 0.1086, + "step": 684 + }, + { + "epoch": 0.69, + "grad_norm": 0.2945662139998699, + "learning_rate": 2.38405616708935e-06, + "loss": 0.0737, + "step": 685 + }, + { + "epoch": 0.69, + "grad_norm": 0.3933990917790085, + "learning_rate": 2.3702693023956853e-06, + "loss": 0.103, + "step": 686 + }, + { + "epoch": 0.69, + "grad_norm": 0.49956299931638654, + "learning_rate": 2.356510022332674e-06, + "loss": 0.09, + "step": 687 + }, + { + "epoch": 0.69, + "grad_norm": 0.7508054308369841, + "learning_rate": 2.342778471228648e-06, + "loss": 0.1146, + "step": 688 + }, + { + "epoch": 0.69, + "grad_norm": 0.5271553950947587, + "learning_rate": 2.329074793121085e-06, + "loss": 0.0895, + "step": 689 + }, + { + "epoch": 0.69, + "grad_norm": 0.7965928687713791, + "learning_rate": 2.315399131755081e-06, + "loss": 0.1128, + "step": 690 + }, + { + "epoch": 0.69, + "grad_norm": 0.49009664769117406, + "learning_rate": 2.301751630581855e-06, + "loss": 0.1134, + "step": 691 + }, + { + "epoch": 0.69, + "grad_norm": 0.49315966608767386, + "learning_rate": 2.2881324327572336e-06, + "loss": 0.0987, + "step": 692 + }, + { + "epoch": 0.69, + "grad_norm": 0.5191502870797712, + "learning_rate": 2.274541681140159e-06, + "loss": 0.0859, + "step": 693 + }, + { + "epoch": 0.69, + "grad_norm": 0.5078356213753658, + "learning_rate": 2.260979518291186e-06, + "loss": 0.1592, + "step": 694 + }, + { + "epoch": 0.69, + "grad_norm": 0.5207191641837007, + "learning_rate": 2.2474460864709825e-06, + "loss": 0.1523, + "step": 695 + }, + { + "epoch": 0.7, + "grad_norm": 0.6670319462170284, + "learning_rate": 2.233941527638848e-06, + "loss": 0.123, + "step": 696 + }, + { + "epoch": 0.7, + "grad_norm": 0.40816819748101985, + "learning_rate": 2.2204659834512095e-06, + "loss": 0.0849, + "step": 697 + }, + { + "epoch": 0.7, + "grad_norm": 0.5867536893857979, + "learning_rate": 2.207019595260154e-06, + "loss": 0.0888, + "step": 698 + }, + { + "epoch": 0.7, + "grad_norm": 0.4389856821033172, + "learning_rate": 2.1936025041119268e-06, + "loss": 0.0412, + "step": 699 + }, + { + "epoch": 0.7, + "grad_norm": 0.7263951215397981, + "learning_rate": 2.1802148507454675e-06, + "loss": 0.0867, + "step": 700 + }, + { + "epoch": 0.7, + "grad_norm": 0.5984255990773606, + "learning_rate": 2.1668567755909257e-06, + "loss": 0.0962, + "step": 701 + }, + { + "epoch": 0.7, + "grad_norm": 0.36921996516220956, + "learning_rate": 2.1535284187681866e-06, + "loss": 0.045, + "step": 702 + }, + { + "epoch": 0.7, + "grad_norm": 0.5018032525251166, + "learning_rate": 2.140229920085409e-06, + "loss": 0.0922, + "step": 703 + }, + { + "epoch": 0.7, + "grad_norm": 0.48277919928941554, + "learning_rate": 2.1269614190375477e-06, + "loss": 0.1087, + "step": 704 + }, + { + "epoch": 0.7, + "grad_norm": 0.405947855722002, + "learning_rate": 2.1137230548049042e-06, + "loss": 0.0759, + "step": 705 + }, + { + "epoch": 0.71, + "grad_norm": 0.4198312145346275, + "learning_rate": 2.1005149662516517e-06, + "loss": 0.0861, + "step": 706 + }, + { + "epoch": 0.71, + "grad_norm": 0.7563228799716432, + "learning_rate": 2.08733729192439e-06, + "loss": 0.0957, + "step": 707 + }, + { + "epoch": 0.71, + "grad_norm": 0.38788962473556865, + "learning_rate": 2.07419017005069e-06, + "loss": 0.0732, + "step": 708 + }, + { + "epoch": 0.71, + "grad_norm": 0.5149575322411407, + "learning_rate": 2.061073738537635e-06, + "loss": 0.0762, + "step": 709 + }, + { + "epoch": 0.71, + "grad_norm": 0.3955849378216403, + "learning_rate": 2.0479881349703885e-06, + "loss": 0.0821, + "step": 710 + }, + { + "epoch": 0.71, + "grad_norm": 0.7497119030249353, + "learning_rate": 2.0349334966107363e-06, + "loss": 0.1142, + "step": 711 + }, + { + "epoch": 0.71, + "grad_norm": 0.46099566896848265, + "learning_rate": 2.021909960395661e-06, + "loss": 0.0767, + "step": 712 + }, + { + "epoch": 0.71, + "grad_norm": 0.782998484314079, + "learning_rate": 2.0089176629358904e-06, + "loss": 0.1022, + "step": 713 + }, + { + "epoch": 0.71, + "grad_norm": 0.5358045007152038, + "learning_rate": 1.9959567405144825e-06, + "loss": 0.111, + "step": 714 + }, + { + "epoch": 0.71, + "grad_norm": 0.48557075109442327, + "learning_rate": 1.983027329085377e-06, + "loss": 0.1026, + "step": 715 + }, + { + "epoch": 0.72, + "grad_norm": 0.5586759145776173, + "learning_rate": 1.9701295642719836e-06, + "loss": 0.1102, + "step": 716 + }, + { + "epoch": 0.72, + "grad_norm": 0.44842535485767154, + "learning_rate": 1.957263581365749e-06, + "loss": 0.0644, + "step": 717 + }, + { + "epoch": 0.72, + "grad_norm": 1.94541261269077, + "learning_rate": 1.944429515324749e-06, + "loss": 0.1162, + "step": 718 + }, + { + "epoch": 0.72, + "grad_norm": 0.6891229860681114, + "learning_rate": 1.931627500772263e-06, + "loss": 0.1033, + "step": 719 + }, + { + "epoch": 0.72, + "grad_norm": 1.047583625186985, + "learning_rate": 1.9188576719953635e-06, + "loss": 0.1685, + "step": 720 + }, + { + "epoch": 0.72, + "grad_norm": 0.936455102385331, + "learning_rate": 1.906120162943515e-06, + "loss": 0.1058, + "step": 721 + }, + { + "epoch": 0.72, + "grad_norm": 1.1606737870410293, + "learning_rate": 1.8934151072271573e-06, + "loss": 0.1194, + "step": 722 + }, + { + "epoch": 0.72, + "grad_norm": 0.607503861810507, + "learning_rate": 1.8807426381163151e-06, + "loss": 0.0691, + "step": 723 + }, + { + "epoch": 0.72, + "grad_norm": 0.4364356890469223, + "learning_rate": 1.8681028885391905e-06, + "loss": 0.0798, + "step": 724 + }, + { + "epoch": 0.72, + "grad_norm": 0.6835322140988652, + "learning_rate": 1.8554959910807773e-06, + "loss": 0.1322, + "step": 725 + }, + { + "epoch": 0.73, + "grad_norm": 0.9815104193344515, + "learning_rate": 1.8429220779814654e-06, + "loss": 0.0908, + "step": 726 + }, + { + "epoch": 0.73, + "grad_norm": 1.3210822865580083, + "learning_rate": 1.8303812811356503e-06, + "loss": 0.1309, + "step": 727 + }, + { + "epoch": 0.73, + "grad_norm": 0.5785209760215473, + "learning_rate": 1.81787373209036e-06, + "loss": 0.0876, + "step": 728 + }, + { + "epoch": 0.73, + "grad_norm": 0.4064375335049112, + "learning_rate": 1.8053995620438625e-06, + "loss": 0.0931, + "step": 729 + }, + { + "epoch": 0.73, + "grad_norm": 0.37455731813663073, + "learning_rate": 1.7929589018443016e-06, + "loss": 0.1118, + "step": 730 + }, + { + "epoch": 0.73, + "grad_norm": 0.5159708041577111, + "learning_rate": 1.7805518819883134e-06, + "loss": 0.088, + "step": 731 + }, + { + "epoch": 0.73, + "grad_norm": 0.7719412037208926, + "learning_rate": 1.7681786326196665e-06, + "loss": 0.1429, + "step": 732 + }, + { + "epoch": 0.73, + "grad_norm": 0.6732284452708844, + "learning_rate": 1.755839283527893e-06, + "loss": 0.1136, + "step": 733 + }, + { + "epoch": 0.73, + "grad_norm": 0.5646313484116418, + "learning_rate": 1.743533964146924e-06, + "loss": 0.1117, + "step": 734 + }, + { + "epoch": 0.73, + "grad_norm": 0.3371832406390867, + "learning_rate": 1.7312628035537388e-06, + "loss": 0.049, + "step": 735 + }, + { + "epoch": 0.74, + "grad_norm": 0.40546197646401577, + "learning_rate": 1.7190259304670038e-06, + "loss": 0.0614, + "step": 736 + }, + { + "epoch": 0.74, + "grad_norm": 0.7659360231859863, + "learning_rate": 1.706823473245729e-06, + "loss": 0.1069, + "step": 737 + }, + { + "epoch": 0.74, + "grad_norm": 0.5880219130646935, + "learning_rate": 1.6946555598879138e-06, + "loss": 0.1132, + "step": 738 + }, + { + "epoch": 0.74, + "grad_norm": 0.27949475630415416, + "learning_rate": 1.6825223180292138e-06, + "loss": 0.0547, + "step": 739 + }, + { + "epoch": 0.74, + "grad_norm": 0.5695985268937083, + "learning_rate": 1.6704238749415958e-06, + "loss": 0.1115, + "step": 740 + }, + { + "epoch": 0.74, + "grad_norm": 0.4261816246363467, + "learning_rate": 1.6583603575320002e-06, + "loss": 0.0629, + "step": 741 + }, + { + "epoch": 0.74, + "grad_norm": 0.42622038967695236, + "learning_rate": 1.6463318923410183e-06, + "loss": 0.0923, + "step": 742 + }, + { + "epoch": 0.74, + "grad_norm": 0.3976860508185511, + "learning_rate": 1.6343386055415545e-06, + "loss": 0.0902, + "step": 743 + }, + { + "epoch": 0.74, + "grad_norm": 0.9295478081934369, + "learning_rate": 1.6223806229375182e-06, + "loss": 0.1128, + "step": 744 + }, + { + "epoch": 0.74, + "grad_norm": 0.3716687173126451, + "learning_rate": 1.6104580699624839e-06, + "loss": 0.0745, + "step": 745 + }, + { + "epoch": 0.75, + "grad_norm": 0.43052003327195315, + "learning_rate": 1.5985710716783936e-06, + "loss": 0.0875, + "step": 746 + }, + { + "epoch": 0.75, + "grad_norm": 0.6567640294936006, + "learning_rate": 1.5867197527742312e-06, + "loss": 0.1606, + "step": 747 + }, + { + "epoch": 0.75, + "grad_norm": 0.5264679447674188, + "learning_rate": 1.5749042375647261e-06, + "loss": 0.0651, + "step": 748 + }, + { + "epoch": 0.75, + "grad_norm": 0.877656014114188, + "learning_rate": 1.563124649989043e-06, + "loss": 0.1476, + "step": 749 + }, + { + "epoch": 0.75, + "grad_norm": 0.5281814939456732, + "learning_rate": 1.5513811136094786e-06, + "loss": 0.113, + "step": 750 + }, + { + "epoch": 0.75, + "grad_norm": 0.3476887016609292, + "learning_rate": 1.5396737516101757e-06, + "loss": 0.0665, + "step": 751 + }, + { + "epoch": 0.75, + "grad_norm": 0.5665787485305963, + "learning_rate": 1.5280026867958186e-06, + "loss": 0.0674, + "step": 752 + }, + { + "epoch": 0.75, + "grad_norm": 0.5506665105163838, + "learning_rate": 1.516368041590358e-06, + "loss": 0.1022, + "step": 753 + }, + { + "epoch": 0.75, + "grad_norm": 0.7185287494672667, + "learning_rate": 1.5047699380357134e-06, + "loss": 0.0783, + "step": 754 + }, + { + "epoch": 0.76, + "grad_norm": 0.4239042152308306, + "learning_rate": 1.4932084977905043e-06, + "loss": 0.0628, + "step": 755 + }, + { + "epoch": 0.76, + "grad_norm": 0.3959681085581489, + "learning_rate": 1.4816838421287693e-06, + "loss": 0.0877, + "step": 756 + }, + { + "epoch": 0.76, + "grad_norm": 0.41661033848056206, + "learning_rate": 1.470196091938691e-06, + "loss": 0.0951, + "step": 757 + }, + { + "epoch": 0.76, + "grad_norm": 0.8160533610936686, + "learning_rate": 1.4587453677213348e-06, + "loss": 0.1162, + "step": 758 + }, + { + "epoch": 0.76, + "grad_norm": 0.6466654348738692, + "learning_rate": 1.4473317895893773e-06, + "loss": 0.0761, + "step": 759 + }, + { + "epoch": 0.76, + "grad_norm": 0.4556826559116612, + "learning_rate": 1.4359554772658551e-06, + "loss": 0.058, + "step": 760 + }, + { + "epoch": 0.76, + "grad_norm": 0.3665097084372266, + "learning_rate": 1.4246165500828974e-06, + "loss": 0.0659, + "step": 761 + }, + { + "epoch": 0.76, + "grad_norm": 0.3547535384550275, + "learning_rate": 1.4133151269804873e-06, + "loss": 0.0617, + "step": 762 + }, + { + "epoch": 0.76, + "grad_norm": 0.7574630322327562, + "learning_rate": 1.4020513265052072e-06, + "loss": 0.1073, + "step": 763 + }, + { + "epoch": 0.76, + "grad_norm": 0.501896848643187, + "learning_rate": 1.39082526680899e-06, + "loss": 0.1029, + "step": 764 + }, + { + "epoch": 0.77, + "grad_norm": 0.5675894103495152, + "learning_rate": 1.3796370656478936e-06, + "loss": 0.0824, + "step": 765 + }, + { + "epoch": 0.77, + "grad_norm": 1.0309323915280348, + "learning_rate": 1.368486840380851e-06, + "loss": 0.1023, + "step": 766 + }, + { + "epoch": 0.77, + "grad_norm": 0.32109560258885117, + "learning_rate": 1.357374707968452e-06, + "loss": 0.0752, + "step": 767 + }, + { + "epoch": 0.77, + "grad_norm": 0.6702479147784276, + "learning_rate": 1.3463007849717035e-06, + "loss": 0.1248, + "step": 768 + }, + { + "epoch": 0.77, + "grad_norm": 0.5467165079831062, + "learning_rate": 1.3352651875508204e-06, + "loss": 0.0864, + "step": 769 + }, + { + "epoch": 0.77, + "grad_norm": 0.47073459355099145, + "learning_rate": 1.3242680314639995e-06, + "loss": 0.1016, + "step": 770 + }, + { + "epoch": 0.77, + "grad_norm": 0.36995184837167383, + "learning_rate": 1.3133094320662e-06, + "loss": 0.093, + "step": 771 + }, + { + "epoch": 0.77, + "grad_norm": 0.39535081463036353, + "learning_rate": 1.3023895043079476e-06, + "loss": 0.0753, + "step": 772 + }, + { + "epoch": 0.77, + "grad_norm": 0.5734293156151007, + "learning_rate": 1.291508362734113e-06, + "loss": 0.124, + "step": 773 + }, + { + "epoch": 0.77, + "grad_norm": 0.3462354071238205, + "learning_rate": 1.2806661214827286e-06, + "loss": 0.0814, + "step": 774 + }, + { + "epoch": 0.78, + "grad_norm": 0.3499217729523727, + "learning_rate": 1.2698628942837698e-06, + "loss": 0.0787, + "step": 775 + }, + { + "epoch": 0.78, + "grad_norm": 0.3616177005181913, + "learning_rate": 1.2590987944579808e-06, + "loss": 0.076, + "step": 776 + }, + { + "epoch": 0.78, + "grad_norm": 0.6637701056151413, + "learning_rate": 1.2483739349156726e-06, + "loss": 0.1121, + "step": 777 + }, + { + "epoch": 0.78, + "grad_norm": 0.3133002950838982, + "learning_rate": 1.2376884281555485e-06, + "loss": 0.0566, + "step": 778 + }, + { + "epoch": 0.78, + "grad_norm": 0.5249896746398682, + "learning_rate": 1.2270423862635188e-06, + "loss": 0.082, + "step": 779 + }, + { + "epoch": 0.78, + "grad_norm": 0.7169971351316476, + "learning_rate": 1.2164359209115235e-06, + "loss": 0.1448, + "step": 780 + }, + { + "epoch": 0.78, + "grad_norm": 0.6354694159681136, + "learning_rate": 1.2058691433563675e-06, + "loss": 0.1056, + "step": 781 + }, + { + "epoch": 0.78, + "grad_norm": 0.36229322611138237, + "learning_rate": 1.1953421644385444e-06, + "loss": 0.0532, + "step": 782 + }, + { + "epoch": 0.78, + "grad_norm": 0.7440990889116061, + "learning_rate": 1.184855094581085e-06, + "loss": 0.0838, + "step": 783 + }, + { + "epoch": 0.78, + "grad_norm": 0.44345146358008225, + "learning_rate": 1.1744080437883859e-06, + "loss": 0.0506, + "step": 784 + }, + { + "epoch": 0.79, + "grad_norm": 0.6745690110949678, + "learning_rate": 1.164001121645069e-06, + "loss": 0.1162, + "step": 785 + }, + { + "epoch": 0.79, + "grad_norm": 0.3260271984781166, + "learning_rate": 1.1536344373148245e-06, + "loss": 0.0703, + "step": 786 + }, + { + "epoch": 0.79, + "grad_norm": 0.6149266796401971, + "learning_rate": 1.1433080995392614e-06, + "loss": 0.116, + "step": 787 + }, + { + "epoch": 0.79, + "grad_norm": 0.46876880137370996, + "learning_rate": 1.133022216636781e-06, + "loss": 0.1031, + "step": 788 + }, + { + "epoch": 0.79, + "grad_norm": 0.553811515271877, + "learning_rate": 1.1227768965014246e-06, + "loss": 0.1212, + "step": 789 + }, + { + "epoch": 0.79, + "grad_norm": 0.5099010882096217, + "learning_rate": 1.1125722466017547e-06, + "loss": 0.0966, + "step": 790 + }, + { + "epoch": 0.79, + "grad_norm": 0.4437081223496812, + "learning_rate": 1.102408373979717e-06, + "loss": 0.086, + "step": 791 + }, + { + "epoch": 0.79, + "grad_norm": 0.5357333206322139, + "learning_rate": 1.092285385249528e-06, + "loss": 0.0581, + "step": 792 + }, + { + "epoch": 0.79, + "grad_norm": 0.6152606654522678, + "learning_rate": 1.0822033865965503e-06, + "loss": 0.1013, + "step": 793 + }, + { + "epoch": 0.79, + "grad_norm": 0.5097304509814814, + "learning_rate": 1.0721624837761768e-06, + "loss": 0.0735, + "step": 794 + }, + { + "epoch": 0.8, + "grad_norm": 0.4375681758880017, + "learning_rate": 1.062162782112729e-06, + "loss": 0.086, + "step": 795 + }, + { + "epoch": 0.8, + "grad_norm": 0.3309292309626842, + "learning_rate": 1.0522043864983428e-06, + "loss": 0.0877, + "step": 796 + }, + { + "epoch": 0.8, + "grad_norm": 0.47916005764026764, + "learning_rate": 1.0422874013918793e-06, + "loss": 0.0942, + "step": 797 + }, + { + "epoch": 0.8, + "grad_norm": 0.4944521220442185, + "learning_rate": 1.0324119308178166e-06, + "loss": 0.0791, + "step": 798 + }, + { + "epoch": 0.8, + "grad_norm": 0.6332692184131137, + "learning_rate": 1.0225780783651689e-06, + "loss": 0.1008, + "step": 799 + }, + { + "epoch": 0.8, + "grad_norm": 0.5720309815606217, + "learning_rate": 1.012785947186397e-06, + "loss": 0.0862, + "step": 800 + }, + { + "epoch": 0.8, + "grad_norm": 0.6044429021896905, + "learning_rate": 1.0030356399963204e-06, + "loss": 0.1115, + "step": 801 + }, + { + "epoch": 0.8, + "grad_norm": 0.36518784495833523, + "learning_rate": 9.933272590710508e-07, + "loss": 0.0721, + "step": 802 + }, + { + "epoch": 0.8, + "grad_norm": 0.6836496553279878, + "learning_rate": 9.836609062469066e-07, + "loss": 0.1074, + "step": 803 + }, + { + "epoch": 0.8, + "grad_norm": 0.5199729931620068, + "learning_rate": 9.740366829193587e-07, + "loss": 0.1431, + "step": 804 + }, + { + "epoch": 0.81, + "grad_norm": 0.42167201694835255, + "learning_rate": 9.644546900419533e-07, + "loss": 0.0852, + "step": 805 + }, + { + "epoch": 0.81, + "grad_norm": 0.572680728037601, + "learning_rate": 9.549150281252633e-07, + "loss": 0.1054, + "step": 806 + }, + { + "epoch": 0.81, + "grad_norm": 0.47298662378855444, + "learning_rate": 9.454177972358258e-07, + "loss": 0.0973, + "step": 807 + }, + { + "epoch": 0.81, + "grad_norm": 0.8017043505628718, + "learning_rate": 9.359630969951012e-07, + "loss": 0.143, + "step": 808 + }, + { + "epoch": 0.81, + "grad_norm": 0.9306228479938863, + "learning_rate": 9.265510265784189e-07, + "loss": 0.1303, + "step": 809 + }, + { + "epoch": 0.81, + "grad_norm": 0.5950675192192034, + "learning_rate": 9.171816847139447e-07, + "loss": 0.0757, + "step": 810 + }, + { + "epoch": 0.81, + "grad_norm": 0.4359479937637749, + "learning_rate": 9.078551696816434e-07, + "loss": 0.0946, + "step": 811 + }, + { + "epoch": 0.81, + "grad_norm": 0.4436620372001849, + "learning_rate": 8.985715793122407e-07, + "loss": 0.0724, + "step": 812 + }, + { + "epoch": 0.81, + "grad_norm": 0.3667687933779485, + "learning_rate": 8.893310109862102e-07, + "loss": 0.0764, + "step": 813 + }, + { + "epoch": 0.81, + "grad_norm": 0.550835015205489, + "learning_rate": 8.801335616327378e-07, + "loss": 0.0817, + "step": 814 + }, + { + "epoch": 0.81, + "grad_norm": 0.3821253923796869, + "learning_rate": 8.709793277287182e-07, + "loss": 0.054, + "step": 815 + }, + { + "epoch": 0.82, + "grad_norm": 0.3377257006076869, + "learning_rate": 8.618684052977305e-07, + "loss": 0.0455, + "step": 816 + }, + { + "epoch": 0.82, + "grad_norm": 0.392984261268136, + "learning_rate": 8.528008899090412e-07, + "loss": 0.0906, + "step": 817 + }, + { + "epoch": 0.82, + "grad_norm": 0.6319458856582114, + "learning_rate": 8.437768766765975e-07, + "loss": 0.0999, + "step": 818 + }, + { + "epoch": 0.82, + "grad_norm": 0.5997712785383388, + "learning_rate": 8.347964602580245e-07, + "loss": 0.1051, + "step": 819 + }, + { + "epoch": 0.82, + "grad_norm": 0.34809517658133093, + "learning_rate": 8.258597348536452e-07, + "loss": 0.0708, + "step": 820 + }, + { + "epoch": 0.82, + "grad_norm": 0.5256431514802758, + "learning_rate": 8.16966794205476e-07, + "loss": 0.0838, + "step": 821 + }, + { + "epoch": 0.82, + "grad_norm": 0.4075759755235258, + "learning_rate": 8.081177315962601e-07, + "loss": 0.0825, + "step": 822 + }, + { + "epoch": 0.82, + "grad_norm": 0.76627654803204, + "learning_rate": 7.993126398484741e-07, + "loss": 0.1359, + "step": 823 + }, + { + "epoch": 0.82, + "grad_norm": 0.3222103505297358, + "learning_rate": 7.905516113233652e-07, + "loss": 0.0509, + "step": 824 + }, + { + "epoch": 0.82, + "grad_norm": 0.4714922164002309, + "learning_rate": 7.818347379199781e-07, + "loss": 0.1038, + "step": 825 + }, + { + "epoch": 0.83, + "grad_norm": 0.5186381434617623, + "learning_rate": 7.731621110741871e-07, + "loss": 0.0692, + "step": 826 + }, + { + "epoch": 0.83, + "grad_norm": 0.5364703858338294, + "learning_rate": 7.645338217577474e-07, + "loss": 0.1117, + "step": 827 + }, + { + "epoch": 0.83, + "grad_norm": 0.4335026860798553, + "learning_rate": 7.55949960477328e-07, + "loss": 0.0974, + "step": 828 + }, + { + "epoch": 0.83, + "grad_norm": 0.755360241582875, + "learning_rate": 7.474106172735746e-07, + "loss": 0.1068, + "step": 829 + }, + { + "epoch": 0.83, + "grad_norm": 0.48524350549450007, + "learning_rate": 7.389158817201541e-07, + "loss": 0.083, + "step": 830 + }, + { + "epoch": 0.83, + "grad_norm": 0.42491558152066805, + "learning_rate": 7.304658429228245e-07, + "loss": 0.0857, + "step": 831 + }, + { + "epoch": 0.83, + "grad_norm": 0.4861633779871085, + "learning_rate": 7.220605895184946e-07, + "loss": 0.0789, + "step": 832 + }, + { + "epoch": 0.83, + "grad_norm": 0.6358496471179419, + "learning_rate": 7.13700209674294e-07, + "loss": 0.0934, + "step": 833 + }, + { + "epoch": 0.83, + "grad_norm": 0.44338981475516126, + "learning_rate": 7.053847910866513e-07, + "loss": 0.0771, + "step": 834 + }, + { + "epoch": 0.83, + "grad_norm": 0.6714763536022414, + "learning_rate": 6.971144209803738e-07, + "loss": 0.1262, + "step": 835 + }, + { + "epoch": 0.84, + "grad_norm": 0.41897166828047444, + "learning_rate": 6.888891861077301e-07, + "loss": 0.0481, + "step": 836 + }, + { + "epoch": 0.84, + "grad_norm": 0.4255996435276327, + "learning_rate": 6.807091727475412e-07, + "loss": 0.0525, + "step": 837 + }, + { + "epoch": 0.84, + "grad_norm": 0.474536598786256, + "learning_rate": 6.725744667042778e-07, + "loss": 0.0923, + "step": 838 + }, + { + "epoch": 0.84, + "grad_norm": 0.4896460204492975, + "learning_rate": 6.644851533071556e-07, + "loss": 0.1151, + "step": 839 + }, + { + "epoch": 0.84, + "grad_norm": 0.38425843847511787, + "learning_rate": 6.564413174092443e-07, + "loss": 0.0755, + "step": 840 + }, + { + "epoch": 0.84, + "grad_norm": 1.3938923759762976, + "learning_rate": 6.484430433865785e-07, + "loss": 0.1674, + "step": 841 + }, + { + "epoch": 0.84, + "grad_norm": 0.5023810794843392, + "learning_rate": 6.404904151372649e-07, + "loss": 0.1079, + "step": 842 + }, + { + "epoch": 0.84, + "grad_norm": 0.6340913982384018, + "learning_rate": 6.325835160806132e-07, + "loss": 0.112, + "step": 843 + }, + { + "epoch": 0.84, + "grad_norm": 0.5225108859572722, + "learning_rate": 6.24722429156251e-07, + "loss": 0.0989, + "step": 844 + }, + { + "epoch": 0.84, + "grad_norm": 1.2990337541490649, + "learning_rate": 6.16907236823262e-07, + "loss": 0.0878, + "step": 845 + }, + { + "epoch": 0.85, + "grad_norm": 0.32466614878115196, + "learning_rate": 6.091380210593145e-07, + "loss": 0.0719, + "step": 846 + }, + { + "epoch": 0.85, + "grad_norm": 0.5267810752911024, + "learning_rate": 6.014148633598055e-07, + "loss": 0.125, + "step": 847 + }, + { + "epoch": 0.85, + "grad_norm": 0.4634858411250408, + "learning_rate": 5.937378447370068e-07, + "loss": 0.0675, + "step": 848 + }, + { + "epoch": 0.85, + "grad_norm": 0.41201627699477966, + "learning_rate": 5.861070457192081e-07, + "loss": 0.0974, + "step": 849 + }, + { + "epoch": 0.85, + "grad_norm": 1.0069609403070547, + "learning_rate": 5.785225463498828e-07, + "loss": 0.1299, + "step": 850 + }, + { + "epoch": 0.85, + "grad_norm": 0.5473304468749308, + "learning_rate": 5.709844261868381e-07, + "loss": 0.1042, + "step": 851 + }, + { + "epoch": 0.85, + "grad_norm": 0.3672255838242061, + "learning_rate": 5.634927643013899e-07, + "loss": 0.0995, + "step": 852 + }, + { + "epoch": 0.85, + "grad_norm": 0.7117682001734832, + "learning_rate": 5.560476392775239e-07, + "loss": 0.1007, + "step": 853 + }, + { + "epoch": 0.85, + "grad_norm": 0.310169949837104, + "learning_rate": 5.486491292110796e-07, + "loss": 0.0942, + "step": 854 + }, + { + "epoch": 0.85, + "grad_norm": 0.44696554803595046, + "learning_rate": 5.412973117089288e-07, + "loss": 0.0632, + "step": 855 + }, + { + "epoch": 0.86, + "grad_norm": 0.5791409344657856, + "learning_rate": 5.339922638881545e-07, + "loss": 0.0993, + "step": 856 + }, + { + "epoch": 0.86, + "grad_norm": 0.5469183216245861, + "learning_rate": 5.267340623752554e-07, + "loss": 0.1217, + "step": 857 + }, + { + "epoch": 0.86, + "grad_norm": 0.3715118970165225, + "learning_rate": 5.195227833053273e-07, + "loss": 0.0748, + "step": 858 + }, + { + "epoch": 0.86, + "grad_norm": 0.6057710253253564, + "learning_rate": 5.123585023212785e-07, + "loss": 0.1089, + "step": 859 + }, + { + "epoch": 0.86, + "grad_norm": 0.4238894048911, + "learning_rate": 5.05241294573024e-07, + "loss": 0.0788, + "step": 860 + }, + { + "epoch": 0.86, + "grad_norm": 0.5997134947258506, + "learning_rate": 4.981712347167061e-07, + "loss": 0.0713, + "step": 861 + }, + { + "epoch": 0.86, + "grad_norm": 0.5534958553812986, + "learning_rate": 4.911483969139086e-07, + "loss": 0.098, + "step": 862 + }, + { + "epoch": 0.86, + "grad_norm": 0.8066007062748421, + "learning_rate": 4.841728548308744e-07, + "loss": 0.1272, + "step": 863 + }, + { + "epoch": 0.86, + "grad_norm": 0.3825555555448329, + "learning_rate": 4.772446816377408e-07, + "loss": 0.0629, + "step": 864 + }, + { + "epoch": 0.86, + "grad_norm": 0.5436519823519242, + "learning_rate": 4.7036395000776556e-07, + "loss": 0.0661, + "step": 865 + }, + { + "epoch": 0.87, + "grad_norm": 0.6999572333406373, + "learning_rate": 4.6353073211656886e-07, + "loss": 0.0782, + "step": 866 + }, + { + "epoch": 0.87, + "grad_norm": 0.6665398985207782, + "learning_rate": 4.5674509964137136e-07, + "loss": 0.1244, + "step": 867 + }, + { + "epoch": 0.87, + "grad_norm": 0.4102524219578372, + "learning_rate": 4.5000712376024826e-07, + "loss": 0.0725, + "step": 868 + }, + { + "epoch": 0.87, + "grad_norm": 0.31539447100958523, + "learning_rate": 4.4331687515137614e-07, + "loss": 0.0389, + "step": 869 + }, + { + "epoch": 0.87, + "grad_norm": 0.32079003707334564, + "learning_rate": 4.3667442399229985e-07, + "loss": 0.0723, + "step": 870 + }, + { + "epoch": 0.87, + "grad_norm": 0.7334622696512892, + "learning_rate": 4.30079839959186e-07, + "loss": 0.1115, + "step": 871 + }, + { + "epoch": 0.87, + "grad_norm": 0.5104145536374513, + "learning_rate": 4.2353319222610265e-07, + "loss": 0.1117, + "step": 872 + }, + { + "epoch": 0.87, + "grad_norm": 0.407631950449968, + "learning_rate": 4.1703454946428635e-07, + "loss": 0.098, + "step": 873 + }, + { + "epoch": 0.87, + "grad_norm": 0.5052468200921778, + "learning_rate": 4.1058397984142405e-07, + "loss": 0.1079, + "step": 874 + }, + { + "epoch": 0.88, + "grad_norm": 0.5647002734695262, + "learning_rate": 4.041815510209396e-07, + "loss": 0.0969, + "step": 875 + }, + { + "epoch": 0.88, + "grad_norm": 0.9680459120711749, + "learning_rate": 3.9782733016128006e-07, + "loss": 0.1692, + "step": 876 + }, + { + "epoch": 0.88, + "grad_norm": 0.5588469027832113, + "learning_rate": 3.9152138391521766e-07, + "loss": 0.095, + "step": 877 + }, + { + "epoch": 0.88, + "grad_norm": 0.3323981905313235, + "learning_rate": 3.852637784291424e-07, + "loss": 0.0641, + "step": 878 + }, + { + "epoch": 0.88, + "grad_norm": 0.8081738368564643, + "learning_rate": 3.790545793423761e-07, + "loss": 0.1244, + "step": 879 + }, + { + "epoch": 0.88, + "grad_norm": 0.6412500408187927, + "learning_rate": 3.728938517864794e-07, + "loss": 0.1266, + "step": 880 + }, + { + "epoch": 0.88, + "grad_norm": 0.28687232827929193, + "learning_rate": 3.667816603845681e-07, + "loss": 0.0747, + "step": 881 + }, + { + "epoch": 0.88, + "grad_norm": 0.4982165667621581, + "learning_rate": 3.60718069250639e-07, + "loss": 0.1053, + "step": 882 + }, + { + "epoch": 0.88, + "grad_norm": 0.5464715501708446, + "learning_rate": 3.547031419888919e-07, + "loss": 0.1301, + "step": 883 + }, + { + "epoch": 0.88, + "grad_norm": 0.5942245125339878, + "learning_rate": 3.4873694169306915e-07, + "loss": 0.0986, + "step": 884 + }, + { + "epoch": 0.89, + "grad_norm": 0.5399557216992824, + "learning_rate": 3.4281953094578877e-07, + "loss": 0.0904, + "step": 885 + }, + { + "epoch": 0.89, + "grad_norm": 0.7068041293977915, + "learning_rate": 3.369509718178887e-07, + "loss": 0.1054, + "step": 886 + }, + { + "epoch": 0.89, + "grad_norm": 0.5820835803293782, + "learning_rate": 3.3113132586777786e-07, + "loss": 0.0814, + "step": 887 + }, + { + "epoch": 0.89, + "grad_norm": 0.540465794297693, + "learning_rate": 3.2536065414078724e-07, + "loss": 0.1207, + "step": 888 + }, + { + "epoch": 0.89, + "grad_norm": 0.7907977007084812, + "learning_rate": 3.196390171685343e-07, + "loss": 0.1397, + "step": 889 + }, + { + "epoch": 0.89, + "grad_norm": 0.7159037109972106, + "learning_rate": 3.1396647496828245e-07, + "loss": 0.1585, + "step": 890 + }, + { + "epoch": 0.89, + "grad_norm": 0.38466650660591445, + "learning_rate": 3.0834308704231485e-07, + "loss": 0.0882, + "step": 891 + }, + { + "epoch": 0.89, + "grad_norm": 0.8151401337946305, + "learning_rate": 3.0276891237731085e-07, + "loss": 0.1342, + "step": 892 + }, + { + "epoch": 0.89, + "grad_norm": 0.47364592790571436, + "learning_rate": 2.97244009443724e-07, + "loss": 0.0863, + "step": 893 + }, + { + "epoch": 0.89, + "grad_norm": 0.30711784182012725, + "learning_rate": 2.917684361951728e-07, + "loss": 0.0549, + "step": 894 + }, + { + "epoch": 0.9, + "grad_norm": 0.47101524283620005, + "learning_rate": 2.8634225006782867e-07, + "loss": 0.0903, + "step": 895 + }, + { + "epoch": 0.9, + "grad_norm": 0.30967047878137277, + "learning_rate": 2.809655079798179e-07, + "loss": 0.0533, + "step": 896 + }, + { + "epoch": 0.9, + "grad_norm": 0.5143493600993974, + "learning_rate": 2.75638266330619e-07, + "loss": 0.088, + "step": 897 + }, + { + "epoch": 0.9, + "grad_norm": 0.6743754796337982, + "learning_rate": 2.7036058100047723e-07, + "loss": 0.1103, + "step": 898 + }, + { + "epoch": 0.9, + "grad_norm": 0.6095538912751249, + "learning_rate": 2.65132507349814e-07, + "loss": 0.0781, + "step": 899 + }, + { + "epoch": 0.9, + "grad_norm": 0.3958113201950967, + "learning_rate": 2.599541002186479e-07, + "loss": 0.0779, + "step": 900 + }, + { + "epoch": 0.9, + "grad_norm": 2.047123808357352, + "learning_rate": 2.5482541392601924e-07, + "loss": 0.0555, + "step": 901 + }, + { + "epoch": 0.9, + "grad_norm": 0.4420409227261539, + "learning_rate": 2.497465022694207e-07, + "loss": 0.0777, + "step": 902 + }, + { + "epoch": 0.9, + "grad_norm": 0.5935850991221072, + "learning_rate": 2.447174185242324e-07, + "loss": 0.0846, + "step": 903 + }, + { + "epoch": 0.9, + "grad_norm": 0.5221496297926413, + "learning_rate": 2.397382154431621e-07, + "loss": 0.0739, + "step": 904 + }, + { + "epoch": 0.91, + "grad_norm": 0.5415360874192054, + "learning_rate": 2.3480894525569564e-07, + "loss": 0.0866, + "step": 905 + }, + { + "epoch": 0.91, + "grad_norm": 0.4109552316884023, + "learning_rate": 2.2992965966754378e-07, + "loss": 0.083, + "step": 906 + }, + { + "epoch": 0.91, + "grad_norm": 1.909885268355602, + "learning_rate": 2.251004098601034e-07, + "loss": 0.0825, + "step": 907 + }, + { + "epoch": 0.91, + "grad_norm": 0.4658964372668037, + "learning_rate": 2.2032124648992015e-07, + "loss": 0.1046, + "step": 908 + }, + { + "epoch": 0.91, + "grad_norm": 0.33553011782146736, + "learning_rate": 2.1559221968815547e-07, + "loss": 0.0899, + "step": 909 + }, + { + "epoch": 0.91, + "grad_norm": 0.4834566512135711, + "learning_rate": 2.109133790600648e-07, + "loss": 0.0851, + "step": 910 + }, + { + "epoch": 0.91, + "grad_norm": 1.6260157288294408, + "learning_rate": 2.062847736844703e-07, + "loss": 0.1829, + "step": 911 + }, + { + "epoch": 0.91, + "grad_norm": 0.4284786562742925, + "learning_rate": 2.0170645211325335e-07, + "loss": 0.0632, + "step": 912 + }, + { + "epoch": 0.91, + "grad_norm": 0.42840539869243016, + "learning_rate": 1.9717846237084005e-07, + "loss": 0.0902, + "step": 913 + }, + { + "epoch": 0.91, + "grad_norm": 0.39495047949288664, + "learning_rate": 1.9270085195370048e-07, + "loss": 0.0784, + "step": 914 + }, + { + "epoch": 0.92, + "grad_norm": 0.6895093352181992, + "learning_rate": 1.8827366782984913e-07, + "loss": 0.1401, + "step": 915 + }, + { + "epoch": 0.92, + "grad_norm": 0.4341918274282859, + "learning_rate": 1.838969564383525e-07, + "loss": 0.0991, + "step": 916 + }, + { + "epoch": 0.92, + "grad_norm": 0.7336846337260968, + "learning_rate": 1.7957076368884274e-07, + "loss": 0.1068, + "step": 917 + }, + { + "epoch": 0.92, + "grad_norm": 0.33843907932125694, + "learning_rate": 1.7529513496103322e-07, + "loss": 0.098, + "step": 918 + }, + { + "epoch": 0.92, + "grad_norm": 0.641426534079293, + "learning_rate": 1.7107011510424766e-07, + "loss": 0.1121, + "step": 919 + }, + { + "epoch": 0.92, + "grad_norm": 0.6088894766654794, + "learning_rate": 1.6689574843694433e-07, + "loss": 0.049, + "step": 920 + }, + { + "epoch": 0.92, + "grad_norm": 0.7638243262874345, + "learning_rate": 1.6277207874625444e-07, + "loss": 0.1407, + "step": 921 + }, + { + "epoch": 0.92, + "grad_norm": 0.38273022218175334, + "learning_rate": 1.5869914928752117e-07, + "loss": 0.0869, + "step": 922 + }, + { + "epoch": 0.92, + "grad_norm": 0.47809969343613545, + "learning_rate": 1.546770027838479e-07, + "loss": 0.1059, + "step": 923 + }, + { + "epoch": 0.92, + "grad_norm": 0.3328505710537316, + "learning_rate": 1.5070568142564912e-07, + "loss": 0.0824, + "step": 924 + }, + { + "epoch": 0.93, + "grad_norm": 0.3412516515646284, + "learning_rate": 1.4678522687020414e-07, + "loss": 0.0673, + "step": 925 + }, + { + "epoch": 0.93, + "grad_norm": 0.3582880986734356, + "learning_rate": 1.4291568024122848e-07, + "loss": 0.0813, + "step": 926 + }, + { + "epoch": 0.93, + "grad_norm": 0.4627890159848611, + "learning_rate": 1.390970821284343e-07, + "loss": 0.0972, + "step": 927 + }, + { + "epoch": 0.93, + "grad_norm": 0.4058521199725231, + "learning_rate": 1.3532947258710905e-07, + "loss": 0.0477, + "step": 928 + }, + { + "epoch": 0.93, + "grad_norm": 0.5285869257912966, + "learning_rate": 1.3161289113769405e-07, + "loss": 0.069, + "step": 929 + }, + { + "epoch": 0.93, + "grad_norm": 0.886532533745455, + "learning_rate": 1.2794737676536993e-07, + "loss": 0.1483, + "step": 930 + }, + { + "epoch": 0.93, + "grad_norm": 0.48656899155350514, + "learning_rate": 1.2433296791964754e-07, + "loss": 0.1062, + "step": 931 + }, + { + "epoch": 0.93, + "grad_norm": 0.7968133314235069, + "learning_rate": 1.2076970251396593e-07, + "loss": 0.1477, + "step": 932 + }, + { + "epoch": 0.93, + "grad_norm": 0.5725663468018927, + "learning_rate": 1.1725761792529378e-07, + "loss": 0.1135, + "step": 933 + }, + { + "epoch": 0.93, + "grad_norm": 0.5170245470451504, + "learning_rate": 1.1379675099373489e-07, + "loss": 0.1299, + "step": 934 + }, + { + "epoch": 0.94, + "grad_norm": 0.6572490103667149, + "learning_rate": 1.1038713802214718e-07, + "loss": 0.1291, + "step": 935 + }, + { + "epoch": 0.94, + "grad_norm": 0.6625741117878644, + "learning_rate": 1.0702881477575589e-07, + "loss": 0.0883, + "step": 936 + }, + { + "epoch": 0.94, + "grad_norm": 0.4741783508324799, + "learning_rate": 1.0372181648178436e-07, + "loss": 0.1157, + "step": 937 + }, + { + "epoch": 0.94, + "grad_norm": 0.35230005784167795, + "learning_rate": 1.004661778290783e-07, + "loss": 0.059, + "step": 938 + }, + { + "epoch": 0.94, + "grad_norm": 0.38374367424824235, + "learning_rate": 9.726193296774767e-08, + "loss": 0.075, + "step": 939 + }, + { + "epoch": 0.94, + "grad_norm": 0.3103326494714687, + "learning_rate": 9.410911550880474e-08, + "loss": 0.0384, + "step": 940 + }, + { + "epoch": 0.94, + "grad_norm": 0.32360309647427055, + "learning_rate": 9.100775852381227e-08, + "loss": 0.0957, + "step": 941 + }, + { + "epoch": 0.94, + "grad_norm": 0.4687389813884961, + "learning_rate": 8.795789454453862e-08, + "loss": 0.0634, + "step": 942 + }, + { + "epoch": 0.94, + "grad_norm": 0.6463975180537007, + "learning_rate": 8.495955556261204e-08, + "loss": 0.1049, + "step": 943 + }, + { + "epoch": 0.94, + "grad_norm": 0.3725035351846233, + "learning_rate": 8.201277302919086e-08, + "loss": 0.0687, + "step": 944 + }, + { + "epoch": 0.94, + "grad_norm": 0.5327315777783594, + "learning_rate": 7.911757785462882e-08, + "loss": 0.0863, + "step": 945 + }, + { + "epoch": 0.95, + "grad_norm": 0.5649959636734481, + "learning_rate": 7.627400040815414e-08, + "loss": 0.1035, + "step": 946 + }, + { + "epoch": 0.95, + "grad_norm": 1.1124113720729487, + "learning_rate": 7.34820705175482e-08, + "loss": 0.1088, + "step": 947 + }, + { + "epoch": 0.95, + "grad_norm": 0.2575662592302751, + "learning_rate": 7.074181746883402e-08, + "loss": 0.0597, + "step": 948 + }, + { + "epoch": 0.95, + "grad_norm": 1.4217917378062126, + "learning_rate": 6.805327000596995e-08, + "loss": 0.1233, + "step": 949 + }, + { + "epoch": 0.95, + "grad_norm": 0.5900202418657501, + "learning_rate": 6.54164563305465e-08, + "loss": 0.1005, + "step": 950 + }, + { + "epoch": 0.95, + "grad_norm": 0.4903148765855846, + "learning_rate": 6.283140410149213e-08, + "loss": 0.102, + "step": 951 + }, + { + "epoch": 0.95, + "grad_norm": 0.6283680733361539, + "learning_rate": 6.029814043478022e-08, + "loss": 0.0986, + "step": 952 + }, + { + "epoch": 0.95, + "grad_norm": 0.6437189610067745, + "learning_rate": 5.781669190314809e-08, + "loss": 0.1144, + "step": 953 + }, + { + "epoch": 0.95, + "grad_norm": 0.40301108037963757, + "learning_rate": 5.538708453581787e-08, + "loss": 0.0823, + "step": 954 + }, + { + "epoch": 0.95, + "grad_norm": 0.46077593051803917, + "learning_rate": 5.3009343818219985e-08, + "loss": 0.1069, + "step": 955 + }, + { + "epoch": 0.96, + "grad_norm": 1.1633111722577614, + "learning_rate": 5.068349469173006e-08, + "loss": 0.1768, + "step": 956 + }, + { + "epoch": 0.96, + "grad_norm": 0.5676334727677624, + "learning_rate": 4.840956155340415e-08, + "loss": 0.1069, + "step": 957 + }, + { + "epoch": 0.96, + "grad_norm": 1.1418603533214473, + "learning_rate": 4.618756825572612e-08, + "loss": 0.1313, + "step": 958 + }, + { + "epoch": 0.96, + "grad_norm": 0.4998266333127885, + "learning_rate": 4.40175381063529e-08, + "loss": 0.1141, + "step": 959 + }, + { + "epoch": 0.96, + "grad_norm": 0.49451116960113756, + "learning_rate": 4.189949386787462e-08, + "loss": 0.1055, + "step": 960 + }, + { + "epoch": 0.96, + "grad_norm": 0.6605021406745498, + "learning_rate": 3.9833457757572636e-08, + "loss": 0.1312, + "step": 961 + }, + { + "epoch": 0.96, + "grad_norm": 1.7611722108034722, + "learning_rate": 3.781945144718912e-08, + "loss": 0.1127, + "step": 962 + }, + { + "epoch": 0.96, + "grad_norm": 0.4052079041989755, + "learning_rate": 3.585749606269562e-08, + "loss": 0.0762, + "step": 963 + }, + { + "epoch": 0.96, + "grad_norm": 0.2812946717625918, + "learning_rate": 3.394761218407705e-08, + "loss": 0.0453, + "step": 964 + }, + { + "epoch": 0.96, + "grad_norm": 1.6666698061897918, + "learning_rate": 3.2089819845111946e-08, + "loss": 0.0599, + "step": 965 + }, + { + "epoch": 0.97, + "grad_norm": 0.6616369308089431, + "learning_rate": 3.0284138533160924e-08, + "loss": 0.099, + "step": 966 + }, + { + "epoch": 0.97, + "grad_norm": 0.5595168583725473, + "learning_rate": 2.8530587188968508e-08, + "loss": 0.1096, + "step": 967 + }, + { + "epoch": 0.97, + "grad_norm": 0.3862726620479047, + "learning_rate": 2.6829184206457194e-08, + "loss": 0.064, + "step": 968 + }, + { + "epoch": 0.97, + "grad_norm": 0.7776881783133217, + "learning_rate": 2.5179947432540376e-08, + "loss": 0.1087, + "step": 969 + }, + { + "epoch": 0.97, + "grad_norm": 0.8070875336917069, + "learning_rate": 2.358289416693027e-08, + "loss": 0.1547, + "step": 970 + }, + { + "epoch": 0.97, + "grad_norm": 0.4034578997229353, + "learning_rate": 2.2038041161960288e-08, + "loss": 0.0961, + "step": 971 + }, + { + "epoch": 0.97, + "grad_norm": 0.5057471723242044, + "learning_rate": 2.0545404622407396e-08, + "loss": 0.1123, + "step": 972 + }, + { + "epoch": 0.97, + "grad_norm": 0.35190909630950856, + "learning_rate": 1.91050002053228e-08, + "loss": 0.0635, + "step": 973 + }, + { + "epoch": 0.97, + "grad_norm": 0.5187828907818753, + "learning_rate": 1.7716843019867646e-08, + "loss": 0.0856, + "step": 974 + }, + { + "epoch": 0.97, + "grad_norm": 0.394444506384258, + "learning_rate": 1.6380947627153143e-08, + "loss": 0.0623, + "step": 975 + }, + { + "epoch": 0.98, + "grad_norm": 0.5285270173606262, + "learning_rate": 1.509732804009012e-08, + "loss": 0.1073, + "step": 976 + }, + { + "epoch": 0.98, + "grad_norm": 0.37338286137353993, + "learning_rate": 1.386599772324082e-08, + "loss": 0.082, + "step": 977 + }, + { + "epoch": 0.98, + "grad_norm": 0.43264207890271816, + "learning_rate": 1.268696959267679e-08, + "loss": 0.1035, + "step": 978 + }, + { + "epoch": 0.98, + "grad_norm": 0.7496118580227054, + "learning_rate": 1.156025601584676e-08, + "loss": 0.1461, + "step": 979 + }, + { + "epoch": 0.98, + "grad_norm": 0.36412351519153907, + "learning_rate": 1.0485868811441757e-08, + "loss": 0.1018, + "step": 980 + }, + { + "epoch": 0.98, + "grad_norm": 0.5314038519295458, + "learning_rate": 9.463819249275751e-09, + "loss": 0.1252, + "step": 981 + }, + { + "epoch": 0.98, + "grad_norm": 0.48112137285779566, + "learning_rate": 8.494118050164646e-09, + "loss": 0.1104, + "step": 982 + }, + { + "epoch": 0.98, + "grad_norm": 0.7189714211724794, + "learning_rate": 7.576775385815249e-09, + "loss": 0.1468, + "step": 983 + }, + { + "epoch": 0.98, + "grad_norm": 0.40292170595375787, + "learning_rate": 6.711800878718144e-09, + "loss": 0.0745, + "step": 984 + }, + { + "epoch": 0.98, + "grad_norm": 0.44643651712628796, + "learning_rate": 5.899203602046655e-09, + "loss": 0.0931, + "step": 985 + }, + { + "epoch": 0.99, + "grad_norm": 0.7086989217011592, + "learning_rate": 5.138992079561367e-09, + "loss": 0.1206, + "step": 986 + }, + { + "epoch": 0.99, + "grad_norm": 0.6598387711819119, + "learning_rate": 4.431174285521866e-09, + "loss": 0.1148, + "step": 987 + }, + { + "epoch": 0.99, + "grad_norm": 0.6104258040113265, + "learning_rate": 3.775757644601808e-09, + "loss": 0.0875, + "step": 988 + }, + { + "epoch": 0.99, + "grad_norm": 0.48595597670970386, + "learning_rate": 3.1727490318111953e-09, + "loss": 0.1145, + "step": 989 + }, + { + "epoch": 0.99, + "grad_norm": 0.6209649139140617, + "learning_rate": 2.6221547724253337e-09, + "loss": 0.1113, + "step": 990 + }, + { + "epoch": 0.99, + "grad_norm": 0.410496344724791, + "learning_rate": 2.1239806419176556e-09, + "loss": 0.059, + "step": 991 + }, + { + "epoch": 0.99, + "grad_norm": 0.7022484208897792, + "learning_rate": 1.6782318658992159e-09, + "loss": 0.0865, + "step": 992 + }, + { + "epoch": 0.99, + "grad_norm": 0.4556693550827079, + "learning_rate": 1.2849131200631804e-09, + "loss": 0.0795, + "step": 993 + }, + { + "epoch": 0.99, + "grad_norm": 0.31377306177553776, + "learning_rate": 9.440285301370865e-10, + "loss": 0.0617, + "step": 994 + }, + { + "epoch": 0.99, + "grad_norm": 0.5835806283311138, + "learning_rate": 6.555816718389896e-10, + "loss": 0.1197, + "step": 995 + }, + { + "epoch": 1.0, + "grad_norm": 0.6526393593582267, + "learning_rate": 4.1957557084082447e-10, + "loss": 0.1179, + "step": 996 + }, + { + "epoch": 1.0, + "grad_norm": 0.6533521512496714, + "learning_rate": 2.360127027339898e-10, + "loss": 0.0879, + "step": 997 + }, + { + "epoch": 1.0, + "grad_norm": 1.6049139152852736, + "learning_rate": 1.0489499300603279e-10, + "loss": 0.0594, + "step": 998 + }, + { + "epoch": 1.0, + "grad_norm": 0.7328945103620341, + "learning_rate": 2.622381702066523e-11, + "loss": 0.1141, + "step": 999 + }, + { + "epoch": 1.0, + "grad_norm": 0.759905161698802, + "learning_rate": 0.0, + "loss": 0.1072, + "step": 1000 + }, + { + "epoch": 1.0, + "step": 1000, + "total_flos": 38113260066816.0, + "train_loss": 0.11589447382092476, + "train_runtime": 2391.3181, + "train_samples_per_second": 0.836, + "train_steps_per_second": 0.418 + } + ], + "logging_steps": 1.0, + "max_steps": 1000, + "num_input_tokens_seen": 0, + "num_train_epochs": 1, + "save_steps": 50000, + "total_flos": 38113260066816.0, + "train_batch_size": 1, + "trial_name": null, + "trial_params": null +} diff --git a/llavaqwen2-google_siglip-so400m-patch14-384-Qwen_Qwen2-7B-Instruct-AG_v5_3_split02_all_mm_tune_olora256_512_llm/README.md b/llavaqwen2-google_siglip-so400m-patch14-384-Qwen_Qwen2-7B-Instruct-AG_v5_3_split02_all_mm_tune_olora256_512_llm/README.md new file mode 100644 index 0000000000000000000000000000000000000000..84ec7fb74b98cfd0816d69a85e53a402a3fe9252 --- /dev/null +++ b/llavaqwen2-google_siglip-so400m-patch14-384-Qwen_Qwen2-7B-Instruct-AG_v5_3_split02_all_mm_tune_olora256_512_llm/README.md @@ -0,0 +1,10 @@ +--- +library_name: peft +--- +## Training procedure + +### Framework versions + +- PEFT 0.4.0 + +- PEFT 0.4.0 diff --git a/llavaqwen2-google_siglip-so400m-patch14-384-Qwen_Qwen2-7B-Instruct-AG_v5_3_split02_all_mm_tune_olora256_512_llm/adapter_config.json b/llavaqwen2-google_siglip-so400m-patch14-384-Qwen_Qwen2-7B-Instruct-AG_v5_3_split02_all_mm_tune_olora256_512_llm/adapter_config.json new file mode 100644 index 0000000000000000000000000000000000000000..c4919754736b9e4bdfbc56e33ca2c7472a8a8c96 --- /dev/null +++ b/llavaqwen2-google_siglip-so400m-patch14-384-Qwen_Qwen2-7B-Instruct-AG_v5_3_split02_all_mm_tune_olora256_512_llm/adapter_config.json @@ -0,0 +1,26 @@ +{ + "auto_mapping": null, + "base_model_name_or_path": "lmms-lab/LLaVA-Video-7B-Qwen2", + "bias": "none", + "fan_in_fan_out": false, + "inference_mode": true, + "init_lora_weights": "olora", + "layers_pattern": null, + "layers_to_transform": null, + "lora_alpha": 512, + "lora_dropout": 0.05, + "modules_to_save": null, + "peft_type": "LORA", + "r": 256, + "revision": null, + "target_modules": [ + "v_proj", + "gate_proj", + "q_proj", + "k_proj", + "up_proj", + "o_proj", + "down_proj" + ], + "task_type": "CAUSAL_LM" +} \ No newline at end of file diff --git a/llavaqwen2-google_siglip-so400m-patch14-384-Qwen_Qwen2-7B-Instruct-AG_v5_3_split02_all_mm_tune_olora256_512_llm/adapter_model.bin b/llavaqwen2-google_siglip-so400m-patch14-384-Qwen_Qwen2-7B-Instruct-AG_v5_3_split02_all_mm_tune_olora256_512_llm/adapter_model.bin new file mode 100644 index 0000000000000000000000000000000000000000..72fbc4a2ed0e5ba7ba7bbfbf939cab44e6632f6e --- /dev/null +++ b/llavaqwen2-google_siglip-so400m-patch14-384-Qwen_Qwen2-7B-Instruct-AG_v5_3_split02_all_mm_tune_olora256_512_llm/adapter_model.bin @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:c000cb164df104a9753016cf678b8b51addb141609a1fbb90e3a9c42e17ffde7 +size 1384057050 diff --git a/llavaqwen2-google_siglip-so400m-patch14-384-Qwen_Qwen2-7B-Instruct-AG_v5_3_split02_all_mm_tune_olora256_512_llm/config.json b/llavaqwen2-google_siglip-so400m-patch14-384-Qwen_Qwen2-7B-Instruct-AG_v5_3_split02_all_mm_tune_olora256_512_llm/config.json new file mode 100644 index 0000000000000000000000000000000000000000..1a67f02d0063c3de7740207b9ab2a3eb7be1cbe3 --- /dev/null +++ b/llavaqwen2-google_siglip-so400m-patch14-384-Qwen_Qwen2-7B-Instruct-AG_v5_3_split02_all_mm_tune_olora256_512_llm/config.json @@ -0,0 +1,221 @@ +{ + "_name_or_path": "lmms-lab/LLaVA-Video-7B-Qwen2", + "add_faster_video": false, + "add_time_instruction": true, + "architectures": [ + "LlavaQwenForCausalLM" + ], + "attention_dropout": 0.0, + "bos_token_id": 151643, + "eos_token_id": 151645, + "faster_token_stride": 10, + "force_sample": true, + "hidden_act": "silu", + "hidden_size": 3584, + "ignore_index": -100, + "image_aspect_ratio": "anyres_max_9", + "image_crop_resolution": null, + "image_grid_pinpoints": [ + [ + 384, + 384 + ], + [ + 384, + 768 + ], + [ + 384, + 1152 + ], + [ + 384, + 1536 + ], + [ + 384, + 1920 + ], + [ + 384, + 2304 + ], + [ + 768, + 384 + ], + [ + 768, + 768 + ], + [ + 768, + 1152 + ], + [ + 768, + 1536 + ], + [ + 768, + 1920 + ], + [ + 768, + 2304 + ], + [ + 1152, + 384 + ], + [ + 1152, + 768 + ], + [ + 1152, + 1152 + ], + [ + 1152, + 1536 + ], + [ + 1152, + 1920 + ], + [ + 1152, + 2304 + ], + [ + 1536, + 384 + ], + [ + 1536, + 768 + ], + [ + 1536, + 1152 + ], + [ + 1536, + 1536 + ], + [ + 1536, + 1920 + ], + [ + 1536, + 2304 + ], + [ + 1920, + 384 + ], + [ + 1920, + 768 + ], + [ + 1920, + 1152 + ], + [ + 1920, + 1536 + ], + [ + 1920, + 1920 + ], + [ + 1920, + 2304 + ], + [ + 2304, + 384 + ], + [ + 2304, + 768 + ], + [ + 2304, + 1152 + ], + [ + 2304, + 1536 + ], + [ + 2304, + 1920 + ], + [ + 2304, + 2304 + ] + ], + "image_split_resolution": null, + "image_token_index": 151646, + "initializer_range": 0.02, + "intermediate_size": 18944, + "max_position_embeddings": 32768, + "max_window_layers": 28, + "mm_hidden_size": 1152, + "mm_newline_position": "grid", + "mm_patch_merge_type": "spatial_unpad", + "mm_projector_lr": 2e-05, + "mm_projector_type": "mlp2x_gelu", + "mm_resampler_type": null, + "mm_spatial_pool_mode": "bilinear", + "mm_spatial_pool_stride": 2, + "mm_tunable_parts": "mm_vision_tower,mm_mlp_adapter,mm_language_model", + "mm_use_im_patch_token": false, + "mm_use_im_start_end": false, + "mm_vision_select_feature": "patch", + "mm_vision_select_layer": -2, + "mm_vision_tower": "google/siglip-so400m-patch14-384", + "mm_vision_tower_lr": null, + "model_type": "llava", + "num_attention_heads": 28, + "num_hidden_layers": 28, + "num_key_value_heads": 4, + "pos_skipping_range": 4096, + "projector_hidden_act": "gelu", + "rms_norm_eps": 1e-06, + "rope_scaling": null, + "rope_theta": 1000000.0, + "sliding_window": 131072, + "text_config": { + "model_type": "llama" + }, + "tie_word_embeddings": false, + "tokenizer_model_max_length": 32768, + "tokenizer_padding_side": "right", + "torch_dtype": "bfloat16", + "transformers_version": "4.40.0.dev0", + "use_cache": true, + "use_mm_proj": true, + "use_pos_skipping": false, + "use_sliding_window": false, + "vision_config": { + "hidden_size": 1024, + "image_size": 336, + "intermediate_size": 4096, + "model_type": "clip_vision_model", + "num_attention_heads": 16, + "num_hidden_layers": 24, + "patch_size": 14, + "projection_dim": 768, + "vocab_size": 32000 + }, + "vision_feature_layer": -2, + "vision_feature_select_strategy": "default", + "vision_tower_pretrained": null +} diff --git a/llavaqwen2-google_siglip-so400m-patch14-384-Qwen_Qwen2-7B-Instruct-AG_v5_3_split02_all_mm_tune_olora256_512_llm/generation_config.json b/llavaqwen2-google_siglip-so400m-patch14-384-Qwen_Qwen2-7B-Instruct-AG_v5_3_split02_all_mm_tune_olora256_512_llm/generation_config.json new file mode 100644 index 0000000000000000000000000000000000000000..19a297221acb87418d4388a3decef2282c6d7316 --- /dev/null +++ b/llavaqwen2-google_siglip-so400m-patch14-384-Qwen_Qwen2-7B-Instruct-AG_v5_3_split02_all_mm_tune_olora256_512_llm/generation_config.json @@ -0,0 +1,14 @@ +{ + "bos_token_id": 151643, + "do_sample": true, + "eos_token_id": [ + 151645, + 151643 + ], + "pad_token_id": 151643, + "repetition_penalty": 1.05, + "temperature": 0.7, + "top_k": 20, + "top_p": 0.8, + "transformers_version": "4.40.0.dev0" +} diff --git a/llavaqwen2-google_siglip-so400m-patch14-384-Qwen_Qwen2-7B-Instruct-AG_v5_3_split02_all_mm_tune_olora256_512_llm/non_lora_trainables.bin b/llavaqwen2-google_siglip-so400m-patch14-384-Qwen_Qwen2-7B-Instruct-AG_v5_3_split02_all_mm_tune_olora256_512_llm/non_lora_trainables.bin new file mode 100644 index 0000000000000000000000000000000000000000..d0650a24e63145778fe77dc7c6625b87a067eab7 --- /dev/null +++ b/llavaqwen2-google_siglip-so400m-patch14-384-Qwen_Qwen2-7B-Instruct-AG_v5_3_split02_all_mm_tune_olora256_512_llm/non_lora_trainables.bin @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:7dad0f447903e090b103776926817a34a460998c74753362a5680b0f8b59028a +size 33964208 diff --git a/llavaqwen2-google_siglip-so400m-patch14-384-Qwen_Qwen2-7B-Instruct-AG_v5_3_split02_all_mm_tune_olora256_512_llm/trainer_state.json b/llavaqwen2-google_siglip-so400m-patch14-384-Qwen_Qwen2-7B-Instruct-AG_v5_3_split02_all_mm_tune_olora256_512_llm/trainer_state.json new file mode 100644 index 0000000000000000000000000000000000000000..5848f7926361e1f5f77a5f31915c9b98745fc882 --- /dev/null +++ b/llavaqwen2-google_siglip-so400m-patch14-384-Qwen_Qwen2-7B-Instruct-AG_v5_3_split02_all_mm_tune_olora256_512_llm/trainer_state.json @@ -0,0 +1,83106 @@ +{ + "best_metric": null, + "best_model_checkpoint": null, + "epoch": 1.0, + "eval_steps": 500, + "global_step": 11868, + "is_hyper_param_search": false, + "is_local_process_zero": true, + "is_world_process_zero": true, + "log_history": [ + { + "epoch": 0.0, + "grad_norm": 2.378236731435547, + "learning_rate": 2.8011204481792718e-08, + "loss": 0.6801, + "step": 1 + }, + { + "epoch": 0.0, + "grad_norm": 2.722609869464893, + "learning_rate": 5.6022408963585437e-08, + "loss": 0.7218, + "step": 2 + }, + { + "epoch": 0.0, + "grad_norm": 1.847724876204175, + "learning_rate": 8.403361344537815e-08, + "loss": 0.4376, + "step": 3 + }, + { + "epoch": 0.0, + "grad_norm": 1.5297435835569315, + "learning_rate": 1.1204481792717087e-07, + "loss": 0.4518, + "step": 4 + }, + { + "epoch": 0.0, + "grad_norm": 2.4594532441008154, + "learning_rate": 1.400560224089636e-07, + "loss": 0.5368, + "step": 5 + }, + { + "epoch": 0.0, + "grad_norm": 2.342484129298085, + "learning_rate": 1.680672268907563e-07, + "loss": 0.7457, + "step": 6 + }, + { + "epoch": 0.0, + "grad_norm": 1.7258716398365765, + "learning_rate": 1.9607843137254904e-07, + "loss": 0.4883, + "step": 7 + }, + { + "epoch": 0.0, + "grad_norm": 2.374800919761394, + "learning_rate": 2.2408963585434175e-07, + "loss": 0.6331, + "step": 8 + }, + { + "epoch": 0.0, + "grad_norm": 1.931840058665886, + "learning_rate": 2.5210084033613445e-07, + "loss": 0.5522, + "step": 9 + }, + { + "epoch": 0.0, + "grad_norm": 1.5580990973108273, + "learning_rate": 2.801120448179272e-07, + "loss": 0.4187, + "step": 10 + }, + { + "epoch": 0.0, + "grad_norm": 2.7699991769572647, + "learning_rate": 3.081232492997199e-07, + "loss": 0.7044, + "step": 11 + }, + { + "epoch": 0.0, + "grad_norm": 1.8607488429310421, + "learning_rate": 3.361344537815126e-07, + "loss": 0.4766, + "step": 12 + }, + { + "epoch": 0.0, + "grad_norm": 2.314201389132091, + "learning_rate": 3.641456582633054e-07, + "loss": 0.5408, + "step": 13 + }, + { + "epoch": 0.0, + "grad_norm": 1.7867487593176063, + "learning_rate": 3.921568627450981e-07, + "loss": 0.4655, + "step": 14 + }, + { + "epoch": 0.0, + "grad_norm": 2.083342201941341, + "learning_rate": 4.201680672268908e-07, + "loss": 0.5379, + "step": 15 + }, + { + "epoch": 0.0, + "grad_norm": 2.98994471655251, + "learning_rate": 4.481792717086835e-07, + "loss": 0.762, + "step": 16 + }, + { + "epoch": 0.0, + "grad_norm": 2.3005144423857664, + "learning_rate": 4.7619047619047623e-07, + "loss": 0.6111, + "step": 17 + }, + { + "epoch": 0.0, + "grad_norm": 3.28068264709896, + "learning_rate": 5.042016806722689e-07, + "loss": 0.879, + "step": 18 + }, + { + "epoch": 0.0, + "grad_norm": 1.669347349205939, + "learning_rate": 5.322128851540616e-07, + "loss": 0.4334, + "step": 19 + }, + { + "epoch": 0.0, + "grad_norm": 2.274896716929672, + "learning_rate": 5.602240896358544e-07, + "loss": 0.6438, + "step": 20 + }, + { + "epoch": 0.0, + "grad_norm": 0.8954774848253007, + "learning_rate": 5.882352941176471e-07, + "loss": 0.2637, + "step": 21 + }, + { + "epoch": 0.0, + "grad_norm": 3.5306507550466755, + "learning_rate": 6.162464985994398e-07, + "loss": 0.9185, + "step": 22 + }, + { + "epoch": 0.0, + "grad_norm": 2.5144969672407966, + "learning_rate": 6.442577030812325e-07, + "loss": 0.7177, + "step": 23 + }, + { + "epoch": 0.0, + "grad_norm": 1.5397609499677072, + "learning_rate": 6.722689075630252e-07, + "loss": 0.4216, + "step": 24 + }, + { + "epoch": 0.0, + "grad_norm": 2.147361661302807, + "learning_rate": 7.002801120448179e-07, + "loss": 0.6331, + "step": 25 + }, + { + "epoch": 0.0, + "grad_norm": 3.0814448143364994, + "learning_rate": 7.282913165266108e-07, + "loss": 0.7564, + "step": 26 + }, + { + "epoch": 0.0, + "grad_norm": 2.02008979730368, + "learning_rate": 7.563025210084034e-07, + "loss": 0.4948, + "step": 27 + }, + { + "epoch": 0.0, + "grad_norm": 1.8232678476497801, + "learning_rate": 7.843137254901962e-07, + "loss": 0.5, + "step": 28 + }, + { + "epoch": 0.0, + "grad_norm": 2.3284249466759057, + "learning_rate": 8.123249299719889e-07, + "loss": 0.5935, + "step": 29 + }, + { + "epoch": 0.0, + "grad_norm": 2.223237815984025, + "learning_rate": 8.403361344537816e-07, + "loss": 0.4647, + "step": 30 + }, + { + "epoch": 0.0, + "grad_norm": 3.1156301417954375, + "learning_rate": 8.683473389355742e-07, + "loss": 0.9354, + "step": 31 + }, + { + "epoch": 0.0, + "grad_norm": 1.3338272351732452, + "learning_rate": 8.96358543417367e-07, + "loss": 0.386, + "step": 32 + }, + { + "epoch": 0.0, + "grad_norm": 3.214014387713159, + "learning_rate": 9.243697478991598e-07, + "loss": 0.8075, + "step": 33 + }, + { + "epoch": 0.0, + "grad_norm": 2.77956706524366, + "learning_rate": 9.523809523809525e-07, + "loss": 0.7149, + "step": 34 + }, + { + "epoch": 0.0, + "grad_norm": 1.9983819491132484, + "learning_rate": 9.80392156862745e-07, + "loss": 0.5854, + "step": 35 + }, + { + "epoch": 0.0, + "grad_norm": 1.6280142558712318, + "learning_rate": 1.0084033613445378e-06, + "loss": 0.4886, + "step": 36 + }, + { + "epoch": 0.0, + "grad_norm": 3.1777859145289016, + "learning_rate": 1.0364145658263308e-06, + "loss": 0.8269, + "step": 37 + }, + { + "epoch": 0.0, + "grad_norm": 1.9289542432724536, + "learning_rate": 1.0644257703081233e-06, + "loss": 0.641, + "step": 38 + }, + { + "epoch": 0.0, + "grad_norm": 1.555043912185022, + "learning_rate": 1.092436974789916e-06, + "loss": 0.4924, + "step": 39 + }, + { + "epoch": 0.0, + "grad_norm": 1.1517709724637335, + "learning_rate": 1.1204481792717088e-06, + "loss": 0.3612, + "step": 40 + }, + { + "epoch": 0.0, + "grad_norm": 1.3090280563058787, + "learning_rate": 1.1484593837535015e-06, + "loss": 0.3707, + "step": 41 + }, + { + "epoch": 0.0, + "grad_norm": 1.5348881241692107, + "learning_rate": 1.1764705882352942e-06, + "loss": 0.4788, + "step": 42 + }, + { + "epoch": 0.0, + "grad_norm": 1.1642073571650549, + "learning_rate": 1.204481792717087e-06, + "loss": 0.3499, + "step": 43 + }, + { + "epoch": 0.0, + "grad_norm": 2.561620555582239, + "learning_rate": 1.2324929971988797e-06, + "loss": 0.6534, + "step": 44 + }, + { + "epoch": 0.0, + "grad_norm": 1.5200680863783844, + "learning_rate": 1.2605042016806724e-06, + "loss": 0.4172, + "step": 45 + }, + { + "epoch": 0.0, + "grad_norm": 2.0672131028851704, + "learning_rate": 1.288515406162465e-06, + "loss": 0.5117, + "step": 46 + }, + { + "epoch": 0.0, + "grad_norm": 1.3267050525804174, + "learning_rate": 1.316526610644258e-06, + "loss": 0.392, + "step": 47 + }, + { + "epoch": 0.0, + "grad_norm": 1.6082344603552319, + "learning_rate": 1.3445378151260504e-06, + "loss": 0.386, + "step": 48 + }, + { + "epoch": 0.0, + "grad_norm": 1.5539462317860868, + "learning_rate": 1.3725490196078434e-06, + "loss": 0.3556, + "step": 49 + }, + { + "epoch": 0.0, + "grad_norm": 1.309493873051941, + "learning_rate": 1.4005602240896359e-06, + "loss": 0.2866, + "step": 50 + }, + { + "epoch": 0.0, + "grad_norm": 1.5679101753087994, + "learning_rate": 1.4285714285714286e-06, + "loss": 0.3349, + "step": 51 + }, + { + "epoch": 0.0, + "grad_norm": 1.4037561690173512, + "learning_rate": 1.4565826330532216e-06, + "loss": 0.3631, + "step": 52 + }, + { + "epoch": 0.0, + "grad_norm": 1.149439501665514, + "learning_rate": 1.484593837535014e-06, + "loss": 0.3144, + "step": 53 + }, + { + "epoch": 0.0, + "grad_norm": 0.9156445398777989, + "learning_rate": 1.5126050420168068e-06, + "loss": 0.2594, + "step": 54 + }, + { + "epoch": 0.0, + "grad_norm": 0.8554957368940027, + "learning_rate": 1.5406162464985996e-06, + "loss": 0.2235, + "step": 55 + }, + { + "epoch": 0.0, + "grad_norm": 1.304353447332407, + "learning_rate": 1.5686274509803923e-06, + "loss": 0.3232, + "step": 56 + }, + { + "epoch": 0.0, + "grad_norm": 1.9474417020162798, + "learning_rate": 1.5966386554621848e-06, + "loss": 0.2953, + "step": 57 + }, + { + "epoch": 0.0, + "grad_norm": 0.9925423302626352, + "learning_rate": 1.6246498599439778e-06, + "loss": 0.2353, + "step": 58 + }, + { + "epoch": 0.0, + "grad_norm": 1.806722828112331, + "learning_rate": 1.6526610644257705e-06, + "loss": 0.4393, + "step": 59 + }, + { + "epoch": 0.01, + "grad_norm": 1.3421913952877822, + "learning_rate": 1.6806722689075632e-06, + "loss": 0.2885, + "step": 60 + }, + { + "epoch": 0.01, + "grad_norm": 1.629878207455927, + "learning_rate": 1.708683473389356e-06, + "loss": 0.3663, + "step": 61 + }, + { + "epoch": 0.01, + "grad_norm": 1.1623322808354322, + "learning_rate": 1.7366946778711485e-06, + "loss": 0.2524, + "step": 62 + }, + { + "epoch": 0.01, + "grad_norm": 1.2713641551250345, + "learning_rate": 1.7647058823529414e-06, + "loss": 0.3329, + "step": 63 + }, + { + "epoch": 0.01, + "grad_norm": 1.6220897652081994, + "learning_rate": 1.792717086834734e-06, + "loss": 0.2722, + "step": 64 + }, + { + "epoch": 0.01, + "grad_norm": 1.480713887381015, + "learning_rate": 1.8207282913165267e-06, + "loss": 0.2783, + "step": 65 + }, + { + "epoch": 0.01, + "grad_norm": 1.3330905139520488, + "learning_rate": 1.8487394957983196e-06, + "loss": 0.2439, + "step": 66 + }, + { + "epoch": 0.01, + "grad_norm": 0.8873443176784379, + "learning_rate": 1.8767507002801122e-06, + "loss": 0.157, + "step": 67 + }, + { + "epoch": 0.01, + "grad_norm": 1.165198341695851, + "learning_rate": 1.904761904761905e-06, + "loss": 0.3187, + "step": 68 + }, + { + "epoch": 0.01, + "grad_norm": 1.6964579458207625, + "learning_rate": 1.932773109243698e-06, + "loss": 0.3121, + "step": 69 + }, + { + "epoch": 0.01, + "grad_norm": 1.1187192308380378, + "learning_rate": 1.96078431372549e-06, + "loss": 0.2171, + "step": 70 + }, + { + "epoch": 0.01, + "grad_norm": 1.0276764807060643, + "learning_rate": 1.988795518207283e-06, + "loss": 0.2625, + "step": 71 + }, + { + "epoch": 0.01, + "grad_norm": 1.9027873230036005, + "learning_rate": 2.0168067226890756e-06, + "loss": 0.3217, + "step": 72 + }, + { + "epoch": 0.01, + "grad_norm": 1.5165163685998264, + "learning_rate": 2.0448179271708684e-06, + "loss": 0.2961, + "step": 73 + }, + { + "epoch": 0.01, + "grad_norm": 1.1473792353335586, + "learning_rate": 2.0728291316526615e-06, + "loss": 0.2281, + "step": 74 + }, + { + "epoch": 0.01, + "grad_norm": 0.9349009675846548, + "learning_rate": 2.100840336134454e-06, + "loss": 0.2056, + "step": 75 + }, + { + "epoch": 0.01, + "grad_norm": 1.0032690191243625, + "learning_rate": 2.1288515406162466e-06, + "loss": 0.1932, + "step": 76 + }, + { + "epoch": 0.01, + "grad_norm": 1.4498611743782313, + "learning_rate": 2.1568627450980393e-06, + "loss": 0.2399, + "step": 77 + }, + { + "epoch": 0.01, + "grad_norm": 1.5705516137473403, + "learning_rate": 2.184873949579832e-06, + "loss": 0.1891, + "step": 78 + }, + { + "epoch": 0.01, + "grad_norm": 1.3027285742443064, + "learning_rate": 2.2128851540616248e-06, + "loss": 0.2051, + "step": 79 + }, + { + "epoch": 0.01, + "grad_norm": 4.008999876619946, + "learning_rate": 2.2408963585434175e-06, + "loss": 0.4214, + "step": 80 + }, + { + "epoch": 0.01, + "grad_norm": 1.283035163456896, + "learning_rate": 2.2689075630252102e-06, + "loss": 0.1946, + "step": 81 + }, + { + "epoch": 0.01, + "grad_norm": 1.577756531888979, + "learning_rate": 2.296918767507003e-06, + "loss": 0.1838, + "step": 82 + }, + { + "epoch": 0.01, + "grad_norm": 1.7382106351764461, + "learning_rate": 2.3249299719887957e-06, + "loss": 0.217, + "step": 83 + }, + { + "epoch": 0.01, + "grad_norm": 1.2549335402945008, + "learning_rate": 2.3529411764705885e-06, + "loss": 0.1845, + "step": 84 + }, + { + "epoch": 0.01, + "grad_norm": 1.0818358619557094, + "learning_rate": 2.380952380952381e-06, + "loss": 0.2088, + "step": 85 + }, + { + "epoch": 0.01, + "grad_norm": 1.1092305795090334, + "learning_rate": 2.408963585434174e-06, + "loss": 0.223, + "step": 86 + }, + { + "epoch": 0.01, + "grad_norm": 1.0046575088900542, + "learning_rate": 2.4369747899159667e-06, + "loss": 0.1588, + "step": 87 + }, + { + "epoch": 0.01, + "grad_norm": 1.543139589180529, + "learning_rate": 2.4649859943977594e-06, + "loss": 0.25, + "step": 88 + }, + { + "epoch": 0.01, + "grad_norm": 1.0862284836177165, + "learning_rate": 2.492997198879552e-06, + "loss": 0.1834, + "step": 89 + }, + { + "epoch": 0.01, + "grad_norm": 1.0951286709007708, + "learning_rate": 2.521008403361345e-06, + "loss": 0.1843, + "step": 90 + }, + { + "epoch": 0.01, + "grad_norm": 1.0854365027544433, + "learning_rate": 2.549019607843137e-06, + "loss": 0.1839, + "step": 91 + }, + { + "epoch": 0.01, + "grad_norm": 0.8741260935022611, + "learning_rate": 2.57703081232493e-06, + "loss": 0.1462, + "step": 92 + }, + { + "epoch": 0.01, + "grad_norm": 1.900664319990146, + "learning_rate": 2.605042016806723e-06, + "loss": 0.2132, + "step": 93 + }, + { + "epoch": 0.01, + "grad_norm": 1.002053960580304, + "learning_rate": 2.633053221288516e-06, + "loss": 0.0995, + "step": 94 + }, + { + "epoch": 0.01, + "grad_norm": 1.070235803762512, + "learning_rate": 2.6610644257703085e-06, + "loss": 0.1864, + "step": 95 + }, + { + "epoch": 0.01, + "grad_norm": 1.994410676021949, + "learning_rate": 2.689075630252101e-06, + "loss": 0.2112, + "step": 96 + }, + { + "epoch": 0.01, + "grad_norm": 0.8435942764081367, + "learning_rate": 2.7170868347338936e-06, + "loss": 0.1225, + "step": 97 + }, + { + "epoch": 0.01, + "grad_norm": 1.5227951998275673, + "learning_rate": 2.7450980392156867e-06, + "loss": 0.1688, + "step": 98 + }, + { + "epoch": 0.01, + "grad_norm": 9.569301304363513, + "learning_rate": 2.7731092436974795e-06, + "loss": 0.2324, + "step": 99 + }, + { + "epoch": 0.01, + "grad_norm": 1.122581084598405, + "learning_rate": 2.8011204481792718e-06, + "loss": 0.1808, + "step": 100 + }, + { + "epoch": 0.01, + "grad_norm": 0.7171633676092366, + "learning_rate": 2.8291316526610645e-06, + "loss": 0.1331, + "step": 101 + }, + { + "epoch": 0.01, + "grad_norm": 1.9068398782740705, + "learning_rate": 2.8571428571428573e-06, + "loss": 0.2535, + "step": 102 + }, + { + "epoch": 0.01, + "grad_norm": 1.0038063649347526, + "learning_rate": 2.88515406162465e-06, + "loss": 0.1292, + "step": 103 + }, + { + "epoch": 0.01, + "grad_norm": 1.49553597269004, + "learning_rate": 2.913165266106443e-06, + "loss": 0.1794, + "step": 104 + }, + { + "epoch": 0.01, + "grad_norm": 0.9147845385812657, + "learning_rate": 2.9411764705882355e-06, + "loss": 0.1347, + "step": 105 + }, + { + "epoch": 0.01, + "grad_norm": 1.0340199089798479, + "learning_rate": 2.969187675070028e-06, + "loss": 0.2023, + "step": 106 + }, + { + "epoch": 0.01, + "grad_norm": 0.9518522137890731, + "learning_rate": 2.997198879551821e-06, + "loss": 0.1584, + "step": 107 + }, + { + "epoch": 0.01, + "grad_norm": 1.3008290897927282, + "learning_rate": 3.0252100840336137e-06, + "loss": 0.1693, + "step": 108 + }, + { + "epoch": 0.01, + "grad_norm": 0.7663463604846564, + "learning_rate": 3.053221288515407e-06, + "loss": 0.1533, + "step": 109 + }, + { + "epoch": 0.01, + "grad_norm": 1.1969616595733, + "learning_rate": 3.081232492997199e-06, + "loss": 0.2056, + "step": 110 + }, + { + "epoch": 0.01, + "grad_norm": 3.3165158068159264, + "learning_rate": 3.109243697478992e-06, + "loss": 0.165, + "step": 111 + }, + { + "epoch": 0.01, + "grad_norm": 0.6738693608018019, + "learning_rate": 3.1372549019607846e-06, + "loss": 0.1323, + "step": 112 + }, + { + "epoch": 0.01, + "grad_norm": 2.3285311317453075, + "learning_rate": 3.1652661064425773e-06, + "loss": 0.2615, + "step": 113 + }, + { + "epoch": 0.01, + "grad_norm": 1.7602131341712919, + "learning_rate": 3.1932773109243696e-06, + "loss": 0.2569, + "step": 114 + }, + { + "epoch": 0.01, + "grad_norm": 0.5686472084206783, + "learning_rate": 3.221288515406163e-06, + "loss": 0.0984, + "step": 115 + }, + { + "epoch": 0.01, + "grad_norm": 1.0151729646603311, + "learning_rate": 3.2492997198879555e-06, + "loss": 0.1424, + "step": 116 + }, + { + "epoch": 0.01, + "grad_norm": 1.3321196500506631, + "learning_rate": 3.2773109243697483e-06, + "loss": 0.2019, + "step": 117 + }, + { + "epoch": 0.01, + "grad_norm": 0.7005549079921075, + "learning_rate": 3.305322128851541e-06, + "loss": 0.0449, + "step": 118 + }, + { + "epoch": 0.01, + "grad_norm": 0.9280575675822933, + "learning_rate": 3.3333333333333333e-06, + "loss": 0.094, + "step": 119 + }, + { + "epoch": 0.01, + "grad_norm": 0.8187209837179708, + "learning_rate": 3.3613445378151265e-06, + "loss": 0.1091, + "step": 120 + }, + { + "epoch": 0.01, + "grad_norm": 0.7228045751843665, + "learning_rate": 3.3893557422969192e-06, + "loss": 0.1129, + "step": 121 + }, + { + "epoch": 0.01, + "grad_norm": 1.1373406248756526, + "learning_rate": 3.417366946778712e-06, + "loss": 0.1797, + "step": 122 + }, + { + "epoch": 0.01, + "grad_norm": 0.9688545946817783, + "learning_rate": 3.4453781512605043e-06, + "loss": 0.1431, + "step": 123 + }, + { + "epoch": 0.01, + "grad_norm": 0.9018344392656461, + "learning_rate": 3.473389355742297e-06, + "loss": 0.1278, + "step": 124 + }, + { + "epoch": 0.01, + "grad_norm": 0.692740577546455, + "learning_rate": 3.5014005602240897e-06, + "loss": 0.1488, + "step": 125 + }, + { + "epoch": 0.01, + "grad_norm": 1.1873256092562878, + "learning_rate": 3.529411764705883e-06, + "loss": 0.1357, + "step": 126 + }, + { + "epoch": 0.01, + "grad_norm": 0.627583002919607, + "learning_rate": 3.5574229691876756e-06, + "loss": 0.1115, + "step": 127 + }, + { + "epoch": 0.01, + "grad_norm": 0.6408287701248372, + "learning_rate": 3.585434173669468e-06, + "loss": 0.1252, + "step": 128 + }, + { + "epoch": 0.01, + "grad_norm": 0.7829349104719872, + "learning_rate": 3.6134453781512607e-06, + "loss": 0.1353, + "step": 129 + }, + { + "epoch": 0.01, + "grad_norm": 0.8150973685572598, + "learning_rate": 3.6414565826330534e-06, + "loss": 0.1319, + "step": 130 + }, + { + "epoch": 0.01, + "grad_norm": 0.909265927877615, + "learning_rate": 3.669467787114846e-06, + "loss": 0.0976, + "step": 131 + }, + { + "epoch": 0.01, + "grad_norm": 0.6399903356863155, + "learning_rate": 3.6974789915966393e-06, + "loss": 0.0924, + "step": 132 + }, + { + "epoch": 0.01, + "grad_norm": 0.9810165314757194, + "learning_rate": 3.7254901960784316e-06, + "loss": 0.1374, + "step": 133 + }, + { + "epoch": 0.01, + "grad_norm": 1.1169033864692144, + "learning_rate": 3.7535014005602243e-06, + "loss": 0.1201, + "step": 134 + }, + { + "epoch": 0.01, + "grad_norm": 1.0230195969558176, + "learning_rate": 3.781512605042017e-06, + "loss": 0.1623, + "step": 135 + }, + { + "epoch": 0.01, + "grad_norm": 0.8201776917789521, + "learning_rate": 3.80952380952381e-06, + "loss": 0.1038, + "step": 136 + }, + { + "epoch": 0.01, + "grad_norm": 0.7854349579918295, + "learning_rate": 3.8375350140056026e-06, + "loss": 0.1404, + "step": 137 + }, + { + "epoch": 0.01, + "grad_norm": 0.9473898549020787, + "learning_rate": 3.865546218487396e-06, + "loss": 0.1913, + "step": 138 + }, + { + "epoch": 0.01, + "grad_norm": 1.0816682569185128, + "learning_rate": 3.893557422969188e-06, + "loss": 0.145, + "step": 139 + }, + { + "epoch": 0.01, + "grad_norm": 0.8972569274975063, + "learning_rate": 3.92156862745098e-06, + "loss": 0.1494, + "step": 140 + }, + { + "epoch": 0.01, + "grad_norm": 0.7977795459044014, + "learning_rate": 3.9495798319327735e-06, + "loss": 0.1063, + "step": 141 + }, + { + "epoch": 0.01, + "grad_norm": 1.4936965473968211, + "learning_rate": 3.977591036414566e-06, + "loss": 0.1487, + "step": 142 + }, + { + "epoch": 0.01, + "grad_norm": 1.3648362102115557, + "learning_rate": 4.005602240896359e-06, + "loss": 0.1043, + "step": 143 + }, + { + "epoch": 0.01, + "grad_norm": 4.660370415184799, + "learning_rate": 4.033613445378151e-06, + "loss": 0.1203, + "step": 144 + }, + { + "epoch": 0.01, + "grad_norm": 0.9550080631057337, + "learning_rate": 4.0616246498599444e-06, + "loss": 0.1464, + "step": 145 + }, + { + "epoch": 0.01, + "grad_norm": 1.2467657727213688, + "learning_rate": 4.089635854341737e-06, + "loss": 0.1712, + "step": 146 + }, + { + "epoch": 0.01, + "grad_norm": 0.9738315271510968, + "learning_rate": 4.11764705882353e-06, + "loss": 0.1435, + "step": 147 + }, + { + "epoch": 0.01, + "grad_norm": 1.2557624756002042, + "learning_rate": 4.145658263305323e-06, + "loss": 0.1748, + "step": 148 + }, + { + "epoch": 0.01, + "grad_norm": 1.1322387036992898, + "learning_rate": 4.173669467787115e-06, + "loss": 0.1507, + "step": 149 + }, + { + "epoch": 0.01, + "grad_norm": 0.7925609196162391, + "learning_rate": 4.201680672268908e-06, + "loss": 0.1517, + "step": 150 + }, + { + "epoch": 0.01, + "grad_norm": 0.9363183396165131, + "learning_rate": 4.229691876750701e-06, + "loss": 0.1182, + "step": 151 + }, + { + "epoch": 0.01, + "grad_norm": 1.036075343362631, + "learning_rate": 4.257703081232493e-06, + "loss": 0.1257, + "step": 152 + }, + { + "epoch": 0.01, + "grad_norm": 2.292902569474106, + "learning_rate": 4.2857142857142855e-06, + "loss": 0.1682, + "step": 153 + }, + { + "epoch": 0.01, + "grad_norm": 1.1189764303753709, + "learning_rate": 4.313725490196079e-06, + "loss": 0.1104, + "step": 154 + }, + { + "epoch": 0.01, + "grad_norm": 0.8997710102528754, + "learning_rate": 4.341736694677872e-06, + "loss": 0.1483, + "step": 155 + }, + { + "epoch": 0.01, + "grad_norm": 1.3888439514053874, + "learning_rate": 4.369747899159664e-06, + "loss": 0.1388, + "step": 156 + }, + { + "epoch": 0.01, + "grad_norm": 1.6241812478721267, + "learning_rate": 4.397759103641457e-06, + "loss": 0.1529, + "step": 157 + }, + { + "epoch": 0.01, + "grad_norm": 1.0750061034225609, + "learning_rate": 4.4257703081232496e-06, + "loss": 0.1183, + "step": 158 + }, + { + "epoch": 0.01, + "grad_norm": 1.1806226563310234, + "learning_rate": 4.453781512605043e-06, + "loss": 0.1158, + "step": 159 + }, + { + "epoch": 0.01, + "grad_norm": 1.012194765371845, + "learning_rate": 4.481792717086835e-06, + "loss": 0.1374, + "step": 160 + }, + { + "epoch": 0.01, + "grad_norm": 1.491833971454559, + "learning_rate": 4.509803921568628e-06, + "loss": 0.1667, + "step": 161 + }, + { + "epoch": 0.01, + "grad_norm": 0.6814837935597085, + "learning_rate": 4.5378151260504205e-06, + "loss": 0.1395, + "step": 162 + }, + { + "epoch": 0.01, + "grad_norm": 1.0534935397219367, + "learning_rate": 4.565826330532213e-06, + "loss": 0.1329, + "step": 163 + }, + { + "epoch": 0.01, + "grad_norm": 2.181140819390502, + "learning_rate": 4.593837535014006e-06, + "loss": 0.137, + "step": 164 + }, + { + "epoch": 0.01, + "grad_norm": 0.6721444437160121, + "learning_rate": 4.621848739495799e-06, + "loss": 0.1604, + "step": 165 + }, + { + "epoch": 0.01, + "grad_norm": 1.1232194654735719, + "learning_rate": 4.6498599439775914e-06, + "loss": 0.1325, + "step": 166 + }, + { + "epoch": 0.01, + "grad_norm": 0.9063000227821515, + "learning_rate": 4.677871148459384e-06, + "loss": 0.1329, + "step": 167 + }, + { + "epoch": 0.01, + "grad_norm": 0.7880981332151767, + "learning_rate": 4.705882352941177e-06, + "loss": 0.0799, + "step": 168 + }, + { + "epoch": 0.01, + "grad_norm": 0.75882075731128, + "learning_rate": 4.733893557422969e-06, + "loss": 0.1193, + "step": 169 + }, + { + "epoch": 0.01, + "grad_norm": 0.707775289332377, + "learning_rate": 4.761904761904762e-06, + "loss": 0.1268, + "step": 170 + }, + { + "epoch": 0.01, + "grad_norm": 0.8654630841941064, + "learning_rate": 4.7899159663865555e-06, + "loss": 0.1466, + "step": 171 + }, + { + "epoch": 0.01, + "grad_norm": 1.4331089362823357, + "learning_rate": 4.817927170868348e-06, + "loss": 0.1496, + "step": 172 + }, + { + "epoch": 0.01, + "grad_norm": 0.7788799339916578, + "learning_rate": 4.84593837535014e-06, + "loss": 0.1042, + "step": 173 + }, + { + "epoch": 0.01, + "grad_norm": 2.851418831279099, + "learning_rate": 4.873949579831933e-06, + "loss": 0.2319, + "step": 174 + }, + { + "epoch": 0.01, + "grad_norm": 0.7548200282053826, + "learning_rate": 4.901960784313726e-06, + "loss": 0.0723, + "step": 175 + }, + { + "epoch": 0.01, + "grad_norm": 0.8352632652003819, + "learning_rate": 4.929971988795519e-06, + "loss": 0.1174, + "step": 176 + }, + { + "epoch": 0.01, + "grad_norm": 1.0768429613158856, + "learning_rate": 4.957983193277311e-06, + "loss": 0.0984, + "step": 177 + }, + { + "epoch": 0.01, + "grad_norm": 0.6860551834344599, + "learning_rate": 4.985994397759104e-06, + "loss": 0.1275, + "step": 178 + }, + { + "epoch": 0.02, + "grad_norm": 1.5072187058489912, + "learning_rate": 5.0140056022408966e-06, + "loss": 0.1237, + "step": 179 + }, + { + "epoch": 0.02, + "grad_norm": 0.5693532801629468, + "learning_rate": 5.04201680672269e-06, + "loss": 0.1045, + "step": 180 + }, + { + "epoch": 0.02, + "grad_norm": 1.5909399961449435, + "learning_rate": 5.070028011204482e-06, + "loss": 0.1024, + "step": 181 + }, + { + "epoch": 0.02, + "grad_norm": 1.3184753528297817, + "learning_rate": 5.098039215686274e-06, + "loss": 0.1499, + "step": 182 + }, + { + "epoch": 0.02, + "grad_norm": 0.7387783165179622, + "learning_rate": 5.1260504201680675e-06, + "loss": 0.1134, + "step": 183 + }, + { + "epoch": 0.02, + "grad_norm": 1.1874169089723954, + "learning_rate": 5.15406162464986e-06, + "loss": 0.1405, + "step": 184 + }, + { + "epoch": 0.02, + "grad_norm": 1.108865943119381, + "learning_rate": 5.182072829131654e-06, + "loss": 0.1333, + "step": 185 + }, + { + "epoch": 0.02, + "grad_norm": 1.793094650111033, + "learning_rate": 5.210084033613446e-06, + "loss": 0.1374, + "step": 186 + }, + { + "epoch": 0.02, + "grad_norm": 0.9491133219463196, + "learning_rate": 5.2380952380952384e-06, + "loss": 0.1241, + "step": 187 + }, + { + "epoch": 0.02, + "grad_norm": 1.0412064473071794, + "learning_rate": 5.266106442577032e-06, + "loss": 0.0947, + "step": 188 + }, + { + "epoch": 0.02, + "grad_norm": 0.5987650433717931, + "learning_rate": 5.294117647058824e-06, + "loss": 0.087, + "step": 189 + }, + { + "epoch": 0.02, + "grad_norm": 0.7088693304879308, + "learning_rate": 5.322128851540617e-06, + "loss": 0.0629, + "step": 190 + }, + { + "epoch": 0.02, + "grad_norm": 1.0136297711338809, + "learning_rate": 5.350140056022409e-06, + "loss": 0.0825, + "step": 191 + }, + { + "epoch": 0.02, + "grad_norm": 0.6908771420245972, + "learning_rate": 5.378151260504202e-06, + "loss": 0.1026, + "step": 192 + }, + { + "epoch": 0.02, + "grad_norm": 0.6181606512498035, + "learning_rate": 5.406162464985995e-06, + "loss": 0.0725, + "step": 193 + }, + { + "epoch": 0.02, + "grad_norm": 1.0198534062141151, + "learning_rate": 5.434173669467787e-06, + "loss": 0.1137, + "step": 194 + }, + { + "epoch": 0.02, + "grad_norm": 0.7407742939780293, + "learning_rate": 5.4621848739495795e-06, + "loss": 0.1234, + "step": 195 + }, + { + "epoch": 0.02, + "grad_norm": 0.9864376614302909, + "learning_rate": 5.4901960784313735e-06, + "loss": 0.1083, + "step": 196 + }, + { + "epoch": 0.02, + "grad_norm": 1.3105056476116779, + "learning_rate": 5.518207282913166e-06, + "loss": 0.1462, + "step": 197 + }, + { + "epoch": 0.02, + "grad_norm": 1.8500959694566836, + "learning_rate": 5.546218487394959e-06, + "loss": 0.1293, + "step": 198 + }, + { + "epoch": 0.02, + "grad_norm": 0.8049639630293183, + "learning_rate": 5.574229691876751e-06, + "loss": 0.1427, + "step": 199 + }, + { + "epoch": 0.02, + "grad_norm": 0.8175086191610695, + "learning_rate": 5.6022408963585436e-06, + "loss": 0.1326, + "step": 200 + }, + { + "epoch": 0.02, + "grad_norm": 1.0738426113487498, + "learning_rate": 5.630252100840337e-06, + "loss": 0.1197, + "step": 201 + }, + { + "epoch": 0.02, + "grad_norm": 0.9445056867957289, + "learning_rate": 5.658263305322129e-06, + "loss": 0.1176, + "step": 202 + }, + { + "epoch": 0.02, + "grad_norm": 1.0965738365735191, + "learning_rate": 5.686274509803922e-06, + "loss": 0.1501, + "step": 203 + }, + { + "epoch": 0.02, + "grad_norm": 0.5956119796110726, + "learning_rate": 5.7142857142857145e-06, + "loss": 0.0626, + "step": 204 + }, + { + "epoch": 0.02, + "grad_norm": 1.9839322325853606, + "learning_rate": 5.742296918767507e-06, + "loss": 0.1541, + "step": 205 + }, + { + "epoch": 0.02, + "grad_norm": 1.0196458103635884, + "learning_rate": 5.7703081232493e-06, + "loss": 0.1291, + "step": 206 + }, + { + "epoch": 0.02, + "grad_norm": 0.9156257111669147, + "learning_rate": 5.798319327731093e-06, + "loss": 0.0676, + "step": 207 + }, + { + "epoch": 0.02, + "grad_norm": 1.1691888058139204, + "learning_rate": 5.826330532212886e-06, + "loss": 0.1102, + "step": 208 + }, + { + "epoch": 0.02, + "grad_norm": 1.2281207163385925, + "learning_rate": 5.854341736694679e-06, + "loss": 0.1248, + "step": 209 + }, + { + "epoch": 0.02, + "grad_norm": 0.7956677084983999, + "learning_rate": 5.882352941176471e-06, + "loss": 0.1344, + "step": 210 + }, + { + "epoch": 0.02, + "grad_norm": 2.3420020663082877, + "learning_rate": 5.910364145658264e-06, + "loss": 0.173, + "step": 211 + }, + { + "epoch": 0.02, + "grad_norm": 0.9516047545964721, + "learning_rate": 5.938375350140056e-06, + "loss": 0.1542, + "step": 212 + }, + { + "epoch": 0.02, + "grad_norm": 0.5357263932212112, + "learning_rate": 5.9663865546218495e-06, + "loss": 0.1013, + "step": 213 + }, + { + "epoch": 0.02, + "grad_norm": 1.2253766818994591, + "learning_rate": 5.994397759103642e-06, + "loss": 0.1472, + "step": 214 + }, + { + "epoch": 0.02, + "grad_norm": 1.368394434203125, + "learning_rate": 6.022408963585434e-06, + "loss": 0.1352, + "step": 215 + }, + { + "epoch": 0.02, + "grad_norm": 2.564022696136383, + "learning_rate": 6.050420168067227e-06, + "loss": 0.2411, + "step": 216 + }, + { + "epoch": 0.02, + "grad_norm": 0.9832068902828558, + "learning_rate": 6.07843137254902e-06, + "loss": 0.1309, + "step": 217 + }, + { + "epoch": 0.02, + "grad_norm": 0.6125673936716415, + "learning_rate": 6.106442577030814e-06, + "loss": 0.1245, + "step": 218 + }, + { + "epoch": 0.02, + "grad_norm": 2.375934875362054, + "learning_rate": 6.134453781512606e-06, + "loss": 0.1915, + "step": 219 + }, + { + "epoch": 0.02, + "grad_norm": 1.4056305921320742, + "learning_rate": 6.162464985994398e-06, + "loss": 0.1264, + "step": 220 + }, + { + "epoch": 0.02, + "grad_norm": 0.8682730619311089, + "learning_rate": 6.1904761904761914e-06, + "loss": 0.101, + "step": 221 + }, + { + "epoch": 0.02, + "grad_norm": 0.7539398805306745, + "learning_rate": 6.218487394957984e-06, + "loss": 0.0843, + "step": 222 + }, + { + "epoch": 0.02, + "grad_norm": 1.004436289611301, + "learning_rate": 6.246498599439776e-06, + "loss": 0.1376, + "step": 223 + }, + { + "epoch": 0.02, + "grad_norm": 1.1607485522612968, + "learning_rate": 6.274509803921569e-06, + "loss": 0.1281, + "step": 224 + }, + { + "epoch": 0.02, + "grad_norm": 1.0786201456835138, + "learning_rate": 6.3025210084033615e-06, + "loss": 0.1534, + "step": 225 + }, + { + "epoch": 0.02, + "grad_norm": 0.8102842019977521, + "learning_rate": 6.330532212885155e-06, + "loss": 0.1062, + "step": 226 + }, + { + "epoch": 0.02, + "grad_norm": 2.494194292716641, + "learning_rate": 6.358543417366947e-06, + "loss": 0.1243, + "step": 227 + }, + { + "epoch": 0.02, + "grad_norm": 0.7577938538854214, + "learning_rate": 6.386554621848739e-06, + "loss": 0.0875, + "step": 228 + }, + { + "epoch": 0.02, + "grad_norm": 0.8883762694307261, + "learning_rate": 6.414565826330533e-06, + "loss": 0.124, + "step": 229 + }, + { + "epoch": 0.02, + "grad_norm": 1.2912673146126228, + "learning_rate": 6.442577030812326e-06, + "loss": 0.1347, + "step": 230 + }, + { + "epoch": 0.02, + "grad_norm": 1.2536637158520805, + "learning_rate": 6.470588235294119e-06, + "loss": 0.1159, + "step": 231 + }, + { + "epoch": 0.02, + "grad_norm": 0.887432612038577, + "learning_rate": 6.498599439775911e-06, + "loss": 0.0902, + "step": 232 + }, + { + "epoch": 0.02, + "grad_norm": 0.9958418038296715, + "learning_rate": 6.526610644257703e-06, + "loss": 0.1671, + "step": 233 + }, + { + "epoch": 0.02, + "grad_norm": 0.4256230440377394, + "learning_rate": 6.5546218487394966e-06, + "loss": 0.0776, + "step": 234 + }, + { + "epoch": 0.02, + "grad_norm": 1.792639600779407, + "learning_rate": 6.582633053221289e-06, + "loss": 0.1737, + "step": 235 + }, + { + "epoch": 0.02, + "grad_norm": 1.3531430409489649, + "learning_rate": 6.610644257703082e-06, + "loss": 0.1908, + "step": 236 + }, + { + "epoch": 0.02, + "grad_norm": 0.6160059180993964, + "learning_rate": 6.638655462184874e-06, + "loss": 0.1, + "step": 237 + }, + { + "epoch": 0.02, + "grad_norm": 0.44218621076890513, + "learning_rate": 6.666666666666667e-06, + "loss": 0.0905, + "step": 238 + }, + { + "epoch": 0.02, + "grad_norm": 0.7934067501977317, + "learning_rate": 6.69467787114846e-06, + "loss": 0.1075, + "step": 239 + }, + { + "epoch": 0.02, + "grad_norm": 1.015273688117325, + "learning_rate": 6.722689075630253e-06, + "loss": 0.1017, + "step": 240 + }, + { + "epoch": 0.02, + "grad_norm": 0.8589450725959116, + "learning_rate": 6.750700280112046e-06, + "loss": 0.1343, + "step": 241 + }, + { + "epoch": 0.02, + "grad_norm": 1.1868668159709224, + "learning_rate": 6.7787114845938384e-06, + "loss": 0.0735, + "step": 242 + }, + { + "epoch": 0.02, + "grad_norm": 0.624749218438332, + "learning_rate": 6.806722689075631e-06, + "loss": 0.1153, + "step": 243 + }, + { + "epoch": 0.02, + "grad_norm": 0.9117904723909608, + "learning_rate": 6.834733893557424e-06, + "loss": 0.1397, + "step": 244 + }, + { + "epoch": 0.02, + "grad_norm": 1.348056610920841, + "learning_rate": 6.862745098039216e-06, + "loss": 0.1342, + "step": 245 + }, + { + "epoch": 0.02, + "grad_norm": 1.9788193021114568, + "learning_rate": 6.8907563025210085e-06, + "loss": 0.2195, + "step": 246 + }, + { + "epoch": 0.02, + "grad_norm": 0.8805606669229958, + "learning_rate": 6.918767507002802e-06, + "loss": 0.1245, + "step": 247 + }, + { + "epoch": 0.02, + "grad_norm": 1.1252650337746726, + "learning_rate": 6.946778711484594e-06, + "loss": 0.1508, + "step": 248 + }, + { + "epoch": 0.02, + "grad_norm": 1.1502642953663642, + "learning_rate": 6.974789915966387e-06, + "loss": 0.143, + "step": 249 + }, + { + "epoch": 0.02, + "grad_norm": 0.7265820033419227, + "learning_rate": 7.0028011204481795e-06, + "loss": 0.0952, + "step": 250 + }, + { + "epoch": 0.02, + "grad_norm": 1.0408077277033767, + "learning_rate": 7.030812324929972e-06, + "loss": 0.118, + "step": 251 + }, + { + "epoch": 0.02, + "grad_norm": 0.7152894868756683, + "learning_rate": 7.058823529411766e-06, + "loss": 0.1179, + "step": 252 + }, + { + "epoch": 0.02, + "grad_norm": 1.1771133756751315, + "learning_rate": 7.086834733893558e-06, + "loss": 0.1871, + "step": 253 + }, + { + "epoch": 0.02, + "grad_norm": 0.7868140613650079, + "learning_rate": 7.114845938375351e-06, + "loss": 0.102, + "step": 254 + }, + { + "epoch": 0.02, + "grad_norm": 1.0963641998512772, + "learning_rate": 7.1428571428571436e-06, + "loss": 0.1601, + "step": 255 + }, + { + "epoch": 0.02, + "grad_norm": 2.8494884016013327, + "learning_rate": 7.170868347338936e-06, + "loss": 0.1118, + "step": 256 + }, + { + "epoch": 0.02, + "grad_norm": 0.7955504742627728, + "learning_rate": 7.198879551820729e-06, + "loss": 0.1205, + "step": 257 + }, + { + "epoch": 0.02, + "grad_norm": 0.5823455860412632, + "learning_rate": 7.226890756302521e-06, + "loss": 0.106, + "step": 258 + }, + { + "epoch": 0.02, + "grad_norm": 0.9676164660232698, + "learning_rate": 7.2549019607843145e-06, + "loss": 0.1386, + "step": 259 + }, + { + "epoch": 0.02, + "grad_norm": 0.654979177276987, + "learning_rate": 7.282913165266107e-06, + "loss": 0.1002, + "step": 260 + }, + { + "epoch": 0.02, + "grad_norm": 0.8320532160587607, + "learning_rate": 7.310924369747899e-06, + "loss": 0.0843, + "step": 261 + }, + { + "epoch": 0.02, + "grad_norm": 0.6435451195097509, + "learning_rate": 7.338935574229692e-06, + "loss": 0.0826, + "step": 262 + }, + { + "epoch": 0.02, + "grad_norm": 0.660236816111505, + "learning_rate": 7.3669467787114854e-06, + "loss": 0.1055, + "step": 263 + }, + { + "epoch": 0.02, + "grad_norm": 1.0586532395876818, + "learning_rate": 7.394957983193279e-06, + "loss": 0.1433, + "step": 264 + }, + { + "epoch": 0.02, + "grad_norm": 0.7180157872264912, + "learning_rate": 7.422969187675071e-06, + "loss": 0.133, + "step": 265 + }, + { + "epoch": 0.02, + "grad_norm": 0.6586616577809415, + "learning_rate": 7.450980392156863e-06, + "loss": 0.1014, + "step": 266 + }, + { + "epoch": 0.02, + "grad_norm": 0.909167436704611, + "learning_rate": 7.478991596638656e-06, + "loss": 0.1247, + "step": 267 + }, + { + "epoch": 0.02, + "grad_norm": 1.5155235546294923, + "learning_rate": 7.507002801120449e-06, + "loss": 0.1787, + "step": 268 + }, + { + "epoch": 0.02, + "grad_norm": 0.8862318907564707, + "learning_rate": 7.535014005602241e-06, + "loss": 0.1103, + "step": 269 + }, + { + "epoch": 0.02, + "grad_norm": 0.7961777666694813, + "learning_rate": 7.563025210084034e-06, + "loss": 0.1391, + "step": 270 + }, + { + "epoch": 0.02, + "grad_norm": 1.2268121697537753, + "learning_rate": 7.5910364145658265e-06, + "loss": 0.104, + "step": 271 + }, + { + "epoch": 0.02, + "grad_norm": 0.8424893528228262, + "learning_rate": 7.61904761904762e-06, + "loss": 0.1293, + "step": 272 + }, + { + "epoch": 0.02, + "grad_norm": 0.5945027275520673, + "learning_rate": 7.647058823529411e-06, + "loss": 0.106, + "step": 273 + }, + { + "epoch": 0.02, + "grad_norm": 0.9020478674324943, + "learning_rate": 7.675070028011205e-06, + "loss": 0.1501, + "step": 274 + }, + { + "epoch": 0.02, + "grad_norm": 0.6602743166751268, + "learning_rate": 7.703081232492997e-06, + "loss": 0.1164, + "step": 275 + }, + { + "epoch": 0.02, + "grad_norm": 0.7128662762896094, + "learning_rate": 7.731092436974791e-06, + "loss": 0.0765, + "step": 276 + }, + { + "epoch": 0.02, + "grad_norm": 0.6716073336746646, + "learning_rate": 7.759103641456584e-06, + "loss": 0.0912, + "step": 277 + }, + { + "epoch": 0.02, + "grad_norm": 0.3602254932010835, + "learning_rate": 7.787114845938376e-06, + "loss": 0.0646, + "step": 278 + }, + { + "epoch": 0.02, + "grad_norm": 0.7922401097154961, + "learning_rate": 7.815126050420168e-06, + "loss": 0.0816, + "step": 279 + }, + { + "epoch": 0.02, + "grad_norm": 0.8965502776188748, + "learning_rate": 7.84313725490196e-06, + "loss": 0.1042, + "step": 280 + }, + { + "epoch": 0.02, + "grad_norm": 1.006229806736457, + "learning_rate": 7.871148459383755e-06, + "loss": 0.1536, + "step": 281 + }, + { + "epoch": 0.02, + "grad_norm": 0.8036842584260245, + "learning_rate": 7.899159663865547e-06, + "loss": 0.1008, + "step": 282 + }, + { + "epoch": 0.02, + "grad_norm": 0.7641755373566564, + "learning_rate": 7.92717086834734e-06, + "loss": 0.1019, + "step": 283 + }, + { + "epoch": 0.02, + "grad_norm": 0.5435840865677047, + "learning_rate": 7.955182072829132e-06, + "loss": 0.0879, + "step": 284 + }, + { + "epoch": 0.02, + "grad_norm": 2.117229913816576, + "learning_rate": 7.983193277310926e-06, + "loss": 0.111, + "step": 285 + }, + { + "epoch": 0.02, + "grad_norm": 1.4255462180416747, + "learning_rate": 8.011204481792718e-06, + "loss": 0.1196, + "step": 286 + }, + { + "epoch": 0.02, + "grad_norm": 0.8560337611042066, + "learning_rate": 8.03921568627451e-06, + "loss": 0.1192, + "step": 287 + }, + { + "epoch": 0.02, + "grad_norm": 0.7453938132777229, + "learning_rate": 8.067226890756303e-06, + "loss": 0.0938, + "step": 288 + }, + { + "epoch": 0.02, + "grad_norm": 1.5344160036818146, + "learning_rate": 8.095238095238097e-06, + "loss": 0.1389, + "step": 289 + }, + { + "epoch": 0.02, + "grad_norm": 1.2445530419872426, + "learning_rate": 8.123249299719889e-06, + "loss": 0.1857, + "step": 290 + }, + { + "epoch": 0.02, + "grad_norm": 0.8201105729160768, + "learning_rate": 8.151260504201681e-06, + "loss": 0.1204, + "step": 291 + }, + { + "epoch": 0.02, + "grad_norm": 1.68602184828112, + "learning_rate": 8.179271708683473e-06, + "loss": 0.13, + "step": 292 + }, + { + "epoch": 0.02, + "grad_norm": 1.159022500857207, + "learning_rate": 8.207282913165266e-06, + "loss": 0.1394, + "step": 293 + }, + { + "epoch": 0.02, + "grad_norm": 1.482898982638148, + "learning_rate": 8.23529411764706e-06, + "loss": 0.0969, + "step": 294 + }, + { + "epoch": 0.02, + "grad_norm": 1.0976033158870477, + "learning_rate": 8.263305322128852e-06, + "loss": 0.1186, + "step": 295 + }, + { + "epoch": 0.02, + "grad_norm": 1.219906914646565, + "learning_rate": 8.291316526610646e-06, + "loss": 0.1286, + "step": 296 + }, + { + "epoch": 0.03, + "grad_norm": 0.8883998652416756, + "learning_rate": 8.319327731092438e-06, + "loss": 0.1233, + "step": 297 + }, + { + "epoch": 0.03, + "grad_norm": 0.683376475279875, + "learning_rate": 8.34733893557423e-06, + "loss": 0.0934, + "step": 298 + }, + { + "epoch": 0.03, + "grad_norm": 0.6815811997423906, + "learning_rate": 8.375350140056023e-06, + "loss": 0.0999, + "step": 299 + }, + { + "epoch": 0.03, + "grad_norm": 0.6507939989312181, + "learning_rate": 8.403361344537815e-06, + "loss": 0.0894, + "step": 300 + }, + { + "epoch": 0.03, + "grad_norm": 1.5029467799578613, + "learning_rate": 8.43137254901961e-06, + "loss": 0.1208, + "step": 301 + }, + { + "epoch": 0.03, + "grad_norm": 1.59289985631983, + "learning_rate": 8.459383753501402e-06, + "loss": 0.1556, + "step": 302 + }, + { + "epoch": 0.03, + "grad_norm": 0.7435023481452023, + "learning_rate": 8.487394957983194e-06, + "loss": 0.117, + "step": 303 + }, + { + "epoch": 0.03, + "grad_norm": 0.8797687851595912, + "learning_rate": 8.515406162464986e-06, + "loss": 0.1056, + "step": 304 + }, + { + "epoch": 0.03, + "grad_norm": 0.5876011082652602, + "learning_rate": 8.543417366946779e-06, + "loss": 0.0953, + "step": 305 + }, + { + "epoch": 0.03, + "grad_norm": 1.4171970763131865, + "learning_rate": 8.571428571428571e-06, + "loss": 0.198, + "step": 306 + }, + { + "epoch": 0.03, + "grad_norm": 0.9413837759523089, + "learning_rate": 8.599439775910365e-06, + "loss": 0.1018, + "step": 307 + }, + { + "epoch": 0.03, + "grad_norm": 1.2430758571541691, + "learning_rate": 8.627450980392157e-06, + "loss": 0.1277, + "step": 308 + }, + { + "epoch": 0.03, + "grad_norm": 3.3602081613156978, + "learning_rate": 8.655462184873951e-06, + "loss": 0.1276, + "step": 309 + }, + { + "epoch": 0.03, + "grad_norm": 0.8684694085341427, + "learning_rate": 8.683473389355744e-06, + "loss": 0.1278, + "step": 310 + }, + { + "epoch": 0.03, + "grad_norm": 1.341288434386308, + "learning_rate": 8.711484593837536e-06, + "loss": 0.1625, + "step": 311 + }, + { + "epoch": 0.03, + "grad_norm": 0.5657386807369579, + "learning_rate": 8.739495798319328e-06, + "loss": 0.0655, + "step": 312 + }, + { + "epoch": 0.03, + "grad_norm": 0.7675959063816352, + "learning_rate": 8.76750700280112e-06, + "loss": 0.0876, + "step": 313 + }, + { + "epoch": 0.03, + "grad_norm": 0.5677678167017767, + "learning_rate": 8.795518207282914e-06, + "loss": 0.0872, + "step": 314 + }, + { + "epoch": 0.03, + "grad_norm": 0.7331556013163614, + "learning_rate": 8.823529411764707e-06, + "loss": 0.1371, + "step": 315 + }, + { + "epoch": 0.03, + "grad_norm": 1.538261767586086, + "learning_rate": 8.851540616246499e-06, + "loss": 0.1135, + "step": 316 + }, + { + "epoch": 0.03, + "grad_norm": 0.6061951403038772, + "learning_rate": 8.879551820728291e-06, + "loss": 0.0837, + "step": 317 + }, + { + "epoch": 0.03, + "grad_norm": 0.853638420522851, + "learning_rate": 8.907563025210085e-06, + "loss": 0.1701, + "step": 318 + }, + { + "epoch": 0.03, + "grad_norm": 0.7491503645158919, + "learning_rate": 8.935574229691878e-06, + "loss": 0.142, + "step": 319 + }, + { + "epoch": 0.03, + "grad_norm": 0.5753420716760586, + "learning_rate": 8.96358543417367e-06, + "loss": 0.1067, + "step": 320 + }, + { + "epoch": 0.03, + "grad_norm": 1.0994723241815523, + "learning_rate": 8.991596638655462e-06, + "loss": 0.0896, + "step": 321 + }, + { + "epoch": 0.03, + "grad_norm": 0.7933813065611962, + "learning_rate": 9.019607843137256e-06, + "loss": 0.09, + "step": 322 + }, + { + "epoch": 0.03, + "grad_norm": 0.8507879001937253, + "learning_rate": 9.047619047619049e-06, + "loss": 0.078, + "step": 323 + }, + { + "epoch": 0.03, + "grad_norm": 1.4023159099396283, + "learning_rate": 9.075630252100841e-06, + "loss": 0.1311, + "step": 324 + }, + { + "epoch": 0.03, + "grad_norm": 0.6680112670310601, + "learning_rate": 9.103641456582633e-06, + "loss": 0.1076, + "step": 325 + }, + { + "epoch": 0.03, + "grad_norm": 0.9929825899425084, + "learning_rate": 9.131652661064426e-06, + "loss": 0.1197, + "step": 326 + }, + { + "epoch": 0.03, + "grad_norm": 0.5771122052184955, + "learning_rate": 9.15966386554622e-06, + "loss": 0.0558, + "step": 327 + }, + { + "epoch": 0.03, + "grad_norm": 0.74524960917958, + "learning_rate": 9.187675070028012e-06, + "loss": 0.118, + "step": 328 + }, + { + "epoch": 0.03, + "grad_norm": 1.2818665090537797, + "learning_rate": 9.215686274509804e-06, + "loss": 0.1356, + "step": 329 + }, + { + "epoch": 0.03, + "grad_norm": 0.5463510671824718, + "learning_rate": 9.243697478991598e-06, + "loss": 0.0669, + "step": 330 + }, + { + "epoch": 0.03, + "grad_norm": 0.71357449243926, + "learning_rate": 9.27170868347339e-06, + "loss": 0.1034, + "step": 331 + }, + { + "epoch": 0.03, + "grad_norm": 0.7278246158003135, + "learning_rate": 9.299719887955183e-06, + "loss": 0.0962, + "step": 332 + }, + { + "epoch": 0.03, + "grad_norm": 1.011311704762003, + "learning_rate": 9.327731092436975e-06, + "loss": 0.1192, + "step": 333 + }, + { + "epoch": 0.03, + "grad_norm": 0.8045394187677906, + "learning_rate": 9.355742296918767e-06, + "loss": 0.0728, + "step": 334 + }, + { + "epoch": 0.03, + "grad_norm": 0.45846489036131516, + "learning_rate": 9.383753501400561e-06, + "loss": 0.0756, + "step": 335 + }, + { + "epoch": 0.03, + "grad_norm": 0.7539729227805693, + "learning_rate": 9.411764705882354e-06, + "loss": 0.1236, + "step": 336 + }, + { + "epoch": 0.03, + "grad_norm": 1.0510971035035583, + "learning_rate": 9.439775910364146e-06, + "loss": 0.1329, + "step": 337 + }, + { + "epoch": 0.03, + "grad_norm": 0.8656009252285208, + "learning_rate": 9.467787114845938e-06, + "loss": 0.1086, + "step": 338 + }, + { + "epoch": 0.03, + "grad_norm": 0.5420911667172628, + "learning_rate": 9.49579831932773e-06, + "loss": 0.0789, + "step": 339 + }, + { + "epoch": 0.03, + "grad_norm": 0.4770627197230506, + "learning_rate": 9.523809523809525e-06, + "loss": 0.0716, + "step": 340 + }, + { + "epoch": 0.03, + "grad_norm": 0.8958014457497894, + "learning_rate": 9.551820728291317e-06, + "loss": 0.1056, + "step": 341 + }, + { + "epoch": 0.03, + "grad_norm": 0.643953139246371, + "learning_rate": 9.579831932773111e-06, + "loss": 0.1224, + "step": 342 + }, + { + "epoch": 0.03, + "grad_norm": 3.9714742292981904, + "learning_rate": 9.607843137254903e-06, + "loss": 0.1436, + "step": 343 + }, + { + "epoch": 0.03, + "grad_norm": 0.6220072129182658, + "learning_rate": 9.635854341736696e-06, + "loss": 0.0871, + "step": 344 + }, + { + "epoch": 0.03, + "grad_norm": 1.5303088272034027, + "learning_rate": 9.663865546218488e-06, + "loss": 0.1703, + "step": 345 + }, + { + "epoch": 0.03, + "grad_norm": 0.7226529697999378, + "learning_rate": 9.69187675070028e-06, + "loss": 0.1156, + "step": 346 + }, + { + "epoch": 0.03, + "grad_norm": 1.2260748489126005, + "learning_rate": 9.719887955182074e-06, + "loss": 0.1271, + "step": 347 + }, + { + "epoch": 0.03, + "grad_norm": 0.6748616643203599, + "learning_rate": 9.747899159663867e-06, + "loss": 0.1232, + "step": 348 + }, + { + "epoch": 0.03, + "grad_norm": 1.1013718233309193, + "learning_rate": 9.775910364145659e-06, + "loss": 0.111, + "step": 349 + }, + { + "epoch": 0.03, + "grad_norm": 1.052337693001522, + "learning_rate": 9.803921568627451e-06, + "loss": 0.1555, + "step": 350 + }, + { + "epoch": 0.03, + "grad_norm": 0.4536110593980989, + "learning_rate": 9.831932773109244e-06, + "loss": 0.0726, + "step": 351 + }, + { + "epoch": 0.03, + "grad_norm": 0.8457898329587078, + "learning_rate": 9.859943977591038e-06, + "loss": 0.1324, + "step": 352 + }, + { + "epoch": 0.03, + "grad_norm": 0.5120053623169777, + "learning_rate": 9.88795518207283e-06, + "loss": 0.0838, + "step": 353 + }, + { + "epoch": 0.03, + "grad_norm": 0.5504843035063868, + "learning_rate": 9.915966386554622e-06, + "loss": 0.0818, + "step": 354 + }, + { + "epoch": 0.03, + "grad_norm": 1.423564606144916, + "learning_rate": 9.943977591036416e-06, + "loss": 0.15, + "step": 355 + }, + { + "epoch": 0.03, + "grad_norm": 1.0587957908183863, + "learning_rate": 9.971988795518209e-06, + "loss": 0.123, + "step": 356 + }, + { + "epoch": 0.03, + "grad_norm": 0.8578592733638776, + "learning_rate": 1e-05, + "loss": 0.1127, + "step": 357 + }, + { + "epoch": 0.03, + "grad_norm": 0.8815493803001586, + "learning_rate": 9.999999813785437e-06, + "loss": 0.1771, + "step": 358 + }, + { + "epoch": 0.03, + "grad_norm": 1.113227727342441, + "learning_rate": 9.99999925514176e-06, + "loss": 0.1462, + "step": 359 + }, + { + "epoch": 0.03, + "grad_norm": 0.7915549138912488, + "learning_rate": 9.99999832406901e-06, + "loss": 0.0968, + "step": 360 + }, + { + "epoch": 0.03, + "grad_norm": 1.0723579158386007, + "learning_rate": 9.99999702056726e-06, + "loss": 0.102, + "step": 361 + }, + { + "epoch": 0.03, + "grad_norm": 1.1235711365058287, + "learning_rate": 9.999995344636605e-06, + "loss": 0.1583, + "step": 362 + }, + { + "epoch": 0.03, + "grad_norm": 1.2759531404786768, + "learning_rate": 9.999993296277168e-06, + "loss": 0.1653, + "step": 363 + }, + { + "epoch": 0.03, + "grad_norm": 1.294747411897277, + "learning_rate": 9.999990875489104e-06, + "loss": 0.1602, + "step": 364 + }, + { + "epoch": 0.03, + "grad_norm": 0.6076115319143451, + "learning_rate": 9.999988082272593e-06, + "loss": 0.1052, + "step": 365 + }, + { + "epoch": 0.03, + "grad_norm": 2.1569298403268538, + "learning_rate": 9.999984916627841e-06, + "loss": 0.0671, + "step": 366 + }, + { + "epoch": 0.03, + "grad_norm": 0.5323238762100674, + "learning_rate": 9.999981378555086e-06, + "loss": 0.0727, + "step": 367 + }, + { + "epoch": 0.03, + "grad_norm": 3.3576882479079333, + "learning_rate": 9.999977468054592e-06, + "loss": 0.0951, + "step": 368 + }, + { + "epoch": 0.03, + "grad_norm": 1.2830878862708373, + "learning_rate": 9.999973185126647e-06, + "loss": 0.1444, + "step": 369 + }, + { + "epoch": 0.03, + "grad_norm": 0.9986121372720049, + "learning_rate": 9.999968529771573e-06, + "loss": 0.1175, + "step": 370 + }, + { + "epoch": 0.03, + "grad_norm": 0.6712629723993542, + "learning_rate": 9.999963501989717e-06, + "loss": 0.0976, + "step": 371 + }, + { + "epoch": 0.03, + "grad_norm": 0.8805471005107806, + "learning_rate": 9.99995810178145e-06, + "loss": 0.1135, + "step": 372 + }, + { + "epoch": 0.03, + "grad_norm": 0.7892499685388796, + "learning_rate": 9.99995232914718e-06, + "loss": 0.1991, + "step": 373 + }, + { + "epoch": 0.03, + "grad_norm": 0.8026901664725075, + "learning_rate": 9.999946184087332e-06, + "loss": 0.1455, + "step": 374 + }, + { + "epoch": 0.03, + "grad_norm": 0.7530310671355417, + "learning_rate": 9.999939666602365e-06, + "loss": 0.1207, + "step": 375 + }, + { + "epoch": 0.03, + "grad_norm": 0.9191083699721888, + "learning_rate": 9.999932776692765e-06, + "loss": 0.1476, + "step": 376 + }, + { + "epoch": 0.03, + "grad_norm": 0.7009997924776623, + "learning_rate": 9.999925514359045e-06, + "loss": 0.0995, + "step": 377 + }, + { + "epoch": 0.03, + "grad_norm": 0.7074357470778695, + "learning_rate": 9.999917879601746e-06, + "loss": 0.1111, + "step": 378 + }, + { + "epoch": 0.03, + "grad_norm": 0.9624155972736683, + "learning_rate": 9.999909872421439e-06, + "loss": 0.1333, + "step": 379 + }, + { + "epoch": 0.03, + "grad_norm": 0.684838351137313, + "learning_rate": 9.999901492818714e-06, + "loss": 0.1212, + "step": 380 + }, + { + "epoch": 0.03, + "grad_norm": 0.6902930353429366, + "learning_rate": 9.999892740794201e-06, + "loss": 0.1143, + "step": 381 + }, + { + "epoch": 0.03, + "grad_norm": 1.3764351813451206, + "learning_rate": 9.999883616348551e-06, + "loss": 0.1448, + "step": 382 + }, + { + "epoch": 0.03, + "grad_norm": 1.0654760412384587, + "learning_rate": 9.999874119482442e-06, + "loss": 0.067, + "step": 383 + }, + { + "epoch": 0.03, + "grad_norm": 1.3977401109537015, + "learning_rate": 9.999864250196582e-06, + "loss": 0.1374, + "step": 384 + }, + { + "epoch": 0.03, + "grad_norm": 1.4598036942142179, + "learning_rate": 9.999854008491707e-06, + "loss": 0.1202, + "step": 385 + }, + { + "epoch": 0.03, + "grad_norm": 0.8363603571549727, + "learning_rate": 9.999843394368579e-06, + "loss": 0.103, + "step": 386 + }, + { + "epoch": 0.03, + "grad_norm": 2.642018463149415, + "learning_rate": 9.999832407827988e-06, + "loss": 0.1287, + "step": 387 + }, + { + "epoch": 0.03, + "grad_norm": 1.0812781152617352, + "learning_rate": 9.999821048870753e-06, + "loss": 0.1305, + "step": 388 + }, + { + "epoch": 0.03, + "grad_norm": 0.7260835054899538, + "learning_rate": 9.99980931749772e-06, + "loss": 0.043, + "step": 389 + }, + { + "epoch": 0.03, + "grad_norm": 1.0679017072726107, + "learning_rate": 9.999797213709764e-06, + "loss": 0.1643, + "step": 390 + }, + { + "epoch": 0.03, + "grad_norm": 0.8172198916508853, + "learning_rate": 9.999784737507786e-06, + "loss": 0.1368, + "step": 391 + }, + { + "epoch": 0.03, + "grad_norm": 1.4832709124642445, + "learning_rate": 9.999771888892713e-06, + "loss": 0.154, + "step": 392 + }, + { + "epoch": 0.03, + "grad_norm": 0.60807905681673, + "learning_rate": 9.999758667865506e-06, + "loss": 0.0706, + "step": 393 + }, + { + "epoch": 0.03, + "grad_norm": 0.60879834660372, + "learning_rate": 9.999745074427147e-06, + "loss": 0.1284, + "step": 394 + }, + { + "epoch": 0.03, + "grad_norm": 0.8675368859356152, + "learning_rate": 9.999731108578648e-06, + "loss": 0.1633, + "step": 395 + }, + { + "epoch": 0.03, + "grad_norm": 0.7573311793662983, + "learning_rate": 9.999716770321052e-06, + "loss": 0.0923, + "step": 396 + }, + { + "epoch": 0.03, + "grad_norm": 0.9524590206768031, + "learning_rate": 9.999702059655423e-06, + "loss": 0.1, + "step": 397 + }, + { + "epoch": 0.03, + "grad_norm": 0.4696946536083038, + "learning_rate": 9.999686976582862e-06, + "loss": 0.1012, + "step": 398 + }, + { + "epoch": 0.03, + "grad_norm": 0.8518966433696187, + "learning_rate": 9.999671521104489e-06, + "loss": 0.1396, + "step": 399 + }, + { + "epoch": 0.03, + "grad_norm": 0.6195946184783215, + "learning_rate": 9.999655693221454e-06, + "loss": 0.0824, + "step": 400 + }, + { + "epoch": 0.03, + "grad_norm": 0.591151422291874, + "learning_rate": 9.99963949293494e-06, + "loss": 0.1209, + "step": 401 + }, + { + "epoch": 0.03, + "grad_norm": 0.5062533740828177, + "learning_rate": 9.99962292024615e-06, + "loss": 0.0727, + "step": 402 + }, + { + "epoch": 0.03, + "grad_norm": 1.1864507035069163, + "learning_rate": 9.999605975156322e-06, + "loss": 0.169, + "step": 403 + }, + { + "epoch": 0.03, + "grad_norm": 0.5787858074675994, + "learning_rate": 9.999588657666715e-06, + "loss": 0.0771, + "step": 404 + }, + { + "epoch": 0.03, + "grad_norm": 0.7089851677108531, + "learning_rate": 9.999570967778618e-06, + "loss": 0.08, + "step": 405 + }, + { + "epoch": 0.03, + "grad_norm": 0.6368272158985858, + "learning_rate": 9.999552905493352e-06, + "loss": 0.1221, + "step": 406 + }, + { + "epoch": 0.03, + "grad_norm": 0.5856673895295825, + "learning_rate": 9.999534470812262e-06, + "loss": 0.0935, + "step": 407 + }, + { + "epoch": 0.03, + "grad_norm": 1.0956286449896735, + "learning_rate": 9.99951566373672e-06, + "loss": 0.1548, + "step": 408 + }, + { + "epoch": 0.03, + "grad_norm": 0.7936770844101599, + "learning_rate": 9.999496484268127e-06, + "loss": 0.1466, + "step": 409 + }, + { + "epoch": 0.03, + "grad_norm": 0.8895310467872868, + "learning_rate": 9.999476932407913e-06, + "loss": 0.136, + "step": 410 + }, + { + "epoch": 0.03, + "grad_norm": 3.7262630524416394, + "learning_rate": 9.999457008157529e-06, + "loss": 0.2041, + "step": 411 + }, + { + "epoch": 0.03, + "grad_norm": 0.7063052207104966, + "learning_rate": 9.999436711518467e-06, + "loss": 0.0903, + "step": 412 + }, + { + "epoch": 0.03, + "grad_norm": 0.7829596724890455, + "learning_rate": 9.999416042492233e-06, + "loss": 0.1272, + "step": 413 + }, + { + "epoch": 0.03, + "grad_norm": 0.5782971535803838, + "learning_rate": 9.999395001080368e-06, + "loss": 0.1055, + "step": 414 + }, + { + "epoch": 0.03, + "grad_norm": 0.4408465690017831, + "learning_rate": 9.99937358728444e-06, + "loss": 0.0793, + "step": 415 + }, + { + "epoch": 0.04, + "grad_norm": 0.5914949158707651, + "learning_rate": 9.999351801106044e-06, + "loss": 0.1087, + "step": 416 + }, + { + "epoch": 0.04, + "grad_norm": 0.8869791593426875, + "learning_rate": 9.999329642546802e-06, + "loss": 0.1237, + "step": 417 + }, + { + "epoch": 0.04, + "grad_norm": 0.6626500279143978, + "learning_rate": 9.999307111608366e-06, + "loss": 0.1417, + "step": 418 + }, + { + "epoch": 0.04, + "grad_norm": 0.4083491663954509, + "learning_rate": 9.999284208292412e-06, + "loss": 0.1189, + "step": 419 + }, + { + "epoch": 0.04, + "grad_norm": 0.520968320487048, + "learning_rate": 9.999260932600649e-06, + "loss": 0.083, + "step": 420 + }, + { + "epoch": 0.04, + "grad_norm": 0.8475956354539327, + "learning_rate": 9.999237284534808e-06, + "loss": 0.1649, + "step": 421 + }, + { + "epoch": 0.04, + "grad_norm": 1.0004759207779281, + "learning_rate": 9.999213264096651e-06, + "loss": 0.1081, + "step": 422 + }, + { + "epoch": 0.04, + "grad_norm": 0.4284016872736085, + "learning_rate": 9.999188871287967e-06, + "loss": 0.0607, + "step": 423 + }, + { + "epoch": 0.04, + "grad_norm": 0.942209140728215, + "learning_rate": 9.999164106110574e-06, + "loss": 0.1863, + "step": 424 + }, + { + "epoch": 0.04, + "grad_norm": 0.8303710619051764, + "learning_rate": 9.999138968566317e-06, + "loss": 0.1237, + "step": 425 + }, + { + "epoch": 0.04, + "grad_norm": 0.8797360781869421, + "learning_rate": 9.999113458657066e-06, + "loss": 0.1533, + "step": 426 + }, + { + "epoch": 0.04, + "grad_norm": 0.7520200202175943, + "learning_rate": 9.999087576384723e-06, + "loss": 0.1147, + "step": 427 + }, + { + "epoch": 0.04, + "grad_norm": 0.7763533929093924, + "learning_rate": 9.999061321751215e-06, + "loss": 0.1033, + "step": 428 + }, + { + "epoch": 0.04, + "grad_norm": 0.6182054681852013, + "learning_rate": 9.9990346947585e-06, + "loss": 0.0984, + "step": 429 + }, + { + "epoch": 0.04, + "grad_norm": 0.6269097783989711, + "learning_rate": 9.999007695408559e-06, + "loss": 0.1082, + "step": 430 + }, + { + "epoch": 0.04, + "grad_norm": 0.9902443934513061, + "learning_rate": 9.998980323703403e-06, + "loss": 0.1062, + "step": 431 + }, + { + "epoch": 0.04, + "grad_norm": 0.987425391169721, + "learning_rate": 9.998952579645068e-06, + "loss": 0.145, + "step": 432 + }, + { + "epoch": 0.04, + "grad_norm": 0.8298876706657803, + "learning_rate": 9.998924463235628e-06, + "loss": 0.123, + "step": 433 + }, + { + "epoch": 0.04, + "grad_norm": 0.5956116378358924, + "learning_rate": 9.998895974477172e-06, + "loss": 0.1033, + "step": 434 + }, + { + "epoch": 0.04, + "grad_norm": 0.44934521687809664, + "learning_rate": 9.998867113371824e-06, + "loss": 0.1016, + "step": 435 + }, + { + "epoch": 0.04, + "grad_norm": 0.5759296768779403, + "learning_rate": 9.998837879921732e-06, + "loss": 0.1086, + "step": 436 + }, + { + "epoch": 0.04, + "grad_norm": 0.3565615328440879, + "learning_rate": 9.998808274129073e-06, + "loss": 0.0673, + "step": 437 + }, + { + "epoch": 0.04, + "grad_norm": 0.5510718520623004, + "learning_rate": 9.998778295996054e-06, + "loss": 0.0964, + "step": 438 + }, + { + "epoch": 0.04, + "grad_norm": 0.611371042558083, + "learning_rate": 9.99874794552491e-06, + "loss": 0.1054, + "step": 439 + }, + { + "epoch": 0.04, + "grad_norm": 0.8257973609669831, + "learning_rate": 9.998717222717896e-06, + "loss": 0.1108, + "step": 440 + }, + { + "epoch": 0.04, + "grad_norm": 0.5596408984719556, + "learning_rate": 9.998686127577305e-06, + "loss": 0.1169, + "step": 441 + }, + { + "epoch": 0.04, + "grad_norm": 0.48547098410358214, + "learning_rate": 9.998654660105451e-06, + "loss": 0.1063, + "step": 442 + }, + { + "epoch": 0.04, + "grad_norm": 0.519789135168508, + "learning_rate": 9.99862282030468e-06, + "loss": 0.1251, + "step": 443 + }, + { + "epoch": 0.04, + "grad_norm": 2.3180794764397854, + "learning_rate": 9.998590608177361e-06, + "loss": 0.1794, + "step": 444 + }, + { + "epoch": 0.04, + "grad_norm": 0.739902398693344, + "learning_rate": 9.998558023725896e-06, + "loss": 0.1398, + "step": 445 + }, + { + "epoch": 0.04, + "grad_norm": 0.6045677753121406, + "learning_rate": 9.998525066952709e-06, + "loss": 0.0828, + "step": 446 + }, + { + "epoch": 0.04, + "grad_norm": 0.7988531564079917, + "learning_rate": 9.998491737860256e-06, + "loss": 0.1092, + "step": 447 + }, + { + "epoch": 0.04, + "grad_norm": 0.518060320605162, + "learning_rate": 9.998458036451022e-06, + "loss": 0.0864, + "step": 448 + }, + { + "epoch": 0.04, + "grad_norm": 1.2612545325155107, + "learning_rate": 9.998423962727514e-06, + "loss": 0.1481, + "step": 449 + }, + { + "epoch": 0.04, + "grad_norm": 0.6699689889857784, + "learning_rate": 9.998389516692272e-06, + "loss": 0.1069, + "step": 450 + }, + { + "epoch": 0.04, + "grad_norm": 0.579477288461418, + "learning_rate": 9.99835469834786e-06, + "loss": 0.1243, + "step": 451 + }, + { + "epoch": 0.04, + "grad_norm": 0.47259436832250945, + "learning_rate": 9.998319507696874e-06, + "loss": 0.0842, + "step": 452 + }, + { + "epoch": 0.04, + "grad_norm": 0.6461597455139227, + "learning_rate": 9.998283944741932e-06, + "loss": 0.1012, + "step": 453 + }, + { + "epoch": 0.04, + "grad_norm": 1.296265985877647, + "learning_rate": 9.998248009485685e-06, + "loss": 0.146, + "step": 454 + }, + { + "epoch": 0.04, + "grad_norm": 0.5403351160481193, + "learning_rate": 9.99821170193081e-06, + "loss": 0.0893, + "step": 455 + }, + { + "epoch": 0.04, + "grad_norm": 0.48404375981303693, + "learning_rate": 9.99817502208001e-06, + "loss": 0.1249, + "step": 456 + }, + { + "epoch": 0.04, + "grad_norm": 0.5452138186241835, + "learning_rate": 9.99813796993602e-06, + "loss": 0.0968, + "step": 457 + }, + { + "epoch": 0.04, + "grad_norm": 0.5408153715381283, + "learning_rate": 9.998100545501595e-06, + "loss": 0.0878, + "step": 458 + }, + { + "epoch": 0.04, + "grad_norm": 0.4637347253448656, + "learning_rate": 9.998062748779527e-06, + "loss": 0.0818, + "step": 459 + }, + { + "epoch": 0.04, + "grad_norm": 0.6002717791081874, + "learning_rate": 9.998024579772628e-06, + "loss": 0.0913, + "step": 460 + }, + { + "epoch": 0.04, + "grad_norm": 0.7778546802377903, + "learning_rate": 9.997986038483744e-06, + "loss": 0.1096, + "step": 461 + }, + { + "epoch": 0.04, + "grad_norm": 0.6823411599203874, + "learning_rate": 9.997947124915743e-06, + "loss": 0.1447, + "step": 462 + }, + { + "epoch": 0.04, + "grad_norm": 0.41059589679698616, + "learning_rate": 9.997907839071526e-06, + "loss": 0.0736, + "step": 463 + }, + { + "epoch": 0.04, + "grad_norm": 0.3746874131971565, + "learning_rate": 9.997868180954017e-06, + "loss": 0.0583, + "step": 464 + }, + { + "epoch": 0.04, + "grad_norm": 0.4084417863796041, + "learning_rate": 9.997828150566174e-06, + "loss": 0.0836, + "step": 465 + }, + { + "epoch": 0.04, + "grad_norm": 0.4031936106293697, + "learning_rate": 9.997787747910974e-06, + "loss": 0.0881, + "step": 466 + }, + { + "epoch": 0.04, + "grad_norm": 0.598120977661575, + "learning_rate": 9.997746972991427e-06, + "loss": 0.1031, + "step": 467 + }, + { + "epoch": 0.04, + "grad_norm": 0.4838991095112319, + "learning_rate": 9.997705825810574e-06, + "loss": 0.0466, + "step": 468 + }, + { + "epoch": 0.04, + "grad_norm": 0.49334636352735917, + "learning_rate": 9.997664306371476e-06, + "loss": 0.1106, + "step": 469 + }, + { + "epoch": 0.04, + "grad_norm": 0.3554698052882315, + "learning_rate": 9.997622414677227e-06, + "loss": 0.0826, + "step": 470 + }, + { + "epoch": 0.04, + "grad_norm": 0.6461637048522569, + "learning_rate": 9.997580150730947e-06, + "loss": 0.1079, + "step": 471 + }, + { + "epoch": 0.04, + "grad_norm": 0.3680252416183355, + "learning_rate": 9.997537514535785e-06, + "loss": 0.0813, + "step": 472 + }, + { + "epoch": 0.04, + "grad_norm": 0.801902627365545, + "learning_rate": 9.997494506094916e-06, + "loss": 0.1191, + "step": 473 + }, + { + "epoch": 0.04, + "grad_norm": 1.3337940715156686, + "learning_rate": 9.997451125411542e-06, + "loss": 0.159, + "step": 474 + }, + { + "epoch": 0.04, + "grad_norm": 0.49171510201932156, + "learning_rate": 9.997407372488898e-06, + "loss": 0.1082, + "step": 475 + }, + { + "epoch": 0.04, + "grad_norm": 0.7138644142831517, + "learning_rate": 9.99736324733024e-06, + "loss": 0.1343, + "step": 476 + }, + { + "epoch": 0.04, + "grad_norm": 0.7483748916796715, + "learning_rate": 9.997318749938856e-06, + "loss": 0.1601, + "step": 477 + }, + { + "epoch": 0.04, + "grad_norm": 0.5438634056067543, + "learning_rate": 9.997273880318058e-06, + "loss": 0.07, + "step": 478 + }, + { + "epoch": 0.04, + "grad_norm": 0.5391960919956674, + "learning_rate": 9.997228638471192e-06, + "loss": 0.086, + "step": 479 + }, + { + "epoch": 0.04, + "grad_norm": 0.5503328137892898, + "learning_rate": 9.997183024401627e-06, + "loss": 0.1121, + "step": 480 + }, + { + "epoch": 0.04, + "grad_norm": 0.4563977164051309, + "learning_rate": 9.997137038112758e-06, + "loss": 0.1021, + "step": 481 + }, + { + "epoch": 0.04, + "grad_norm": 0.5319076475615525, + "learning_rate": 9.99709067960801e-06, + "loss": 0.0828, + "step": 482 + }, + { + "epoch": 0.04, + "grad_norm": 0.5297629239747339, + "learning_rate": 9.997043948890841e-06, + "loss": 0.1022, + "step": 483 + }, + { + "epoch": 0.04, + "grad_norm": 0.6175352559175136, + "learning_rate": 9.996996845964729e-06, + "loss": 0.1081, + "step": 484 + }, + { + "epoch": 0.04, + "grad_norm": 0.6259564128021599, + "learning_rate": 9.99694937083318e-06, + "loss": 0.1111, + "step": 485 + }, + { + "epoch": 0.04, + "grad_norm": 0.34471655188850864, + "learning_rate": 9.996901523499733e-06, + "loss": 0.0704, + "step": 486 + }, + { + "epoch": 0.04, + "grad_norm": 0.5776546753090909, + "learning_rate": 9.996853303967953e-06, + "loss": 0.1054, + "step": 487 + }, + { + "epoch": 0.04, + "grad_norm": 0.4511868296874108, + "learning_rate": 9.99680471224143e-06, + "loss": 0.0983, + "step": 488 + }, + { + "epoch": 0.04, + "grad_norm": 0.515678683484427, + "learning_rate": 9.996755748323782e-06, + "loss": 0.1256, + "step": 489 + }, + { + "epoch": 0.04, + "grad_norm": 0.6800153674811197, + "learning_rate": 9.996706412218658e-06, + "loss": 0.118, + "step": 490 + }, + { + "epoch": 0.04, + "grad_norm": 0.3968468937166093, + "learning_rate": 9.996656703929733e-06, + "loss": 0.0773, + "step": 491 + }, + { + "epoch": 0.04, + "grad_norm": 0.464086865109732, + "learning_rate": 9.99660662346071e-06, + "loss": 0.0842, + "step": 492 + }, + { + "epoch": 0.04, + "grad_norm": 0.7641447353183326, + "learning_rate": 9.996556170815315e-06, + "loss": 0.1386, + "step": 493 + }, + { + "epoch": 0.04, + "grad_norm": 0.47166397522099524, + "learning_rate": 9.996505345997311e-06, + "loss": 0.0838, + "step": 494 + }, + { + "epoch": 0.04, + "grad_norm": 0.6278954677632359, + "learning_rate": 9.996454149010484e-06, + "loss": 0.0897, + "step": 495 + }, + { + "epoch": 0.04, + "grad_norm": 0.3604775455384319, + "learning_rate": 9.996402579858644e-06, + "loss": 0.068, + "step": 496 + }, + { + "epoch": 0.04, + "grad_norm": 0.5408802140052029, + "learning_rate": 9.996350638545633e-06, + "loss": 0.1074, + "step": 497 + }, + { + "epoch": 0.04, + "grad_norm": 0.2781717586890425, + "learning_rate": 9.996298325075322e-06, + "loss": 0.0835, + "step": 498 + }, + { + "epoch": 0.04, + "grad_norm": 0.4597361347141018, + "learning_rate": 9.996245639451603e-06, + "loss": 0.0422, + "step": 499 + }, + { + "epoch": 0.04, + "grad_norm": 1.13132198533703, + "learning_rate": 9.996192581678407e-06, + "loss": 0.1446, + "step": 500 + }, + { + "epoch": 0.04, + "grad_norm": 1.046582299432322, + "learning_rate": 9.99613915175968e-06, + "loss": 0.158, + "step": 501 + }, + { + "epoch": 0.04, + "grad_norm": 1.6711420785578908, + "learning_rate": 9.996085349699407e-06, + "loss": 0.1441, + "step": 502 + }, + { + "epoch": 0.04, + "grad_norm": 0.9910923739073177, + "learning_rate": 9.99603117550159e-06, + "loss": 0.1802, + "step": 503 + }, + { + "epoch": 0.04, + "grad_norm": 0.5913690128537314, + "learning_rate": 9.99597662917027e-06, + "loss": 0.0949, + "step": 504 + }, + { + "epoch": 0.04, + "grad_norm": 0.724501776402321, + "learning_rate": 9.995921710709504e-06, + "loss": 0.1272, + "step": 505 + }, + { + "epoch": 0.04, + "grad_norm": 0.5097582103514099, + "learning_rate": 9.995866420123387e-06, + "loss": 0.0997, + "step": 506 + }, + { + "epoch": 0.04, + "grad_norm": 0.621934622231527, + "learning_rate": 9.995810757416036e-06, + "loss": 0.1105, + "step": 507 + }, + { + "epoch": 0.04, + "grad_norm": 0.5202017740838007, + "learning_rate": 9.995754722591597e-06, + "loss": 0.0665, + "step": 508 + }, + { + "epoch": 0.04, + "grad_norm": 1.3834096933119617, + "learning_rate": 9.995698315654244e-06, + "loss": 0.1656, + "step": 509 + }, + { + "epoch": 0.04, + "grad_norm": 0.3892741297485457, + "learning_rate": 9.995641536608178e-06, + "loss": 0.0449, + "step": 510 + }, + { + "epoch": 0.04, + "grad_norm": 0.6551174259633928, + "learning_rate": 9.995584385457629e-06, + "loss": 0.1135, + "step": 511 + }, + { + "epoch": 0.04, + "grad_norm": 1.220152564097502, + "learning_rate": 9.995526862206852e-06, + "loss": 0.1165, + "step": 512 + }, + { + "epoch": 0.04, + "grad_norm": 1.0568416329525472, + "learning_rate": 9.995468966860135e-06, + "loss": 0.1487, + "step": 513 + }, + { + "epoch": 0.04, + "grad_norm": 0.39048557738709233, + "learning_rate": 9.995410699421788e-06, + "loss": 0.0748, + "step": 514 + }, + { + "epoch": 0.04, + "grad_norm": 0.832719304203821, + "learning_rate": 9.995352059896152e-06, + "loss": 0.1791, + "step": 515 + }, + { + "epoch": 0.04, + "grad_norm": 0.6313956435549932, + "learning_rate": 9.995293048287595e-06, + "loss": 0.1626, + "step": 516 + }, + { + "epoch": 0.04, + "grad_norm": 0.5337349838505776, + "learning_rate": 9.99523366460051e-06, + "loss": 0.1036, + "step": 517 + }, + { + "epoch": 0.04, + "grad_norm": 0.6544270540260293, + "learning_rate": 9.995173908839324e-06, + "loss": 0.1071, + "step": 518 + }, + { + "epoch": 0.04, + "grad_norm": 0.559257811059123, + "learning_rate": 9.995113781008485e-06, + "loss": 0.0871, + "step": 519 + }, + { + "epoch": 0.04, + "grad_norm": 0.5850860312546887, + "learning_rate": 9.995053281112475e-06, + "loss": 0.0689, + "step": 520 + }, + { + "epoch": 0.04, + "grad_norm": 1.256963241911466, + "learning_rate": 9.994992409155797e-06, + "loss": 0.1538, + "step": 521 + }, + { + "epoch": 0.04, + "grad_norm": 0.5090878933377821, + "learning_rate": 9.994931165142986e-06, + "loss": 0.0882, + "step": 522 + }, + { + "epoch": 0.04, + "grad_norm": 0.4294758309767924, + "learning_rate": 9.994869549078605e-06, + "loss": 0.0718, + "step": 523 + }, + { + "epoch": 0.04, + "grad_norm": 0.456826803356107, + "learning_rate": 9.994807560967241e-06, + "loss": 0.1033, + "step": 524 + }, + { + "epoch": 0.04, + "grad_norm": 0.830167603889245, + "learning_rate": 9.994745200813515e-06, + "loss": 0.105, + "step": 525 + }, + { + "epoch": 0.04, + "grad_norm": 0.5719666718827081, + "learning_rate": 9.99468246862207e-06, + "loss": 0.0735, + "step": 526 + }, + { + "epoch": 0.04, + "grad_norm": 0.677575709393069, + "learning_rate": 9.994619364397576e-06, + "loss": 0.1179, + "step": 527 + }, + { + "epoch": 0.04, + "grad_norm": 0.7125015681410454, + "learning_rate": 9.994555888144738e-06, + "loss": 0.1122, + "step": 528 + }, + { + "epoch": 0.04, + "grad_norm": 0.4108344536293684, + "learning_rate": 9.994492039868283e-06, + "loss": 0.0707, + "step": 529 + }, + { + "epoch": 0.04, + "grad_norm": 0.3965546390441736, + "learning_rate": 9.994427819572963e-06, + "loss": 0.0896, + "step": 530 + }, + { + "epoch": 0.04, + "grad_norm": 0.6251902964897077, + "learning_rate": 9.994363227263565e-06, + "loss": 0.0715, + "step": 531 + }, + { + "epoch": 0.04, + "grad_norm": 1.8056165087410867, + "learning_rate": 9.994298262944902e-06, + "loss": 0.1144, + "step": 532 + }, + { + "epoch": 0.04, + "grad_norm": 0.838992011088451, + "learning_rate": 9.994232926621807e-06, + "loss": 0.1068, + "step": 533 + }, + { + "epoch": 0.04, + "grad_norm": 0.7116641563129352, + "learning_rate": 9.994167218299152e-06, + "loss": 0.1399, + "step": 534 + }, + { + "epoch": 0.05, + "grad_norm": 0.7378263274066403, + "learning_rate": 9.994101137981829e-06, + "loss": 0.1186, + "step": 535 + }, + { + "epoch": 0.05, + "grad_norm": 0.7041449410903895, + "learning_rate": 9.99403468567476e-06, + "loss": 0.0993, + "step": 536 + }, + { + "epoch": 0.05, + "grad_norm": 0.6254325213413032, + "learning_rate": 9.993967861382896e-06, + "loss": 0.1032, + "step": 537 + }, + { + "epoch": 0.05, + "grad_norm": 1.0533206631189598, + "learning_rate": 9.993900665111213e-06, + "loss": 0.1755, + "step": 538 + }, + { + "epoch": 0.05, + "grad_norm": 0.7112542349578724, + "learning_rate": 9.993833096864719e-06, + "loss": 0.1283, + "step": 539 + }, + { + "epoch": 0.05, + "grad_norm": 1.274831365207608, + "learning_rate": 9.993765156648442e-06, + "loss": 0.1978, + "step": 540 + }, + { + "epoch": 0.05, + "grad_norm": 0.6463996387109753, + "learning_rate": 9.993696844467447e-06, + "loss": 0.1263, + "step": 541 + }, + { + "epoch": 0.05, + "grad_norm": 0.5078792777730835, + "learning_rate": 9.99362816032682e-06, + "loss": 0.0816, + "step": 542 + }, + { + "epoch": 0.05, + "grad_norm": 0.36945632584032145, + "learning_rate": 9.993559104231676e-06, + "loss": 0.0818, + "step": 543 + }, + { + "epoch": 0.05, + "grad_norm": 0.7584613739323615, + "learning_rate": 9.993489676187162e-06, + "loss": 0.1156, + "step": 544 + }, + { + "epoch": 0.05, + "grad_norm": 0.5550081327051679, + "learning_rate": 9.993419876198448e-06, + "loss": 0.0782, + "step": 545 + }, + { + "epoch": 0.05, + "grad_norm": 1.4500914806166052, + "learning_rate": 9.993349704270733e-06, + "loss": 0.1601, + "step": 546 + }, + { + "epoch": 0.05, + "grad_norm": 0.6270229749906052, + "learning_rate": 9.993279160409243e-06, + "loss": 0.1007, + "step": 547 + }, + { + "epoch": 0.05, + "grad_norm": 0.7585445841588694, + "learning_rate": 9.993208244619232e-06, + "loss": 0.0851, + "step": 548 + }, + { + "epoch": 0.05, + "grad_norm": 0.3796296026306707, + "learning_rate": 9.993136956905985e-06, + "loss": 0.0519, + "step": 549 + }, + { + "epoch": 0.05, + "grad_norm": 0.9476177420044567, + "learning_rate": 9.99306529727481e-06, + "loss": 0.1707, + "step": 550 + }, + { + "epoch": 0.05, + "grad_norm": 0.4961212889927388, + "learning_rate": 9.992993265731044e-06, + "loss": 0.1061, + "step": 551 + }, + { + "epoch": 0.05, + "grad_norm": 0.6792133972369617, + "learning_rate": 9.992920862280054e-06, + "loss": 0.103, + "step": 552 + }, + { + "epoch": 0.05, + "grad_norm": 0.5190310895252396, + "learning_rate": 9.992848086927232e-06, + "loss": 0.0986, + "step": 553 + }, + { + "epoch": 0.05, + "grad_norm": 0.6887901071033682, + "learning_rate": 9.992774939677998e-06, + "loss": 0.0914, + "step": 554 + }, + { + "epoch": 0.05, + "grad_norm": 0.602808068195729, + "learning_rate": 9.992701420537804e-06, + "loss": 0.1282, + "step": 555 + }, + { + "epoch": 0.05, + "grad_norm": 0.42715537528399167, + "learning_rate": 9.992627529512122e-06, + "loss": 0.0874, + "step": 556 + }, + { + "epoch": 0.05, + "grad_norm": 0.3866382323268875, + "learning_rate": 9.992553266606457e-06, + "loss": 0.0833, + "step": 557 + }, + { + "epoch": 0.05, + "grad_norm": 0.45723410507274154, + "learning_rate": 9.99247863182634e-06, + "loss": 0.0973, + "step": 558 + }, + { + "epoch": 0.05, + "grad_norm": 0.8460176996161588, + "learning_rate": 9.992403625177333e-06, + "loss": 0.0958, + "step": 559 + }, + { + "epoch": 0.05, + "grad_norm": 1.049788559783224, + "learning_rate": 9.992328246665021e-06, + "loss": 0.1721, + "step": 560 + }, + { + "epoch": 0.05, + "grad_norm": 2.6202660791611607, + "learning_rate": 9.992252496295018e-06, + "loss": 0.1782, + "step": 561 + }, + { + "epoch": 0.05, + "grad_norm": 0.43680908563113335, + "learning_rate": 9.992176374072968e-06, + "loss": 0.0685, + "step": 562 + }, + { + "epoch": 0.05, + "grad_norm": 0.8545196911296337, + "learning_rate": 9.99209988000454e-06, + "loss": 0.1737, + "step": 563 + }, + { + "epoch": 0.05, + "grad_norm": 0.8039961883982933, + "learning_rate": 9.992023014095431e-06, + "loss": 0.1486, + "step": 564 + }, + { + "epoch": 0.05, + "grad_norm": 0.5587173575844564, + "learning_rate": 9.991945776351368e-06, + "loss": 0.0994, + "step": 565 + }, + { + "epoch": 0.05, + "grad_norm": 0.5825429344800608, + "learning_rate": 9.991868166778103e-06, + "loss": 0.1219, + "step": 566 + }, + { + "epoch": 0.05, + "grad_norm": 0.5269676403472533, + "learning_rate": 9.991790185381416e-06, + "loss": 0.0909, + "step": 567 + }, + { + "epoch": 0.05, + "grad_norm": 0.7607855977635978, + "learning_rate": 9.991711832167119e-06, + "loss": 0.1251, + "step": 568 + }, + { + "epoch": 0.05, + "grad_norm": 0.36124544967025113, + "learning_rate": 9.991633107141043e-06, + "loss": 0.0754, + "step": 569 + }, + { + "epoch": 0.05, + "grad_norm": 0.6700259840727764, + "learning_rate": 9.991554010309057e-06, + "loss": 0.116, + "step": 570 + }, + { + "epoch": 0.05, + "grad_norm": 0.34117193375791594, + "learning_rate": 9.99147454167705e-06, + "loss": 0.0892, + "step": 571 + }, + { + "epoch": 0.05, + "grad_norm": 0.4645072200501844, + "learning_rate": 9.991394701250941e-06, + "loss": 0.1495, + "step": 572 + }, + { + "epoch": 0.05, + "grad_norm": 0.5039230574002541, + "learning_rate": 9.991314489036678e-06, + "loss": 0.1411, + "step": 573 + }, + { + "epoch": 0.05, + "grad_norm": 0.36907899959379753, + "learning_rate": 9.991233905040234e-06, + "loss": 0.0802, + "step": 574 + }, + { + "epoch": 0.05, + "grad_norm": 0.4813627763686513, + "learning_rate": 9.991152949267615e-06, + "loss": 0.0659, + "step": 575 + }, + { + "epoch": 0.05, + "grad_norm": 0.6080267806649663, + "learning_rate": 9.991071621724847e-06, + "loss": 0.1213, + "step": 576 + }, + { + "epoch": 0.05, + "grad_norm": 0.3296477756766429, + "learning_rate": 9.990989922417991e-06, + "loss": 0.0534, + "step": 577 + }, + { + "epoch": 0.05, + "grad_norm": 0.7949673619884388, + "learning_rate": 9.990907851353129e-06, + "loss": 0.1226, + "step": 578 + }, + { + "epoch": 0.05, + "grad_norm": 0.41403372438903074, + "learning_rate": 9.990825408536377e-06, + "loss": 0.0711, + "step": 579 + }, + { + "epoch": 0.05, + "grad_norm": 0.5409182312537043, + "learning_rate": 9.990742593973876e-06, + "loss": 0.0884, + "step": 580 + }, + { + "epoch": 0.05, + "grad_norm": 0.5879315809835352, + "learning_rate": 9.990659407671793e-06, + "loss": 0.1144, + "step": 581 + }, + { + "epoch": 0.05, + "grad_norm": 0.6959366535210851, + "learning_rate": 9.990575849636322e-06, + "loss": 0.1165, + "step": 582 + }, + { + "epoch": 0.05, + "grad_norm": 0.4382921231669729, + "learning_rate": 9.990491919873692e-06, + "loss": 0.0852, + "step": 583 + }, + { + "epoch": 0.05, + "grad_norm": 0.3746800474911327, + "learning_rate": 9.990407618390151e-06, + "loss": 0.0798, + "step": 584 + }, + { + "epoch": 0.05, + "grad_norm": 0.4233637593178525, + "learning_rate": 9.990322945191979e-06, + "loss": 0.0757, + "step": 585 + }, + { + "epoch": 0.05, + "grad_norm": 0.7095894367893713, + "learning_rate": 9.990237900285484e-06, + "loss": 0.1227, + "step": 586 + }, + { + "epoch": 0.05, + "grad_norm": 0.4073519461855776, + "learning_rate": 9.990152483677e-06, + "loss": 0.0933, + "step": 587 + }, + { + "epoch": 0.05, + "grad_norm": 0.6994301134078731, + "learning_rate": 9.99006669537289e-06, + "loss": 0.1362, + "step": 588 + }, + { + "epoch": 0.05, + "grad_norm": 0.3497621479101028, + "learning_rate": 9.989980535379542e-06, + "loss": 0.0966, + "step": 589 + }, + { + "epoch": 0.05, + "grad_norm": 0.39985364697513043, + "learning_rate": 9.989894003703375e-06, + "loss": 0.0855, + "step": 590 + }, + { + "epoch": 0.05, + "grad_norm": 0.4836760904604064, + "learning_rate": 9.989807100350834e-06, + "loss": 0.0865, + "step": 591 + }, + { + "epoch": 0.05, + "grad_norm": 0.29820842002707637, + "learning_rate": 9.989719825328394e-06, + "loss": 0.0601, + "step": 592 + }, + { + "epoch": 0.05, + "grad_norm": 0.8453628926786679, + "learning_rate": 9.989632178642553e-06, + "loss": 0.1502, + "step": 593 + }, + { + "epoch": 0.05, + "grad_norm": 0.5234738780012702, + "learning_rate": 9.98954416029984e-06, + "loss": 0.1107, + "step": 594 + }, + { + "epoch": 0.05, + "grad_norm": 0.4967032282665953, + "learning_rate": 9.989455770306812e-06, + "loss": 0.1074, + "step": 595 + }, + { + "epoch": 0.05, + "grad_norm": 0.34743376077546084, + "learning_rate": 9.989367008670054e-06, + "loss": 0.0871, + "step": 596 + }, + { + "epoch": 0.05, + "grad_norm": 0.4492626716605562, + "learning_rate": 9.989277875396176e-06, + "loss": 0.119, + "step": 597 + }, + { + "epoch": 0.05, + "grad_norm": 0.6897675475179305, + "learning_rate": 9.989188370491814e-06, + "loss": 0.0949, + "step": 598 + }, + { + "epoch": 0.05, + "grad_norm": 0.38203174895636355, + "learning_rate": 9.98909849396364e-06, + "loss": 0.082, + "step": 599 + }, + { + "epoch": 0.05, + "grad_norm": 0.3659272969348084, + "learning_rate": 9.989008245818349e-06, + "loss": 0.0852, + "step": 600 + }, + { + "epoch": 0.05, + "grad_norm": 0.5540785818848548, + "learning_rate": 9.988917626062658e-06, + "loss": 0.1162, + "step": 601 + }, + { + "epoch": 0.05, + "grad_norm": 0.7174871771534248, + "learning_rate": 9.988826634703318e-06, + "loss": 0.0908, + "step": 602 + }, + { + "epoch": 0.05, + "grad_norm": 0.46028336734572634, + "learning_rate": 9.988735271747111e-06, + "loss": 0.0806, + "step": 603 + }, + { + "epoch": 0.05, + "grad_norm": 0.5341200542113298, + "learning_rate": 9.988643537200839e-06, + "loss": 0.0707, + "step": 604 + }, + { + "epoch": 0.05, + "grad_norm": 0.5459506599509913, + "learning_rate": 9.988551431071334e-06, + "loss": 0.1133, + "step": 605 + }, + { + "epoch": 0.05, + "grad_norm": 0.8650308693532739, + "learning_rate": 9.988458953365458e-06, + "loss": 0.1335, + "step": 606 + }, + { + "epoch": 0.05, + "grad_norm": 0.5167271814754328, + "learning_rate": 9.988366104090101e-06, + "loss": 0.0895, + "step": 607 + }, + { + "epoch": 0.05, + "grad_norm": 0.4757146134343915, + "learning_rate": 9.988272883252175e-06, + "loss": 0.0956, + "step": 608 + }, + { + "epoch": 0.05, + "grad_norm": 0.5979190251152173, + "learning_rate": 9.988179290858628e-06, + "loss": 0.1015, + "step": 609 + }, + { + "epoch": 0.05, + "grad_norm": 0.5421151955831349, + "learning_rate": 9.988085326916427e-06, + "loss": 0.1125, + "step": 610 + }, + { + "epoch": 0.05, + "grad_norm": 0.4138660629175644, + "learning_rate": 9.987990991432575e-06, + "loss": 0.0716, + "step": 611 + }, + { + "epoch": 0.05, + "grad_norm": 0.4352447007847622, + "learning_rate": 9.987896284414096e-06, + "loss": 0.0423, + "step": 612 + }, + { + "epoch": 0.05, + "grad_norm": 0.8483519854557902, + "learning_rate": 9.987801205868046e-06, + "loss": 0.1476, + "step": 613 + }, + { + "epoch": 0.05, + "grad_norm": 0.8653367733360023, + "learning_rate": 9.987705755801505e-06, + "loss": 0.1446, + "step": 614 + }, + { + "epoch": 0.05, + "grad_norm": 0.7884598250327272, + "learning_rate": 9.987609934221584e-06, + "loss": 0.1331, + "step": 615 + }, + { + "epoch": 0.05, + "grad_norm": 0.5768077773682908, + "learning_rate": 9.987513741135419e-06, + "loss": 0.0597, + "step": 616 + }, + { + "epoch": 0.05, + "grad_norm": 1.4950915967755527, + "learning_rate": 9.987417176550177e-06, + "loss": 0.1771, + "step": 617 + }, + { + "epoch": 0.05, + "grad_norm": 0.7672729682728879, + "learning_rate": 9.98732024047305e-06, + "loss": 0.1332, + "step": 618 + }, + { + "epoch": 0.05, + "grad_norm": 0.5124256765111024, + "learning_rate": 9.98722293291126e-06, + "loss": 0.0828, + "step": 619 + }, + { + "epoch": 0.05, + "grad_norm": 0.46115873788036593, + "learning_rate": 9.98712525387205e-06, + "loss": 0.0986, + "step": 620 + }, + { + "epoch": 0.05, + "grad_norm": 0.43851838026958595, + "learning_rate": 9.987027203362702e-06, + "loss": 0.1341, + "step": 621 + }, + { + "epoch": 0.05, + "grad_norm": 0.4933692636942046, + "learning_rate": 9.986928781390515e-06, + "loss": 0.1154, + "step": 622 + }, + { + "epoch": 0.05, + "grad_norm": 0.7596838157927859, + "learning_rate": 9.986829987962821e-06, + "loss": 0.1853, + "step": 623 + }, + { + "epoch": 0.05, + "grad_norm": 0.4482223467953727, + "learning_rate": 9.98673082308698e-06, + "loss": 0.0391, + "step": 624 + }, + { + "epoch": 0.05, + "grad_norm": 0.546819435988106, + "learning_rate": 9.986631286770375e-06, + "loss": 0.1259, + "step": 625 + }, + { + "epoch": 0.05, + "grad_norm": 0.31016185358042764, + "learning_rate": 9.986531379020425e-06, + "loss": 0.0689, + "step": 626 + }, + { + "epoch": 0.05, + "grad_norm": 1.3366705821812166, + "learning_rate": 9.986431099844567e-06, + "loss": 0.1201, + "step": 627 + }, + { + "epoch": 0.05, + "grad_norm": 0.5972035151395412, + "learning_rate": 9.986330449250275e-06, + "loss": 0.124, + "step": 628 + }, + { + "epoch": 0.05, + "grad_norm": 0.3784447366098721, + "learning_rate": 9.986229427245042e-06, + "loss": 0.0613, + "step": 629 + }, + { + "epoch": 0.05, + "grad_norm": 0.4599583131512941, + "learning_rate": 9.986128033836395e-06, + "loss": 0.1014, + "step": 630 + }, + { + "epoch": 0.05, + "grad_norm": 0.5184754615425619, + "learning_rate": 9.986026269031886e-06, + "loss": 0.1079, + "step": 631 + }, + { + "epoch": 0.05, + "grad_norm": 0.3998366126981325, + "learning_rate": 9.985924132839094e-06, + "loss": 0.073, + "step": 632 + }, + { + "epoch": 0.05, + "grad_norm": 0.34278854993948016, + "learning_rate": 9.985821625265628e-06, + "loss": 0.0873, + "step": 633 + }, + { + "epoch": 0.05, + "grad_norm": 0.37036264861234747, + "learning_rate": 9.985718746319121e-06, + "loss": 0.0909, + "step": 634 + }, + { + "epoch": 0.05, + "grad_norm": 0.4368701474913475, + "learning_rate": 9.98561549600724e-06, + "loss": 0.0732, + "step": 635 + }, + { + "epoch": 0.05, + "grad_norm": 0.3949499507523411, + "learning_rate": 9.985511874337674e-06, + "loss": 0.0771, + "step": 636 + }, + { + "epoch": 0.05, + "grad_norm": 0.3786770072654766, + "learning_rate": 9.985407881318139e-06, + "loss": 0.1032, + "step": 637 + }, + { + "epoch": 0.05, + "grad_norm": 0.8247300873352073, + "learning_rate": 9.985303516956384e-06, + "loss": 0.1549, + "step": 638 + }, + { + "epoch": 0.05, + "grad_norm": 0.53503488372417, + "learning_rate": 9.98519878126018e-06, + "loss": 0.0852, + "step": 639 + }, + { + "epoch": 0.05, + "grad_norm": 1.1380528987805165, + "learning_rate": 9.985093674237331e-06, + "loss": 0.1739, + "step": 640 + }, + { + "epoch": 0.05, + "grad_norm": 0.5647741339837101, + "learning_rate": 9.984988195895665e-06, + "loss": 0.1006, + "step": 641 + }, + { + "epoch": 0.05, + "grad_norm": 0.7907468355414724, + "learning_rate": 9.98488234624304e-06, + "loss": 0.0976, + "step": 642 + }, + { + "epoch": 0.05, + "grad_norm": 0.44103242732956977, + "learning_rate": 9.984776125287334e-06, + "loss": 0.101, + "step": 643 + }, + { + "epoch": 0.05, + "grad_norm": 0.4763927534225128, + "learning_rate": 9.984669533036468e-06, + "loss": 0.0854, + "step": 644 + }, + { + "epoch": 0.05, + "grad_norm": 0.4364591360546529, + "learning_rate": 9.984562569498376e-06, + "loss": 0.0791, + "step": 645 + }, + { + "epoch": 0.05, + "grad_norm": 0.8354452825737708, + "learning_rate": 9.984455234681026e-06, + "loss": 0.1304, + "step": 646 + }, + { + "epoch": 0.05, + "grad_norm": 0.6315167822769249, + "learning_rate": 9.984347528592414e-06, + "loss": 0.1318, + "step": 647 + }, + { + "epoch": 0.05, + "grad_norm": 0.9837174420911087, + "learning_rate": 9.984239451240563e-06, + "loss": 0.126, + "step": 648 + }, + { + "epoch": 0.05, + "grad_norm": 0.7110140817816601, + "learning_rate": 9.984131002633522e-06, + "loss": 0.1276, + "step": 649 + }, + { + "epoch": 0.05, + "grad_norm": 0.7804572355713342, + "learning_rate": 9.984022182779368e-06, + "loss": 0.1353, + "step": 650 + }, + { + "epoch": 0.05, + "grad_norm": 0.37308411809973047, + "learning_rate": 9.983912991686209e-06, + "loss": 0.0953, + "step": 651 + }, + { + "epoch": 0.05, + "grad_norm": 1.7812706705060706, + "learning_rate": 9.983803429362176e-06, + "loss": 0.0928, + "step": 652 + }, + { + "epoch": 0.06, + "grad_norm": 0.7794512009178994, + "learning_rate": 9.983693495815431e-06, + "loss": 0.0883, + "step": 653 + }, + { + "epoch": 0.06, + "grad_norm": 0.39391040063477734, + "learning_rate": 9.983583191054162e-06, + "loss": 0.0759, + "step": 654 + }, + { + "epoch": 0.06, + "grad_norm": 0.7524525592446534, + "learning_rate": 9.983472515086587e-06, + "loss": 0.163, + "step": 655 + }, + { + "epoch": 0.06, + "grad_norm": 0.5645054399105317, + "learning_rate": 9.983361467920946e-06, + "loss": 0.0933, + "step": 656 + }, + { + "epoch": 0.06, + "grad_norm": 0.5776931264675441, + "learning_rate": 9.983250049565513e-06, + "loss": 0.096, + "step": 657 + }, + { + "epoch": 0.06, + "grad_norm": 0.6488340876420681, + "learning_rate": 9.983138260028588e-06, + "loss": 0.126, + "step": 658 + }, + { + "epoch": 0.06, + "grad_norm": 0.29527027694565605, + "learning_rate": 9.983026099318496e-06, + "loss": 0.0724, + "step": 659 + }, + { + "epoch": 0.06, + "grad_norm": 0.4209397093066584, + "learning_rate": 9.982913567443593e-06, + "loss": 0.0899, + "step": 660 + }, + { + "epoch": 0.06, + "grad_norm": 0.8849909341407453, + "learning_rate": 9.982800664412258e-06, + "loss": 0.1422, + "step": 661 + }, + { + "epoch": 0.06, + "grad_norm": 0.3016102935108095, + "learning_rate": 9.982687390232903e-06, + "loss": 0.0713, + "step": 662 + }, + { + "epoch": 0.06, + "grad_norm": 0.8462776193051589, + "learning_rate": 9.982573744913967e-06, + "loss": 0.1406, + "step": 663 + }, + { + "epoch": 0.06, + "grad_norm": 0.5974186029855596, + "learning_rate": 9.982459728463909e-06, + "loss": 0.1229, + "step": 664 + }, + { + "epoch": 0.06, + "grad_norm": 0.5287798021722847, + "learning_rate": 9.982345340891228e-06, + "loss": 0.1011, + "step": 665 + }, + { + "epoch": 0.06, + "grad_norm": 0.5024606088830081, + "learning_rate": 9.98223058220444e-06, + "loss": 0.09, + "step": 666 + }, + { + "epoch": 0.06, + "grad_norm": 0.4176777456679742, + "learning_rate": 9.982115452412096e-06, + "loss": 0.0889, + "step": 667 + }, + { + "epoch": 0.06, + "grad_norm": 0.4783718986253215, + "learning_rate": 9.98199995152277e-06, + "loss": 0.1249, + "step": 668 + }, + { + "epoch": 0.06, + "grad_norm": 0.5746502172418447, + "learning_rate": 9.981884079545065e-06, + "loss": 0.1019, + "step": 669 + }, + { + "epoch": 0.06, + "grad_norm": 0.7838284704959871, + "learning_rate": 9.981767836487612e-06, + "loss": 0.1312, + "step": 670 + }, + { + "epoch": 0.06, + "grad_norm": 0.46211289563329305, + "learning_rate": 9.98165122235907e-06, + "loss": 0.1046, + "step": 671 + }, + { + "epoch": 0.06, + "grad_norm": 0.5180169877701982, + "learning_rate": 9.981534237168125e-06, + "loss": 0.0977, + "step": 672 + }, + { + "epoch": 0.06, + "grad_norm": 0.648313147331964, + "learning_rate": 9.98141688092349e-06, + "loss": 0.102, + "step": 673 + }, + { + "epoch": 0.06, + "grad_norm": 0.4555003707249415, + "learning_rate": 9.981299153633907e-06, + "loss": 0.1086, + "step": 674 + }, + { + "epoch": 0.06, + "grad_norm": 0.6594244793104176, + "learning_rate": 9.981181055308144e-06, + "loss": 0.1495, + "step": 675 + }, + { + "epoch": 0.06, + "grad_norm": 0.3910892291954089, + "learning_rate": 9.981062585954998e-06, + "loss": 0.0981, + "step": 676 + }, + { + "epoch": 0.06, + "grad_norm": 0.500056338395118, + "learning_rate": 9.980943745583295e-06, + "loss": 0.0858, + "step": 677 + }, + { + "epoch": 0.06, + "grad_norm": 0.4409433771060219, + "learning_rate": 9.980824534201887e-06, + "loss": 0.0682, + "step": 678 + }, + { + "epoch": 0.06, + "grad_norm": 1.025807500082504, + "learning_rate": 9.98070495181965e-06, + "loss": 0.1602, + "step": 679 + }, + { + "epoch": 0.06, + "grad_norm": 1.060495146342171, + "learning_rate": 9.980584998445494e-06, + "loss": 0.1455, + "step": 680 + }, + { + "epoch": 0.06, + "grad_norm": 0.4472523554821826, + "learning_rate": 9.980464674088356e-06, + "loss": 0.0979, + "step": 681 + }, + { + "epoch": 0.06, + "grad_norm": 0.7558408616564564, + "learning_rate": 9.980343978757192e-06, + "loss": 0.1388, + "step": 682 + }, + { + "epoch": 0.06, + "grad_norm": 0.6134617548087573, + "learning_rate": 9.980222912460998e-06, + "loss": 0.1134, + "step": 683 + }, + { + "epoch": 0.06, + "grad_norm": 0.5487507413098076, + "learning_rate": 9.980101475208788e-06, + "loss": 0.0838, + "step": 684 + }, + { + "epoch": 0.06, + "grad_norm": 0.9217127138530026, + "learning_rate": 9.97997966700961e-06, + "loss": 0.1463, + "step": 685 + }, + { + "epoch": 0.06, + "grad_norm": 0.5189865907994974, + "learning_rate": 9.979857487872534e-06, + "loss": 0.0796, + "step": 686 + }, + { + "epoch": 0.06, + "grad_norm": 0.5158497852699863, + "learning_rate": 9.979734937806665e-06, + "loss": 0.0602, + "step": 687 + }, + { + "epoch": 0.06, + "grad_norm": 0.4948210244474041, + "learning_rate": 9.979612016821127e-06, + "loss": 0.0933, + "step": 688 + }, + { + "epoch": 0.06, + "grad_norm": 0.589607784339615, + "learning_rate": 9.979488724925078e-06, + "loss": 0.1498, + "step": 689 + }, + { + "epoch": 0.06, + "grad_norm": 0.47301440797198707, + "learning_rate": 9.979365062127702e-06, + "loss": 0.0957, + "step": 690 + }, + { + "epoch": 0.06, + "grad_norm": 0.5454882542083043, + "learning_rate": 9.979241028438208e-06, + "loss": 0.0699, + "step": 691 + }, + { + "epoch": 0.06, + "grad_norm": 0.8554320147287323, + "learning_rate": 9.979116623865837e-06, + "loss": 0.1319, + "step": 692 + }, + { + "epoch": 0.06, + "grad_norm": 0.3730520495259766, + "learning_rate": 9.978991848419853e-06, + "loss": 0.0534, + "step": 693 + }, + { + "epoch": 0.06, + "grad_norm": 0.40773601959232847, + "learning_rate": 9.978866702109553e-06, + "loss": 0.0944, + "step": 694 + }, + { + "epoch": 0.06, + "grad_norm": 0.4089814641648594, + "learning_rate": 9.978741184944256e-06, + "loss": 0.0902, + "step": 695 + }, + { + "epoch": 0.06, + "grad_norm": 0.38094078688763744, + "learning_rate": 9.978615296933311e-06, + "loss": 0.0655, + "step": 696 + }, + { + "epoch": 0.06, + "grad_norm": 0.6948070486021376, + "learning_rate": 9.978489038086099e-06, + "loss": 0.1081, + "step": 697 + }, + { + "epoch": 0.06, + "grad_norm": 0.5206503957453157, + "learning_rate": 9.97836240841202e-06, + "loss": 0.116, + "step": 698 + }, + { + "epoch": 0.06, + "grad_norm": 0.4573488794539956, + "learning_rate": 9.978235407920506e-06, + "loss": 0.0652, + "step": 699 + }, + { + "epoch": 0.06, + "grad_norm": 0.5829010355688334, + "learning_rate": 9.978108036621021e-06, + "loss": 0.104, + "step": 700 + }, + { + "epoch": 0.06, + "grad_norm": 0.444850397072115, + "learning_rate": 9.977980294523048e-06, + "loss": 0.0983, + "step": 701 + }, + { + "epoch": 0.06, + "grad_norm": 0.5762184524949143, + "learning_rate": 9.977852181636106e-06, + "loss": 0.1087, + "step": 702 + }, + { + "epoch": 0.06, + "grad_norm": 0.40349017557780686, + "learning_rate": 9.977723697969735e-06, + "loss": 0.0665, + "step": 703 + }, + { + "epoch": 0.06, + "grad_norm": 0.39232734971983557, + "learning_rate": 9.977594843533505e-06, + "loss": 0.1036, + "step": 704 + }, + { + "epoch": 0.06, + "grad_norm": 0.4854036632535074, + "learning_rate": 9.977465618337013e-06, + "loss": 0.1259, + "step": 705 + }, + { + "epoch": 0.06, + "grad_norm": 0.7149773205926221, + "learning_rate": 9.977336022389888e-06, + "loss": 0.1694, + "step": 706 + }, + { + "epoch": 0.06, + "grad_norm": 2.2564009534439236, + "learning_rate": 9.97720605570178e-06, + "loss": 0.0911, + "step": 707 + }, + { + "epoch": 0.06, + "grad_norm": 0.5250941327411177, + "learning_rate": 9.97707571828237e-06, + "loss": 0.1032, + "step": 708 + }, + { + "epoch": 0.06, + "grad_norm": 0.4310308484906928, + "learning_rate": 9.976945010141369e-06, + "loss": 0.0994, + "step": 709 + }, + { + "epoch": 0.06, + "grad_norm": 0.4801171159821674, + "learning_rate": 9.976813931288508e-06, + "loss": 0.0929, + "step": 710 + }, + { + "epoch": 0.06, + "grad_norm": 0.9408550585340312, + "learning_rate": 9.976682481733554e-06, + "loss": 0.1812, + "step": 711 + }, + { + "epoch": 0.06, + "grad_norm": 0.6670035667101979, + "learning_rate": 9.9765506614863e-06, + "loss": 0.1066, + "step": 712 + }, + { + "epoch": 0.06, + "grad_norm": 0.35806788360972625, + "learning_rate": 9.97641847055656e-06, + "loss": 0.0884, + "step": 713 + }, + { + "epoch": 0.06, + "grad_norm": 0.5234161323169547, + "learning_rate": 9.976285908954182e-06, + "loss": 0.1177, + "step": 714 + }, + { + "epoch": 0.06, + "grad_norm": 0.8088895483824468, + "learning_rate": 9.976152976689043e-06, + "loss": 0.1351, + "step": 715 + }, + { + "epoch": 0.06, + "grad_norm": 0.5605482943163758, + "learning_rate": 9.976019673771038e-06, + "loss": 0.0891, + "step": 716 + }, + { + "epoch": 0.06, + "grad_norm": 0.7380956337822607, + "learning_rate": 9.975886000210104e-06, + "loss": 0.1371, + "step": 717 + }, + { + "epoch": 0.06, + "grad_norm": 0.36673155123878615, + "learning_rate": 9.975751956016192e-06, + "loss": 0.0666, + "step": 718 + }, + { + "epoch": 0.06, + "grad_norm": 0.6197419593903347, + "learning_rate": 9.975617541199289e-06, + "loss": 0.1003, + "step": 719 + }, + { + "epoch": 0.06, + "grad_norm": 0.35300926269928473, + "learning_rate": 9.975482755769406e-06, + "loss": 0.0543, + "step": 720 + }, + { + "epoch": 0.06, + "grad_norm": 0.5669314921832739, + "learning_rate": 9.975347599736585e-06, + "loss": 0.1069, + "step": 721 + }, + { + "epoch": 0.06, + "grad_norm": 0.6374110848146446, + "learning_rate": 9.97521207311089e-06, + "loss": 0.1072, + "step": 722 + }, + { + "epoch": 0.06, + "grad_norm": 0.394069298750776, + "learning_rate": 9.975076175902415e-06, + "loss": 0.0487, + "step": 723 + }, + { + "epoch": 0.06, + "grad_norm": 0.5194233371124842, + "learning_rate": 9.974939908121285e-06, + "loss": 0.1205, + "step": 724 + }, + { + "epoch": 0.06, + "grad_norm": 1.122963800657039, + "learning_rate": 9.974803269777654e-06, + "loss": 0.1005, + "step": 725 + }, + { + "epoch": 0.06, + "grad_norm": 0.7066368936197616, + "learning_rate": 9.97466626088169e-06, + "loss": 0.1129, + "step": 726 + }, + { + "epoch": 0.06, + "grad_norm": 1.1486603337554333, + "learning_rate": 9.974528881443606e-06, + "loss": 0.1346, + "step": 727 + }, + { + "epoch": 0.06, + "grad_norm": 0.5422940997853235, + "learning_rate": 9.974391131473631e-06, + "loss": 0.1238, + "step": 728 + }, + { + "epoch": 0.06, + "grad_norm": 0.9430505268457784, + "learning_rate": 9.974253010982026e-06, + "loss": 0.1637, + "step": 729 + }, + { + "epoch": 0.06, + "grad_norm": 0.9332423259531186, + "learning_rate": 9.974114519979081e-06, + "loss": 0.1428, + "step": 730 + }, + { + "epoch": 0.06, + "grad_norm": 0.622667978830865, + "learning_rate": 9.973975658475109e-06, + "loss": 0.0909, + "step": 731 + }, + { + "epoch": 0.06, + "grad_norm": 0.3169498690315978, + "learning_rate": 9.973836426480455e-06, + "loss": 0.0616, + "step": 732 + }, + { + "epoch": 0.06, + "grad_norm": 0.45812757108662544, + "learning_rate": 9.97369682400549e-06, + "loss": 0.0831, + "step": 733 + }, + { + "epoch": 0.06, + "grad_norm": 0.6578225858005559, + "learning_rate": 9.973556851060611e-06, + "loss": 0.1465, + "step": 734 + }, + { + "epoch": 0.06, + "grad_norm": 0.46958628611936376, + "learning_rate": 9.973416507656245e-06, + "loss": 0.1151, + "step": 735 + }, + { + "epoch": 0.06, + "grad_norm": 0.6162633795441038, + "learning_rate": 9.973275793802844e-06, + "loss": 0.1466, + "step": 736 + }, + { + "epoch": 0.06, + "grad_norm": 0.5197208235937061, + "learning_rate": 9.97313470951089e-06, + "loss": 0.1214, + "step": 737 + }, + { + "epoch": 0.06, + "grad_norm": 0.47561863006047034, + "learning_rate": 9.972993254790894e-06, + "loss": 0.0924, + "step": 738 + }, + { + "epoch": 0.06, + "grad_norm": 0.6123899806423115, + "learning_rate": 9.97285142965339e-06, + "loss": 0.1492, + "step": 739 + }, + { + "epoch": 0.06, + "grad_norm": 0.45631134029699005, + "learning_rate": 9.97270923410894e-06, + "loss": 0.0544, + "step": 740 + }, + { + "epoch": 0.06, + "grad_norm": 0.464961830420518, + "learning_rate": 9.972566668168143e-06, + "loss": 0.0863, + "step": 741 + }, + { + "epoch": 0.06, + "grad_norm": 0.5072664529808102, + "learning_rate": 9.972423731841609e-06, + "loss": 0.1157, + "step": 742 + }, + { + "epoch": 0.06, + "grad_norm": 0.4255338961588323, + "learning_rate": 9.972280425139992e-06, + "loss": 0.1098, + "step": 743 + }, + { + "epoch": 0.06, + "grad_norm": 0.8164375088772227, + "learning_rate": 9.972136748073962e-06, + "loss": 0.201, + "step": 744 + }, + { + "epoch": 0.06, + "grad_norm": 0.4902943607398687, + "learning_rate": 9.971992700654221e-06, + "loss": 0.0876, + "step": 745 + }, + { + "epoch": 0.06, + "grad_norm": 0.6964868623743768, + "learning_rate": 9.971848282891501e-06, + "loss": 0.1097, + "step": 746 + }, + { + "epoch": 0.06, + "grad_norm": 0.3916933419423451, + "learning_rate": 9.971703494796556e-06, + "loss": 0.0764, + "step": 747 + }, + { + "epoch": 0.06, + "grad_norm": 0.31562422960077924, + "learning_rate": 9.971558336380173e-06, + "loss": 0.0857, + "step": 748 + }, + { + "epoch": 0.06, + "grad_norm": 0.5892484859725993, + "learning_rate": 9.971412807653166e-06, + "loss": 0.1265, + "step": 749 + }, + { + "epoch": 0.06, + "grad_norm": 0.5717862042182116, + "learning_rate": 9.971266908626372e-06, + "loss": 0.1293, + "step": 750 + }, + { + "epoch": 0.06, + "grad_norm": 1.0155816934403537, + "learning_rate": 9.971120639310656e-06, + "loss": 0.126, + "step": 751 + }, + { + "epoch": 0.06, + "grad_norm": 0.5987383696405469, + "learning_rate": 9.970973999716917e-06, + "loss": 0.0842, + "step": 752 + }, + { + "epoch": 0.06, + "grad_norm": 0.5039857009747198, + "learning_rate": 9.970826989856079e-06, + "loss": 0.0974, + "step": 753 + }, + { + "epoch": 0.06, + "grad_norm": 0.39264581261524684, + "learning_rate": 9.970679609739086e-06, + "loss": 0.0622, + "step": 754 + }, + { + "epoch": 0.06, + "grad_norm": 0.5287540907886646, + "learning_rate": 9.970531859376921e-06, + "loss": 0.0974, + "step": 755 + }, + { + "epoch": 0.06, + "grad_norm": 0.7477820966847073, + "learning_rate": 9.970383738780589e-06, + "loss": 0.0626, + "step": 756 + }, + { + "epoch": 0.06, + "grad_norm": 0.7232686230853854, + "learning_rate": 9.97023524796112e-06, + "loss": 0.1323, + "step": 757 + }, + { + "epoch": 0.06, + "grad_norm": 0.5850992577571005, + "learning_rate": 9.970086386929576e-06, + "loss": 0.1041, + "step": 758 + }, + { + "epoch": 0.06, + "grad_norm": 0.563005733848986, + "learning_rate": 9.969937155697045e-06, + "loss": 0.0999, + "step": 759 + }, + { + "epoch": 0.06, + "grad_norm": 0.4076292540871964, + "learning_rate": 9.969787554274642e-06, + "loss": 0.0828, + "step": 760 + }, + { + "epoch": 0.06, + "grad_norm": 0.5199660949872195, + "learning_rate": 9.969637582673512e-06, + "loss": 0.0997, + "step": 761 + }, + { + "epoch": 0.06, + "grad_norm": 0.9267861775973697, + "learning_rate": 9.969487240904823e-06, + "loss": 0.1426, + "step": 762 + }, + { + "epoch": 0.06, + "grad_norm": 0.4133904853641572, + "learning_rate": 9.969336528979777e-06, + "loss": 0.064, + "step": 763 + }, + { + "epoch": 0.06, + "grad_norm": 0.7482741881440251, + "learning_rate": 9.969185446909596e-06, + "loss": 0.1113, + "step": 764 + }, + { + "epoch": 0.06, + "grad_norm": 0.508486674947673, + "learning_rate": 9.969033994705536e-06, + "loss": 0.0924, + "step": 765 + }, + { + "epoch": 0.06, + "grad_norm": 0.989977905168848, + "learning_rate": 9.968882172378878e-06, + "loss": 0.1558, + "step": 766 + }, + { + "epoch": 0.06, + "grad_norm": 0.3554746217510501, + "learning_rate": 9.968729979940929e-06, + "loss": 0.0783, + "step": 767 + }, + { + "epoch": 0.06, + "grad_norm": 0.3976754946972572, + "learning_rate": 9.968577417403025e-06, + "loss": 0.1003, + "step": 768 + }, + { + "epoch": 0.06, + "grad_norm": 0.29254594131500583, + "learning_rate": 9.968424484776534e-06, + "loss": 0.066, + "step": 769 + }, + { + "epoch": 0.06, + "grad_norm": 0.48740356214464453, + "learning_rate": 9.968271182072842e-06, + "loss": 0.0991, + "step": 770 + }, + { + "epoch": 0.06, + "grad_norm": 0.4838852190172392, + "learning_rate": 9.96811750930337e-06, + "loss": 0.1309, + "step": 771 + }, + { + "epoch": 0.07, + "grad_norm": 0.5184392328414776, + "learning_rate": 9.967963466479564e-06, + "loss": 0.1424, + "step": 772 + }, + { + "epoch": 0.07, + "grad_norm": 0.35862278131174913, + "learning_rate": 9.9678090536129e-06, + "loss": 0.078, + "step": 773 + }, + { + "epoch": 0.07, + "grad_norm": 0.6999597335122942, + "learning_rate": 9.967654270714878e-06, + "loss": 0.1601, + "step": 774 + }, + { + "epoch": 0.07, + "grad_norm": 0.9557062808268679, + "learning_rate": 9.967499117797026e-06, + "loss": 0.1623, + "step": 775 + }, + { + "epoch": 0.07, + "grad_norm": 0.48354655525256973, + "learning_rate": 9.967343594870901e-06, + "loss": 0.116, + "step": 776 + }, + { + "epoch": 0.07, + "grad_norm": 0.4157235700044118, + "learning_rate": 9.967187701948092e-06, + "loss": 0.08, + "step": 777 + }, + { + "epoch": 0.07, + "grad_norm": 0.3874809401929965, + "learning_rate": 9.967031439040203e-06, + "loss": 0.0862, + "step": 778 + }, + { + "epoch": 0.07, + "grad_norm": 0.8514931067533913, + "learning_rate": 9.96687480615888e-06, + "loss": 0.1254, + "step": 779 + }, + { + "epoch": 0.07, + "grad_norm": 0.833859617600702, + "learning_rate": 9.966717803315787e-06, + "loss": 0.1099, + "step": 780 + }, + { + "epoch": 0.07, + "grad_norm": 0.3880532590790286, + "learning_rate": 9.966560430522616e-06, + "loss": 0.0668, + "step": 781 + }, + { + "epoch": 0.07, + "grad_norm": 0.25742737410472455, + "learning_rate": 9.966402687791094e-06, + "loss": 0.0409, + "step": 782 + }, + { + "epoch": 0.07, + "grad_norm": 0.6584616390619324, + "learning_rate": 9.966244575132969e-06, + "loss": 0.1036, + "step": 783 + }, + { + "epoch": 0.07, + "grad_norm": 0.5591847199464093, + "learning_rate": 9.966086092560016e-06, + "loss": 0.13, + "step": 784 + }, + { + "epoch": 0.07, + "grad_norm": 0.5588944667741114, + "learning_rate": 9.965927240084041e-06, + "loss": 0.1165, + "step": 785 + }, + { + "epoch": 0.07, + "grad_norm": 0.8185564454858302, + "learning_rate": 9.965768017716878e-06, + "loss": 0.1371, + "step": 786 + }, + { + "epoch": 0.07, + "grad_norm": 0.5201127036839857, + "learning_rate": 9.965608425470384e-06, + "loss": 0.1201, + "step": 787 + }, + { + "epoch": 0.07, + "grad_norm": 0.5525468657747358, + "learning_rate": 9.965448463356449e-06, + "loss": 0.1474, + "step": 788 + }, + { + "epoch": 0.07, + "grad_norm": 0.3800766597777952, + "learning_rate": 9.965288131386985e-06, + "loss": 0.0927, + "step": 789 + }, + { + "epoch": 0.07, + "grad_norm": 0.603956284448405, + "learning_rate": 9.965127429573938e-06, + "loss": 0.0998, + "step": 790 + }, + { + "epoch": 0.07, + "grad_norm": 0.5069560620465234, + "learning_rate": 9.964966357929275e-06, + "loss": 0.1127, + "step": 791 + }, + { + "epoch": 0.07, + "grad_norm": 0.5105246084781327, + "learning_rate": 9.964804916464994e-06, + "loss": 0.0795, + "step": 792 + }, + { + "epoch": 0.07, + "grad_norm": 0.31809322360310843, + "learning_rate": 9.964643105193122e-06, + "loss": 0.0584, + "step": 793 + }, + { + "epoch": 0.07, + "grad_norm": 0.4639321177618638, + "learning_rate": 9.964480924125708e-06, + "loss": 0.0613, + "step": 794 + }, + { + "epoch": 0.07, + "grad_norm": 0.47966608852474757, + "learning_rate": 9.964318373274838e-06, + "loss": 0.1025, + "step": 795 + }, + { + "epoch": 0.07, + "grad_norm": 0.6848677806647402, + "learning_rate": 9.964155452652613e-06, + "loss": 0.1018, + "step": 796 + }, + { + "epoch": 0.07, + "grad_norm": 0.7153338135643892, + "learning_rate": 9.963992162271172e-06, + "loss": 0.1306, + "step": 797 + }, + { + "epoch": 0.07, + "grad_norm": 0.4935492060472502, + "learning_rate": 9.963828502142679e-06, + "loss": 0.1168, + "step": 798 + }, + { + "epoch": 0.07, + "grad_norm": 0.46902027441710226, + "learning_rate": 9.963664472279323e-06, + "loss": 0.1033, + "step": 799 + }, + { + "epoch": 0.07, + "grad_norm": 0.2995033907721179, + "learning_rate": 9.96350007269332e-06, + "loss": 0.0528, + "step": 800 + }, + { + "epoch": 0.07, + "grad_norm": 0.5788015858082936, + "learning_rate": 9.963335303396918e-06, + "loss": 0.1304, + "step": 801 + }, + { + "epoch": 0.07, + "grad_norm": 0.5831250618080749, + "learning_rate": 9.963170164402388e-06, + "loss": 0.131, + "step": 802 + }, + { + "epoch": 0.07, + "grad_norm": 0.37525764909733145, + "learning_rate": 9.963004655722034e-06, + "loss": 0.0756, + "step": 803 + }, + { + "epoch": 0.07, + "grad_norm": 1.7108094217246574, + "learning_rate": 9.962838777368178e-06, + "loss": 0.1184, + "step": 804 + }, + { + "epoch": 0.07, + "grad_norm": 0.46155342771347846, + "learning_rate": 9.962672529353182e-06, + "loss": 0.073, + "step": 805 + }, + { + "epoch": 0.07, + "grad_norm": 0.7613256091599936, + "learning_rate": 9.962505911689424e-06, + "loss": 0.1227, + "step": 806 + }, + { + "epoch": 0.07, + "grad_norm": 1.0658788782319246, + "learning_rate": 9.962338924389319e-06, + "loss": 0.1931, + "step": 807 + }, + { + "epoch": 0.07, + "grad_norm": 0.4292166439445262, + "learning_rate": 9.962171567465301e-06, + "loss": 0.1004, + "step": 808 + }, + { + "epoch": 0.07, + "grad_norm": 0.6077807835596586, + "learning_rate": 9.962003840929841e-06, + "loss": 0.1096, + "step": 809 + }, + { + "epoch": 0.07, + "grad_norm": 0.674077113531982, + "learning_rate": 9.961835744795428e-06, + "loss": 0.1595, + "step": 810 + }, + { + "epoch": 0.07, + "grad_norm": 0.5617260265709009, + "learning_rate": 9.961667279074582e-06, + "loss": 0.1449, + "step": 811 + }, + { + "epoch": 0.07, + "grad_norm": 4.599243693948226, + "learning_rate": 9.961498443779855e-06, + "loss": 0.1588, + "step": 812 + }, + { + "epoch": 0.07, + "grad_norm": 0.7899581434039957, + "learning_rate": 9.96132923892382e-06, + "loss": 0.0812, + "step": 813 + }, + { + "epoch": 0.07, + "grad_norm": 0.7247643298857532, + "learning_rate": 9.961159664519082e-06, + "loss": 0.1047, + "step": 814 + }, + { + "epoch": 0.07, + "grad_norm": 0.4901747082118883, + "learning_rate": 9.960989720578272e-06, + "loss": 0.1228, + "step": 815 + }, + { + "epoch": 0.07, + "grad_norm": 0.5687435358143359, + "learning_rate": 9.960819407114046e-06, + "loss": 0.0892, + "step": 816 + }, + { + "epoch": 0.07, + "grad_norm": 0.6877339802997794, + "learning_rate": 9.960648724139092e-06, + "loss": 0.0954, + "step": 817 + }, + { + "epoch": 0.07, + "grad_norm": 1.0281489558541008, + "learning_rate": 9.960477671666125e-06, + "loss": 0.1, + "step": 818 + }, + { + "epoch": 0.07, + "grad_norm": 0.6030054399436926, + "learning_rate": 9.960306249707883e-06, + "loss": 0.1285, + "step": 819 + }, + { + "epoch": 0.07, + "grad_norm": 0.42562933145037557, + "learning_rate": 9.960134458277135e-06, + "loss": 0.0641, + "step": 820 + }, + { + "epoch": 0.07, + "grad_norm": 0.505524835148414, + "learning_rate": 9.959962297386678e-06, + "loss": 0.1171, + "step": 821 + }, + { + "epoch": 0.07, + "grad_norm": 0.525205893688321, + "learning_rate": 9.959789767049336e-06, + "loss": 0.1018, + "step": 822 + }, + { + "epoch": 0.07, + "grad_norm": 0.493912094701411, + "learning_rate": 9.959616867277959e-06, + "loss": 0.1097, + "step": 823 + }, + { + "epoch": 0.07, + "grad_norm": 1.6974269836147844, + "learning_rate": 9.959443598085425e-06, + "loss": 0.1525, + "step": 824 + }, + { + "epoch": 0.07, + "grad_norm": 0.42286614725709104, + "learning_rate": 9.959269959484641e-06, + "loss": 0.0796, + "step": 825 + }, + { + "epoch": 0.07, + "grad_norm": 0.30912254854988763, + "learning_rate": 9.95909595148854e-06, + "loss": 0.0585, + "step": 826 + }, + { + "epoch": 0.07, + "grad_norm": 0.4996509572761813, + "learning_rate": 9.958921574110085e-06, + "loss": 0.1148, + "step": 827 + }, + { + "epoch": 0.07, + "grad_norm": 0.5104671959358027, + "learning_rate": 9.958746827362263e-06, + "loss": 0.0834, + "step": 828 + }, + { + "epoch": 0.07, + "grad_norm": 0.4363796896506406, + "learning_rate": 9.95857171125809e-06, + "loss": 0.1233, + "step": 829 + }, + { + "epoch": 0.07, + "grad_norm": 0.3754810621808287, + "learning_rate": 9.95839622581061e-06, + "loss": 0.0866, + "step": 830 + }, + { + "epoch": 0.07, + "grad_norm": 0.4561248286320553, + "learning_rate": 9.958220371032895e-06, + "loss": 0.0763, + "step": 831 + }, + { + "epoch": 0.07, + "grad_norm": 0.39857532983299565, + "learning_rate": 9.958044146938042e-06, + "loss": 0.0893, + "step": 832 + }, + { + "epoch": 0.07, + "grad_norm": 0.3354060942880233, + "learning_rate": 9.95786755353918e-06, + "loss": 0.0972, + "step": 833 + }, + { + "epoch": 0.07, + "grad_norm": 0.946659022183752, + "learning_rate": 9.95769059084946e-06, + "loss": 0.0904, + "step": 834 + }, + { + "epoch": 0.07, + "grad_norm": 0.4998077954112448, + "learning_rate": 9.957513258882064e-06, + "loss": 0.0841, + "step": 835 + }, + { + "epoch": 0.07, + "grad_norm": 0.29125879380710573, + "learning_rate": 9.9573355576502e-06, + "loss": 0.0667, + "step": 836 + }, + { + "epoch": 0.07, + "grad_norm": 0.5981195999337393, + "learning_rate": 9.957157487167106e-06, + "loss": 0.1174, + "step": 837 + }, + { + "epoch": 0.07, + "grad_norm": 0.3126495227050807, + "learning_rate": 9.956979047446043e-06, + "loss": 0.0601, + "step": 838 + }, + { + "epoch": 0.07, + "grad_norm": 0.5803070385773836, + "learning_rate": 9.956800238500306e-06, + "loss": 0.1091, + "step": 839 + }, + { + "epoch": 0.07, + "grad_norm": 1.1782036550619492, + "learning_rate": 9.956621060343211e-06, + "loss": 0.1121, + "step": 840 + }, + { + "epoch": 0.07, + "grad_norm": 0.46202716532304794, + "learning_rate": 9.956441512988107e-06, + "loss": 0.11, + "step": 841 + }, + { + "epoch": 0.07, + "grad_norm": 0.7369296510727624, + "learning_rate": 9.956261596448362e-06, + "loss": 0.1169, + "step": 842 + }, + { + "epoch": 0.07, + "grad_norm": 0.40711809229195994, + "learning_rate": 9.956081310737383e-06, + "loss": 0.0856, + "step": 843 + }, + { + "epoch": 0.07, + "grad_norm": 0.426743422911565, + "learning_rate": 9.955900655868598e-06, + "loss": 0.1114, + "step": 844 + }, + { + "epoch": 0.07, + "grad_norm": 0.8743227901529752, + "learning_rate": 9.95571963185546e-06, + "loss": 0.0708, + "step": 845 + }, + { + "epoch": 0.07, + "grad_norm": 0.45198407629943704, + "learning_rate": 9.955538238711456e-06, + "loss": 0.1034, + "step": 846 + }, + { + "epoch": 0.07, + "grad_norm": 0.5720107964880582, + "learning_rate": 9.955356476450093e-06, + "loss": 0.0838, + "step": 847 + }, + { + "epoch": 0.07, + "grad_norm": 1.176480626776132, + "learning_rate": 9.955174345084915e-06, + "loss": 0.1089, + "step": 848 + }, + { + "epoch": 0.07, + "grad_norm": 0.40222387309825214, + "learning_rate": 9.954991844629485e-06, + "loss": 0.0708, + "step": 849 + }, + { + "epoch": 0.07, + "grad_norm": 0.5034536977136195, + "learning_rate": 9.954808975097397e-06, + "loss": 0.0872, + "step": 850 + }, + { + "epoch": 0.07, + "grad_norm": 0.8317191033338598, + "learning_rate": 9.954625736502274e-06, + "loss": 0.1161, + "step": 851 + }, + { + "epoch": 0.07, + "grad_norm": 0.5612592392382804, + "learning_rate": 9.954442128857763e-06, + "loss": 0.1164, + "step": 852 + }, + { + "epoch": 0.07, + "grad_norm": 0.6781380989260859, + "learning_rate": 9.95425815217754e-06, + "loss": 0.0819, + "step": 853 + }, + { + "epoch": 0.07, + "grad_norm": 0.4829270128483685, + "learning_rate": 9.954073806475308e-06, + "loss": 0.1162, + "step": 854 + }, + { + "epoch": 0.07, + "grad_norm": 0.7324520660795716, + "learning_rate": 9.953889091764801e-06, + "loss": 0.1379, + "step": 855 + }, + { + "epoch": 0.07, + "grad_norm": 0.5232788888718319, + "learning_rate": 9.953704008059776e-06, + "loss": 0.0898, + "step": 856 + }, + { + "epoch": 0.07, + "grad_norm": 0.44170488636008465, + "learning_rate": 9.953518555374018e-06, + "loss": 0.0984, + "step": 857 + }, + { + "epoch": 0.07, + "grad_norm": 0.37657221779276345, + "learning_rate": 9.953332733721341e-06, + "loss": 0.0761, + "step": 858 + }, + { + "epoch": 0.07, + "grad_norm": 0.43349959330673243, + "learning_rate": 9.953146543115588e-06, + "loss": 0.0698, + "step": 859 + }, + { + "epoch": 0.07, + "grad_norm": 0.3425556857627271, + "learning_rate": 9.952959983570627e-06, + "loss": 0.049, + "step": 860 + }, + { + "epoch": 0.07, + "grad_norm": 0.4695565298025845, + "learning_rate": 9.952773055100352e-06, + "loss": 0.0697, + "step": 861 + }, + { + "epoch": 0.07, + "grad_norm": 0.40545502464561173, + "learning_rate": 9.952585757718687e-06, + "loss": 0.0749, + "step": 862 + }, + { + "epoch": 0.07, + "grad_norm": 0.7098123897720013, + "learning_rate": 9.952398091439586e-06, + "loss": 0.0891, + "step": 863 + }, + { + "epoch": 0.07, + "grad_norm": 0.646234243570636, + "learning_rate": 9.952210056277025e-06, + "loss": 0.0957, + "step": 864 + }, + { + "epoch": 0.07, + "grad_norm": 0.7637499983151023, + "learning_rate": 9.952021652245008e-06, + "loss": 0.1016, + "step": 865 + }, + { + "epoch": 0.07, + "grad_norm": 0.6106644158140575, + "learning_rate": 9.951832879357572e-06, + "loss": 0.1057, + "step": 866 + }, + { + "epoch": 0.07, + "grad_norm": 0.40197049986377603, + "learning_rate": 9.951643737628778e-06, + "loss": 0.1083, + "step": 867 + }, + { + "epoch": 0.07, + "grad_norm": 0.7298684604904201, + "learning_rate": 9.951454227072711e-06, + "loss": 0.1361, + "step": 868 + }, + { + "epoch": 0.07, + "grad_norm": 0.5917815728060243, + "learning_rate": 9.95126434770349e-06, + "loss": 0.1043, + "step": 869 + }, + { + "epoch": 0.07, + "grad_norm": 1.066448645734056, + "learning_rate": 9.951074099535256e-06, + "loss": 0.1714, + "step": 870 + }, + { + "epoch": 0.07, + "grad_norm": 0.8085208911013361, + "learning_rate": 9.950883482582182e-06, + "loss": 0.1591, + "step": 871 + }, + { + "epoch": 0.07, + "grad_norm": 0.8884557855233239, + "learning_rate": 9.950692496858466e-06, + "loss": 0.0688, + "step": 872 + }, + { + "epoch": 0.07, + "grad_norm": 0.5421268698685046, + "learning_rate": 9.95050114237833e-06, + "loss": 0.0914, + "step": 873 + }, + { + "epoch": 0.07, + "grad_norm": 0.4836414387467889, + "learning_rate": 9.950309419156033e-06, + "loss": 0.1282, + "step": 874 + }, + { + "epoch": 0.07, + "grad_norm": 0.6255306531869191, + "learning_rate": 9.95011732720585e-06, + "loss": 0.1279, + "step": 875 + }, + { + "epoch": 0.07, + "grad_norm": 1.572004815392286, + "learning_rate": 9.949924866542095e-06, + "loss": 0.1153, + "step": 876 + }, + { + "epoch": 0.07, + "grad_norm": 0.9162023296829667, + "learning_rate": 9.949732037179098e-06, + "loss": 0.1246, + "step": 877 + }, + { + "epoch": 0.07, + "grad_norm": 0.3783815537144979, + "learning_rate": 9.949538839131225e-06, + "loss": 0.0698, + "step": 878 + }, + { + "epoch": 0.07, + "grad_norm": 0.7957663871932426, + "learning_rate": 9.949345272412866e-06, + "loss": 0.1136, + "step": 879 + }, + { + "epoch": 0.07, + "grad_norm": 0.9810703595829444, + "learning_rate": 9.94915133703844e-06, + "loss": 0.107, + "step": 880 + }, + { + "epoch": 0.07, + "grad_norm": 0.3183140183894449, + "learning_rate": 9.948957033022392e-06, + "loss": 0.0828, + "step": 881 + }, + { + "epoch": 0.07, + "grad_norm": 0.9714247586876741, + "learning_rate": 9.948762360379193e-06, + "loss": 0.138, + "step": 882 + }, + { + "epoch": 0.07, + "grad_norm": 0.5387107070689463, + "learning_rate": 9.948567319123345e-06, + "loss": 0.1302, + "step": 883 + }, + { + "epoch": 0.07, + "grad_norm": 0.5732754717833087, + "learning_rate": 9.948371909269376e-06, + "loss": 0.1251, + "step": 884 + }, + { + "epoch": 0.07, + "grad_norm": 0.4060247681359065, + "learning_rate": 9.948176130831842e-06, + "loss": 0.0641, + "step": 885 + }, + { + "epoch": 0.07, + "grad_norm": 0.5297539830451976, + "learning_rate": 9.947979983825322e-06, + "loss": 0.0792, + "step": 886 + }, + { + "epoch": 0.07, + "grad_norm": 0.5361088800134761, + "learning_rate": 9.94778346826443e-06, + "loss": 0.0808, + "step": 887 + }, + { + "epoch": 0.07, + "grad_norm": 0.5276001713735403, + "learning_rate": 9.947586584163802e-06, + "loss": 0.1271, + "step": 888 + }, + { + "epoch": 0.07, + "grad_norm": 0.5471188989182744, + "learning_rate": 9.947389331538105e-06, + "loss": 0.1177, + "step": 889 + }, + { + "epoch": 0.07, + "grad_norm": 0.5407686032517404, + "learning_rate": 9.947191710402027e-06, + "loss": 0.0862, + "step": 890 + }, + { + "epoch": 0.08, + "grad_norm": 0.4135873825953957, + "learning_rate": 9.946993720770293e-06, + "loss": 0.0895, + "step": 891 + }, + { + "epoch": 0.08, + "grad_norm": 0.5771589147281307, + "learning_rate": 9.946795362657647e-06, + "loss": 0.1021, + "step": 892 + }, + { + "epoch": 0.08, + "grad_norm": 0.6657756866399844, + "learning_rate": 9.946596636078866e-06, + "loss": 0.1135, + "step": 893 + }, + { + "epoch": 0.08, + "grad_norm": 0.4902035318995722, + "learning_rate": 9.946397541048751e-06, + "loss": 0.08, + "step": 894 + }, + { + "epoch": 0.08, + "grad_norm": 0.6956332496909995, + "learning_rate": 9.946198077582132e-06, + "loss": 0.1385, + "step": 895 + }, + { + "epoch": 0.08, + "grad_norm": 0.249718410001851, + "learning_rate": 9.945998245693867e-06, + "loss": 0.0543, + "step": 896 + }, + { + "epoch": 0.08, + "grad_norm": 0.45862268046293814, + "learning_rate": 9.94579804539884e-06, + "loss": 0.1106, + "step": 897 + }, + { + "epoch": 0.08, + "grad_norm": 0.39291985700628385, + "learning_rate": 9.945597476711964e-06, + "loss": 0.0495, + "step": 898 + }, + { + "epoch": 0.08, + "grad_norm": 0.27893399204500074, + "learning_rate": 9.945396539648176e-06, + "loss": 0.0594, + "step": 899 + }, + { + "epoch": 0.08, + "grad_norm": 0.7587112220764212, + "learning_rate": 9.945195234222446e-06, + "loss": 0.1263, + "step": 900 + }, + { + "epoch": 0.08, + "grad_norm": 0.5297318027887641, + "learning_rate": 9.944993560449768e-06, + "loss": 0.1083, + "step": 901 + }, + { + "epoch": 0.08, + "grad_norm": 0.4693822298788793, + "learning_rate": 9.94479151834516e-06, + "loss": 0.1078, + "step": 902 + }, + { + "epoch": 0.08, + "grad_norm": 0.508028839304705, + "learning_rate": 9.944589107923676e-06, + "loss": 0.1133, + "step": 903 + }, + { + "epoch": 0.08, + "grad_norm": 0.8253543588625101, + "learning_rate": 9.944386329200391e-06, + "loss": 0.1025, + "step": 904 + }, + { + "epoch": 0.08, + "grad_norm": 0.8330281963970702, + "learning_rate": 9.944183182190408e-06, + "loss": 0.1134, + "step": 905 + }, + { + "epoch": 0.08, + "grad_norm": 0.6005433302126857, + "learning_rate": 9.943979666908861e-06, + "loss": 0.0847, + "step": 906 + }, + { + "epoch": 0.08, + "grad_norm": 0.7003510994167398, + "learning_rate": 9.943775783370906e-06, + "loss": 0.1316, + "step": 907 + }, + { + "epoch": 0.08, + "grad_norm": 0.39904863185992173, + "learning_rate": 9.943571531591731e-06, + "loss": 0.0963, + "step": 908 + }, + { + "epoch": 0.08, + "grad_norm": 0.47613970749742557, + "learning_rate": 9.943366911586552e-06, + "loss": 0.1008, + "step": 909 + }, + { + "epoch": 0.08, + "grad_norm": 0.4679149234016634, + "learning_rate": 9.943161923370606e-06, + "loss": 0.0715, + "step": 910 + }, + { + "epoch": 0.08, + "grad_norm": 0.2728335271209151, + "learning_rate": 9.942956566959166e-06, + "loss": 0.0726, + "step": 911 + }, + { + "epoch": 0.08, + "grad_norm": 0.35873063294503904, + "learning_rate": 9.942750842367523e-06, + "loss": 0.0729, + "step": 912 + }, + { + "epoch": 0.08, + "grad_norm": 0.4369990270673701, + "learning_rate": 9.942544749611006e-06, + "loss": 0.0707, + "step": 913 + }, + { + "epoch": 0.08, + "grad_norm": 1.1984256340236767, + "learning_rate": 9.942338288704963e-06, + "loss": 0.0732, + "step": 914 + }, + { + "epoch": 0.08, + "grad_norm": 0.3918726128129519, + "learning_rate": 9.942131459664773e-06, + "loss": 0.0934, + "step": 915 + }, + { + "epoch": 0.08, + "grad_norm": 0.4557919381988022, + "learning_rate": 9.941924262505842e-06, + "loss": 0.0734, + "step": 916 + }, + { + "epoch": 0.08, + "grad_norm": 0.32419548515158414, + "learning_rate": 9.941716697243602e-06, + "loss": 0.0691, + "step": 917 + }, + { + "epoch": 0.08, + "grad_norm": 0.4902540754809835, + "learning_rate": 9.941508763893517e-06, + "loss": 0.0917, + "step": 918 + }, + { + "epoch": 0.08, + "grad_norm": 0.6426430143885548, + "learning_rate": 9.941300462471072e-06, + "loss": 0.1201, + "step": 919 + }, + { + "epoch": 0.08, + "grad_norm": 0.367322313248295, + "learning_rate": 9.941091792991784e-06, + "loss": 0.0639, + "step": 920 + }, + { + "epoch": 0.08, + "grad_norm": 0.5801069845898432, + "learning_rate": 9.940882755471194e-06, + "loss": 0.0692, + "step": 921 + }, + { + "epoch": 0.08, + "grad_norm": 0.6903967584542379, + "learning_rate": 9.940673349924871e-06, + "loss": 0.1211, + "step": 922 + }, + { + "epoch": 0.08, + "grad_norm": 0.4469218576415177, + "learning_rate": 9.94046357636842e-06, + "loss": 0.0754, + "step": 923 + }, + { + "epoch": 0.08, + "grad_norm": 0.6154049308241845, + "learning_rate": 9.94025343481746e-06, + "loss": 0.0497, + "step": 924 + }, + { + "epoch": 0.08, + "grad_norm": 0.37459787815271606, + "learning_rate": 9.940042925287644e-06, + "loss": 0.0636, + "step": 925 + }, + { + "epoch": 0.08, + "grad_norm": 0.5556325154089106, + "learning_rate": 9.939832047794654e-06, + "loss": 0.1139, + "step": 926 + }, + { + "epoch": 0.08, + "grad_norm": 0.5146092489930145, + "learning_rate": 9.939620802354196e-06, + "loss": 0.0919, + "step": 927 + }, + { + "epoch": 0.08, + "grad_norm": 0.4175804510343337, + "learning_rate": 9.939409188982005e-06, + "loss": 0.1157, + "step": 928 + }, + { + "epoch": 0.08, + "grad_norm": 0.35352566125519436, + "learning_rate": 9.939197207693846e-06, + "loss": 0.0844, + "step": 929 + }, + { + "epoch": 0.08, + "grad_norm": 0.442223706881349, + "learning_rate": 9.938984858505502e-06, + "loss": 0.1058, + "step": 930 + }, + { + "epoch": 0.08, + "grad_norm": 0.5304650289380314, + "learning_rate": 9.938772141432797e-06, + "loss": 0.0987, + "step": 931 + }, + { + "epoch": 0.08, + "grad_norm": 0.49839618608713415, + "learning_rate": 9.93855905649157e-06, + "loss": 0.1016, + "step": 932 + }, + { + "epoch": 0.08, + "grad_norm": 0.8508442700002354, + "learning_rate": 9.938345603697697e-06, + "loss": 0.17, + "step": 933 + }, + { + "epoch": 0.08, + "grad_norm": 0.5583779962048626, + "learning_rate": 9.938131783067076e-06, + "loss": 0.105, + "step": 934 + }, + { + "epoch": 0.08, + "grad_norm": 0.7386437313777376, + "learning_rate": 9.937917594615631e-06, + "loss": 0.1293, + "step": 935 + }, + { + "epoch": 0.08, + "grad_norm": 0.4899393831383726, + "learning_rate": 9.937703038359318e-06, + "loss": 0.1151, + "step": 936 + }, + { + "epoch": 0.08, + "grad_norm": 0.5959114212642134, + "learning_rate": 9.937488114314121e-06, + "loss": 0.0915, + "step": 937 + }, + { + "epoch": 0.08, + "grad_norm": 0.4070676867806631, + "learning_rate": 9.937272822496045e-06, + "loss": 0.0786, + "step": 938 + }, + { + "epoch": 0.08, + "grad_norm": 0.4454390145846921, + "learning_rate": 9.937057162921127e-06, + "loss": 0.089, + "step": 939 + }, + { + "epoch": 0.08, + "grad_norm": 0.5180370602475179, + "learning_rate": 9.936841135605431e-06, + "loss": 0.0997, + "step": 940 + }, + { + "epoch": 0.08, + "grad_norm": 0.5010252045878886, + "learning_rate": 9.936624740565049e-06, + "loss": 0.063, + "step": 941 + }, + { + "epoch": 0.08, + "grad_norm": 0.687365184479091, + "learning_rate": 9.936407977816097e-06, + "loss": 0.115, + "step": 942 + }, + { + "epoch": 0.08, + "grad_norm": 0.517935305443366, + "learning_rate": 9.936190847374722e-06, + "loss": 0.0744, + "step": 943 + }, + { + "epoch": 0.08, + "grad_norm": 0.5101032241181154, + "learning_rate": 9.935973349257099e-06, + "loss": 0.0982, + "step": 944 + }, + { + "epoch": 0.08, + "grad_norm": 0.4784360275682416, + "learning_rate": 9.935755483479426e-06, + "loss": 0.1002, + "step": 945 + }, + { + "epoch": 0.08, + "grad_norm": 0.39323509229697784, + "learning_rate": 9.935537250057932e-06, + "loss": 0.0797, + "step": 946 + }, + { + "epoch": 0.08, + "grad_norm": 0.3495108135403981, + "learning_rate": 9.93531864900887e-06, + "loss": 0.0848, + "step": 947 + }, + { + "epoch": 0.08, + "grad_norm": 0.46824795453218726, + "learning_rate": 9.935099680348527e-06, + "loss": 0.1003, + "step": 948 + }, + { + "epoch": 0.08, + "grad_norm": 0.32784264690700776, + "learning_rate": 9.93488034409321e-06, + "loss": 0.0489, + "step": 949 + }, + { + "epoch": 0.08, + "grad_norm": 0.5803597963753291, + "learning_rate": 9.934660640259258e-06, + "loss": 0.0734, + "step": 950 + }, + { + "epoch": 0.08, + "grad_norm": 0.25319968410962723, + "learning_rate": 9.934440568863033e-06, + "loss": 0.0633, + "step": 951 + }, + { + "epoch": 0.08, + "grad_norm": 0.42073432948788536, + "learning_rate": 9.93422012992093e-06, + "loss": 0.1026, + "step": 952 + }, + { + "epoch": 0.08, + "grad_norm": 0.34130394400624925, + "learning_rate": 9.933999323449367e-06, + "loss": 0.0817, + "step": 953 + }, + { + "epoch": 0.08, + "grad_norm": 0.3629272970857018, + "learning_rate": 9.933778149464794e-06, + "loss": 0.0758, + "step": 954 + }, + { + "epoch": 0.08, + "grad_norm": 0.27860397693080197, + "learning_rate": 9.93355660798368e-06, + "loss": 0.0902, + "step": 955 + }, + { + "epoch": 0.08, + "grad_norm": 0.3867955268411471, + "learning_rate": 9.933334699022532e-06, + "loss": 0.0765, + "step": 956 + }, + { + "epoch": 0.08, + "grad_norm": 0.6563781763193827, + "learning_rate": 9.933112422597878e-06, + "loss": 0.0878, + "step": 957 + }, + { + "epoch": 0.08, + "grad_norm": 0.46182489337509625, + "learning_rate": 9.932889778726272e-06, + "loss": 0.0869, + "step": 958 + }, + { + "epoch": 0.08, + "grad_norm": 0.6221781367469542, + "learning_rate": 9.932666767424297e-06, + "loss": 0.1242, + "step": 959 + }, + { + "epoch": 0.08, + "grad_norm": 0.2744635190303476, + "learning_rate": 9.932443388708569e-06, + "loss": 0.0784, + "step": 960 + }, + { + "epoch": 0.08, + "grad_norm": 0.4467377845641006, + "learning_rate": 9.932219642595721e-06, + "loss": 0.0783, + "step": 961 + }, + { + "epoch": 0.08, + "grad_norm": 0.5214324734498765, + "learning_rate": 9.931995529102424e-06, + "loss": 0.1351, + "step": 962 + }, + { + "epoch": 0.08, + "grad_norm": 0.35273439576889903, + "learning_rate": 9.931771048245368e-06, + "loss": 0.0466, + "step": 963 + }, + { + "epoch": 0.08, + "grad_norm": 0.37684358051357436, + "learning_rate": 9.931546200041275e-06, + "loss": 0.094, + "step": 964 + }, + { + "epoch": 0.08, + "grad_norm": 0.5229111731524864, + "learning_rate": 9.931320984506893e-06, + "loss": 0.1248, + "step": 965 + }, + { + "epoch": 0.08, + "grad_norm": 0.48753037028244073, + "learning_rate": 9.931095401658997e-06, + "loss": 0.0986, + "step": 966 + }, + { + "epoch": 0.08, + "grad_norm": 0.5456155379414487, + "learning_rate": 9.930869451514389e-06, + "loss": 0.1154, + "step": 967 + }, + { + "epoch": 0.08, + "grad_norm": 0.546772992013245, + "learning_rate": 9.930643134089902e-06, + "loss": 0.1335, + "step": 968 + }, + { + "epoch": 0.08, + "grad_norm": 0.43968362181394827, + "learning_rate": 9.930416449402388e-06, + "loss": 0.0891, + "step": 969 + }, + { + "epoch": 0.08, + "grad_norm": 0.3959336312149664, + "learning_rate": 9.930189397468737e-06, + "loss": 0.0943, + "step": 970 + }, + { + "epoch": 0.08, + "grad_norm": 0.5100112507332921, + "learning_rate": 9.929961978305857e-06, + "loss": 0.088, + "step": 971 + }, + { + "epoch": 0.08, + "grad_norm": 0.6399681127358918, + "learning_rate": 9.929734191930694e-06, + "loss": 0.1012, + "step": 972 + }, + { + "epoch": 0.08, + "grad_norm": 0.4281279157910534, + "learning_rate": 9.929506038360206e-06, + "loss": 0.1391, + "step": 973 + }, + { + "epoch": 0.08, + "grad_norm": 0.4418617836166927, + "learning_rate": 9.929277517611396e-06, + "loss": 0.0929, + "step": 974 + }, + { + "epoch": 0.08, + "grad_norm": 0.42468150051652975, + "learning_rate": 9.929048629701278e-06, + "loss": 0.0795, + "step": 975 + }, + { + "epoch": 0.08, + "grad_norm": 0.4038052014589253, + "learning_rate": 9.928819374646905e-06, + "loss": 0.0679, + "step": 976 + }, + { + "epoch": 0.08, + "grad_norm": 0.6334702580966, + "learning_rate": 9.928589752465352e-06, + "loss": 0.0968, + "step": 977 + }, + { + "epoch": 0.08, + "grad_norm": 0.3467438326724044, + "learning_rate": 9.928359763173725e-06, + "loss": 0.0984, + "step": 978 + }, + { + "epoch": 0.08, + "grad_norm": 0.7294607316698436, + "learning_rate": 9.92812940678915e-06, + "loss": 0.132, + "step": 979 + }, + { + "epoch": 0.08, + "grad_norm": 0.5790548331738603, + "learning_rate": 9.927898683328792e-06, + "loss": 0.0962, + "step": 980 + }, + { + "epoch": 0.08, + "grad_norm": 0.43412295110140336, + "learning_rate": 9.927667592809831e-06, + "loss": 0.1282, + "step": 981 + }, + { + "epoch": 0.08, + "grad_norm": 0.6415324164016171, + "learning_rate": 9.927436135249482e-06, + "loss": 0.0979, + "step": 982 + }, + { + "epoch": 0.08, + "grad_norm": 0.45416709276651535, + "learning_rate": 9.927204310664985e-06, + "loss": 0.0685, + "step": 983 + }, + { + "epoch": 0.08, + "grad_norm": 0.5067231909032829, + "learning_rate": 9.926972119073608e-06, + "loss": 0.0901, + "step": 984 + }, + { + "epoch": 0.08, + "grad_norm": 0.5980457141880093, + "learning_rate": 9.926739560492646e-06, + "loss": 0.088, + "step": 985 + }, + { + "epoch": 0.08, + "grad_norm": 0.5618607065814243, + "learning_rate": 9.926506634939421e-06, + "loss": 0.117, + "step": 986 + }, + { + "epoch": 0.08, + "grad_norm": 0.6693119391162285, + "learning_rate": 9.926273342431283e-06, + "loss": 0.1533, + "step": 987 + }, + { + "epoch": 0.08, + "grad_norm": 0.40272871118215053, + "learning_rate": 9.92603968298561e-06, + "loss": 0.1014, + "step": 988 + }, + { + "epoch": 0.08, + "grad_norm": 0.5045596281735716, + "learning_rate": 9.925805656619803e-06, + "loss": 0.0921, + "step": 989 + }, + { + "epoch": 0.08, + "grad_norm": 0.6801264648777448, + "learning_rate": 9.925571263351298e-06, + "loss": 0.1318, + "step": 990 + }, + { + "epoch": 0.08, + "grad_norm": 0.43553563582117355, + "learning_rate": 9.925336503197549e-06, + "loss": 0.0836, + "step": 991 + }, + { + "epoch": 0.08, + "grad_norm": 0.5421283320729043, + "learning_rate": 9.925101376176045e-06, + "loss": 0.1341, + "step": 992 + }, + { + "epoch": 0.08, + "grad_norm": 0.3604632761717536, + "learning_rate": 9.924865882304302e-06, + "loss": 0.095, + "step": 993 + }, + { + "epoch": 0.08, + "grad_norm": 0.3705815192143719, + "learning_rate": 9.924630021599857e-06, + "loss": 0.0545, + "step": 994 + }, + { + "epoch": 0.08, + "grad_norm": 0.542093934049776, + "learning_rate": 9.92439379408028e-06, + "loss": 0.1168, + "step": 995 + }, + { + "epoch": 0.08, + "grad_norm": 0.5724655742366311, + "learning_rate": 9.924157199763167e-06, + "loss": 0.1044, + "step": 996 + }, + { + "epoch": 0.08, + "grad_norm": 0.65819677817133, + "learning_rate": 9.92392023866614e-06, + "loss": 0.0873, + "step": 997 + }, + { + "epoch": 0.08, + "grad_norm": 0.6581220702580841, + "learning_rate": 9.923682910806851e-06, + "loss": 0.1279, + "step": 998 + }, + { + "epoch": 0.08, + "grad_norm": 0.476709092484444, + "learning_rate": 9.923445216202976e-06, + "loss": 0.082, + "step": 999 + }, + { + "epoch": 0.08, + "grad_norm": 0.5575590150644242, + "learning_rate": 9.923207154872218e-06, + "loss": 0.1138, + "step": 1000 + }, + { + "epoch": 0.08, + "grad_norm": 0.6128587692683094, + "learning_rate": 9.922968726832313e-06, + "loss": 0.1005, + "step": 1001 + }, + { + "epoch": 0.08, + "grad_norm": 0.5071336045917442, + "learning_rate": 9.92272993210102e-06, + "loss": 0.0656, + "step": 1002 + }, + { + "epoch": 0.08, + "grad_norm": 0.29330233235088676, + "learning_rate": 9.922490770696123e-06, + "loss": 0.0878, + "step": 1003 + }, + { + "epoch": 0.08, + "grad_norm": 0.2943264595247416, + "learning_rate": 9.922251242635436e-06, + "loss": 0.0674, + "step": 1004 + }, + { + "epoch": 0.08, + "grad_norm": 0.49734799853936823, + "learning_rate": 9.922011347936806e-06, + "loss": 0.1322, + "step": 1005 + }, + { + "epoch": 0.08, + "grad_norm": 0.9336663184860172, + "learning_rate": 9.921771086618095e-06, + "loss": 0.1563, + "step": 1006 + }, + { + "epoch": 0.08, + "grad_norm": 0.5539684019936046, + "learning_rate": 9.921530458697205e-06, + "loss": 0.1084, + "step": 1007 + }, + { + "epoch": 0.08, + "grad_norm": 0.3655790323392977, + "learning_rate": 9.921289464192054e-06, + "loss": 0.0686, + "step": 1008 + }, + { + "epoch": 0.09, + "grad_norm": 0.7679399812751885, + "learning_rate": 9.921048103120595e-06, + "loss": 0.1128, + "step": 1009 + }, + { + "epoch": 0.09, + "grad_norm": 0.22764904500185687, + "learning_rate": 9.920806375500806e-06, + "loss": 0.0476, + "step": 1010 + }, + { + "epoch": 0.09, + "grad_norm": 0.35019030363796383, + "learning_rate": 9.920564281350694e-06, + "loss": 0.0725, + "step": 1011 + }, + { + "epoch": 0.09, + "grad_norm": 0.5336785567156759, + "learning_rate": 9.92032182068829e-06, + "loss": 0.1403, + "step": 1012 + }, + { + "epoch": 0.09, + "grad_norm": 0.8570179840004625, + "learning_rate": 9.920078993531653e-06, + "loss": 0.1512, + "step": 1013 + }, + { + "epoch": 0.09, + "grad_norm": 0.6480474164209786, + "learning_rate": 9.919835799898869e-06, + "loss": 0.1009, + "step": 1014 + }, + { + "epoch": 0.09, + "grad_norm": 0.3648490110596108, + "learning_rate": 9.919592239808055e-06, + "loss": 0.1028, + "step": 1015 + }, + { + "epoch": 0.09, + "grad_norm": 0.49863438479900063, + "learning_rate": 9.919348313277354e-06, + "loss": 0.095, + "step": 1016 + }, + { + "epoch": 0.09, + "grad_norm": 0.3228715815147967, + "learning_rate": 9.919104020324934e-06, + "loss": 0.0666, + "step": 1017 + }, + { + "epoch": 0.09, + "grad_norm": 0.20053838825238074, + "learning_rate": 9.918859360968988e-06, + "loss": 0.0355, + "step": 1018 + }, + { + "epoch": 0.09, + "grad_norm": 1.0283913521891452, + "learning_rate": 9.918614335227745e-06, + "loss": 0.1414, + "step": 1019 + }, + { + "epoch": 0.09, + "grad_norm": 0.6893735105584714, + "learning_rate": 9.918368943119452e-06, + "loss": 0.1006, + "step": 1020 + }, + { + "epoch": 0.09, + "grad_norm": 0.4733481721002276, + "learning_rate": 9.918123184662388e-06, + "loss": 0.0794, + "step": 1021 + }, + { + "epoch": 0.09, + "grad_norm": 0.34936358745974627, + "learning_rate": 9.917877059874859e-06, + "loss": 0.0382, + "step": 1022 + }, + { + "epoch": 0.09, + "grad_norm": 0.6798964286310596, + "learning_rate": 9.917630568775199e-06, + "loss": 0.1216, + "step": 1023 + }, + { + "epoch": 0.09, + "grad_norm": 0.5058318622927875, + "learning_rate": 9.917383711381765e-06, + "loss": 0.0975, + "step": 1024 + }, + { + "epoch": 0.09, + "grad_norm": 0.3422759490296814, + "learning_rate": 9.917136487712949e-06, + "loss": 0.084, + "step": 1025 + }, + { + "epoch": 0.09, + "grad_norm": 0.5241334085758241, + "learning_rate": 9.916888897787162e-06, + "loss": 0.0918, + "step": 1026 + }, + { + "epoch": 0.09, + "grad_norm": 0.6320630403952675, + "learning_rate": 9.916640941622846e-06, + "loss": 0.1606, + "step": 1027 + }, + { + "epoch": 0.09, + "grad_norm": 0.3102672092593467, + "learning_rate": 9.916392619238471e-06, + "loss": 0.0809, + "step": 1028 + }, + { + "epoch": 0.09, + "grad_norm": 0.4547331000766038, + "learning_rate": 9.916143930652535e-06, + "loss": 0.0823, + "step": 1029 + }, + { + "epoch": 0.09, + "grad_norm": 0.3847028658721494, + "learning_rate": 9.915894875883558e-06, + "loss": 0.0777, + "step": 1030 + }, + { + "epoch": 0.09, + "grad_norm": 0.6285060513806519, + "learning_rate": 9.915645454950095e-06, + "loss": 0.1094, + "step": 1031 + }, + { + "epoch": 0.09, + "grad_norm": 0.3964929309546552, + "learning_rate": 9.915395667870725e-06, + "loss": 0.1103, + "step": 1032 + }, + { + "epoch": 0.09, + "grad_norm": 0.3337567372507969, + "learning_rate": 9.915145514664048e-06, + "loss": 0.0621, + "step": 1033 + }, + { + "epoch": 0.09, + "grad_norm": 0.2195503200503558, + "learning_rate": 9.9148949953487e-06, + "loss": 0.0566, + "step": 1034 + }, + { + "epoch": 0.09, + "grad_norm": 0.24031435739653853, + "learning_rate": 9.914644109943344e-06, + "loss": 0.0664, + "step": 1035 + }, + { + "epoch": 0.09, + "grad_norm": 0.48036937905615945, + "learning_rate": 9.914392858466663e-06, + "loss": 0.0886, + "step": 1036 + }, + { + "epoch": 0.09, + "grad_norm": 1.3410370303034023, + "learning_rate": 9.914141240937374e-06, + "loss": 0.1427, + "step": 1037 + }, + { + "epoch": 0.09, + "grad_norm": 0.3828778225793709, + "learning_rate": 9.913889257374219e-06, + "loss": 0.1187, + "step": 1038 + }, + { + "epoch": 0.09, + "grad_norm": 0.4221240304971868, + "learning_rate": 9.913636907795965e-06, + "loss": 0.0758, + "step": 1039 + }, + { + "epoch": 0.09, + "grad_norm": 0.4195364712905812, + "learning_rate": 9.913384192221412e-06, + "loss": 0.0602, + "step": 1040 + }, + { + "epoch": 0.09, + "grad_norm": 0.4151616454292022, + "learning_rate": 9.913131110669381e-06, + "loss": 0.0936, + "step": 1041 + }, + { + "epoch": 0.09, + "grad_norm": 0.34605635321497047, + "learning_rate": 9.912877663158722e-06, + "loss": 0.0887, + "step": 1042 + }, + { + "epoch": 0.09, + "grad_norm": 0.6411776233300411, + "learning_rate": 9.912623849708318e-06, + "loss": 0.0875, + "step": 1043 + }, + { + "epoch": 0.09, + "grad_norm": 0.4810940811302461, + "learning_rate": 9.91236967033707e-06, + "loss": 0.1379, + "step": 1044 + }, + { + "epoch": 0.09, + "grad_norm": 0.9707370391468721, + "learning_rate": 9.912115125063912e-06, + "loss": 0.1712, + "step": 1045 + }, + { + "epoch": 0.09, + "grad_norm": 0.7202689951462978, + "learning_rate": 9.911860213907804e-06, + "loss": 0.1253, + "step": 1046 + }, + { + "epoch": 0.09, + "grad_norm": 0.6792711543976911, + "learning_rate": 9.911604936887735e-06, + "loss": 0.0773, + "step": 1047 + }, + { + "epoch": 0.09, + "grad_norm": 0.5135716960783598, + "learning_rate": 9.911349294022715e-06, + "loss": 0.0846, + "step": 1048 + }, + { + "epoch": 0.09, + "grad_norm": 0.4515634359489102, + "learning_rate": 9.911093285331791e-06, + "loss": 0.1342, + "step": 1049 + }, + { + "epoch": 0.09, + "grad_norm": 0.43380882297670487, + "learning_rate": 9.910836910834032e-06, + "loss": 0.0915, + "step": 1050 + }, + { + "epoch": 0.09, + "grad_norm": 0.9496643062501857, + "learning_rate": 9.910580170548529e-06, + "loss": 0.1456, + "step": 1051 + }, + { + "epoch": 0.09, + "grad_norm": 0.6626901464785682, + "learning_rate": 9.910323064494412e-06, + "loss": 0.1378, + "step": 1052 + }, + { + "epoch": 0.09, + "grad_norm": 0.4920832816123536, + "learning_rate": 9.910065592690825e-06, + "loss": 0.0774, + "step": 1053 + }, + { + "epoch": 0.09, + "grad_norm": 0.3921658583848475, + "learning_rate": 9.90980775515695e-06, + "loss": 0.0847, + "step": 1054 + }, + { + "epoch": 0.09, + "grad_norm": 0.4366545173875459, + "learning_rate": 9.909549551911992e-06, + "loss": 0.1247, + "step": 1055 + }, + { + "epoch": 0.09, + "grad_norm": 0.5231216282483795, + "learning_rate": 9.909290982975184e-06, + "loss": 0.1228, + "step": 1056 + }, + { + "epoch": 0.09, + "grad_norm": 0.4802973975040235, + "learning_rate": 9.909032048365785e-06, + "loss": 0.1007, + "step": 1057 + }, + { + "epoch": 0.09, + "grad_norm": 0.5210139266584852, + "learning_rate": 9.908772748103081e-06, + "loss": 0.0907, + "step": 1058 + }, + { + "epoch": 0.09, + "grad_norm": 0.3845995009249425, + "learning_rate": 9.908513082206386e-06, + "loss": 0.106, + "step": 1059 + }, + { + "epoch": 0.09, + "grad_norm": 0.38921245445975855, + "learning_rate": 9.908253050695045e-06, + "loss": 0.0789, + "step": 1060 + }, + { + "epoch": 0.09, + "grad_norm": 0.3381470836574768, + "learning_rate": 9.907992653588422e-06, + "loss": 0.073, + "step": 1061 + }, + { + "epoch": 0.09, + "grad_norm": 0.3196970133080993, + "learning_rate": 9.907731890905916e-06, + "loss": 0.0717, + "step": 1062 + }, + { + "epoch": 0.09, + "grad_norm": 0.5663861820185041, + "learning_rate": 9.907470762666949e-06, + "loss": 0.1119, + "step": 1063 + }, + { + "epoch": 0.09, + "grad_norm": 0.4061645650360372, + "learning_rate": 9.90720926889097e-06, + "loss": 0.0668, + "step": 1064 + }, + { + "epoch": 0.09, + "grad_norm": 0.2823553293931554, + "learning_rate": 9.90694740959746e-06, + "loss": 0.0525, + "step": 1065 + }, + { + "epoch": 0.09, + "grad_norm": 0.31584063344088514, + "learning_rate": 9.906685184805921e-06, + "loss": 0.1093, + "step": 1066 + }, + { + "epoch": 0.09, + "grad_norm": 0.3740090404312078, + "learning_rate": 9.906422594535886e-06, + "loss": 0.0846, + "step": 1067 + }, + { + "epoch": 0.09, + "grad_norm": 0.4108001189983455, + "learning_rate": 9.906159638806914e-06, + "loss": 0.0846, + "step": 1068 + }, + { + "epoch": 0.09, + "grad_norm": 0.4246002189512503, + "learning_rate": 9.905896317638592e-06, + "loss": 0.0735, + "step": 1069 + }, + { + "epoch": 0.09, + "grad_norm": 0.37410289406831143, + "learning_rate": 9.905632631050533e-06, + "loss": 0.0645, + "step": 1070 + }, + { + "epoch": 0.09, + "grad_norm": 0.6430449137310531, + "learning_rate": 9.905368579062378e-06, + "loss": 0.141, + "step": 1071 + }, + { + "epoch": 0.09, + "grad_norm": 0.574593934666374, + "learning_rate": 9.905104161693794e-06, + "loss": 0.1142, + "step": 1072 + }, + { + "epoch": 0.09, + "grad_norm": 0.5436024484444233, + "learning_rate": 9.90483937896448e-06, + "loss": 0.1001, + "step": 1073 + }, + { + "epoch": 0.09, + "grad_norm": 0.7986589416527862, + "learning_rate": 9.904574230894157e-06, + "loss": 0.1362, + "step": 1074 + }, + { + "epoch": 0.09, + "grad_norm": 0.6530614296960888, + "learning_rate": 9.904308717502571e-06, + "loss": 0.1364, + "step": 1075 + }, + { + "epoch": 0.09, + "grad_norm": 0.4907342606682929, + "learning_rate": 9.904042838809504e-06, + "loss": 0.0905, + "step": 1076 + }, + { + "epoch": 0.09, + "grad_norm": 0.47899382328311724, + "learning_rate": 9.903776594834758e-06, + "loss": 0.077, + "step": 1077 + }, + { + "epoch": 0.09, + "grad_norm": 0.5307033651809876, + "learning_rate": 9.903509985598165e-06, + "loss": 0.0768, + "step": 1078 + }, + { + "epoch": 0.09, + "grad_norm": 0.554800074581126, + "learning_rate": 9.903243011119582e-06, + "loss": 0.1056, + "step": 1079 + }, + { + "epoch": 0.09, + "grad_norm": 0.6025689043752012, + "learning_rate": 9.902975671418897e-06, + "loss": 0.105, + "step": 1080 + }, + { + "epoch": 0.09, + "grad_norm": 0.5074997704067946, + "learning_rate": 9.902707966516021e-06, + "loss": 0.0778, + "step": 1081 + }, + { + "epoch": 0.09, + "grad_norm": 0.36432689618268954, + "learning_rate": 9.902439896430896e-06, + "loss": 0.0943, + "step": 1082 + }, + { + "epoch": 0.09, + "grad_norm": 0.40341345823234764, + "learning_rate": 9.902171461183489e-06, + "loss": 0.099, + "step": 1083 + }, + { + "epoch": 0.09, + "grad_norm": 0.19663950841937716, + "learning_rate": 9.901902660793793e-06, + "loss": 0.0432, + "step": 1084 + }, + { + "epoch": 0.09, + "grad_norm": 0.6729331397948428, + "learning_rate": 9.901633495281834e-06, + "loss": 0.1004, + "step": 1085 + }, + { + "epoch": 0.09, + "grad_norm": 0.39267698882367663, + "learning_rate": 9.901363964667653e-06, + "loss": 0.0966, + "step": 1086 + }, + { + "epoch": 0.09, + "grad_norm": 0.7728266937268466, + "learning_rate": 9.901094068971336e-06, + "loss": 0.1152, + "step": 1087 + }, + { + "epoch": 0.09, + "grad_norm": 0.6320378172217919, + "learning_rate": 9.90082380821298e-06, + "loss": 0.1183, + "step": 1088 + }, + { + "epoch": 0.09, + "grad_norm": 0.45407875871636105, + "learning_rate": 9.900553182412717e-06, + "loss": 0.0816, + "step": 1089 + }, + { + "epoch": 0.09, + "grad_norm": 0.8374015950204459, + "learning_rate": 9.900282191590707e-06, + "loss": 0.1751, + "step": 1090 + }, + { + "epoch": 0.09, + "grad_norm": 0.2432323114130031, + "learning_rate": 9.900010835767132e-06, + "loss": 0.0404, + "step": 1091 + }, + { + "epoch": 0.09, + "grad_norm": 0.5980545050136095, + "learning_rate": 9.899739114962206e-06, + "loss": 0.1222, + "step": 1092 + }, + { + "epoch": 0.09, + "grad_norm": 0.4742334049973117, + "learning_rate": 9.899467029196166e-06, + "loss": 0.0817, + "step": 1093 + }, + { + "epoch": 0.09, + "grad_norm": 0.7255131631360962, + "learning_rate": 9.899194578489281e-06, + "loss": 0.155, + "step": 1094 + }, + { + "epoch": 0.09, + "grad_norm": 0.6483315687049885, + "learning_rate": 9.898921762861843e-06, + "loss": 0.1297, + "step": 1095 + }, + { + "epoch": 0.09, + "grad_norm": 0.5426605287173824, + "learning_rate": 9.898648582334175e-06, + "loss": 0.116, + "step": 1096 + }, + { + "epoch": 0.09, + "grad_norm": 0.38890868008493357, + "learning_rate": 9.898375036926625e-06, + "loss": 0.0612, + "step": 1097 + }, + { + "epoch": 0.09, + "grad_norm": 0.47528887741967374, + "learning_rate": 9.898101126659564e-06, + "loss": 0.1551, + "step": 1098 + }, + { + "epoch": 0.09, + "grad_norm": 0.3676167719489773, + "learning_rate": 9.897826851553401e-06, + "loss": 0.1065, + "step": 1099 + }, + { + "epoch": 0.09, + "grad_norm": 0.29939583031476164, + "learning_rate": 9.897552211628558e-06, + "loss": 0.0667, + "step": 1100 + }, + { + "epoch": 0.09, + "grad_norm": 0.49107681883207727, + "learning_rate": 9.897277206905498e-06, + "loss": 0.101, + "step": 1101 + }, + { + "epoch": 0.09, + "grad_norm": 0.7821488056466414, + "learning_rate": 9.897001837404705e-06, + "loss": 0.0982, + "step": 1102 + }, + { + "epoch": 0.09, + "grad_norm": 0.33758831116814153, + "learning_rate": 9.896726103146686e-06, + "loss": 0.1019, + "step": 1103 + }, + { + "epoch": 0.09, + "grad_norm": 0.27090015565925035, + "learning_rate": 9.89645000415198e-06, + "loss": 0.0506, + "step": 1104 + }, + { + "epoch": 0.09, + "grad_norm": 0.5900120450662142, + "learning_rate": 9.896173540441155e-06, + "loss": 0.1049, + "step": 1105 + }, + { + "epoch": 0.09, + "grad_norm": 0.5562054274684947, + "learning_rate": 9.895896712034803e-06, + "loss": 0.0958, + "step": 1106 + }, + { + "epoch": 0.09, + "grad_norm": 0.4602306865983874, + "learning_rate": 9.895619518953544e-06, + "loss": 0.0881, + "step": 1107 + }, + { + "epoch": 0.09, + "grad_norm": 0.3625220082706862, + "learning_rate": 9.895341961218022e-06, + "loss": 0.0812, + "step": 1108 + }, + { + "epoch": 0.09, + "grad_norm": 0.37358857690179625, + "learning_rate": 9.895064038848913e-06, + "loss": 0.0912, + "step": 1109 + }, + { + "epoch": 0.09, + "grad_norm": 0.26740423534999935, + "learning_rate": 9.894785751866921e-06, + "loss": 0.0636, + "step": 1110 + }, + { + "epoch": 0.09, + "grad_norm": 0.4705011652756792, + "learning_rate": 9.89450710029277e-06, + "loss": 0.1264, + "step": 1111 + }, + { + "epoch": 0.09, + "grad_norm": 0.46104239660480945, + "learning_rate": 9.89422808414722e-06, + "loss": 0.0997, + "step": 1112 + }, + { + "epoch": 0.09, + "grad_norm": 0.6012631479694955, + "learning_rate": 9.893948703451049e-06, + "loss": 0.1593, + "step": 1113 + }, + { + "epoch": 0.09, + "grad_norm": 0.5669736618486356, + "learning_rate": 9.893668958225069e-06, + "loss": 0.08, + "step": 1114 + }, + { + "epoch": 0.09, + "grad_norm": 0.38975958090898793, + "learning_rate": 9.893388848490118e-06, + "loss": 0.0791, + "step": 1115 + }, + { + "epoch": 0.09, + "grad_norm": 0.5197548169998257, + "learning_rate": 9.89310837426706e-06, + "loss": 0.1175, + "step": 1116 + }, + { + "epoch": 0.09, + "grad_norm": 0.7452840133651398, + "learning_rate": 9.892827535576785e-06, + "loss": 0.1192, + "step": 1117 + }, + { + "epoch": 0.09, + "grad_norm": 0.7749975719884958, + "learning_rate": 9.892546332440211e-06, + "loss": 0.1133, + "step": 1118 + }, + { + "epoch": 0.09, + "grad_norm": 0.6099187626325029, + "learning_rate": 9.892264764878288e-06, + "loss": 0.1118, + "step": 1119 + }, + { + "epoch": 0.09, + "grad_norm": 0.6987266537787064, + "learning_rate": 9.891982832911983e-06, + "loss": 0.1286, + "step": 1120 + }, + { + "epoch": 0.09, + "grad_norm": 0.5503263203440083, + "learning_rate": 9.891700536562301e-06, + "loss": 0.1122, + "step": 1121 + }, + { + "epoch": 0.09, + "grad_norm": 0.3560344337887794, + "learning_rate": 9.891417875850265e-06, + "loss": 0.0453, + "step": 1122 + }, + { + "epoch": 0.09, + "grad_norm": 0.4069213099516218, + "learning_rate": 9.891134850796934e-06, + "loss": 0.0642, + "step": 1123 + }, + { + "epoch": 0.09, + "grad_norm": 0.5398333626397479, + "learning_rate": 9.890851461423383e-06, + "loss": 0.1073, + "step": 1124 + }, + { + "epoch": 0.09, + "grad_norm": 0.5647557233551089, + "learning_rate": 9.890567707750725e-06, + "loss": 0.1096, + "step": 1125 + }, + { + "epoch": 0.09, + "grad_norm": 0.4907833881619203, + "learning_rate": 9.890283589800094e-06, + "loss": 0.1133, + "step": 1126 + }, + { + "epoch": 0.09, + "grad_norm": 0.27304577465887153, + "learning_rate": 9.889999107592653e-06, + "loss": 0.0832, + "step": 1127 + }, + { + "epoch": 0.1, + "grad_norm": 0.3205087174695364, + "learning_rate": 9.889714261149591e-06, + "loss": 0.0693, + "step": 1128 + }, + { + "epoch": 0.1, + "grad_norm": 0.5843199898546776, + "learning_rate": 9.88942905049213e-06, + "loss": 0.0827, + "step": 1129 + }, + { + "epoch": 0.1, + "grad_norm": 0.6719320751535182, + "learning_rate": 9.889143475641506e-06, + "loss": 0.0862, + "step": 1130 + }, + { + "epoch": 0.1, + "grad_norm": 0.750216030671962, + "learning_rate": 9.888857536619e-06, + "loss": 0.1208, + "step": 1131 + }, + { + "epoch": 0.1, + "grad_norm": 0.5238715789547718, + "learning_rate": 9.8885712334459e-06, + "loss": 0.076, + "step": 1132 + }, + { + "epoch": 0.1, + "grad_norm": 0.5224572329628987, + "learning_rate": 9.888284566143538e-06, + "loss": 0.0693, + "step": 1133 + }, + { + "epoch": 0.1, + "grad_norm": 0.5031333413533658, + "learning_rate": 9.887997534733266e-06, + "loss": 0.1339, + "step": 1134 + }, + { + "epoch": 0.1, + "grad_norm": 0.6215835782837629, + "learning_rate": 9.887710139236461e-06, + "loss": 0.1229, + "step": 1135 + }, + { + "epoch": 0.1, + "grad_norm": 0.6207397443184434, + "learning_rate": 9.887422379674534e-06, + "loss": 0.1339, + "step": 1136 + }, + { + "epoch": 0.1, + "grad_norm": 0.33522001258654616, + "learning_rate": 9.887134256068916e-06, + "loss": 0.0658, + "step": 1137 + }, + { + "epoch": 0.1, + "grad_norm": 0.3609935475436488, + "learning_rate": 9.88684576844107e-06, + "loss": 0.0996, + "step": 1138 + }, + { + "epoch": 0.1, + "grad_norm": 1.328336900849856, + "learning_rate": 9.886556916812481e-06, + "loss": 0.1488, + "step": 1139 + }, + { + "epoch": 0.1, + "grad_norm": 0.582741176431571, + "learning_rate": 9.886267701204669e-06, + "loss": 0.0873, + "step": 1140 + }, + { + "epoch": 0.1, + "grad_norm": 0.5414026411886947, + "learning_rate": 9.88597812163917e-06, + "loss": 0.0868, + "step": 1141 + }, + { + "epoch": 0.1, + "grad_norm": 0.4116039991930575, + "learning_rate": 9.885688178137561e-06, + "loss": 0.1106, + "step": 1142 + }, + { + "epoch": 0.1, + "grad_norm": 0.39267213767335374, + "learning_rate": 9.885397870721435e-06, + "loss": 0.0905, + "step": 1143 + }, + { + "epoch": 0.1, + "grad_norm": 0.5699378615988014, + "learning_rate": 9.885107199412415e-06, + "loss": 0.1101, + "step": 1144 + }, + { + "epoch": 0.1, + "grad_norm": 0.5228437474031826, + "learning_rate": 9.884816164232154e-06, + "loss": 0.1025, + "step": 1145 + }, + { + "epoch": 0.1, + "grad_norm": 0.5192292638363653, + "learning_rate": 9.884524765202328e-06, + "loss": 0.0987, + "step": 1146 + }, + { + "epoch": 0.1, + "grad_norm": 0.6943531096398174, + "learning_rate": 9.884233002344643e-06, + "loss": 0.0948, + "step": 1147 + }, + { + "epoch": 0.1, + "grad_norm": 0.51343320059455, + "learning_rate": 9.883940875680831e-06, + "loss": 0.1288, + "step": 1148 + }, + { + "epoch": 0.1, + "grad_norm": 0.48947464946427705, + "learning_rate": 9.883648385232654e-06, + "loss": 0.0841, + "step": 1149 + }, + { + "epoch": 0.1, + "grad_norm": 1.104629679103235, + "learning_rate": 9.883355531021891e-06, + "loss": 0.1189, + "step": 1150 + }, + { + "epoch": 0.1, + "grad_norm": 0.48213675048635046, + "learning_rate": 9.883062313070365e-06, + "loss": 0.1169, + "step": 1151 + }, + { + "epoch": 0.1, + "grad_norm": 0.794538595708785, + "learning_rate": 9.88276873139991e-06, + "loss": 0.1583, + "step": 1152 + }, + { + "epoch": 0.1, + "grad_norm": 0.5923944037002727, + "learning_rate": 9.882474786032397e-06, + "loss": 0.1206, + "step": 1153 + }, + { + "epoch": 0.1, + "grad_norm": 0.4622035275391892, + "learning_rate": 9.882180476989718e-06, + "loss": 0.0973, + "step": 1154 + }, + { + "epoch": 0.1, + "grad_norm": 0.5846682430323663, + "learning_rate": 9.881885804293798e-06, + "loss": 0.0814, + "step": 1155 + }, + { + "epoch": 0.1, + "grad_norm": 0.3759972462195817, + "learning_rate": 9.881590767966582e-06, + "loss": 0.0505, + "step": 1156 + }, + { + "epoch": 0.1, + "grad_norm": 0.33017253680889497, + "learning_rate": 9.881295368030048e-06, + "loss": 0.0843, + "step": 1157 + }, + { + "epoch": 0.1, + "grad_norm": 0.308779573536669, + "learning_rate": 9.880999604506201e-06, + "loss": 0.0671, + "step": 1158 + }, + { + "epoch": 0.1, + "grad_norm": 0.3654092837792867, + "learning_rate": 9.88070347741707e-06, + "loss": 0.0819, + "step": 1159 + }, + { + "epoch": 0.1, + "grad_norm": 0.647926691460691, + "learning_rate": 9.880406986784709e-06, + "loss": 0.0793, + "step": 1160 + }, + { + "epoch": 0.1, + "grad_norm": 0.46083453391305856, + "learning_rate": 9.880110132631209e-06, + "loss": 0.1042, + "step": 1161 + }, + { + "epoch": 0.1, + "grad_norm": 0.5350901454148727, + "learning_rate": 9.879812914978675e-06, + "loss": 0.0953, + "step": 1162 + }, + { + "epoch": 0.1, + "grad_norm": 0.20464063363408663, + "learning_rate": 9.879515333849248e-06, + "loss": 0.0584, + "step": 1163 + }, + { + "epoch": 0.1, + "grad_norm": 0.7130866518658852, + "learning_rate": 9.879217389265094e-06, + "loss": 0.1708, + "step": 1164 + }, + { + "epoch": 0.1, + "grad_norm": 0.36667307433873825, + "learning_rate": 9.878919081248406e-06, + "loss": 0.0938, + "step": 1165 + }, + { + "epoch": 0.1, + "grad_norm": 0.45798058844203876, + "learning_rate": 9.878620409821403e-06, + "loss": 0.1248, + "step": 1166 + }, + { + "epoch": 0.1, + "grad_norm": 0.4414945823284431, + "learning_rate": 9.878321375006332e-06, + "loss": 0.1045, + "step": 1167 + }, + { + "epoch": 0.1, + "grad_norm": 0.30677925442786375, + "learning_rate": 9.878021976825464e-06, + "loss": 0.0783, + "step": 1168 + }, + { + "epoch": 0.1, + "grad_norm": 0.34492404236771507, + "learning_rate": 9.877722215301107e-06, + "loss": 0.0846, + "step": 1169 + }, + { + "epoch": 0.1, + "grad_norm": 0.6027907222313164, + "learning_rate": 9.877422090455582e-06, + "loss": 0.0856, + "step": 1170 + }, + { + "epoch": 0.1, + "grad_norm": 0.8015382227240772, + "learning_rate": 9.877121602311246e-06, + "loss": 0.1614, + "step": 1171 + }, + { + "epoch": 0.1, + "grad_norm": 0.39537338573458475, + "learning_rate": 9.876820750890484e-06, + "loss": 0.0747, + "step": 1172 + }, + { + "epoch": 0.1, + "grad_norm": 0.42933799499545644, + "learning_rate": 9.876519536215701e-06, + "loss": 0.0895, + "step": 1173 + }, + { + "epoch": 0.1, + "grad_norm": 0.3343277895465831, + "learning_rate": 9.876217958309336e-06, + "loss": 0.0866, + "step": 1174 + }, + { + "epoch": 0.1, + "grad_norm": 0.3688364508073806, + "learning_rate": 9.875916017193849e-06, + "loss": 0.1058, + "step": 1175 + }, + { + "epoch": 0.1, + "grad_norm": 0.5198635824863487, + "learning_rate": 9.875613712891736e-06, + "loss": 0.1131, + "step": 1176 + }, + { + "epoch": 0.1, + "grad_norm": 0.4361188917910848, + "learning_rate": 9.87531104542551e-06, + "loss": 0.0862, + "step": 1177 + }, + { + "epoch": 0.1, + "grad_norm": 0.2526501907374999, + "learning_rate": 9.875008014817716e-06, + "loss": 0.044, + "step": 1178 + }, + { + "epoch": 0.1, + "grad_norm": 0.4864521841889567, + "learning_rate": 9.874704621090927e-06, + "loss": 0.0714, + "step": 1179 + }, + { + "epoch": 0.1, + "grad_norm": 0.3346313632751537, + "learning_rate": 9.87440086426774e-06, + "loss": 0.0874, + "step": 1180 + }, + { + "epoch": 0.1, + "grad_norm": 0.472202182650732, + "learning_rate": 9.87409674437078e-06, + "loss": 0.1217, + "step": 1181 + }, + { + "epoch": 0.1, + "grad_norm": 0.3449391693934795, + "learning_rate": 9.873792261422702e-06, + "loss": 0.0856, + "step": 1182 + }, + { + "epoch": 0.1, + "grad_norm": 0.4827059193578475, + "learning_rate": 9.873487415446185e-06, + "loss": 0.1037, + "step": 1183 + }, + { + "epoch": 0.1, + "grad_norm": 0.35985001785208826, + "learning_rate": 9.873182206463933e-06, + "loss": 0.0735, + "step": 1184 + }, + { + "epoch": 0.1, + "grad_norm": 0.46723099621947195, + "learning_rate": 9.872876634498685e-06, + "loss": 0.1232, + "step": 1185 + }, + { + "epoch": 0.1, + "grad_norm": 0.4008016354186275, + "learning_rate": 9.872570699573196e-06, + "loss": 0.0725, + "step": 1186 + }, + { + "epoch": 0.1, + "grad_norm": 0.6539894709708747, + "learning_rate": 9.87226440171026e-06, + "loss": 0.0822, + "step": 1187 + }, + { + "epoch": 0.1, + "grad_norm": 0.47733502515948617, + "learning_rate": 9.871957740932684e-06, + "loss": 0.0893, + "step": 1188 + }, + { + "epoch": 0.1, + "grad_norm": 0.6038160961625617, + "learning_rate": 9.871650717263318e-06, + "loss": 0.1146, + "step": 1189 + }, + { + "epoch": 0.1, + "grad_norm": 0.4996875005266435, + "learning_rate": 9.871343330725023e-06, + "loss": 0.1062, + "step": 1190 + }, + { + "epoch": 0.1, + "grad_norm": 0.39134989921917007, + "learning_rate": 9.871035581340702e-06, + "loss": 0.0385, + "step": 1191 + }, + { + "epoch": 0.1, + "grad_norm": 0.26909527128357086, + "learning_rate": 9.870727469133275e-06, + "loss": 0.0698, + "step": 1192 + }, + { + "epoch": 0.1, + "grad_norm": 0.34614202734817223, + "learning_rate": 9.870418994125691e-06, + "loss": 0.0921, + "step": 1193 + }, + { + "epoch": 0.1, + "grad_norm": 0.34364987840075567, + "learning_rate": 9.870110156340928e-06, + "loss": 0.0586, + "step": 1194 + }, + { + "epoch": 0.1, + "grad_norm": 0.40773919101470035, + "learning_rate": 9.86980095580199e-06, + "loss": 0.0678, + "step": 1195 + }, + { + "epoch": 0.1, + "grad_norm": 0.3318687276539297, + "learning_rate": 9.869491392531908e-06, + "loss": 0.0723, + "step": 1196 + }, + { + "epoch": 0.1, + "grad_norm": 0.36136255398538897, + "learning_rate": 9.869181466553742e-06, + "loss": 0.0827, + "step": 1197 + }, + { + "epoch": 0.1, + "grad_norm": 0.4334417512648093, + "learning_rate": 9.868871177890573e-06, + "loss": 0.0932, + "step": 1198 + }, + { + "epoch": 0.1, + "grad_norm": 0.7050453338847832, + "learning_rate": 9.868560526565516e-06, + "loss": 0.1079, + "step": 1199 + }, + { + "epoch": 0.1, + "grad_norm": 0.5398061543496713, + "learning_rate": 9.86824951260171e-06, + "loss": 0.0787, + "step": 1200 + }, + { + "epoch": 0.1, + "grad_norm": 0.3049898176892678, + "learning_rate": 9.86793813602232e-06, + "loss": 0.0513, + "step": 1201 + }, + { + "epoch": 0.1, + "grad_norm": 0.31541742675691486, + "learning_rate": 9.867626396850541e-06, + "loss": 0.0456, + "step": 1202 + }, + { + "epoch": 0.1, + "grad_norm": 0.5775713344156369, + "learning_rate": 9.867314295109592e-06, + "loss": 0.1574, + "step": 1203 + }, + { + "epoch": 0.1, + "grad_norm": 0.40296162967650134, + "learning_rate": 9.867001830822717e-06, + "loss": 0.0919, + "step": 1204 + }, + { + "epoch": 0.1, + "grad_norm": 0.7315004777882081, + "learning_rate": 9.866689004013196e-06, + "loss": 0.1468, + "step": 1205 + }, + { + "epoch": 0.1, + "grad_norm": 0.3298272281379322, + "learning_rate": 9.866375814704328e-06, + "loss": 0.0731, + "step": 1206 + }, + { + "epoch": 0.1, + "grad_norm": 0.4528189488768842, + "learning_rate": 9.86606226291944e-06, + "loss": 0.1006, + "step": 1207 + }, + { + "epoch": 0.1, + "grad_norm": 0.5477172243983482, + "learning_rate": 9.865748348681888e-06, + "loss": 0.1113, + "step": 1208 + }, + { + "epoch": 0.1, + "grad_norm": 0.32433308395056626, + "learning_rate": 9.865434072015051e-06, + "loss": 0.0751, + "step": 1209 + }, + { + "epoch": 0.1, + "grad_norm": 0.48099240620608635, + "learning_rate": 9.865119432942344e-06, + "loss": 0.0941, + "step": 1210 + }, + { + "epoch": 0.1, + "grad_norm": 0.48147512410297716, + "learning_rate": 9.864804431487201e-06, + "loss": 0.0863, + "step": 1211 + }, + { + "epoch": 0.1, + "grad_norm": 0.4860049504634328, + "learning_rate": 9.864489067673082e-06, + "loss": 0.0948, + "step": 1212 + }, + { + "epoch": 0.1, + "grad_norm": 0.3669669481662148, + "learning_rate": 9.86417334152348e-06, + "loss": 0.0651, + "step": 1213 + }, + { + "epoch": 0.1, + "grad_norm": 0.511229237392076, + "learning_rate": 9.863857253061913e-06, + "loss": 0.1042, + "step": 1214 + }, + { + "epoch": 0.1, + "grad_norm": 0.38455495902357195, + "learning_rate": 9.863540802311923e-06, + "loss": 0.1037, + "step": 1215 + }, + { + "epoch": 0.1, + "grad_norm": 0.4074745569179462, + "learning_rate": 9.863223989297081e-06, + "loss": 0.0965, + "step": 1216 + }, + { + "epoch": 0.1, + "grad_norm": 0.7209706079615362, + "learning_rate": 9.862906814040987e-06, + "loss": 0.1246, + "step": 1217 + }, + { + "epoch": 0.1, + "grad_norm": 0.47932788755397343, + "learning_rate": 9.862589276567263e-06, + "loss": 0.0949, + "step": 1218 + }, + { + "epoch": 0.1, + "grad_norm": 0.4540557136197089, + "learning_rate": 9.862271376899564e-06, + "loss": 0.1039, + "step": 1219 + }, + { + "epoch": 0.1, + "grad_norm": 0.3220968944594814, + "learning_rate": 9.861953115061568e-06, + "loss": 0.0987, + "step": 1220 + }, + { + "epoch": 0.1, + "grad_norm": 0.33041159784337465, + "learning_rate": 9.861634491076984e-06, + "loss": 0.0688, + "step": 1221 + }, + { + "epoch": 0.1, + "grad_norm": 0.2704855424471933, + "learning_rate": 9.861315504969538e-06, + "loss": 0.0877, + "step": 1222 + }, + { + "epoch": 0.1, + "grad_norm": 0.3046283034984283, + "learning_rate": 9.860996156762996e-06, + "loss": 0.0671, + "step": 1223 + }, + { + "epoch": 0.1, + "grad_norm": 0.3617303088067, + "learning_rate": 9.860676446481142e-06, + "loss": 0.1307, + "step": 1224 + }, + { + "epoch": 0.1, + "grad_norm": 0.3985994219892503, + "learning_rate": 9.860356374147791e-06, + "loss": 0.0969, + "step": 1225 + }, + { + "epoch": 0.1, + "grad_norm": 0.5880517326984759, + "learning_rate": 9.860035939786783e-06, + "loss": 0.0872, + "step": 1226 + }, + { + "epoch": 0.1, + "grad_norm": 0.360573780651818, + "learning_rate": 9.859715143421988e-06, + "loss": 0.09, + "step": 1227 + }, + { + "epoch": 0.1, + "grad_norm": 0.495907523916959, + "learning_rate": 9.859393985077298e-06, + "loss": 0.1039, + "step": 1228 + }, + { + "epoch": 0.1, + "grad_norm": 0.41052371290696726, + "learning_rate": 9.859072464776636e-06, + "loss": 0.1004, + "step": 1229 + }, + { + "epoch": 0.1, + "grad_norm": 0.36047426973629865, + "learning_rate": 9.85875058254395e-06, + "loss": 0.0683, + "step": 1230 + }, + { + "epoch": 0.1, + "grad_norm": 0.45866046373925823, + "learning_rate": 9.858428338403217e-06, + "loss": 0.0935, + "step": 1231 + }, + { + "epoch": 0.1, + "grad_norm": 0.4449135382546556, + "learning_rate": 9.85810573237844e-06, + "loss": 0.0766, + "step": 1232 + }, + { + "epoch": 0.1, + "grad_norm": 0.569013932285049, + "learning_rate": 9.857782764493647e-06, + "loss": 0.1356, + "step": 1233 + }, + { + "epoch": 0.1, + "grad_norm": 0.1886984855398039, + "learning_rate": 9.857459434772894e-06, + "loss": 0.0288, + "step": 1234 + }, + { + "epoch": 0.1, + "grad_norm": 1.0051714802112395, + "learning_rate": 9.857135743240264e-06, + "loss": 0.1358, + "step": 1235 + }, + { + "epoch": 0.1, + "grad_norm": 0.6961789002776256, + "learning_rate": 9.856811689919872e-06, + "loss": 0.0788, + "step": 1236 + }, + { + "epoch": 0.1, + "grad_norm": 0.33539951876672575, + "learning_rate": 9.856487274835851e-06, + "loss": 0.105, + "step": 1237 + }, + { + "epoch": 0.1, + "grad_norm": 0.4288445489380213, + "learning_rate": 9.856162498012367e-06, + "loss": 0.0576, + "step": 1238 + }, + { + "epoch": 0.1, + "grad_norm": 0.3838948328955162, + "learning_rate": 9.855837359473611e-06, + "loss": 0.0936, + "step": 1239 + }, + { + "epoch": 0.1, + "grad_norm": 0.4027046159241728, + "learning_rate": 9.8555118592438e-06, + "loss": 0.0843, + "step": 1240 + }, + { + "epoch": 0.1, + "grad_norm": 0.5091845706480014, + "learning_rate": 9.855185997347183e-06, + "loss": 0.122, + "step": 1241 + }, + { + "epoch": 0.1, + "grad_norm": 0.39853686330473076, + "learning_rate": 9.854859773808027e-06, + "loss": 0.0832, + "step": 1242 + }, + { + "epoch": 0.1, + "grad_norm": 0.46219044288858957, + "learning_rate": 9.854533188650635e-06, + "loss": 0.1113, + "step": 1243 + }, + { + "epoch": 0.1, + "grad_norm": 0.37033310759865845, + "learning_rate": 9.85420624189933e-06, + "loss": 0.0945, + "step": 1244 + }, + { + "epoch": 0.1, + "grad_norm": 0.23326756366660525, + "learning_rate": 9.853878933578466e-06, + "loss": 0.0728, + "step": 1245 + }, + { + "epoch": 0.1, + "grad_norm": 0.47002898762042333, + "learning_rate": 9.853551263712423e-06, + "loss": 0.0911, + "step": 1246 + }, + { + "epoch": 0.11, + "grad_norm": 0.4351633588514563, + "learning_rate": 9.853223232325608e-06, + "loss": 0.0791, + "step": 1247 + }, + { + "epoch": 0.11, + "grad_norm": 0.7623362002711589, + "learning_rate": 9.852894839442455e-06, + "loss": 0.1256, + "step": 1248 + }, + { + "epoch": 0.11, + "grad_norm": 0.24963041638359088, + "learning_rate": 9.852566085087426e-06, + "loss": 0.0634, + "step": 1249 + }, + { + "epoch": 0.11, + "grad_norm": 0.45241593489806203, + "learning_rate": 9.852236969285005e-06, + "loss": 0.1335, + "step": 1250 + }, + { + "epoch": 0.11, + "grad_norm": 0.46437209591780204, + "learning_rate": 9.851907492059707e-06, + "loss": 0.0808, + "step": 1251 + }, + { + "epoch": 0.11, + "grad_norm": 0.41086595534838877, + "learning_rate": 9.851577653436075e-06, + "loss": 0.1129, + "step": 1252 + }, + { + "epoch": 0.11, + "grad_norm": 0.5962031113872454, + "learning_rate": 9.851247453438678e-06, + "loss": 0.1286, + "step": 1253 + }, + { + "epoch": 0.11, + "grad_norm": 0.5939079794126101, + "learning_rate": 9.85091689209211e-06, + "loss": 0.1219, + "step": 1254 + }, + { + "epoch": 0.11, + "grad_norm": 0.3017344258040954, + "learning_rate": 9.850585969420991e-06, + "loss": 0.0691, + "step": 1255 + }, + { + "epoch": 0.11, + "grad_norm": 0.5315405208041578, + "learning_rate": 9.850254685449976e-06, + "loss": 0.1097, + "step": 1256 + }, + { + "epoch": 0.11, + "grad_norm": 0.3511845162819431, + "learning_rate": 9.849923040203734e-06, + "loss": 0.0665, + "step": 1257 + }, + { + "epoch": 0.11, + "grad_norm": 0.5540155088885919, + "learning_rate": 9.849591033706971e-06, + "loss": 0.1069, + "step": 1258 + }, + { + "epoch": 0.11, + "grad_norm": 0.26963567578609043, + "learning_rate": 9.849258665984421e-06, + "loss": 0.0772, + "step": 1259 + }, + { + "epoch": 0.11, + "grad_norm": 0.46404570136747586, + "learning_rate": 9.848925937060832e-06, + "loss": 0.1032, + "step": 1260 + }, + { + "epoch": 0.11, + "grad_norm": 0.4293370949482611, + "learning_rate": 9.848592846960994e-06, + "loss": 0.0946, + "step": 1261 + }, + { + "epoch": 0.11, + "grad_norm": 0.45579782108471345, + "learning_rate": 9.848259395709715e-06, + "loss": 0.1343, + "step": 1262 + }, + { + "epoch": 0.11, + "grad_norm": 0.43670842211234284, + "learning_rate": 9.847925583331834e-06, + "loss": 0.1187, + "step": 1263 + }, + { + "epoch": 0.11, + "grad_norm": 0.4089442402537916, + "learning_rate": 9.847591409852213e-06, + "loss": 0.0836, + "step": 1264 + }, + { + "epoch": 0.11, + "grad_norm": 0.5382432399614083, + "learning_rate": 9.847256875295746e-06, + "loss": 0.1279, + "step": 1265 + }, + { + "epoch": 0.11, + "grad_norm": 0.3925785502754681, + "learning_rate": 9.846921979687348e-06, + "loss": 0.0543, + "step": 1266 + }, + { + "epoch": 0.11, + "grad_norm": 0.4414898908721815, + "learning_rate": 9.846586723051966e-06, + "loss": 0.0639, + "step": 1267 + }, + { + "epoch": 0.11, + "grad_norm": 0.41429972405256316, + "learning_rate": 9.846251105414572e-06, + "loss": 0.1118, + "step": 1268 + }, + { + "epoch": 0.11, + "grad_norm": 0.7072948621088979, + "learning_rate": 9.845915126800164e-06, + "loss": 0.1373, + "step": 1269 + }, + { + "epoch": 0.11, + "grad_norm": 0.34578062340533344, + "learning_rate": 9.845578787233767e-06, + "loss": 0.0881, + "step": 1270 + }, + { + "epoch": 0.11, + "grad_norm": 0.4362800768538075, + "learning_rate": 9.845242086740436e-06, + "loss": 0.1128, + "step": 1271 + }, + { + "epoch": 0.11, + "grad_norm": 0.5682282971247081, + "learning_rate": 9.844905025345247e-06, + "loss": 0.1243, + "step": 1272 + }, + { + "epoch": 0.11, + "grad_norm": 0.4859135283334354, + "learning_rate": 9.84456760307331e-06, + "loss": 0.1186, + "step": 1273 + }, + { + "epoch": 0.11, + "grad_norm": 0.47992981971449605, + "learning_rate": 9.844229819949754e-06, + "loss": 0.1246, + "step": 1274 + }, + { + "epoch": 0.11, + "grad_norm": 0.508534007981971, + "learning_rate": 9.843891675999742e-06, + "loss": 0.1171, + "step": 1275 + }, + { + "epoch": 0.11, + "grad_norm": 0.5665705368711783, + "learning_rate": 9.84355317124846e-06, + "loss": 0.1172, + "step": 1276 + }, + { + "epoch": 0.11, + "grad_norm": 1.1373088949594616, + "learning_rate": 9.843214305721124e-06, + "loss": 0.1557, + "step": 1277 + }, + { + "epoch": 0.11, + "grad_norm": 0.39902945956492036, + "learning_rate": 9.842875079442971e-06, + "loss": 0.1054, + "step": 1278 + }, + { + "epoch": 0.11, + "grad_norm": 0.5074053889789976, + "learning_rate": 9.84253549243927e-06, + "loss": 0.1109, + "step": 1279 + }, + { + "epoch": 0.11, + "grad_norm": 0.428133882775729, + "learning_rate": 9.842195544735316e-06, + "loss": 0.12, + "step": 1280 + }, + { + "epoch": 0.11, + "grad_norm": 0.3341129443101746, + "learning_rate": 9.84185523635643e-06, + "loss": 0.0516, + "step": 1281 + }, + { + "epoch": 0.11, + "grad_norm": 0.4310996618024957, + "learning_rate": 9.84151456732796e-06, + "loss": 0.0951, + "step": 1282 + }, + { + "epoch": 0.11, + "grad_norm": 0.3570990504991548, + "learning_rate": 9.841173537675281e-06, + "loss": 0.0802, + "step": 1283 + }, + { + "epoch": 0.11, + "grad_norm": 0.3346941823062726, + "learning_rate": 9.840832147423797e-06, + "loss": 0.0873, + "step": 1284 + }, + { + "epoch": 0.11, + "grad_norm": 0.5668108591973479, + "learning_rate": 9.840490396598933e-06, + "loss": 0.0868, + "step": 1285 + }, + { + "epoch": 0.11, + "grad_norm": 0.39383363487851014, + "learning_rate": 9.840148285226145e-06, + "loss": 0.1166, + "step": 1286 + }, + { + "epoch": 0.11, + "grad_norm": 0.39024185833878483, + "learning_rate": 9.83980581333092e-06, + "loss": 0.1306, + "step": 1287 + }, + { + "epoch": 0.11, + "grad_norm": 0.6024058693222207, + "learning_rate": 9.83946298093876e-06, + "loss": 0.0887, + "step": 1288 + }, + { + "epoch": 0.11, + "grad_norm": 0.4576632950069507, + "learning_rate": 9.83911978807521e-06, + "loss": 0.075, + "step": 1289 + }, + { + "epoch": 0.11, + "grad_norm": 0.3344338753325829, + "learning_rate": 9.838776234765824e-06, + "loss": 0.076, + "step": 1290 + }, + { + "epoch": 0.11, + "grad_norm": 0.46870497420785795, + "learning_rate": 9.838432321036198e-06, + "loss": 0.0929, + "step": 1291 + }, + { + "epoch": 0.11, + "grad_norm": 0.3780791653476731, + "learning_rate": 9.838088046911946e-06, + "loss": 0.1106, + "step": 1292 + }, + { + "epoch": 0.11, + "grad_norm": 0.46173724271856387, + "learning_rate": 9.837743412418714e-06, + "loss": 0.0595, + "step": 1293 + }, + { + "epoch": 0.11, + "grad_norm": 0.7624166420334992, + "learning_rate": 9.837398417582169e-06, + "loss": 0.0999, + "step": 1294 + }, + { + "epoch": 0.11, + "grad_norm": 0.45669878246743767, + "learning_rate": 9.83705306242801e-06, + "loss": 0.0537, + "step": 1295 + }, + { + "epoch": 0.11, + "grad_norm": 0.7503163272939328, + "learning_rate": 9.836707346981962e-06, + "loss": 0.1638, + "step": 1296 + }, + { + "epoch": 0.11, + "grad_norm": 0.5718920516006625, + "learning_rate": 9.836361271269773e-06, + "loss": 0.1184, + "step": 1297 + }, + { + "epoch": 0.11, + "grad_norm": 0.2638980797443833, + "learning_rate": 9.836014835317227e-06, + "loss": 0.0849, + "step": 1298 + }, + { + "epoch": 0.11, + "grad_norm": 0.5333538831225642, + "learning_rate": 9.835668039150119e-06, + "loss": 0.0726, + "step": 1299 + }, + { + "epoch": 0.11, + "grad_norm": 0.4678499855716581, + "learning_rate": 9.835320882794289e-06, + "loss": 0.0694, + "step": 1300 + }, + { + "epoch": 0.11, + "grad_norm": 0.5893940976654404, + "learning_rate": 9.83497336627559e-06, + "loss": 0.1115, + "step": 1301 + }, + { + "epoch": 0.11, + "grad_norm": 0.477520473673102, + "learning_rate": 9.83462548961991e-06, + "loss": 0.1114, + "step": 1302 + }, + { + "epoch": 0.11, + "grad_norm": 0.38339227477600996, + "learning_rate": 9.834277252853159e-06, + "loss": 0.0903, + "step": 1303 + }, + { + "epoch": 0.11, + "grad_norm": 0.40315468654635633, + "learning_rate": 9.833928656001277e-06, + "loss": 0.0734, + "step": 1304 + }, + { + "epoch": 0.11, + "grad_norm": 0.51326273500667, + "learning_rate": 9.833579699090228e-06, + "loss": 0.094, + "step": 1305 + }, + { + "epoch": 0.11, + "grad_norm": 0.4324557356298144, + "learning_rate": 9.833230382146007e-06, + "loss": 0.1127, + "step": 1306 + }, + { + "epoch": 0.11, + "grad_norm": 0.43447306946349384, + "learning_rate": 9.83288070519463e-06, + "loss": 0.0804, + "step": 1307 + }, + { + "epoch": 0.11, + "grad_norm": 0.37318183925231924, + "learning_rate": 9.832530668262146e-06, + "loss": 0.0755, + "step": 1308 + }, + { + "epoch": 0.11, + "grad_norm": 0.393211348658576, + "learning_rate": 9.832180271374625e-06, + "loss": 0.1092, + "step": 1309 + }, + { + "epoch": 0.11, + "grad_norm": 0.4063557255190413, + "learning_rate": 9.831829514558167e-06, + "loss": 0.1061, + "step": 1310 + }, + { + "epoch": 0.11, + "grad_norm": 0.3765834886207673, + "learning_rate": 9.8314783978389e-06, + "loss": 0.1059, + "step": 1311 + }, + { + "epoch": 0.11, + "grad_norm": 0.3213958970422227, + "learning_rate": 9.831126921242977e-06, + "loss": 0.0835, + "step": 1312 + }, + { + "epoch": 0.11, + "grad_norm": 0.3510965502673136, + "learning_rate": 9.830775084796578e-06, + "loss": 0.1019, + "step": 1313 + }, + { + "epoch": 0.11, + "grad_norm": 0.3664726620910326, + "learning_rate": 9.830422888525908e-06, + "loss": 0.0804, + "step": 1314 + }, + { + "epoch": 0.11, + "grad_norm": 0.5505245121910278, + "learning_rate": 9.830070332457203e-06, + "loss": 0.0466, + "step": 1315 + }, + { + "epoch": 0.11, + "grad_norm": 0.4574474443870217, + "learning_rate": 9.829717416616723e-06, + "loss": 0.1107, + "step": 1316 + }, + { + "epoch": 0.11, + "grad_norm": 0.28426321714867947, + "learning_rate": 9.829364141030753e-06, + "loss": 0.0651, + "step": 1317 + }, + { + "epoch": 0.11, + "grad_norm": 0.4561678875086119, + "learning_rate": 9.82901050572561e-06, + "loss": 0.1108, + "step": 1318 + }, + { + "epoch": 0.11, + "grad_norm": 0.3974288253269227, + "learning_rate": 9.828656510727632e-06, + "loss": 0.1201, + "step": 1319 + }, + { + "epoch": 0.11, + "grad_norm": 0.6102372108891713, + "learning_rate": 9.82830215606319e-06, + "loss": 0.1389, + "step": 1320 + }, + { + "epoch": 0.11, + "grad_norm": 0.4923301976604247, + "learning_rate": 9.827947441758675e-06, + "loss": 0.1188, + "step": 1321 + }, + { + "epoch": 0.11, + "grad_norm": 0.3768478512027931, + "learning_rate": 9.827592367840509e-06, + "loss": 0.0811, + "step": 1322 + }, + { + "epoch": 0.11, + "grad_norm": 0.4950674192325829, + "learning_rate": 9.827236934335142e-06, + "loss": 0.1114, + "step": 1323 + }, + { + "epoch": 0.11, + "grad_norm": 0.3985044521628323, + "learning_rate": 9.826881141269046e-06, + "loss": 0.0896, + "step": 1324 + }, + { + "epoch": 0.11, + "grad_norm": 0.4718674135071454, + "learning_rate": 9.826524988668727e-06, + "loss": 0.0702, + "step": 1325 + }, + { + "epoch": 0.11, + "grad_norm": 0.4688557173646293, + "learning_rate": 9.826168476560707e-06, + "loss": 0.0917, + "step": 1326 + }, + { + "epoch": 0.11, + "grad_norm": 0.4702698560538927, + "learning_rate": 9.825811604971546e-06, + "loss": 0.0953, + "step": 1327 + }, + { + "epoch": 0.11, + "grad_norm": 0.2966485367414335, + "learning_rate": 9.825454373927824e-06, + "loss": 0.0561, + "step": 1328 + }, + { + "epoch": 0.11, + "grad_norm": 0.4606986284646067, + "learning_rate": 9.82509678345615e-06, + "loss": 0.0979, + "step": 1329 + }, + { + "epoch": 0.11, + "grad_norm": 0.3882208342924487, + "learning_rate": 9.824738833583158e-06, + "loss": 0.0962, + "step": 1330 + }, + { + "epoch": 0.11, + "grad_norm": 0.3769168571737713, + "learning_rate": 9.824380524335513e-06, + "loss": 0.1003, + "step": 1331 + }, + { + "epoch": 0.11, + "grad_norm": 0.43139468527300157, + "learning_rate": 9.824021855739902e-06, + "loss": 0.1085, + "step": 1332 + }, + { + "epoch": 0.11, + "grad_norm": 0.5252124543080456, + "learning_rate": 9.823662827823041e-06, + "loss": 0.1134, + "step": 1333 + }, + { + "epoch": 0.11, + "grad_norm": 0.2838724211694334, + "learning_rate": 9.823303440611673e-06, + "loss": 0.071, + "step": 1334 + }, + { + "epoch": 0.11, + "grad_norm": 0.5720282344966142, + "learning_rate": 9.822943694132566e-06, + "loss": 0.1014, + "step": 1335 + }, + { + "epoch": 0.11, + "grad_norm": 0.5149843547110945, + "learning_rate": 9.82258358841252e-06, + "loss": 0.1188, + "step": 1336 + }, + { + "epoch": 0.11, + "grad_norm": 0.4758153241786766, + "learning_rate": 9.822223123478352e-06, + "loss": 0.1067, + "step": 1337 + }, + { + "epoch": 0.11, + "grad_norm": 0.497953222052105, + "learning_rate": 9.821862299356915e-06, + "loss": 0.0819, + "step": 1338 + }, + { + "epoch": 0.11, + "grad_norm": 0.35332746669853893, + "learning_rate": 9.821501116075083e-06, + "loss": 0.0825, + "step": 1339 + }, + { + "epoch": 0.11, + "grad_norm": 0.2903086115613855, + "learning_rate": 9.821139573659762e-06, + "loss": 0.0585, + "step": 1340 + }, + { + "epoch": 0.11, + "grad_norm": 0.6606346464925431, + "learning_rate": 9.82077767213788e-06, + "loss": 0.1531, + "step": 1341 + }, + { + "epoch": 0.11, + "grad_norm": 0.39219895899805657, + "learning_rate": 9.820415411536393e-06, + "loss": 0.0583, + "step": 1342 + }, + { + "epoch": 0.11, + "grad_norm": 0.7343511264949762, + "learning_rate": 9.820052791882286e-06, + "loss": 0.1182, + "step": 1343 + }, + { + "epoch": 0.11, + "grad_norm": 0.46109748738601714, + "learning_rate": 9.819689813202568e-06, + "loss": 0.0826, + "step": 1344 + }, + { + "epoch": 0.11, + "grad_norm": 0.3476703006514118, + "learning_rate": 9.819326475524278e-06, + "loss": 0.071, + "step": 1345 + }, + { + "epoch": 0.11, + "grad_norm": 0.4376253786638165, + "learning_rate": 9.818962778874474e-06, + "loss": 0.0956, + "step": 1346 + }, + { + "epoch": 0.11, + "grad_norm": 0.35602944680191645, + "learning_rate": 9.818598723280252e-06, + "loss": 0.109, + "step": 1347 + }, + { + "epoch": 0.11, + "grad_norm": 0.5374322508878858, + "learning_rate": 9.818234308768725e-06, + "loss": 0.1135, + "step": 1348 + }, + { + "epoch": 0.11, + "grad_norm": 0.38168161229482556, + "learning_rate": 9.81786953536704e-06, + "loss": 0.1008, + "step": 1349 + }, + { + "epoch": 0.11, + "grad_norm": 0.4381885207331545, + "learning_rate": 9.817504403102366e-06, + "loss": 0.0812, + "step": 1350 + }, + { + "epoch": 0.11, + "grad_norm": 0.5372639360914435, + "learning_rate": 9.8171389120019e-06, + "loss": 0.0988, + "step": 1351 + }, + { + "epoch": 0.11, + "grad_norm": 0.40129652875460275, + "learning_rate": 9.816773062092863e-06, + "loss": 0.1021, + "step": 1352 + }, + { + "epoch": 0.11, + "grad_norm": 0.3508178829775055, + "learning_rate": 9.81640685340251e-06, + "loss": 0.084, + "step": 1353 + }, + { + "epoch": 0.11, + "grad_norm": 0.362383745159872, + "learning_rate": 9.81604028595812e-06, + "loss": 0.067, + "step": 1354 + }, + { + "epoch": 0.11, + "grad_norm": 0.5963392936872175, + "learning_rate": 9.815673359786991e-06, + "loss": 0.1207, + "step": 1355 + }, + { + "epoch": 0.11, + "grad_norm": 0.3822166658436363, + "learning_rate": 9.815306074916458e-06, + "loss": 0.0442, + "step": 1356 + }, + { + "epoch": 0.11, + "grad_norm": 0.3639690691632694, + "learning_rate": 9.814938431373877e-06, + "loss": 0.0764, + "step": 1357 + }, + { + "epoch": 0.11, + "grad_norm": 0.40571712875780264, + "learning_rate": 9.814570429186631e-06, + "loss": 0.0743, + "step": 1358 + }, + { + "epoch": 0.11, + "grad_norm": 0.3526182468905231, + "learning_rate": 9.814202068382134e-06, + "loss": 0.0526, + "step": 1359 + }, + { + "epoch": 0.11, + "grad_norm": 0.6044733411657046, + "learning_rate": 9.813833348987823e-06, + "loss": 0.1172, + "step": 1360 + }, + { + "epoch": 0.11, + "grad_norm": 0.2741004641602592, + "learning_rate": 9.81346427103116e-06, + "loss": 0.063, + "step": 1361 + }, + { + "epoch": 0.11, + "grad_norm": 1.2146410708673758, + "learning_rate": 9.813094834539638e-06, + "loss": 0.138, + "step": 1362 + }, + { + "epoch": 0.11, + "grad_norm": 0.4414791627373951, + "learning_rate": 9.812725039540774e-06, + "loss": 0.1242, + "step": 1363 + }, + { + "epoch": 0.11, + "grad_norm": 0.5122967093166859, + "learning_rate": 9.812354886062115e-06, + "loss": 0.0983, + "step": 1364 + }, + { + "epoch": 0.12, + "grad_norm": 0.4213130077276111, + "learning_rate": 9.811984374131227e-06, + "loss": 0.0917, + "step": 1365 + }, + { + "epoch": 0.12, + "grad_norm": 0.3029624905098351, + "learning_rate": 9.811613503775712e-06, + "loss": 0.0603, + "step": 1366 + }, + { + "epoch": 0.12, + "grad_norm": 0.9774825380133516, + "learning_rate": 9.811242275023194e-06, + "loss": 0.1562, + "step": 1367 + }, + { + "epoch": 0.12, + "grad_norm": 0.7891023083916583, + "learning_rate": 9.810870687901324e-06, + "loss": 0.1114, + "step": 1368 + }, + { + "epoch": 0.12, + "grad_norm": 0.23904246159783715, + "learning_rate": 9.81049874243778e-06, + "loss": 0.0631, + "step": 1369 + }, + { + "epoch": 0.12, + "grad_norm": 0.3502714078561316, + "learning_rate": 9.810126438660265e-06, + "loss": 0.078, + "step": 1370 + }, + { + "epoch": 0.12, + "grad_norm": 0.4784032461045025, + "learning_rate": 9.809753776596512e-06, + "loss": 0.1252, + "step": 1371 + }, + { + "epoch": 0.12, + "grad_norm": 0.5541415928311386, + "learning_rate": 9.80938075627428e-06, + "loss": 0.1196, + "step": 1372 + }, + { + "epoch": 0.12, + "grad_norm": 0.3907293810137037, + "learning_rate": 9.80900737772135e-06, + "loss": 0.0822, + "step": 1373 + }, + { + "epoch": 0.12, + "grad_norm": 0.5785635564179468, + "learning_rate": 9.808633640965538e-06, + "loss": 0.1094, + "step": 1374 + }, + { + "epoch": 0.12, + "grad_norm": 0.4639965579304603, + "learning_rate": 9.808259546034682e-06, + "loss": 0.0891, + "step": 1375 + }, + { + "epoch": 0.12, + "grad_norm": 0.29863717705606946, + "learning_rate": 9.807885092956642e-06, + "loss": 0.0603, + "step": 1376 + }, + { + "epoch": 0.12, + "grad_norm": 0.44227780939442596, + "learning_rate": 9.807510281759314e-06, + "loss": 0.0852, + "step": 1377 + }, + { + "epoch": 0.12, + "grad_norm": 0.8270860320808165, + "learning_rate": 9.807135112470612e-06, + "loss": 0.1399, + "step": 1378 + }, + { + "epoch": 0.12, + "grad_norm": 0.5926787271392061, + "learning_rate": 9.806759585118486e-06, + "loss": 0.1182, + "step": 1379 + }, + { + "epoch": 0.12, + "grad_norm": 0.32695645529656936, + "learning_rate": 9.806383699730904e-06, + "loss": 0.0828, + "step": 1380 + }, + { + "epoch": 0.12, + "grad_norm": 0.3262736264844147, + "learning_rate": 9.806007456335863e-06, + "loss": 0.0674, + "step": 1381 + }, + { + "epoch": 0.12, + "grad_norm": 0.4038810033427228, + "learning_rate": 9.805630854961391e-06, + "loss": 0.0995, + "step": 1382 + }, + { + "epoch": 0.12, + "grad_norm": 0.26936718077686644, + "learning_rate": 9.805253895635537e-06, + "loss": 0.0759, + "step": 1383 + }, + { + "epoch": 0.12, + "grad_norm": 0.6008822625180089, + "learning_rate": 9.804876578386381e-06, + "loss": 0.1454, + "step": 1384 + }, + { + "epoch": 0.12, + "grad_norm": 0.3649423515980822, + "learning_rate": 9.804498903242028e-06, + "loss": 0.0993, + "step": 1385 + }, + { + "epoch": 0.12, + "grad_norm": 0.7992837295237342, + "learning_rate": 9.804120870230606e-06, + "loss": 0.1526, + "step": 1386 + }, + { + "epoch": 0.12, + "grad_norm": 0.48882359371714723, + "learning_rate": 9.803742479380277e-06, + "loss": 0.0973, + "step": 1387 + }, + { + "epoch": 0.12, + "grad_norm": 0.4174530827679763, + "learning_rate": 9.803363730719224e-06, + "loss": 0.1082, + "step": 1388 + }, + { + "epoch": 0.12, + "grad_norm": 0.49091856576080906, + "learning_rate": 9.80298462427566e-06, + "loss": 0.1306, + "step": 1389 + }, + { + "epoch": 0.12, + "grad_norm": 0.35527812814084153, + "learning_rate": 9.802605160077821e-06, + "loss": 0.1086, + "step": 1390 + }, + { + "epoch": 0.12, + "grad_norm": 0.44082350753822475, + "learning_rate": 9.80222533815397e-06, + "loss": 0.1009, + "step": 1391 + }, + { + "epoch": 0.12, + "grad_norm": 0.26992322616007103, + "learning_rate": 9.801845158532404e-06, + "loss": 0.0636, + "step": 1392 + }, + { + "epoch": 0.12, + "grad_norm": 0.4130224338156468, + "learning_rate": 9.801464621241437e-06, + "loss": 0.094, + "step": 1393 + }, + { + "epoch": 0.12, + "grad_norm": 0.3125403628838735, + "learning_rate": 9.801083726309412e-06, + "loss": 0.0779, + "step": 1394 + }, + { + "epoch": 0.12, + "grad_norm": 0.5812029215382618, + "learning_rate": 9.800702473764706e-06, + "loss": 0.083, + "step": 1395 + }, + { + "epoch": 0.12, + "grad_norm": 0.3707747195477804, + "learning_rate": 9.800320863635711e-06, + "loss": 0.0979, + "step": 1396 + }, + { + "epoch": 0.12, + "grad_norm": 0.4489089865020385, + "learning_rate": 9.799938895950856e-06, + "loss": 0.0889, + "step": 1397 + }, + { + "epoch": 0.12, + "grad_norm": 0.21992947453486855, + "learning_rate": 9.799556570738589e-06, + "loss": 0.0574, + "step": 1398 + }, + { + "epoch": 0.12, + "grad_norm": 0.6087719667274801, + "learning_rate": 9.79917388802739e-06, + "loss": 0.1298, + "step": 1399 + }, + { + "epoch": 0.12, + "grad_norm": 0.41023285218101163, + "learning_rate": 9.798790847845763e-06, + "loss": 0.1107, + "step": 1400 + }, + { + "epoch": 0.12, + "grad_norm": 0.38169174577252807, + "learning_rate": 9.798407450222238e-06, + "loss": 0.1154, + "step": 1401 + }, + { + "epoch": 0.12, + "grad_norm": 0.39207411104779605, + "learning_rate": 9.798023695185372e-06, + "loss": 0.0927, + "step": 1402 + }, + { + "epoch": 0.12, + "grad_norm": 0.3215208090495359, + "learning_rate": 9.797639582763752e-06, + "loss": 0.0862, + "step": 1403 + }, + { + "epoch": 0.12, + "grad_norm": 0.3958789135768561, + "learning_rate": 9.797255112985988e-06, + "loss": 0.1204, + "step": 1404 + }, + { + "epoch": 0.12, + "grad_norm": 0.44916089442550017, + "learning_rate": 9.796870285880717e-06, + "loss": 0.0949, + "step": 1405 + }, + { + "epoch": 0.12, + "grad_norm": 0.5431412185923403, + "learning_rate": 9.796485101476603e-06, + "loss": 0.09, + "step": 1406 + }, + { + "epoch": 0.12, + "grad_norm": 0.3045440325237794, + "learning_rate": 9.796099559802337e-06, + "loss": 0.0727, + "step": 1407 + }, + { + "epoch": 0.12, + "grad_norm": 0.8148253383767128, + "learning_rate": 9.795713660886636e-06, + "loss": 0.1166, + "step": 1408 + }, + { + "epoch": 0.12, + "grad_norm": 0.6464125069707106, + "learning_rate": 9.795327404758244e-06, + "loss": 0.1447, + "step": 1409 + }, + { + "epoch": 0.12, + "grad_norm": 0.42511849008305275, + "learning_rate": 9.794940791445933e-06, + "loss": 0.1147, + "step": 1410 + }, + { + "epoch": 0.12, + "grad_norm": 0.325528962918417, + "learning_rate": 9.7945538209785e-06, + "loss": 0.1047, + "step": 1411 + }, + { + "epoch": 0.12, + "grad_norm": 0.28709517419611796, + "learning_rate": 9.794166493384766e-06, + "loss": 0.0704, + "step": 1412 + }, + { + "epoch": 0.12, + "grad_norm": 0.6045332845168807, + "learning_rate": 9.793778808693586e-06, + "loss": 0.1178, + "step": 1413 + }, + { + "epoch": 0.12, + "grad_norm": 0.4247763840892641, + "learning_rate": 9.793390766933832e-06, + "loss": 0.0937, + "step": 1414 + }, + { + "epoch": 0.12, + "grad_norm": 0.2320937815419936, + "learning_rate": 9.793002368134412e-06, + "loss": 0.0432, + "step": 1415 + }, + { + "epoch": 0.12, + "grad_norm": 0.2501713040377906, + "learning_rate": 9.792613612324253e-06, + "loss": 0.0772, + "step": 1416 + }, + { + "epoch": 0.12, + "grad_norm": 0.282612051125166, + "learning_rate": 9.792224499532315e-06, + "loss": 0.0843, + "step": 1417 + }, + { + "epoch": 0.12, + "grad_norm": 0.3194330895928499, + "learning_rate": 9.791835029787578e-06, + "loss": 0.0907, + "step": 1418 + }, + { + "epoch": 0.12, + "grad_norm": 0.41584849893677234, + "learning_rate": 9.791445203119054e-06, + "loss": 0.0864, + "step": 1419 + }, + { + "epoch": 0.12, + "grad_norm": 0.6302388247600794, + "learning_rate": 9.791055019555777e-06, + "loss": 0.1078, + "step": 1420 + }, + { + "epoch": 0.12, + "grad_norm": 0.2695493146066337, + "learning_rate": 9.790664479126815e-06, + "loss": 0.0714, + "step": 1421 + }, + { + "epoch": 0.12, + "grad_norm": 0.34442803169797387, + "learning_rate": 9.790273581861255e-06, + "loss": 0.0962, + "step": 1422 + }, + { + "epoch": 0.12, + "grad_norm": 0.30369544184414865, + "learning_rate": 9.78988232778821e-06, + "loss": 0.0432, + "step": 1423 + }, + { + "epoch": 0.12, + "grad_norm": 0.27589792218212256, + "learning_rate": 9.78949071693683e-06, + "loss": 0.076, + "step": 1424 + }, + { + "epoch": 0.12, + "grad_norm": 0.5188388731361412, + "learning_rate": 9.789098749336276e-06, + "loss": 0.1149, + "step": 1425 + }, + { + "epoch": 0.12, + "grad_norm": 0.2800051036238471, + "learning_rate": 9.788706425015752e-06, + "loss": 0.0783, + "step": 1426 + }, + { + "epoch": 0.12, + "grad_norm": 0.4115534793843664, + "learning_rate": 9.788313744004476e-06, + "loss": 0.09, + "step": 1427 + }, + { + "epoch": 0.12, + "grad_norm": 0.2957672713403208, + "learning_rate": 9.7879207063317e-06, + "loss": 0.0852, + "step": 1428 + }, + { + "epoch": 0.12, + "grad_norm": 0.4437042497339269, + "learning_rate": 9.787527312026697e-06, + "loss": 0.0897, + "step": 1429 + }, + { + "epoch": 0.12, + "grad_norm": 0.570968232312105, + "learning_rate": 9.78713356111877e-06, + "loss": 0.1192, + "step": 1430 + }, + { + "epoch": 0.12, + "grad_norm": 0.3822117959482978, + "learning_rate": 9.786739453637249e-06, + "loss": 0.0655, + "step": 1431 + }, + { + "epoch": 0.12, + "grad_norm": 0.27197603581802404, + "learning_rate": 9.786344989611488e-06, + "loss": 0.0577, + "step": 1432 + }, + { + "epoch": 0.12, + "grad_norm": 0.604075883660989, + "learning_rate": 9.78595016907087e-06, + "loss": 0.1456, + "step": 1433 + }, + { + "epoch": 0.12, + "grad_norm": 0.4808961241870662, + "learning_rate": 9.785554992044804e-06, + "loss": 0.1304, + "step": 1434 + }, + { + "epoch": 0.12, + "grad_norm": 0.38574323916759173, + "learning_rate": 9.785159458562721e-06, + "loss": 0.0763, + "step": 1435 + }, + { + "epoch": 0.12, + "grad_norm": 0.3827481021051543, + "learning_rate": 9.78476356865409e-06, + "loss": 0.0606, + "step": 1436 + }, + { + "epoch": 0.12, + "grad_norm": 0.6247257813028966, + "learning_rate": 9.784367322348392e-06, + "loss": 0.0985, + "step": 1437 + }, + { + "epoch": 0.12, + "grad_norm": 0.4474587601874192, + "learning_rate": 9.783970719675146e-06, + "loss": 0.1282, + "step": 1438 + }, + { + "epoch": 0.12, + "grad_norm": 0.4393677720124891, + "learning_rate": 9.783573760663893e-06, + "loss": 0.1201, + "step": 1439 + }, + { + "epoch": 0.12, + "grad_norm": 0.30710021209559485, + "learning_rate": 9.783176445344198e-06, + "loss": 0.0731, + "step": 1440 + }, + { + "epoch": 0.12, + "grad_norm": 0.3748985205420724, + "learning_rate": 9.78277877374566e-06, + "loss": 0.0966, + "step": 1441 + }, + { + "epoch": 0.12, + "grad_norm": 1.3210672923903104, + "learning_rate": 9.782380745897894e-06, + "loss": 0.1068, + "step": 1442 + }, + { + "epoch": 0.12, + "grad_norm": 0.26672478574070213, + "learning_rate": 9.781982361830553e-06, + "loss": 0.089, + "step": 1443 + }, + { + "epoch": 0.12, + "grad_norm": 0.362470859534081, + "learning_rate": 9.781583621573309e-06, + "loss": 0.0669, + "step": 1444 + }, + { + "epoch": 0.12, + "grad_norm": 0.3917291628731751, + "learning_rate": 9.78118452515586e-06, + "loss": 0.0647, + "step": 1445 + }, + { + "epoch": 0.12, + "grad_norm": 0.3293239345465708, + "learning_rate": 9.780785072607937e-06, + "loss": 0.0691, + "step": 1446 + }, + { + "epoch": 0.12, + "grad_norm": 0.4526544953189884, + "learning_rate": 9.780385263959291e-06, + "loss": 0.0966, + "step": 1447 + }, + { + "epoch": 0.12, + "grad_norm": 0.5524286790280818, + "learning_rate": 9.779985099239702e-06, + "loss": 0.1709, + "step": 1448 + }, + { + "epoch": 0.12, + "grad_norm": 0.5913113163272414, + "learning_rate": 9.779584578478978e-06, + "loss": 0.126, + "step": 1449 + }, + { + "epoch": 0.12, + "grad_norm": 0.2589046747675721, + "learning_rate": 9.779183701706952e-06, + "loss": 0.0756, + "step": 1450 + }, + { + "epoch": 0.12, + "grad_norm": 0.5108901921863935, + "learning_rate": 9.778782468953482e-06, + "loss": 0.0973, + "step": 1451 + }, + { + "epoch": 0.12, + "grad_norm": 0.5747416209585868, + "learning_rate": 9.778380880248455e-06, + "loss": 0.0904, + "step": 1452 + }, + { + "epoch": 0.12, + "grad_norm": 0.29778006043222877, + "learning_rate": 9.777978935621785e-06, + "loss": 0.0901, + "step": 1453 + }, + { + "epoch": 0.12, + "grad_norm": 0.29081784685120615, + "learning_rate": 9.77757663510341e-06, + "loss": 0.0569, + "step": 1454 + }, + { + "epoch": 0.12, + "grad_norm": 0.3465458995387425, + "learning_rate": 9.777173978723294e-06, + "loss": 0.125, + "step": 1455 + }, + { + "epoch": 0.12, + "grad_norm": 0.2110996505630479, + "learning_rate": 9.776770966511433e-06, + "loss": 0.0442, + "step": 1456 + }, + { + "epoch": 0.12, + "grad_norm": 0.3458274942899966, + "learning_rate": 9.776367598497842e-06, + "loss": 0.0923, + "step": 1457 + }, + { + "epoch": 0.12, + "grad_norm": 0.4062644811860463, + "learning_rate": 9.775963874712569e-06, + "loss": 0.0922, + "step": 1458 + }, + { + "epoch": 0.12, + "grad_norm": 0.3847518792444242, + "learning_rate": 9.775559795185686e-06, + "loss": 0.098, + "step": 1459 + }, + { + "epoch": 0.12, + "grad_norm": 0.39431533621486803, + "learning_rate": 9.775155359947287e-06, + "loss": 0.0857, + "step": 1460 + }, + { + "epoch": 0.12, + "grad_norm": 0.2839981351990572, + "learning_rate": 9.774750569027501e-06, + "loss": 0.0687, + "step": 1461 + }, + { + "epoch": 0.12, + "grad_norm": 0.49376392735683666, + "learning_rate": 9.774345422456477e-06, + "loss": 0.1218, + "step": 1462 + }, + { + "epoch": 0.12, + "grad_norm": 0.41792793792763144, + "learning_rate": 9.773939920264395e-06, + "loss": 0.1042, + "step": 1463 + }, + { + "epoch": 0.12, + "grad_norm": 0.40335358598153637, + "learning_rate": 9.773534062481455e-06, + "loss": 0.0859, + "step": 1464 + }, + { + "epoch": 0.12, + "grad_norm": 0.36173800192963784, + "learning_rate": 9.77312784913789e-06, + "loss": 0.0829, + "step": 1465 + }, + { + "epoch": 0.12, + "grad_norm": 0.3627514492403434, + "learning_rate": 9.772721280263959e-06, + "loss": 0.1098, + "step": 1466 + }, + { + "epoch": 0.12, + "grad_norm": 0.4137791524888132, + "learning_rate": 9.772314355889945e-06, + "loss": 0.0991, + "step": 1467 + }, + { + "epoch": 0.12, + "grad_norm": 0.3965765294153142, + "learning_rate": 9.771907076046155e-06, + "loss": 0.0629, + "step": 1468 + }, + { + "epoch": 0.12, + "grad_norm": 0.35825354795333814, + "learning_rate": 9.771499440762929e-06, + "loss": 0.0927, + "step": 1469 + }, + { + "epoch": 0.12, + "grad_norm": 0.4480681115067083, + "learning_rate": 9.771091450070627e-06, + "loss": 0.0669, + "step": 1470 + }, + { + "epoch": 0.12, + "grad_norm": 0.29450196884852836, + "learning_rate": 9.77068310399964e-06, + "loss": 0.0605, + "step": 1471 + }, + { + "epoch": 0.12, + "grad_norm": 0.5615277442962547, + "learning_rate": 9.770274402580385e-06, + "loss": 0.1131, + "step": 1472 + }, + { + "epoch": 0.12, + "grad_norm": 0.2560883737250006, + "learning_rate": 9.769865345843304e-06, + "loss": 0.0698, + "step": 1473 + }, + { + "epoch": 0.12, + "grad_norm": 0.5999024434049399, + "learning_rate": 9.769455933818863e-06, + "loss": 0.1164, + "step": 1474 + }, + { + "epoch": 0.12, + "grad_norm": 0.5807593233572422, + "learning_rate": 9.769046166537563e-06, + "loss": 0.1082, + "step": 1475 + }, + { + "epoch": 0.12, + "grad_norm": 0.4211429316245326, + "learning_rate": 9.768636044029923e-06, + "loss": 0.0827, + "step": 1476 + }, + { + "epoch": 0.12, + "grad_norm": 0.2663639396427144, + "learning_rate": 9.768225566326489e-06, + "loss": 0.0377, + "step": 1477 + }, + { + "epoch": 0.12, + "grad_norm": 0.3946744858762572, + "learning_rate": 9.767814733457838e-06, + "loss": 0.1337, + "step": 1478 + }, + { + "epoch": 0.12, + "grad_norm": 0.8243941062998293, + "learning_rate": 9.767403545454572e-06, + "loss": 0.1504, + "step": 1479 + }, + { + "epoch": 0.12, + "grad_norm": 0.5917378094697343, + "learning_rate": 9.766992002347318e-06, + "loss": 0.053, + "step": 1480 + }, + { + "epoch": 0.12, + "grad_norm": 0.591172829632348, + "learning_rate": 9.766580104166727e-06, + "loss": 0.139, + "step": 1481 + }, + { + "epoch": 0.12, + "grad_norm": 0.33226585307682716, + "learning_rate": 9.766167850943486e-06, + "loss": 0.0572, + "step": 1482 + }, + { + "epoch": 0.12, + "grad_norm": 0.895786496231328, + "learning_rate": 9.765755242708296e-06, + "loss": 0.1375, + "step": 1483 + }, + { + "epoch": 0.13, + "grad_norm": 0.35871154484747436, + "learning_rate": 9.765342279491895e-06, + "loss": 0.0897, + "step": 1484 + }, + { + "epoch": 0.13, + "grad_norm": 0.3370585936636479, + "learning_rate": 9.76492896132504e-06, + "loss": 0.0818, + "step": 1485 + }, + { + "epoch": 0.13, + "grad_norm": 0.5676033380940825, + "learning_rate": 9.76451528823852e-06, + "loss": 0.1279, + "step": 1486 + }, + { + "epoch": 0.13, + "grad_norm": 0.3823880623261568, + "learning_rate": 9.764101260263143e-06, + "loss": 0.0655, + "step": 1487 + }, + { + "epoch": 0.13, + "grad_norm": 0.7229106454474415, + "learning_rate": 9.763686877429752e-06, + "loss": 0.0985, + "step": 1488 + }, + { + "epoch": 0.13, + "grad_norm": 0.6770218201080149, + "learning_rate": 9.763272139769212e-06, + "loss": 0.1686, + "step": 1489 + }, + { + "epoch": 0.13, + "grad_norm": 0.5656188493193873, + "learning_rate": 9.762857047312414e-06, + "loss": 0.1082, + "step": 1490 + }, + { + "epoch": 0.13, + "grad_norm": 0.9419062253241671, + "learning_rate": 9.76244160009028e-06, + "loss": 0.1541, + "step": 1491 + }, + { + "epoch": 0.13, + "grad_norm": 0.4856216970244555, + "learning_rate": 9.762025798133751e-06, + "loss": 0.081, + "step": 1492 + }, + { + "epoch": 0.13, + "grad_norm": 0.4626360808782635, + "learning_rate": 9.7616096414738e-06, + "loss": 0.1202, + "step": 1493 + }, + { + "epoch": 0.13, + "grad_norm": 0.6374693807164482, + "learning_rate": 9.761193130141425e-06, + "loss": 0.1243, + "step": 1494 + }, + { + "epoch": 0.13, + "grad_norm": 0.504121248979558, + "learning_rate": 9.760776264167647e-06, + "loss": 0.108, + "step": 1495 + }, + { + "epoch": 0.13, + "grad_norm": 0.7417105794810317, + "learning_rate": 9.760359043583521e-06, + "loss": 0.1274, + "step": 1496 + }, + { + "epoch": 0.13, + "grad_norm": 0.4185271924397774, + "learning_rate": 9.759941468420123e-06, + "loss": 0.1067, + "step": 1497 + }, + { + "epoch": 0.13, + "grad_norm": 0.3808547581628291, + "learning_rate": 9.759523538708555e-06, + "loss": 0.0775, + "step": 1498 + }, + { + "epoch": 0.13, + "grad_norm": 0.3599704757331957, + "learning_rate": 9.759105254479948e-06, + "loss": 0.0541, + "step": 1499 + }, + { + "epoch": 0.13, + "grad_norm": 0.619795090551407, + "learning_rate": 9.758686615765457e-06, + "loss": 0.1144, + "step": 1500 + }, + { + "epoch": 0.13, + "grad_norm": 0.5866436827248638, + "learning_rate": 9.758267622596266e-06, + "loss": 0.1448, + "step": 1501 + }, + { + "epoch": 0.13, + "grad_norm": 0.36986589084907723, + "learning_rate": 9.757848275003583e-06, + "loss": 0.0563, + "step": 1502 + }, + { + "epoch": 0.13, + "grad_norm": 0.4771788093118218, + "learning_rate": 9.757428573018645e-06, + "loss": 0.1292, + "step": 1503 + }, + { + "epoch": 0.13, + "grad_norm": 0.4522316304240206, + "learning_rate": 9.757008516672712e-06, + "loss": 0.0994, + "step": 1504 + }, + { + "epoch": 0.13, + "grad_norm": 0.32293143705640487, + "learning_rate": 9.756588105997074e-06, + "loss": 0.0685, + "step": 1505 + }, + { + "epoch": 0.13, + "grad_norm": 0.6007072822506229, + "learning_rate": 9.756167341023043e-06, + "loss": 0.145, + "step": 1506 + }, + { + "epoch": 0.13, + "grad_norm": 0.6066406062507865, + "learning_rate": 9.755746221781963e-06, + "loss": 0.1176, + "step": 1507 + }, + { + "epoch": 0.13, + "grad_norm": 0.24047478063062486, + "learning_rate": 9.7553247483052e-06, + "loss": 0.0723, + "step": 1508 + }, + { + "epoch": 0.13, + "grad_norm": 0.24376344534015634, + "learning_rate": 9.754902920624148e-06, + "loss": 0.0849, + "step": 1509 + }, + { + "epoch": 0.13, + "grad_norm": 0.4762194888847118, + "learning_rate": 9.754480738770225e-06, + "loss": 0.0785, + "step": 1510 + }, + { + "epoch": 0.13, + "grad_norm": 0.4410817876108083, + "learning_rate": 9.754058202774882e-06, + "loss": 0.1276, + "step": 1511 + }, + { + "epoch": 0.13, + "grad_norm": 0.3560200765797871, + "learning_rate": 9.75363531266959e-06, + "loss": 0.0654, + "step": 1512 + }, + { + "epoch": 0.13, + "grad_norm": 0.21710132432765883, + "learning_rate": 9.753212068485847e-06, + "loss": 0.0525, + "step": 1513 + }, + { + "epoch": 0.13, + "grad_norm": 0.6206189282015673, + "learning_rate": 9.752788470255179e-06, + "loss": 0.1095, + "step": 1514 + }, + { + "epoch": 0.13, + "grad_norm": 0.3617918284075503, + "learning_rate": 9.752364518009138e-06, + "loss": 0.0952, + "step": 1515 + }, + { + "epoch": 0.13, + "grad_norm": 0.9390791114575996, + "learning_rate": 9.751940211779304e-06, + "loss": 0.1227, + "step": 1516 + }, + { + "epoch": 0.13, + "grad_norm": 0.5058449185075965, + "learning_rate": 9.751515551597284e-06, + "loss": 0.0791, + "step": 1517 + }, + { + "epoch": 0.13, + "grad_norm": 0.5204834868719281, + "learning_rate": 9.751090537494702e-06, + "loss": 0.1038, + "step": 1518 + }, + { + "epoch": 0.13, + "grad_norm": 0.4013979355915156, + "learning_rate": 9.750665169503221e-06, + "loss": 0.109, + "step": 1519 + }, + { + "epoch": 0.13, + "grad_norm": 0.4063206696959503, + "learning_rate": 9.750239447654524e-06, + "loss": 0.0866, + "step": 1520 + }, + { + "epoch": 0.13, + "grad_norm": 0.5973066775160845, + "learning_rate": 9.749813371980321e-06, + "loss": 0.0708, + "step": 1521 + }, + { + "epoch": 0.13, + "grad_norm": 0.3726841628857404, + "learning_rate": 9.749386942512348e-06, + "loss": 0.0911, + "step": 1522 + }, + { + "epoch": 0.13, + "grad_norm": 0.48477416900487214, + "learning_rate": 9.748960159282369e-06, + "loss": 0.1189, + "step": 1523 + }, + { + "epoch": 0.13, + "grad_norm": 0.32388441148074815, + "learning_rate": 9.748533022322171e-06, + "loss": 0.052, + "step": 1524 + }, + { + "epoch": 0.13, + "grad_norm": 0.30040425248581754, + "learning_rate": 9.748105531663571e-06, + "loss": 0.0727, + "step": 1525 + }, + { + "epoch": 0.13, + "grad_norm": 0.3790364973174266, + "learning_rate": 9.747677687338412e-06, + "loss": 0.0786, + "step": 1526 + }, + { + "epoch": 0.13, + "grad_norm": 0.32418721650143434, + "learning_rate": 9.747249489378563e-06, + "loss": 0.0717, + "step": 1527 + }, + { + "epoch": 0.13, + "grad_norm": 0.5253222859510696, + "learning_rate": 9.746820937815915e-06, + "loss": 0.1177, + "step": 1528 + }, + { + "epoch": 0.13, + "grad_norm": 0.4208861165379929, + "learning_rate": 9.746392032682392e-06, + "loss": 0.0634, + "step": 1529 + }, + { + "epoch": 0.13, + "grad_norm": 0.33086106031564055, + "learning_rate": 9.745962774009941e-06, + "loss": 0.0874, + "step": 1530 + }, + { + "epoch": 0.13, + "grad_norm": 0.4422043077432377, + "learning_rate": 9.745533161830536e-06, + "loss": 0.0974, + "step": 1531 + }, + { + "epoch": 0.13, + "grad_norm": 0.3339923775304685, + "learning_rate": 9.745103196176175e-06, + "loss": 0.076, + "step": 1532 + }, + { + "epoch": 0.13, + "grad_norm": 0.35224397746724745, + "learning_rate": 9.744672877078887e-06, + "loss": 0.0817, + "step": 1533 + }, + { + "epoch": 0.13, + "grad_norm": 0.45663814164725286, + "learning_rate": 9.744242204570722e-06, + "loss": 0.1194, + "step": 1534 + }, + { + "epoch": 0.13, + "grad_norm": 0.3875681469634877, + "learning_rate": 9.74381117868376e-06, + "loss": 0.109, + "step": 1535 + }, + { + "epoch": 0.13, + "grad_norm": 0.35698716416209236, + "learning_rate": 9.743379799450107e-06, + "loss": 0.083, + "step": 1536 + }, + { + "epoch": 0.13, + "grad_norm": 0.6386712034474948, + "learning_rate": 9.742948066901895e-06, + "loss": 0.1668, + "step": 1537 + }, + { + "epoch": 0.13, + "grad_norm": 0.9720948229774626, + "learning_rate": 9.742515981071283e-06, + "loss": 0.117, + "step": 1538 + }, + { + "epoch": 0.13, + "grad_norm": 0.2693157611288496, + "learning_rate": 9.74208354199045e-06, + "loss": 0.0693, + "step": 1539 + }, + { + "epoch": 0.13, + "grad_norm": 0.45875415095510885, + "learning_rate": 9.741650749691613e-06, + "loss": 0.1143, + "step": 1540 + }, + { + "epoch": 0.13, + "grad_norm": 0.5354958449492344, + "learning_rate": 9.741217604207005e-06, + "loss": 0.1117, + "step": 1541 + }, + { + "epoch": 0.13, + "grad_norm": 0.5151087560697634, + "learning_rate": 9.740784105568889e-06, + "loss": 0.0964, + "step": 1542 + }, + { + "epoch": 0.13, + "grad_norm": 0.3328494758019325, + "learning_rate": 9.740350253809556e-06, + "loss": 0.0797, + "step": 1543 + }, + { + "epoch": 0.13, + "grad_norm": 0.515019355627681, + "learning_rate": 9.739916048961323e-06, + "loss": 0.1488, + "step": 1544 + }, + { + "epoch": 0.13, + "grad_norm": 0.38798563056935104, + "learning_rate": 9.73948149105653e-06, + "loss": 0.0819, + "step": 1545 + }, + { + "epoch": 0.13, + "grad_norm": 0.5629205624905728, + "learning_rate": 9.739046580127546e-06, + "loss": 0.0774, + "step": 1546 + }, + { + "epoch": 0.13, + "grad_norm": 0.3937594729089547, + "learning_rate": 9.738611316206766e-06, + "loss": 0.0808, + "step": 1547 + }, + { + "epoch": 0.13, + "grad_norm": 0.224608788639192, + "learning_rate": 9.73817569932661e-06, + "loss": 0.0619, + "step": 1548 + }, + { + "epoch": 0.13, + "grad_norm": 0.39501652421630695, + "learning_rate": 9.737739729519526e-06, + "loss": 0.1287, + "step": 1549 + }, + { + "epoch": 0.13, + "grad_norm": 0.44491014834985676, + "learning_rate": 9.737303406817989e-06, + "loss": 0.1076, + "step": 1550 + }, + { + "epoch": 0.13, + "grad_norm": 0.6025989047629982, + "learning_rate": 9.736866731254496e-06, + "loss": 0.123, + "step": 1551 + }, + { + "epoch": 0.13, + "grad_norm": 0.1704103586611847, + "learning_rate": 9.736429702861573e-06, + "loss": 0.0413, + "step": 1552 + }, + { + "epoch": 0.13, + "grad_norm": 0.4137527597554859, + "learning_rate": 9.735992321671776e-06, + "loss": 0.105, + "step": 1553 + }, + { + "epoch": 0.13, + "grad_norm": 0.2878628426595022, + "learning_rate": 9.735554587717683e-06, + "loss": 0.0995, + "step": 1554 + }, + { + "epoch": 0.13, + "grad_norm": 0.3141072292274884, + "learning_rate": 9.735116501031896e-06, + "loss": 0.0919, + "step": 1555 + }, + { + "epoch": 0.13, + "grad_norm": 0.3478366982179641, + "learning_rate": 9.734678061647048e-06, + "loss": 0.1003, + "step": 1556 + }, + { + "epoch": 0.13, + "grad_norm": 0.5283391463332364, + "learning_rate": 9.734239269595797e-06, + "loss": 0.1633, + "step": 1557 + }, + { + "epoch": 0.13, + "grad_norm": 0.33077142836322027, + "learning_rate": 9.733800124910825e-06, + "loss": 0.0629, + "step": 1558 + }, + { + "epoch": 0.13, + "grad_norm": 0.21220712946516898, + "learning_rate": 9.733360627624844e-06, + "loss": 0.061, + "step": 1559 + }, + { + "epoch": 0.13, + "grad_norm": 0.29246371399706167, + "learning_rate": 9.73292077777059e-06, + "loss": 0.0603, + "step": 1560 + }, + { + "epoch": 0.13, + "grad_norm": 0.38209044511873574, + "learning_rate": 9.732480575380825e-06, + "loss": 0.092, + "step": 1561 + }, + { + "epoch": 0.13, + "grad_norm": 0.5845083078941006, + "learning_rate": 9.732040020488338e-06, + "loss": 0.1317, + "step": 1562 + }, + { + "epoch": 0.13, + "grad_norm": 0.5188095227098074, + "learning_rate": 9.731599113125943e-06, + "loss": 0.1195, + "step": 1563 + }, + { + "epoch": 0.13, + "grad_norm": 0.33028395468363947, + "learning_rate": 9.731157853326483e-06, + "loss": 0.0828, + "step": 1564 + }, + { + "epoch": 0.13, + "grad_norm": 0.3254887091671216, + "learning_rate": 9.730716241122826e-06, + "loss": 0.0746, + "step": 1565 + }, + { + "epoch": 0.13, + "grad_norm": 0.6073445183841407, + "learning_rate": 9.730274276547864e-06, + "loss": 0.1195, + "step": 1566 + }, + { + "epoch": 0.13, + "grad_norm": 0.49404946944528094, + "learning_rate": 9.72983195963452e-06, + "loss": 0.0801, + "step": 1567 + }, + { + "epoch": 0.13, + "grad_norm": 0.3207447856836685, + "learning_rate": 9.729389290415734e-06, + "loss": 0.0936, + "step": 1568 + }, + { + "epoch": 0.13, + "grad_norm": 0.26726629706690463, + "learning_rate": 9.728946268924485e-06, + "loss": 0.0538, + "step": 1569 + }, + { + "epoch": 0.13, + "grad_norm": 0.8465957917259104, + "learning_rate": 9.728502895193772e-06, + "loss": 0.1659, + "step": 1570 + }, + { + "epoch": 0.13, + "grad_norm": 0.32026298132815856, + "learning_rate": 9.728059169256616e-06, + "loss": 0.0577, + "step": 1571 + }, + { + "epoch": 0.13, + "grad_norm": 0.6773602149451668, + "learning_rate": 9.72761509114607e-06, + "loss": 0.1323, + "step": 1572 + }, + { + "epoch": 0.13, + "grad_norm": 0.4173663515117522, + "learning_rate": 9.727170660895213e-06, + "loss": 0.1018, + "step": 1573 + }, + { + "epoch": 0.13, + "grad_norm": 0.36720059562511537, + "learning_rate": 9.726725878537145e-06, + "loss": 0.064, + "step": 1574 + }, + { + "epoch": 0.13, + "grad_norm": 0.506879016939399, + "learning_rate": 9.726280744105e-06, + "loss": 0.111, + "step": 1575 + }, + { + "epoch": 0.13, + "grad_norm": 0.4718081298190497, + "learning_rate": 9.725835257631932e-06, + "loss": 0.1242, + "step": 1576 + }, + { + "epoch": 0.13, + "grad_norm": 0.5437057884310103, + "learning_rate": 9.725389419151123e-06, + "loss": 0.0901, + "step": 1577 + }, + { + "epoch": 0.13, + "grad_norm": 0.3801182594710486, + "learning_rate": 9.724943228695784e-06, + "loss": 0.0509, + "step": 1578 + }, + { + "epoch": 0.13, + "grad_norm": 0.4162171144579076, + "learning_rate": 9.724496686299149e-06, + "loss": 0.1017, + "step": 1579 + }, + { + "epoch": 0.13, + "grad_norm": 0.42879282016265013, + "learning_rate": 9.724049791994477e-06, + "loss": 0.0792, + "step": 1580 + }, + { + "epoch": 0.13, + "grad_norm": 0.3443438077519364, + "learning_rate": 9.723602545815058e-06, + "loss": 0.0801, + "step": 1581 + }, + { + "epoch": 0.13, + "grad_norm": 0.2574610545867587, + "learning_rate": 9.723154947794204e-06, + "loss": 0.0584, + "step": 1582 + }, + { + "epoch": 0.13, + "grad_norm": 0.4380456248419518, + "learning_rate": 9.722706997965256e-06, + "loss": 0.1078, + "step": 1583 + }, + { + "epoch": 0.13, + "grad_norm": 0.5373266055449576, + "learning_rate": 9.722258696361577e-06, + "loss": 0.1192, + "step": 1584 + }, + { + "epoch": 0.13, + "grad_norm": 0.4940203295391308, + "learning_rate": 9.721810043016563e-06, + "loss": 0.111, + "step": 1585 + }, + { + "epoch": 0.13, + "grad_norm": 0.4604345272918427, + "learning_rate": 9.72136103796363e-06, + "loss": 0.118, + "step": 1586 + }, + { + "epoch": 0.13, + "grad_norm": 0.33524699694115173, + "learning_rate": 9.720911681236223e-06, + "loss": 0.0687, + "step": 1587 + }, + { + "epoch": 0.13, + "grad_norm": 0.20428752717733256, + "learning_rate": 9.720461972867812e-06, + "loss": 0.0481, + "step": 1588 + }, + { + "epoch": 0.13, + "grad_norm": 0.31385879426425145, + "learning_rate": 9.720011912891894e-06, + "loss": 0.0824, + "step": 1589 + }, + { + "epoch": 0.13, + "grad_norm": 0.4480870699341634, + "learning_rate": 9.719561501341994e-06, + "loss": 0.0762, + "step": 1590 + }, + { + "epoch": 0.13, + "grad_norm": 0.4369119093390214, + "learning_rate": 9.719110738251657e-06, + "loss": 0.1067, + "step": 1591 + }, + { + "epoch": 0.13, + "grad_norm": 0.3681110168145045, + "learning_rate": 9.718659623654465e-06, + "loss": 0.083, + "step": 1592 + }, + { + "epoch": 0.13, + "grad_norm": 0.6103455432805954, + "learning_rate": 9.718208157584015e-06, + "loss": 0.1201, + "step": 1593 + }, + { + "epoch": 0.13, + "grad_norm": 0.360903371312486, + "learning_rate": 9.717756340073935e-06, + "loss": 0.0769, + "step": 1594 + }, + { + "epoch": 0.13, + "grad_norm": 0.3555906285119128, + "learning_rate": 9.717304171157881e-06, + "loss": 0.0943, + "step": 1595 + }, + { + "epoch": 0.13, + "grad_norm": 0.34280619971552534, + "learning_rate": 9.716851650869532e-06, + "loss": 0.045, + "step": 1596 + }, + { + "epoch": 0.13, + "grad_norm": 0.34696396352755465, + "learning_rate": 9.716398779242593e-06, + "loss": 0.0951, + "step": 1597 + }, + { + "epoch": 0.13, + "grad_norm": 1.1773188411823499, + "learning_rate": 9.715945556310797e-06, + "loss": 0.1677, + "step": 1598 + }, + { + "epoch": 0.13, + "grad_norm": 0.46980452502374365, + "learning_rate": 9.715491982107905e-06, + "loss": 0.1069, + "step": 1599 + }, + { + "epoch": 0.13, + "grad_norm": 0.2802689734647612, + "learning_rate": 9.715038056667702e-06, + "loss": 0.0825, + "step": 1600 + }, + { + "epoch": 0.13, + "grad_norm": 0.4770422624692102, + "learning_rate": 9.714583780023995e-06, + "loss": 0.1046, + "step": 1601 + }, + { + "epoch": 0.13, + "grad_norm": 0.5785270908229381, + "learning_rate": 9.714129152210625e-06, + "loss": 0.0973, + "step": 1602 + }, + { + "epoch": 0.14, + "grad_norm": 0.8306559959486894, + "learning_rate": 9.713674173261454e-06, + "loss": 0.1116, + "step": 1603 + }, + { + "epoch": 0.14, + "grad_norm": 0.2927128651918903, + "learning_rate": 9.71321884321037e-06, + "loss": 0.0956, + "step": 1604 + }, + { + "epoch": 0.14, + "grad_norm": 0.34662603339291476, + "learning_rate": 9.71276316209129e-06, + "loss": 0.0608, + "step": 1605 + }, + { + "epoch": 0.14, + "grad_norm": 0.41762795518733203, + "learning_rate": 9.712307129938157e-06, + "loss": 0.0614, + "step": 1606 + }, + { + "epoch": 0.14, + "grad_norm": 0.4105634720377538, + "learning_rate": 9.711850746784937e-06, + "loss": 0.0776, + "step": 1607 + }, + { + "epoch": 0.14, + "grad_norm": 0.4564513774918546, + "learning_rate": 9.711394012665626e-06, + "loss": 0.0813, + "step": 1608 + }, + { + "epoch": 0.14, + "grad_norm": 0.5229489516213977, + "learning_rate": 9.710936927614241e-06, + "loss": 0.0904, + "step": 1609 + }, + { + "epoch": 0.14, + "grad_norm": 0.5909146669430879, + "learning_rate": 9.710479491664832e-06, + "loss": 0.1313, + "step": 1610 + }, + { + "epoch": 0.14, + "grad_norm": 0.5268795931281853, + "learning_rate": 9.71002170485147e-06, + "loss": 0.0984, + "step": 1611 + }, + { + "epoch": 0.14, + "grad_norm": 0.49806451081059294, + "learning_rate": 9.709563567208254e-06, + "loss": 0.1407, + "step": 1612 + }, + { + "epoch": 0.14, + "grad_norm": 1.3505897504550937, + "learning_rate": 9.709105078769306e-06, + "loss": 0.1429, + "step": 1613 + }, + { + "epoch": 0.14, + "grad_norm": 0.40472986301641434, + "learning_rate": 9.708646239568781e-06, + "loss": 0.0906, + "step": 1614 + }, + { + "epoch": 0.14, + "grad_norm": 0.3330593198849907, + "learning_rate": 9.708187049640853e-06, + "loss": 0.073, + "step": 1615 + }, + { + "epoch": 0.14, + "grad_norm": 0.355026167414063, + "learning_rate": 9.707727509019726e-06, + "loss": 0.0567, + "step": 1616 + }, + { + "epoch": 0.14, + "grad_norm": 0.31540416563774126, + "learning_rate": 9.707267617739632e-06, + "loss": 0.0839, + "step": 1617 + }, + { + "epoch": 0.14, + "grad_norm": 0.9833310243637143, + "learning_rate": 9.706807375834823e-06, + "loss": 0.0937, + "step": 1618 + }, + { + "epoch": 0.14, + "grad_norm": 0.41699340574494925, + "learning_rate": 9.70634678333958e-06, + "loss": 0.1076, + "step": 1619 + }, + { + "epoch": 0.14, + "grad_norm": 0.47901272549747814, + "learning_rate": 9.705885840288214e-06, + "loss": 0.0935, + "step": 1620 + }, + { + "epoch": 0.14, + "grad_norm": 0.8794044809776304, + "learning_rate": 9.705424546715055e-06, + "loss": 0.1308, + "step": 1621 + }, + { + "epoch": 0.14, + "grad_norm": 0.2991152516043269, + "learning_rate": 9.704962902654466e-06, + "loss": 0.0658, + "step": 1622 + }, + { + "epoch": 0.14, + "grad_norm": 0.4470362129046527, + "learning_rate": 9.70450090814083e-06, + "loss": 0.0628, + "step": 1623 + }, + { + "epoch": 0.14, + "grad_norm": 0.28368805335921876, + "learning_rate": 9.704038563208562e-06, + "loss": 0.0677, + "step": 1624 + }, + { + "epoch": 0.14, + "grad_norm": 0.4211828588435363, + "learning_rate": 9.703575867892099e-06, + "loss": 0.1121, + "step": 1625 + }, + { + "epoch": 0.14, + "grad_norm": 0.6006620620086295, + "learning_rate": 9.703112822225903e-06, + "loss": 0.1335, + "step": 1626 + }, + { + "epoch": 0.14, + "grad_norm": 0.34250629363217006, + "learning_rate": 9.702649426244469e-06, + "loss": 0.0726, + "step": 1627 + }, + { + "epoch": 0.14, + "grad_norm": 0.3446047913915509, + "learning_rate": 9.70218567998231e-06, + "loss": 0.0934, + "step": 1628 + }, + { + "epoch": 0.14, + "grad_norm": 0.4541490697934343, + "learning_rate": 9.701721583473968e-06, + "loss": 0.0815, + "step": 1629 + }, + { + "epoch": 0.14, + "grad_norm": 0.2800957868785917, + "learning_rate": 9.701257136754014e-06, + "loss": 0.0827, + "step": 1630 + }, + { + "epoch": 0.14, + "grad_norm": 0.5938545803444883, + "learning_rate": 9.700792339857042e-06, + "loss": 0.1428, + "step": 1631 + }, + { + "epoch": 0.14, + "grad_norm": 0.39735801907682106, + "learning_rate": 9.700327192817672e-06, + "loss": 0.0907, + "step": 1632 + }, + { + "epoch": 0.14, + "grad_norm": 0.33372457722583365, + "learning_rate": 9.69986169567055e-06, + "loss": 0.0947, + "step": 1633 + }, + { + "epoch": 0.14, + "grad_norm": 0.5399399754100918, + "learning_rate": 9.699395848450352e-06, + "loss": 0.085, + "step": 1634 + }, + { + "epoch": 0.14, + "grad_norm": 0.4788246806715867, + "learning_rate": 9.698929651191775e-06, + "loss": 0.0588, + "step": 1635 + }, + { + "epoch": 0.14, + "grad_norm": 0.3959951015948476, + "learning_rate": 9.698463103929542e-06, + "loss": 0.0847, + "step": 1636 + }, + { + "epoch": 0.14, + "grad_norm": 0.22307701210655395, + "learning_rate": 9.69799620669841e-06, + "loss": 0.0522, + "step": 1637 + }, + { + "epoch": 0.14, + "grad_norm": 0.5629817844772499, + "learning_rate": 9.69752895953315e-06, + "loss": 0.1118, + "step": 1638 + }, + { + "epoch": 0.14, + "grad_norm": 0.35079367337136536, + "learning_rate": 9.697061362468569e-06, + "loss": 0.0866, + "step": 1639 + }, + { + "epoch": 0.14, + "grad_norm": 0.4512904538653238, + "learning_rate": 9.696593415539494e-06, + "loss": 0.122, + "step": 1640 + }, + { + "epoch": 0.14, + "grad_norm": 0.39639752028183983, + "learning_rate": 9.696125118780784e-06, + "loss": 0.0946, + "step": 1641 + }, + { + "epoch": 0.14, + "grad_norm": 0.23958162538929298, + "learning_rate": 9.695656472227316e-06, + "loss": 0.0522, + "step": 1642 + }, + { + "epoch": 0.14, + "grad_norm": 0.30159306538982017, + "learning_rate": 9.695187475914e-06, + "loss": 0.0697, + "step": 1643 + }, + { + "epoch": 0.14, + "grad_norm": 0.7782686441904154, + "learning_rate": 9.694718129875772e-06, + "loss": 0.1254, + "step": 1644 + }, + { + "epoch": 0.14, + "grad_norm": 0.3518363797443654, + "learning_rate": 9.694248434147587e-06, + "loss": 0.109, + "step": 1645 + }, + { + "epoch": 0.14, + "grad_norm": 0.33831510922193114, + "learning_rate": 9.693778388764431e-06, + "loss": 0.0963, + "step": 1646 + }, + { + "epoch": 0.14, + "grad_norm": 0.24911569462044597, + "learning_rate": 9.69330799376132e-06, + "loss": 0.0735, + "step": 1647 + }, + { + "epoch": 0.14, + "grad_norm": 0.3983830652292169, + "learning_rate": 9.692837249173287e-06, + "loss": 0.0925, + "step": 1648 + }, + { + "epoch": 0.14, + "grad_norm": 0.2808850915461096, + "learning_rate": 9.692366155035399e-06, + "loss": 0.0649, + "step": 1649 + }, + { + "epoch": 0.14, + "grad_norm": 0.2760690129922068, + "learning_rate": 9.691894711382745e-06, + "loss": 0.0526, + "step": 1650 + }, + { + "epoch": 0.14, + "grad_norm": 0.34117336217318783, + "learning_rate": 9.691422918250441e-06, + "loss": 0.0951, + "step": 1651 + }, + { + "epoch": 0.14, + "grad_norm": 0.5470083354239057, + "learning_rate": 9.690950775673626e-06, + "loss": 0.0672, + "step": 1652 + }, + { + "epoch": 0.14, + "grad_norm": 0.32479946815877864, + "learning_rate": 9.690478283687473e-06, + "loss": 0.0578, + "step": 1653 + }, + { + "epoch": 0.14, + "grad_norm": 0.4500612467145754, + "learning_rate": 9.690005442327172e-06, + "loss": 0.1245, + "step": 1654 + }, + { + "epoch": 0.14, + "grad_norm": 0.7493651018061379, + "learning_rate": 9.689532251627946e-06, + "loss": 0.0945, + "step": 1655 + }, + { + "epoch": 0.14, + "grad_norm": 0.2083604300294076, + "learning_rate": 9.689058711625037e-06, + "loss": 0.0408, + "step": 1656 + }, + { + "epoch": 0.14, + "grad_norm": 0.37823970187962863, + "learning_rate": 9.688584822353721e-06, + "loss": 0.08, + "step": 1657 + }, + { + "epoch": 0.14, + "grad_norm": 0.3346852404506423, + "learning_rate": 9.688110583849296e-06, + "loss": 0.094, + "step": 1658 + }, + { + "epoch": 0.14, + "grad_norm": 0.3766079384996788, + "learning_rate": 9.687635996147081e-06, + "loss": 0.1067, + "step": 1659 + }, + { + "epoch": 0.14, + "grad_norm": 0.3973157988932218, + "learning_rate": 9.687161059282431e-06, + "loss": 0.0749, + "step": 1660 + }, + { + "epoch": 0.14, + "grad_norm": 0.31183440566978293, + "learning_rate": 9.68668577329072e-06, + "loss": 0.0912, + "step": 1661 + }, + { + "epoch": 0.14, + "grad_norm": 0.3539385255413199, + "learning_rate": 9.686210138207352e-06, + "loss": 0.0899, + "step": 1662 + }, + { + "epoch": 0.14, + "grad_norm": 0.18797687484048944, + "learning_rate": 9.685734154067752e-06, + "loss": 0.0404, + "step": 1663 + }, + { + "epoch": 0.14, + "grad_norm": 0.3722093333403882, + "learning_rate": 9.685257820907377e-06, + "loss": 0.0768, + "step": 1664 + }, + { + "epoch": 0.14, + "grad_norm": 0.3793665651940612, + "learning_rate": 9.684781138761705e-06, + "loss": 0.0753, + "step": 1665 + }, + { + "epoch": 0.14, + "grad_norm": 0.48333684582428976, + "learning_rate": 9.684304107666245e-06, + "loss": 0.0954, + "step": 1666 + }, + { + "epoch": 0.14, + "grad_norm": 0.3694805092446567, + "learning_rate": 9.683826727656525e-06, + "loss": 0.1027, + "step": 1667 + }, + { + "epoch": 0.14, + "grad_norm": 0.3859817412515084, + "learning_rate": 9.683348998768106e-06, + "loss": 0.0458, + "step": 1668 + }, + { + "epoch": 0.14, + "grad_norm": 0.4560717759024917, + "learning_rate": 9.682870921036569e-06, + "loss": 0.0914, + "step": 1669 + }, + { + "epoch": 0.14, + "grad_norm": 0.48765569775599016, + "learning_rate": 9.682392494497528e-06, + "loss": 0.1094, + "step": 1670 + }, + { + "epoch": 0.14, + "grad_norm": 0.29488885185697933, + "learning_rate": 9.681913719186616e-06, + "loss": 0.0725, + "step": 1671 + }, + { + "epoch": 0.14, + "grad_norm": 0.5631546267800813, + "learning_rate": 9.681434595139496e-06, + "loss": 0.1699, + "step": 1672 + }, + { + "epoch": 0.14, + "grad_norm": 0.9017945529494749, + "learning_rate": 9.680955122391858e-06, + "loss": 0.1346, + "step": 1673 + }, + { + "epoch": 0.14, + "grad_norm": 0.3227088044276552, + "learning_rate": 9.680475300979412e-06, + "loss": 0.0717, + "step": 1674 + }, + { + "epoch": 0.14, + "grad_norm": 0.5813438072495212, + "learning_rate": 9.6799951309379e-06, + "loss": 0.0853, + "step": 1675 + }, + { + "epoch": 0.14, + "grad_norm": 0.41135836425111183, + "learning_rate": 9.679514612303088e-06, + "loss": 0.0878, + "step": 1676 + }, + { + "epoch": 0.14, + "grad_norm": 0.36621261302916064, + "learning_rate": 9.679033745110767e-06, + "loss": 0.1101, + "step": 1677 + }, + { + "epoch": 0.14, + "grad_norm": 0.3549293253532294, + "learning_rate": 9.678552529396757e-06, + "loss": 0.0818, + "step": 1678 + }, + { + "epoch": 0.14, + "grad_norm": 0.4394260590726517, + "learning_rate": 9.678070965196897e-06, + "loss": 0.0864, + "step": 1679 + }, + { + "epoch": 0.14, + "grad_norm": 0.9490709657429738, + "learning_rate": 9.677589052547063e-06, + "loss": 0.1304, + "step": 1680 + }, + { + "epoch": 0.14, + "grad_norm": 0.45403516007725714, + "learning_rate": 9.677106791483147e-06, + "loss": 0.082, + "step": 1681 + }, + { + "epoch": 0.14, + "grad_norm": 0.3489199906452373, + "learning_rate": 9.676624182041067e-06, + "loss": 0.0674, + "step": 1682 + }, + { + "epoch": 0.14, + "grad_norm": 0.5438823422306638, + "learning_rate": 9.676141224256778e-06, + "loss": 0.1462, + "step": 1683 + }, + { + "epoch": 0.14, + "grad_norm": 0.850743670015194, + "learning_rate": 9.675657918166251e-06, + "loss": 0.0983, + "step": 1684 + }, + { + "epoch": 0.14, + "grad_norm": 0.25491265592182877, + "learning_rate": 9.675174263805483e-06, + "loss": 0.0496, + "step": 1685 + }, + { + "epoch": 0.14, + "grad_norm": 0.48138606967626524, + "learning_rate": 9.6746902612105e-06, + "loss": 0.0853, + "step": 1686 + }, + { + "epoch": 0.14, + "grad_norm": 0.4577760492863844, + "learning_rate": 9.674205910417356e-06, + "loss": 0.0967, + "step": 1687 + }, + { + "epoch": 0.14, + "grad_norm": 0.3258968424402221, + "learning_rate": 9.673721211462125e-06, + "loss": 0.1017, + "step": 1688 + }, + { + "epoch": 0.14, + "grad_norm": 0.35809153124559706, + "learning_rate": 9.673236164380912e-06, + "loss": 0.1029, + "step": 1689 + }, + { + "epoch": 0.14, + "grad_norm": 0.44933256820846557, + "learning_rate": 9.672750769209847e-06, + "loss": 0.1082, + "step": 1690 + }, + { + "epoch": 0.14, + "grad_norm": 0.3382113975920095, + "learning_rate": 9.672265025985083e-06, + "loss": 0.0745, + "step": 1691 + }, + { + "epoch": 0.14, + "grad_norm": 0.37860175862659473, + "learning_rate": 9.671778934742803e-06, + "loss": 0.0929, + "step": 1692 + }, + { + "epoch": 0.14, + "grad_norm": 0.943085121444794, + "learning_rate": 9.67129249551921e-06, + "loss": 0.1203, + "step": 1693 + }, + { + "epoch": 0.14, + "grad_norm": 0.5332242438493886, + "learning_rate": 9.670805708350544e-06, + "loss": 0.1086, + "step": 1694 + }, + { + "epoch": 0.14, + "grad_norm": 0.7995730641515192, + "learning_rate": 9.670318573273056e-06, + "loss": 0.1611, + "step": 1695 + }, + { + "epoch": 0.14, + "grad_norm": 0.36393982251294177, + "learning_rate": 9.669831090323036e-06, + "loss": 0.0755, + "step": 1696 + }, + { + "epoch": 0.14, + "grad_norm": 0.5420918455984371, + "learning_rate": 9.669343259536792e-06, + "loss": 0.1529, + "step": 1697 + }, + { + "epoch": 0.14, + "grad_norm": 0.3258567145449449, + "learning_rate": 9.668855080950661e-06, + "loss": 0.0909, + "step": 1698 + }, + { + "epoch": 0.14, + "grad_norm": 0.472418532031593, + "learning_rate": 9.668366554601005e-06, + "loss": 0.1407, + "step": 1699 + }, + { + "epoch": 0.14, + "grad_norm": 0.3067907705928447, + "learning_rate": 9.667877680524213e-06, + "loss": 0.0817, + "step": 1700 + }, + { + "epoch": 0.14, + "grad_norm": 0.4783910353119245, + "learning_rate": 9.667388458756699e-06, + "loss": 0.1296, + "step": 1701 + }, + { + "epoch": 0.14, + "grad_norm": 0.6832021852971457, + "learning_rate": 9.666898889334903e-06, + "loss": 0.1709, + "step": 1702 + }, + { + "epoch": 0.14, + "grad_norm": 0.2539010809356441, + "learning_rate": 9.666408972295291e-06, + "loss": 0.0473, + "step": 1703 + }, + { + "epoch": 0.14, + "grad_norm": 0.3915263050142795, + "learning_rate": 9.665918707674356e-06, + "loss": 0.0965, + "step": 1704 + }, + { + "epoch": 0.14, + "grad_norm": 0.443688343543515, + "learning_rate": 9.665428095508612e-06, + "loss": 0.0805, + "step": 1705 + }, + { + "epoch": 0.14, + "grad_norm": 0.2462069906348091, + "learning_rate": 9.664937135834607e-06, + "loss": 0.0528, + "step": 1706 + }, + { + "epoch": 0.14, + "grad_norm": 0.3059833800395932, + "learning_rate": 9.664445828688909e-06, + "loss": 0.0779, + "step": 1707 + }, + { + "epoch": 0.14, + "grad_norm": 0.34997345092857235, + "learning_rate": 9.663954174108114e-06, + "loss": 0.0466, + "step": 1708 + }, + { + "epoch": 0.14, + "grad_norm": 0.34688101090722157, + "learning_rate": 9.66346217212884e-06, + "loss": 0.0691, + "step": 1709 + }, + { + "epoch": 0.14, + "grad_norm": 0.4635148450913152, + "learning_rate": 9.66296982278774e-06, + "loss": 0.0909, + "step": 1710 + }, + { + "epoch": 0.14, + "grad_norm": 0.40611009037167684, + "learning_rate": 9.662477126121481e-06, + "loss": 0.058, + "step": 1711 + }, + { + "epoch": 0.14, + "grad_norm": 0.545865533814014, + "learning_rate": 9.661984082166766e-06, + "loss": 0.0715, + "step": 1712 + }, + { + "epoch": 0.14, + "grad_norm": 0.465371240062029, + "learning_rate": 9.661490690960318e-06, + "loss": 0.1325, + "step": 1713 + }, + { + "epoch": 0.14, + "grad_norm": 0.3366913122905652, + "learning_rate": 9.66099695253889e-06, + "loss": 0.0809, + "step": 1714 + }, + { + "epoch": 0.14, + "grad_norm": 0.39896688225891824, + "learning_rate": 9.660502866939254e-06, + "loss": 0.0627, + "step": 1715 + }, + { + "epoch": 0.14, + "grad_norm": 0.6788029674185972, + "learning_rate": 9.660008434198217e-06, + "loss": 0.1011, + "step": 1716 + }, + { + "epoch": 0.14, + "grad_norm": 0.31861354983673584, + "learning_rate": 9.659513654352603e-06, + "loss": 0.0941, + "step": 1717 + }, + { + "epoch": 0.14, + "grad_norm": 0.5258465189832534, + "learning_rate": 9.65901852743927e-06, + "loss": 0.1099, + "step": 1718 + }, + { + "epoch": 0.14, + "grad_norm": 0.2710032818313362, + "learning_rate": 9.658523053495096e-06, + "loss": 0.0883, + "step": 1719 + }, + { + "epoch": 0.14, + "grad_norm": 0.40871914245954305, + "learning_rate": 9.658027232556989e-06, + "loss": 0.1211, + "step": 1720 + }, + { + "epoch": 0.15, + "grad_norm": 0.2716850623023346, + "learning_rate": 9.657531064661875e-06, + "loss": 0.0515, + "step": 1721 + }, + { + "epoch": 0.15, + "grad_norm": 0.2772492049281881, + "learning_rate": 9.657034549846719e-06, + "loss": 0.0867, + "step": 1722 + }, + { + "epoch": 0.15, + "grad_norm": 0.4174072165238044, + "learning_rate": 9.656537688148499e-06, + "loss": 0.1076, + "step": 1723 + }, + { + "epoch": 0.15, + "grad_norm": 0.3469617477897984, + "learning_rate": 9.656040479604225e-06, + "loss": 0.0854, + "step": 1724 + }, + { + "epoch": 0.15, + "grad_norm": 0.7115411063230116, + "learning_rate": 9.655542924250933e-06, + "loss": 0.11, + "step": 1725 + }, + { + "epoch": 0.15, + "grad_norm": 0.4116103739510056, + "learning_rate": 9.655045022125683e-06, + "loss": 0.1242, + "step": 1726 + }, + { + "epoch": 0.15, + "grad_norm": 0.3540199019327835, + "learning_rate": 9.654546773265562e-06, + "loss": 0.1175, + "step": 1727 + }, + { + "epoch": 0.15, + "grad_norm": 0.2450499848005601, + "learning_rate": 9.654048177707684e-06, + "loss": 0.0498, + "step": 1728 + }, + { + "epoch": 0.15, + "grad_norm": 0.23230475104288978, + "learning_rate": 9.653549235489184e-06, + "loss": 0.0498, + "step": 1729 + }, + { + "epoch": 0.15, + "grad_norm": 0.33021047725469893, + "learning_rate": 9.65304994664723e-06, + "loss": 0.0554, + "step": 1730 + }, + { + "epoch": 0.15, + "grad_norm": 0.3298519310981313, + "learning_rate": 9.652550311219008e-06, + "loss": 0.0717, + "step": 1731 + }, + { + "epoch": 0.15, + "grad_norm": 0.6040181845310911, + "learning_rate": 9.652050329241737e-06, + "loss": 0.0909, + "step": 1732 + }, + { + "epoch": 0.15, + "grad_norm": 0.37257839442426416, + "learning_rate": 9.651550000752656e-06, + "loss": 0.1009, + "step": 1733 + }, + { + "epoch": 0.15, + "grad_norm": 0.4375964409520717, + "learning_rate": 9.651049325789035e-06, + "loss": 0.1221, + "step": 1734 + }, + { + "epoch": 0.15, + "grad_norm": 0.3517558742518417, + "learning_rate": 9.650548304388166e-06, + "loss": 0.0812, + "step": 1735 + }, + { + "epoch": 0.15, + "grad_norm": 0.48173316990309206, + "learning_rate": 9.650046936587367e-06, + "loss": 0.1067, + "step": 1736 + }, + { + "epoch": 0.15, + "grad_norm": 0.7544284724774956, + "learning_rate": 9.649545222423984e-06, + "loss": 0.1244, + "step": 1737 + }, + { + "epoch": 0.15, + "grad_norm": 0.2041707673219208, + "learning_rate": 9.649043161935387e-06, + "loss": 0.0609, + "step": 1738 + }, + { + "epoch": 0.15, + "grad_norm": 0.4313445397604857, + "learning_rate": 9.648540755158973e-06, + "loss": 0.0968, + "step": 1739 + }, + { + "epoch": 0.15, + "grad_norm": 0.3546779767254469, + "learning_rate": 9.648038002132164e-06, + "loss": 0.1146, + "step": 1740 + }, + { + "epoch": 0.15, + "grad_norm": 0.6625767198256146, + "learning_rate": 9.647534902892405e-06, + "loss": 0.107, + "step": 1741 + }, + { + "epoch": 0.15, + "grad_norm": 0.49935009692641946, + "learning_rate": 9.647031457477176e-06, + "loss": 0.1197, + "step": 1742 + }, + { + "epoch": 0.15, + "grad_norm": 0.5357819950506627, + "learning_rate": 9.646527665923973e-06, + "loss": 0.0958, + "step": 1743 + }, + { + "epoch": 0.15, + "grad_norm": 0.4040191738680696, + "learning_rate": 9.646023528270319e-06, + "loss": 0.1034, + "step": 1744 + }, + { + "epoch": 0.15, + "grad_norm": 0.4773618109906908, + "learning_rate": 9.64551904455377e-06, + "loss": 0.1, + "step": 1745 + }, + { + "epoch": 0.15, + "grad_norm": 0.3539996100809835, + "learning_rate": 9.645014214811899e-06, + "loss": 0.0793, + "step": 1746 + }, + { + "epoch": 0.15, + "grad_norm": 0.3542687730299365, + "learning_rate": 9.64450903908231e-06, + "loss": 0.1013, + "step": 1747 + }, + { + "epoch": 0.15, + "grad_norm": 0.35096286233135393, + "learning_rate": 9.644003517402631e-06, + "loss": 0.1027, + "step": 1748 + }, + { + "epoch": 0.15, + "grad_norm": 0.2733711946774802, + "learning_rate": 9.643497649810518e-06, + "loss": 0.0713, + "step": 1749 + }, + { + "epoch": 0.15, + "grad_norm": 0.3692470032729203, + "learning_rate": 9.64299143634365e-06, + "loss": 0.0749, + "step": 1750 + }, + { + "epoch": 0.15, + "grad_norm": 0.4059296918235959, + "learning_rate": 9.642484877039732e-06, + "loss": 0.1344, + "step": 1751 + }, + { + "epoch": 0.15, + "grad_norm": 0.4066474635625967, + "learning_rate": 9.641977971936496e-06, + "loss": 0.0738, + "step": 1752 + }, + { + "epoch": 0.15, + "grad_norm": 0.43514945540588296, + "learning_rate": 9.641470721071698e-06, + "loss": 0.0766, + "step": 1753 + }, + { + "epoch": 0.15, + "grad_norm": 0.47303169059026534, + "learning_rate": 9.640963124483123e-06, + "loss": 0.114, + "step": 1754 + }, + { + "epoch": 0.15, + "grad_norm": 0.4220763483611639, + "learning_rate": 9.640455182208578e-06, + "loss": 0.1252, + "step": 1755 + }, + { + "epoch": 0.15, + "grad_norm": 0.29092101217812594, + "learning_rate": 9.639946894285898e-06, + "loss": 0.0647, + "step": 1756 + }, + { + "epoch": 0.15, + "grad_norm": 0.4122689323009027, + "learning_rate": 9.639438260752945e-06, + "loss": 0.1138, + "step": 1757 + }, + { + "epoch": 0.15, + "grad_norm": 0.501291769446736, + "learning_rate": 9.638929281647604e-06, + "loss": 0.0911, + "step": 1758 + }, + { + "epoch": 0.15, + "grad_norm": 0.24670073911623808, + "learning_rate": 9.638419957007784e-06, + "loss": 0.0422, + "step": 1759 + }, + { + "epoch": 0.15, + "grad_norm": 0.5215980826502019, + "learning_rate": 9.637910286871428e-06, + "loss": 0.1397, + "step": 1760 + }, + { + "epoch": 0.15, + "grad_norm": 0.30612686064646755, + "learning_rate": 9.637400271276495e-06, + "loss": 0.0769, + "step": 1761 + }, + { + "epoch": 0.15, + "grad_norm": 0.3953106733774438, + "learning_rate": 9.636889910260974e-06, + "loss": 0.1056, + "step": 1762 + }, + { + "epoch": 0.15, + "grad_norm": 0.46752411724635234, + "learning_rate": 9.63637920386288e-06, + "loss": 0.0741, + "step": 1763 + }, + { + "epoch": 0.15, + "grad_norm": 0.5339227032316735, + "learning_rate": 9.635868152120256e-06, + "loss": 0.0925, + "step": 1764 + }, + { + "epoch": 0.15, + "grad_norm": 0.2254874092089242, + "learning_rate": 9.635356755071165e-06, + "loss": 0.0582, + "step": 1765 + }, + { + "epoch": 0.15, + "grad_norm": 0.7991655491318462, + "learning_rate": 9.6348450127537e-06, + "loss": 0.1336, + "step": 1766 + }, + { + "epoch": 0.15, + "grad_norm": 0.47281557900482096, + "learning_rate": 9.634332925205978e-06, + "loss": 0.108, + "step": 1767 + }, + { + "epoch": 0.15, + "grad_norm": 0.3818476949818449, + "learning_rate": 9.633820492466144e-06, + "loss": 0.0891, + "step": 1768 + }, + { + "epoch": 0.15, + "grad_norm": 0.3763393573070956, + "learning_rate": 9.633307714572363e-06, + "loss": 0.1092, + "step": 1769 + }, + { + "epoch": 0.15, + "grad_norm": 0.4202087038113615, + "learning_rate": 9.632794591562835e-06, + "loss": 0.0798, + "step": 1770 + }, + { + "epoch": 0.15, + "grad_norm": 0.3985458833864818, + "learning_rate": 9.632281123475778e-06, + "loss": 0.1118, + "step": 1771 + }, + { + "epoch": 0.15, + "grad_norm": 0.3821740888533187, + "learning_rate": 9.631767310349438e-06, + "loss": 0.1079, + "step": 1772 + }, + { + "epoch": 0.15, + "grad_norm": 0.3284128242194129, + "learning_rate": 9.631253152222085e-06, + "loss": 0.0373, + "step": 1773 + }, + { + "epoch": 0.15, + "grad_norm": 0.9914885394284557, + "learning_rate": 9.630738649132018e-06, + "loss": 0.1046, + "step": 1774 + }, + { + "epoch": 0.15, + "grad_norm": 0.5969830749512147, + "learning_rate": 9.630223801117562e-06, + "loss": 0.1283, + "step": 1775 + }, + { + "epoch": 0.15, + "grad_norm": 0.6101035610014983, + "learning_rate": 9.629708608217063e-06, + "loss": 0.1518, + "step": 1776 + }, + { + "epoch": 0.15, + "grad_norm": 0.38428681715152785, + "learning_rate": 9.629193070468898e-06, + "loss": 0.0988, + "step": 1777 + }, + { + "epoch": 0.15, + "grad_norm": 0.2636046602213241, + "learning_rate": 9.628677187911466e-06, + "loss": 0.0684, + "step": 1778 + }, + { + "epoch": 0.15, + "grad_norm": 0.463994127607072, + "learning_rate": 9.628160960583193e-06, + "loss": 0.1169, + "step": 1779 + }, + { + "epoch": 0.15, + "grad_norm": 0.6542644911398723, + "learning_rate": 9.627644388522528e-06, + "loss": 0.1483, + "step": 1780 + }, + { + "epoch": 0.15, + "grad_norm": 0.32888198276290986, + "learning_rate": 9.627127471767955e-06, + "loss": 0.0648, + "step": 1781 + }, + { + "epoch": 0.15, + "grad_norm": 0.27175643443963027, + "learning_rate": 9.626610210357969e-06, + "loss": 0.0559, + "step": 1782 + }, + { + "epoch": 0.15, + "grad_norm": 0.4597624521746198, + "learning_rate": 9.626092604331105e-06, + "loss": 0.138, + "step": 1783 + }, + { + "epoch": 0.15, + "grad_norm": 0.4700774146925981, + "learning_rate": 9.625574653725916e-06, + "loss": 0.1012, + "step": 1784 + }, + { + "epoch": 0.15, + "grad_norm": 0.5737118216114742, + "learning_rate": 9.625056358580978e-06, + "loss": 0.1498, + "step": 1785 + }, + { + "epoch": 0.15, + "grad_norm": 0.3181079907006066, + "learning_rate": 9.624537718934901e-06, + "loss": 0.0861, + "step": 1786 + }, + { + "epoch": 0.15, + "grad_norm": 0.33637802760195745, + "learning_rate": 9.624018734826314e-06, + "loss": 0.1093, + "step": 1787 + }, + { + "epoch": 0.15, + "grad_norm": 0.27508410210893214, + "learning_rate": 9.623499406293874e-06, + "loss": 0.0555, + "step": 1788 + }, + { + "epoch": 0.15, + "grad_norm": 0.4343609478317466, + "learning_rate": 9.622979733376266e-06, + "loss": 0.0982, + "step": 1789 + }, + { + "epoch": 0.15, + "grad_norm": 0.3448739760588774, + "learning_rate": 9.622459716112195e-06, + "loss": 0.0561, + "step": 1790 + }, + { + "epoch": 0.15, + "grad_norm": 0.23680149695828268, + "learning_rate": 9.621939354540399e-06, + "loss": 0.073, + "step": 1791 + }, + { + "epoch": 0.15, + "grad_norm": 0.255239398428975, + "learning_rate": 9.621418648699634e-06, + "loss": 0.0338, + "step": 1792 + }, + { + "epoch": 0.15, + "grad_norm": 0.42482429853238646, + "learning_rate": 9.620897598628685e-06, + "loss": 0.0618, + "step": 1793 + }, + { + "epoch": 0.15, + "grad_norm": 0.4804070310811205, + "learning_rate": 9.620376204366365e-06, + "loss": 0.0904, + "step": 1794 + }, + { + "epoch": 0.15, + "grad_norm": 0.2975749195132777, + "learning_rate": 9.61985446595151e-06, + "loss": 0.0826, + "step": 1795 + }, + { + "epoch": 0.15, + "grad_norm": 0.66197686965244, + "learning_rate": 9.619332383422981e-06, + "loss": 0.1226, + "step": 1796 + }, + { + "epoch": 0.15, + "grad_norm": 0.3546357737399377, + "learning_rate": 9.618809956819667e-06, + "loss": 0.0767, + "step": 1797 + }, + { + "epoch": 0.15, + "grad_norm": 0.575304126887479, + "learning_rate": 9.618287186180481e-06, + "loss": 0.1163, + "step": 1798 + }, + { + "epoch": 0.15, + "grad_norm": 0.45701609110518454, + "learning_rate": 9.617764071544361e-06, + "loss": 0.1201, + "step": 1799 + }, + { + "epoch": 0.15, + "grad_norm": 0.33695926938051457, + "learning_rate": 9.617240612950274e-06, + "loss": 0.0768, + "step": 1800 + }, + { + "epoch": 0.15, + "grad_norm": 0.5728228430650636, + "learning_rate": 9.616716810437208e-06, + "loss": 0.1186, + "step": 1801 + }, + { + "epoch": 0.15, + "grad_norm": 0.5054069417482399, + "learning_rate": 9.616192664044179e-06, + "loss": 0.0869, + "step": 1802 + }, + { + "epoch": 0.15, + "grad_norm": 0.607561965061396, + "learning_rate": 9.615668173810228e-06, + "loss": 0.1013, + "step": 1803 + }, + { + "epoch": 0.15, + "grad_norm": 0.489446526027828, + "learning_rate": 9.615143339774426e-06, + "loss": 0.1139, + "step": 1804 + }, + { + "epoch": 0.15, + "grad_norm": 0.4652410196145404, + "learning_rate": 9.614618161975861e-06, + "loss": 0.1208, + "step": 1805 + }, + { + "epoch": 0.15, + "grad_norm": 0.3307487510723781, + "learning_rate": 9.614092640453654e-06, + "loss": 0.0764, + "step": 1806 + }, + { + "epoch": 0.15, + "grad_norm": 0.4834579931552204, + "learning_rate": 9.613566775246948e-06, + "loss": 0.0963, + "step": 1807 + }, + { + "epoch": 0.15, + "grad_norm": 1.0709741162931112, + "learning_rate": 9.613040566394911e-06, + "loss": 0.1493, + "step": 1808 + }, + { + "epoch": 0.15, + "grad_norm": 0.7491351902539422, + "learning_rate": 9.612514013936741e-06, + "loss": 0.1409, + "step": 1809 + }, + { + "epoch": 0.15, + "grad_norm": 0.4938599196296888, + "learning_rate": 9.611987117911657e-06, + "loss": 0.0731, + "step": 1810 + }, + { + "epoch": 0.15, + "grad_norm": 1.153649357147798, + "learning_rate": 9.611459878358906e-06, + "loss": 0.1837, + "step": 1811 + }, + { + "epoch": 0.15, + "grad_norm": 0.43122340317346164, + "learning_rate": 9.610932295317762e-06, + "loss": 0.0889, + "step": 1812 + }, + { + "epoch": 0.15, + "grad_norm": 0.37993865832808865, + "learning_rate": 9.610404368827515e-06, + "loss": 0.0912, + "step": 1813 + }, + { + "epoch": 0.15, + "grad_norm": 0.523529343254003, + "learning_rate": 9.609876098927496e-06, + "loss": 0.1041, + "step": 1814 + }, + { + "epoch": 0.15, + "grad_norm": 0.4902970994945036, + "learning_rate": 9.60934748565705e-06, + "loss": 0.1151, + "step": 1815 + }, + { + "epoch": 0.15, + "grad_norm": 0.4062905869418258, + "learning_rate": 9.608818529055552e-06, + "loss": 0.0852, + "step": 1816 + }, + { + "epoch": 0.15, + "grad_norm": 0.46650697904878186, + "learning_rate": 9.608289229162401e-06, + "loss": 0.0766, + "step": 1817 + }, + { + "epoch": 0.15, + "grad_norm": 0.7142306543567551, + "learning_rate": 9.607759586017025e-06, + "loss": 0.0946, + "step": 1818 + }, + { + "epoch": 0.15, + "grad_norm": 0.37701850761138117, + "learning_rate": 9.60722959965887e-06, + "loss": 0.1182, + "step": 1819 + }, + { + "epoch": 0.15, + "grad_norm": 0.5784341156163959, + "learning_rate": 9.606699270127417e-06, + "loss": 0.1008, + "step": 1820 + }, + { + "epoch": 0.15, + "grad_norm": 0.37996851033851764, + "learning_rate": 9.606168597462167e-06, + "loss": 0.1154, + "step": 1821 + }, + { + "epoch": 0.15, + "grad_norm": 0.6904213355326778, + "learning_rate": 9.605637581702644e-06, + "loss": 0.1053, + "step": 1822 + }, + { + "epoch": 0.15, + "grad_norm": 0.4632611818469938, + "learning_rate": 9.605106222888408e-06, + "loss": 0.0924, + "step": 1823 + }, + { + "epoch": 0.15, + "grad_norm": 0.29790325861550254, + "learning_rate": 9.604574521059031e-06, + "loss": 0.0769, + "step": 1824 + }, + { + "epoch": 0.15, + "grad_norm": 0.2935469359880675, + "learning_rate": 9.60404247625412e-06, + "loss": 0.0848, + "step": 1825 + }, + { + "epoch": 0.15, + "grad_norm": 0.35847531601618876, + "learning_rate": 9.603510088513306e-06, + "loss": 0.1, + "step": 1826 + }, + { + "epoch": 0.15, + "grad_norm": 0.30672484585996274, + "learning_rate": 9.602977357876243e-06, + "loss": 0.0917, + "step": 1827 + }, + { + "epoch": 0.15, + "grad_norm": 0.8760810347049858, + "learning_rate": 9.602444284382613e-06, + "loss": 0.1615, + "step": 1828 + }, + { + "epoch": 0.15, + "grad_norm": 0.33654705848736377, + "learning_rate": 9.60191086807212e-06, + "loss": 0.0878, + "step": 1829 + }, + { + "epoch": 0.15, + "grad_norm": 0.4711618548231413, + "learning_rate": 9.601377108984498e-06, + "loss": 0.1196, + "step": 1830 + }, + { + "epoch": 0.15, + "grad_norm": 0.4243693448127547, + "learning_rate": 9.600843007159504e-06, + "loss": 0.1094, + "step": 1831 + }, + { + "epoch": 0.15, + "grad_norm": 0.33136047238713895, + "learning_rate": 9.60030856263692e-06, + "loss": 0.1052, + "step": 1832 + }, + { + "epoch": 0.15, + "grad_norm": 0.2866495821779736, + "learning_rate": 9.599773775456556e-06, + "loss": 0.0861, + "step": 1833 + }, + { + "epoch": 0.15, + "grad_norm": 0.4270524679787621, + "learning_rate": 9.599238645658247e-06, + "loss": 0.0793, + "step": 1834 + }, + { + "epoch": 0.15, + "grad_norm": 0.4709436703134125, + "learning_rate": 9.59870317328185e-06, + "loss": 0.1241, + "step": 1835 + }, + { + "epoch": 0.15, + "grad_norm": 0.6749150505946065, + "learning_rate": 9.59816735836725e-06, + "loss": 0.1411, + "step": 1836 + }, + { + "epoch": 0.15, + "grad_norm": 0.3885439690384418, + "learning_rate": 9.597631200954361e-06, + "loss": 0.0824, + "step": 1837 + }, + { + "epoch": 0.15, + "grad_norm": 0.29366077540168367, + "learning_rate": 9.597094701083114e-06, + "loss": 0.0765, + "step": 1838 + }, + { + "epoch": 0.15, + "grad_norm": 0.3109636267916011, + "learning_rate": 9.596557858793476e-06, + "loss": 0.0851, + "step": 1839 + }, + { + "epoch": 0.16, + "grad_norm": 0.5061885870624682, + "learning_rate": 9.59602067412543e-06, + "loss": 0.1226, + "step": 1840 + }, + { + "epoch": 0.16, + "grad_norm": 0.22946441238905588, + "learning_rate": 9.595483147118993e-06, + "loss": 0.0746, + "step": 1841 + }, + { + "epoch": 0.16, + "grad_norm": 0.33264233471642923, + "learning_rate": 9.5949452778142e-06, + "loss": 0.0781, + "step": 1842 + }, + { + "epoch": 0.16, + "grad_norm": 0.40897360664440763, + "learning_rate": 9.594407066251114e-06, + "loss": 0.1093, + "step": 1843 + }, + { + "epoch": 0.16, + "grad_norm": 0.3643547803657084, + "learning_rate": 9.593868512469824e-06, + "loss": 0.1223, + "step": 1844 + }, + { + "epoch": 0.16, + "grad_norm": 0.3726987785926976, + "learning_rate": 9.59332961651045e-06, + "loss": 0.1127, + "step": 1845 + }, + { + "epoch": 0.16, + "grad_norm": 0.4545366482154216, + "learning_rate": 9.592790378413125e-06, + "loss": 0.1041, + "step": 1846 + }, + { + "epoch": 0.16, + "grad_norm": 1.0369463398658216, + "learning_rate": 9.592250798218018e-06, + "loss": 0.1405, + "step": 1847 + }, + { + "epoch": 0.16, + "grad_norm": 0.2849198775900309, + "learning_rate": 9.59171087596532e-06, + "loss": 0.0846, + "step": 1848 + }, + { + "epoch": 0.16, + "grad_norm": 0.2600027101766995, + "learning_rate": 9.591170611695246e-06, + "loss": 0.0424, + "step": 1849 + }, + { + "epoch": 0.16, + "grad_norm": 0.5776973722820054, + "learning_rate": 9.590630005448041e-06, + "loss": 0.1157, + "step": 1850 + }, + { + "epoch": 0.16, + "grad_norm": 0.37225970150218674, + "learning_rate": 9.590089057263972e-06, + "loss": 0.0796, + "step": 1851 + }, + { + "epoch": 0.16, + "grad_norm": 0.49731182522790984, + "learning_rate": 9.589547767183328e-06, + "loss": 0.093, + "step": 1852 + }, + { + "epoch": 0.16, + "grad_norm": 0.2215884233876604, + "learning_rate": 9.58900613524643e-06, + "loss": 0.0551, + "step": 1853 + }, + { + "epoch": 0.16, + "grad_norm": 0.30276416906795556, + "learning_rate": 9.588464161493624e-06, + "loss": 0.0831, + "step": 1854 + }, + { + "epoch": 0.16, + "grad_norm": 0.29190265813288707, + "learning_rate": 9.587921845965278e-06, + "loss": 0.057, + "step": 1855 + }, + { + "epoch": 0.16, + "grad_norm": 0.23759949715209405, + "learning_rate": 9.587379188701785e-06, + "loss": 0.0487, + "step": 1856 + }, + { + "epoch": 0.16, + "grad_norm": 0.34838472796213904, + "learning_rate": 9.586836189743567e-06, + "loss": 0.1108, + "step": 1857 + }, + { + "epoch": 0.16, + "grad_norm": 0.26541310087255937, + "learning_rate": 9.58629284913107e-06, + "loss": 0.0677, + "step": 1858 + }, + { + "epoch": 0.16, + "grad_norm": 0.3078904514761348, + "learning_rate": 9.585749166904762e-06, + "loss": 0.0803, + "step": 1859 + }, + { + "epoch": 0.16, + "grad_norm": 0.36310516759145084, + "learning_rate": 9.585205143105144e-06, + "loss": 0.1302, + "step": 1860 + }, + { + "epoch": 0.16, + "grad_norm": 0.2847219970624417, + "learning_rate": 9.584660777772736e-06, + "loss": 0.0868, + "step": 1861 + }, + { + "epoch": 0.16, + "grad_norm": 0.16860687460144833, + "learning_rate": 9.584116070948083e-06, + "loss": 0.0571, + "step": 1862 + }, + { + "epoch": 0.16, + "grad_norm": 0.331191898363734, + "learning_rate": 9.583571022671765e-06, + "loss": 0.09, + "step": 1863 + }, + { + "epoch": 0.16, + "grad_norm": 0.20552531767640034, + "learning_rate": 9.583025632984374e-06, + "loss": 0.0605, + "step": 1864 + }, + { + "epoch": 0.16, + "grad_norm": 0.4004598143455329, + "learning_rate": 9.582479901926536e-06, + "loss": 0.0986, + "step": 1865 + }, + { + "epoch": 0.16, + "grad_norm": 0.19408586707790806, + "learning_rate": 9.581933829538899e-06, + "loss": 0.0211, + "step": 1866 + }, + { + "epoch": 0.16, + "grad_norm": 0.33038882294102445, + "learning_rate": 9.581387415862139e-06, + "loss": 0.0799, + "step": 1867 + }, + { + "epoch": 0.16, + "grad_norm": 0.4257771852146185, + "learning_rate": 9.580840660936957e-06, + "loss": 0.1007, + "step": 1868 + }, + { + "epoch": 0.16, + "grad_norm": 0.2914026436021699, + "learning_rate": 9.580293564804074e-06, + "loss": 0.0798, + "step": 1869 + }, + { + "epoch": 0.16, + "grad_norm": 0.3996582725614731, + "learning_rate": 9.579746127504247e-06, + "loss": 0.1092, + "step": 1870 + }, + { + "epoch": 0.16, + "grad_norm": 0.4070319055361208, + "learning_rate": 9.579198349078248e-06, + "loss": 0.1136, + "step": 1871 + }, + { + "epoch": 0.16, + "grad_norm": 0.3973404828997548, + "learning_rate": 9.578650229566883e-06, + "loss": 0.1048, + "step": 1872 + }, + { + "epoch": 0.16, + "grad_norm": 0.2563788951519117, + "learning_rate": 9.578101769010974e-06, + "loss": 0.0806, + "step": 1873 + }, + { + "epoch": 0.16, + "grad_norm": 0.47528229881893647, + "learning_rate": 9.577552967451376e-06, + "loss": 0.0897, + "step": 1874 + }, + { + "epoch": 0.16, + "grad_norm": 1.2667795499584422, + "learning_rate": 9.577003824928967e-06, + "loss": 0.1193, + "step": 1875 + }, + { + "epoch": 0.16, + "grad_norm": 0.5736140290053159, + "learning_rate": 9.57645434148465e-06, + "loss": 0.1038, + "step": 1876 + }, + { + "epoch": 0.16, + "grad_norm": 0.2772419181084563, + "learning_rate": 9.575904517159354e-06, + "loss": 0.1191, + "step": 1877 + }, + { + "epoch": 0.16, + "grad_norm": 0.2882163210317352, + "learning_rate": 9.575354351994032e-06, + "loss": 0.0774, + "step": 1878 + }, + { + "epoch": 0.16, + "grad_norm": 0.3667216601081788, + "learning_rate": 9.574803846029666e-06, + "loss": 0.0974, + "step": 1879 + }, + { + "epoch": 0.16, + "grad_norm": 0.30618162963866624, + "learning_rate": 9.574252999307258e-06, + "loss": 0.0635, + "step": 1880 + }, + { + "epoch": 0.16, + "grad_norm": 0.37307176651415713, + "learning_rate": 9.57370181186784e-06, + "loss": 0.101, + "step": 1881 + }, + { + "epoch": 0.16, + "grad_norm": 0.2719634094345342, + "learning_rate": 9.573150283752466e-06, + "loss": 0.0704, + "step": 1882 + }, + { + "epoch": 0.16, + "grad_norm": 0.359281631744184, + "learning_rate": 9.57259841500222e-06, + "loss": 0.1096, + "step": 1883 + }, + { + "epoch": 0.16, + "grad_norm": 0.2814151280573117, + "learning_rate": 9.572046205658207e-06, + "loss": 0.0738, + "step": 1884 + }, + { + "epoch": 0.16, + "grad_norm": 0.5749776765851902, + "learning_rate": 9.571493655761558e-06, + "loss": 0.106, + "step": 1885 + }, + { + "epoch": 0.16, + "grad_norm": 0.44029755218977173, + "learning_rate": 9.570940765353431e-06, + "loss": 0.0966, + "step": 1886 + }, + { + "epoch": 0.16, + "grad_norm": 0.3976276617886006, + "learning_rate": 9.570387534475008e-06, + "loss": 0.1008, + "step": 1887 + }, + { + "epoch": 0.16, + "grad_norm": 0.4744751750152143, + "learning_rate": 9.569833963167495e-06, + "loss": 0.1221, + "step": 1888 + }, + { + "epoch": 0.16, + "grad_norm": 0.22816683516551559, + "learning_rate": 9.569280051472129e-06, + "loss": 0.0689, + "step": 1889 + }, + { + "epoch": 0.16, + "grad_norm": 0.48673119316387864, + "learning_rate": 9.568725799430165e-06, + "loss": 0.0819, + "step": 1890 + }, + { + "epoch": 0.16, + "grad_norm": 0.3699284551314705, + "learning_rate": 9.568171207082891e-06, + "loss": 0.0885, + "step": 1891 + }, + { + "epoch": 0.16, + "grad_norm": 0.22307421615007708, + "learning_rate": 9.567616274471614e-06, + "loss": 0.0524, + "step": 1892 + }, + { + "epoch": 0.16, + "grad_norm": 0.5485866441109828, + "learning_rate": 9.567061001637666e-06, + "loss": 0.1043, + "step": 1893 + }, + { + "epoch": 0.16, + "grad_norm": 0.3356564728823607, + "learning_rate": 9.566505388622412e-06, + "loss": 0.069, + "step": 1894 + }, + { + "epoch": 0.16, + "grad_norm": 0.64209474126418, + "learning_rate": 9.565949435467233e-06, + "loss": 0.1152, + "step": 1895 + }, + { + "epoch": 0.16, + "grad_norm": 0.46191438754969344, + "learning_rate": 9.565393142213543e-06, + "loss": 0.1188, + "step": 1896 + }, + { + "epoch": 0.16, + "grad_norm": 0.46832712514110697, + "learning_rate": 9.564836508902776e-06, + "loss": 0.0937, + "step": 1897 + }, + { + "epoch": 0.16, + "grad_norm": 0.8101153380753563, + "learning_rate": 9.564279535576393e-06, + "loss": 0.1896, + "step": 1898 + }, + { + "epoch": 0.16, + "grad_norm": 0.28854704042852003, + "learning_rate": 9.563722222275882e-06, + "loss": 0.0913, + "step": 1899 + }, + { + "epoch": 0.16, + "grad_norm": 0.23398198139209972, + "learning_rate": 9.563164569042753e-06, + "loss": 0.0511, + "step": 1900 + }, + { + "epoch": 0.16, + "grad_norm": 0.48133097737011343, + "learning_rate": 9.562606575918545e-06, + "loss": 0.0994, + "step": 1901 + }, + { + "epoch": 0.16, + "grad_norm": 0.3519487205930869, + "learning_rate": 9.562048242944822e-06, + "loss": 0.0399, + "step": 1902 + }, + { + "epoch": 0.16, + "grad_norm": 0.31917410698880977, + "learning_rate": 9.561489570163167e-06, + "loss": 0.0576, + "step": 1903 + }, + { + "epoch": 0.16, + "grad_norm": 0.21060618246657206, + "learning_rate": 9.560930557615198e-06, + "loss": 0.0641, + "step": 1904 + }, + { + "epoch": 0.16, + "grad_norm": 0.4187131422650143, + "learning_rate": 9.560371205342552e-06, + "loss": 0.1135, + "step": 1905 + }, + { + "epoch": 0.16, + "grad_norm": 0.39822503095232115, + "learning_rate": 9.559811513386893e-06, + "loss": 0.0961, + "step": 1906 + }, + { + "epoch": 0.16, + "grad_norm": 0.5277395336641175, + "learning_rate": 9.559251481789905e-06, + "loss": 0.0432, + "step": 1907 + }, + { + "epoch": 0.16, + "grad_norm": 0.32860558455906774, + "learning_rate": 9.558691110593312e-06, + "loss": 0.0545, + "step": 1908 + }, + { + "epoch": 0.16, + "grad_norm": 0.3986479527892182, + "learning_rate": 9.558130399838848e-06, + "loss": 0.1151, + "step": 1909 + }, + { + "epoch": 0.16, + "grad_norm": 0.36150840517670046, + "learning_rate": 9.557569349568276e-06, + "loss": 0.0927, + "step": 1910 + }, + { + "epoch": 0.16, + "grad_norm": 0.2997963789081962, + "learning_rate": 9.557007959823393e-06, + "loss": 0.0864, + "step": 1911 + }, + { + "epoch": 0.16, + "grad_norm": 0.37580775723556203, + "learning_rate": 9.556446230646007e-06, + "loss": 0.1115, + "step": 1912 + }, + { + "epoch": 0.16, + "grad_norm": 0.21565114468203345, + "learning_rate": 9.555884162077965e-06, + "loss": 0.0557, + "step": 1913 + }, + { + "epoch": 0.16, + "grad_norm": 0.5952209261857783, + "learning_rate": 9.555321754161128e-06, + "loss": 0.1081, + "step": 1914 + }, + { + "epoch": 0.16, + "grad_norm": 0.4633814326283105, + "learning_rate": 9.554759006937393e-06, + "loss": 0.1039, + "step": 1915 + }, + { + "epoch": 0.16, + "grad_norm": 0.30483419406760776, + "learning_rate": 9.554195920448673e-06, + "loss": 0.0893, + "step": 1916 + }, + { + "epoch": 0.16, + "grad_norm": 0.4260064498029597, + "learning_rate": 9.553632494736908e-06, + "loss": 0.0993, + "step": 1917 + }, + { + "epoch": 0.16, + "grad_norm": 0.3865784602285552, + "learning_rate": 9.553068729844071e-06, + "loss": 0.0992, + "step": 1918 + }, + { + "epoch": 0.16, + "grad_norm": 0.30712239111443296, + "learning_rate": 9.552504625812151e-06, + "loss": 0.0806, + "step": 1919 + }, + { + "epoch": 0.16, + "grad_norm": 0.3061093067852982, + "learning_rate": 9.551940182683165e-06, + "loss": 0.0976, + "step": 1920 + }, + { + "epoch": 0.16, + "grad_norm": 0.4536066329343321, + "learning_rate": 9.55137540049916e-06, + "loss": 0.0881, + "step": 1921 + }, + { + "epoch": 0.16, + "grad_norm": 0.6888589280724848, + "learning_rate": 9.550810279302197e-06, + "loss": 0.0632, + "step": 1922 + }, + { + "epoch": 0.16, + "grad_norm": 0.4881213863047809, + "learning_rate": 9.550244819134376e-06, + "loss": 0.1428, + "step": 1923 + }, + { + "epoch": 0.16, + "grad_norm": 0.4585627732004255, + "learning_rate": 9.549679020037815e-06, + "loss": 0.1209, + "step": 1924 + }, + { + "epoch": 0.16, + "grad_norm": 0.3796390823258864, + "learning_rate": 9.549112882054655e-06, + "loss": 0.0978, + "step": 1925 + }, + { + "epoch": 0.16, + "grad_norm": 0.34617906720967245, + "learning_rate": 9.548546405227068e-06, + "loss": 0.1079, + "step": 1926 + }, + { + "epoch": 0.16, + "grad_norm": 0.3027132669964892, + "learning_rate": 9.547979589597247e-06, + "loss": 0.0877, + "step": 1927 + }, + { + "epoch": 0.16, + "grad_norm": 0.7228397512569966, + "learning_rate": 9.547412435207413e-06, + "loss": 0.0966, + "step": 1928 + }, + { + "epoch": 0.16, + "grad_norm": 0.28552182170189494, + "learning_rate": 9.546844942099809e-06, + "loss": 0.0588, + "step": 1929 + }, + { + "epoch": 0.16, + "grad_norm": 0.40501822776414936, + "learning_rate": 9.546277110316706e-06, + "loss": 0.0948, + "step": 1930 + }, + { + "epoch": 0.16, + "grad_norm": 0.31485542662887833, + "learning_rate": 9.545708939900402e-06, + "loss": 0.0624, + "step": 1931 + }, + { + "epoch": 0.16, + "grad_norm": 0.37669808553268835, + "learning_rate": 9.545140430893215e-06, + "loss": 0.1147, + "step": 1932 + }, + { + "epoch": 0.16, + "grad_norm": 0.4969298973553203, + "learning_rate": 9.54457158333749e-06, + "loss": 0.0838, + "step": 1933 + }, + { + "epoch": 0.16, + "grad_norm": 0.4469266156739036, + "learning_rate": 9.5440023972756e-06, + "loss": 0.1097, + "step": 1934 + }, + { + "epoch": 0.16, + "grad_norm": 0.24151121098947906, + "learning_rate": 9.54343287274994e-06, + "loss": 0.0635, + "step": 1935 + }, + { + "epoch": 0.16, + "grad_norm": 0.40705944128208127, + "learning_rate": 9.542863009802933e-06, + "loss": 0.1104, + "step": 1936 + }, + { + "epoch": 0.16, + "grad_norm": 0.43116492972596615, + "learning_rate": 9.542292808477026e-06, + "loss": 0.0788, + "step": 1937 + }, + { + "epoch": 0.16, + "grad_norm": 0.45539462383988244, + "learning_rate": 9.541722268814687e-06, + "loss": 0.1346, + "step": 1938 + }, + { + "epoch": 0.16, + "grad_norm": 0.3101595091055292, + "learning_rate": 9.541151390858417e-06, + "loss": 0.0832, + "step": 1939 + }, + { + "epoch": 0.16, + "grad_norm": 0.6876254577704088, + "learning_rate": 9.540580174650738e-06, + "loss": 0.1355, + "step": 1940 + }, + { + "epoch": 0.16, + "grad_norm": 0.3665868492527732, + "learning_rate": 9.540008620234197e-06, + "loss": 0.0872, + "step": 1941 + }, + { + "epoch": 0.16, + "grad_norm": 0.29438406757155905, + "learning_rate": 9.539436727651365e-06, + "loss": 0.0869, + "step": 1942 + }, + { + "epoch": 0.16, + "grad_norm": 0.4029844747919658, + "learning_rate": 9.538864496944843e-06, + "loss": 0.0992, + "step": 1943 + }, + { + "epoch": 0.16, + "grad_norm": 0.35002215559217426, + "learning_rate": 9.538291928157252e-06, + "loss": 0.0578, + "step": 1944 + }, + { + "epoch": 0.16, + "grad_norm": 0.37963414284910696, + "learning_rate": 9.53771902133124e-06, + "loss": 0.0761, + "step": 1945 + }, + { + "epoch": 0.16, + "grad_norm": 0.5488268923899522, + "learning_rate": 9.537145776509482e-06, + "loss": 0.1256, + "step": 1946 + }, + { + "epoch": 0.16, + "grad_norm": 0.4027731444881979, + "learning_rate": 9.536572193734675e-06, + "loss": 0.1041, + "step": 1947 + }, + { + "epoch": 0.16, + "grad_norm": 0.5526595994132719, + "learning_rate": 9.535998273049542e-06, + "loss": 0.1167, + "step": 1948 + }, + { + "epoch": 0.16, + "grad_norm": 0.2959930102857174, + "learning_rate": 9.535424014496838e-06, + "loss": 0.0584, + "step": 1949 + }, + { + "epoch": 0.16, + "grad_norm": 0.49781507105261646, + "learning_rate": 9.534849418119328e-06, + "loss": 0.1113, + "step": 1950 + }, + { + "epoch": 0.16, + "grad_norm": 0.2980499875388227, + "learning_rate": 9.534274483959818e-06, + "loss": 0.0812, + "step": 1951 + }, + { + "epoch": 0.16, + "grad_norm": 0.45092725178630183, + "learning_rate": 9.533699212061131e-06, + "loss": 0.1071, + "step": 1952 + }, + { + "epoch": 0.16, + "grad_norm": 0.291848327225128, + "learning_rate": 9.533123602466114e-06, + "loss": 0.0659, + "step": 1953 + }, + { + "epoch": 0.16, + "grad_norm": 0.4958560544525665, + "learning_rate": 9.532547655217645e-06, + "loss": 0.067, + "step": 1954 + }, + { + "epoch": 0.16, + "grad_norm": 0.2990804492458282, + "learning_rate": 9.53197137035862e-06, + "loss": 0.0769, + "step": 1955 + }, + { + "epoch": 0.16, + "grad_norm": 0.43936633789455953, + "learning_rate": 9.531394747931971e-06, + "loss": 0.0711, + "step": 1956 + }, + { + "epoch": 0.16, + "grad_norm": 0.38520052624273077, + "learning_rate": 9.530817787980641e-06, + "loss": 0.1048, + "step": 1957 + }, + { + "epoch": 0.16, + "grad_norm": 0.5124849042811812, + "learning_rate": 9.53024049054761e-06, + "loss": 0.1564, + "step": 1958 + }, + { + "epoch": 0.17, + "grad_norm": 0.2429548537851146, + "learning_rate": 9.529662855675876e-06, + "loss": 0.0577, + "step": 1959 + }, + { + "epoch": 0.17, + "grad_norm": 0.30580016502362695, + "learning_rate": 9.529084883408463e-06, + "loss": 0.0921, + "step": 1960 + }, + { + "epoch": 0.17, + "grad_norm": 0.3281601659052775, + "learning_rate": 9.528506573788428e-06, + "loss": 0.0743, + "step": 1961 + }, + { + "epoch": 0.17, + "grad_norm": 0.37229018071737446, + "learning_rate": 9.52792792685884e-06, + "loss": 0.1052, + "step": 1962 + }, + { + "epoch": 0.17, + "grad_norm": 0.4109827953441356, + "learning_rate": 9.527348942662803e-06, + "loss": 0.0668, + "step": 1963 + }, + { + "epoch": 0.17, + "grad_norm": 0.3011620513730135, + "learning_rate": 9.526769621243444e-06, + "loss": 0.0719, + "step": 1964 + }, + { + "epoch": 0.17, + "grad_norm": 0.4098383268555689, + "learning_rate": 9.526189962643912e-06, + "loss": 0.1168, + "step": 1965 + }, + { + "epoch": 0.17, + "grad_norm": 0.2872030837061024, + "learning_rate": 9.525609966907386e-06, + "loss": 0.0579, + "step": 1966 + }, + { + "epoch": 0.17, + "grad_norm": 0.28330110651646995, + "learning_rate": 9.525029634077067e-06, + "loss": 0.0593, + "step": 1967 + }, + { + "epoch": 0.17, + "grad_norm": 0.341177430945532, + "learning_rate": 9.524448964196179e-06, + "loss": 0.0959, + "step": 1968 + }, + { + "epoch": 0.17, + "grad_norm": 0.4452198477193765, + "learning_rate": 9.523867957307975e-06, + "loss": 0.1281, + "step": 1969 + }, + { + "epoch": 0.17, + "grad_norm": 0.20941745653167596, + "learning_rate": 9.523286613455732e-06, + "loss": 0.0541, + "step": 1970 + }, + { + "epoch": 0.17, + "grad_norm": 0.3149012028116164, + "learning_rate": 9.522704932682753e-06, + "loss": 0.0942, + "step": 1971 + }, + { + "epoch": 0.17, + "grad_norm": 0.415166406816186, + "learning_rate": 9.522122915032364e-06, + "loss": 0.0888, + "step": 1972 + }, + { + "epoch": 0.17, + "grad_norm": 0.4143181662701593, + "learning_rate": 9.521540560547915e-06, + "loss": 0.1015, + "step": 1973 + }, + { + "epoch": 0.17, + "grad_norm": 0.6426815928836875, + "learning_rate": 9.520957869272786e-06, + "loss": 0.1438, + "step": 1974 + }, + { + "epoch": 0.17, + "grad_norm": 0.4888716888608612, + "learning_rate": 9.520374841250379e-06, + "loss": 0.1236, + "step": 1975 + }, + { + "epoch": 0.17, + "grad_norm": 0.3896014096527798, + "learning_rate": 9.519791476524121e-06, + "loss": 0.091, + "step": 1976 + }, + { + "epoch": 0.17, + "grad_norm": 0.6752570099183843, + "learning_rate": 9.519207775137463e-06, + "loss": 0.1422, + "step": 1977 + }, + { + "epoch": 0.17, + "grad_norm": 0.4218827812769126, + "learning_rate": 9.518623737133885e-06, + "loss": 0.0929, + "step": 1978 + }, + { + "epoch": 0.17, + "grad_norm": 0.2577582911623791, + "learning_rate": 9.518039362556887e-06, + "loss": 0.0536, + "step": 1979 + }, + { + "epoch": 0.17, + "grad_norm": 0.40823862900848484, + "learning_rate": 9.517454651449998e-06, + "loss": 0.0575, + "step": 1980 + }, + { + "epoch": 0.17, + "grad_norm": 0.32141817254171867, + "learning_rate": 9.516869603856772e-06, + "loss": 0.106, + "step": 1981 + }, + { + "epoch": 0.17, + "grad_norm": 0.23420996589350715, + "learning_rate": 9.516284219820781e-06, + "loss": 0.0682, + "step": 1982 + }, + { + "epoch": 0.17, + "grad_norm": 0.46985457661293323, + "learning_rate": 9.515698499385635e-06, + "loss": 0.1229, + "step": 1983 + }, + { + "epoch": 0.17, + "grad_norm": 0.6751457525658184, + "learning_rate": 9.51511244259496e-06, + "loss": 0.0951, + "step": 1984 + }, + { + "epoch": 0.17, + "grad_norm": 0.23594047422484615, + "learning_rate": 9.514526049492406e-06, + "loss": 0.0588, + "step": 1985 + }, + { + "epoch": 0.17, + "grad_norm": 0.25525652640834906, + "learning_rate": 9.513939320121653e-06, + "loss": 0.0499, + "step": 1986 + }, + { + "epoch": 0.17, + "grad_norm": 0.3048005503828047, + "learning_rate": 9.513352254526404e-06, + "loss": 0.0729, + "step": 1987 + }, + { + "epoch": 0.17, + "grad_norm": 0.41088215119855725, + "learning_rate": 9.512764852750387e-06, + "loss": 0.074, + "step": 1988 + }, + { + "epoch": 0.17, + "grad_norm": 0.41182032987326145, + "learning_rate": 9.512177114837355e-06, + "loss": 0.0938, + "step": 1989 + }, + { + "epoch": 0.17, + "grad_norm": 0.4364992103322481, + "learning_rate": 9.511589040831085e-06, + "loss": 0.1263, + "step": 1990 + }, + { + "epoch": 0.17, + "grad_norm": 0.6001211133422241, + "learning_rate": 9.511000630775382e-06, + "loss": 0.1272, + "step": 1991 + }, + { + "epoch": 0.17, + "grad_norm": 0.38902850008709944, + "learning_rate": 9.510411884714073e-06, + "loss": 0.0937, + "step": 1992 + }, + { + "epoch": 0.17, + "grad_norm": 0.5857018855959341, + "learning_rate": 9.509822802691013e-06, + "loss": 0.118, + "step": 1993 + }, + { + "epoch": 0.17, + "grad_norm": 0.45607906993765923, + "learning_rate": 9.509233384750079e-06, + "loss": 0.0999, + "step": 1994 + }, + { + "epoch": 0.17, + "grad_norm": 0.5838549497176535, + "learning_rate": 9.508643630935173e-06, + "loss": 0.1019, + "step": 1995 + }, + { + "epoch": 0.17, + "grad_norm": 0.43132563951415875, + "learning_rate": 9.508053541290226e-06, + "loss": 0.1023, + "step": 1996 + }, + { + "epoch": 0.17, + "grad_norm": 0.25768875678564085, + "learning_rate": 9.507463115859188e-06, + "loss": 0.0578, + "step": 1997 + }, + { + "epoch": 0.17, + "grad_norm": 0.4652198591159032, + "learning_rate": 9.506872354686041e-06, + "loss": 0.134, + "step": 1998 + }, + { + "epoch": 0.17, + "grad_norm": 0.3258001476598917, + "learning_rate": 9.506281257814786e-06, + "loss": 0.1066, + "step": 1999 + }, + { + "epoch": 0.17, + "grad_norm": 0.6777966835575625, + "learning_rate": 9.505689825289452e-06, + "loss": 0.1234, + "step": 2000 + }, + { + "epoch": 0.17, + "grad_norm": 0.5419415946372411, + "learning_rate": 9.505098057154091e-06, + "loss": 0.0992, + "step": 2001 + }, + { + "epoch": 0.17, + "grad_norm": 0.4182726820048844, + "learning_rate": 9.504505953452783e-06, + "loss": 0.1349, + "step": 2002 + }, + { + "epoch": 0.17, + "grad_norm": 0.2761378143920018, + "learning_rate": 9.503913514229633e-06, + "loss": 0.0878, + "step": 2003 + }, + { + "epoch": 0.17, + "grad_norm": 0.35197591167277326, + "learning_rate": 9.503320739528765e-06, + "loss": 0.082, + "step": 2004 + }, + { + "epoch": 0.17, + "grad_norm": 0.9825059425492416, + "learning_rate": 9.502727629394335e-06, + "loss": 0.0923, + "step": 2005 + }, + { + "epoch": 0.17, + "grad_norm": 0.34063089561454185, + "learning_rate": 9.502134183870522e-06, + "loss": 0.0879, + "step": 2006 + }, + { + "epoch": 0.17, + "grad_norm": 0.3159521140823055, + "learning_rate": 9.501540403001527e-06, + "loss": 0.1019, + "step": 2007 + }, + { + "epoch": 0.17, + "grad_norm": 0.44742738624693507, + "learning_rate": 9.50094628683158e-06, + "loss": 0.1381, + "step": 2008 + }, + { + "epoch": 0.17, + "grad_norm": 0.258281507717571, + "learning_rate": 9.500351835404932e-06, + "loss": 0.0893, + "step": 2009 + }, + { + "epoch": 0.17, + "grad_norm": 0.35100134046858567, + "learning_rate": 9.499757048765864e-06, + "loss": 0.1014, + "step": 2010 + }, + { + "epoch": 0.17, + "grad_norm": 0.5145820463458471, + "learning_rate": 9.499161926958678e-06, + "loss": 0.0937, + "step": 2011 + }, + { + "epoch": 0.17, + "grad_norm": 0.38285465738833085, + "learning_rate": 9.498566470027703e-06, + "loss": 0.0538, + "step": 2012 + }, + { + "epoch": 0.17, + "grad_norm": 0.16053243785459514, + "learning_rate": 9.49797067801729e-06, + "loss": 0.0232, + "step": 2013 + }, + { + "epoch": 0.17, + "grad_norm": 0.27437296355152263, + "learning_rate": 9.497374550971819e-06, + "loss": 0.0924, + "step": 2014 + }, + { + "epoch": 0.17, + "grad_norm": 0.2615857204563584, + "learning_rate": 9.496778088935691e-06, + "loss": 0.0762, + "step": 2015 + }, + { + "epoch": 0.17, + "grad_norm": 0.6125348829553893, + "learning_rate": 9.496181291953336e-06, + "loss": 0.1113, + "step": 2016 + }, + { + "epoch": 0.17, + "grad_norm": 0.6633824834058893, + "learning_rate": 9.495584160069207e-06, + "loss": 0.0922, + "step": 2017 + }, + { + "epoch": 0.17, + "grad_norm": 0.3447150786286894, + "learning_rate": 9.49498669332778e-06, + "loss": 0.0738, + "step": 2018 + }, + { + "epoch": 0.17, + "grad_norm": 0.28013945144973423, + "learning_rate": 9.49438889177356e-06, + "loss": 0.0846, + "step": 2019 + }, + { + "epoch": 0.17, + "grad_norm": 0.2814589067238763, + "learning_rate": 9.493790755451072e-06, + "loss": 0.1006, + "step": 2020 + }, + { + "epoch": 0.17, + "grad_norm": 0.3037464454284003, + "learning_rate": 9.49319228440487e-06, + "loss": 0.1064, + "step": 2021 + }, + { + "epoch": 0.17, + "grad_norm": 0.35575872226096344, + "learning_rate": 9.492593478679535e-06, + "loss": 0.1044, + "step": 2022 + }, + { + "epoch": 0.17, + "grad_norm": 0.7363800305196193, + "learning_rate": 9.491994338319664e-06, + "loss": 0.1356, + "step": 2023 + }, + { + "epoch": 0.17, + "grad_norm": 0.27989258563291874, + "learning_rate": 9.491394863369887e-06, + "loss": 0.093, + "step": 2024 + }, + { + "epoch": 0.17, + "grad_norm": 0.35861683058009075, + "learning_rate": 9.490795053874857e-06, + "loss": 0.089, + "step": 2025 + }, + { + "epoch": 0.17, + "grad_norm": 0.30018812424838803, + "learning_rate": 9.49019490987925e-06, + "loss": 0.102, + "step": 2026 + }, + { + "epoch": 0.17, + "grad_norm": 0.4750617248573952, + "learning_rate": 9.489594431427768e-06, + "loss": 0.096, + "step": 2027 + }, + { + "epoch": 0.17, + "grad_norm": 0.4770863256516558, + "learning_rate": 9.48899361856514e-06, + "loss": 0.0759, + "step": 2028 + }, + { + "epoch": 0.17, + "grad_norm": 0.45807891432183784, + "learning_rate": 9.488392471336117e-06, + "loss": 0.0962, + "step": 2029 + }, + { + "epoch": 0.17, + "grad_norm": 0.35706347253363035, + "learning_rate": 9.487790989785475e-06, + "loss": 0.1141, + "step": 2030 + }, + { + "epoch": 0.17, + "grad_norm": 0.44544491838767303, + "learning_rate": 9.487189173958019e-06, + "loss": 0.103, + "step": 2031 + }, + { + "epoch": 0.17, + "grad_norm": 0.30005473159971874, + "learning_rate": 9.486587023898571e-06, + "loss": 0.0504, + "step": 2032 + }, + { + "epoch": 0.17, + "grad_norm": 0.5108988354208385, + "learning_rate": 9.485984539651987e-06, + "loss": 0.1225, + "step": 2033 + }, + { + "epoch": 0.17, + "grad_norm": 0.6548942786896536, + "learning_rate": 9.485381721263141e-06, + "loss": 0.118, + "step": 2034 + }, + { + "epoch": 0.17, + "grad_norm": 0.5959468979905713, + "learning_rate": 9.484778568776933e-06, + "loss": 0.1223, + "step": 2035 + }, + { + "epoch": 0.17, + "grad_norm": 0.4989012562735155, + "learning_rate": 9.484175082238293e-06, + "loss": 0.104, + "step": 2036 + }, + { + "epoch": 0.17, + "grad_norm": 0.2912968393800039, + "learning_rate": 9.483571261692173e-06, + "loss": 0.087, + "step": 2037 + }, + { + "epoch": 0.17, + "grad_norm": 0.45542945615330743, + "learning_rate": 9.482967107183544e-06, + "loss": 0.1079, + "step": 2038 + }, + { + "epoch": 0.17, + "grad_norm": 0.5814941483700978, + "learning_rate": 9.48236261875741e-06, + "loss": 0.1483, + "step": 2039 + }, + { + "epoch": 0.17, + "grad_norm": 0.4678829560793343, + "learning_rate": 9.481757796458797e-06, + "loss": 0.1311, + "step": 2040 + }, + { + "epoch": 0.17, + "grad_norm": 0.2943329225561303, + "learning_rate": 9.481152640332756e-06, + "loss": 0.1014, + "step": 2041 + }, + { + "epoch": 0.17, + "grad_norm": 0.41385069370464994, + "learning_rate": 9.48054715042436e-06, + "loss": 0.1079, + "step": 2042 + }, + { + "epoch": 0.17, + "grad_norm": 0.332222148353211, + "learning_rate": 9.479941326778712e-06, + "loss": 0.1055, + "step": 2043 + }, + { + "epoch": 0.17, + "grad_norm": 0.22114445308384245, + "learning_rate": 9.479335169440935e-06, + "loss": 0.0596, + "step": 2044 + }, + { + "epoch": 0.17, + "grad_norm": 0.30042555756992606, + "learning_rate": 9.478728678456182e-06, + "loss": 0.0421, + "step": 2045 + }, + { + "epoch": 0.17, + "grad_norm": 0.30598510548256924, + "learning_rate": 9.478121853869627e-06, + "loss": 0.0869, + "step": 2046 + }, + { + "epoch": 0.17, + "grad_norm": 0.5299024892029863, + "learning_rate": 9.477514695726468e-06, + "loss": 0.1297, + "step": 2047 + }, + { + "epoch": 0.17, + "grad_norm": 0.4489318886945348, + "learning_rate": 9.476907204071933e-06, + "loss": 0.0723, + "step": 2048 + }, + { + "epoch": 0.17, + "grad_norm": 0.4075155467965644, + "learning_rate": 9.476299378951267e-06, + "loss": 0.0857, + "step": 2049 + }, + { + "epoch": 0.17, + "grad_norm": 0.4863623130534948, + "learning_rate": 9.475691220409748e-06, + "loss": 0.1105, + "step": 2050 + }, + { + "epoch": 0.17, + "grad_norm": 0.5097727038303006, + "learning_rate": 9.475082728492674e-06, + "loss": 0.1235, + "step": 2051 + }, + { + "epoch": 0.17, + "grad_norm": 0.25405081088878334, + "learning_rate": 9.474473903245369e-06, + "loss": 0.0473, + "step": 2052 + }, + { + "epoch": 0.17, + "grad_norm": 0.5517507443911186, + "learning_rate": 9.473864744713182e-06, + "loss": 0.1069, + "step": 2053 + }, + { + "epoch": 0.17, + "grad_norm": 0.31697606388279304, + "learning_rate": 9.473255252941487e-06, + "loss": 0.0984, + "step": 2054 + }, + { + "epoch": 0.17, + "grad_norm": 0.4626150704732813, + "learning_rate": 9.472645427975681e-06, + "loss": 0.101, + "step": 2055 + }, + { + "epoch": 0.17, + "grad_norm": 0.3303079832175486, + "learning_rate": 9.472035269861191e-06, + "loss": 0.0878, + "step": 2056 + }, + { + "epoch": 0.17, + "grad_norm": 0.21718309932001578, + "learning_rate": 9.471424778643459e-06, + "loss": 0.0563, + "step": 2057 + }, + { + "epoch": 0.17, + "grad_norm": 0.34375817554153326, + "learning_rate": 9.470813954367964e-06, + "loss": 0.0607, + "step": 2058 + }, + { + "epoch": 0.17, + "grad_norm": 0.3203867115955599, + "learning_rate": 9.4702027970802e-06, + "loss": 0.0869, + "step": 2059 + }, + { + "epoch": 0.17, + "grad_norm": 0.39817440859084197, + "learning_rate": 9.469591306825691e-06, + "loss": 0.1103, + "step": 2060 + }, + { + "epoch": 0.17, + "grad_norm": 0.5859653342600838, + "learning_rate": 9.468979483649987e-06, + "loss": 0.1542, + "step": 2061 + }, + { + "epoch": 0.17, + "grad_norm": 0.4474831365686752, + "learning_rate": 9.468367327598653e-06, + "loss": 0.1067, + "step": 2062 + }, + { + "epoch": 0.17, + "grad_norm": 0.5588590704569549, + "learning_rate": 9.467754838717293e-06, + "loss": 0.0693, + "step": 2063 + }, + { + "epoch": 0.17, + "grad_norm": 0.29277718031774563, + "learning_rate": 9.467142017051525e-06, + "loss": 0.063, + "step": 2064 + }, + { + "epoch": 0.17, + "grad_norm": 0.4096378324409489, + "learning_rate": 9.466528862646998e-06, + "loss": 0.1117, + "step": 2065 + }, + { + "epoch": 0.17, + "grad_norm": 0.3556425694245333, + "learning_rate": 9.465915375549379e-06, + "loss": 0.0849, + "step": 2066 + }, + { + "epoch": 0.17, + "grad_norm": 0.3943073867480224, + "learning_rate": 9.46530155580437e-06, + "loss": 0.0856, + "step": 2067 + }, + { + "epoch": 0.17, + "grad_norm": 0.32787111926269585, + "learning_rate": 9.464687403457687e-06, + "loss": 0.0907, + "step": 2068 + }, + { + "epoch": 0.17, + "grad_norm": 0.3656075721271882, + "learning_rate": 9.464072918555078e-06, + "loss": 0.1105, + "step": 2069 + }, + { + "epoch": 0.17, + "grad_norm": 0.40992379207978313, + "learning_rate": 9.463458101142312e-06, + "loss": 0.0675, + "step": 2070 + }, + { + "epoch": 0.17, + "grad_norm": 0.32100171900447483, + "learning_rate": 9.462842951265185e-06, + "loss": 0.0951, + "step": 2071 + }, + { + "epoch": 0.17, + "grad_norm": 0.3439146031528608, + "learning_rate": 9.462227468969518e-06, + "loss": 0.0632, + "step": 2072 + }, + { + "epoch": 0.17, + "grad_norm": 0.4536344153049165, + "learning_rate": 9.461611654301155e-06, + "loss": 0.1111, + "step": 2073 + }, + { + "epoch": 0.17, + "grad_norm": 0.3579289727457569, + "learning_rate": 9.460995507305965e-06, + "loss": 0.1062, + "step": 2074 + }, + { + "epoch": 0.17, + "grad_norm": 0.30603795230627223, + "learning_rate": 9.460379028029842e-06, + "loss": 0.1014, + "step": 2075 + }, + { + "epoch": 0.17, + "grad_norm": 0.24227467318400694, + "learning_rate": 9.459762216518706e-06, + "loss": 0.0597, + "step": 2076 + }, + { + "epoch": 0.18, + "grad_norm": 0.302783728025485, + "learning_rate": 9.459145072818498e-06, + "loss": 0.1231, + "step": 2077 + }, + { + "epoch": 0.18, + "grad_norm": 0.37431265541089237, + "learning_rate": 9.45852759697519e-06, + "loss": 0.0848, + "step": 2078 + }, + { + "epoch": 0.18, + "grad_norm": 0.3184615399757154, + "learning_rate": 9.457909789034774e-06, + "loss": 0.0909, + "step": 2079 + }, + { + "epoch": 0.18, + "grad_norm": 0.4409040223910068, + "learning_rate": 9.457291649043267e-06, + "loss": 0.1108, + "step": 2080 + }, + { + "epoch": 0.18, + "grad_norm": 0.4339535678649829, + "learning_rate": 9.456673177046713e-06, + "loss": 0.1097, + "step": 2081 + }, + { + "epoch": 0.18, + "grad_norm": 0.3612414208551163, + "learning_rate": 9.45605437309118e-06, + "loss": 0.0632, + "step": 2082 + }, + { + "epoch": 0.18, + "grad_norm": 0.3460709312038517, + "learning_rate": 9.455435237222756e-06, + "loss": 0.0821, + "step": 2083 + }, + { + "epoch": 0.18, + "grad_norm": 0.2643084175150873, + "learning_rate": 9.454815769487563e-06, + "loss": 0.0527, + "step": 2084 + }, + { + "epoch": 0.18, + "grad_norm": 0.20437615328576653, + "learning_rate": 9.454195969931739e-06, + "loss": 0.0323, + "step": 2085 + }, + { + "epoch": 0.18, + "grad_norm": 0.5603073807863361, + "learning_rate": 9.453575838601451e-06, + "loss": 0.1217, + "step": 2086 + }, + { + "epoch": 0.18, + "grad_norm": 0.32887009037263376, + "learning_rate": 9.452955375542893e-06, + "loss": 0.0879, + "step": 2087 + }, + { + "epoch": 0.18, + "grad_norm": 0.23916719362578137, + "learning_rate": 9.452334580802276e-06, + "loss": 0.0864, + "step": 2088 + }, + { + "epoch": 0.18, + "grad_norm": 1.104995178205813, + "learning_rate": 9.451713454425845e-06, + "loss": 0.0859, + "step": 2089 + }, + { + "epoch": 0.18, + "grad_norm": 0.4746752900559491, + "learning_rate": 9.45109199645986e-06, + "loss": 0.1314, + "step": 2090 + }, + { + "epoch": 0.18, + "grad_norm": 0.5210535115740903, + "learning_rate": 9.450470206950615e-06, + "loss": 0.0816, + "step": 2091 + }, + { + "epoch": 0.18, + "grad_norm": 0.30579030805112756, + "learning_rate": 9.449848085944422e-06, + "loss": 0.0983, + "step": 2092 + }, + { + "epoch": 0.18, + "grad_norm": 0.3743530382937883, + "learning_rate": 9.449225633487623e-06, + "loss": 0.1004, + "step": 2093 + }, + { + "epoch": 0.18, + "grad_norm": 0.2927241158333802, + "learning_rate": 9.44860284962658e-06, + "loss": 0.0851, + "step": 2094 + }, + { + "epoch": 0.18, + "grad_norm": 0.3093823101893831, + "learning_rate": 9.44797973440768e-06, + "loss": 0.049, + "step": 2095 + }, + { + "epoch": 0.18, + "grad_norm": 0.22866188925550068, + "learning_rate": 9.447356287877339e-06, + "loss": 0.0446, + "step": 2096 + }, + { + "epoch": 0.18, + "grad_norm": 0.35844488092630356, + "learning_rate": 9.446732510081992e-06, + "loss": 0.1044, + "step": 2097 + }, + { + "epoch": 0.18, + "grad_norm": 0.5698069975196619, + "learning_rate": 9.446108401068108e-06, + "loss": 0.1238, + "step": 2098 + }, + { + "epoch": 0.18, + "grad_norm": 0.3582024936492228, + "learning_rate": 9.445483960882168e-06, + "loss": 0.0888, + "step": 2099 + }, + { + "epoch": 0.18, + "grad_norm": 0.6982166612907237, + "learning_rate": 9.444859189570683e-06, + "loss": 0.1523, + "step": 2100 + }, + { + "epoch": 0.18, + "grad_norm": 0.2509513781254003, + "learning_rate": 9.444234087180195e-06, + "loss": 0.0559, + "step": 2101 + }, + { + "epoch": 0.18, + "grad_norm": 0.5946769788155676, + "learning_rate": 9.443608653757261e-06, + "loss": 0.099, + "step": 2102 + }, + { + "epoch": 0.18, + "grad_norm": 0.3495230953465962, + "learning_rate": 9.442982889348469e-06, + "loss": 0.1061, + "step": 2103 + }, + { + "epoch": 0.18, + "grad_norm": 0.5810342216470176, + "learning_rate": 9.44235679400043e-06, + "loss": 0.1336, + "step": 2104 + }, + { + "epoch": 0.18, + "grad_norm": 0.3876650337333317, + "learning_rate": 9.441730367759778e-06, + "loss": 0.0846, + "step": 2105 + }, + { + "epoch": 0.18, + "grad_norm": 0.3343822791123102, + "learning_rate": 9.441103610673172e-06, + "loss": 0.0916, + "step": 2106 + }, + { + "epoch": 0.18, + "grad_norm": 0.36740235234976004, + "learning_rate": 9.4404765227873e-06, + "loss": 0.0378, + "step": 2107 + }, + { + "epoch": 0.18, + "grad_norm": 0.2880713716427908, + "learning_rate": 9.439849104148869e-06, + "loss": 0.0688, + "step": 2108 + }, + { + "epoch": 0.18, + "grad_norm": 0.2930059157292598, + "learning_rate": 9.439221354804609e-06, + "loss": 0.062, + "step": 2109 + }, + { + "epoch": 0.18, + "grad_norm": 0.4194614822083716, + "learning_rate": 9.438593274801285e-06, + "loss": 0.078, + "step": 2110 + }, + { + "epoch": 0.18, + "grad_norm": 0.28625071666043683, + "learning_rate": 9.437964864185676e-06, + "loss": 0.0249, + "step": 2111 + }, + { + "epoch": 0.18, + "grad_norm": 0.27928142027751585, + "learning_rate": 9.437336123004592e-06, + "loss": 0.0808, + "step": 2112 + }, + { + "epoch": 0.18, + "grad_norm": 0.4901406575441948, + "learning_rate": 9.436707051304865e-06, + "loss": 0.1345, + "step": 2113 + }, + { + "epoch": 0.18, + "grad_norm": 0.28790817624002923, + "learning_rate": 9.436077649133348e-06, + "loss": 0.0712, + "step": 2114 + }, + { + "epoch": 0.18, + "grad_norm": 0.5233839900416991, + "learning_rate": 9.435447916536928e-06, + "loss": 0.1125, + "step": 2115 + }, + { + "epoch": 0.18, + "grad_norm": 1.580184026875001, + "learning_rate": 9.434817853562507e-06, + "loss": 0.1174, + "step": 2116 + }, + { + "epoch": 0.18, + "grad_norm": 0.28586132004515585, + "learning_rate": 9.43418746025702e-06, + "loss": 0.0971, + "step": 2117 + }, + { + "epoch": 0.18, + "grad_norm": 0.4285955082745664, + "learning_rate": 9.433556736667419e-06, + "loss": 0.1141, + "step": 2118 + }, + { + "epoch": 0.18, + "grad_norm": 0.32944843354612124, + "learning_rate": 9.432925682840685e-06, + "loss": 0.0683, + "step": 2119 + }, + { + "epoch": 0.18, + "grad_norm": 0.5606661565661613, + "learning_rate": 9.432294298823821e-06, + "loss": 0.1292, + "step": 2120 + }, + { + "epoch": 0.18, + "grad_norm": 0.4655470066377842, + "learning_rate": 9.43166258466386e-06, + "loss": 0.0761, + "step": 2121 + }, + { + "epoch": 0.18, + "grad_norm": 0.40724900851727097, + "learning_rate": 9.43103054040785e-06, + "loss": 0.0681, + "step": 2122 + }, + { + "epoch": 0.18, + "grad_norm": 0.5439863568660173, + "learning_rate": 9.430398166102875e-06, + "loss": 0.1128, + "step": 2123 + }, + { + "epoch": 0.18, + "grad_norm": 0.40971313338262033, + "learning_rate": 9.429765461796037e-06, + "loss": 0.0997, + "step": 2124 + }, + { + "epoch": 0.18, + "grad_norm": 0.46087660431247257, + "learning_rate": 9.42913242753446e-06, + "loss": 0.1073, + "step": 2125 + }, + { + "epoch": 0.18, + "grad_norm": 0.2738214807812013, + "learning_rate": 9.428499063365297e-06, + "loss": 0.0545, + "step": 2126 + }, + { + "epoch": 0.18, + "grad_norm": 0.427560980158112, + "learning_rate": 9.427865369335727e-06, + "loss": 0.1062, + "step": 2127 + }, + { + "epoch": 0.18, + "grad_norm": 0.5297915894354662, + "learning_rate": 9.42723134549295e-06, + "loss": 0.1403, + "step": 2128 + }, + { + "epoch": 0.18, + "grad_norm": 0.3145747221826406, + "learning_rate": 9.426596991884193e-06, + "loss": 0.0806, + "step": 2129 + }, + { + "epoch": 0.18, + "grad_norm": 0.4041049964939, + "learning_rate": 9.425962308556705e-06, + "loss": 0.128, + "step": 2130 + }, + { + "epoch": 0.18, + "grad_norm": 0.4590460480746908, + "learning_rate": 9.42532729555776e-06, + "loss": 0.1096, + "step": 2131 + }, + { + "epoch": 0.18, + "grad_norm": 0.3698802182995215, + "learning_rate": 9.42469195293466e-06, + "loss": 0.0737, + "step": 2132 + }, + { + "epoch": 0.18, + "grad_norm": 0.31609869509130495, + "learning_rate": 9.424056280734726e-06, + "loss": 0.1015, + "step": 2133 + }, + { + "epoch": 0.18, + "grad_norm": 0.31281062904517365, + "learning_rate": 9.423420279005309e-06, + "loss": 0.0427, + "step": 2134 + }, + { + "epoch": 0.18, + "grad_norm": 0.3358012374865554, + "learning_rate": 9.422783947793782e-06, + "loss": 0.0535, + "step": 2135 + }, + { + "epoch": 0.18, + "grad_norm": 0.3448611260191238, + "learning_rate": 9.422147287147541e-06, + "loss": 0.1163, + "step": 2136 + }, + { + "epoch": 0.18, + "grad_norm": 0.2690747638099483, + "learning_rate": 9.42151029711401e-06, + "loss": 0.0481, + "step": 2137 + }, + { + "epoch": 0.18, + "grad_norm": 0.29342023990224186, + "learning_rate": 9.420872977740634e-06, + "loss": 0.0729, + "step": 2138 + }, + { + "epoch": 0.18, + "grad_norm": 0.41884751050214913, + "learning_rate": 9.420235329074884e-06, + "loss": 0.097, + "step": 2139 + }, + { + "epoch": 0.18, + "grad_norm": 0.6974565456357624, + "learning_rate": 9.41959735116426e-06, + "loss": 0.0754, + "step": 2140 + }, + { + "epoch": 0.18, + "grad_norm": 0.4781188006590574, + "learning_rate": 9.418959044056278e-06, + "loss": 0.0984, + "step": 2141 + }, + { + "epoch": 0.18, + "grad_norm": 0.3830671343176985, + "learning_rate": 9.418320407798482e-06, + "loss": 0.1028, + "step": 2142 + }, + { + "epoch": 0.18, + "grad_norm": 0.5419452405182396, + "learning_rate": 9.417681442438445e-06, + "loss": 0.1215, + "step": 2143 + }, + { + "epoch": 0.18, + "grad_norm": 0.37285566721286717, + "learning_rate": 9.41704214802376e-06, + "loss": 0.115, + "step": 2144 + }, + { + "epoch": 0.18, + "grad_norm": 0.29797407974493945, + "learning_rate": 9.416402524602044e-06, + "loss": 0.0718, + "step": 2145 + }, + { + "epoch": 0.18, + "grad_norm": 0.7029191866597169, + "learning_rate": 9.41576257222094e-06, + "loss": 0.1338, + "step": 2146 + }, + { + "epoch": 0.18, + "grad_norm": 0.45343169034441955, + "learning_rate": 9.415122290928115e-06, + "loss": 0.1219, + "step": 2147 + }, + { + "epoch": 0.18, + "grad_norm": 0.2935701162139371, + "learning_rate": 9.414481680771265e-06, + "loss": 0.0743, + "step": 2148 + }, + { + "epoch": 0.18, + "grad_norm": 0.5127436577194381, + "learning_rate": 9.413840741798101e-06, + "loss": 0.0998, + "step": 2149 + }, + { + "epoch": 0.18, + "grad_norm": 0.35915615424616426, + "learning_rate": 9.413199474056365e-06, + "loss": 0.0828, + "step": 2150 + }, + { + "epoch": 0.18, + "grad_norm": 0.30251352534766296, + "learning_rate": 9.412557877593826e-06, + "loss": 0.0759, + "step": 2151 + }, + { + "epoch": 0.18, + "grad_norm": 0.33125740737477705, + "learning_rate": 9.411915952458271e-06, + "loss": 0.084, + "step": 2152 + }, + { + "epoch": 0.18, + "grad_norm": 0.22244992737697733, + "learning_rate": 9.411273698697512e-06, + "loss": 0.0474, + "step": 2153 + }, + { + "epoch": 0.18, + "grad_norm": 0.26695825821277774, + "learning_rate": 9.410631116359392e-06, + "loss": 0.0688, + "step": 2154 + }, + { + "epoch": 0.18, + "grad_norm": 0.427976731321004, + "learning_rate": 9.409988205491772e-06, + "loss": 0.1381, + "step": 2155 + }, + { + "epoch": 0.18, + "grad_norm": 0.5136649457115914, + "learning_rate": 9.409344966142541e-06, + "loss": 0.1049, + "step": 2156 + }, + { + "epoch": 0.18, + "grad_norm": 0.40478344377720044, + "learning_rate": 9.408701398359612e-06, + "loss": 0.0718, + "step": 2157 + }, + { + "epoch": 0.18, + "grad_norm": 0.34759535441435085, + "learning_rate": 9.408057502190918e-06, + "loss": 0.0793, + "step": 2158 + }, + { + "epoch": 0.18, + "grad_norm": 0.5597623665737352, + "learning_rate": 9.407413277684423e-06, + "loss": 0.1169, + "step": 2159 + }, + { + "epoch": 0.18, + "grad_norm": 0.5577336513181574, + "learning_rate": 9.406768724888111e-06, + "loss": 0.0878, + "step": 2160 + }, + { + "epoch": 0.18, + "grad_norm": 0.3094706847580244, + "learning_rate": 9.406123843849995e-06, + "loss": 0.0789, + "step": 2161 + }, + { + "epoch": 0.18, + "grad_norm": 0.365899021360758, + "learning_rate": 9.405478634618106e-06, + "loss": 0.0844, + "step": 2162 + }, + { + "epoch": 0.18, + "grad_norm": 0.318880065901717, + "learning_rate": 9.404833097240506e-06, + "loss": 0.0936, + "step": 2163 + }, + { + "epoch": 0.18, + "grad_norm": 0.22888144323069895, + "learning_rate": 9.404187231765275e-06, + "loss": 0.0501, + "step": 2164 + }, + { + "epoch": 0.18, + "grad_norm": 0.35293310111611703, + "learning_rate": 9.403541038240525e-06, + "loss": 0.1079, + "step": 2165 + }, + { + "epoch": 0.18, + "grad_norm": 0.532189552654461, + "learning_rate": 9.402894516714384e-06, + "loss": 0.1398, + "step": 2166 + }, + { + "epoch": 0.18, + "grad_norm": 0.3490138081760511, + "learning_rate": 9.402247667235012e-06, + "loss": 0.0966, + "step": 2167 + }, + { + "epoch": 0.18, + "grad_norm": 0.4630106728842395, + "learning_rate": 9.401600489850587e-06, + "loss": 0.1324, + "step": 2168 + }, + { + "epoch": 0.18, + "grad_norm": 0.28697340323020426, + "learning_rate": 9.400952984609317e-06, + "loss": 0.0561, + "step": 2169 + }, + { + "epoch": 0.18, + "grad_norm": 0.24536692584360947, + "learning_rate": 9.400305151559432e-06, + "loss": 0.076, + "step": 2170 + }, + { + "epoch": 0.18, + "grad_norm": 0.27777638254115217, + "learning_rate": 9.399656990749185e-06, + "loss": 0.037, + "step": 2171 + }, + { + "epoch": 0.18, + "grad_norm": 0.405952845587326, + "learning_rate": 9.399008502226856e-06, + "loss": 0.0888, + "step": 2172 + }, + { + "epoch": 0.18, + "grad_norm": 0.6072264127481234, + "learning_rate": 9.398359686040748e-06, + "loss": 0.1065, + "step": 2173 + }, + { + "epoch": 0.18, + "grad_norm": 0.39351576737123956, + "learning_rate": 9.397710542239187e-06, + "loss": 0.0923, + "step": 2174 + }, + { + "epoch": 0.18, + "grad_norm": 0.3863214545580942, + "learning_rate": 9.397061070870525e-06, + "loss": 0.0926, + "step": 2175 + }, + { + "epoch": 0.18, + "grad_norm": 0.5129355088658487, + "learning_rate": 9.396411271983144e-06, + "loss": 0.1199, + "step": 2176 + }, + { + "epoch": 0.18, + "grad_norm": 0.31038248872915813, + "learning_rate": 9.395761145625437e-06, + "loss": 0.0971, + "step": 2177 + }, + { + "epoch": 0.18, + "grad_norm": 0.38879340596097023, + "learning_rate": 9.395110691845834e-06, + "loss": 0.0687, + "step": 2178 + }, + { + "epoch": 0.18, + "grad_norm": 0.27890422321529085, + "learning_rate": 9.394459910692783e-06, + "loss": 0.0619, + "step": 2179 + }, + { + "epoch": 0.18, + "grad_norm": 0.5342788844276241, + "learning_rate": 9.393808802214757e-06, + "loss": 0.0944, + "step": 2180 + }, + { + "epoch": 0.18, + "grad_norm": 0.34417901606388945, + "learning_rate": 9.393157366460257e-06, + "loss": 0.0533, + "step": 2181 + }, + { + "epoch": 0.18, + "grad_norm": 0.31082763396869256, + "learning_rate": 9.392505603477804e-06, + "loss": 0.0614, + "step": 2182 + }, + { + "epoch": 0.18, + "grad_norm": 0.8692247839565632, + "learning_rate": 9.391853513315944e-06, + "loss": 0.0755, + "step": 2183 + }, + { + "epoch": 0.18, + "grad_norm": 0.4043925623698882, + "learning_rate": 9.391201096023253e-06, + "loss": 0.1359, + "step": 2184 + }, + { + "epoch": 0.18, + "grad_norm": 0.40650029533689236, + "learning_rate": 9.390548351648322e-06, + "loss": 0.0636, + "step": 2185 + }, + { + "epoch": 0.18, + "grad_norm": 0.3076522315619764, + "learning_rate": 9.389895280239772e-06, + "loss": 0.0515, + "step": 2186 + }, + { + "epoch": 0.18, + "grad_norm": 0.2831444067566987, + "learning_rate": 9.38924188184625e-06, + "loss": 0.0553, + "step": 2187 + }, + { + "epoch": 0.18, + "grad_norm": 0.4827459696495055, + "learning_rate": 9.388588156516422e-06, + "loss": 0.0715, + "step": 2188 + }, + { + "epoch": 0.18, + "grad_norm": 0.4365680729078727, + "learning_rate": 9.387934104298985e-06, + "loss": 0.1082, + "step": 2189 + }, + { + "epoch": 0.18, + "grad_norm": 0.3319271240524271, + "learning_rate": 9.387279725242654e-06, + "loss": 0.0771, + "step": 2190 + }, + { + "epoch": 0.18, + "grad_norm": 0.5746874041097297, + "learning_rate": 9.38662501939617e-06, + "loss": 0.1285, + "step": 2191 + }, + { + "epoch": 0.18, + "grad_norm": 0.37256302255387674, + "learning_rate": 9.385969986808298e-06, + "loss": 0.0721, + "step": 2192 + }, + { + "epoch": 0.18, + "grad_norm": 0.4360181958704788, + "learning_rate": 9.385314627527835e-06, + "loss": 0.1054, + "step": 2193 + }, + { + "epoch": 0.18, + "grad_norm": 0.33894115552721427, + "learning_rate": 9.38465894160359e-06, + "loss": 0.1179, + "step": 2194 + }, + { + "epoch": 0.18, + "grad_norm": 0.333691616307029, + "learning_rate": 9.384002929084406e-06, + "loss": 0.0748, + "step": 2195 + }, + { + "epoch": 0.19, + "grad_norm": 0.35609956030710144, + "learning_rate": 9.383346590019146e-06, + "loss": 0.0968, + "step": 2196 + }, + { + "epoch": 0.19, + "grad_norm": 0.31760540487057437, + "learning_rate": 9.382689924456696e-06, + "loss": 0.1006, + "step": 2197 + }, + { + "epoch": 0.19, + "grad_norm": 0.3173277282721949, + "learning_rate": 9.38203293244597e-06, + "loss": 0.0655, + "step": 2198 + }, + { + "epoch": 0.19, + "grad_norm": 0.298370438507746, + "learning_rate": 9.381375614035901e-06, + "loss": 0.0687, + "step": 2199 + }, + { + "epoch": 0.19, + "grad_norm": 0.8038513184861436, + "learning_rate": 9.380717969275456e-06, + "loss": 0.1364, + "step": 2200 + }, + { + "epoch": 0.19, + "grad_norm": 0.4106793509241525, + "learning_rate": 9.380059998213617e-06, + "loss": 0.1307, + "step": 2201 + }, + { + "epoch": 0.19, + "grad_norm": 0.3827998128832313, + "learning_rate": 9.379401700899392e-06, + "loss": 0.0971, + "step": 2202 + }, + { + "epoch": 0.19, + "grad_norm": 0.33137795336936254, + "learning_rate": 9.378743077381818e-06, + "loss": 0.1062, + "step": 2203 + }, + { + "epoch": 0.19, + "grad_norm": 0.49717090352775806, + "learning_rate": 9.37808412770995e-06, + "loss": 0.0937, + "step": 2204 + }, + { + "epoch": 0.19, + "grad_norm": 0.4191702221108936, + "learning_rate": 9.377424851932872e-06, + "loss": 0.1076, + "step": 2205 + }, + { + "epoch": 0.19, + "grad_norm": 0.2500759059903071, + "learning_rate": 9.376765250099692e-06, + "loss": 0.0596, + "step": 2206 + }, + { + "epoch": 0.19, + "grad_norm": 0.2270574154040705, + "learning_rate": 9.376105322259538e-06, + "loss": 0.0536, + "step": 2207 + }, + { + "epoch": 0.19, + "grad_norm": 0.4702194750647342, + "learning_rate": 9.375445068461568e-06, + "loss": 0.1214, + "step": 2208 + }, + { + "epoch": 0.19, + "grad_norm": 0.40844161283483205, + "learning_rate": 9.37478448875496e-06, + "loss": 0.1211, + "step": 2209 + }, + { + "epoch": 0.19, + "grad_norm": 0.5057815197012512, + "learning_rate": 9.374123583188918e-06, + "loss": 0.1064, + "step": 2210 + }, + { + "epoch": 0.19, + "grad_norm": 0.38530726005947286, + "learning_rate": 9.373462351812672e-06, + "loss": 0.0948, + "step": 2211 + }, + { + "epoch": 0.19, + "grad_norm": 0.45419898384468865, + "learning_rate": 9.372800794675472e-06, + "loss": 0.1122, + "step": 2212 + }, + { + "epoch": 0.19, + "grad_norm": 0.29468408769535404, + "learning_rate": 9.372138911826596e-06, + "loss": 0.0747, + "step": 2213 + }, + { + "epoch": 0.19, + "grad_norm": 0.4549855143651968, + "learning_rate": 9.371476703315342e-06, + "loss": 0.1069, + "step": 2214 + }, + { + "epoch": 0.19, + "grad_norm": 0.3598398049931699, + "learning_rate": 9.370814169191038e-06, + "loss": 0.1013, + "step": 2215 + }, + { + "epoch": 0.19, + "grad_norm": 0.2637270653793351, + "learning_rate": 9.370151309503033e-06, + "loss": 0.0759, + "step": 2216 + }, + { + "epoch": 0.19, + "grad_norm": 0.4325472240842392, + "learning_rate": 9.369488124300702e-06, + "loss": 0.1037, + "step": 2217 + }, + { + "epoch": 0.19, + "grad_norm": 0.49140396441427825, + "learning_rate": 9.36882461363344e-06, + "loss": 0.1386, + "step": 2218 + }, + { + "epoch": 0.19, + "grad_norm": 0.34060658528375604, + "learning_rate": 9.36816077755067e-06, + "loss": 0.0905, + "step": 2219 + }, + { + "epoch": 0.19, + "grad_norm": 0.28820060315922585, + "learning_rate": 9.36749661610184e-06, + "loss": 0.082, + "step": 2220 + }, + { + "epoch": 0.19, + "grad_norm": 0.21240068006372972, + "learning_rate": 9.366832129336421e-06, + "loss": 0.056, + "step": 2221 + }, + { + "epoch": 0.19, + "grad_norm": 0.41145148918730623, + "learning_rate": 9.366167317303902e-06, + "loss": 0.1087, + "step": 2222 + }, + { + "epoch": 0.19, + "grad_norm": 0.520918604172463, + "learning_rate": 9.36550218005381e-06, + "loss": 0.1533, + "step": 2223 + }, + { + "epoch": 0.19, + "grad_norm": 0.3312222826760618, + "learning_rate": 9.364836717635684e-06, + "loss": 0.0633, + "step": 2224 + }, + { + "epoch": 0.19, + "grad_norm": 0.32281748387403, + "learning_rate": 9.364170930099092e-06, + "loss": 0.1257, + "step": 2225 + }, + { + "epoch": 0.19, + "grad_norm": 0.3238950495644765, + "learning_rate": 9.363504817493626e-06, + "loss": 0.0642, + "step": 2226 + }, + { + "epoch": 0.19, + "grad_norm": 0.32683535644377365, + "learning_rate": 9.3628383798689e-06, + "loss": 0.1053, + "step": 2227 + }, + { + "epoch": 0.19, + "grad_norm": 0.543738753720184, + "learning_rate": 9.362171617274558e-06, + "loss": 0.0928, + "step": 2228 + }, + { + "epoch": 0.19, + "grad_norm": 0.33394596719448605, + "learning_rate": 9.361504529760261e-06, + "loss": 0.1135, + "step": 2229 + }, + { + "epoch": 0.19, + "grad_norm": 0.38731728770733503, + "learning_rate": 9.3608371173757e-06, + "loss": 0.1317, + "step": 2230 + }, + { + "epoch": 0.19, + "grad_norm": 0.493118648199292, + "learning_rate": 9.360169380170587e-06, + "loss": 0.1046, + "step": 2231 + }, + { + "epoch": 0.19, + "grad_norm": 0.1734211831737521, + "learning_rate": 9.359501318194659e-06, + "loss": 0.0362, + "step": 2232 + }, + { + "epoch": 0.19, + "grad_norm": 0.20783991923757028, + "learning_rate": 9.358832931497677e-06, + "loss": 0.0515, + "step": 2233 + }, + { + "epoch": 0.19, + "grad_norm": 0.36125296433964915, + "learning_rate": 9.358164220129426e-06, + "loss": 0.1244, + "step": 2234 + }, + { + "epoch": 0.19, + "grad_norm": 0.46045809876037175, + "learning_rate": 9.357495184139716e-06, + "loss": 0.1326, + "step": 2235 + }, + { + "epoch": 0.19, + "grad_norm": 0.47959500646781406, + "learning_rate": 9.356825823578378e-06, + "loss": 0.1288, + "step": 2236 + }, + { + "epoch": 0.19, + "grad_norm": 0.32732815716506336, + "learning_rate": 9.356156138495274e-06, + "loss": 0.077, + "step": 2237 + }, + { + "epoch": 0.19, + "grad_norm": 0.3337669782361158, + "learning_rate": 9.355486128940285e-06, + "loss": 0.0981, + "step": 2238 + }, + { + "epoch": 0.19, + "grad_norm": 0.24849933527209633, + "learning_rate": 9.354815794963316e-06, + "loss": 0.0679, + "step": 2239 + }, + { + "epoch": 0.19, + "grad_norm": 0.23056386406039184, + "learning_rate": 9.354145136614297e-06, + "loss": 0.0786, + "step": 2240 + }, + { + "epoch": 0.19, + "grad_norm": 0.4393384582260254, + "learning_rate": 9.353474153943183e-06, + "loss": 0.0706, + "step": 2241 + }, + { + "epoch": 0.19, + "grad_norm": 0.65118502468742, + "learning_rate": 9.352802846999955e-06, + "loss": 0.0723, + "step": 2242 + }, + { + "epoch": 0.19, + "grad_norm": 0.3607947846547395, + "learning_rate": 9.352131215834613e-06, + "loss": 0.0468, + "step": 2243 + }, + { + "epoch": 0.19, + "grad_norm": 0.587192983956507, + "learning_rate": 9.351459260497187e-06, + "loss": 0.1347, + "step": 2244 + }, + { + "epoch": 0.19, + "grad_norm": 0.3793657815751059, + "learning_rate": 9.350786981037722e-06, + "loss": 0.0855, + "step": 2245 + }, + { + "epoch": 0.19, + "grad_norm": 0.3195697837908121, + "learning_rate": 9.3501143775063e-06, + "loss": 0.0722, + "step": 2246 + }, + { + "epoch": 0.19, + "grad_norm": 0.308546837720801, + "learning_rate": 9.349441449953018e-06, + "loss": 0.0956, + "step": 2247 + }, + { + "epoch": 0.19, + "grad_norm": 0.45377656421589263, + "learning_rate": 9.348768198428e-06, + "loss": 0.0953, + "step": 2248 + }, + { + "epoch": 0.19, + "grad_norm": 0.3974460014355134, + "learning_rate": 9.348094622981392e-06, + "loss": 0.0889, + "step": 2249 + }, + { + "epoch": 0.19, + "grad_norm": 0.4431360828459046, + "learning_rate": 9.347420723663367e-06, + "loss": 0.0853, + "step": 2250 + }, + { + "epoch": 0.19, + "grad_norm": 0.5721494867356366, + "learning_rate": 9.346746500524122e-06, + "loss": 0.158, + "step": 2251 + }, + { + "epoch": 0.19, + "grad_norm": 0.326486895017683, + "learning_rate": 9.346071953613876e-06, + "loss": 0.0853, + "step": 2252 + }, + { + "epoch": 0.19, + "grad_norm": 0.32827540075127837, + "learning_rate": 9.345397082982873e-06, + "loss": 0.0774, + "step": 2253 + }, + { + "epoch": 0.19, + "grad_norm": 0.6338444646629003, + "learning_rate": 9.344721888681379e-06, + "loss": 0.0916, + "step": 2254 + }, + { + "epoch": 0.19, + "grad_norm": 0.49580658565767105, + "learning_rate": 9.34404637075969e-06, + "loss": 0.0638, + "step": 2255 + }, + { + "epoch": 0.19, + "grad_norm": 0.2593705392274507, + "learning_rate": 9.343370529268123e-06, + "loss": 0.0651, + "step": 2256 + }, + { + "epoch": 0.19, + "grad_norm": 0.31254673882322603, + "learning_rate": 9.342694364257015e-06, + "loss": 0.0625, + "step": 2257 + }, + { + "epoch": 0.19, + "grad_norm": 0.6255686129843026, + "learning_rate": 9.342017875776734e-06, + "loss": 0.1349, + "step": 2258 + }, + { + "epoch": 0.19, + "grad_norm": 0.39507784829686776, + "learning_rate": 9.341341063877667e-06, + "loss": 0.115, + "step": 2259 + }, + { + "epoch": 0.19, + "grad_norm": 0.21122245510312226, + "learning_rate": 9.340663928610227e-06, + "loss": 0.0624, + "step": 2260 + }, + { + "epoch": 0.19, + "grad_norm": 0.8570032934631387, + "learning_rate": 9.339986470024853e-06, + "loss": 0.1522, + "step": 2261 + }, + { + "epoch": 0.19, + "grad_norm": 0.3935293152076621, + "learning_rate": 9.339308688172003e-06, + "loss": 0.1046, + "step": 2262 + }, + { + "epoch": 0.19, + "grad_norm": 0.41578165502469455, + "learning_rate": 9.338630583102164e-06, + "loss": 0.0802, + "step": 2263 + }, + { + "epoch": 0.19, + "grad_norm": 0.42771859563311004, + "learning_rate": 9.337952154865844e-06, + "loss": 0.1275, + "step": 2264 + }, + { + "epoch": 0.19, + "grad_norm": 0.33509295748410917, + "learning_rate": 9.337273403513578e-06, + "loss": 0.0916, + "step": 2265 + }, + { + "epoch": 0.19, + "grad_norm": 0.5729590793993866, + "learning_rate": 9.336594329095922e-06, + "loss": 0.0876, + "step": 2266 + }, + { + "epoch": 0.19, + "grad_norm": 0.33791363124289925, + "learning_rate": 9.33591493166346e-06, + "loss": 0.1008, + "step": 2267 + }, + { + "epoch": 0.19, + "grad_norm": 0.41375369908154275, + "learning_rate": 9.33523521126679e-06, + "loss": 0.0997, + "step": 2268 + }, + { + "epoch": 0.19, + "grad_norm": 0.3701664024187532, + "learning_rate": 9.334555167956551e-06, + "loss": 0.0898, + "step": 2269 + }, + { + "epoch": 0.19, + "grad_norm": 0.5660125964065214, + "learning_rate": 9.333874801783393e-06, + "loss": 0.0867, + "step": 2270 + }, + { + "epoch": 0.19, + "grad_norm": 0.3784728417953419, + "learning_rate": 9.333194112797991e-06, + "loss": 0.0732, + "step": 2271 + }, + { + "epoch": 0.19, + "grad_norm": 0.47613957231436194, + "learning_rate": 9.332513101051049e-06, + "loss": 0.1201, + "step": 2272 + }, + { + "epoch": 0.19, + "grad_norm": 0.2936014193054595, + "learning_rate": 9.331831766593294e-06, + "loss": 0.0746, + "step": 2273 + }, + { + "epoch": 0.19, + "grad_norm": 0.48955980433375457, + "learning_rate": 9.331150109475473e-06, + "loss": 0.1196, + "step": 2274 + }, + { + "epoch": 0.19, + "grad_norm": 0.8826617882758704, + "learning_rate": 9.33046812974836e-06, + "loss": 0.1255, + "step": 2275 + }, + { + "epoch": 0.19, + "grad_norm": 0.30043859612719337, + "learning_rate": 9.329785827462757e-06, + "loss": 0.0644, + "step": 2276 + }, + { + "epoch": 0.19, + "grad_norm": 0.41170057237837054, + "learning_rate": 9.32910320266948e-06, + "loss": 0.1034, + "step": 2277 + }, + { + "epoch": 0.19, + "grad_norm": 0.3056464923032891, + "learning_rate": 9.328420255419377e-06, + "loss": 0.0758, + "step": 2278 + }, + { + "epoch": 0.19, + "grad_norm": 0.32642016227222376, + "learning_rate": 9.327736985763321e-06, + "loss": 0.0912, + "step": 2279 + }, + { + "epoch": 0.19, + "grad_norm": 0.498843936151261, + "learning_rate": 9.3270533937522e-06, + "loss": 0.1431, + "step": 2280 + }, + { + "epoch": 0.19, + "grad_norm": 0.29905728999205355, + "learning_rate": 9.326369479436938e-06, + "loss": 0.0705, + "step": 2281 + }, + { + "epoch": 0.19, + "grad_norm": 0.8265670552442287, + "learning_rate": 9.325685242868475e-06, + "loss": 0.1591, + "step": 2282 + }, + { + "epoch": 0.19, + "grad_norm": 0.44808485307793977, + "learning_rate": 9.325000684097774e-06, + "loss": 0.1192, + "step": 2283 + }, + { + "epoch": 0.19, + "grad_norm": 0.38484088495247626, + "learning_rate": 9.324315803175827e-06, + "loss": 0.0683, + "step": 2284 + }, + { + "epoch": 0.19, + "grad_norm": 0.3287297689946327, + "learning_rate": 9.32363060015365e-06, + "loss": 0.1304, + "step": 2285 + }, + { + "epoch": 0.19, + "grad_norm": 0.4806725005323175, + "learning_rate": 9.322945075082278e-06, + "loss": 0.0825, + "step": 2286 + }, + { + "epoch": 0.19, + "grad_norm": 0.5125632133098119, + "learning_rate": 9.322259228012774e-06, + "loss": 0.0768, + "step": 2287 + }, + { + "epoch": 0.19, + "grad_norm": 0.6962424315501092, + "learning_rate": 9.321573058996223e-06, + "loss": 0.1203, + "step": 2288 + }, + { + "epoch": 0.19, + "grad_norm": 0.7708997007821469, + "learning_rate": 9.320886568083736e-06, + "loss": 0.1216, + "step": 2289 + }, + { + "epoch": 0.19, + "grad_norm": 0.2948621485012947, + "learning_rate": 9.320199755326445e-06, + "loss": 0.0871, + "step": 2290 + }, + { + "epoch": 0.19, + "grad_norm": 0.49689409162597564, + "learning_rate": 9.319512620775511e-06, + "loss": 0.1198, + "step": 2291 + }, + { + "epoch": 0.19, + "grad_norm": 0.24977304550679208, + "learning_rate": 9.318825164482112e-06, + "loss": 0.055, + "step": 2292 + }, + { + "epoch": 0.19, + "grad_norm": 0.30161144218119884, + "learning_rate": 9.318137386497457e-06, + "loss": 0.0359, + "step": 2293 + }, + { + "epoch": 0.19, + "grad_norm": 0.42738038580433657, + "learning_rate": 9.317449286872775e-06, + "loss": 0.0986, + "step": 2294 + }, + { + "epoch": 0.19, + "grad_norm": 0.5937263243641031, + "learning_rate": 9.316760865659318e-06, + "loss": 0.1398, + "step": 2295 + }, + { + "epoch": 0.19, + "grad_norm": 0.34344885761550314, + "learning_rate": 9.316072122908366e-06, + "loss": 0.0655, + "step": 2296 + }, + { + "epoch": 0.19, + "grad_norm": 0.4584300110460388, + "learning_rate": 9.315383058671219e-06, + "loss": 0.1029, + "step": 2297 + }, + { + "epoch": 0.19, + "grad_norm": 0.3314484366634366, + "learning_rate": 9.314693672999201e-06, + "loss": 0.0739, + "step": 2298 + }, + { + "epoch": 0.19, + "grad_norm": 0.365909450893409, + "learning_rate": 9.314003965943665e-06, + "loss": 0.076, + "step": 2299 + }, + { + "epoch": 0.19, + "grad_norm": 0.2681066536098896, + "learning_rate": 9.313313937555982e-06, + "loss": 0.0517, + "step": 2300 + }, + { + "epoch": 0.19, + "grad_norm": 0.37471136864222326, + "learning_rate": 9.31262358788755e-06, + "loss": 0.1204, + "step": 2301 + }, + { + "epoch": 0.19, + "grad_norm": 0.40826293451460227, + "learning_rate": 9.311932916989792e-06, + "loss": 0.1175, + "step": 2302 + }, + { + "epoch": 0.19, + "grad_norm": 0.3441914664708332, + "learning_rate": 9.31124192491415e-06, + "loss": 0.0812, + "step": 2303 + }, + { + "epoch": 0.19, + "grad_norm": 0.31759544099272835, + "learning_rate": 9.310550611712095e-06, + "loss": 0.1103, + "step": 2304 + }, + { + "epoch": 0.19, + "grad_norm": 0.2692933162350318, + "learning_rate": 9.309858977435118e-06, + "loss": 0.0604, + "step": 2305 + }, + { + "epoch": 0.19, + "grad_norm": 1.0808510236239905, + "learning_rate": 9.30916702213474e-06, + "loss": 0.0572, + "step": 2306 + }, + { + "epoch": 0.19, + "grad_norm": 0.31330759661430957, + "learning_rate": 9.3084747458625e-06, + "loss": 0.0587, + "step": 2307 + }, + { + "epoch": 0.19, + "grad_norm": 0.2602518858215506, + "learning_rate": 9.307782148669959e-06, + "loss": 0.0736, + "step": 2308 + }, + { + "epoch": 0.19, + "grad_norm": 0.41637649848546915, + "learning_rate": 9.30708923060871e-06, + "loss": 0.1, + "step": 2309 + }, + { + "epoch": 0.19, + "grad_norm": 0.4175821778769072, + "learning_rate": 9.306395991730365e-06, + "loss": 0.0896, + "step": 2310 + }, + { + "epoch": 0.19, + "grad_norm": 0.46330499700208705, + "learning_rate": 9.30570243208656e-06, + "loss": 0.1124, + "step": 2311 + }, + { + "epoch": 0.19, + "grad_norm": 0.2974488155755075, + "learning_rate": 9.305008551728956e-06, + "loss": 0.0576, + "step": 2312 + }, + { + "epoch": 0.19, + "grad_norm": 0.23501207289869444, + "learning_rate": 9.304314350709235e-06, + "loss": 0.0652, + "step": 2313 + }, + { + "epoch": 0.19, + "grad_norm": 0.14831808420974502, + "learning_rate": 9.303619829079107e-06, + "loss": 0.0336, + "step": 2314 + }, + { + "epoch": 0.2, + "grad_norm": 0.25836482441719333, + "learning_rate": 9.302924986890304e-06, + "loss": 0.0878, + "step": 2315 + }, + { + "epoch": 0.2, + "grad_norm": 0.6367297322592059, + "learning_rate": 9.302229824194582e-06, + "loss": 0.0912, + "step": 2316 + }, + { + "epoch": 0.2, + "grad_norm": 0.2534834102230206, + "learning_rate": 9.30153434104372e-06, + "loss": 0.0913, + "step": 2317 + }, + { + "epoch": 0.2, + "grad_norm": 0.30063392575062886, + "learning_rate": 9.300838537489522e-06, + "loss": 0.0779, + "step": 2318 + }, + { + "epoch": 0.2, + "grad_norm": 0.6081510267527095, + "learning_rate": 9.300142413583815e-06, + "loss": 0.1063, + "step": 2319 + }, + { + "epoch": 0.2, + "grad_norm": 0.27676304477307284, + "learning_rate": 9.299445969378451e-06, + "loss": 0.0657, + "step": 2320 + }, + { + "epoch": 0.2, + "grad_norm": 0.38594690112935753, + "learning_rate": 9.298749204925305e-06, + "loss": 0.0847, + "step": 2321 + }, + { + "epoch": 0.2, + "grad_norm": 0.3695285992143518, + "learning_rate": 9.298052120276277e-06, + "loss": 0.1073, + "step": 2322 + }, + { + "epoch": 0.2, + "grad_norm": 0.45642639679703134, + "learning_rate": 9.297354715483288e-06, + "loss": 0.1376, + "step": 2323 + }, + { + "epoch": 0.2, + "grad_norm": 0.20126860442180566, + "learning_rate": 9.296656990598288e-06, + "loss": 0.0356, + "step": 2324 + }, + { + "epoch": 0.2, + "grad_norm": 0.46927238596496695, + "learning_rate": 9.295958945673243e-06, + "loss": 0.1077, + "step": 2325 + }, + { + "epoch": 0.2, + "grad_norm": 0.29367986907341875, + "learning_rate": 9.29526058076015e-06, + "loss": 0.1075, + "step": 2326 + }, + { + "epoch": 0.2, + "grad_norm": 0.4288452773278697, + "learning_rate": 9.29456189591103e-06, + "loss": 0.0947, + "step": 2327 + }, + { + "epoch": 0.2, + "grad_norm": 0.34003507946587996, + "learning_rate": 9.29386289117792e-06, + "loss": 0.1019, + "step": 2328 + }, + { + "epoch": 0.2, + "grad_norm": 0.2659820539803868, + "learning_rate": 9.293163566612888e-06, + "loss": 0.0898, + "step": 2329 + }, + { + "epoch": 0.2, + "grad_norm": 0.22683154753930837, + "learning_rate": 9.292463922268025e-06, + "loss": 0.059, + "step": 2330 + }, + { + "epoch": 0.2, + "grad_norm": 0.24026047394087163, + "learning_rate": 9.291763958195444e-06, + "loss": 0.0666, + "step": 2331 + }, + { + "epoch": 0.2, + "grad_norm": 0.25051432073764, + "learning_rate": 9.29106367444728e-06, + "loss": 0.1071, + "step": 2332 + }, + { + "epoch": 0.2, + "grad_norm": 0.41609271981571083, + "learning_rate": 9.290363071075699e-06, + "loss": 0.0782, + "step": 2333 + }, + { + "epoch": 0.2, + "grad_norm": 0.318479064236394, + "learning_rate": 9.28966214813288e-06, + "loss": 0.0829, + "step": 2334 + }, + { + "epoch": 0.2, + "grad_norm": 0.2887733483241954, + "learning_rate": 9.288960905671038e-06, + "loss": 0.097, + "step": 2335 + }, + { + "epoch": 0.2, + "grad_norm": 0.5320088914392971, + "learning_rate": 9.2882593437424e-06, + "loss": 0.1005, + "step": 2336 + }, + { + "epoch": 0.2, + "grad_norm": 0.3139733696046921, + "learning_rate": 9.287557462399228e-06, + "loss": 0.069, + "step": 2337 + }, + { + "epoch": 0.2, + "grad_norm": 0.41702533047427764, + "learning_rate": 9.286855261693798e-06, + "loss": 0.0985, + "step": 2338 + }, + { + "epoch": 0.2, + "grad_norm": 0.32207282016958627, + "learning_rate": 9.286152741678416e-06, + "loss": 0.0986, + "step": 2339 + }, + { + "epoch": 0.2, + "grad_norm": 0.41825571094765834, + "learning_rate": 9.285449902405409e-06, + "loss": 0.077, + "step": 2340 + }, + { + "epoch": 0.2, + "grad_norm": 0.38126365470644813, + "learning_rate": 9.284746743927127e-06, + "loss": 0.0761, + "step": 2341 + }, + { + "epoch": 0.2, + "grad_norm": 0.5435371652562497, + "learning_rate": 9.284043266295948e-06, + "loss": 0.1306, + "step": 2342 + }, + { + "epoch": 0.2, + "grad_norm": 0.3030223415500926, + "learning_rate": 9.28333946956427e-06, + "loss": 0.0723, + "step": 2343 + }, + { + "epoch": 0.2, + "grad_norm": 0.36309697990515394, + "learning_rate": 9.282635353784517e-06, + "loss": 0.0861, + "step": 2344 + }, + { + "epoch": 0.2, + "grad_norm": 0.5488755809777321, + "learning_rate": 9.281930919009134e-06, + "loss": 0.1391, + "step": 2345 + }, + { + "epoch": 0.2, + "grad_norm": 0.3449667837760911, + "learning_rate": 9.281226165290592e-06, + "loss": 0.0976, + "step": 2346 + }, + { + "epoch": 0.2, + "grad_norm": 0.5983205483149432, + "learning_rate": 9.280521092681386e-06, + "loss": 0.1305, + "step": 2347 + }, + { + "epoch": 0.2, + "grad_norm": 0.314905034568646, + "learning_rate": 9.279815701234032e-06, + "loss": 0.072, + "step": 2348 + }, + { + "epoch": 0.2, + "grad_norm": 0.44028354166455647, + "learning_rate": 9.279109991001073e-06, + "loss": 0.1186, + "step": 2349 + }, + { + "epoch": 0.2, + "grad_norm": 0.4038271835741766, + "learning_rate": 9.278403962035074e-06, + "loss": 0.0854, + "step": 2350 + }, + { + "epoch": 0.2, + "grad_norm": 0.3687027084033775, + "learning_rate": 9.277697614388624e-06, + "loss": 0.0964, + "step": 2351 + }, + { + "epoch": 0.2, + "grad_norm": 0.3238511718452813, + "learning_rate": 9.276990948114338e-06, + "loss": 0.0841, + "step": 2352 + }, + { + "epoch": 0.2, + "grad_norm": 0.2705285581510575, + "learning_rate": 9.27628396326485e-06, + "loss": 0.0527, + "step": 2353 + }, + { + "epoch": 0.2, + "grad_norm": 0.33347708661952113, + "learning_rate": 9.27557665989282e-06, + "loss": 0.0795, + "step": 2354 + }, + { + "epoch": 0.2, + "grad_norm": 0.5950398540763525, + "learning_rate": 9.274869038050936e-06, + "loss": 0.1286, + "step": 2355 + }, + { + "epoch": 0.2, + "grad_norm": 0.22495757752564471, + "learning_rate": 9.2741610977919e-06, + "loss": 0.0693, + "step": 2356 + }, + { + "epoch": 0.2, + "grad_norm": 0.5407923898800172, + "learning_rate": 9.273452839168449e-06, + "loss": 0.1086, + "step": 2357 + }, + { + "epoch": 0.2, + "grad_norm": 0.43297286812618463, + "learning_rate": 9.272744262233334e-06, + "loss": 0.0435, + "step": 2358 + }, + { + "epoch": 0.2, + "grad_norm": 0.41466706197337605, + "learning_rate": 9.272035367039337e-06, + "loss": 0.097, + "step": 2359 + }, + { + "epoch": 0.2, + "grad_norm": 0.7195172691390104, + "learning_rate": 9.271326153639259e-06, + "loss": 0.0906, + "step": 2360 + }, + { + "epoch": 0.2, + "grad_norm": 0.5396883792642421, + "learning_rate": 9.270616622085926e-06, + "loss": 0.1367, + "step": 2361 + }, + { + "epoch": 0.2, + "grad_norm": 0.4296776492948369, + "learning_rate": 9.269906772432191e-06, + "loss": 0.0852, + "step": 2362 + }, + { + "epoch": 0.2, + "grad_norm": 0.6028479135930933, + "learning_rate": 9.269196604730923e-06, + "loss": 0.1211, + "step": 2363 + }, + { + "epoch": 0.2, + "grad_norm": 0.5700877451794317, + "learning_rate": 9.268486119035024e-06, + "loss": 0.1161, + "step": 2364 + }, + { + "epoch": 0.2, + "grad_norm": 0.45844944809909693, + "learning_rate": 9.267775315397413e-06, + "loss": 0.1054, + "step": 2365 + }, + { + "epoch": 0.2, + "grad_norm": 0.2358504304417916, + "learning_rate": 9.267064193871033e-06, + "loss": 0.0767, + "step": 2366 + }, + { + "epoch": 0.2, + "grad_norm": 0.4303326377577767, + "learning_rate": 9.266352754508855e-06, + "loss": 0.0775, + "step": 2367 + }, + { + "epoch": 0.2, + "grad_norm": 0.48018137671701383, + "learning_rate": 9.265640997363871e-06, + "loss": 0.1213, + "step": 2368 + }, + { + "epoch": 0.2, + "grad_norm": 0.4550399629095118, + "learning_rate": 9.264928922489097e-06, + "loss": 0.0711, + "step": 2369 + }, + { + "epoch": 0.2, + "grad_norm": 0.6185696510012659, + "learning_rate": 9.26421652993757e-06, + "loss": 0.097, + "step": 2370 + }, + { + "epoch": 0.2, + "grad_norm": 0.36518670265197134, + "learning_rate": 9.263503819762357e-06, + "loss": 0.0928, + "step": 2371 + }, + { + "epoch": 0.2, + "grad_norm": 0.5643621734805451, + "learning_rate": 9.262790792016543e-06, + "loss": 0.1034, + "step": 2372 + }, + { + "epoch": 0.2, + "grad_norm": 0.3684383768262814, + "learning_rate": 9.262077446753236e-06, + "loss": 0.0897, + "step": 2373 + }, + { + "epoch": 0.2, + "grad_norm": 0.35139945998552913, + "learning_rate": 9.261363784025574e-06, + "loss": 0.0697, + "step": 2374 + }, + { + "epoch": 0.2, + "grad_norm": 0.36686908286411946, + "learning_rate": 9.260649803886712e-06, + "loss": 0.1003, + "step": 2375 + }, + { + "epoch": 0.2, + "grad_norm": 0.38824144474565186, + "learning_rate": 9.259935506389833e-06, + "loss": 0.1078, + "step": 2376 + }, + { + "epoch": 0.2, + "grad_norm": 0.24022284061470212, + "learning_rate": 9.259220891588141e-06, + "loss": 0.0583, + "step": 2377 + }, + { + "epoch": 0.2, + "grad_norm": 0.31705229857491707, + "learning_rate": 9.258505959534867e-06, + "loss": 0.0675, + "step": 2378 + }, + { + "epoch": 0.2, + "grad_norm": 0.3285945161504574, + "learning_rate": 9.257790710283258e-06, + "loss": 0.0715, + "step": 2379 + }, + { + "epoch": 0.2, + "grad_norm": 0.5356269063347308, + "learning_rate": 9.257075143886598e-06, + "loss": 0.1543, + "step": 2380 + }, + { + "epoch": 0.2, + "grad_norm": 0.3535088220286594, + "learning_rate": 9.256359260398178e-06, + "loss": 0.0745, + "step": 2381 + }, + { + "epoch": 0.2, + "grad_norm": 0.4514672660471702, + "learning_rate": 9.255643059871327e-06, + "loss": 0.1317, + "step": 2382 + }, + { + "epoch": 0.2, + "grad_norm": 0.36110557820870454, + "learning_rate": 9.25492654235939e-06, + "loss": 0.0718, + "step": 2383 + }, + { + "epoch": 0.2, + "grad_norm": 0.2564055114431298, + "learning_rate": 9.254209707915737e-06, + "loss": 0.0409, + "step": 2384 + }, + { + "epoch": 0.2, + "grad_norm": 0.4880596413762622, + "learning_rate": 9.253492556593763e-06, + "loss": 0.1038, + "step": 2385 + }, + { + "epoch": 0.2, + "grad_norm": 0.26524893426896134, + "learning_rate": 9.252775088446884e-06, + "loss": 0.0576, + "step": 2386 + }, + { + "epoch": 0.2, + "grad_norm": 0.4003603107127384, + "learning_rate": 9.252057303528544e-06, + "loss": 0.0787, + "step": 2387 + }, + { + "epoch": 0.2, + "grad_norm": 0.3406245558755874, + "learning_rate": 9.251339201892203e-06, + "loss": 0.0813, + "step": 2388 + }, + { + "epoch": 0.2, + "grad_norm": 0.3261210254957526, + "learning_rate": 9.250620783591354e-06, + "loss": 0.0612, + "step": 2389 + }, + { + "epoch": 0.2, + "grad_norm": 0.21396491269875514, + "learning_rate": 9.249902048679507e-06, + "loss": 0.0538, + "step": 2390 + }, + { + "epoch": 0.2, + "grad_norm": 0.2796425878880943, + "learning_rate": 9.249182997210198e-06, + "loss": 0.0733, + "step": 2391 + }, + { + "epoch": 0.2, + "grad_norm": 0.5987007476421978, + "learning_rate": 9.248463629236987e-06, + "loss": 0.0689, + "step": 2392 + }, + { + "epoch": 0.2, + "grad_norm": 0.26341944254403304, + "learning_rate": 9.247743944813454e-06, + "loss": 0.0443, + "step": 2393 + }, + { + "epoch": 0.2, + "grad_norm": 0.29488704285765055, + "learning_rate": 9.247023943993208e-06, + "loss": 0.0729, + "step": 2394 + }, + { + "epoch": 0.2, + "grad_norm": 0.649137868596947, + "learning_rate": 9.246303626829878e-06, + "loss": 0.145, + "step": 2395 + }, + { + "epoch": 0.2, + "grad_norm": 0.36101111275762043, + "learning_rate": 9.245582993377117e-06, + "loss": 0.0702, + "step": 2396 + }, + { + "epoch": 0.2, + "grad_norm": 0.4622055912053914, + "learning_rate": 9.244862043688602e-06, + "loss": 0.0905, + "step": 2397 + }, + { + "epoch": 0.2, + "grad_norm": 0.476919771871768, + "learning_rate": 9.244140777818036e-06, + "loss": 0.1108, + "step": 2398 + }, + { + "epoch": 0.2, + "grad_norm": 0.3624400139473859, + "learning_rate": 9.243419195819139e-06, + "loss": 0.0803, + "step": 2399 + }, + { + "epoch": 0.2, + "grad_norm": 0.2591384784949351, + "learning_rate": 9.24269729774566e-06, + "loss": 0.0556, + "step": 2400 + }, + { + "epoch": 0.2, + "grad_norm": 0.6100973014729856, + "learning_rate": 9.241975083651372e-06, + "loss": 0.1099, + "step": 2401 + }, + { + "epoch": 0.2, + "grad_norm": 0.33070414540174936, + "learning_rate": 9.241252553590068e-06, + "loss": 0.0579, + "step": 2402 + }, + { + "epoch": 0.2, + "grad_norm": 0.45784867313565886, + "learning_rate": 9.240529707615566e-06, + "loss": 0.1064, + "step": 2403 + }, + { + "epoch": 0.2, + "grad_norm": 0.47645895537709654, + "learning_rate": 9.23980654578171e-06, + "loss": 0.096, + "step": 2404 + }, + { + "epoch": 0.2, + "grad_norm": 0.709165308299525, + "learning_rate": 9.239083068142362e-06, + "loss": 0.0844, + "step": 2405 + }, + { + "epoch": 0.2, + "grad_norm": 0.4169742623222363, + "learning_rate": 9.238359274751414e-06, + "loss": 0.1046, + "step": 2406 + }, + { + "epoch": 0.2, + "grad_norm": 0.3353419363137753, + "learning_rate": 9.237635165662777e-06, + "loss": 0.0742, + "step": 2407 + }, + { + "epoch": 0.2, + "grad_norm": 0.30777185027966525, + "learning_rate": 9.236910740930385e-06, + "loss": 0.0772, + "step": 2408 + }, + { + "epoch": 0.2, + "grad_norm": 0.4025918348115341, + "learning_rate": 9.236186000608202e-06, + "loss": 0.0726, + "step": 2409 + }, + { + "epoch": 0.2, + "grad_norm": 0.31819153065209155, + "learning_rate": 9.235460944750206e-06, + "loss": 0.0784, + "step": 2410 + }, + { + "epoch": 0.2, + "grad_norm": 0.36466499319636253, + "learning_rate": 9.234735573410406e-06, + "loss": 0.1129, + "step": 2411 + }, + { + "epoch": 0.2, + "grad_norm": 0.419476596826727, + "learning_rate": 9.234009886642832e-06, + "loss": 0.1039, + "step": 2412 + }, + { + "epoch": 0.2, + "grad_norm": 0.43847129055073936, + "learning_rate": 9.233283884501535e-06, + "loss": 0.1252, + "step": 2413 + }, + { + "epoch": 0.2, + "grad_norm": 0.24861610736164041, + "learning_rate": 9.232557567040593e-06, + "loss": 0.04, + "step": 2414 + }, + { + "epoch": 0.2, + "grad_norm": 0.3398709188441637, + "learning_rate": 9.23183093431411e-06, + "loss": 0.054, + "step": 2415 + }, + { + "epoch": 0.2, + "grad_norm": 0.4491830416661052, + "learning_rate": 9.231103986376207e-06, + "loss": 0.0919, + "step": 2416 + }, + { + "epoch": 0.2, + "grad_norm": 0.29298627885131784, + "learning_rate": 9.230376723281027e-06, + "loss": 0.1054, + "step": 2417 + }, + { + "epoch": 0.2, + "grad_norm": 0.41701764202936675, + "learning_rate": 9.229649145082749e-06, + "loss": 0.061, + "step": 2418 + }, + { + "epoch": 0.2, + "grad_norm": 0.4385706379841609, + "learning_rate": 9.228921251835562e-06, + "loss": 0.106, + "step": 2419 + }, + { + "epoch": 0.2, + "grad_norm": 0.41182881312011, + "learning_rate": 9.228193043593682e-06, + "loss": 0.1181, + "step": 2420 + }, + { + "epoch": 0.2, + "grad_norm": 0.44125008037721003, + "learning_rate": 9.227464520411355e-06, + "loss": 0.1028, + "step": 2421 + }, + { + "epoch": 0.2, + "grad_norm": 0.34476323497136757, + "learning_rate": 9.226735682342846e-06, + "loss": 0.0996, + "step": 2422 + }, + { + "epoch": 0.2, + "grad_norm": 0.32370349483702815, + "learning_rate": 9.226006529442439e-06, + "loss": 0.0564, + "step": 2423 + }, + { + "epoch": 0.2, + "grad_norm": 0.639512593249112, + "learning_rate": 9.225277061764447e-06, + "loss": 0.0979, + "step": 2424 + }, + { + "epoch": 0.2, + "grad_norm": 0.2591802918330556, + "learning_rate": 9.224547279363206e-06, + "loss": 0.072, + "step": 2425 + }, + { + "epoch": 0.2, + "grad_norm": 0.25234330520223974, + "learning_rate": 9.223817182293074e-06, + "loss": 0.0726, + "step": 2426 + }, + { + "epoch": 0.2, + "grad_norm": 0.39836143275817065, + "learning_rate": 9.223086770608432e-06, + "loss": 0.039, + "step": 2427 + }, + { + "epoch": 0.2, + "grad_norm": 0.25288854154711077, + "learning_rate": 9.222356044363686e-06, + "loss": 0.0717, + "step": 2428 + }, + { + "epoch": 0.2, + "grad_norm": 0.45690902308341924, + "learning_rate": 9.221625003613263e-06, + "loss": 0.1274, + "step": 2429 + }, + { + "epoch": 0.2, + "grad_norm": 0.19947169844895804, + "learning_rate": 9.22089364841162e-06, + "loss": 0.057, + "step": 2430 + }, + { + "epoch": 0.2, + "grad_norm": 0.3574262981023576, + "learning_rate": 9.220161978813229e-06, + "loss": 0.0988, + "step": 2431 + }, + { + "epoch": 0.2, + "grad_norm": 0.4760489126589686, + "learning_rate": 9.219429994872588e-06, + "loss": 0.1129, + "step": 2432 + }, + { + "epoch": 0.21, + "grad_norm": 0.35641492586824974, + "learning_rate": 9.218697696644223e-06, + "loss": 0.0794, + "step": 2433 + }, + { + "epoch": 0.21, + "grad_norm": 0.6287296604998953, + "learning_rate": 9.217965084182676e-06, + "loss": 0.1588, + "step": 2434 + }, + { + "epoch": 0.21, + "grad_norm": 0.4760913700629727, + "learning_rate": 9.217232157542519e-06, + "loss": 0.0825, + "step": 2435 + }, + { + "epoch": 0.21, + "grad_norm": 0.42036001748068064, + "learning_rate": 9.216498916778345e-06, + "loss": 0.1032, + "step": 2436 + }, + { + "epoch": 0.21, + "grad_norm": 0.40016852278390136, + "learning_rate": 9.215765361944766e-06, + "loss": 0.09, + "step": 2437 + }, + { + "epoch": 0.21, + "grad_norm": 0.3548804949368742, + "learning_rate": 9.215031493096428e-06, + "loss": 0.0833, + "step": 2438 + }, + { + "epoch": 0.21, + "grad_norm": 0.7681244631680875, + "learning_rate": 9.214297310287985e-06, + "loss": 0.1298, + "step": 2439 + }, + { + "epoch": 0.21, + "grad_norm": 0.36731406638271036, + "learning_rate": 9.213562813574131e-06, + "loss": 0.0926, + "step": 2440 + }, + { + "epoch": 0.21, + "grad_norm": 0.39382662804399066, + "learning_rate": 9.212828003009573e-06, + "loss": 0.0633, + "step": 2441 + }, + { + "epoch": 0.21, + "grad_norm": 0.3790270042765703, + "learning_rate": 9.212092878649044e-06, + "loss": 0.0772, + "step": 2442 + }, + { + "epoch": 0.21, + "grad_norm": 0.6067136613967411, + "learning_rate": 9.2113574405473e-06, + "loss": 0.1227, + "step": 2443 + }, + { + "epoch": 0.21, + "grad_norm": 0.4456505377332532, + "learning_rate": 9.21062168875912e-06, + "loss": 0.0958, + "step": 2444 + }, + { + "epoch": 0.21, + "grad_norm": 0.5324426399153132, + "learning_rate": 9.209885623339308e-06, + "loss": 0.1202, + "step": 2445 + }, + { + "epoch": 0.21, + "grad_norm": 0.3047861347538619, + "learning_rate": 9.209149244342692e-06, + "loss": 0.0797, + "step": 2446 + }, + { + "epoch": 0.21, + "grad_norm": 0.32809350280302174, + "learning_rate": 9.208412551824117e-06, + "loss": 0.0891, + "step": 2447 + }, + { + "epoch": 0.21, + "grad_norm": 0.25374701300360175, + "learning_rate": 9.207675545838463e-06, + "loss": 0.0654, + "step": 2448 + }, + { + "epoch": 0.21, + "grad_norm": 0.5349845644741081, + "learning_rate": 9.20693822644062e-06, + "loss": 0.0736, + "step": 2449 + }, + { + "epoch": 0.21, + "grad_norm": 0.31489577087665815, + "learning_rate": 9.206200593685513e-06, + "loss": 0.0612, + "step": 2450 + }, + { + "epoch": 0.21, + "grad_norm": 0.42791277062451033, + "learning_rate": 9.20546264762808e-06, + "loss": 0.113, + "step": 2451 + }, + { + "epoch": 0.21, + "grad_norm": 0.38029799496525946, + "learning_rate": 9.204724388323292e-06, + "loss": 0.0876, + "step": 2452 + }, + { + "epoch": 0.21, + "grad_norm": 0.3190607566560223, + "learning_rate": 9.203985815826137e-06, + "loss": 0.1128, + "step": 2453 + }, + { + "epoch": 0.21, + "grad_norm": 0.37339328848382636, + "learning_rate": 9.20324693019163e-06, + "loss": 0.1212, + "step": 2454 + }, + { + "epoch": 0.21, + "grad_norm": 0.6552134421042066, + "learning_rate": 9.202507731474803e-06, + "loss": 0.0428, + "step": 2455 + }, + { + "epoch": 0.21, + "grad_norm": 0.48053717660764556, + "learning_rate": 9.201768219730722e-06, + "loss": 0.1225, + "step": 2456 + }, + { + "epoch": 0.21, + "grad_norm": 0.2934395309830181, + "learning_rate": 9.201028395014464e-06, + "loss": 0.0913, + "step": 2457 + }, + { + "epoch": 0.21, + "grad_norm": 0.3017323536887283, + "learning_rate": 9.20028825738114e-06, + "loss": 0.083, + "step": 2458 + }, + { + "epoch": 0.21, + "grad_norm": 0.3035041835105427, + "learning_rate": 9.199547806885878e-06, + "loss": 0.0809, + "step": 2459 + }, + { + "epoch": 0.21, + "grad_norm": 0.31855927492886943, + "learning_rate": 9.198807043583831e-06, + "loss": 0.0858, + "step": 2460 + }, + { + "epoch": 0.21, + "grad_norm": 0.34542283170579696, + "learning_rate": 9.198065967530176e-06, + "loss": 0.0928, + "step": 2461 + }, + { + "epoch": 0.21, + "grad_norm": 0.5063082976442057, + "learning_rate": 9.197324578780111e-06, + "loss": 0.0975, + "step": 2462 + }, + { + "epoch": 0.21, + "grad_norm": 0.3680342856190144, + "learning_rate": 9.19658287738886e-06, + "loss": 0.0939, + "step": 2463 + }, + { + "epoch": 0.21, + "grad_norm": 0.47664354574423795, + "learning_rate": 9.19584086341167e-06, + "loss": 0.1089, + "step": 2464 + }, + { + "epoch": 0.21, + "grad_norm": 0.40856246493509385, + "learning_rate": 9.19509853690381e-06, + "loss": 0.0952, + "step": 2465 + }, + { + "epoch": 0.21, + "grad_norm": 0.2896309161783139, + "learning_rate": 9.194355897920572e-06, + "loss": 0.0654, + "step": 2466 + }, + { + "epoch": 0.21, + "grad_norm": 0.3672838319742439, + "learning_rate": 9.193612946517274e-06, + "loss": 0.1358, + "step": 2467 + }, + { + "epoch": 0.21, + "grad_norm": 0.380287434199588, + "learning_rate": 9.192869682749255e-06, + "loss": 0.1189, + "step": 2468 + }, + { + "epoch": 0.21, + "grad_norm": 0.40946803912484275, + "learning_rate": 9.192126106671874e-06, + "loss": 0.1073, + "step": 2469 + }, + { + "epoch": 0.21, + "grad_norm": 0.26418849420460605, + "learning_rate": 9.19138221834052e-06, + "loss": 0.0745, + "step": 2470 + }, + { + "epoch": 0.21, + "grad_norm": 0.39109404299844663, + "learning_rate": 9.190638017810604e-06, + "loss": 0.1041, + "step": 2471 + }, + { + "epoch": 0.21, + "grad_norm": 0.4574658893406799, + "learning_rate": 9.189893505137555e-06, + "loss": 0.0931, + "step": 2472 + }, + { + "epoch": 0.21, + "grad_norm": 0.24512730290637746, + "learning_rate": 9.189148680376829e-06, + "loss": 0.076, + "step": 2473 + }, + { + "epoch": 0.21, + "grad_norm": 0.5070097203437534, + "learning_rate": 9.188403543583909e-06, + "loss": 0.0952, + "step": 2474 + }, + { + "epoch": 0.21, + "grad_norm": 0.25139712395579317, + "learning_rate": 9.187658094814288e-06, + "loss": 0.0603, + "step": 2475 + }, + { + "epoch": 0.21, + "grad_norm": 0.48847416883191197, + "learning_rate": 9.186912334123503e-06, + "loss": 0.0929, + "step": 2476 + }, + { + "epoch": 0.21, + "grad_norm": 0.4937015884513572, + "learning_rate": 9.186166261567093e-06, + "loss": 0.0801, + "step": 2477 + }, + { + "epoch": 0.21, + "grad_norm": 0.3857002776517484, + "learning_rate": 9.185419877200636e-06, + "loss": 0.0735, + "step": 2478 + }, + { + "epoch": 0.21, + "grad_norm": 0.268295717158448, + "learning_rate": 9.184673181079723e-06, + "loss": 0.0657, + "step": 2479 + }, + { + "epoch": 0.21, + "grad_norm": 0.611197781324108, + "learning_rate": 9.183926173259974e-06, + "loss": 0.0999, + "step": 2480 + }, + { + "epoch": 0.21, + "grad_norm": 0.35630875181460675, + "learning_rate": 9.18317885379703e-06, + "loss": 0.1128, + "step": 2481 + }, + { + "epoch": 0.21, + "grad_norm": 0.30926034826405013, + "learning_rate": 9.182431222746557e-06, + "loss": 0.0943, + "step": 2482 + }, + { + "epoch": 0.21, + "grad_norm": 0.7206285124600662, + "learning_rate": 9.181683280164242e-06, + "loss": 0.1058, + "step": 2483 + }, + { + "epoch": 0.21, + "grad_norm": 0.2983424124580018, + "learning_rate": 9.180935026105797e-06, + "loss": 0.0763, + "step": 2484 + }, + { + "epoch": 0.21, + "grad_norm": 0.4113553776440482, + "learning_rate": 9.180186460626955e-06, + "loss": 0.075, + "step": 2485 + }, + { + "epoch": 0.21, + "grad_norm": 0.4121105870105272, + "learning_rate": 9.179437583783474e-06, + "loss": 0.089, + "step": 2486 + }, + { + "epoch": 0.21, + "grad_norm": 0.23062254175103497, + "learning_rate": 9.178688395631134e-06, + "loss": 0.0766, + "step": 2487 + }, + { + "epoch": 0.21, + "grad_norm": 0.4145102068514547, + "learning_rate": 9.17793889622574e-06, + "loss": 0.0978, + "step": 2488 + }, + { + "epoch": 0.21, + "grad_norm": 0.3219304579323048, + "learning_rate": 9.177189085623119e-06, + "loss": 0.1079, + "step": 2489 + }, + { + "epoch": 0.21, + "grad_norm": 0.328843946556971, + "learning_rate": 9.176438963879121e-06, + "loss": 0.0803, + "step": 2490 + }, + { + "epoch": 0.21, + "grad_norm": 0.24996337275980357, + "learning_rate": 9.17568853104962e-06, + "loss": 0.06, + "step": 2491 + }, + { + "epoch": 0.21, + "grad_norm": 0.4758088012089373, + "learning_rate": 9.174937787190512e-06, + "loss": 0.1465, + "step": 2492 + }, + { + "epoch": 0.21, + "grad_norm": 0.3223460355606129, + "learning_rate": 9.174186732357715e-06, + "loss": 0.0722, + "step": 2493 + }, + { + "epoch": 0.21, + "grad_norm": 0.30139502075431784, + "learning_rate": 9.173435366607176e-06, + "loss": 0.0745, + "step": 2494 + }, + { + "epoch": 0.21, + "grad_norm": 0.4966418311455309, + "learning_rate": 9.172683689994856e-06, + "loss": 0.1067, + "step": 2495 + }, + { + "epoch": 0.21, + "grad_norm": 0.6473820198850618, + "learning_rate": 9.171931702576751e-06, + "loss": 0.1579, + "step": 2496 + }, + { + "epoch": 0.21, + "grad_norm": 0.4254062331229803, + "learning_rate": 9.171179404408866e-06, + "loss": 0.1003, + "step": 2497 + }, + { + "epoch": 0.21, + "grad_norm": 0.2761746671294394, + "learning_rate": 9.170426795547241e-06, + "loss": 0.071, + "step": 2498 + }, + { + "epoch": 0.21, + "grad_norm": 0.4807357512511184, + "learning_rate": 9.169673876047935e-06, + "loss": 0.1233, + "step": 2499 + }, + { + "epoch": 0.21, + "grad_norm": 0.37588415670573455, + "learning_rate": 9.168920645967027e-06, + "loss": 0.1085, + "step": 2500 + }, + { + "epoch": 0.21, + "grad_norm": 0.5007401461969425, + "learning_rate": 9.168167105360625e-06, + "loss": 0.0819, + "step": 2501 + }, + { + "epoch": 0.21, + "grad_norm": 0.4948651541518069, + "learning_rate": 9.167413254284854e-06, + "loss": 0.1089, + "step": 2502 + }, + { + "epoch": 0.21, + "grad_norm": 0.48322159862417385, + "learning_rate": 9.166659092795869e-06, + "loss": 0.1079, + "step": 2503 + }, + { + "epoch": 0.21, + "grad_norm": 0.9899629578354386, + "learning_rate": 9.165904620949839e-06, + "loss": 0.1506, + "step": 2504 + }, + { + "epoch": 0.21, + "grad_norm": 0.33693433977736276, + "learning_rate": 9.165149838802967e-06, + "loss": 0.062, + "step": 2505 + }, + { + "epoch": 0.21, + "grad_norm": 0.36860823688203725, + "learning_rate": 9.164394746411471e-06, + "loss": 0.0961, + "step": 2506 + }, + { + "epoch": 0.21, + "grad_norm": 0.24010900256428624, + "learning_rate": 9.163639343831595e-06, + "loss": 0.0742, + "step": 2507 + }, + { + "epoch": 0.21, + "grad_norm": 0.3240588744915587, + "learning_rate": 9.162883631119606e-06, + "loss": 0.0726, + "step": 2508 + }, + { + "epoch": 0.21, + "grad_norm": 0.40624507683653166, + "learning_rate": 9.162127608331792e-06, + "loss": 0.0508, + "step": 2509 + }, + { + "epoch": 0.21, + "grad_norm": 0.322478586487207, + "learning_rate": 9.161371275524469e-06, + "loss": 0.1069, + "step": 2510 + }, + { + "epoch": 0.21, + "grad_norm": 0.45093396883388076, + "learning_rate": 9.160614632753971e-06, + "loss": 0.0892, + "step": 2511 + }, + { + "epoch": 0.21, + "grad_norm": 0.3815478254418809, + "learning_rate": 9.159857680076657e-06, + "loss": 0.0876, + "step": 2512 + }, + { + "epoch": 0.21, + "grad_norm": 0.6426436642489988, + "learning_rate": 9.159100417548911e-06, + "loss": 0.1224, + "step": 2513 + }, + { + "epoch": 0.21, + "grad_norm": 0.3331844016861563, + "learning_rate": 9.158342845227137e-06, + "loss": 0.0966, + "step": 2514 + }, + { + "epoch": 0.21, + "grad_norm": 0.33857962471035274, + "learning_rate": 9.157584963167764e-06, + "loss": 0.108, + "step": 2515 + }, + { + "epoch": 0.21, + "grad_norm": 0.29793596217942864, + "learning_rate": 9.156826771427243e-06, + "loss": 0.0993, + "step": 2516 + }, + { + "epoch": 0.21, + "grad_norm": 0.37629183947643907, + "learning_rate": 9.156068270062048e-06, + "loss": 0.0911, + "step": 2517 + }, + { + "epoch": 0.21, + "grad_norm": 0.445958521631865, + "learning_rate": 9.155309459128678e-06, + "loss": 0.107, + "step": 2518 + }, + { + "epoch": 0.21, + "grad_norm": 0.3023250317961179, + "learning_rate": 9.154550338683654e-06, + "loss": 0.0481, + "step": 2519 + }, + { + "epoch": 0.21, + "grad_norm": 0.38060769590598115, + "learning_rate": 9.153790908783517e-06, + "loss": 0.0923, + "step": 2520 + }, + { + "epoch": 0.21, + "grad_norm": 0.5010774061190695, + "learning_rate": 9.153031169484838e-06, + "loss": 0.0957, + "step": 2521 + }, + { + "epoch": 0.21, + "grad_norm": 0.45831047408042114, + "learning_rate": 9.152271120844203e-06, + "loss": 0.1121, + "step": 2522 + }, + { + "epoch": 0.21, + "grad_norm": 0.23030855756275562, + "learning_rate": 9.151510762918225e-06, + "loss": 0.0397, + "step": 2523 + }, + { + "epoch": 0.21, + "grad_norm": 0.24327685002587543, + "learning_rate": 9.150750095763543e-06, + "loss": 0.0632, + "step": 2524 + }, + { + "epoch": 0.21, + "grad_norm": 0.5162771619333781, + "learning_rate": 9.149989119436813e-06, + "loss": 0.16, + "step": 2525 + }, + { + "epoch": 0.21, + "grad_norm": 0.7383018806445086, + "learning_rate": 9.149227833994717e-06, + "loss": 0.0799, + "step": 2526 + }, + { + "epoch": 0.21, + "grad_norm": 0.4839474532854488, + "learning_rate": 9.148466239493962e-06, + "loss": 0.0994, + "step": 2527 + }, + { + "epoch": 0.21, + "grad_norm": 0.22568671027585252, + "learning_rate": 9.147704335991275e-06, + "loss": 0.0487, + "step": 2528 + }, + { + "epoch": 0.21, + "grad_norm": 0.35481442154136544, + "learning_rate": 9.146942123543406e-06, + "loss": 0.0866, + "step": 2529 + }, + { + "epoch": 0.21, + "grad_norm": 0.5514397097718858, + "learning_rate": 9.14617960220713e-06, + "loss": 0.1226, + "step": 2530 + }, + { + "epoch": 0.21, + "grad_norm": 0.5400627398502025, + "learning_rate": 9.145416772039245e-06, + "loss": 0.0827, + "step": 2531 + }, + { + "epoch": 0.21, + "grad_norm": 0.6267976470932751, + "learning_rate": 9.144653633096569e-06, + "loss": 0.1358, + "step": 2532 + }, + { + "epoch": 0.21, + "grad_norm": 0.23745799704731144, + "learning_rate": 9.143890185435946e-06, + "loss": 0.0666, + "step": 2533 + }, + { + "epoch": 0.21, + "grad_norm": 0.26523721134900063, + "learning_rate": 9.143126429114243e-06, + "loss": 0.0867, + "step": 2534 + }, + { + "epoch": 0.21, + "grad_norm": 0.4122795548462173, + "learning_rate": 9.142362364188348e-06, + "loss": 0.0945, + "step": 2535 + }, + { + "epoch": 0.21, + "grad_norm": 0.48871316196481074, + "learning_rate": 9.141597990715172e-06, + "loss": 0.1086, + "step": 2536 + }, + { + "epoch": 0.21, + "grad_norm": 0.6270852697051894, + "learning_rate": 9.14083330875165e-06, + "loss": 0.1198, + "step": 2537 + }, + { + "epoch": 0.21, + "grad_norm": 0.526201127494724, + "learning_rate": 9.140068318354745e-06, + "loss": 0.0831, + "step": 2538 + }, + { + "epoch": 0.21, + "grad_norm": 0.1888757342131632, + "learning_rate": 9.139303019581432e-06, + "loss": 0.0625, + "step": 2539 + }, + { + "epoch": 0.21, + "grad_norm": 0.3612695813707224, + "learning_rate": 9.138537412488715e-06, + "loss": 0.0819, + "step": 2540 + }, + { + "epoch": 0.21, + "grad_norm": 0.499989457781871, + "learning_rate": 9.137771497133625e-06, + "loss": 0.0911, + "step": 2541 + }, + { + "epoch": 0.21, + "grad_norm": 0.5989894805656822, + "learning_rate": 9.137005273573208e-06, + "loss": 0.1142, + "step": 2542 + }, + { + "epoch": 0.21, + "grad_norm": 0.47768961548322536, + "learning_rate": 9.13623874186454e-06, + "loss": 0.1029, + "step": 2543 + }, + { + "epoch": 0.21, + "grad_norm": 0.5300784521691798, + "learning_rate": 9.135471902064715e-06, + "loss": 0.1319, + "step": 2544 + }, + { + "epoch": 0.21, + "grad_norm": 0.2633714226731253, + "learning_rate": 9.13470475423085e-06, + "loss": 0.0633, + "step": 2545 + }, + { + "epoch": 0.21, + "grad_norm": 0.24341585967279902, + "learning_rate": 9.13393729842009e-06, + "loss": 0.0708, + "step": 2546 + }, + { + "epoch": 0.21, + "grad_norm": 0.3549411553736421, + "learning_rate": 9.133169534689598e-06, + "loss": 0.0843, + "step": 2547 + }, + { + "epoch": 0.21, + "grad_norm": 0.38410619968820586, + "learning_rate": 9.13240146309656e-06, + "loss": 0.0795, + "step": 2548 + }, + { + "epoch": 0.21, + "grad_norm": 0.5076940807071623, + "learning_rate": 9.13163308369819e-06, + "loss": 0.1438, + "step": 2549 + }, + { + "epoch": 0.21, + "grad_norm": 0.40084210883167004, + "learning_rate": 9.13086439655172e-06, + "loss": 0.0806, + "step": 2550 + }, + { + "epoch": 0.21, + "grad_norm": 0.526776219491348, + "learning_rate": 9.130095401714403e-06, + "loss": 0.1312, + "step": 2551 + }, + { + "epoch": 0.22, + "grad_norm": 0.23052135332538107, + "learning_rate": 9.129326099243522e-06, + "loss": 0.0663, + "step": 2552 + }, + { + "epoch": 0.22, + "grad_norm": 0.19425515962736847, + "learning_rate": 9.128556489196378e-06, + "loss": 0.0479, + "step": 2553 + }, + { + "epoch": 0.22, + "grad_norm": 0.2823366831343273, + "learning_rate": 9.127786571630297e-06, + "loss": 0.0835, + "step": 2554 + }, + { + "epoch": 0.22, + "grad_norm": 0.2608065392373284, + "learning_rate": 9.127016346602624e-06, + "loss": 0.0532, + "step": 2555 + }, + { + "epoch": 0.22, + "grad_norm": 0.39571993060155386, + "learning_rate": 9.126245814170733e-06, + "loss": 0.1149, + "step": 2556 + }, + { + "epoch": 0.22, + "grad_norm": 0.9318515612608446, + "learning_rate": 9.125474974392018e-06, + "loss": 0.1649, + "step": 2557 + }, + { + "epoch": 0.22, + "grad_norm": 0.3081282224855193, + "learning_rate": 9.124703827323892e-06, + "loss": 0.0881, + "step": 2558 + }, + { + "epoch": 0.22, + "grad_norm": 0.5163842898771509, + "learning_rate": 9.123932373023798e-06, + "loss": 0.1211, + "step": 2559 + }, + { + "epoch": 0.22, + "grad_norm": 0.8761097371665139, + "learning_rate": 9.123160611549197e-06, + "loss": 0.1641, + "step": 2560 + }, + { + "epoch": 0.22, + "grad_norm": 0.47144342685189805, + "learning_rate": 9.122388542957574e-06, + "loss": 0.0751, + "step": 2561 + }, + { + "epoch": 0.22, + "grad_norm": 0.3529385002467527, + "learning_rate": 9.12161616730644e-06, + "loss": 0.0974, + "step": 2562 + }, + { + "epoch": 0.22, + "grad_norm": 0.20732876157411942, + "learning_rate": 9.120843484653321e-06, + "loss": 0.0543, + "step": 2563 + }, + { + "epoch": 0.22, + "grad_norm": 0.41306360372722983, + "learning_rate": 9.120070495055774e-06, + "loss": 0.101, + "step": 2564 + }, + { + "epoch": 0.22, + "grad_norm": 0.7810116886043941, + "learning_rate": 9.119297198571375e-06, + "loss": 0.1893, + "step": 2565 + }, + { + "epoch": 0.22, + "grad_norm": 0.257236972422567, + "learning_rate": 9.118523595257725e-06, + "loss": 0.084, + "step": 2566 + }, + { + "epoch": 0.22, + "grad_norm": 0.437756239393404, + "learning_rate": 9.117749685172445e-06, + "loss": 0.1072, + "step": 2567 + }, + { + "epoch": 0.22, + "grad_norm": 0.27137049235555033, + "learning_rate": 9.116975468373181e-06, + "loss": 0.0533, + "step": 2568 + }, + { + "epoch": 0.22, + "grad_norm": 0.8444980930223501, + "learning_rate": 9.116200944917602e-06, + "loss": 0.1487, + "step": 2569 + }, + { + "epoch": 0.22, + "grad_norm": 0.328499656576064, + "learning_rate": 9.115426114863396e-06, + "loss": 0.1194, + "step": 2570 + }, + { + "epoch": 0.22, + "grad_norm": 0.23009404115584572, + "learning_rate": 9.11465097826828e-06, + "loss": 0.0485, + "step": 2571 + }, + { + "epoch": 0.22, + "grad_norm": 0.33661991461281293, + "learning_rate": 9.11387553518999e-06, + "loss": 0.0977, + "step": 2572 + }, + { + "epoch": 0.22, + "grad_norm": 0.34755574168555936, + "learning_rate": 9.113099785686283e-06, + "loss": 0.0686, + "step": 2573 + }, + { + "epoch": 0.22, + "grad_norm": 0.5980102053817973, + "learning_rate": 9.112323729814945e-06, + "loss": 0.1274, + "step": 2574 + }, + { + "epoch": 0.22, + "grad_norm": 0.2282413704088384, + "learning_rate": 9.11154736763378e-06, + "loss": 0.0529, + "step": 2575 + }, + { + "epoch": 0.22, + "grad_norm": 0.4160755343938698, + "learning_rate": 9.110770699200613e-06, + "loss": 0.1115, + "step": 2576 + }, + { + "epoch": 0.22, + "grad_norm": 0.19579505862800656, + "learning_rate": 9.1099937245733e-06, + "loss": 0.0523, + "step": 2577 + }, + { + "epoch": 0.22, + "grad_norm": 0.26222410254507084, + "learning_rate": 9.109216443809709e-06, + "loss": 0.065, + "step": 2578 + }, + { + "epoch": 0.22, + "grad_norm": 0.37327179354221546, + "learning_rate": 9.108438856967742e-06, + "loss": 0.08, + "step": 2579 + }, + { + "epoch": 0.22, + "grad_norm": 0.6161828289850994, + "learning_rate": 9.107660964105314e-06, + "loss": 0.0905, + "step": 2580 + }, + { + "epoch": 0.22, + "grad_norm": 0.6065118804954795, + "learning_rate": 9.10688276528037e-06, + "loss": 0.1224, + "step": 2581 + }, + { + "epoch": 0.22, + "grad_norm": 0.23746208395883547, + "learning_rate": 9.106104260550872e-06, + "loss": 0.0638, + "step": 2582 + }, + { + "epoch": 0.22, + "grad_norm": 0.292860342825077, + "learning_rate": 9.105325449974809e-06, + "loss": 0.0532, + "step": 2583 + }, + { + "epoch": 0.22, + "grad_norm": 0.3676602802948076, + "learning_rate": 9.10454633361019e-06, + "loss": 0.0979, + "step": 2584 + }, + { + "epoch": 0.22, + "grad_norm": 0.22188551180911154, + "learning_rate": 9.103766911515053e-06, + "loss": 0.0507, + "step": 2585 + }, + { + "epoch": 0.22, + "grad_norm": 0.4443835706268129, + "learning_rate": 9.102987183747449e-06, + "loss": 0.0876, + "step": 2586 + }, + { + "epoch": 0.22, + "grad_norm": 0.3464884058006399, + "learning_rate": 9.102207150365456e-06, + "loss": 0.0937, + "step": 2587 + }, + { + "epoch": 0.22, + "grad_norm": 0.31448629595846794, + "learning_rate": 9.101426811427179e-06, + "loss": 0.067, + "step": 2588 + }, + { + "epoch": 0.22, + "grad_norm": 0.43584081844549677, + "learning_rate": 9.10064616699074e-06, + "loss": 0.0942, + "step": 2589 + }, + { + "epoch": 0.22, + "grad_norm": 0.3930212409413606, + "learning_rate": 9.099865217114285e-06, + "loss": 0.0643, + "step": 2590 + }, + { + "epoch": 0.22, + "grad_norm": 0.30530088815780404, + "learning_rate": 9.099083961855986e-06, + "loss": 0.1021, + "step": 2591 + }, + { + "epoch": 0.22, + "grad_norm": 0.5398788198959811, + "learning_rate": 9.098302401274035e-06, + "loss": 0.1436, + "step": 2592 + }, + { + "epoch": 0.22, + "grad_norm": 0.20936585834393176, + "learning_rate": 9.097520535426646e-06, + "loss": 0.045, + "step": 2593 + }, + { + "epoch": 0.22, + "grad_norm": 0.37911592986465925, + "learning_rate": 9.09673836437206e-06, + "loss": 0.0788, + "step": 2594 + }, + { + "epoch": 0.22, + "grad_norm": 0.32963021373525403, + "learning_rate": 9.095955888168532e-06, + "loss": 0.0913, + "step": 2595 + }, + { + "epoch": 0.22, + "grad_norm": 0.5148699432817315, + "learning_rate": 9.095173106874352e-06, + "loss": 0.1048, + "step": 2596 + }, + { + "epoch": 0.22, + "grad_norm": 0.3907899794753764, + "learning_rate": 9.094390020547821e-06, + "loss": 0.0864, + "step": 2597 + }, + { + "epoch": 0.22, + "grad_norm": 0.27910647194051497, + "learning_rate": 9.09360662924727e-06, + "loss": 0.0828, + "step": 2598 + }, + { + "epoch": 0.22, + "grad_norm": 0.37978603053437976, + "learning_rate": 9.092822933031051e-06, + "loss": 0.0946, + "step": 2599 + }, + { + "epoch": 0.22, + "grad_norm": 0.3330646085488305, + "learning_rate": 9.092038931957537e-06, + "loss": 0.0999, + "step": 2600 + }, + { + "epoch": 0.22, + "grad_norm": 0.25342448791173694, + "learning_rate": 9.091254626085124e-06, + "loss": 0.0645, + "step": 2601 + }, + { + "epoch": 0.22, + "grad_norm": 0.36034819367439264, + "learning_rate": 9.090470015472234e-06, + "loss": 0.0904, + "step": 2602 + }, + { + "epoch": 0.22, + "grad_norm": 0.35055986277243534, + "learning_rate": 9.08968510017731e-06, + "loss": 0.0925, + "step": 2603 + }, + { + "epoch": 0.22, + "grad_norm": 0.4599181961626947, + "learning_rate": 9.088899880258815e-06, + "loss": 0.0979, + "step": 2604 + }, + { + "epoch": 0.22, + "grad_norm": 0.21793590238699348, + "learning_rate": 9.088114355775236e-06, + "loss": 0.0261, + "step": 2605 + }, + { + "epoch": 0.22, + "grad_norm": 0.37647391968449523, + "learning_rate": 9.087328526785086e-06, + "loss": 0.1, + "step": 2606 + }, + { + "epoch": 0.22, + "grad_norm": 0.34069634639924606, + "learning_rate": 9.086542393346895e-06, + "loss": 0.0861, + "step": 2607 + }, + { + "epoch": 0.22, + "grad_norm": 0.37754994620913385, + "learning_rate": 9.085755955519222e-06, + "loss": 0.098, + "step": 2608 + }, + { + "epoch": 0.22, + "grad_norm": 0.4059475133425131, + "learning_rate": 9.084969213360644e-06, + "loss": 0.0658, + "step": 2609 + }, + { + "epoch": 0.22, + "grad_norm": 0.30477118962372385, + "learning_rate": 9.084182166929764e-06, + "loss": 0.072, + "step": 2610 + }, + { + "epoch": 0.22, + "grad_norm": 0.5410236588885615, + "learning_rate": 9.083394816285203e-06, + "loss": 0.1093, + "step": 2611 + }, + { + "epoch": 0.22, + "grad_norm": 0.2418257658008138, + "learning_rate": 9.082607161485607e-06, + "loss": 0.0736, + "step": 2612 + }, + { + "epoch": 0.22, + "grad_norm": 0.6436519610748231, + "learning_rate": 9.081819202589648e-06, + "loss": 0.122, + "step": 2613 + }, + { + "epoch": 0.22, + "grad_norm": 0.25173956089787625, + "learning_rate": 9.081030939656017e-06, + "loss": 0.0556, + "step": 2614 + }, + { + "epoch": 0.22, + "grad_norm": 0.3106958839034273, + "learning_rate": 9.080242372743426e-06, + "loss": 0.0614, + "step": 2615 + }, + { + "epoch": 0.22, + "grad_norm": 0.2914830713588264, + "learning_rate": 9.079453501910616e-06, + "loss": 0.0866, + "step": 2616 + }, + { + "epoch": 0.22, + "grad_norm": 0.5978257986526445, + "learning_rate": 9.078664327216344e-06, + "loss": 0.1264, + "step": 2617 + }, + { + "epoch": 0.22, + "grad_norm": 0.3629966725438927, + "learning_rate": 9.077874848719392e-06, + "loss": 0.1039, + "step": 2618 + }, + { + "epoch": 0.22, + "grad_norm": 0.3990503699568111, + "learning_rate": 9.077085066478567e-06, + "loss": 0.0985, + "step": 2619 + }, + { + "epoch": 0.22, + "grad_norm": 0.32198765762253506, + "learning_rate": 9.076294980552694e-06, + "loss": 0.1053, + "step": 2620 + }, + { + "epoch": 0.22, + "grad_norm": 0.43261813709377944, + "learning_rate": 9.075504591000626e-06, + "loss": 0.1194, + "step": 2621 + }, + { + "epoch": 0.22, + "grad_norm": 0.5412840291700423, + "learning_rate": 9.074713897881233e-06, + "loss": 0.1167, + "step": 2622 + }, + { + "epoch": 0.22, + "grad_norm": 0.3993745932464002, + "learning_rate": 9.073922901253413e-06, + "loss": 0.1104, + "step": 2623 + }, + { + "epoch": 0.22, + "grad_norm": 0.2583907238984509, + "learning_rate": 9.073131601176084e-06, + "loss": 0.0766, + "step": 2624 + }, + { + "epoch": 0.22, + "grad_norm": 0.4320585829726278, + "learning_rate": 9.072339997708184e-06, + "loss": 0.1148, + "step": 2625 + }, + { + "epoch": 0.22, + "grad_norm": 0.3813778431770088, + "learning_rate": 9.07154809090868e-06, + "loss": 0.1141, + "step": 2626 + }, + { + "epoch": 0.22, + "grad_norm": 0.3473458305769543, + "learning_rate": 9.070755880836553e-06, + "loss": 0.0632, + "step": 2627 + }, + { + "epoch": 0.22, + "grad_norm": 0.5451070100529144, + "learning_rate": 9.069963367550815e-06, + "loss": 0.1182, + "step": 2628 + }, + { + "epoch": 0.22, + "grad_norm": 0.5404611314372313, + "learning_rate": 9.069170551110495e-06, + "loss": 0.1007, + "step": 2629 + }, + { + "epoch": 0.22, + "grad_norm": 0.32558571550519905, + "learning_rate": 9.06837743157465e-06, + "loss": 0.0778, + "step": 2630 + }, + { + "epoch": 0.22, + "grad_norm": 0.419590868998277, + "learning_rate": 9.067584009002353e-06, + "loss": 0.1082, + "step": 2631 + }, + { + "epoch": 0.22, + "grad_norm": 0.23376487494108497, + "learning_rate": 9.066790283452702e-06, + "loss": 0.0621, + "step": 2632 + }, + { + "epoch": 0.22, + "grad_norm": 0.39831459864013735, + "learning_rate": 9.065996254984822e-06, + "loss": 0.1086, + "step": 2633 + }, + { + "epoch": 0.22, + "grad_norm": 0.4948853966244402, + "learning_rate": 9.065201923657854e-06, + "loss": 0.1291, + "step": 2634 + }, + { + "epoch": 0.22, + "grad_norm": 0.24317796996064392, + "learning_rate": 9.064407289530966e-06, + "loss": 0.0584, + "step": 2635 + }, + { + "epoch": 0.22, + "grad_norm": 0.44600380841795656, + "learning_rate": 9.063612352663345e-06, + "loss": 0.0985, + "step": 2636 + }, + { + "epoch": 0.22, + "grad_norm": 0.2316615180585202, + "learning_rate": 9.062817113114203e-06, + "loss": 0.0587, + "step": 2637 + }, + { + "epoch": 0.22, + "grad_norm": 0.4914966639986054, + "learning_rate": 9.062021570942776e-06, + "loss": 0.1163, + "step": 2638 + }, + { + "epoch": 0.22, + "grad_norm": 0.2554579823720119, + "learning_rate": 9.06122572620832e-06, + "loss": 0.061, + "step": 2639 + }, + { + "epoch": 0.22, + "grad_norm": 0.29633411946898025, + "learning_rate": 9.060429578970111e-06, + "loss": 0.0818, + "step": 2640 + }, + { + "epoch": 0.22, + "grad_norm": 0.3048364763696953, + "learning_rate": 9.059633129287456e-06, + "loss": 0.0904, + "step": 2641 + }, + { + "epoch": 0.22, + "grad_norm": 0.280773975083365, + "learning_rate": 9.058836377219676e-06, + "loss": 0.0766, + "step": 2642 + }, + { + "epoch": 0.22, + "grad_norm": 0.30711115012666435, + "learning_rate": 9.058039322826115e-06, + "loss": 0.0536, + "step": 2643 + }, + { + "epoch": 0.22, + "grad_norm": 0.2757579844679646, + "learning_rate": 9.057241966166148e-06, + "loss": 0.0664, + "step": 2644 + }, + { + "epoch": 0.22, + "grad_norm": 0.29503626421763296, + "learning_rate": 9.056444307299166e-06, + "loss": 0.0747, + "step": 2645 + }, + { + "epoch": 0.22, + "grad_norm": 0.4320680644659813, + "learning_rate": 9.05564634628458e-06, + "loss": 0.0937, + "step": 2646 + }, + { + "epoch": 0.22, + "grad_norm": 0.252929903968619, + "learning_rate": 9.054848083181827e-06, + "loss": 0.0785, + "step": 2647 + }, + { + "epoch": 0.22, + "grad_norm": 0.645218675775852, + "learning_rate": 9.054049518050367e-06, + "loss": 0.1047, + "step": 2648 + }, + { + "epoch": 0.22, + "grad_norm": 0.3629642421428972, + "learning_rate": 9.053250650949684e-06, + "loss": 0.0951, + "step": 2649 + }, + { + "epoch": 0.22, + "grad_norm": 0.3034759792994803, + "learning_rate": 9.05245148193928e-06, + "loss": 0.0848, + "step": 2650 + }, + { + "epoch": 0.22, + "grad_norm": 0.33671694859370266, + "learning_rate": 9.051652011078681e-06, + "loss": 0.0816, + "step": 2651 + }, + { + "epoch": 0.22, + "grad_norm": 0.23614811979000472, + "learning_rate": 9.05085223842744e-06, + "loss": 0.0517, + "step": 2652 + }, + { + "epoch": 0.22, + "grad_norm": 0.37870939908090134, + "learning_rate": 9.050052164045125e-06, + "loss": 0.0741, + "step": 2653 + }, + { + "epoch": 0.22, + "grad_norm": 0.7553547903467922, + "learning_rate": 9.049251787991333e-06, + "loss": 0.0717, + "step": 2654 + }, + { + "epoch": 0.22, + "grad_norm": 0.35198760650404876, + "learning_rate": 9.048451110325678e-06, + "loss": 0.0724, + "step": 2655 + }, + { + "epoch": 0.22, + "grad_norm": 0.4936877660182769, + "learning_rate": 9.0476501311078e-06, + "loss": 0.1379, + "step": 2656 + }, + { + "epoch": 0.22, + "grad_norm": 0.44334452320078355, + "learning_rate": 9.046848850397361e-06, + "loss": 0.0914, + "step": 2657 + }, + { + "epoch": 0.22, + "grad_norm": 0.4532973311527628, + "learning_rate": 9.046047268254045e-06, + "loss": 0.074, + "step": 2658 + }, + { + "epoch": 0.22, + "grad_norm": 0.42104868375019777, + "learning_rate": 9.04524538473756e-06, + "loss": 0.0851, + "step": 2659 + }, + { + "epoch": 0.22, + "grad_norm": 0.21071206220743333, + "learning_rate": 9.044443199907633e-06, + "loss": 0.0621, + "step": 2660 + }, + { + "epoch": 0.22, + "grad_norm": 0.28122218849144776, + "learning_rate": 9.043640713824013e-06, + "loss": 0.0655, + "step": 2661 + }, + { + "epoch": 0.22, + "grad_norm": 0.39990627773414417, + "learning_rate": 9.04283792654648e-06, + "loss": 0.1149, + "step": 2662 + }, + { + "epoch": 0.22, + "grad_norm": 0.4726453379847842, + "learning_rate": 9.042034838134826e-06, + "loss": 0.1078, + "step": 2663 + }, + { + "epoch": 0.22, + "grad_norm": 0.509778545882967, + "learning_rate": 9.04123144864887e-06, + "loss": 0.1044, + "step": 2664 + }, + { + "epoch": 0.22, + "grad_norm": 0.4869428672743166, + "learning_rate": 9.040427758148455e-06, + "loss": 0.0908, + "step": 2665 + }, + { + "epoch": 0.22, + "grad_norm": 0.2764333055082348, + "learning_rate": 9.039623766693441e-06, + "loss": 0.0649, + "step": 2666 + }, + { + "epoch": 0.22, + "grad_norm": 0.3811368039556193, + "learning_rate": 9.038819474343718e-06, + "loss": 0.124, + "step": 2667 + }, + { + "epoch": 0.22, + "grad_norm": 0.5249246735671909, + "learning_rate": 9.038014881159194e-06, + "loss": 0.1068, + "step": 2668 + }, + { + "epoch": 0.22, + "grad_norm": 0.3905463546652125, + "learning_rate": 9.037209987199797e-06, + "loss": 0.0873, + "step": 2669 + }, + { + "epoch": 0.22, + "grad_norm": 1.1040854942502358, + "learning_rate": 9.036404792525482e-06, + "loss": 0.1815, + "step": 2670 + }, + { + "epoch": 0.23, + "grad_norm": 0.32323871665779386, + "learning_rate": 9.035599297196222e-06, + "loss": 0.1055, + "step": 2671 + }, + { + "epoch": 0.23, + "grad_norm": 0.262962789347326, + "learning_rate": 9.03479350127202e-06, + "loss": 0.0772, + "step": 2672 + }, + { + "epoch": 0.23, + "grad_norm": 0.4488840405858109, + "learning_rate": 9.033987404812893e-06, + "loss": 0.12, + "step": 2673 + }, + { + "epoch": 0.23, + "grad_norm": 0.45378582246556004, + "learning_rate": 9.033181007878884e-06, + "loss": 0.0798, + "step": 2674 + }, + { + "epoch": 0.23, + "grad_norm": 0.5040432210327149, + "learning_rate": 9.032374310530058e-06, + "loss": 0.1244, + "step": 2675 + }, + { + "epoch": 0.23, + "grad_norm": 0.2776149365654268, + "learning_rate": 9.031567312826504e-06, + "loss": 0.064, + "step": 2676 + }, + { + "epoch": 0.23, + "grad_norm": 0.4519826003045925, + "learning_rate": 9.030760014828332e-06, + "loss": 0.1114, + "step": 2677 + }, + { + "epoch": 0.23, + "grad_norm": 0.5554392327687911, + "learning_rate": 9.029952416595671e-06, + "loss": 0.1312, + "step": 2678 + }, + { + "epoch": 0.23, + "grad_norm": 0.5337728741387994, + "learning_rate": 9.029144518188679e-06, + "loss": 0.1304, + "step": 2679 + }, + { + "epoch": 0.23, + "grad_norm": 0.30854457818498365, + "learning_rate": 9.028336319667532e-06, + "loss": 0.093, + "step": 2680 + }, + { + "epoch": 0.23, + "grad_norm": 0.4355621896484041, + "learning_rate": 9.027527821092428e-06, + "loss": 0.0791, + "step": 2681 + }, + { + "epoch": 0.23, + "grad_norm": 0.3658283551907798, + "learning_rate": 9.026719022523592e-06, + "loss": 0.0874, + "step": 2682 + }, + { + "epoch": 0.23, + "grad_norm": 0.30816237347574826, + "learning_rate": 9.025909924021263e-06, + "loss": 0.051, + "step": 2683 + }, + { + "epoch": 0.23, + "grad_norm": 0.2644213688472279, + "learning_rate": 9.025100525645712e-06, + "loss": 0.0661, + "step": 2684 + }, + { + "epoch": 0.23, + "grad_norm": 0.33253195116671613, + "learning_rate": 9.024290827457225e-06, + "loss": 0.072, + "step": 2685 + }, + { + "epoch": 0.23, + "grad_norm": 0.35030347374023424, + "learning_rate": 9.023480829516114e-06, + "loss": 0.1091, + "step": 2686 + }, + { + "epoch": 0.23, + "grad_norm": 0.3215639519320042, + "learning_rate": 9.022670531882712e-06, + "loss": 0.0816, + "step": 2687 + }, + { + "epoch": 0.23, + "grad_norm": 0.597005491978897, + "learning_rate": 9.021859934617376e-06, + "loss": 0.0999, + "step": 2688 + }, + { + "epoch": 0.23, + "grad_norm": 0.412522460686854, + "learning_rate": 9.021049037780483e-06, + "loss": 0.0651, + "step": 2689 + }, + { + "epoch": 0.23, + "grad_norm": 0.2967607464327263, + "learning_rate": 9.020237841432431e-06, + "loss": 0.0745, + "step": 2690 + }, + { + "epoch": 0.23, + "grad_norm": 0.3353034253137983, + "learning_rate": 9.019426345633649e-06, + "loss": 0.0797, + "step": 2691 + }, + { + "epoch": 0.23, + "grad_norm": 0.45998628652792406, + "learning_rate": 9.018614550444575e-06, + "loss": 0.1235, + "step": 2692 + }, + { + "epoch": 0.23, + "grad_norm": 0.30943624871926556, + "learning_rate": 9.01780245592568e-06, + "loss": 0.0709, + "step": 2693 + }, + { + "epoch": 0.23, + "grad_norm": 0.22998853597353697, + "learning_rate": 9.016990062137452e-06, + "loss": 0.0429, + "step": 2694 + }, + { + "epoch": 0.23, + "grad_norm": 0.5391389714112024, + "learning_rate": 9.016177369140405e-06, + "loss": 0.1276, + "step": 2695 + }, + { + "epoch": 0.23, + "grad_norm": 1.0545453687355895, + "learning_rate": 9.015364376995071e-06, + "loss": 0.1189, + "step": 2696 + }, + { + "epoch": 0.23, + "grad_norm": 0.5624534579982434, + "learning_rate": 9.014551085762006e-06, + "loss": 0.1066, + "step": 2697 + }, + { + "epoch": 0.23, + "grad_norm": 0.43274477449888105, + "learning_rate": 9.013737495501788e-06, + "loss": 0.1229, + "step": 2698 + }, + { + "epoch": 0.23, + "grad_norm": 0.3110831120959545, + "learning_rate": 9.012923606275024e-06, + "loss": 0.1031, + "step": 2699 + }, + { + "epoch": 0.23, + "grad_norm": 0.37218239130568853, + "learning_rate": 9.01210941814233e-06, + "loss": 0.0663, + "step": 2700 + }, + { + "epoch": 0.23, + "grad_norm": 0.4026313358909027, + "learning_rate": 9.011294931164354e-06, + "loss": 0.1389, + "step": 2701 + }, + { + "epoch": 0.23, + "grad_norm": 0.4559221185031558, + "learning_rate": 9.010480145401765e-06, + "loss": 0.1092, + "step": 2702 + }, + { + "epoch": 0.23, + "grad_norm": 0.5446599409991894, + "learning_rate": 9.009665060915252e-06, + "loss": 0.1224, + "step": 2703 + }, + { + "epoch": 0.23, + "grad_norm": 0.4158705364810137, + "learning_rate": 9.008849677765527e-06, + "loss": 0.076, + "step": 2704 + }, + { + "epoch": 0.23, + "grad_norm": 0.4381717523803752, + "learning_rate": 9.008033996013326e-06, + "loss": 0.0788, + "step": 2705 + }, + { + "epoch": 0.23, + "grad_norm": 0.31993527647400266, + "learning_rate": 9.007218015719402e-06, + "loss": 0.0653, + "step": 2706 + }, + { + "epoch": 0.23, + "grad_norm": 0.3174546296094006, + "learning_rate": 9.006401736944538e-06, + "loss": 0.082, + "step": 2707 + }, + { + "epoch": 0.23, + "grad_norm": 0.28381480585491237, + "learning_rate": 9.005585159749533e-06, + "loss": 0.0984, + "step": 2708 + }, + { + "epoch": 0.23, + "grad_norm": 0.5372730783968656, + "learning_rate": 9.004768284195212e-06, + "loss": 0.1011, + "step": 2709 + }, + { + "epoch": 0.23, + "grad_norm": 0.3294326952667977, + "learning_rate": 9.003951110342418e-06, + "loss": 0.0906, + "step": 2710 + }, + { + "epoch": 0.23, + "grad_norm": 0.21896691538254417, + "learning_rate": 9.003133638252022e-06, + "loss": 0.059, + "step": 2711 + }, + { + "epoch": 0.23, + "grad_norm": 0.3326999807001258, + "learning_rate": 9.002315867984912e-06, + "loss": 0.0787, + "step": 2712 + }, + { + "epoch": 0.23, + "grad_norm": 0.40439843280643584, + "learning_rate": 9.001497799602001e-06, + "loss": 0.0574, + "step": 2713 + }, + { + "epoch": 0.23, + "grad_norm": 0.4033431963451583, + "learning_rate": 9.000679433164224e-06, + "loss": 0.0951, + "step": 2714 + }, + { + "epoch": 0.23, + "grad_norm": 0.3559040485871925, + "learning_rate": 8.999860768732536e-06, + "loss": 0.0709, + "step": 2715 + }, + { + "epoch": 0.23, + "grad_norm": 0.3061154405179683, + "learning_rate": 8.999041806367919e-06, + "loss": 0.0756, + "step": 2716 + }, + { + "epoch": 0.23, + "grad_norm": 0.4525134213262327, + "learning_rate": 8.99822254613137e-06, + "loss": 0.104, + "step": 2717 + }, + { + "epoch": 0.23, + "grad_norm": 0.30507556704077016, + "learning_rate": 8.997402988083916e-06, + "loss": 0.0988, + "step": 2718 + }, + { + "epoch": 0.23, + "grad_norm": 0.345364014966951, + "learning_rate": 8.9965831322866e-06, + "loss": 0.0944, + "step": 2719 + }, + { + "epoch": 0.23, + "grad_norm": 0.5671088035707671, + "learning_rate": 8.995762978800492e-06, + "loss": 0.1199, + "step": 2720 + }, + { + "epoch": 0.23, + "grad_norm": 0.27428010282051657, + "learning_rate": 8.994942527686677e-06, + "loss": 0.0731, + "step": 2721 + }, + { + "epoch": 0.23, + "grad_norm": 0.3704596620246462, + "learning_rate": 8.994121779006273e-06, + "loss": 0.0785, + "step": 2722 + }, + { + "epoch": 0.23, + "grad_norm": 0.33800915646533625, + "learning_rate": 8.99330073282041e-06, + "loss": 0.0955, + "step": 2723 + }, + { + "epoch": 0.23, + "grad_norm": 0.4920755607101958, + "learning_rate": 8.992479389190247e-06, + "loss": 0.095, + "step": 2724 + }, + { + "epoch": 0.23, + "grad_norm": 0.25387318046778673, + "learning_rate": 8.99165774817696e-06, + "loss": 0.0895, + "step": 2725 + }, + { + "epoch": 0.23, + "grad_norm": 0.3595736401784846, + "learning_rate": 8.990835809841751e-06, + "loss": 0.1049, + "step": 2726 + }, + { + "epoch": 0.23, + "grad_norm": 0.7542700445214279, + "learning_rate": 8.990013574245843e-06, + "loss": 0.1247, + "step": 2727 + }, + { + "epoch": 0.23, + "grad_norm": 0.3395562129942145, + "learning_rate": 8.98919104145048e-06, + "loss": 0.0727, + "step": 2728 + }, + { + "epoch": 0.23, + "grad_norm": 0.3963983887916889, + "learning_rate": 8.98836821151693e-06, + "loss": 0.1082, + "step": 2729 + }, + { + "epoch": 0.23, + "grad_norm": 0.36094206402572404, + "learning_rate": 8.987545084506481e-06, + "loss": 0.0759, + "step": 2730 + }, + { + "epoch": 0.23, + "grad_norm": 0.2660822618789366, + "learning_rate": 8.986721660480446e-06, + "loss": 0.0999, + "step": 2731 + }, + { + "epoch": 0.23, + "grad_norm": 0.23723519517975003, + "learning_rate": 8.985897939500156e-06, + "loss": 0.0576, + "step": 2732 + }, + { + "epoch": 0.23, + "grad_norm": 0.3518410306265884, + "learning_rate": 8.98507392162697e-06, + "loss": 0.0905, + "step": 2733 + }, + { + "epoch": 0.23, + "grad_norm": 0.5090933764933581, + "learning_rate": 8.984249606922261e-06, + "loss": 0.1297, + "step": 2734 + }, + { + "epoch": 0.23, + "grad_norm": 0.47058548146874457, + "learning_rate": 8.983424995447433e-06, + "loss": 0.0966, + "step": 2735 + }, + { + "epoch": 0.23, + "grad_norm": 0.37973954061021137, + "learning_rate": 8.982600087263907e-06, + "loss": 0.1368, + "step": 2736 + }, + { + "epoch": 0.23, + "grad_norm": 0.2521395228296642, + "learning_rate": 8.981774882433124e-06, + "loss": 0.0788, + "step": 2737 + }, + { + "epoch": 0.23, + "grad_norm": 0.3105013537155436, + "learning_rate": 8.980949381016553e-06, + "loss": 0.1077, + "step": 2738 + }, + { + "epoch": 0.23, + "grad_norm": 0.33258028085388047, + "learning_rate": 8.98012358307568e-06, + "loss": 0.0673, + "step": 2739 + }, + { + "epoch": 0.23, + "grad_norm": 0.30178482413869234, + "learning_rate": 8.979297488672019e-06, + "loss": 0.0741, + "step": 2740 + }, + { + "epoch": 0.23, + "grad_norm": 0.571455642226894, + "learning_rate": 8.9784710978671e-06, + "loss": 0.1515, + "step": 2741 + }, + { + "epoch": 0.23, + "grad_norm": 0.47980663204494933, + "learning_rate": 8.977644410722475e-06, + "loss": 0.0855, + "step": 2742 + }, + { + "epoch": 0.23, + "grad_norm": 0.19803699107565897, + "learning_rate": 8.976817427299724e-06, + "loss": 0.0723, + "step": 2743 + }, + { + "epoch": 0.23, + "grad_norm": 0.7116909736110781, + "learning_rate": 8.975990147660444e-06, + "loss": 0.1062, + "step": 2744 + }, + { + "epoch": 0.23, + "grad_norm": 0.3031909469027027, + "learning_rate": 8.975162571866255e-06, + "loss": 0.0826, + "step": 2745 + }, + { + "epoch": 0.23, + "grad_norm": 0.42399983097062166, + "learning_rate": 8.974334699978803e-06, + "loss": 0.0946, + "step": 2746 + }, + { + "epoch": 0.23, + "grad_norm": 0.6491029982640961, + "learning_rate": 8.973506532059749e-06, + "loss": 0.0727, + "step": 2747 + }, + { + "epoch": 0.23, + "grad_norm": 0.2149978682935139, + "learning_rate": 8.972678068170781e-06, + "loss": 0.0631, + "step": 2748 + }, + { + "epoch": 0.23, + "grad_norm": 0.450109538037787, + "learning_rate": 8.971849308373608e-06, + "loss": 0.1045, + "step": 2749 + }, + { + "epoch": 0.23, + "grad_norm": 0.331023447233103, + "learning_rate": 8.971020252729961e-06, + "loss": 0.1064, + "step": 2750 + }, + { + "epoch": 0.23, + "grad_norm": 0.20582904841185387, + "learning_rate": 8.970190901301595e-06, + "loss": 0.0605, + "step": 2751 + }, + { + "epoch": 0.23, + "grad_norm": 0.24943963396730862, + "learning_rate": 8.96936125415028e-06, + "loss": 0.0661, + "step": 2752 + }, + { + "epoch": 0.23, + "grad_norm": 0.3130314373925038, + "learning_rate": 8.968531311337814e-06, + "loss": 0.0926, + "step": 2753 + }, + { + "epoch": 0.23, + "grad_norm": 0.39420445098373563, + "learning_rate": 8.96770107292602e-06, + "loss": 0.0986, + "step": 2754 + }, + { + "epoch": 0.23, + "grad_norm": 0.4103619106541245, + "learning_rate": 8.966870538976736e-06, + "loss": 0.1, + "step": 2755 + }, + { + "epoch": 0.23, + "grad_norm": 0.3719851568235137, + "learning_rate": 8.966039709551826e-06, + "loss": 0.0982, + "step": 2756 + }, + { + "epoch": 0.23, + "grad_norm": 0.2953239385620919, + "learning_rate": 8.965208584713173e-06, + "loss": 0.0757, + "step": 2757 + }, + { + "epoch": 0.23, + "grad_norm": 0.3406903095112857, + "learning_rate": 8.964377164522686e-06, + "loss": 0.0795, + "step": 2758 + }, + { + "epoch": 0.23, + "grad_norm": 0.3338676647667838, + "learning_rate": 8.963545449042294e-06, + "loss": 0.0789, + "step": 2759 + }, + { + "epoch": 0.23, + "grad_norm": 0.30406041117699845, + "learning_rate": 8.962713438333948e-06, + "loss": 0.0602, + "step": 2760 + }, + { + "epoch": 0.23, + "grad_norm": 0.42905916956599444, + "learning_rate": 8.96188113245962e-06, + "loss": 0.109, + "step": 2761 + }, + { + "epoch": 0.23, + "grad_norm": 0.7120739398509142, + "learning_rate": 8.961048531481306e-06, + "loss": 0.124, + "step": 2762 + }, + { + "epoch": 0.23, + "grad_norm": 0.2887606799930288, + "learning_rate": 8.960215635461023e-06, + "loss": 0.0941, + "step": 2763 + }, + { + "epoch": 0.23, + "grad_norm": 0.2647619570069839, + "learning_rate": 8.959382444460808e-06, + "loss": 0.05, + "step": 2764 + }, + { + "epoch": 0.23, + "grad_norm": 0.2436400523389256, + "learning_rate": 8.958548958542726e-06, + "loss": 0.042, + "step": 2765 + }, + { + "epoch": 0.23, + "grad_norm": 0.34521278704333364, + "learning_rate": 8.957715177768856e-06, + "loss": 0.0946, + "step": 2766 + }, + { + "epoch": 0.23, + "grad_norm": 0.9392575631150798, + "learning_rate": 8.956881102201303e-06, + "loss": 0.1483, + "step": 2767 + }, + { + "epoch": 0.23, + "grad_norm": 0.40481432764914393, + "learning_rate": 8.956046731902196e-06, + "loss": 0.1014, + "step": 2768 + }, + { + "epoch": 0.23, + "grad_norm": 0.2985600816022739, + "learning_rate": 8.955212066933683e-06, + "loss": 0.0796, + "step": 2769 + }, + { + "epoch": 0.23, + "grad_norm": 0.24827947281851215, + "learning_rate": 8.954377107357935e-06, + "loss": 0.0674, + "step": 2770 + }, + { + "epoch": 0.23, + "grad_norm": 0.6783140624660977, + "learning_rate": 8.953541853237142e-06, + "loss": 0.1095, + "step": 2771 + }, + { + "epoch": 0.23, + "grad_norm": 0.41651437942943265, + "learning_rate": 8.952706304633522e-06, + "loss": 0.109, + "step": 2772 + }, + { + "epoch": 0.23, + "grad_norm": 0.4421396954934899, + "learning_rate": 8.951870461609308e-06, + "loss": 0.1372, + "step": 2773 + }, + { + "epoch": 0.23, + "grad_norm": 0.21061328471940802, + "learning_rate": 8.951034324226764e-06, + "loss": 0.0542, + "step": 2774 + }, + { + "epoch": 0.23, + "grad_norm": 0.4595805091546543, + "learning_rate": 8.950197892548166e-06, + "loss": 0.1173, + "step": 2775 + }, + { + "epoch": 0.23, + "grad_norm": 0.3471857963009139, + "learning_rate": 8.949361166635817e-06, + "loss": 0.0595, + "step": 2776 + }, + { + "epoch": 0.23, + "grad_norm": 0.2798436353232557, + "learning_rate": 8.948524146552043e-06, + "loss": 0.0673, + "step": 2777 + }, + { + "epoch": 0.23, + "grad_norm": 0.24665762314253364, + "learning_rate": 8.947686832359186e-06, + "loss": 0.0853, + "step": 2778 + }, + { + "epoch": 0.23, + "grad_norm": 0.30843007646134885, + "learning_rate": 8.946849224119618e-06, + "loss": 0.0879, + "step": 2779 + }, + { + "epoch": 0.23, + "grad_norm": 0.4082239758110478, + "learning_rate": 8.946011321895727e-06, + "loss": 0.1236, + "step": 2780 + }, + { + "epoch": 0.23, + "grad_norm": 0.3782139103338437, + "learning_rate": 8.945173125749927e-06, + "loss": 0.1002, + "step": 2781 + }, + { + "epoch": 0.23, + "grad_norm": 0.4061858643167637, + "learning_rate": 8.94433463574465e-06, + "loss": 0.107, + "step": 2782 + }, + { + "epoch": 0.23, + "grad_norm": 0.23856184381816875, + "learning_rate": 8.943495851942352e-06, + "loss": 0.0799, + "step": 2783 + }, + { + "epoch": 0.23, + "grad_norm": 0.42964048314409087, + "learning_rate": 8.94265677440551e-06, + "loss": 0.0805, + "step": 2784 + }, + { + "epoch": 0.23, + "grad_norm": 0.3520249798638243, + "learning_rate": 8.941817403196624e-06, + "loss": 0.0751, + "step": 2785 + }, + { + "epoch": 0.23, + "grad_norm": 0.5090531036333168, + "learning_rate": 8.940977738378214e-06, + "loss": 0.118, + "step": 2786 + }, + { + "epoch": 0.23, + "grad_norm": 0.5029884313094897, + "learning_rate": 8.940137780012825e-06, + "loss": 0.1337, + "step": 2787 + }, + { + "epoch": 0.23, + "grad_norm": 0.2897044651718947, + "learning_rate": 8.939297528163021e-06, + "loss": 0.0843, + "step": 2788 + }, + { + "epoch": 0.24, + "grad_norm": 0.49933053200983224, + "learning_rate": 8.93845698289139e-06, + "loss": 0.114, + "step": 2789 + }, + { + "epoch": 0.24, + "grad_norm": 0.2294182185806812, + "learning_rate": 8.937616144260537e-06, + "loss": 0.0725, + "step": 2790 + }, + { + "epoch": 0.24, + "grad_norm": 0.4258161410780781, + "learning_rate": 8.936775012333097e-06, + "loss": 0.1214, + "step": 2791 + }, + { + "epoch": 0.24, + "grad_norm": 0.32472605810986227, + "learning_rate": 8.935933587171722e-06, + "loss": 0.115, + "step": 2792 + }, + { + "epoch": 0.24, + "grad_norm": 0.47211953434842624, + "learning_rate": 8.935091868839083e-06, + "loss": 0.1195, + "step": 2793 + }, + { + "epoch": 0.24, + "grad_norm": 0.255466045805625, + "learning_rate": 8.934249857397878e-06, + "loss": 0.0702, + "step": 2794 + }, + { + "epoch": 0.24, + "grad_norm": 0.3103380620273569, + "learning_rate": 8.933407552910825e-06, + "loss": 0.0755, + "step": 2795 + }, + { + "epoch": 0.24, + "grad_norm": 0.3137532550394677, + "learning_rate": 8.932564955440665e-06, + "loss": 0.0908, + "step": 2796 + }, + { + "epoch": 0.24, + "grad_norm": 0.23521704931762324, + "learning_rate": 8.931722065050156e-06, + "loss": 0.0845, + "step": 2797 + }, + { + "epoch": 0.24, + "grad_norm": 0.25882444005046445, + "learning_rate": 8.930878881802085e-06, + "loss": 0.0537, + "step": 2798 + }, + { + "epoch": 0.24, + "grad_norm": 0.39272613559378483, + "learning_rate": 8.930035405759256e-06, + "loss": 0.1022, + "step": 2799 + }, + { + "epoch": 0.24, + "grad_norm": 0.40178841740603577, + "learning_rate": 8.929191636984497e-06, + "loss": 0.0822, + "step": 2800 + }, + { + "epoch": 0.24, + "grad_norm": 0.30141877929050653, + "learning_rate": 8.928347575540655e-06, + "loss": 0.0596, + "step": 2801 + }, + { + "epoch": 0.24, + "grad_norm": 0.3862418193709303, + "learning_rate": 8.927503221490598e-06, + "loss": 0.096, + "step": 2802 + }, + { + "epoch": 0.24, + "grad_norm": 0.3681834253637028, + "learning_rate": 8.926658574897225e-06, + "loss": 0.1129, + "step": 2803 + }, + { + "epoch": 0.24, + "grad_norm": 0.3268792490591226, + "learning_rate": 8.925813635823446e-06, + "loss": 0.0669, + "step": 2804 + }, + { + "epoch": 0.24, + "grad_norm": 0.23037313144027546, + "learning_rate": 8.9249684043322e-06, + "loss": 0.0573, + "step": 2805 + }, + { + "epoch": 0.24, + "grad_norm": 0.26532749631270036, + "learning_rate": 8.924122880486439e-06, + "loss": 0.0677, + "step": 2806 + }, + { + "epoch": 0.24, + "grad_norm": 0.31522562941668775, + "learning_rate": 8.923277064349147e-06, + "loss": 0.0602, + "step": 2807 + }, + { + "epoch": 0.24, + "grad_norm": 0.5229278502205444, + "learning_rate": 8.922430955983326e-06, + "loss": 0.1347, + "step": 2808 + }, + { + "epoch": 0.24, + "grad_norm": 0.37165257221447645, + "learning_rate": 8.921584555451997e-06, + "loss": 0.1056, + "step": 2809 + }, + { + "epoch": 0.24, + "grad_norm": 0.4678765455875168, + "learning_rate": 8.920737862818205e-06, + "loss": 0.0904, + "step": 2810 + }, + { + "epoch": 0.24, + "grad_norm": 0.3798818559732239, + "learning_rate": 8.919890878145016e-06, + "loss": 0.1146, + "step": 2811 + }, + { + "epoch": 0.24, + "grad_norm": 0.6224690370047794, + "learning_rate": 8.91904360149552e-06, + "loss": 0.1181, + "step": 2812 + }, + { + "epoch": 0.24, + "grad_norm": 0.4084461172779903, + "learning_rate": 8.918196032932827e-06, + "loss": 0.094, + "step": 2813 + }, + { + "epoch": 0.24, + "grad_norm": 0.41239747915839853, + "learning_rate": 8.917348172520069e-06, + "loss": 0.1097, + "step": 2814 + }, + { + "epoch": 0.24, + "grad_norm": 0.36825633041050204, + "learning_rate": 8.916500020320398e-06, + "loss": 0.0732, + "step": 2815 + }, + { + "epoch": 0.24, + "grad_norm": 0.5237830106558871, + "learning_rate": 8.91565157639699e-06, + "loss": 0.1306, + "step": 2816 + }, + { + "epoch": 0.24, + "grad_norm": 0.350257487694403, + "learning_rate": 8.914802840813044e-06, + "loss": 0.0916, + "step": 2817 + }, + { + "epoch": 0.24, + "grad_norm": 0.27900696208553033, + "learning_rate": 8.913953813631775e-06, + "loss": 0.0759, + "step": 2818 + }, + { + "epoch": 0.24, + "grad_norm": 0.3686581800894469, + "learning_rate": 8.913104494916425e-06, + "loss": 0.0577, + "step": 2819 + }, + { + "epoch": 0.24, + "grad_norm": 0.618928945166433, + "learning_rate": 8.912254884730259e-06, + "loss": 0.1046, + "step": 2820 + }, + { + "epoch": 0.24, + "grad_norm": 0.23629178657471436, + "learning_rate": 8.911404983136556e-06, + "loss": 0.0679, + "step": 2821 + }, + { + "epoch": 0.24, + "grad_norm": 0.27315518329800387, + "learning_rate": 8.910554790198627e-06, + "loss": 0.0754, + "step": 2822 + }, + { + "epoch": 0.24, + "grad_norm": 0.5289553100616763, + "learning_rate": 8.909704305979795e-06, + "loss": 0.1315, + "step": 2823 + }, + { + "epoch": 0.24, + "grad_norm": 0.3553910233472979, + "learning_rate": 8.908853530543408e-06, + "loss": 0.11, + "step": 2824 + }, + { + "epoch": 0.24, + "grad_norm": 0.3311007901480106, + "learning_rate": 8.908002463952843e-06, + "loss": 0.0824, + "step": 2825 + }, + { + "epoch": 0.24, + "grad_norm": 0.32371292554120545, + "learning_rate": 8.907151106271487e-06, + "loss": 0.0849, + "step": 2826 + }, + { + "epoch": 0.24, + "grad_norm": 0.5223889497269055, + "learning_rate": 8.906299457562757e-06, + "loss": 0.1099, + "step": 2827 + }, + { + "epoch": 0.24, + "grad_norm": 0.23312336567390182, + "learning_rate": 8.905447517890087e-06, + "loss": 0.0428, + "step": 2828 + }, + { + "epoch": 0.24, + "grad_norm": 0.37546354739097065, + "learning_rate": 8.904595287316936e-06, + "loss": 0.1063, + "step": 2829 + }, + { + "epoch": 0.24, + "grad_norm": 0.37472073223353686, + "learning_rate": 8.90374276590678e-06, + "loss": 0.1065, + "step": 2830 + }, + { + "epoch": 0.24, + "grad_norm": 0.6325258861975911, + "learning_rate": 8.902889953723122e-06, + "loss": 0.143, + "step": 2831 + }, + { + "epoch": 0.24, + "grad_norm": 0.7134435322607835, + "learning_rate": 8.902036850829485e-06, + "loss": 0.1385, + "step": 2832 + }, + { + "epoch": 0.24, + "grad_norm": 0.35354017725524867, + "learning_rate": 8.901183457289413e-06, + "loss": 0.0886, + "step": 2833 + }, + { + "epoch": 0.24, + "grad_norm": 0.5151913959789703, + "learning_rate": 8.900329773166471e-06, + "loss": 0.13, + "step": 2834 + }, + { + "epoch": 0.24, + "grad_norm": 0.14351625360780815, + "learning_rate": 8.899475798524245e-06, + "loss": 0.0324, + "step": 2835 + }, + { + "epoch": 0.24, + "grad_norm": 0.22916388682678246, + "learning_rate": 8.898621533426346e-06, + "loss": 0.0331, + "step": 2836 + }, + { + "epoch": 0.24, + "grad_norm": 0.4312486039375231, + "learning_rate": 8.897766977936404e-06, + "loss": 0.1088, + "step": 2837 + }, + { + "epoch": 0.24, + "grad_norm": 0.5615662450116726, + "learning_rate": 8.896912132118072e-06, + "loss": 0.1114, + "step": 2838 + }, + { + "epoch": 0.24, + "grad_norm": 0.39168859423279895, + "learning_rate": 8.896056996035024e-06, + "loss": 0.1012, + "step": 2839 + }, + { + "epoch": 0.24, + "grad_norm": 0.4427772288729868, + "learning_rate": 8.895201569750952e-06, + "loss": 0.1285, + "step": 2840 + }, + { + "epoch": 0.24, + "grad_norm": 0.5172872940523813, + "learning_rate": 8.894345853329577e-06, + "loss": 0.066, + "step": 2841 + }, + { + "epoch": 0.24, + "grad_norm": 0.4320043699356264, + "learning_rate": 8.893489846834637e-06, + "loss": 0.1408, + "step": 2842 + }, + { + "epoch": 0.24, + "grad_norm": 0.21826385738217519, + "learning_rate": 8.892633550329892e-06, + "loss": 0.0451, + "step": 2843 + }, + { + "epoch": 0.24, + "grad_norm": 0.30536975239453507, + "learning_rate": 8.891776963879124e-06, + "loss": 0.0987, + "step": 2844 + }, + { + "epoch": 0.24, + "grad_norm": 0.41305233935395874, + "learning_rate": 8.890920087546135e-06, + "loss": 0.1219, + "step": 2845 + }, + { + "epoch": 0.24, + "grad_norm": 0.29496752962675066, + "learning_rate": 8.890062921394753e-06, + "loss": 0.0659, + "step": 2846 + }, + { + "epoch": 0.24, + "grad_norm": 0.27501347898561496, + "learning_rate": 8.889205465488823e-06, + "loss": 0.0986, + "step": 2847 + }, + { + "epoch": 0.24, + "grad_norm": 0.33111899225956937, + "learning_rate": 8.888347719892214e-06, + "loss": 0.0849, + "step": 2848 + }, + { + "epoch": 0.24, + "grad_norm": 0.27545672599575527, + "learning_rate": 8.887489684668816e-06, + "loss": 0.0538, + "step": 2849 + }, + { + "epoch": 0.24, + "grad_norm": 0.25472040155819675, + "learning_rate": 8.886631359882538e-06, + "loss": 0.0735, + "step": 2850 + }, + { + "epoch": 0.24, + "grad_norm": 0.37903050561502716, + "learning_rate": 8.885772745597317e-06, + "loss": 0.0884, + "step": 2851 + }, + { + "epoch": 0.24, + "grad_norm": 0.5115405883212079, + "learning_rate": 8.884913841877106e-06, + "loss": 0.0863, + "step": 2852 + }, + { + "epoch": 0.24, + "grad_norm": 0.40063681772782056, + "learning_rate": 8.884054648785878e-06, + "loss": 0.0942, + "step": 2853 + }, + { + "epoch": 0.24, + "grad_norm": 0.41657987661935486, + "learning_rate": 8.883195166387635e-06, + "loss": 0.0624, + "step": 2854 + }, + { + "epoch": 0.24, + "grad_norm": 0.43316848546427505, + "learning_rate": 8.882335394746395e-06, + "loss": 0.1244, + "step": 2855 + }, + { + "epoch": 0.24, + "grad_norm": 0.3297483499339481, + "learning_rate": 8.881475333926198e-06, + "loss": 0.1023, + "step": 2856 + }, + { + "epoch": 0.24, + "grad_norm": 0.279182412492037, + "learning_rate": 8.880614983991106e-06, + "loss": 0.0763, + "step": 2857 + }, + { + "epoch": 0.24, + "grad_norm": 0.3678743158692316, + "learning_rate": 8.879754345005207e-06, + "loss": 0.1089, + "step": 2858 + }, + { + "epoch": 0.24, + "grad_norm": 0.3975343197899079, + "learning_rate": 8.8788934170326e-06, + "loss": 0.1185, + "step": 2859 + }, + { + "epoch": 0.24, + "grad_norm": 0.3612735218594484, + "learning_rate": 8.878032200137417e-06, + "loss": 0.0608, + "step": 2860 + }, + { + "epoch": 0.24, + "grad_norm": 0.36003023521498584, + "learning_rate": 8.877170694383803e-06, + "loss": 0.0891, + "step": 2861 + }, + { + "epoch": 0.24, + "grad_norm": 0.4427646180924044, + "learning_rate": 8.876308899835931e-06, + "loss": 0.1154, + "step": 2862 + }, + { + "epoch": 0.24, + "grad_norm": 0.37212812679235696, + "learning_rate": 8.875446816557988e-06, + "loss": 0.0689, + "step": 2863 + }, + { + "epoch": 0.24, + "grad_norm": 0.23265850182196846, + "learning_rate": 8.874584444614193e-06, + "loss": 0.0649, + "step": 2864 + }, + { + "epoch": 0.24, + "grad_norm": 0.26637280780579425, + "learning_rate": 8.873721784068775e-06, + "loss": 0.0493, + "step": 2865 + }, + { + "epoch": 0.24, + "grad_norm": 0.2719625337348453, + "learning_rate": 8.872858834985995e-06, + "loss": 0.073, + "step": 2866 + }, + { + "epoch": 0.24, + "grad_norm": 0.3883071654519712, + "learning_rate": 8.871995597430128e-06, + "loss": 0.0679, + "step": 2867 + }, + { + "epoch": 0.24, + "grad_norm": 0.273445482848648, + "learning_rate": 8.871132071465472e-06, + "loss": 0.0611, + "step": 2868 + }, + { + "epoch": 0.24, + "grad_norm": 0.14251544165414695, + "learning_rate": 8.870268257156346e-06, + "loss": 0.0356, + "step": 2869 + }, + { + "epoch": 0.24, + "grad_norm": 0.36282991457711605, + "learning_rate": 8.869404154567097e-06, + "loss": 0.0564, + "step": 2870 + }, + { + "epoch": 0.24, + "grad_norm": 0.4724302640022304, + "learning_rate": 8.868539763762085e-06, + "loss": 0.095, + "step": 2871 + }, + { + "epoch": 0.24, + "grad_norm": 0.40559956579385575, + "learning_rate": 8.867675084805696e-06, + "loss": 0.1012, + "step": 2872 + }, + { + "epoch": 0.24, + "grad_norm": 0.42567031255506615, + "learning_rate": 8.866810117762335e-06, + "loss": 0.1294, + "step": 2873 + }, + { + "epoch": 0.24, + "grad_norm": 0.30781394833221576, + "learning_rate": 8.865944862696431e-06, + "loss": 0.0545, + "step": 2874 + }, + { + "epoch": 0.24, + "grad_norm": 0.31404349108611535, + "learning_rate": 8.865079319672432e-06, + "loss": 0.0988, + "step": 2875 + }, + { + "epoch": 0.24, + "grad_norm": 0.3086156659255089, + "learning_rate": 8.864213488754808e-06, + "loss": 0.0929, + "step": 2876 + }, + { + "epoch": 0.24, + "grad_norm": 0.29320734567447476, + "learning_rate": 8.863347370008058e-06, + "loss": 0.0864, + "step": 2877 + }, + { + "epoch": 0.24, + "grad_norm": 0.3587082246279781, + "learning_rate": 8.862480963496687e-06, + "loss": 0.064, + "step": 2878 + }, + { + "epoch": 0.24, + "grad_norm": 0.39194698361953034, + "learning_rate": 8.861614269285233e-06, + "loss": 0.0586, + "step": 2879 + }, + { + "epoch": 0.24, + "grad_norm": 0.38667544477744226, + "learning_rate": 8.860747287438252e-06, + "loss": 0.1191, + "step": 2880 + }, + { + "epoch": 0.24, + "grad_norm": 0.2874327132378102, + "learning_rate": 8.859880018020323e-06, + "loss": 0.0775, + "step": 2881 + }, + { + "epoch": 0.24, + "grad_norm": 0.34855671160765467, + "learning_rate": 8.859012461096045e-06, + "loss": 0.0966, + "step": 2882 + }, + { + "epoch": 0.24, + "grad_norm": 0.4004832468044629, + "learning_rate": 8.858144616730038e-06, + "loss": 0.0995, + "step": 2883 + }, + { + "epoch": 0.24, + "grad_norm": 0.29411404420807263, + "learning_rate": 8.857276484986946e-06, + "loss": 0.1008, + "step": 2884 + }, + { + "epoch": 0.24, + "grad_norm": 0.6280329599597151, + "learning_rate": 8.856408065931429e-06, + "loss": 0.1251, + "step": 2885 + }, + { + "epoch": 0.24, + "grad_norm": 0.25545803823799185, + "learning_rate": 8.855539359628177e-06, + "loss": 0.0622, + "step": 2886 + }, + { + "epoch": 0.24, + "grad_norm": 0.3207803412588621, + "learning_rate": 8.85467036614189e-06, + "loss": 0.0759, + "step": 2887 + }, + { + "epoch": 0.24, + "grad_norm": 0.3119671468797237, + "learning_rate": 8.853801085537303e-06, + "loss": 0.0648, + "step": 2888 + }, + { + "epoch": 0.24, + "grad_norm": 0.5293291658847464, + "learning_rate": 8.852931517879157e-06, + "loss": 0.1208, + "step": 2889 + }, + { + "epoch": 0.24, + "grad_norm": 0.3874955631323083, + "learning_rate": 8.85206166323223e-06, + "loss": 0.11, + "step": 2890 + }, + { + "epoch": 0.24, + "grad_norm": 0.2395488291426755, + "learning_rate": 8.851191521661308e-06, + "loss": 0.0811, + "step": 2891 + }, + { + "epoch": 0.24, + "grad_norm": 0.4209398389180246, + "learning_rate": 8.850321093231209e-06, + "loss": 0.0567, + "step": 2892 + }, + { + "epoch": 0.24, + "grad_norm": 0.2421634663167766, + "learning_rate": 8.849450378006763e-06, + "loss": 0.0745, + "step": 2893 + }, + { + "epoch": 0.24, + "grad_norm": 0.2208218228211407, + "learning_rate": 8.848579376052828e-06, + "loss": 0.0574, + "step": 2894 + }, + { + "epoch": 0.24, + "grad_norm": 0.5694713614156356, + "learning_rate": 8.847708087434282e-06, + "loss": 0.095, + "step": 2895 + }, + { + "epoch": 0.24, + "grad_norm": 0.3097219552399733, + "learning_rate": 8.846836512216022e-06, + "loss": 0.0514, + "step": 2896 + }, + { + "epoch": 0.24, + "grad_norm": 0.21961186232410493, + "learning_rate": 8.845964650462971e-06, + "loss": 0.0611, + "step": 2897 + }, + { + "epoch": 0.24, + "grad_norm": 0.4584144280189024, + "learning_rate": 8.845092502240067e-06, + "loss": 0.1088, + "step": 2898 + }, + { + "epoch": 0.24, + "grad_norm": 0.4130477547080702, + "learning_rate": 8.844220067612276e-06, + "loss": 0.1015, + "step": 2899 + }, + { + "epoch": 0.24, + "grad_norm": 0.2802338282277875, + "learning_rate": 8.843347346644579e-06, + "loss": 0.0721, + "step": 2900 + }, + { + "epoch": 0.24, + "grad_norm": 0.3561563520956477, + "learning_rate": 8.842474339401983e-06, + "loss": 0.0906, + "step": 2901 + }, + { + "epoch": 0.24, + "grad_norm": 0.2778591764946319, + "learning_rate": 8.841601045949513e-06, + "loss": 0.0749, + "step": 2902 + }, + { + "epoch": 0.24, + "grad_norm": 0.5255139502632714, + "learning_rate": 8.84072746635222e-06, + "loss": 0.1101, + "step": 2903 + }, + { + "epoch": 0.24, + "grad_norm": 0.5799560394608094, + "learning_rate": 8.83985360067517e-06, + "loss": 0.1276, + "step": 2904 + }, + { + "epoch": 0.24, + "grad_norm": 0.2367330495962254, + "learning_rate": 8.838979448983453e-06, + "loss": 0.0405, + "step": 2905 + }, + { + "epoch": 0.24, + "grad_norm": 0.5101877524234021, + "learning_rate": 8.838105011342186e-06, + "loss": 0.1049, + "step": 2906 + }, + { + "epoch": 0.24, + "grad_norm": 0.24916647818240295, + "learning_rate": 8.837230287816498e-06, + "loss": 0.0685, + "step": 2907 + }, + { + "epoch": 0.25, + "grad_norm": 0.24931999935395047, + "learning_rate": 8.836355278471545e-06, + "loss": 0.0685, + "step": 2908 + }, + { + "epoch": 0.25, + "grad_norm": 0.3354044661605128, + "learning_rate": 8.835479983372502e-06, + "loss": 0.0881, + "step": 2909 + }, + { + "epoch": 0.25, + "grad_norm": 0.3390664443223106, + "learning_rate": 8.834604402584566e-06, + "loss": 0.0814, + "step": 2910 + }, + { + "epoch": 0.25, + "grad_norm": 0.3003475997487348, + "learning_rate": 8.833728536172958e-06, + "loss": 0.0739, + "step": 2911 + }, + { + "epoch": 0.25, + "grad_norm": 0.3181003839413497, + "learning_rate": 8.832852384202915e-06, + "loss": 0.0978, + "step": 2912 + }, + { + "epoch": 0.25, + "grad_norm": 0.19012958822881082, + "learning_rate": 8.831975946739697e-06, + "loss": 0.0466, + "step": 2913 + }, + { + "epoch": 0.25, + "grad_norm": 0.3871243806789563, + "learning_rate": 8.831099223848587e-06, + "loss": 0.0927, + "step": 2914 + }, + { + "epoch": 0.25, + "grad_norm": 0.47936710351009654, + "learning_rate": 8.83022221559489e-06, + "loss": 0.15, + "step": 2915 + }, + { + "epoch": 0.25, + "grad_norm": 0.4124137259689811, + "learning_rate": 8.82934492204393e-06, + "loss": 0.1078, + "step": 2916 + }, + { + "epoch": 0.25, + "grad_norm": 0.25830085286326315, + "learning_rate": 8.828467343261054e-06, + "loss": 0.0748, + "step": 2917 + }, + { + "epoch": 0.25, + "grad_norm": 0.33899724041855966, + "learning_rate": 8.827589479311626e-06, + "loss": 0.0551, + "step": 2918 + }, + { + "epoch": 0.25, + "grad_norm": 0.6024074170621591, + "learning_rate": 8.826711330261036e-06, + "loss": 0.1035, + "step": 2919 + }, + { + "epoch": 0.25, + "grad_norm": 0.2353928638402001, + "learning_rate": 8.825832896174696e-06, + "loss": 0.0552, + "step": 2920 + }, + { + "epoch": 0.25, + "grad_norm": 0.429466880403828, + "learning_rate": 8.824954177118032e-06, + "loss": 0.1235, + "step": 2921 + }, + { + "epoch": 0.25, + "grad_norm": 0.4304723017926431, + "learning_rate": 8.8240751731565e-06, + "loss": 0.0806, + "step": 2922 + }, + { + "epoch": 0.25, + "grad_norm": 0.4218768100065337, + "learning_rate": 8.823195884355572e-06, + "loss": 0.1126, + "step": 2923 + }, + { + "epoch": 0.25, + "grad_norm": 0.2872072748294613, + "learning_rate": 8.822316310780743e-06, + "loss": 0.0794, + "step": 2924 + }, + { + "epoch": 0.25, + "grad_norm": 0.29955523937796386, + "learning_rate": 8.82143645249753e-06, + "loss": 0.0785, + "step": 2925 + }, + { + "epoch": 0.25, + "grad_norm": 0.3581329494392368, + "learning_rate": 8.820556309571467e-06, + "loss": 0.0873, + "step": 2926 + }, + { + "epoch": 0.25, + "grad_norm": 0.4154836976986268, + "learning_rate": 8.819675882068113e-06, + "loss": 0.0443, + "step": 2927 + }, + { + "epoch": 0.25, + "grad_norm": 0.2929112583999319, + "learning_rate": 8.818795170053049e-06, + "loss": 0.0679, + "step": 2928 + }, + { + "epoch": 0.25, + "grad_norm": 0.2283100763348576, + "learning_rate": 8.817914173591873e-06, + "loss": 0.088, + "step": 2929 + }, + { + "epoch": 0.25, + "grad_norm": 0.5825168582646038, + "learning_rate": 8.81703289275021e-06, + "loss": 0.118, + "step": 2930 + }, + { + "epoch": 0.25, + "grad_norm": 0.3870929273539651, + "learning_rate": 8.816151327593702e-06, + "loss": 0.1041, + "step": 2931 + }, + { + "epoch": 0.25, + "grad_norm": 0.40418358212486455, + "learning_rate": 8.81526947818801e-06, + "loss": 0.0872, + "step": 2932 + }, + { + "epoch": 0.25, + "grad_norm": 0.43715991417852335, + "learning_rate": 8.814387344598823e-06, + "loss": 0.1258, + "step": 2933 + }, + { + "epoch": 0.25, + "grad_norm": 0.396511259850949, + "learning_rate": 8.813504926891848e-06, + "loss": 0.0837, + "step": 2934 + }, + { + "epoch": 0.25, + "grad_norm": 0.3998401387987405, + "learning_rate": 8.812622225132809e-06, + "loss": 0.1155, + "step": 2935 + }, + { + "epoch": 0.25, + "grad_norm": 0.229249685559685, + "learning_rate": 8.811739239387456e-06, + "loss": 0.0659, + "step": 2936 + }, + { + "epoch": 0.25, + "grad_norm": 0.31346605930253274, + "learning_rate": 8.81085596972156e-06, + "loss": 0.0862, + "step": 2937 + }, + { + "epoch": 0.25, + "grad_norm": 0.26630506028654455, + "learning_rate": 8.809972416200911e-06, + "loss": 0.0759, + "step": 2938 + }, + { + "epoch": 0.25, + "grad_norm": 0.47053928460776173, + "learning_rate": 8.809088578891323e-06, + "loss": 0.1145, + "step": 2939 + }, + { + "epoch": 0.25, + "grad_norm": 0.3856814689919608, + "learning_rate": 8.808204457858627e-06, + "loss": 0.1078, + "step": 2940 + }, + { + "epoch": 0.25, + "grad_norm": 0.4581838423026915, + "learning_rate": 8.807320053168677e-06, + "loss": 0.0998, + "step": 2941 + }, + { + "epoch": 0.25, + "grad_norm": 0.33724341757192344, + "learning_rate": 8.806435364887353e-06, + "loss": 0.0791, + "step": 2942 + }, + { + "epoch": 0.25, + "grad_norm": 0.24557272507719297, + "learning_rate": 8.80555039308055e-06, + "loss": 0.0522, + "step": 2943 + }, + { + "epoch": 0.25, + "grad_norm": 0.3706194307175531, + "learning_rate": 8.80466513781418e-06, + "loss": 0.1475, + "step": 2944 + }, + { + "epoch": 0.25, + "grad_norm": 0.23770944744467992, + "learning_rate": 8.80377959915419e-06, + "loss": 0.0312, + "step": 2945 + }, + { + "epoch": 0.25, + "grad_norm": 0.4996902536641272, + "learning_rate": 8.802893777166535e-06, + "loss": 0.1042, + "step": 2946 + }, + { + "epoch": 0.25, + "grad_norm": 0.3937901437884222, + "learning_rate": 8.8020076719172e-06, + "loss": 0.1113, + "step": 2947 + }, + { + "epoch": 0.25, + "grad_norm": 0.4067606665286067, + "learning_rate": 8.801121283472185e-06, + "loss": 0.0884, + "step": 2948 + }, + { + "epoch": 0.25, + "grad_norm": 0.2721738940222727, + "learning_rate": 8.800234611897513e-06, + "loss": 0.0916, + "step": 2949 + }, + { + "epoch": 0.25, + "grad_norm": 0.41975841466611485, + "learning_rate": 8.799347657259228e-06, + "loss": 0.0866, + "step": 2950 + }, + { + "epoch": 0.25, + "grad_norm": 0.3128445145052996, + "learning_rate": 8.798460419623398e-06, + "loss": 0.0761, + "step": 2951 + }, + { + "epoch": 0.25, + "grad_norm": 0.20865477733894178, + "learning_rate": 8.797572899056109e-06, + "loss": 0.0371, + "step": 2952 + }, + { + "epoch": 0.25, + "grad_norm": 0.35299886958277493, + "learning_rate": 8.796685095623466e-06, + "loss": 0.0769, + "step": 2953 + }, + { + "epoch": 0.25, + "grad_norm": 0.36173531563296485, + "learning_rate": 8.7957970093916e-06, + "loss": 0.0921, + "step": 2954 + }, + { + "epoch": 0.25, + "grad_norm": 0.4767283692558023, + "learning_rate": 8.794908640426663e-06, + "loss": 0.1292, + "step": 2955 + }, + { + "epoch": 0.25, + "grad_norm": 0.2861666948176078, + "learning_rate": 8.79401998879482e-06, + "loss": 0.0783, + "step": 2956 + }, + { + "epoch": 0.25, + "grad_norm": 0.26759349085206163, + "learning_rate": 8.79313105456227e-06, + "loss": 0.0901, + "step": 2957 + }, + { + "epoch": 0.25, + "grad_norm": 0.32168013246623794, + "learning_rate": 8.792241837795219e-06, + "loss": 0.0719, + "step": 2958 + }, + { + "epoch": 0.25, + "grad_norm": 0.3334721712264962, + "learning_rate": 8.791352338559906e-06, + "loss": 0.0653, + "step": 2959 + }, + { + "epoch": 0.25, + "grad_norm": 0.3429156405548126, + "learning_rate": 8.790462556922585e-06, + "loss": 0.0619, + "step": 2960 + }, + { + "epoch": 0.25, + "grad_norm": 0.4742728942252845, + "learning_rate": 8.789572492949533e-06, + "loss": 0.134, + "step": 2961 + }, + { + "epoch": 0.25, + "grad_norm": 0.3652179769118772, + "learning_rate": 8.788682146707043e-06, + "loss": 0.1204, + "step": 2962 + }, + { + "epoch": 0.25, + "grad_norm": 0.3381616096317784, + "learning_rate": 8.787791518261437e-06, + "loss": 0.0656, + "step": 2963 + }, + { + "epoch": 0.25, + "grad_norm": 0.3457025712510556, + "learning_rate": 8.786900607679053e-06, + "loss": 0.0548, + "step": 2964 + }, + { + "epoch": 0.25, + "grad_norm": 0.4812280973689279, + "learning_rate": 8.786009415026254e-06, + "loss": 0.1399, + "step": 2965 + }, + { + "epoch": 0.25, + "grad_norm": 0.7052403817332502, + "learning_rate": 8.785117940369415e-06, + "loss": 0.1001, + "step": 2966 + }, + { + "epoch": 0.25, + "grad_norm": 0.2287226237926877, + "learning_rate": 8.784226183774944e-06, + "loss": 0.0715, + "step": 2967 + }, + { + "epoch": 0.25, + "grad_norm": 0.7434693176748717, + "learning_rate": 8.783334145309262e-06, + "loss": 0.0882, + "step": 2968 + }, + { + "epoch": 0.25, + "grad_norm": 0.45715312805054975, + "learning_rate": 8.782441825038811e-06, + "loss": 0.0771, + "step": 2969 + }, + { + "epoch": 0.25, + "grad_norm": 0.304969482041532, + "learning_rate": 8.781549223030061e-06, + "loss": 0.0653, + "step": 2970 + }, + { + "epoch": 0.25, + "grad_norm": 0.34863359697087204, + "learning_rate": 8.780656339349495e-06, + "loss": 0.0753, + "step": 2971 + }, + { + "epoch": 0.25, + "grad_norm": 0.24027347350793954, + "learning_rate": 8.77976317406362e-06, + "loss": 0.0493, + "step": 2972 + }, + { + "epoch": 0.25, + "grad_norm": 0.40889968863113907, + "learning_rate": 8.778869727238967e-06, + "loss": 0.1111, + "step": 2973 + }, + { + "epoch": 0.25, + "grad_norm": 0.18882107639783857, + "learning_rate": 8.77797599894208e-06, + "loss": 0.0435, + "step": 2974 + }, + { + "epoch": 0.25, + "grad_norm": 0.3838513400340036, + "learning_rate": 8.777081989239536e-06, + "loss": 0.0999, + "step": 2975 + }, + { + "epoch": 0.25, + "grad_norm": 0.45983458419741485, + "learning_rate": 8.77618769819792e-06, + "loss": 0.1147, + "step": 2976 + }, + { + "epoch": 0.25, + "grad_norm": 0.5124004979419058, + "learning_rate": 8.775293125883848e-06, + "loss": 0.0741, + "step": 2977 + }, + { + "epoch": 0.25, + "grad_norm": 0.39616417187255293, + "learning_rate": 8.77439827236395e-06, + "loss": 0.0732, + "step": 2978 + }, + { + "epoch": 0.25, + "grad_norm": 0.4479651846763837, + "learning_rate": 8.773503137704882e-06, + "loss": 0.086, + "step": 2979 + }, + { + "epoch": 0.25, + "grad_norm": 0.39806397267298743, + "learning_rate": 8.772607721973317e-06, + "loss": 0.098, + "step": 2980 + }, + { + "epoch": 0.25, + "grad_norm": 0.34899815414181645, + "learning_rate": 8.771712025235953e-06, + "loss": 0.0717, + "step": 2981 + }, + { + "epoch": 0.25, + "grad_norm": 0.23439161833035924, + "learning_rate": 8.770816047559504e-06, + "loss": 0.0622, + "step": 2982 + }, + { + "epoch": 0.25, + "grad_norm": 0.36554932962078446, + "learning_rate": 8.76991978901071e-06, + "loss": 0.0747, + "step": 2983 + }, + { + "epoch": 0.25, + "grad_norm": 0.3532014102586062, + "learning_rate": 8.769023249656327e-06, + "loss": 0.0774, + "step": 2984 + }, + { + "epoch": 0.25, + "grad_norm": 0.345853014483529, + "learning_rate": 8.768126429563138e-06, + "loss": 0.0573, + "step": 2985 + }, + { + "epoch": 0.25, + "grad_norm": 0.35310386537462773, + "learning_rate": 8.767229328797943e-06, + "loss": 0.0952, + "step": 2986 + }, + { + "epoch": 0.25, + "grad_norm": 0.37319853034545414, + "learning_rate": 8.76633194742756e-06, + "loss": 0.0684, + "step": 2987 + }, + { + "epoch": 0.25, + "grad_norm": 0.23295897174057056, + "learning_rate": 8.765434285518832e-06, + "loss": 0.0518, + "step": 2988 + }, + { + "epoch": 0.25, + "grad_norm": 0.23156680819498554, + "learning_rate": 8.764536343138626e-06, + "loss": 0.0718, + "step": 2989 + }, + { + "epoch": 0.25, + "grad_norm": 0.3775996706689483, + "learning_rate": 8.763638120353821e-06, + "loss": 0.101, + "step": 2990 + }, + { + "epoch": 0.25, + "grad_norm": 0.5060511053304001, + "learning_rate": 8.762739617231325e-06, + "loss": 0.122, + "step": 2991 + }, + { + "epoch": 0.25, + "grad_norm": 0.3203705557553357, + "learning_rate": 8.761840833838062e-06, + "loss": 0.0664, + "step": 2992 + }, + { + "epoch": 0.25, + "grad_norm": 0.4165276899833806, + "learning_rate": 8.760941770240979e-06, + "loss": 0.0926, + "step": 2993 + }, + { + "epoch": 0.25, + "grad_norm": 0.30619266291650543, + "learning_rate": 8.760042426507044e-06, + "loss": 0.0866, + "step": 2994 + }, + { + "epoch": 0.25, + "grad_norm": 0.44687678887352683, + "learning_rate": 8.759142802703246e-06, + "loss": 0.1012, + "step": 2995 + }, + { + "epoch": 0.25, + "grad_norm": 0.42565070428799945, + "learning_rate": 8.758242898896594e-06, + "loss": 0.0941, + "step": 2996 + }, + { + "epoch": 0.25, + "grad_norm": 0.3334117676579736, + "learning_rate": 8.757342715154115e-06, + "loss": 0.07, + "step": 2997 + }, + { + "epoch": 0.25, + "grad_norm": 0.5134372633258016, + "learning_rate": 8.756442251542864e-06, + "loss": 0.1225, + "step": 2998 + }, + { + "epoch": 0.25, + "grad_norm": 0.29412708525730175, + "learning_rate": 8.755541508129912e-06, + "loss": 0.071, + "step": 2999 + }, + { + "epoch": 0.25, + "grad_norm": 0.5219701811151635, + "learning_rate": 8.754640484982351e-06, + "loss": 0.111, + "step": 3000 + }, + { + "epoch": 0.25, + "grad_norm": 0.46805525685333077, + "learning_rate": 8.753739182167294e-06, + "loss": 0.0612, + "step": 3001 + }, + { + "epoch": 0.25, + "grad_norm": 0.29479732104974793, + "learning_rate": 8.752837599751873e-06, + "loss": 0.0618, + "step": 3002 + }, + { + "epoch": 0.25, + "grad_norm": 0.27199341588978804, + "learning_rate": 8.751935737803248e-06, + "loss": 0.0879, + "step": 3003 + }, + { + "epoch": 0.25, + "grad_norm": 0.3978160169497034, + "learning_rate": 8.751033596388593e-06, + "loss": 0.0744, + "step": 3004 + }, + { + "epoch": 0.25, + "grad_norm": 0.34080311756766973, + "learning_rate": 8.750131175575103e-06, + "loss": 0.0733, + "step": 3005 + }, + { + "epoch": 0.25, + "grad_norm": 0.3631759022776894, + "learning_rate": 8.749228475429999e-06, + "loss": 0.1012, + "step": 3006 + }, + { + "epoch": 0.25, + "grad_norm": 0.32774359259040653, + "learning_rate": 8.748325496020512e-06, + "loss": 0.1149, + "step": 3007 + }, + { + "epoch": 0.25, + "grad_norm": 0.2992196278523508, + "learning_rate": 8.747422237413912e-06, + "loss": 0.0657, + "step": 3008 + }, + { + "epoch": 0.25, + "grad_norm": 0.34297064806118344, + "learning_rate": 8.746518699677472e-06, + "loss": 0.0974, + "step": 3009 + }, + { + "epoch": 0.25, + "grad_norm": 0.1873070952707471, + "learning_rate": 8.745614882878494e-06, + "loss": 0.0338, + "step": 3010 + }, + { + "epoch": 0.25, + "grad_norm": 0.307782549440564, + "learning_rate": 8.744710787084299e-06, + "loss": 0.0665, + "step": 3011 + }, + { + "epoch": 0.25, + "grad_norm": 0.19646688663582387, + "learning_rate": 8.74380641236223e-06, + "loss": 0.0399, + "step": 3012 + }, + { + "epoch": 0.25, + "grad_norm": 0.29279733842085304, + "learning_rate": 8.742901758779651e-06, + "loss": 0.0792, + "step": 3013 + }, + { + "epoch": 0.25, + "grad_norm": 0.2263859255474588, + "learning_rate": 8.741996826403944e-06, + "loss": 0.0619, + "step": 3014 + }, + { + "epoch": 0.25, + "grad_norm": 0.20951131458836975, + "learning_rate": 8.741091615302516e-06, + "loss": 0.0537, + "step": 3015 + }, + { + "epoch": 0.25, + "grad_norm": 0.2790485558260262, + "learning_rate": 8.74018612554279e-06, + "loss": 0.0686, + "step": 3016 + }, + { + "epoch": 0.25, + "grad_norm": 0.2848789155589141, + "learning_rate": 8.739280357192213e-06, + "loss": 0.0794, + "step": 3017 + }, + { + "epoch": 0.25, + "grad_norm": 0.26033986062806086, + "learning_rate": 8.738374310318253e-06, + "loss": 0.0594, + "step": 3018 + }, + { + "epoch": 0.25, + "grad_norm": 0.4880779228685505, + "learning_rate": 8.737467984988397e-06, + "loss": 0.1166, + "step": 3019 + }, + { + "epoch": 0.25, + "grad_norm": 0.4776696157447335, + "learning_rate": 8.736561381270152e-06, + "loss": 0.1388, + "step": 3020 + }, + { + "epoch": 0.25, + "grad_norm": 0.4597053267578651, + "learning_rate": 8.735654499231049e-06, + "loss": 0.1015, + "step": 3021 + }, + { + "epoch": 0.25, + "grad_norm": 0.3739923924755732, + "learning_rate": 8.734747338938636e-06, + "loss": 0.0821, + "step": 3022 + }, + { + "epoch": 0.25, + "grad_norm": 0.19204212567712858, + "learning_rate": 8.733839900460487e-06, + "loss": 0.0588, + "step": 3023 + }, + { + "epoch": 0.25, + "grad_norm": 0.49656902512924556, + "learning_rate": 8.732932183864188e-06, + "loss": 0.0884, + "step": 3024 + }, + { + "epoch": 0.25, + "grad_norm": 0.30436965255209997, + "learning_rate": 8.732024189217355e-06, + "loss": 0.0852, + "step": 3025 + }, + { + "epoch": 0.25, + "grad_norm": 0.37335454443625116, + "learning_rate": 8.73111591658762e-06, + "loss": 0.0584, + "step": 3026 + }, + { + "epoch": 0.26, + "grad_norm": 0.3266896381540124, + "learning_rate": 8.730207366042636e-06, + "loss": 0.0936, + "step": 3027 + }, + { + "epoch": 0.26, + "grad_norm": 0.4783785021587216, + "learning_rate": 8.729298537650078e-06, + "loss": 0.1237, + "step": 3028 + }, + { + "epoch": 0.26, + "grad_norm": 0.333226837475945, + "learning_rate": 8.728389431477639e-06, + "loss": 0.0825, + "step": 3029 + }, + { + "epoch": 0.26, + "grad_norm": 0.25996530304656607, + "learning_rate": 8.727480047593037e-06, + "loss": 0.0897, + "step": 3030 + }, + { + "epoch": 0.26, + "grad_norm": 0.2920157497176272, + "learning_rate": 8.726570386064006e-06, + "loss": 0.0521, + "step": 3031 + }, + { + "epoch": 0.26, + "grad_norm": 0.42987280768225616, + "learning_rate": 8.725660446958304e-06, + "loss": 0.1024, + "step": 3032 + }, + { + "epoch": 0.26, + "grad_norm": 0.308683485152685, + "learning_rate": 8.72475023034371e-06, + "loss": 0.0818, + "step": 3033 + }, + { + "epoch": 0.26, + "grad_norm": 0.294305976354729, + "learning_rate": 8.723839736288017e-06, + "loss": 0.0682, + "step": 3034 + }, + { + "epoch": 0.26, + "grad_norm": 0.23245477279577353, + "learning_rate": 8.72292896485905e-06, + "loss": 0.0648, + "step": 3035 + }, + { + "epoch": 0.26, + "grad_norm": 0.2762312648125152, + "learning_rate": 8.722017916124646e-06, + "loss": 0.0466, + "step": 3036 + }, + { + "epoch": 0.26, + "grad_norm": 0.1544132769203554, + "learning_rate": 8.721106590152666e-06, + "loss": 0.0322, + "step": 3037 + }, + { + "epoch": 0.26, + "grad_norm": 0.7385130726875857, + "learning_rate": 8.720194987010988e-06, + "loss": 0.1177, + "step": 3038 + }, + { + "epoch": 0.26, + "grad_norm": 0.3876200212981495, + "learning_rate": 8.719283106767515e-06, + "loss": 0.0681, + "step": 3039 + }, + { + "epoch": 0.26, + "grad_norm": 0.2864410882581645, + "learning_rate": 8.718370949490172e-06, + "loss": 0.0825, + "step": 3040 + }, + { + "epoch": 0.26, + "grad_norm": 0.44777902534983166, + "learning_rate": 8.717458515246899e-06, + "loss": 0.0847, + "step": 3041 + }, + { + "epoch": 0.26, + "grad_norm": 0.4304585180889677, + "learning_rate": 8.71654580410566e-06, + "loss": 0.0967, + "step": 3042 + }, + { + "epoch": 0.26, + "grad_norm": 0.444879683370426, + "learning_rate": 8.715632816134438e-06, + "loss": 0.0779, + "step": 3043 + }, + { + "epoch": 0.26, + "grad_norm": 0.24787005757509195, + "learning_rate": 8.71471955140124e-06, + "loss": 0.0608, + "step": 3044 + }, + { + "epoch": 0.26, + "grad_norm": 0.3237969194907698, + "learning_rate": 8.71380600997409e-06, + "loss": 0.089, + "step": 3045 + }, + { + "epoch": 0.26, + "grad_norm": 0.3078945057934309, + "learning_rate": 8.712892191921032e-06, + "loss": 0.0813, + "step": 3046 + }, + { + "epoch": 0.26, + "grad_norm": 0.5526745925550122, + "learning_rate": 8.711978097310137e-06, + "loss": 0.1168, + "step": 3047 + }, + { + "epoch": 0.26, + "grad_norm": 0.26734762486589675, + "learning_rate": 8.711063726209488e-06, + "loss": 0.0695, + "step": 3048 + }, + { + "epoch": 0.26, + "grad_norm": 0.4340663189824596, + "learning_rate": 8.710149078687193e-06, + "loss": 0.109, + "step": 3049 + }, + { + "epoch": 0.26, + "grad_norm": 0.26267975452279574, + "learning_rate": 8.709234154811384e-06, + "loss": 0.06, + "step": 3050 + }, + { + "epoch": 0.26, + "grad_norm": 0.26464568140649297, + "learning_rate": 8.708318954650206e-06, + "loss": 0.0305, + "step": 3051 + }, + { + "epoch": 0.26, + "grad_norm": 0.20673229308521307, + "learning_rate": 8.70740347827183e-06, + "loss": 0.0543, + "step": 3052 + }, + { + "epoch": 0.26, + "grad_norm": 0.42673339340516275, + "learning_rate": 8.706487725744444e-06, + "loss": 0.1238, + "step": 3053 + }, + { + "epoch": 0.26, + "grad_norm": 0.3143014509646133, + "learning_rate": 8.705571697136263e-06, + "loss": 0.0968, + "step": 3054 + }, + { + "epoch": 0.26, + "grad_norm": 0.30545729716000103, + "learning_rate": 8.704655392515513e-06, + "loss": 0.0945, + "step": 3055 + }, + { + "epoch": 0.26, + "grad_norm": 0.2560163250501721, + "learning_rate": 8.70373881195045e-06, + "loss": 0.0463, + "step": 3056 + }, + { + "epoch": 0.26, + "grad_norm": 0.2537603282629021, + "learning_rate": 8.702821955509345e-06, + "loss": 0.0891, + "step": 3057 + }, + { + "epoch": 0.26, + "grad_norm": 0.39570207196513024, + "learning_rate": 8.701904823260488e-06, + "loss": 0.083, + "step": 3058 + }, + { + "epoch": 0.26, + "grad_norm": 0.6619417778811499, + "learning_rate": 8.700987415272195e-06, + "loss": 0.0507, + "step": 3059 + }, + { + "epoch": 0.26, + "grad_norm": 0.4229151948048635, + "learning_rate": 8.700069731612799e-06, + "loss": 0.0992, + "step": 3060 + }, + { + "epoch": 0.26, + "grad_norm": 0.31991536612454646, + "learning_rate": 8.699151772350656e-06, + "loss": 0.0978, + "step": 3061 + }, + { + "epoch": 0.26, + "grad_norm": 0.24058634715423152, + "learning_rate": 8.69823353755414e-06, + "loss": 0.0626, + "step": 3062 + }, + { + "epoch": 0.26, + "grad_norm": 0.40669894172985, + "learning_rate": 8.697315027291645e-06, + "loss": 0.1698, + "step": 3063 + }, + { + "epoch": 0.26, + "grad_norm": 0.3774105248406947, + "learning_rate": 8.69639624163159e-06, + "loss": 0.1113, + "step": 3064 + }, + { + "epoch": 0.26, + "grad_norm": 0.3624513429718424, + "learning_rate": 8.695477180642409e-06, + "loss": 0.1022, + "step": 3065 + }, + { + "epoch": 0.26, + "grad_norm": 0.37015760486948013, + "learning_rate": 8.69455784439256e-06, + "loss": 0.0826, + "step": 3066 + }, + { + "epoch": 0.26, + "grad_norm": 0.2556386154762862, + "learning_rate": 8.693638232950522e-06, + "loss": 0.0761, + "step": 3067 + }, + { + "epoch": 0.26, + "grad_norm": 0.22507158405401548, + "learning_rate": 8.69271834638479e-06, + "loss": 0.0728, + "step": 3068 + }, + { + "epoch": 0.26, + "grad_norm": 0.37669699910002513, + "learning_rate": 8.691798184763883e-06, + "loss": 0.1068, + "step": 3069 + }, + { + "epoch": 0.26, + "grad_norm": 0.4168428571113629, + "learning_rate": 8.69087774815634e-06, + "loss": 0.1006, + "step": 3070 + }, + { + "epoch": 0.26, + "grad_norm": 0.3939861919101858, + "learning_rate": 8.689957036630723e-06, + "loss": 0.0905, + "step": 3071 + }, + { + "epoch": 0.26, + "grad_norm": 0.6139665360505826, + "learning_rate": 8.689036050255611e-06, + "loss": 0.1464, + "step": 3072 + }, + { + "epoch": 0.26, + "grad_norm": 0.3155217628362793, + "learning_rate": 8.688114789099603e-06, + "loss": 0.0982, + "step": 3073 + }, + { + "epoch": 0.26, + "grad_norm": 0.6456320150900486, + "learning_rate": 8.687193253231321e-06, + "loss": 0.1239, + "step": 3074 + }, + { + "epoch": 0.26, + "grad_norm": 0.4089570285304536, + "learning_rate": 8.686271442719404e-06, + "loss": 0.1085, + "step": 3075 + }, + { + "epoch": 0.26, + "grad_norm": 0.31931112691738567, + "learning_rate": 8.685349357632518e-06, + "loss": 0.0804, + "step": 3076 + }, + { + "epoch": 0.26, + "grad_norm": 0.5620520747543952, + "learning_rate": 8.684426998039342e-06, + "loss": 0.0923, + "step": 3077 + }, + { + "epoch": 0.26, + "grad_norm": 0.2980340551852319, + "learning_rate": 8.683504364008582e-06, + "loss": 0.0499, + "step": 3078 + }, + { + "epoch": 0.26, + "grad_norm": 0.5172604670301609, + "learning_rate": 8.682581455608956e-06, + "loss": 0.1117, + "step": 3079 + }, + { + "epoch": 0.26, + "grad_norm": 0.44942208783293075, + "learning_rate": 8.681658272909213e-06, + "loss": 0.1105, + "step": 3080 + }, + { + "epoch": 0.26, + "grad_norm": 0.645272144801532, + "learning_rate": 8.680734815978112e-06, + "loss": 0.0833, + "step": 3081 + }, + { + "epoch": 0.26, + "grad_norm": 0.3886851201409186, + "learning_rate": 8.679811084884442e-06, + "loss": 0.0907, + "step": 3082 + }, + { + "epoch": 0.26, + "grad_norm": 0.2926562268833976, + "learning_rate": 8.678887079697005e-06, + "loss": 0.1033, + "step": 3083 + }, + { + "epoch": 0.26, + "grad_norm": 0.2592224304525484, + "learning_rate": 8.677962800484628e-06, + "loss": 0.0809, + "step": 3084 + }, + { + "epoch": 0.26, + "grad_norm": 0.5386606562840128, + "learning_rate": 8.677038247316156e-06, + "loss": 0.0675, + "step": 3085 + }, + { + "epoch": 0.26, + "grad_norm": 0.29711257637923505, + "learning_rate": 8.676113420260454e-06, + "loss": 0.0643, + "step": 3086 + }, + { + "epoch": 0.26, + "grad_norm": 0.23687841914215468, + "learning_rate": 8.675188319386409e-06, + "loss": 0.0692, + "step": 3087 + }, + { + "epoch": 0.26, + "grad_norm": 0.40167497144295966, + "learning_rate": 8.67426294476293e-06, + "loss": 0.1198, + "step": 3088 + }, + { + "epoch": 0.26, + "grad_norm": 0.2667791449724475, + "learning_rate": 8.673337296458943e-06, + "loss": 0.0593, + "step": 3089 + }, + { + "epoch": 0.26, + "grad_norm": 0.30843353990812183, + "learning_rate": 8.672411374543395e-06, + "loss": 0.0729, + "step": 3090 + }, + { + "epoch": 0.26, + "grad_norm": 0.3674815263894456, + "learning_rate": 8.671485179085253e-06, + "loss": 0.0783, + "step": 3091 + }, + { + "epoch": 0.26, + "grad_norm": 0.37472969635240316, + "learning_rate": 8.670558710153507e-06, + "loss": 0.1174, + "step": 3092 + }, + { + "epoch": 0.26, + "grad_norm": 0.3503859966142475, + "learning_rate": 8.669631967817168e-06, + "loss": 0.0966, + "step": 3093 + }, + { + "epoch": 0.26, + "grad_norm": 0.37443217149266933, + "learning_rate": 8.66870495214526e-06, + "loss": 0.1056, + "step": 3094 + }, + { + "epoch": 0.26, + "grad_norm": 0.4196137241317033, + "learning_rate": 8.667777663206837e-06, + "loss": 0.1063, + "step": 3095 + }, + { + "epoch": 0.26, + "grad_norm": 0.35339454112322427, + "learning_rate": 8.666850101070968e-06, + "loss": 0.0552, + "step": 3096 + }, + { + "epoch": 0.26, + "grad_norm": 0.17312533850363393, + "learning_rate": 8.665922265806741e-06, + "loss": 0.0534, + "step": 3097 + }, + { + "epoch": 0.26, + "grad_norm": 0.25386378969272116, + "learning_rate": 8.664994157483269e-06, + "loss": 0.0542, + "step": 3098 + }, + { + "epoch": 0.26, + "grad_norm": 0.6522153747464053, + "learning_rate": 8.664065776169681e-06, + "loss": 0.1661, + "step": 3099 + }, + { + "epoch": 0.26, + "grad_norm": 0.33530007524378136, + "learning_rate": 8.66313712193513e-06, + "loss": 0.0748, + "step": 3100 + }, + { + "epoch": 0.26, + "grad_norm": 0.530169144597035, + "learning_rate": 8.662208194848786e-06, + "loss": 0.1216, + "step": 3101 + }, + { + "epoch": 0.26, + "grad_norm": 0.5103953536176062, + "learning_rate": 8.661278994979843e-06, + "loss": 0.1227, + "step": 3102 + }, + { + "epoch": 0.26, + "grad_norm": 0.3606476525458075, + "learning_rate": 8.66034952239751e-06, + "loss": 0.0826, + "step": 3103 + }, + { + "epoch": 0.26, + "grad_norm": 0.2785053445972938, + "learning_rate": 8.659419777171022e-06, + "loss": 0.076, + "step": 3104 + }, + { + "epoch": 0.26, + "grad_norm": 0.2687087666482121, + "learning_rate": 8.658489759369633e-06, + "loss": 0.0799, + "step": 3105 + }, + { + "epoch": 0.26, + "grad_norm": 0.3410263948835231, + "learning_rate": 8.657559469062614e-06, + "loss": 0.0706, + "step": 3106 + }, + { + "epoch": 0.26, + "grad_norm": 0.3079909184001713, + "learning_rate": 8.656628906319258e-06, + "loss": 0.0687, + "step": 3107 + }, + { + "epoch": 0.26, + "grad_norm": 0.35143030901980515, + "learning_rate": 8.65569807120888e-06, + "loss": 0.103, + "step": 3108 + }, + { + "epoch": 0.26, + "grad_norm": 0.22985844520561322, + "learning_rate": 8.654766963800814e-06, + "loss": 0.0707, + "step": 3109 + }, + { + "epoch": 0.26, + "grad_norm": 0.2158617418890222, + "learning_rate": 8.653835584164415e-06, + "loss": 0.0395, + "step": 3110 + }, + { + "epoch": 0.26, + "grad_norm": 0.289186626036599, + "learning_rate": 8.652903932369053e-06, + "loss": 0.0809, + "step": 3111 + }, + { + "epoch": 0.26, + "grad_norm": 0.25289569586848415, + "learning_rate": 8.651972008484129e-06, + "loss": 0.0998, + "step": 3112 + }, + { + "epoch": 0.26, + "grad_norm": 0.5648942626602366, + "learning_rate": 8.651039812579054e-06, + "loss": 0.1131, + "step": 3113 + }, + { + "epoch": 0.26, + "grad_norm": 0.3601583992794044, + "learning_rate": 8.650107344723266e-06, + "loss": 0.1083, + "step": 3114 + }, + { + "epoch": 0.26, + "grad_norm": 0.3309370885185487, + "learning_rate": 8.649174604986219e-06, + "loss": 0.0799, + "step": 3115 + }, + { + "epoch": 0.26, + "grad_norm": 0.32542710895638866, + "learning_rate": 8.64824159343739e-06, + "loss": 0.0946, + "step": 3116 + }, + { + "epoch": 0.26, + "grad_norm": 0.3459152992662643, + "learning_rate": 8.647308310146273e-06, + "loss": 0.0936, + "step": 3117 + }, + { + "epoch": 0.26, + "grad_norm": 0.5691984017877604, + "learning_rate": 8.646374755182388e-06, + "loss": 0.1248, + "step": 3118 + }, + { + "epoch": 0.26, + "grad_norm": 0.4569362608514308, + "learning_rate": 8.645440928615269e-06, + "loss": 0.1279, + "step": 3119 + }, + { + "epoch": 0.26, + "grad_norm": 0.4338039577266958, + "learning_rate": 8.644506830514475e-06, + "loss": 0.1007, + "step": 3120 + }, + { + "epoch": 0.26, + "grad_norm": 0.3044495409370965, + "learning_rate": 8.643572460949578e-06, + "loss": 0.0947, + "step": 3121 + }, + { + "epoch": 0.26, + "grad_norm": 0.29216930135792835, + "learning_rate": 8.64263781999018e-06, + "loss": 0.0732, + "step": 3122 + }, + { + "epoch": 0.26, + "grad_norm": 0.5215553813165708, + "learning_rate": 8.641702907705898e-06, + "loss": 0.1351, + "step": 3123 + }, + { + "epoch": 0.26, + "grad_norm": 0.4176139003627783, + "learning_rate": 8.640767724166368e-06, + "loss": 0.1094, + "step": 3124 + }, + { + "epoch": 0.26, + "grad_norm": 0.3078718509833029, + "learning_rate": 8.63983226944125e-06, + "loss": 0.0566, + "step": 3125 + }, + { + "epoch": 0.26, + "grad_norm": 0.38293845412458544, + "learning_rate": 8.63889654360022e-06, + "loss": 0.0794, + "step": 3126 + }, + { + "epoch": 0.26, + "grad_norm": 0.49416550841508555, + "learning_rate": 8.637960546712977e-06, + "loss": 0.1037, + "step": 3127 + }, + { + "epoch": 0.26, + "grad_norm": 0.5180489303394368, + "learning_rate": 8.637024278849241e-06, + "loss": 0.1162, + "step": 3128 + }, + { + "epoch": 0.26, + "grad_norm": 0.30283153033511245, + "learning_rate": 8.636087740078749e-06, + "loss": 0.0492, + "step": 3129 + }, + { + "epoch": 0.26, + "grad_norm": 0.24541939070108781, + "learning_rate": 8.63515093047126e-06, + "loss": 0.0475, + "step": 3130 + }, + { + "epoch": 0.26, + "grad_norm": 0.3284929833474759, + "learning_rate": 8.634213850096554e-06, + "loss": 0.1035, + "step": 3131 + }, + { + "epoch": 0.26, + "grad_norm": 0.2690705848976222, + "learning_rate": 8.63327649902443e-06, + "loss": 0.0765, + "step": 3132 + }, + { + "epoch": 0.26, + "grad_norm": 0.36139522796423673, + "learning_rate": 8.632338877324705e-06, + "loss": 0.1057, + "step": 3133 + }, + { + "epoch": 0.26, + "grad_norm": 0.354741597268939, + "learning_rate": 8.63140098506722e-06, + "loss": 0.1171, + "step": 3134 + }, + { + "epoch": 0.26, + "grad_norm": 0.3939376050659103, + "learning_rate": 8.630462822321838e-06, + "loss": 0.0914, + "step": 3135 + }, + { + "epoch": 0.26, + "grad_norm": 0.37242398320385267, + "learning_rate": 8.629524389158435e-06, + "loss": 0.1162, + "step": 3136 + }, + { + "epoch": 0.26, + "grad_norm": 0.6210828343589856, + "learning_rate": 8.62858568564691e-06, + "loss": 0.1042, + "step": 3137 + }, + { + "epoch": 0.26, + "grad_norm": 0.44341954276916, + "learning_rate": 8.627646711857188e-06, + "loss": 0.1074, + "step": 3138 + }, + { + "epoch": 0.26, + "grad_norm": 0.40703856623009427, + "learning_rate": 8.626707467859205e-06, + "loss": 0.0809, + "step": 3139 + }, + { + "epoch": 0.26, + "grad_norm": 0.5131052633462835, + "learning_rate": 8.625767953722923e-06, + "loss": 0.1299, + "step": 3140 + }, + { + "epoch": 0.26, + "grad_norm": 0.40440762471270764, + "learning_rate": 8.624828169518322e-06, + "loss": 0.0673, + "step": 3141 + }, + { + "epoch": 0.26, + "grad_norm": 0.4026228171449898, + "learning_rate": 8.623888115315402e-06, + "loss": 0.1242, + "step": 3142 + }, + { + "epoch": 0.26, + "grad_norm": 1.0405054218533596, + "learning_rate": 8.622947791184186e-06, + "loss": 0.1539, + "step": 3143 + }, + { + "epoch": 0.26, + "grad_norm": 0.45489837857045895, + "learning_rate": 8.622007197194712e-06, + "loss": 0.1188, + "step": 3144 + }, + { + "epoch": 0.26, + "grad_norm": 0.458041621658602, + "learning_rate": 8.621066333417044e-06, + "loss": 0.0983, + "step": 3145 + }, + { + "epoch": 0.27, + "grad_norm": 0.5868371905219253, + "learning_rate": 8.620125199921259e-06, + "loss": 0.0838, + "step": 3146 + }, + { + "epoch": 0.27, + "grad_norm": 0.4996584318164899, + "learning_rate": 8.61918379677746e-06, + "loss": 0.1522, + "step": 3147 + }, + { + "epoch": 0.27, + "grad_norm": 0.6631142003833113, + "learning_rate": 8.61824212405577e-06, + "loss": 0.1108, + "step": 3148 + }, + { + "epoch": 0.27, + "grad_norm": 0.4410454527504012, + "learning_rate": 8.617300181826328e-06, + "loss": 0.1194, + "step": 3149 + }, + { + "epoch": 0.27, + "grad_norm": 0.27908004179805007, + "learning_rate": 8.616357970159296e-06, + "loss": 0.07, + "step": 3150 + }, + { + "epoch": 0.27, + "grad_norm": 0.34609408537989883, + "learning_rate": 8.615415489124857e-06, + "loss": 0.0669, + "step": 3151 + }, + { + "epoch": 0.27, + "grad_norm": 0.419953704372675, + "learning_rate": 8.61447273879321e-06, + "loss": 0.1108, + "step": 3152 + }, + { + "epoch": 0.27, + "grad_norm": 0.40446890730651647, + "learning_rate": 8.613529719234577e-06, + "loss": 0.1217, + "step": 3153 + }, + { + "epoch": 0.27, + "grad_norm": 0.28617730337793196, + "learning_rate": 8.612586430519201e-06, + "loss": 0.092, + "step": 3154 + }, + { + "epoch": 0.27, + "grad_norm": 0.28130652207297047, + "learning_rate": 8.611642872717343e-06, + "loss": 0.0772, + "step": 3155 + }, + { + "epoch": 0.27, + "grad_norm": 0.5650392784552916, + "learning_rate": 8.610699045899283e-06, + "loss": 0.1281, + "step": 3156 + }, + { + "epoch": 0.27, + "grad_norm": 0.17599631933648477, + "learning_rate": 8.609754950135325e-06, + "loss": 0.0444, + "step": 3157 + }, + { + "epoch": 0.27, + "grad_norm": 0.33425221227894586, + "learning_rate": 8.60881058549579e-06, + "loss": 0.0709, + "step": 3158 + }, + { + "epoch": 0.27, + "grad_norm": 0.18045066763482817, + "learning_rate": 8.607865952051021e-06, + "loss": 0.0299, + "step": 3159 + }, + { + "epoch": 0.27, + "grad_norm": 0.42381083368727096, + "learning_rate": 8.606921049871377e-06, + "loss": 0.0909, + "step": 3160 + }, + { + "epoch": 0.27, + "grad_norm": 0.4385859831444366, + "learning_rate": 8.605975879027242e-06, + "loss": 0.0952, + "step": 3161 + }, + { + "epoch": 0.27, + "grad_norm": 0.3716520722317863, + "learning_rate": 8.605030439589017e-06, + "loss": 0.1081, + "step": 3162 + }, + { + "epoch": 0.27, + "grad_norm": 0.2660368474783016, + "learning_rate": 8.604084731627123e-06, + "loss": 0.053, + "step": 3163 + }, + { + "epoch": 0.27, + "grad_norm": 0.32090225166767555, + "learning_rate": 8.603138755212004e-06, + "loss": 0.0668, + "step": 3164 + }, + { + "epoch": 0.27, + "grad_norm": 0.4790531991050869, + "learning_rate": 8.602192510414121e-06, + "loss": 0.0858, + "step": 3165 + }, + { + "epoch": 0.27, + "grad_norm": 0.2840525100702858, + "learning_rate": 8.601245997303954e-06, + "loss": 0.0673, + "step": 3166 + }, + { + "epoch": 0.27, + "grad_norm": 0.6141098224050812, + "learning_rate": 8.600299215952009e-06, + "loss": 0.1266, + "step": 3167 + }, + { + "epoch": 0.27, + "grad_norm": 0.2597524150143281, + "learning_rate": 8.5993521664288e-06, + "loss": 0.0653, + "step": 3168 + }, + { + "epoch": 0.27, + "grad_norm": 0.2904008801759814, + "learning_rate": 8.598404848804879e-06, + "loss": 0.0612, + "step": 3169 + }, + { + "epoch": 0.27, + "grad_norm": 0.21435316282959738, + "learning_rate": 8.597457263150801e-06, + "loss": 0.0703, + "step": 3170 + }, + { + "epoch": 0.27, + "grad_norm": 0.43803103169438823, + "learning_rate": 8.596509409537148e-06, + "loss": 0.103, + "step": 3171 + }, + { + "epoch": 0.27, + "grad_norm": 0.40892091518874096, + "learning_rate": 8.595561288034524e-06, + "loss": 0.0771, + "step": 3172 + }, + { + "epoch": 0.27, + "grad_norm": 0.3598338325366773, + "learning_rate": 8.59461289871355e-06, + "loss": 0.076, + "step": 3173 + }, + { + "epoch": 0.27, + "grad_norm": 0.281174090911734, + "learning_rate": 8.593664241644868e-06, + "loss": 0.0543, + "step": 3174 + }, + { + "epoch": 0.27, + "grad_norm": 0.8070277444030411, + "learning_rate": 8.592715316899136e-06, + "loss": 0.0899, + "step": 3175 + }, + { + "epoch": 0.27, + "grad_norm": 0.4905251976539588, + "learning_rate": 8.59176612454704e-06, + "loss": 0.0813, + "step": 3176 + }, + { + "epoch": 0.27, + "grad_norm": 0.27669804600285297, + "learning_rate": 8.590816664659279e-06, + "loss": 0.097, + "step": 3177 + }, + { + "epoch": 0.27, + "grad_norm": 0.22095066131920046, + "learning_rate": 8.589866937306574e-06, + "loss": 0.0707, + "step": 3178 + }, + { + "epoch": 0.27, + "grad_norm": 0.3147108398502547, + "learning_rate": 8.588916942559668e-06, + "loss": 0.0778, + "step": 3179 + }, + { + "epoch": 0.27, + "grad_norm": 0.46495837026485753, + "learning_rate": 8.58796668048932e-06, + "loss": 0.0704, + "step": 3180 + }, + { + "epoch": 0.27, + "grad_norm": 0.47869112909600686, + "learning_rate": 8.587016151166314e-06, + "loss": 0.0922, + "step": 3181 + }, + { + "epoch": 0.27, + "grad_norm": 0.39161912982290736, + "learning_rate": 8.586065354661448e-06, + "loss": 0.1078, + "step": 3182 + }, + { + "epoch": 0.27, + "grad_norm": 0.2928832466448882, + "learning_rate": 8.585114291045544e-06, + "loss": 0.0767, + "step": 3183 + }, + { + "epoch": 0.27, + "grad_norm": 0.4152114814618614, + "learning_rate": 8.584162960389443e-06, + "loss": 0.103, + "step": 3184 + }, + { + "epoch": 0.27, + "grad_norm": 0.34526393663227817, + "learning_rate": 8.583211362764006e-06, + "loss": 0.0696, + "step": 3185 + }, + { + "epoch": 0.27, + "grad_norm": 0.4783039155017807, + "learning_rate": 8.582259498240111e-06, + "loss": 0.1255, + "step": 3186 + }, + { + "epoch": 0.27, + "grad_norm": 0.2983994278844034, + "learning_rate": 8.581307366888663e-06, + "loss": 0.0953, + "step": 3187 + }, + { + "epoch": 0.27, + "grad_norm": 0.4297595322064294, + "learning_rate": 8.580354968780578e-06, + "loss": 0.087, + "step": 3188 + }, + { + "epoch": 0.27, + "grad_norm": 0.2702825337725005, + "learning_rate": 8.579402303986798e-06, + "loss": 0.064, + "step": 3189 + }, + { + "epoch": 0.27, + "grad_norm": 0.25057890993016113, + "learning_rate": 8.578449372578284e-06, + "loss": 0.0536, + "step": 3190 + }, + { + "epoch": 0.27, + "grad_norm": 0.4997154962747346, + "learning_rate": 8.577496174626013e-06, + "loss": 0.1462, + "step": 3191 + }, + { + "epoch": 0.27, + "grad_norm": 0.5484457245657279, + "learning_rate": 8.576542710200987e-06, + "loss": 0.1782, + "step": 3192 + }, + { + "epoch": 0.27, + "grad_norm": 0.7553463793023507, + "learning_rate": 8.575588979374226e-06, + "loss": 0.1222, + "step": 3193 + }, + { + "epoch": 0.27, + "grad_norm": 0.21169058914202843, + "learning_rate": 8.574634982216767e-06, + "loss": 0.0697, + "step": 3194 + }, + { + "epoch": 0.27, + "grad_norm": 0.4280841174287749, + "learning_rate": 8.573680718799671e-06, + "loss": 0.0999, + "step": 3195 + }, + { + "epoch": 0.27, + "grad_norm": 0.5388658852358397, + "learning_rate": 8.572726189194019e-06, + "loss": 0.1348, + "step": 3196 + }, + { + "epoch": 0.27, + "grad_norm": 0.3920844544821534, + "learning_rate": 8.571771393470904e-06, + "loss": 0.1133, + "step": 3197 + }, + { + "epoch": 0.27, + "grad_norm": 0.27407678053272166, + "learning_rate": 8.570816331701451e-06, + "loss": 0.0695, + "step": 3198 + }, + { + "epoch": 0.27, + "grad_norm": 0.3430960494607401, + "learning_rate": 8.569861003956795e-06, + "loss": 0.0869, + "step": 3199 + }, + { + "epoch": 0.27, + "grad_norm": 0.2686785339460621, + "learning_rate": 8.568905410308097e-06, + "loss": 0.0621, + "step": 3200 + }, + { + "epoch": 0.27, + "grad_norm": 0.26427304549817626, + "learning_rate": 8.567949550826532e-06, + "loss": 0.0731, + "step": 3201 + }, + { + "epoch": 0.27, + "grad_norm": 0.3711964973476967, + "learning_rate": 8.5669934255833e-06, + "loss": 0.0971, + "step": 3202 + }, + { + "epoch": 0.27, + "grad_norm": 0.5404813466275674, + "learning_rate": 8.56603703464962e-06, + "loss": 0.0863, + "step": 3203 + }, + { + "epoch": 0.27, + "grad_norm": 0.8233651893174969, + "learning_rate": 8.565080378096725e-06, + "loss": 0.116, + "step": 3204 + }, + { + "epoch": 0.27, + "grad_norm": 0.3238031058502858, + "learning_rate": 8.564123455995877e-06, + "loss": 0.0896, + "step": 3205 + }, + { + "epoch": 0.27, + "grad_norm": 0.376880042601365, + "learning_rate": 8.563166268418351e-06, + "loss": 0.0862, + "step": 3206 + }, + { + "epoch": 0.27, + "grad_norm": 0.2526099829136848, + "learning_rate": 8.562208815435445e-06, + "loss": 0.082, + "step": 3207 + }, + { + "epoch": 0.27, + "grad_norm": 0.36082686878299813, + "learning_rate": 8.561251097118475e-06, + "loss": 0.1148, + "step": 3208 + }, + { + "epoch": 0.27, + "grad_norm": 0.32571027721387186, + "learning_rate": 8.560293113538777e-06, + "loss": 0.0968, + "step": 3209 + }, + { + "epoch": 0.27, + "grad_norm": 0.3023408386922019, + "learning_rate": 8.559334864767707e-06, + "loss": 0.0466, + "step": 3210 + }, + { + "epoch": 0.27, + "grad_norm": 0.1878076277003008, + "learning_rate": 8.558376350876644e-06, + "loss": 0.0481, + "step": 3211 + }, + { + "epoch": 0.27, + "grad_norm": 0.3410576094691958, + "learning_rate": 8.557417571936978e-06, + "loss": 0.1131, + "step": 3212 + }, + { + "epoch": 0.27, + "grad_norm": 0.49962613976695847, + "learning_rate": 8.556458528020131e-06, + "loss": 0.1155, + "step": 3213 + }, + { + "epoch": 0.27, + "grad_norm": 0.34573822382790637, + "learning_rate": 8.555499219197534e-06, + "loss": 0.1105, + "step": 3214 + }, + { + "epoch": 0.27, + "grad_norm": 0.2873693084160559, + "learning_rate": 8.554539645540642e-06, + "loss": 0.0796, + "step": 3215 + }, + { + "epoch": 0.27, + "grad_norm": 0.2675779164142398, + "learning_rate": 8.55357980712093e-06, + "loss": 0.0512, + "step": 3216 + }, + { + "epoch": 0.27, + "grad_norm": 0.40171732986158, + "learning_rate": 8.552619704009895e-06, + "loss": 0.1018, + "step": 3217 + }, + { + "epoch": 0.27, + "grad_norm": 0.3130559161605051, + "learning_rate": 8.551659336279047e-06, + "loss": 0.0994, + "step": 3218 + }, + { + "epoch": 0.27, + "grad_norm": 0.35490154059866313, + "learning_rate": 8.550698703999922e-06, + "loss": 0.0673, + "step": 3219 + }, + { + "epoch": 0.27, + "grad_norm": 0.5070013958030166, + "learning_rate": 8.549737807244073e-06, + "loss": 0.0994, + "step": 3220 + }, + { + "epoch": 0.27, + "grad_norm": 0.3087753827818303, + "learning_rate": 8.548776646083074e-06, + "loss": 0.0595, + "step": 3221 + }, + { + "epoch": 0.27, + "grad_norm": 0.32824077294457665, + "learning_rate": 8.547815220588517e-06, + "loss": 0.0863, + "step": 3222 + }, + { + "epoch": 0.27, + "grad_norm": 0.3743889538050714, + "learning_rate": 8.546853530832015e-06, + "loss": 0.1064, + "step": 3223 + }, + { + "epoch": 0.27, + "grad_norm": 0.2801371714876428, + "learning_rate": 8.545891576885198e-06, + "loss": 0.0566, + "step": 3224 + }, + { + "epoch": 0.27, + "grad_norm": 0.5621597618454878, + "learning_rate": 8.544929358819724e-06, + "loss": 0.0845, + "step": 3225 + }, + { + "epoch": 0.27, + "grad_norm": 0.34000926095524525, + "learning_rate": 8.543966876707258e-06, + "loss": 0.0973, + "step": 3226 + }, + { + "epoch": 0.27, + "grad_norm": 0.537761908965727, + "learning_rate": 8.543004130619494e-06, + "loss": 0.1599, + "step": 3227 + }, + { + "epoch": 0.27, + "grad_norm": 0.4273946410502113, + "learning_rate": 8.542041120628143e-06, + "loss": 0.0933, + "step": 3228 + }, + { + "epoch": 0.27, + "grad_norm": 0.2548743596543969, + "learning_rate": 8.541077846804937e-06, + "loss": 0.0593, + "step": 3229 + }, + { + "epoch": 0.27, + "grad_norm": 0.5140422198652104, + "learning_rate": 8.540114309221624e-06, + "loss": 0.0918, + "step": 3230 + }, + { + "epoch": 0.27, + "grad_norm": 0.314189752415136, + "learning_rate": 8.539150507949972e-06, + "loss": 0.0877, + "step": 3231 + }, + { + "epoch": 0.27, + "grad_norm": 0.42965783813692837, + "learning_rate": 8.538186443061775e-06, + "loss": 0.12, + "step": 3232 + }, + { + "epoch": 0.27, + "grad_norm": 0.3936450967581396, + "learning_rate": 8.53722211462884e-06, + "loss": 0.0916, + "step": 3233 + }, + { + "epoch": 0.27, + "grad_norm": 0.36522016352049114, + "learning_rate": 8.536257522722997e-06, + "loss": 0.0982, + "step": 3234 + }, + { + "epoch": 0.27, + "grad_norm": 0.41862391248228376, + "learning_rate": 8.535292667416093e-06, + "loss": 0.1167, + "step": 3235 + }, + { + "epoch": 0.27, + "grad_norm": 0.22624187113222125, + "learning_rate": 8.534327548779996e-06, + "loss": 0.0608, + "step": 3236 + }, + { + "epoch": 0.27, + "grad_norm": 0.22955399230281884, + "learning_rate": 8.533362166886595e-06, + "loss": 0.0576, + "step": 3237 + }, + { + "epoch": 0.27, + "grad_norm": 0.34469200880000195, + "learning_rate": 8.532396521807795e-06, + "loss": 0.0435, + "step": 3238 + }, + { + "epoch": 0.27, + "grad_norm": 0.2884067616331789, + "learning_rate": 8.531430613615525e-06, + "loss": 0.1152, + "step": 3239 + }, + { + "epoch": 0.27, + "grad_norm": 0.732762773316448, + "learning_rate": 8.530464442381731e-06, + "loss": 0.1002, + "step": 3240 + }, + { + "epoch": 0.27, + "grad_norm": 0.2891667731297482, + "learning_rate": 8.52949800817838e-06, + "loss": 0.0744, + "step": 3241 + }, + { + "epoch": 0.27, + "grad_norm": 0.4674154182735356, + "learning_rate": 8.528531311077456e-06, + "loss": 0.113, + "step": 3242 + }, + { + "epoch": 0.27, + "grad_norm": 0.3753286632948059, + "learning_rate": 8.527564351150964e-06, + "loss": 0.076, + "step": 3243 + }, + { + "epoch": 0.27, + "grad_norm": 0.5524660177900298, + "learning_rate": 8.52659712847093e-06, + "loss": 0.0979, + "step": 3244 + }, + { + "epoch": 0.27, + "grad_norm": 0.3327205218749006, + "learning_rate": 8.525629643109397e-06, + "loss": 0.0762, + "step": 3245 + }, + { + "epoch": 0.27, + "grad_norm": 0.34950465810645814, + "learning_rate": 8.52466189513843e-06, + "loss": 0.1137, + "step": 3246 + }, + { + "epoch": 0.27, + "grad_norm": 0.4291830250885162, + "learning_rate": 8.523693884630113e-06, + "loss": 0.1211, + "step": 3247 + }, + { + "epoch": 0.27, + "grad_norm": 0.391255268613478, + "learning_rate": 8.522725611656548e-06, + "loss": 0.0879, + "step": 3248 + }, + { + "epoch": 0.27, + "grad_norm": 0.33353056761724076, + "learning_rate": 8.52175707628986e-06, + "loss": 0.0845, + "step": 3249 + }, + { + "epoch": 0.27, + "grad_norm": 0.403392058994754, + "learning_rate": 8.520788278602186e-06, + "loss": 0.0961, + "step": 3250 + }, + { + "epoch": 0.27, + "grad_norm": 0.39243587519409867, + "learning_rate": 8.519819218665692e-06, + "loss": 0.1291, + "step": 3251 + }, + { + "epoch": 0.27, + "grad_norm": 0.32935199675208676, + "learning_rate": 8.51884989655256e-06, + "loss": 0.0543, + "step": 3252 + }, + { + "epoch": 0.27, + "grad_norm": 0.40585213969785006, + "learning_rate": 8.517880312334987e-06, + "loss": 0.1092, + "step": 3253 + }, + { + "epoch": 0.27, + "grad_norm": 1.3890174081017157, + "learning_rate": 8.516910466085196e-06, + "loss": 0.1074, + "step": 3254 + }, + { + "epoch": 0.27, + "grad_norm": 0.2484434455016351, + "learning_rate": 8.515940357875425e-06, + "loss": 0.0791, + "step": 3255 + }, + { + "epoch": 0.27, + "grad_norm": 0.2658535939025943, + "learning_rate": 8.514969987777935e-06, + "loss": 0.0874, + "step": 3256 + }, + { + "epoch": 0.27, + "grad_norm": 0.43359056993230394, + "learning_rate": 8.513999355865003e-06, + "loss": 0.1033, + "step": 3257 + }, + { + "epoch": 0.27, + "grad_norm": 0.33723946502703545, + "learning_rate": 8.51302846220893e-06, + "loss": 0.0714, + "step": 3258 + }, + { + "epoch": 0.27, + "grad_norm": 0.34742262264342544, + "learning_rate": 8.512057306882032e-06, + "loss": 0.1103, + "step": 3259 + }, + { + "epoch": 0.27, + "grad_norm": 0.37170281098989383, + "learning_rate": 8.511085889956646e-06, + "loss": 0.0966, + "step": 3260 + }, + { + "epoch": 0.27, + "grad_norm": 0.22499351619756636, + "learning_rate": 8.51011421150513e-06, + "loss": 0.0859, + "step": 3261 + }, + { + "epoch": 0.27, + "grad_norm": 0.466483122628316, + "learning_rate": 8.509142271599859e-06, + "loss": 0.0821, + "step": 3262 + }, + { + "epoch": 0.27, + "grad_norm": 0.42798305368317, + "learning_rate": 8.508170070313229e-06, + "loss": 0.1029, + "step": 3263 + }, + { + "epoch": 0.28, + "grad_norm": 0.2784497769160566, + "learning_rate": 8.507197607717656e-06, + "loss": 0.0548, + "step": 3264 + }, + { + "epoch": 0.28, + "grad_norm": 0.5581748185905202, + "learning_rate": 8.506224883885574e-06, + "loss": 0.1074, + "step": 3265 + }, + { + "epoch": 0.28, + "grad_norm": 0.3189553204847817, + "learning_rate": 8.50525189888944e-06, + "loss": 0.1005, + "step": 3266 + }, + { + "epoch": 0.28, + "grad_norm": 0.28504913214545835, + "learning_rate": 8.50427865280172e-06, + "loss": 0.0582, + "step": 3267 + }, + { + "epoch": 0.28, + "grad_norm": 0.49875898758137166, + "learning_rate": 8.503305145694916e-06, + "loss": 0.1195, + "step": 3268 + }, + { + "epoch": 0.28, + "grad_norm": 0.2315835794495503, + "learning_rate": 8.502331377641535e-06, + "loss": 0.0555, + "step": 3269 + }, + { + "epoch": 0.28, + "grad_norm": 0.30933923842008554, + "learning_rate": 8.50135734871411e-06, + "loss": 0.0939, + "step": 3270 + }, + { + "epoch": 0.28, + "grad_norm": 0.5624360134399001, + "learning_rate": 8.500383058985193e-06, + "loss": 0.1329, + "step": 3271 + }, + { + "epoch": 0.28, + "grad_norm": 0.295397970556791, + "learning_rate": 8.499408508527355e-06, + "loss": 0.0851, + "step": 3272 + }, + { + "epoch": 0.28, + "grad_norm": 0.1675894245864827, + "learning_rate": 8.498433697413186e-06, + "loss": 0.0307, + "step": 3273 + }, + { + "epoch": 0.28, + "grad_norm": 0.5132502603384155, + "learning_rate": 8.497458625715294e-06, + "loss": 0.0918, + "step": 3274 + }, + { + "epoch": 0.28, + "grad_norm": 0.4733679774538817, + "learning_rate": 8.49648329350631e-06, + "loss": 0.1027, + "step": 3275 + }, + { + "epoch": 0.28, + "grad_norm": 0.27256448915757814, + "learning_rate": 8.495507700858882e-06, + "loss": 0.0805, + "step": 3276 + }, + { + "epoch": 0.28, + "grad_norm": 0.37408728493161036, + "learning_rate": 8.494531847845678e-06, + "loss": 0.1008, + "step": 3277 + }, + { + "epoch": 0.28, + "grad_norm": 0.39925872046497757, + "learning_rate": 8.493555734539386e-06, + "loss": 0.0921, + "step": 3278 + }, + { + "epoch": 0.28, + "grad_norm": 0.35720907920386613, + "learning_rate": 8.49257936101271e-06, + "loss": 0.0671, + "step": 3279 + }, + { + "epoch": 0.28, + "grad_norm": 0.2864578666391323, + "learning_rate": 8.491602727338375e-06, + "loss": 0.0878, + "step": 3280 + }, + { + "epoch": 0.28, + "grad_norm": 0.39967924543228067, + "learning_rate": 8.490625833589133e-06, + "loss": 0.1144, + "step": 3281 + }, + { + "epoch": 0.28, + "grad_norm": 0.41537538493289095, + "learning_rate": 8.489648679837741e-06, + "loss": 0.1004, + "step": 3282 + }, + { + "epoch": 0.28, + "grad_norm": 0.47951841674767526, + "learning_rate": 8.48867126615699e-06, + "loss": 0.0895, + "step": 3283 + }, + { + "epoch": 0.28, + "grad_norm": 0.38633985217669187, + "learning_rate": 8.487693592619677e-06, + "loss": 0.0836, + "step": 3284 + }, + { + "epoch": 0.28, + "grad_norm": 0.40296827372207644, + "learning_rate": 8.486715659298627e-06, + "loss": 0.104, + "step": 3285 + }, + { + "epoch": 0.28, + "grad_norm": 0.38767267164524444, + "learning_rate": 8.485737466266688e-06, + "loss": 0.1304, + "step": 3286 + }, + { + "epoch": 0.28, + "grad_norm": 0.35823461375247706, + "learning_rate": 8.484759013596713e-06, + "loss": 0.0967, + "step": 3287 + }, + { + "epoch": 0.28, + "grad_norm": 0.35578414199319686, + "learning_rate": 8.483780301361587e-06, + "loss": 0.0866, + "step": 3288 + }, + { + "epoch": 0.28, + "grad_norm": 0.3968411679378174, + "learning_rate": 8.482801329634209e-06, + "loss": 0.116, + "step": 3289 + }, + { + "epoch": 0.28, + "grad_norm": 0.44163512755169076, + "learning_rate": 8.481822098487502e-06, + "loss": 0.0716, + "step": 3290 + }, + { + "epoch": 0.28, + "grad_norm": 0.3954139121320707, + "learning_rate": 8.4808426079944e-06, + "loss": 0.0712, + "step": 3291 + }, + { + "epoch": 0.28, + "grad_norm": 0.44121891200486024, + "learning_rate": 8.479862858227863e-06, + "loss": 0.1385, + "step": 3292 + }, + { + "epoch": 0.28, + "grad_norm": 0.356646533576252, + "learning_rate": 8.478882849260868e-06, + "loss": 0.094, + "step": 3293 + }, + { + "epoch": 0.28, + "grad_norm": 0.3908175516053105, + "learning_rate": 8.477902581166413e-06, + "loss": 0.0981, + "step": 3294 + }, + { + "epoch": 0.28, + "grad_norm": 0.41678076025180893, + "learning_rate": 8.476922054017514e-06, + "loss": 0.0978, + "step": 3295 + }, + { + "epoch": 0.28, + "grad_norm": 0.2448513105698184, + "learning_rate": 8.475941267887206e-06, + "loss": 0.0542, + "step": 3296 + }, + { + "epoch": 0.28, + "grad_norm": 0.3616055497447477, + "learning_rate": 8.474960222848542e-06, + "loss": 0.1087, + "step": 3297 + }, + { + "epoch": 0.28, + "grad_norm": 0.33022684331591606, + "learning_rate": 8.473978918974598e-06, + "loss": 0.0679, + "step": 3298 + }, + { + "epoch": 0.28, + "grad_norm": 0.40708789926489036, + "learning_rate": 8.472997356338469e-06, + "loss": 0.0677, + "step": 3299 + }, + { + "epoch": 0.28, + "grad_norm": 0.30322614204820136, + "learning_rate": 8.472015535013262e-06, + "loss": 0.0914, + "step": 3300 + }, + { + "epoch": 0.28, + "grad_norm": 0.5341928224666488, + "learning_rate": 8.471033455072114e-06, + "loss": 0.1415, + "step": 3301 + }, + { + "epoch": 0.28, + "grad_norm": 0.4299489974869717, + "learning_rate": 8.470051116588174e-06, + "loss": 0.119, + "step": 3302 + }, + { + "epoch": 0.28, + "grad_norm": 0.7760515560778837, + "learning_rate": 8.46906851963461e-06, + "loss": 0.1193, + "step": 3303 + }, + { + "epoch": 0.28, + "grad_norm": 0.43172189096797997, + "learning_rate": 8.468085664284615e-06, + "loss": 0.1018, + "step": 3304 + }, + { + "epoch": 0.28, + "grad_norm": 0.36875631357121696, + "learning_rate": 8.467102550611398e-06, + "loss": 0.082, + "step": 3305 + }, + { + "epoch": 0.28, + "grad_norm": 0.43235261026342625, + "learning_rate": 8.466119178688186e-06, + "loss": 0.0841, + "step": 3306 + }, + { + "epoch": 0.28, + "grad_norm": 0.3569165129764003, + "learning_rate": 8.465135548588224e-06, + "loss": 0.0472, + "step": 3307 + }, + { + "epoch": 0.28, + "grad_norm": 0.31461599294489223, + "learning_rate": 8.46415166038478e-06, + "loss": 0.0759, + "step": 3308 + }, + { + "epoch": 0.28, + "grad_norm": 0.4468755758505138, + "learning_rate": 8.463167514151142e-06, + "loss": 0.1099, + "step": 3309 + }, + { + "epoch": 0.28, + "grad_norm": 0.21892813775295478, + "learning_rate": 8.462183109960613e-06, + "loss": 0.0521, + "step": 3310 + }, + { + "epoch": 0.28, + "grad_norm": 0.2304771767209447, + "learning_rate": 8.461198447886517e-06, + "loss": 0.0693, + "step": 3311 + }, + { + "epoch": 0.28, + "grad_norm": 0.41580216945953946, + "learning_rate": 8.460213528002197e-06, + "loss": 0.0788, + "step": 3312 + }, + { + "epoch": 0.28, + "grad_norm": 0.3435692167999843, + "learning_rate": 8.459228350381017e-06, + "loss": 0.0884, + "step": 3313 + }, + { + "epoch": 0.28, + "grad_norm": 0.3845269099039142, + "learning_rate": 8.458242915096356e-06, + "loss": 0.088, + "step": 3314 + }, + { + "epoch": 0.28, + "grad_norm": 0.37064353233896713, + "learning_rate": 8.45725722222162e-06, + "loss": 0.0855, + "step": 3315 + }, + { + "epoch": 0.28, + "grad_norm": 0.4662331189380145, + "learning_rate": 8.456271271830223e-06, + "loss": 0.1055, + "step": 3316 + }, + { + "epoch": 0.28, + "grad_norm": 0.2130347691131314, + "learning_rate": 8.45528506399561e-06, + "loss": 0.0627, + "step": 3317 + }, + { + "epoch": 0.28, + "grad_norm": 0.3033374201811118, + "learning_rate": 8.454298598791235e-06, + "loss": 0.0567, + "step": 3318 + }, + { + "epoch": 0.28, + "grad_norm": 0.28753764074345245, + "learning_rate": 8.45331187629058e-06, + "loss": 0.0688, + "step": 3319 + }, + { + "epoch": 0.28, + "grad_norm": 0.4298701532828766, + "learning_rate": 8.452324896567137e-06, + "loss": 0.1134, + "step": 3320 + }, + { + "epoch": 0.28, + "grad_norm": 0.24269968958832125, + "learning_rate": 8.451337659694424e-06, + "loss": 0.0547, + "step": 3321 + }, + { + "epoch": 0.28, + "grad_norm": 0.45275274014640904, + "learning_rate": 8.450350165745979e-06, + "loss": 0.1158, + "step": 3322 + }, + { + "epoch": 0.28, + "grad_norm": 0.33374972792258467, + "learning_rate": 8.449362414795353e-06, + "loss": 0.0934, + "step": 3323 + }, + { + "epoch": 0.28, + "grad_norm": 0.3141848919402492, + "learning_rate": 8.44837440691612e-06, + "loss": 0.0724, + "step": 3324 + }, + { + "epoch": 0.28, + "grad_norm": 0.5028623126882972, + "learning_rate": 8.447386142181873e-06, + "loss": 0.0945, + "step": 3325 + }, + { + "epoch": 0.28, + "grad_norm": 0.40024990555539547, + "learning_rate": 8.446397620666222e-06, + "loss": 0.1056, + "step": 3326 + }, + { + "epoch": 0.28, + "grad_norm": 0.2410627703341638, + "learning_rate": 8.445408842442802e-06, + "loss": 0.0929, + "step": 3327 + }, + { + "epoch": 0.28, + "grad_norm": 0.32016105033735426, + "learning_rate": 8.44441980758526e-06, + "loss": 0.096, + "step": 3328 + }, + { + "epoch": 0.28, + "grad_norm": 0.38275698114661477, + "learning_rate": 8.443430516167263e-06, + "loss": 0.1167, + "step": 3329 + }, + { + "epoch": 0.28, + "grad_norm": 0.23848083978635368, + "learning_rate": 8.442440968262503e-06, + "loss": 0.0758, + "step": 3330 + }, + { + "epoch": 0.28, + "grad_norm": 0.3357317616355323, + "learning_rate": 8.441451163944687e-06, + "loss": 0.0666, + "step": 3331 + }, + { + "epoch": 0.28, + "grad_norm": 0.4315098014956081, + "learning_rate": 8.440461103287541e-06, + "loss": 0.1221, + "step": 3332 + }, + { + "epoch": 0.28, + "grad_norm": 0.22908625309514344, + "learning_rate": 8.439470786364808e-06, + "loss": 0.0573, + "step": 3333 + }, + { + "epoch": 0.28, + "grad_norm": 0.19572494488928305, + "learning_rate": 8.438480213250256e-06, + "loss": 0.0467, + "step": 3334 + }, + { + "epoch": 0.28, + "grad_norm": 0.32482667806699966, + "learning_rate": 8.437489384017667e-06, + "loss": 0.1303, + "step": 3335 + }, + { + "epoch": 0.28, + "grad_norm": 0.4469492501081355, + "learning_rate": 8.436498298740842e-06, + "loss": 0.1065, + "step": 3336 + }, + { + "epoch": 0.28, + "grad_norm": 0.46064522623430737, + "learning_rate": 8.435506957493606e-06, + "loss": 0.1077, + "step": 3337 + }, + { + "epoch": 0.28, + "grad_norm": 0.267567471283384, + "learning_rate": 8.434515360349798e-06, + "loss": 0.0619, + "step": 3338 + }, + { + "epoch": 0.28, + "grad_norm": 0.30638934078597135, + "learning_rate": 8.433523507383279e-06, + "loss": 0.0894, + "step": 3339 + }, + { + "epoch": 0.28, + "grad_norm": 0.40060837399836735, + "learning_rate": 8.432531398667928e-06, + "loss": 0.1157, + "step": 3340 + }, + { + "epoch": 0.28, + "grad_norm": 0.324327732462354, + "learning_rate": 8.431539034277642e-06, + "loss": 0.1002, + "step": 3341 + }, + { + "epoch": 0.28, + "grad_norm": 0.39166590891194175, + "learning_rate": 8.430546414286336e-06, + "loss": 0.0946, + "step": 3342 + }, + { + "epoch": 0.28, + "grad_norm": 0.6252142762430505, + "learning_rate": 8.429553538767952e-06, + "loss": 0.0928, + "step": 3343 + }, + { + "epoch": 0.28, + "grad_norm": 0.6107044291807683, + "learning_rate": 8.42856040779644e-06, + "loss": 0.0728, + "step": 3344 + }, + { + "epoch": 0.28, + "grad_norm": 0.4558872260103241, + "learning_rate": 8.427567021445777e-06, + "loss": 0.1205, + "step": 3345 + }, + { + "epoch": 0.28, + "grad_norm": 0.3875100230588183, + "learning_rate": 8.426573379789956e-06, + "loss": 0.0436, + "step": 3346 + }, + { + "epoch": 0.28, + "grad_norm": 0.3771494326187267, + "learning_rate": 8.425579482902986e-06, + "loss": 0.0913, + "step": 3347 + }, + { + "epoch": 0.28, + "grad_norm": 0.5725745290525726, + "learning_rate": 8.4245853308589e-06, + "loss": 0.1401, + "step": 3348 + }, + { + "epoch": 0.28, + "grad_norm": 0.3774470102625326, + "learning_rate": 8.423590923731753e-06, + "loss": 0.0682, + "step": 3349 + }, + { + "epoch": 0.28, + "grad_norm": 0.562252413355768, + "learning_rate": 8.422596261595608e-06, + "loss": 0.1251, + "step": 3350 + }, + { + "epoch": 0.28, + "grad_norm": 0.31680553855293964, + "learning_rate": 8.421601344524555e-06, + "loss": 0.1092, + "step": 3351 + }, + { + "epoch": 0.28, + "grad_norm": 0.38373734913665547, + "learning_rate": 8.4206061725927e-06, + "loss": 0.1094, + "step": 3352 + }, + { + "epoch": 0.28, + "grad_norm": 0.20837822263642564, + "learning_rate": 8.419610745874175e-06, + "loss": 0.0621, + "step": 3353 + }, + { + "epoch": 0.28, + "grad_norm": 0.3703888947655228, + "learning_rate": 8.418615064443116e-06, + "loss": 0.0915, + "step": 3354 + }, + { + "epoch": 0.28, + "grad_norm": 0.6442335432160816, + "learning_rate": 8.417619128373695e-06, + "loss": 0.1201, + "step": 3355 + }, + { + "epoch": 0.28, + "grad_norm": 0.4260252350601668, + "learning_rate": 8.416622937740091e-06, + "loss": 0.0503, + "step": 3356 + }, + { + "epoch": 0.28, + "grad_norm": 0.5819320198359181, + "learning_rate": 8.415626492616509e-06, + "loss": 0.0822, + "step": 3357 + }, + { + "epoch": 0.28, + "grad_norm": 0.4235205358802179, + "learning_rate": 8.414629793077167e-06, + "loss": 0.1046, + "step": 3358 + }, + { + "epoch": 0.28, + "grad_norm": 0.2910420856174988, + "learning_rate": 8.413632839196306e-06, + "loss": 0.0737, + "step": 3359 + }, + { + "epoch": 0.28, + "grad_norm": 0.406100722013346, + "learning_rate": 8.412635631048188e-06, + "loss": 0.1004, + "step": 3360 + }, + { + "epoch": 0.28, + "grad_norm": 0.7850592134220534, + "learning_rate": 8.411638168707086e-06, + "loss": 0.1885, + "step": 3361 + }, + { + "epoch": 0.28, + "grad_norm": 0.39435603762595733, + "learning_rate": 8.410640452247299e-06, + "loss": 0.0798, + "step": 3362 + }, + { + "epoch": 0.28, + "grad_norm": 0.25497441405730864, + "learning_rate": 8.409642481743141e-06, + "loss": 0.0436, + "step": 3363 + }, + { + "epoch": 0.28, + "grad_norm": 0.2379903396782314, + "learning_rate": 8.408644257268951e-06, + "loss": 0.0468, + "step": 3364 + }, + { + "epoch": 0.28, + "grad_norm": 0.5142612670676274, + "learning_rate": 8.407645778899078e-06, + "loss": 0.0838, + "step": 3365 + }, + { + "epoch": 0.28, + "grad_norm": 0.3831801946782589, + "learning_rate": 8.4066470467079e-06, + "loss": 0.0887, + "step": 3366 + }, + { + "epoch": 0.28, + "grad_norm": 0.29258461204526076, + "learning_rate": 8.4056480607698e-06, + "loss": 0.0829, + "step": 3367 + }, + { + "epoch": 0.28, + "grad_norm": 0.20288648934254114, + "learning_rate": 8.404648821159196e-06, + "loss": 0.0543, + "step": 3368 + }, + { + "epoch": 0.28, + "grad_norm": 0.4027044736075758, + "learning_rate": 8.403649327950511e-06, + "loss": 0.125, + "step": 3369 + }, + { + "epoch": 0.28, + "grad_norm": 0.6729101275743553, + "learning_rate": 8.4026495812182e-06, + "loss": 0.1429, + "step": 3370 + }, + { + "epoch": 0.28, + "grad_norm": 0.2694361518728078, + "learning_rate": 8.401649581036724e-06, + "loss": 0.0842, + "step": 3371 + }, + { + "epoch": 0.28, + "grad_norm": 0.2010541498615636, + "learning_rate": 8.400649327480572e-06, + "loss": 0.0416, + "step": 3372 + }, + { + "epoch": 0.28, + "grad_norm": 0.38034497185975724, + "learning_rate": 8.399648820624249e-06, + "loss": 0.0667, + "step": 3373 + }, + { + "epoch": 0.28, + "grad_norm": 0.2916757737457415, + "learning_rate": 8.398648060542275e-06, + "loss": 0.0696, + "step": 3374 + }, + { + "epoch": 0.28, + "grad_norm": 0.2542035852949873, + "learning_rate": 8.397647047309198e-06, + "loss": 0.042, + "step": 3375 + }, + { + "epoch": 0.28, + "grad_norm": 0.19226230547334297, + "learning_rate": 8.396645780999573e-06, + "loss": 0.0291, + "step": 3376 + }, + { + "epoch": 0.28, + "grad_norm": 0.27866466288238123, + "learning_rate": 8.395644261687985e-06, + "loss": 0.0645, + "step": 3377 + }, + { + "epoch": 0.28, + "grad_norm": 0.43291053893351605, + "learning_rate": 8.394642489449031e-06, + "loss": 0.0949, + "step": 3378 + }, + { + "epoch": 0.28, + "grad_norm": 0.2437447326906455, + "learning_rate": 8.393640464357329e-06, + "loss": 0.051, + "step": 3379 + }, + { + "epoch": 0.28, + "grad_norm": 0.40948147827448067, + "learning_rate": 8.392638186487516e-06, + "loss": 0.119, + "step": 3380 + }, + { + "epoch": 0.28, + "grad_norm": 0.3842415440354286, + "learning_rate": 8.391635655914247e-06, + "loss": 0.0994, + "step": 3381 + }, + { + "epoch": 0.28, + "grad_norm": 0.3067628130372608, + "learning_rate": 8.390632872712198e-06, + "loss": 0.1152, + "step": 3382 + }, + { + "epoch": 0.29, + "grad_norm": 0.24297628585068776, + "learning_rate": 8.38962983695606e-06, + "loss": 0.0581, + "step": 3383 + }, + { + "epoch": 0.29, + "grad_norm": 0.46841119776419604, + "learning_rate": 8.388626548720544e-06, + "loss": 0.0951, + "step": 3384 + }, + { + "epoch": 0.29, + "grad_norm": 0.32623700397439986, + "learning_rate": 8.387623008080385e-06, + "loss": 0.092, + "step": 3385 + }, + { + "epoch": 0.29, + "grad_norm": 0.3053157098640161, + "learning_rate": 8.386619215110328e-06, + "loss": 0.0773, + "step": 3386 + }, + { + "epoch": 0.29, + "grad_norm": 0.46665660376585216, + "learning_rate": 8.385615169885144e-06, + "loss": 0.0956, + "step": 3387 + }, + { + "epoch": 0.29, + "grad_norm": 0.4055192819865835, + "learning_rate": 8.38461087247962e-06, + "loss": 0.1216, + "step": 3388 + }, + { + "epoch": 0.29, + "grad_norm": 0.2561331108094009, + "learning_rate": 8.38360632296856e-06, + "loss": 0.092, + "step": 3389 + }, + { + "epoch": 0.29, + "grad_norm": 0.32356700897078516, + "learning_rate": 8.382601521426792e-06, + "loss": 0.0752, + "step": 3390 + }, + { + "epoch": 0.29, + "grad_norm": 0.30851726092992915, + "learning_rate": 8.381596467929156e-06, + "loss": 0.0639, + "step": 3391 + }, + { + "epoch": 0.29, + "grad_norm": 0.2730356500934993, + "learning_rate": 8.380591162550516e-06, + "loss": 0.0625, + "step": 3392 + }, + { + "epoch": 0.29, + "grad_norm": 0.35882615126242606, + "learning_rate": 8.379585605365753e-06, + "loss": 0.0856, + "step": 3393 + }, + { + "epoch": 0.29, + "grad_norm": 0.3049135956875774, + "learning_rate": 8.378579796449767e-06, + "loss": 0.0467, + "step": 3394 + }, + { + "epoch": 0.29, + "grad_norm": 0.7393813429775998, + "learning_rate": 8.377573735877476e-06, + "loss": 0.1475, + "step": 3395 + }, + { + "epoch": 0.29, + "grad_norm": 0.38477887793628174, + "learning_rate": 8.376567423723817e-06, + "loss": 0.0804, + "step": 3396 + }, + { + "epoch": 0.29, + "grad_norm": 0.3014155759279757, + "learning_rate": 8.375560860063747e-06, + "loss": 0.0525, + "step": 3397 + }, + { + "epoch": 0.29, + "grad_norm": 0.4305428006655133, + "learning_rate": 8.374554044972239e-06, + "loss": 0.1105, + "step": 3398 + }, + { + "epoch": 0.29, + "grad_norm": 0.20110282404068208, + "learning_rate": 8.373546978524288e-06, + "loss": 0.0358, + "step": 3399 + }, + { + "epoch": 0.29, + "grad_norm": 0.24467207764729795, + "learning_rate": 8.372539660794907e-06, + "loss": 0.0649, + "step": 3400 + }, + { + "epoch": 0.29, + "grad_norm": 0.2879025959924376, + "learning_rate": 8.371532091859123e-06, + "loss": 0.0589, + "step": 3401 + }, + { + "epoch": 0.29, + "grad_norm": 0.39648550462949134, + "learning_rate": 8.37052427179199e-06, + "loss": 0.1069, + "step": 3402 + }, + { + "epoch": 0.29, + "grad_norm": 0.39206689623802476, + "learning_rate": 8.369516200668574e-06, + "loss": 0.0887, + "step": 3403 + }, + { + "epoch": 0.29, + "grad_norm": 0.3661587965246451, + "learning_rate": 8.368507878563963e-06, + "loss": 0.0656, + "step": 3404 + }, + { + "epoch": 0.29, + "grad_norm": 0.8021457762769479, + "learning_rate": 8.367499305553261e-06, + "loss": 0.1068, + "step": 3405 + }, + { + "epoch": 0.29, + "grad_norm": 0.5576168217332138, + "learning_rate": 8.366490481711594e-06, + "loss": 0.0585, + "step": 3406 + }, + { + "epoch": 0.29, + "grad_norm": 0.42226592023703713, + "learning_rate": 8.365481407114104e-06, + "loss": 0.1144, + "step": 3407 + }, + { + "epoch": 0.29, + "grad_norm": 0.38381436000033164, + "learning_rate": 8.364472081835955e-06, + "loss": 0.1067, + "step": 3408 + }, + { + "epoch": 0.29, + "grad_norm": 0.42376842507324347, + "learning_rate": 8.363462505952325e-06, + "loss": 0.0711, + "step": 3409 + }, + { + "epoch": 0.29, + "grad_norm": 0.3744793545771979, + "learning_rate": 8.362452679538415e-06, + "loss": 0.1398, + "step": 3410 + }, + { + "epoch": 0.29, + "grad_norm": 0.424122164805052, + "learning_rate": 8.36144260266944e-06, + "loss": 0.119, + "step": 3411 + }, + { + "epoch": 0.29, + "grad_norm": 0.40064419966549436, + "learning_rate": 8.36043227542064e-06, + "loss": 0.0956, + "step": 3412 + }, + { + "epoch": 0.29, + "grad_norm": 0.5196394155545212, + "learning_rate": 8.359421697867266e-06, + "loss": 0.1166, + "step": 3413 + }, + { + "epoch": 0.29, + "grad_norm": 0.3187101905279199, + "learning_rate": 8.358410870084595e-06, + "loss": 0.0723, + "step": 3414 + }, + { + "epoch": 0.29, + "grad_norm": 0.38243723969580906, + "learning_rate": 8.357399792147917e-06, + "loss": 0.1356, + "step": 3415 + }, + { + "epoch": 0.29, + "grad_norm": 0.28558225328996106, + "learning_rate": 8.356388464132546e-06, + "loss": 0.0936, + "step": 3416 + }, + { + "epoch": 0.29, + "grad_norm": 0.268210303968939, + "learning_rate": 8.35537688611381e-06, + "loss": 0.0905, + "step": 3417 + }, + { + "epoch": 0.29, + "grad_norm": 0.3855491479870864, + "learning_rate": 8.354365058167055e-06, + "loss": 0.1107, + "step": 3418 + }, + { + "epoch": 0.29, + "grad_norm": 0.6535339478613275, + "learning_rate": 8.353352980367653e-06, + "loss": 0.1088, + "step": 3419 + }, + { + "epoch": 0.29, + "grad_norm": 0.32860708361758817, + "learning_rate": 8.352340652790982e-06, + "loss": 0.0565, + "step": 3420 + }, + { + "epoch": 0.29, + "grad_norm": 0.6543902840969773, + "learning_rate": 8.351328075512454e-06, + "loss": 0.1378, + "step": 3421 + }, + { + "epoch": 0.29, + "grad_norm": 0.30560326658256287, + "learning_rate": 8.350315248607485e-06, + "loss": 0.0918, + "step": 3422 + }, + { + "epoch": 0.29, + "grad_norm": 0.3127499532125756, + "learning_rate": 8.34930217215152e-06, + "loss": 0.0786, + "step": 3423 + }, + { + "epoch": 0.29, + "grad_norm": 0.34923588434775715, + "learning_rate": 8.34828884622002e-06, + "loss": 0.0773, + "step": 3424 + }, + { + "epoch": 0.29, + "grad_norm": 0.36858274709077565, + "learning_rate": 8.347275270888462e-06, + "loss": 0.0954, + "step": 3425 + }, + { + "epoch": 0.29, + "grad_norm": 0.20811523959502642, + "learning_rate": 8.34626144623234e-06, + "loss": 0.0496, + "step": 3426 + }, + { + "epoch": 0.29, + "grad_norm": 0.3158100457855755, + "learning_rate": 8.345247372327174e-06, + "loss": 0.0809, + "step": 3427 + }, + { + "epoch": 0.29, + "grad_norm": 0.27452373484084475, + "learning_rate": 8.344233049248495e-06, + "loss": 0.0637, + "step": 3428 + }, + { + "epoch": 0.29, + "grad_norm": 0.2608608757382667, + "learning_rate": 8.343218477071857e-06, + "loss": 0.0793, + "step": 3429 + }, + { + "epoch": 0.29, + "grad_norm": 0.2680826696313099, + "learning_rate": 8.342203655872832e-06, + "loss": 0.0852, + "step": 3430 + }, + { + "epoch": 0.29, + "grad_norm": 0.32596781746935116, + "learning_rate": 8.341188585727006e-06, + "loss": 0.0928, + "step": 3431 + }, + { + "epoch": 0.29, + "grad_norm": 0.37439402604427874, + "learning_rate": 8.340173266709994e-06, + "loss": 0.0581, + "step": 3432 + }, + { + "epoch": 0.29, + "grad_norm": 0.3936628481772283, + "learning_rate": 8.339157698897417e-06, + "loss": 0.077, + "step": 3433 + }, + { + "epoch": 0.29, + "grad_norm": 0.34204410315491285, + "learning_rate": 8.338141882364924e-06, + "loss": 0.1201, + "step": 3434 + }, + { + "epoch": 0.29, + "grad_norm": 0.2479212433040463, + "learning_rate": 8.337125817188177e-06, + "loss": 0.0652, + "step": 3435 + }, + { + "epoch": 0.29, + "grad_norm": 0.31367139096097785, + "learning_rate": 8.336109503442858e-06, + "loss": 0.0941, + "step": 3436 + }, + { + "epoch": 0.29, + "grad_norm": 0.3669283433980056, + "learning_rate": 8.335092941204668e-06, + "loss": 0.1177, + "step": 3437 + }, + { + "epoch": 0.29, + "grad_norm": 0.30490197760017007, + "learning_rate": 8.33407613054933e-06, + "loss": 0.0925, + "step": 3438 + }, + { + "epoch": 0.29, + "grad_norm": 0.43609424706837446, + "learning_rate": 8.333059071552579e-06, + "loss": 0.0962, + "step": 3439 + }, + { + "epoch": 0.29, + "grad_norm": 0.3523186587277727, + "learning_rate": 8.332041764290171e-06, + "loss": 0.0867, + "step": 3440 + }, + { + "epoch": 0.29, + "grad_norm": 0.25415591107423124, + "learning_rate": 8.331024208837882e-06, + "loss": 0.0555, + "step": 3441 + }, + { + "epoch": 0.29, + "grad_norm": 0.36675167917227486, + "learning_rate": 8.330006405271504e-06, + "loss": 0.043, + "step": 3442 + }, + { + "epoch": 0.29, + "grad_norm": 0.5277467307141047, + "learning_rate": 8.328988353666852e-06, + "loss": 0.1209, + "step": 3443 + }, + { + "epoch": 0.29, + "grad_norm": 0.21321544284063657, + "learning_rate": 8.327970054099754e-06, + "loss": 0.0516, + "step": 3444 + }, + { + "epoch": 0.29, + "grad_norm": 0.35728047458352996, + "learning_rate": 8.32695150664606e-06, + "loss": 0.1053, + "step": 3445 + }, + { + "epoch": 0.29, + "grad_norm": 0.32190113084790645, + "learning_rate": 8.325932711381636e-06, + "loss": 0.1214, + "step": 3446 + }, + { + "epoch": 0.29, + "grad_norm": 0.27364183733812764, + "learning_rate": 8.324913668382368e-06, + "loss": 0.0831, + "step": 3447 + }, + { + "epoch": 0.29, + "grad_norm": 0.3117275454118977, + "learning_rate": 8.323894377724163e-06, + "loss": 0.1122, + "step": 3448 + }, + { + "epoch": 0.29, + "grad_norm": 0.40745333124696304, + "learning_rate": 8.322874839482941e-06, + "loss": 0.1058, + "step": 3449 + }, + { + "epoch": 0.29, + "grad_norm": 0.3805713969100236, + "learning_rate": 8.321855053734643e-06, + "loss": 0.1098, + "step": 3450 + }, + { + "epoch": 0.29, + "grad_norm": 0.432110539569376, + "learning_rate": 8.32083502055523e-06, + "loss": 0.0969, + "step": 3451 + }, + { + "epoch": 0.29, + "grad_norm": 0.5270207021297184, + "learning_rate": 8.319814740020678e-06, + "loss": 0.1184, + "step": 3452 + }, + { + "epoch": 0.29, + "grad_norm": 0.4031304466773929, + "learning_rate": 8.318794212206986e-06, + "loss": 0.0474, + "step": 3453 + }, + { + "epoch": 0.29, + "grad_norm": 0.6952881550734994, + "learning_rate": 8.317773437190169e-06, + "loss": 0.1224, + "step": 3454 + }, + { + "epoch": 0.29, + "grad_norm": 0.39883864374984407, + "learning_rate": 8.316752415046258e-06, + "loss": 0.1101, + "step": 3455 + }, + { + "epoch": 0.29, + "grad_norm": 0.20566967699171593, + "learning_rate": 8.315731145851305e-06, + "loss": 0.0626, + "step": 3456 + }, + { + "epoch": 0.29, + "grad_norm": 0.3322982673747095, + "learning_rate": 8.31470962968138e-06, + "loss": 0.0628, + "step": 3457 + }, + { + "epoch": 0.29, + "grad_norm": 0.2523443883086348, + "learning_rate": 8.313687866612574e-06, + "loss": 0.094, + "step": 3458 + }, + { + "epoch": 0.29, + "grad_norm": 0.40750634059975216, + "learning_rate": 8.31266585672099e-06, + "loss": 0.1194, + "step": 3459 + }, + { + "epoch": 0.29, + "grad_norm": 0.21047445463151845, + "learning_rate": 8.311643600082758e-06, + "loss": 0.069, + "step": 3460 + }, + { + "epoch": 0.29, + "grad_norm": 0.18731140143254316, + "learning_rate": 8.310621096774016e-06, + "loss": 0.0535, + "step": 3461 + }, + { + "epoch": 0.29, + "grad_norm": 0.5631463785512291, + "learning_rate": 8.309598346870931e-06, + "loss": 0.0901, + "step": 3462 + }, + { + "epoch": 0.29, + "grad_norm": 0.48073372924624547, + "learning_rate": 8.308575350449679e-06, + "loss": 0.1201, + "step": 3463 + }, + { + "epoch": 0.29, + "grad_norm": 0.5515112312867327, + "learning_rate": 8.307552107586463e-06, + "loss": 0.1343, + "step": 3464 + }, + { + "epoch": 0.29, + "grad_norm": 0.30658532689762324, + "learning_rate": 8.306528618357498e-06, + "loss": 0.0635, + "step": 3465 + }, + { + "epoch": 0.29, + "grad_norm": 0.26958992981370533, + "learning_rate": 8.30550488283902e-06, + "loss": 0.0983, + "step": 3466 + }, + { + "epoch": 0.29, + "grad_norm": 0.24971534817696114, + "learning_rate": 8.304480901107282e-06, + "loss": 0.0463, + "step": 3467 + }, + { + "epoch": 0.29, + "grad_norm": 0.36192954983787184, + "learning_rate": 8.303456673238555e-06, + "loss": 0.1318, + "step": 3468 + }, + { + "epoch": 0.29, + "grad_norm": 0.26012356317526036, + "learning_rate": 8.302432199309132e-06, + "loss": 0.0964, + "step": 3469 + }, + { + "epoch": 0.29, + "grad_norm": 0.22578942833083543, + "learning_rate": 8.30140747939532e-06, + "loss": 0.0751, + "step": 3470 + }, + { + "epoch": 0.29, + "grad_norm": 0.3906385308712652, + "learning_rate": 8.300382513573447e-06, + "loss": 0.0834, + "step": 3471 + }, + { + "epoch": 0.29, + "grad_norm": 0.25919778751735095, + "learning_rate": 8.299357301919859e-06, + "loss": 0.056, + "step": 3472 + }, + { + "epoch": 0.29, + "grad_norm": 0.3045165011962118, + "learning_rate": 8.298331844510918e-06, + "loss": 0.0811, + "step": 3473 + }, + { + "epoch": 0.29, + "grad_norm": 0.25508796042251275, + "learning_rate": 8.297306141423009e-06, + "loss": 0.0684, + "step": 3474 + }, + { + "epoch": 0.29, + "grad_norm": 0.3406187460622309, + "learning_rate": 8.296280192732529e-06, + "loss": 0.123, + "step": 3475 + }, + { + "epoch": 0.29, + "grad_norm": 0.39811895158647004, + "learning_rate": 8.295253998515897e-06, + "loss": 0.0992, + "step": 3476 + }, + { + "epoch": 0.29, + "grad_norm": 0.28772373208319796, + "learning_rate": 8.294227558849553e-06, + "loss": 0.0981, + "step": 3477 + }, + { + "epoch": 0.29, + "grad_norm": 0.43034726107327487, + "learning_rate": 8.293200873809948e-06, + "loss": 0.1163, + "step": 3478 + }, + { + "epoch": 0.29, + "grad_norm": 0.3795166731517897, + "learning_rate": 8.29217394347356e-06, + "loss": 0.0828, + "step": 3479 + }, + { + "epoch": 0.29, + "grad_norm": 0.26059099469733243, + "learning_rate": 8.291146767916875e-06, + "loss": 0.0819, + "step": 3480 + }, + { + "epoch": 0.29, + "grad_norm": 0.26233130541654637, + "learning_rate": 8.29011934721641e-06, + "loss": 0.0412, + "step": 3481 + }, + { + "epoch": 0.29, + "grad_norm": 0.3730884760728986, + "learning_rate": 8.289091681448688e-06, + "loss": 0.0904, + "step": 3482 + }, + { + "epoch": 0.29, + "grad_norm": 0.35183852015895173, + "learning_rate": 8.288063770690257e-06, + "loss": 0.0987, + "step": 3483 + }, + { + "epoch": 0.29, + "grad_norm": 0.6369365079551031, + "learning_rate": 8.287035615017682e-06, + "loss": 0.1072, + "step": 3484 + }, + { + "epoch": 0.29, + "grad_norm": 0.2618042624267544, + "learning_rate": 8.286007214507547e-06, + "loss": 0.0632, + "step": 3485 + }, + { + "epoch": 0.29, + "grad_norm": 0.24961948529249958, + "learning_rate": 8.28497856923645e-06, + "loss": 0.0738, + "step": 3486 + }, + { + "epoch": 0.29, + "grad_norm": 0.35731559779365896, + "learning_rate": 8.283949679281014e-06, + "loss": 0.1242, + "step": 3487 + }, + { + "epoch": 0.29, + "grad_norm": 0.3841415219010094, + "learning_rate": 8.282920544717876e-06, + "loss": 0.1144, + "step": 3488 + }, + { + "epoch": 0.29, + "grad_norm": 0.3050665812414802, + "learning_rate": 8.281891165623693e-06, + "loss": 0.0651, + "step": 3489 + }, + { + "epoch": 0.29, + "grad_norm": 0.34841682970388693, + "learning_rate": 8.280861542075134e-06, + "loss": 0.0901, + "step": 3490 + }, + { + "epoch": 0.29, + "grad_norm": 0.3746083592924003, + "learning_rate": 8.279831674148898e-06, + "loss": 0.1214, + "step": 3491 + }, + { + "epoch": 0.29, + "grad_norm": 0.28994739762570476, + "learning_rate": 8.27880156192169e-06, + "loss": 0.0745, + "step": 3492 + }, + { + "epoch": 0.29, + "grad_norm": 0.40156534165780644, + "learning_rate": 8.277771205470242e-06, + "loss": 0.0965, + "step": 3493 + }, + { + "epoch": 0.29, + "grad_norm": 0.4694312903797416, + "learning_rate": 8.2767406048713e-06, + "loss": 0.1042, + "step": 3494 + }, + { + "epoch": 0.29, + "grad_norm": 0.2651029565468178, + "learning_rate": 8.27570976020163e-06, + "loss": 0.0955, + "step": 3495 + }, + { + "epoch": 0.29, + "grad_norm": 0.36348285242495476, + "learning_rate": 8.274678671538014e-06, + "loss": 0.078, + "step": 3496 + }, + { + "epoch": 0.29, + "grad_norm": 0.35051604730621977, + "learning_rate": 8.273647338957256e-06, + "loss": 0.0678, + "step": 3497 + }, + { + "epoch": 0.29, + "grad_norm": 0.36564406133536825, + "learning_rate": 8.272615762536171e-06, + "loss": 0.0883, + "step": 3498 + }, + { + "epoch": 0.29, + "grad_norm": 0.25322654230355174, + "learning_rate": 8.271583942351602e-06, + "loss": 0.0714, + "step": 3499 + }, + { + "epoch": 0.29, + "grad_norm": 0.3586927081095055, + "learning_rate": 8.270551878480402e-06, + "loss": 0.0849, + "step": 3500 + }, + { + "epoch": 0.29, + "grad_norm": 0.3027782906153767, + "learning_rate": 8.269519570999444e-06, + "loss": 0.0771, + "step": 3501 + }, + { + "epoch": 0.3, + "grad_norm": 0.4608092505711324, + "learning_rate": 8.268487019985624e-06, + "loss": 0.0808, + "step": 3502 + }, + { + "epoch": 0.3, + "grad_norm": 0.2735354911387808, + "learning_rate": 8.267454225515848e-06, + "loss": 0.0994, + "step": 3503 + }, + { + "epoch": 0.3, + "grad_norm": 0.304705067909962, + "learning_rate": 8.26642118766705e-06, + "loss": 0.0794, + "step": 3504 + }, + { + "epoch": 0.3, + "grad_norm": 0.3965411136038813, + "learning_rate": 8.265387906516171e-06, + "loss": 0.0676, + "step": 3505 + }, + { + "epoch": 0.3, + "grad_norm": 0.4386935815845951, + "learning_rate": 8.26435438214018e-06, + "loss": 0.1038, + "step": 3506 + }, + { + "epoch": 0.3, + "grad_norm": 0.25239037871008524, + "learning_rate": 8.263320614616059e-06, + "loss": 0.0731, + "step": 3507 + }, + { + "epoch": 0.3, + "grad_norm": 0.30296284753705743, + "learning_rate": 8.262286604020808e-06, + "loss": 0.0811, + "step": 3508 + }, + { + "epoch": 0.3, + "grad_norm": 0.3138322538743376, + "learning_rate": 8.261252350431446e-06, + "loss": 0.0718, + "step": 3509 + }, + { + "epoch": 0.3, + "grad_norm": 0.19164499727840723, + "learning_rate": 8.26021785392501e-06, + "loss": 0.0447, + "step": 3510 + }, + { + "epoch": 0.3, + "grad_norm": 0.34125096007889766, + "learning_rate": 8.259183114578556e-06, + "loss": 0.0892, + "step": 3511 + }, + { + "epoch": 0.3, + "grad_norm": 0.3341500671102026, + "learning_rate": 8.258148132469157e-06, + "loss": 0.082, + "step": 3512 + }, + { + "epoch": 0.3, + "grad_norm": 0.46559353871791576, + "learning_rate": 8.257112907673908e-06, + "loss": 0.1557, + "step": 3513 + }, + { + "epoch": 0.3, + "grad_norm": 0.22963138197462746, + "learning_rate": 8.256077440269912e-06, + "loss": 0.0549, + "step": 3514 + }, + { + "epoch": 0.3, + "grad_norm": 0.3097432695846166, + "learning_rate": 8.255041730334303e-06, + "loss": 0.0907, + "step": 3515 + }, + { + "epoch": 0.3, + "grad_norm": 0.2966561078055357, + "learning_rate": 8.254005777944223e-06, + "loss": 0.1031, + "step": 3516 + }, + { + "epoch": 0.3, + "grad_norm": 0.20922882769907816, + "learning_rate": 8.252969583176837e-06, + "loss": 0.0647, + "step": 3517 + }, + { + "epoch": 0.3, + "grad_norm": 0.2843670838712974, + "learning_rate": 8.251933146109326e-06, + "loss": 0.0813, + "step": 3518 + }, + { + "epoch": 0.3, + "grad_norm": 0.35062437946849434, + "learning_rate": 8.25089646681889e-06, + "loss": 0.054, + "step": 3519 + }, + { + "epoch": 0.3, + "grad_norm": 0.28324491593157763, + "learning_rate": 8.249859545382748e-06, + "loss": 0.0886, + "step": 3520 + }, + { + "epoch": 0.3, + "grad_norm": 0.32046533788374665, + "learning_rate": 8.248822381878135e-06, + "loss": 0.0854, + "step": 3521 + }, + { + "epoch": 0.3, + "grad_norm": 0.4422833536255353, + "learning_rate": 8.247784976382306e-06, + "loss": 0.0711, + "step": 3522 + }, + { + "epoch": 0.3, + "grad_norm": 0.19907422451738827, + "learning_rate": 8.24674732897253e-06, + "loss": 0.0651, + "step": 3523 + }, + { + "epoch": 0.3, + "grad_norm": 0.23001856860185055, + "learning_rate": 8.2457094397261e-06, + "loss": 0.0659, + "step": 3524 + }, + { + "epoch": 0.3, + "grad_norm": 0.400343839236898, + "learning_rate": 8.244671308720324e-06, + "loss": 0.0754, + "step": 3525 + }, + { + "epoch": 0.3, + "grad_norm": 0.41407416669775604, + "learning_rate": 8.243632936032526e-06, + "loss": 0.0657, + "step": 3526 + }, + { + "epoch": 0.3, + "grad_norm": 0.4794970798554485, + "learning_rate": 8.242594321740052e-06, + "loss": 0.0884, + "step": 3527 + }, + { + "epoch": 0.3, + "grad_norm": 0.562444624737108, + "learning_rate": 8.241555465920265e-06, + "loss": 0.1285, + "step": 3528 + }, + { + "epoch": 0.3, + "grad_norm": 0.242208397891645, + "learning_rate": 8.240516368650542e-06, + "loss": 0.0535, + "step": 3529 + }, + { + "epoch": 0.3, + "grad_norm": 0.31719335091294476, + "learning_rate": 8.239477030008284e-06, + "loss": 0.0807, + "step": 3530 + }, + { + "epoch": 0.3, + "grad_norm": 0.40360204486442014, + "learning_rate": 8.238437450070904e-06, + "loss": 0.0742, + "step": 3531 + }, + { + "epoch": 0.3, + "grad_norm": 0.23549210019814987, + "learning_rate": 8.237397628915838e-06, + "loss": 0.0627, + "step": 3532 + }, + { + "epoch": 0.3, + "grad_norm": 0.27018376168501396, + "learning_rate": 8.236357566620537e-06, + "loss": 0.0544, + "step": 3533 + }, + { + "epoch": 0.3, + "grad_norm": 0.36156645275346017, + "learning_rate": 8.23531726326247e-06, + "loss": 0.0974, + "step": 3534 + }, + { + "epoch": 0.3, + "grad_norm": 0.3345352689895445, + "learning_rate": 8.234276718919127e-06, + "loss": 0.0818, + "step": 3535 + }, + { + "epoch": 0.3, + "grad_norm": 0.2747257354504063, + "learning_rate": 8.233235933668014e-06, + "loss": 0.0488, + "step": 3536 + }, + { + "epoch": 0.3, + "grad_norm": 0.27251394631729614, + "learning_rate": 8.232194907586652e-06, + "loss": 0.087, + "step": 3537 + }, + { + "epoch": 0.3, + "grad_norm": 0.24365137952845387, + "learning_rate": 8.231153640752588e-06, + "loss": 0.0876, + "step": 3538 + }, + { + "epoch": 0.3, + "grad_norm": 0.34021744218146943, + "learning_rate": 8.230112133243374e-06, + "loss": 0.0611, + "step": 3539 + }, + { + "epoch": 0.3, + "grad_norm": 0.5712587597484108, + "learning_rate": 8.229070385136593e-06, + "loss": 0.0825, + "step": 3540 + }, + { + "epoch": 0.3, + "grad_norm": 0.2661618508196568, + "learning_rate": 8.22802839650984e-06, + "loss": 0.0883, + "step": 3541 + }, + { + "epoch": 0.3, + "grad_norm": 0.22350074367142747, + "learning_rate": 8.226986167440725e-06, + "loss": 0.0521, + "step": 3542 + }, + { + "epoch": 0.3, + "grad_norm": 0.3454542046527263, + "learning_rate": 8.225943698006883e-06, + "loss": 0.1, + "step": 3543 + }, + { + "epoch": 0.3, + "grad_norm": 0.23772672229894337, + "learning_rate": 8.22490098828596e-06, + "loss": 0.0603, + "step": 3544 + }, + { + "epoch": 0.3, + "grad_norm": 0.2369100399154882, + "learning_rate": 8.223858038355628e-06, + "loss": 0.0571, + "step": 3545 + }, + { + "epoch": 0.3, + "grad_norm": 0.8091001582078899, + "learning_rate": 8.222814848293568e-06, + "loss": 0.1457, + "step": 3546 + }, + { + "epoch": 0.3, + "grad_norm": 0.665874188650964, + "learning_rate": 8.221771418177483e-06, + "loss": 0.1688, + "step": 3547 + }, + { + "epoch": 0.3, + "grad_norm": 0.41061806825129316, + "learning_rate": 8.220727748085096e-06, + "loss": 0.0957, + "step": 3548 + }, + { + "epoch": 0.3, + "grad_norm": 0.2206177184196639, + "learning_rate": 8.219683838094143e-06, + "loss": 0.0595, + "step": 3549 + }, + { + "epoch": 0.3, + "grad_norm": 0.4672874731214148, + "learning_rate": 8.21863968828238e-06, + "loss": 0.1043, + "step": 3550 + }, + { + "epoch": 0.3, + "grad_norm": 0.6024646838735105, + "learning_rate": 8.217595298727586e-06, + "loss": 0.123, + "step": 3551 + }, + { + "epoch": 0.3, + "grad_norm": 0.30994340675601734, + "learning_rate": 8.216550669507549e-06, + "loss": 0.0525, + "step": 3552 + }, + { + "epoch": 0.3, + "grad_norm": 0.33669491623773484, + "learning_rate": 8.21550580070008e-06, + "loss": 0.0951, + "step": 3553 + }, + { + "epoch": 0.3, + "grad_norm": 0.23209858450625856, + "learning_rate": 8.214460692383008e-06, + "loss": 0.067, + "step": 3554 + }, + { + "epoch": 0.3, + "grad_norm": 0.30621287286132703, + "learning_rate": 8.213415344634178e-06, + "loss": 0.0846, + "step": 3555 + }, + { + "epoch": 0.3, + "grad_norm": 0.2740132163539907, + "learning_rate": 8.212369757531453e-06, + "loss": 0.0746, + "step": 3556 + }, + { + "epoch": 0.3, + "grad_norm": 0.3725701253001791, + "learning_rate": 8.211323931152718e-06, + "loss": 0.0817, + "step": 3557 + }, + { + "epoch": 0.3, + "grad_norm": 0.3150927742638896, + "learning_rate": 8.210277865575866e-06, + "loss": 0.0626, + "step": 3558 + }, + { + "epoch": 0.3, + "grad_norm": 0.25094659183947016, + "learning_rate": 8.209231560878818e-06, + "loss": 0.0777, + "step": 3559 + }, + { + "epoch": 0.3, + "grad_norm": 0.43274462771994365, + "learning_rate": 8.208185017139508e-06, + "loss": 0.1107, + "step": 3560 + }, + { + "epoch": 0.3, + "grad_norm": 0.3118682713527896, + "learning_rate": 8.20713823443589e-06, + "loss": 0.0652, + "step": 3561 + }, + { + "epoch": 0.3, + "grad_norm": 0.21452713826020006, + "learning_rate": 8.206091212845932e-06, + "loss": 0.047, + "step": 3562 + }, + { + "epoch": 0.3, + "grad_norm": 0.19473713711449442, + "learning_rate": 8.205043952447622e-06, + "loss": 0.0601, + "step": 3563 + }, + { + "epoch": 0.3, + "grad_norm": 0.5717441589621339, + "learning_rate": 8.203996453318971e-06, + "loss": 0.0765, + "step": 3564 + }, + { + "epoch": 0.3, + "grad_norm": 0.23739584595384405, + "learning_rate": 8.202948715537998e-06, + "loss": 0.0574, + "step": 3565 + }, + { + "epoch": 0.3, + "grad_norm": 0.33774805467086577, + "learning_rate": 8.201900739182746e-06, + "loss": 0.0985, + "step": 3566 + }, + { + "epoch": 0.3, + "grad_norm": 0.23469365327432187, + "learning_rate": 8.200852524331275e-06, + "loss": 0.0811, + "step": 3567 + }, + { + "epoch": 0.3, + "grad_norm": 0.704950679816774, + "learning_rate": 8.199804071061661e-06, + "loss": 0.0822, + "step": 3568 + }, + { + "epoch": 0.3, + "grad_norm": 0.2774618020586867, + "learning_rate": 8.198755379452e-06, + "loss": 0.0867, + "step": 3569 + }, + { + "epoch": 0.3, + "grad_norm": 0.3011726794268524, + "learning_rate": 8.197706449580405e-06, + "loss": 0.1231, + "step": 3570 + }, + { + "epoch": 0.3, + "grad_norm": 0.26096145580664165, + "learning_rate": 8.196657281525005e-06, + "loss": 0.0468, + "step": 3571 + }, + { + "epoch": 0.3, + "grad_norm": 0.78419528255088, + "learning_rate": 8.195607875363949e-06, + "loss": 0.0874, + "step": 3572 + }, + { + "epoch": 0.3, + "grad_norm": 0.29915199365948836, + "learning_rate": 8.194558231175401e-06, + "loss": 0.096, + "step": 3573 + }, + { + "epoch": 0.3, + "grad_norm": 0.19783771611164497, + "learning_rate": 8.193508349037548e-06, + "loss": 0.0611, + "step": 3574 + }, + { + "epoch": 0.3, + "grad_norm": 0.21091219727055546, + "learning_rate": 8.19245822902859e-06, + "loss": 0.0659, + "step": 3575 + }, + { + "epoch": 0.3, + "grad_norm": 0.22506974299469842, + "learning_rate": 8.191407871226744e-06, + "loss": 0.0512, + "step": 3576 + }, + { + "epoch": 0.3, + "grad_norm": 0.21873567408392205, + "learning_rate": 8.190357275710249e-06, + "loss": 0.048, + "step": 3577 + }, + { + "epoch": 0.3, + "grad_norm": 0.41682267051210686, + "learning_rate": 8.189306442557359e-06, + "loss": 0.0959, + "step": 3578 + }, + { + "epoch": 0.3, + "grad_norm": 0.391513123048695, + "learning_rate": 8.188255371846347e-06, + "loss": 0.0933, + "step": 3579 + }, + { + "epoch": 0.3, + "grad_norm": 0.3307180489648822, + "learning_rate": 8.187204063655501e-06, + "loss": 0.1214, + "step": 3580 + }, + { + "epoch": 0.3, + "grad_norm": 0.3335712656891786, + "learning_rate": 8.186152518063128e-06, + "loss": 0.0626, + "step": 3581 + }, + { + "epoch": 0.3, + "grad_norm": 0.2570347551445168, + "learning_rate": 8.185100735147558e-06, + "loss": 0.0808, + "step": 3582 + }, + { + "epoch": 0.3, + "grad_norm": 0.27483236727369664, + "learning_rate": 8.184048714987129e-06, + "loss": 0.0829, + "step": 3583 + }, + { + "epoch": 0.3, + "grad_norm": 0.3363792797239031, + "learning_rate": 8.182996457660202e-06, + "loss": 0.0486, + "step": 3584 + }, + { + "epoch": 0.3, + "grad_norm": 0.3243358621086891, + "learning_rate": 8.181943963245157e-06, + "loss": 0.0753, + "step": 3585 + }, + { + "epoch": 0.3, + "grad_norm": 0.20914061746171497, + "learning_rate": 8.180891231820392e-06, + "loss": 0.0506, + "step": 3586 + }, + { + "epoch": 0.3, + "grad_norm": 0.23723633120098409, + "learning_rate": 8.179838263464316e-06, + "loss": 0.0882, + "step": 3587 + }, + { + "epoch": 0.3, + "grad_norm": 0.37242300880644585, + "learning_rate": 8.178785058255363e-06, + "loss": 0.0885, + "step": 3588 + }, + { + "epoch": 0.3, + "grad_norm": 0.5209478834628155, + "learning_rate": 8.17773161627198e-06, + "loss": 0.1358, + "step": 3589 + }, + { + "epoch": 0.3, + "grad_norm": 0.4079965886472989, + "learning_rate": 8.176677937592634e-06, + "loss": 0.0767, + "step": 3590 + }, + { + "epoch": 0.3, + "grad_norm": 0.20742024159347838, + "learning_rate": 8.175624022295812e-06, + "loss": 0.0457, + "step": 3591 + }, + { + "epoch": 0.3, + "grad_norm": 0.393137049734039, + "learning_rate": 8.174569870460012e-06, + "loss": 0.0696, + "step": 3592 + }, + { + "epoch": 0.3, + "grad_norm": 0.47523286203037013, + "learning_rate": 8.173515482163756e-06, + "loss": 0.1363, + "step": 3593 + }, + { + "epoch": 0.3, + "grad_norm": 0.26323421201168046, + "learning_rate": 8.172460857485578e-06, + "loss": 0.0739, + "step": 3594 + }, + { + "epoch": 0.3, + "grad_norm": 0.6857273593503733, + "learning_rate": 8.171405996504033e-06, + "loss": 0.1454, + "step": 3595 + }, + { + "epoch": 0.3, + "grad_norm": 0.3201587353551199, + "learning_rate": 8.1703508992977e-06, + "loss": 0.0606, + "step": 3596 + }, + { + "epoch": 0.3, + "grad_norm": 0.3144356367893049, + "learning_rate": 8.169295565945158e-06, + "loss": 0.0935, + "step": 3597 + }, + { + "epoch": 0.3, + "grad_norm": 0.4174814452084571, + "learning_rate": 8.168239996525022e-06, + "loss": 0.1347, + "step": 3598 + }, + { + "epoch": 0.3, + "grad_norm": 0.3661488273990816, + "learning_rate": 8.167184191115915e-06, + "loss": 0.0919, + "step": 3599 + }, + { + "epoch": 0.3, + "grad_norm": 0.38804316550183937, + "learning_rate": 8.16612814979648e-06, + "loss": 0.0923, + "step": 3600 + }, + { + "epoch": 0.3, + "grad_norm": 0.3199892317272469, + "learning_rate": 8.165071872645373e-06, + "loss": 0.0927, + "step": 3601 + }, + { + "epoch": 0.3, + "grad_norm": 0.2893005194217812, + "learning_rate": 8.164015359741277e-06, + "loss": 0.0981, + "step": 3602 + }, + { + "epoch": 0.3, + "grad_norm": 0.3129566487938126, + "learning_rate": 8.162958611162886e-06, + "loss": 0.0413, + "step": 3603 + }, + { + "epoch": 0.3, + "grad_norm": 0.2835923847409358, + "learning_rate": 8.161901626988912e-06, + "loss": 0.0681, + "step": 3604 + }, + { + "epoch": 0.3, + "grad_norm": 0.3643393212368736, + "learning_rate": 8.160844407298085e-06, + "loss": 0.0784, + "step": 3605 + }, + { + "epoch": 0.3, + "grad_norm": 0.2981156765135449, + "learning_rate": 8.159786952169153e-06, + "loss": 0.086, + "step": 3606 + }, + { + "epoch": 0.3, + "grad_norm": 0.37979050067759085, + "learning_rate": 8.158729261680881e-06, + "loss": 0.0879, + "step": 3607 + }, + { + "epoch": 0.3, + "grad_norm": 0.19201077150890355, + "learning_rate": 8.157671335912054e-06, + "loss": 0.0425, + "step": 3608 + }, + { + "epoch": 0.3, + "grad_norm": 0.5772767501716746, + "learning_rate": 8.15661317494147e-06, + "loss": 0.1064, + "step": 3609 + }, + { + "epoch": 0.3, + "grad_norm": 0.6341318181437655, + "learning_rate": 8.15555477884795e-06, + "loss": 0.076, + "step": 3610 + }, + { + "epoch": 0.3, + "grad_norm": 0.47905500146513535, + "learning_rate": 8.154496147710325e-06, + "loss": 0.1207, + "step": 3611 + }, + { + "epoch": 0.3, + "grad_norm": 0.4694424233853518, + "learning_rate": 8.153437281607452e-06, + "loss": 0.1391, + "step": 3612 + }, + { + "epoch": 0.3, + "grad_norm": 0.28938582919678285, + "learning_rate": 8.1523781806182e-06, + "loss": 0.06, + "step": 3613 + }, + { + "epoch": 0.3, + "grad_norm": 0.4116984259175424, + "learning_rate": 8.151318844821456e-06, + "loss": 0.0856, + "step": 3614 + }, + { + "epoch": 0.3, + "grad_norm": 0.23732564409107743, + "learning_rate": 8.150259274296128e-06, + "loss": 0.0777, + "step": 3615 + }, + { + "epoch": 0.3, + "grad_norm": 0.4813275978794444, + "learning_rate": 8.149199469121138e-06, + "loss": 0.0826, + "step": 3616 + }, + { + "epoch": 0.3, + "grad_norm": 0.4143177241081707, + "learning_rate": 8.148139429375425e-06, + "loss": 0.0945, + "step": 3617 + }, + { + "epoch": 0.3, + "grad_norm": 0.3283683014729715, + "learning_rate": 8.14707915513795e-06, + "loss": 0.0573, + "step": 3618 + }, + { + "epoch": 0.3, + "grad_norm": 0.22316466148213573, + "learning_rate": 8.146018646487684e-06, + "loss": 0.0755, + "step": 3619 + }, + { + "epoch": 0.31, + "grad_norm": 0.3678202858659283, + "learning_rate": 8.144957903503622e-06, + "loss": 0.0887, + "step": 3620 + }, + { + "epoch": 0.31, + "grad_norm": 0.1949478289167243, + "learning_rate": 8.143896926264778e-06, + "loss": 0.0538, + "step": 3621 + }, + { + "epoch": 0.31, + "grad_norm": 0.2561366638630743, + "learning_rate": 8.142835714850173e-06, + "loss": 0.0721, + "step": 3622 + }, + { + "epoch": 0.31, + "grad_norm": 0.3465086359544568, + "learning_rate": 8.141774269338858e-06, + "loss": 0.1099, + "step": 3623 + }, + { + "epoch": 0.31, + "grad_norm": 0.3747047550201552, + "learning_rate": 8.140712589809891e-06, + "loss": 0.0824, + "step": 3624 + }, + { + "epoch": 0.31, + "grad_norm": 0.2795556229996969, + "learning_rate": 8.139650676342357e-06, + "loss": 0.0865, + "step": 3625 + }, + { + "epoch": 0.31, + "grad_norm": 0.3336606853708106, + "learning_rate": 8.138588529015348e-06, + "loss": 0.0932, + "step": 3626 + }, + { + "epoch": 0.31, + "grad_norm": 0.24890182717019194, + "learning_rate": 8.137526147907984e-06, + "loss": 0.0474, + "step": 3627 + }, + { + "epoch": 0.31, + "grad_norm": 0.5590284713264075, + "learning_rate": 8.136463533099392e-06, + "loss": 0.1092, + "step": 3628 + }, + { + "epoch": 0.31, + "grad_norm": 0.283909236953108, + "learning_rate": 8.135400684668727e-06, + "loss": 0.078, + "step": 3629 + }, + { + "epoch": 0.31, + "grad_norm": 0.22304138643966928, + "learning_rate": 8.134337602695154e-06, + "loss": 0.0485, + "step": 3630 + }, + { + "epoch": 0.31, + "grad_norm": 0.5463154782975195, + "learning_rate": 8.133274287257857e-06, + "loss": 0.0474, + "step": 3631 + }, + { + "epoch": 0.31, + "grad_norm": 0.33949702524007497, + "learning_rate": 8.13221073843604e-06, + "loss": 0.0753, + "step": 3632 + }, + { + "epoch": 0.31, + "grad_norm": 0.2104648296125629, + "learning_rate": 8.131146956308917e-06, + "loss": 0.0494, + "step": 3633 + }, + { + "epoch": 0.31, + "grad_norm": 0.34143602421276253, + "learning_rate": 8.130082940955733e-06, + "loss": 0.0876, + "step": 3634 + }, + { + "epoch": 0.31, + "grad_norm": 0.2534738497041492, + "learning_rate": 8.129018692455735e-06, + "loss": 0.0637, + "step": 3635 + }, + { + "epoch": 0.31, + "grad_norm": 0.3284661252711693, + "learning_rate": 8.127954210888197e-06, + "loss": 0.1019, + "step": 3636 + }, + { + "epoch": 0.31, + "grad_norm": 0.26153305852086406, + "learning_rate": 8.126889496332408e-06, + "loss": 0.0668, + "step": 3637 + }, + { + "epoch": 0.31, + "grad_norm": 0.48344683443548797, + "learning_rate": 8.125824548867673e-06, + "loss": 0.0757, + "step": 3638 + }, + { + "epoch": 0.31, + "grad_norm": 0.3629083209154181, + "learning_rate": 8.124759368573316e-06, + "loss": 0.1273, + "step": 3639 + }, + { + "epoch": 0.31, + "grad_norm": 0.27587691975975437, + "learning_rate": 8.12369395552868e-06, + "loss": 0.0627, + "step": 3640 + }, + { + "epoch": 0.31, + "grad_norm": 0.2409562128772796, + "learning_rate": 8.122628309813121e-06, + "loss": 0.0748, + "step": 3641 + }, + { + "epoch": 0.31, + "grad_norm": 0.27847207017261966, + "learning_rate": 8.121562431506013e-06, + "loss": 0.0678, + "step": 3642 + }, + { + "epoch": 0.31, + "grad_norm": 0.24451562099210608, + "learning_rate": 8.120496320686752e-06, + "loss": 0.0614, + "step": 3643 + }, + { + "epoch": 0.31, + "grad_norm": 0.31305820048668476, + "learning_rate": 8.119429977434747e-06, + "loss": 0.1057, + "step": 3644 + }, + { + "epoch": 0.31, + "grad_norm": 0.30697942267223816, + "learning_rate": 8.118363401829425e-06, + "loss": 0.0781, + "step": 3645 + }, + { + "epoch": 0.31, + "grad_norm": 0.20905448528153592, + "learning_rate": 8.117296593950231e-06, + "loss": 0.047, + "step": 3646 + }, + { + "epoch": 0.31, + "grad_norm": 0.3511743706260753, + "learning_rate": 8.116229553876628e-06, + "loss": 0.0664, + "step": 3647 + }, + { + "epoch": 0.31, + "grad_norm": 0.3776031318940993, + "learning_rate": 8.115162281688092e-06, + "loss": 0.1237, + "step": 3648 + }, + { + "epoch": 0.31, + "grad_norm": 0.5453083053449695, + "learning_rate": 8.114094777464125e-06, + "loss": 0.1156, + "step": 3649 + }, + { + "epoch": 0.31, + "grad_norm": 0.24765382057948584, + "learning_rate": 8.113027041284236e-06, + "loss": 0.0866, + "step": 3650 + }, + { + "epoch": 0.31, + "grad_norm": 0.27328041672248937, + "learning_rate": 8.111959073227959e-06, + "loss": 0.0499, + "step": 3651 + }, + { + "epoch": 0.31, + "grad_norm": 0.30793815360314775, + "learning_rate": 8.110890873374842e-06, + "loss": 0.0507, + "step": 3652 + }, + { + "epoch": 0.31, + "grad_norm": 0.24849401989727152, + "learning_rate": 8.10982244180445e-06, + "loss": 0.1009, + "step": 3653 + }, + { + "epoch": 0.31, + "grad_norm": 0.30482750278982224, + "learning_rate": 8.108753778596367e-06, + "loss": 0.0795, + "step": 3654 + }, + { + "epoch": 0.31, + "grad_norm": 0.17556756459036374, + "learning_rate": 8.107684883830193e-06, + "loss": 0.0798, + "step": 3655 + }, + { + "epoch": 0.31, + "grad_norm": 0.18133493124607133, + "learning_rate": 8.106615757585544e-06, + "loss": 0.058, + "step": 3656 + }, + { + "epoch": 0.31, + "grad_norm": 0.45069641238224784, + "learning_rate": 8.105546399942057e-06, + "loss": 0.0814, + "step": 3657 + }, + { + "epoch": 0.31, + "grad_norm": 0.22320412028183, + "learning_rate": 8.104476810979382e-06, + "loss": 0.069, + "step": 3658 + }, + { + "epoch": 0.31, + "grad_norm": 0.20819664079902336, + "learning_rate": 8.103406990777189e-06, + "loss": 0.0455, + "step": 3659 + }, + { + "epoch": 0.31, + "grad_norm": 0.48140303058035505, + "learning_rate": 8.102336939415167e-06, + "loss": 0.0897, + "step": 3660 + }, + { + "epoch": 0.31, + "grad_norm": 0.4209688726266939, + "learning_rate": 8.101266656973015e-06, + "loss": 0.1058, + "step": 3661 + }, + { + "epoch": 0.31, + "grad_norm": 1.3292900339491034, + "learning_rate": 8.100196143530456e-06, + "loss": 0.0818, + "step": 3662 + }, + { + "epoch": 0.31, + "grad_norm": 0.3747237462560882, + "learning_rate": 8.099125399167231e-06, + "loss": 0.1113, + "step": 3663 + }, + { + "epoch": 0.31, + "grad_norm": 0.4722430355332535, + "learning_rate": 8.098054423963089e-06, + "loss": 0.1183, + "step": 3664 + }, + { + "epoch": 0.31, + "grad_norm": 0.20359783999502326, + "learning_rate": 8.096983217997808e-06, + "loss": 0.0336, + "step": 3665 + }, + { + "epoch": 0.31, + "grad_norm": 0.40177786233970975, + "learning_rate": 8.095911781351175e-06, + "loss": 0.084, + "step": 3666 + }, + { + "epoch": 0.31, + "grad_norm": 0.2807103633055263, + "learning_rate": 8.094840114102999e-06, + "loss": 0.0695, + "step": 3667 + }, + { + "epoch": 0.31, + "grad_norm": 0.6461125023952256, + "learning_rate": 8.093768216333102e-06, + "loss": 0.1055, + "step": 3668 + }, + { + "epoch": 0.31, + "grad_norm": 0.38947143382002863, + "learning_rate": 8.092696088121324e-06, + "loss": 0.0941, + "step": 3669 + }, + { + "epoch": 0.31, + "grad_norm": 0.3333353050447398, + "learning_rate": 8.091623729547526e-06, + "loss": 0.0494, + "step": 3670 + }, + { + "epoch": 0.31, + "grad_norm": 0.2086566818308642, + "learning_rate": 8.090551140691582e-06, + "loss": 0.0512, + "step": 3671 + }, + { + "epoch": 0.31, + "grad_norm": 0.4242079701764188, + "learning_rate": 8.089478321633386e-06, + "loss": 0.1267, + "step": 3672 + }, + { + "epoch": 0.31, + "grad_norm": 0.5420750095291259, + "learning_rate": 8.088405272452848e-06, + "loss": 0.1095, + "step": 3673 + }, + { + "epoch": 0.31, + "grad_norm": 0.4147364802054457, + "learning_rate": 8.087331993229895e-06, + "loss": 0.0875, + "step": 3674 + }, + { + "epoch": 0.31, + "grad_norm": 0.39723798537863864, + "learning_rate": 8.086258484044466e-06, + "loss": 0.1235, + "step": 3675 + }, + { + "epoch": 0.31, + "grad_norm": 0.40117143883772277, + "learning_rate": 8.08518474497653e-06, + "loss": 0.0873, + "step": 3676 + }, + { + "epoch": 0.31, + "grad_norm": 0.3959386515439047, + "learning_rate": 8.08411077610606e-06, + "loss": 0.063, + "step": 3677 + }, + { + "epoch": 0.31, + "grad_norm": 0.27169115320492493, + "learning_rate": 8.083036577513052e-06, + "loss": 0.0541, + "step": 3678 + }, + { + "epoch": 0.31, + "grad_norm": 0.2926125995064389, + "learning_rate": 8.081962149277523e-06, + "loss": 0.0968, + "step": 3679 + }, + { + "epoch": 0.31, + "grad_norm": 0.43539065859844706, + "learning_rate": 8.080887491479497e-06, + "loss": 0.0914, + "step": 3680 + }, + { + "epoch": 0.31, + "grad_norm": 0.31477202594534875, + "learning_rate": 8.079812604199023e-06, + "loss": 0.0933, + "step": 3681 + }, + { + "epoch": 0.31, + "grad_norm": 1.2560898443670172, + "learning_rate": 8.078737487516164e-06, + "loss": 0.1258, + "step": 3682 + }, + { + "epoch": 0.31, + "grad_norm": 0.295743862308518, + "learning_rate": 8.077662141511004e-06, + "loss": 0.0515, + "step": 3683 + }, + { + "epoch": 0.31, + "grad_norm": 0.48089399395202886, + "learning_rate": 8.07658656626364e-06, + "loss": 0.1035, + "step": 3684 + }, + { + "epoch": 0.31, + "grad_norm": 0.40529573665224317, + "learning_rate": 8.075510761854184e-06, + "loss": 0.0883, + "step": 3685 + }, + { + "epoch": 0.31, + "grad_norm": 0.24327707616647226, + "learning_rate": 8.07443472836277e-06, + "loss": 0.0627, + "step": 3686 + }, + { + "epoch": 0.31, + "grad_norm": 0.3328996560533555, + "learning_rate": 8.073358465869549e-06, + "loss": 0.1126, + "step": 3687 + }, + { + "epoch": 0.31, + "grad_norm": 0.6477096064387154, + "learning_rate": 8.072281974454685e-06, + "loss": 0.1033, + "step": 3688 + }, + { + "epoch": 0.31, + "grad_norm": 0.35847201575424137, + "learning_rate": 8.071205254198362e-06, + "loss": 0.0729, + "step": 3689 + }, + { + "epoch": 0.31, + "grad_norm": 0.26547948699896057, + "learning_rate": 8.070128305180783e-06, + "loss": 0.0845, + "step": 3690 + }, + { + "epoch": 0.31, + "grad_norm": 0.35094222777145845, + "learning_rate": 8.06905112748216e-06, + "loss": 0.0799, + "step": 3691 + }, + { + "epoch": 0.31, + "grad_norm": 0.6214925157759148, + "learning_rate": 8.06797372118273e-06, + "loss": 0.0904, + "step": 3692 + }, + { + "epoch": 0.31, + "grad_norm": 0.2412061053804197, + "learning_rate": 8.066896086362747e-06, + "loss": 0.0483, + "step": 3693 + }, + { + "epoch": 0.31, + "grad_norm": 0.5907969227458691, + "learning_rate": 8.065818223102477e-06, + "loss": 0.115, + "step": 3694 + }, + { + "epoch": 0.31, + "grad_norm": 0.26826139996480475, + "learning_rate": 8.064740131482207e-06, + "loss": 0.0751, + "step": 3695 + }, + { + "epoch": 0.31, + "grad_norm": 0.4743967680165217, + "learning_rate": 8.063661811582237e-06, + "loss": 0.1316, + "step": 3696 + }, + { + "epoch": 0.31, + "grad_norm": 0.493772496394193, + "learning_rate": 8.062583263482888e-06, + "loss": 0.0711, + "step": 3697 + }, + { + "epoch": 0.31, + "grad_norm": 0.2779520010329725, + "learning_rate": 8.061504487264498e-06, + "loss": 0.0748, + "step": 3698 + }, + { + "epoch": 0.31, + "grad_norm": 0.3040146023723784, + "learning_rate": 8.060425483007418e-06, + "loss": 0.0361, + "step": 3699 + }, + { + "epoch": 0.31, + "grad_norm": 0.318086531917773, + "learning_rate": 8.05934625079202e-06, + "loss": 0.0767, + "step": 3700 + }, + { + "epoch": 0.31, + "grad_norm": 0.28112501659550976, + "learning_rate": 8.058266790698693e-06, + "loss": 0.0712, + "step": 3701 + }, + { + "epoch": 0.31, + "grad_norm": 0.4194752463188414, + "learning_rate": 8.057187102807837e-06, + "loss": 0.0841, + "step": 3702 + }, + { + "epoch": 0.31, + "grad_norm": 0.2745856466850972, + "learning_rate": 8.056107187199877e-06, + "loss": 0.0683, + "step": 3703 + }, + { + "epoch": 0.31, + "grad_norm": 0.2939525718786237, + "learning_rate": 8.05502704395525e-06, + "loss": 0.0899, + "step": 3704 + }, + { + "epoch": 0.31, + "grad_norm": 0.24799364574236718, + "learning_rate": 8.053946673154414e-06, + "loss": 0.0711, + "step": 3705 + }, + { + "epoch": 0.31, + "grad_norm": 0.354084552460418, + "learning_rate": 8.052866074877839e-06, + "loss": 0.11, + "step": 3706 + }, + { + "epoch": 0.31, + "grad_norm": 0.4658330039787627, + "learning_rate": 8.051785249206013e-06, + "loss": 0.059, + "step": 3707 + }, + { + "epoch": 0.31, + "grad_norm": 0.37496933240899205, + "learning_rate": 8.050704196219443e-06, + "loss": 0.0843, + "step": 3708 + }, + { + "epoch": 0.31, + "grad_norm": 0.291499464635093, + "learning_rate": 8.049622915998655e-06, + "loss": 0.0558, + "step": 3709 + }, + { + "epoch": 0.31, + "grad_norm": 0.33919935477158353, + "learning_rate": 8.048541408624186e-06, + "loss": 0.0856, + "step": 3710 + }, + { + "epoch": 0.31, + "grad_norm": 0.21457262177588945, + "learning_rate": 8.047459674176592e-06, + "loss": 0.0767, + "step": 3711 + }, + { + "epoch": 0.31, + "grad_norm": 0.22744239761159804, + "learning_rate": 8.046377712736453e-06, + "loss": 0.0838, + "step": 3712 + }, + { + "epoch": 0.31, + "grad_norm": 0.3779288965197235, + "learning_rate": 8.04529552438435e-06, + "loss": 0.1181, + "step": 3713 + }, + { + "epoch": 0.31, + "grad_norm": 0.33942952897197093, + "learning_rate": 8.044213109200901e-06, + "loss": 0.0582, + "step": 3714 + }, + { + "epoch": 0.31, + "grad_norm": 0.37548894951209105, + "learning_rate": 8.043130467266725e-06, + "loss": 0.0824, + "step": 3715 + }, + { + "epoch": 0.31, + "grad_norm": 0.2819441628175042, + "learning_rate": 8.04204759866246e-06, + "loss": 0.0805, + "step": 3716 + }, + { + "epoch": 0.31, + "grad_norm": 0.3187724688118607, + "learning_rate": 8.040964503468774e-06, + "loss": 0.1112, + "step": 3717 + }, + { + "epoch": 0.31, + "grad_norm": 0.4284914009235011, + "learning_rate": 8.039881181766334e-06, + "loss": 0.067, + "step": 3718 + }, + { + "epoch": 0.31, + "grad_norm": 0.5179409049099479, + "learning_rate": 8.038797633635836e-06, + "loss": 0.0994, + "step": 3719 + }, + { + "epoch": 0.31, + "grad_norm": 0.3073693328207546, + "learning_rate": 8.037713859157987e-06, + "loss": 0.0832, + "step": 3720 + }, + { + "epoch": 0.31, + "grad_norm": 0.4280695536533562, + "learning_rate": 8.036629858413514e-06, + "loss": 0.1194, + "step": 3721 + }, + { + "epoch": 0.31, + "grad_norm": 0.32675315090568596, + "learning_rate": 8.03554563148316e-06, + "loss": 0.1209, + "step": 3722 + }, + { + "epoch": 0.31, + "grad_norm": 0.31071896612432215, + "learning_rate": 8.034461178447683e-06, + "loss": 0.0868, + "step": 3723 + }, + { + "epoch": 0.31, + "grad_norm": 0.40817108240019284, + "learning_rate": 8.033376499387862e-06, + "loss": 0.1357, + "step": 3724 + }, + { + "epoch": 0.31, + "grad_norm": 0.6003230305454093, + "learning_rate": 8.032291594384488e-06, + "loss": 0.1261, + "step": 3725 + }, + { + "epoch": 0.31, + "grad_norm": 0.25260403102843826, + "learning_rate": 8.03120646351837e-06, + "loss": 0.0937, + "step": 3726 + }, + { + "epoch": 0.31, + "grad_norm": 0.249582174415589, + "learning_rate": 8.030121106870339e-06, + "loss": 0.0872, + "step": 3727 + }, + { + "epoch": 0.31, + "grad_norm": 0.26247104330602844, + "learning_rate": 8.029035524521234e-06, + "loss": 0.0811, + "step": 3728 + }, + { + "epoch": 0.31, + "grad_norm": 0.43865107517533963, + "learning_rate": 8.02794971655192e-06, + "loss": 0.1098, + "step": 3729 + }, + { + "epoch": 0.31, + "grad_norm": 0.2891859854900654, + "learning_rate": 8.02686368304327e-06, + "loss": 0.0842, + "step": 3730 + }, + { + "epoch": 0.31, + "grad_norm": 0.4965339159025126, + "learning_rate": 8.02577742407618e-06, + "loss": 0.1356, + "step": 3731 + }, + { + "epoch": 0.31, + "grad_norm": 0.29430826029829954, + "learning_rate": 8.024690939731563e-06, + "loss": 0.0832, + "step": 3732 + }, + { + "epoch": 0.31, + "grad_norm": 0.16406307966210348, + "learning_rate": 8.023604230090342e-06, + "loss": 0.0339, + "step": 3733 + }, + { + "epoch": 0.31, + "grad_norm": 0.40968631719028714, + "learning_rate": 8.022517295233467e-06, + "loss": 0.0925, + "step": 3734 + }, + { + "epoch": 0.31, + "grad_norm": 0.33663372437045724, + "learning_rate": 8.021430135241895e-06, + "loss": 0.0763, + "step": 3735 + }, + { + "epoch": 0.31, + "grad_norm": 0.32420294957211787, + "learning_rate": 8.020342750196605e-06, + "loss": 0.0731, + "step": 3736 + }, + { + "epoch": 0.31, + "grad_norm": 0.6422374175470135, + "learning_rate": 8.019255140178593e-06, + "loss": 0.107, + "step": 3737 + }, + { + "epoch": 0.31, + "grad_norm": 0.41598252984729867, + "learning_rate": 8.018167305268869e-06, + "loss": 0.0766, + "step": 3738 + }, + { + "epoch": 0.32, + "grad_norm": 0.32295314016272697, + "learning_rate": 8.017079245548462e-06, + "loss": 0.0703, + "step": 3739 + }, + { + "epoch": 0.32, + "grad_norm": 0.26044425627338225, + "learning_rate": 8.015990961098419e-06, + "loss": 0.0538, + "step": 3740 + }, + { + "epoch": 0.32, + "grad_norm": 0.4061907312453345, + "learning_rate": 8.014902451999797e-06, + "loss": 0.1057, + "step": 3741 + }, + { + "epoch": 0.32, + "grad_norm": 0.6102667859978426, + "learning_rate": 8.013813718333679e-06, + "loss": 0.1603, + "step": 3742 + }, + { + "epoch": 0.32, + "grad_norm": 0.46677668651824966, + "learning_rate": 8.012724760181158e-06, + "loss": 0.14, + "step": 3743 + }, + { + "epoch": 0.32, + "grad_norm": 0.5604786397016787, + "learning_rate": 8.011635577623346e-06, + "loss": 0.1216, + "step": 3744 + }, + { + "epoch": 0.32, + "grad_norm": 0.2799842894426531, + "learning_rate": 8.010546170741371e-06, + "loss": 0.0589, + "step": 3745 + }, + { + "epoch": 0.32, + "grad_norm": 0.37356984686095, + "learning_rate": 8.009456539616383e-06, + "loss": 0.1057, + "step": 3746 + }, + { + "epoch": 0.32, + "grad_norm": 0.33033500975523866, + "learning_rate": 8.008366684329537e-06, + "loss": 0.0874, + "step": 3747 + }, + { + "epoch": 0.32, + "grad_norm": 0.23759675904431252, + "learning_rate": 8.007276604962018e-06, + "loss": 0.0525, + "step": 3748 + }, + { + "epoch": 0.32, + "grad_norm": 0.28392591091699076, + "learning_rate": 8.006186301595015e-06, + "loss": 0.0888, + "step": 3749 + }, + { + "epoch": 0.32, + "grad_norm": 0.4213891765355426, + "learning_rate": 8.005095774309748e-06, + "loss": 0.0911, + "step": 3750 + }, + { + "epoch": 0.32, + "grad_norm": 0.44643507668545834, + "learning_rate": 8.00400502318744e-06, + "loss": 0.0898, + "step": 3751 + }, + { + "epoch": 0.32, + "grad_norm": 0.38592621563839047, + "learning_rate": 8.002914048309337e-06, + "loss": 0.1081, + "step": 3752 + }, + { + "epoch": 0.32, + "grad_norm": 0.2819134600796379, + "learning_rate": 8.001822849756706e-06, + "loss": 0.092, + "step": 3753 + }, + { + "epoch": 0.32, + "grad_norm": 0.2737352135456345, + "learning_rate": 8.00073142761082e-06, + "loss": 0.0697, + "step": 3754 + }, + { + "epoch": 0.32, + "grad_norm": 0.2904980900951311, + "learning_rate": 7.999639781952977e-06, + "loss": 0.1116, + "step": 3755 + }, + { + "epoch": 0.32, + "grad_norm": 0.3701877233316966, + "learning_rate": 7.998547912864487e-06, + "loss": 0.1094, + "step": 3756 + }, + { + "epoch": 0.32, + "grad_norm": 0.3663783360100223, + "learning_rate": 7.997455820426683e-06, + "loss": 0.0544, + "step": 3757 + }, + { + "epoch": 0.32, + "grad_norm": 0.3242944619684403, + "learning_rate": 7.996363504720908e-06, + "loss": 0.065, + "step": 3758 + }, + { + "epoch": 0.32, + "grad_norm": 0.34193773516066184, + "learning_rate": 7.995270965828523e-06, + "loss": 0.0937, + "step": 3759 + }, + { + "epoch": 0.32, + "grad_norm": 0.30047277951887874, + "learning_rate": 7.994178203830908e-06, + "loss": 0.0931, + "step": 3760 + }, + { + "epoch": 0.32, + "grad_norm": 0.2954148708980294, + "learning_rate": 7.993085218809458e-06, + "loss": 0.0752, + "step": 3761 + }, + { + "epoch": 0.32, + "grad_norm": 0.43071383283306847, + "learning_rate": 7.991992010845585e-06, + "loss": 0.0976, + "step": 3762 + }, + { + "epoch": 0.32, + "grad_norm": 0.27370587956260023, + "learning_rate": 7.990898580020718e-06, + "loss": 0.0892, + "step": 3763 + }, + { + "epoch": 0.32, + "grad_norm": 0.3618802701649216, + "learning_rate": 7.989804926416301e-06, + "loss": 0.1002, + "step": 3764 + }, + { + "epoch": 0.32, + "grad_norm": 0.33916408424279826, + "learning_rate": 7.988711050113795e-06, + "loss": 0.0723, + "step": 3765 + }, + { + "epoch": 0.32, + "grad_norm": 0.32785999220023543, + "learning_rate": 7.987616951194681e-06, + "loss": 0.0826, + "step": 3766 + }, + { + "epoch": 0.32, + "grad_norm": 0.2811981708809026, + "learning_rate": 7.986522629740452e-06, + "loss": 0.0784, + "step": 3767 + }, + { + "epoch": 0.32, + "grad_norm": 0.2751896598181098, + "learning_rate": 7.98542808583262e-06, + "loss": 0.0913, + "step": 3768 + }, + { + "epoch": 0.32, + "grad_norm": 0.23900676243501434, + "learning_rate": 7.984333319552711e-06, + "loss": 0.0588, + "step": 3769 + }, + { + "epoch": 0.32, + "grad_norm": 0.3532744854124887, + "learning_rate": 7.983238330982274e-06, + "loss": 0.0991, + "step": 3770 + }, + { + "epoch": 0.32, + "grad_norm": 0.3650630600543345, + "learning_rate": 7.982143120202866e-06, + "loss": 0.0671, + "step": 3771 + }, + { + "epoch": 0.32, + "grad_norm": 0.29389219374624903, + "learning_rate": 7.981047687296066e-06, + "loss": 0.0588, + "step": 3772 + }, + { + "epoch": 0.32, + "grad_norm": 0.22012455743083292, + "learning_rate": 7.979952032343467e-06, + "loss": 0.0593, + "step": 3773 + }, + { + "epoch": 0.32, + "grad_norm": 0.2491490718056531, + "learning_rate": 7.978856155426684e-06, + "loss": 0.0586, + "step": 3774 + }, + { + "epoch": 0.32, + "grad_norm": 0.7935956973755018, + "learning_rate": 7.97776005662734e-06, + "loss": 0.1274, + "step": 3775 + }, + { + "epoch": 0.32, + "grad_norm": 0.4266731885008378, + "learning_rate": 7.976663736027082e-06, + "loss": 0.0967, + "step": 3776 + }, + { + "epoch": 0.32, + "grad_norm": 0.35885999500722937, + "learning_rate": 7.975567193707565e-06, + "loss": 0.0793, + "step": 3777 + }, + { + "epoch": 0.32, + "grad_norm": 0.33575430918281457, + "learning_rate": 7.974470429750473e-06, + "loss": 0.0761, + "step": 3778 + }, + { + "epoch": 0.32, + "grad_norm": 0.27771340377330506, + "learning_rate": 7.973373444237493e-06, + "loss": 0.0862, + "step": 3779 + }, + { + "epoch": 0.32, + "grad_norm": 0.30167685911801534, + "learning_rate": 7.97227623725034e-06, + "loss": 0.0695, + "step": 3780 + }, + { + "epoch": 0.32, + "grad_norm": 0.5315730211112671, + "learning_rate": 7.971178808870736e-06, + "loss": 0.0851, + "step": 3781 + }, + { + "epoch": 0.32, + "grad_norm": 0.46850194183641813, + "learning_rate": 7.970081159180427e-06, + "loss": 0.1096, + "step": 3782 + }, + { + "epoch": 0.32, + "grad_norm": 0.3101169668685658, + "learning_rate": 7.968983288261171e-06, + "loss": 0.0978, + "step": 3783 + }, + { + "epoch": 0.32, + "grad_norm": 0.36512434846685166, + "learning_rate": 7.967885196194744e-06, + "loss": 0.0675, + "step": 3784 + }, + { + "epoch": 0.32, + "grad_norm": 0.4697088769772414, + "learning_rate": 7.96678688306294e-06, + "loss": 0.0859, + "step": 3785 + }, + { + "epoch": 0.32, + "grad_norm": 0.85916001771976, + "learning_rate": 7.965688348947565e-06, + "loss": 0.2002, + "step": 3786 + }, + { + "epoch": 0.32, + "grad_norm": 0.42580393030440017, + "learning_rate": 7.964589593930446e-06, + "loss": 0.0867, + "step": 3787 + }, + { + "epoch": 0.32, + "grad_norm": 0.27080932444298184, + "learning_rate": 7.963490618093422e-06, + "loss": 0.0714, + "step": 3788 + }, + { + "epoch": 0.32, + "grad_norm": 0.45194285866952794, + "learning_rate": 7.962391421518354e-06, + "loss": 0.0931, + "step": 3789 + }, + { + "epoch": 0.32, + "grad_norm": 0.7537514573794841, + "learning_rate": 7.961292004287116e-06, + "loss": 0.1332, + "step": 3790 + }, + { + "epoch": 0.32, + "grad_norm": 0.6315922545752019, + "learning_rate": 7.9601923664816e-06, + "loss": 0.1204, + "step": 3791 + }, + { + "epoch": 0.32, + "grad_norm": 0.20256743726844248, + "learning_rate": 7.95909250818371e-06, + "loss": 0.0502, + "step": 3792 + }, + { + "epoch": 0.32, + "grad_norm": 0.3088851271048101, + "learning_rate": 7.957992429475375e-06, + "loss": 0.0736, + "step": 3793 + }, + { + "epoch": 0.32, + "grad_norm": 0.44718490779769315, + "learning_rate": 7.95689213043853e-06, + "loss": 0.0898, + "step": 3794 + }, + { + "epoch": 0.32, + "grad_norm": 0.3308718439606947, + "learning_rate": 7.955791611155132e-06, + "loss": 0.0962, + "step": 3795 + }, + { + "epoch": 0.32, + "grad_norm": 0.5377240640959181, + "learning_rate": 7.954690871707159e-06, + "loss": 0.0767, + "step": 3796 + }, + { + "epoch": 0.32, + "grad_norm": 0.46453737444210064, + "learning_rate": 7.953589912176597e-06, + "loss": 0.1287, + "step": 3797 + }, + { + "epoch": 0.32, + "grad_norm": 0.22360239110456812, + "learning_rate": 7.952488732645452e-06, + "loss": 0.0551, + "step": 3798 + }, + { + "epoch": 0.32, + "grad_norm": 0.4090020753345458, + "learning_rate": 7.951387333195747e-06, + "loss": 0.1242, + "step": 3799 + }, + { + "epoch": 0.32, + "grad_norm": 0.43960818987052247, + "learning_rate": 7.95028571390952e-06, + "loss": 0.133, + "step": 3800 + }, + { + "epoch": 0.32, + "grad_norm": 0.4003211363864356, + "learning_rate": 7.949183874868827e-06, + "loss": 0.0825, + "step": 3801 + }, + { + "epoch": 0.32, + "grad_norm": 0.22981992857338113, + "learning_rate": 7.948081816155737e-06, + "loss": 0.0563, + "step": 3802 + }, + { + "epoch": 0.32, + "grad_norm": 0.5372224592880047, + "learning_rate": 7.946979537852341e-06, + "loss": 0.1108, + "step": 3803 + }, + { + "epoch": 0.32, + "grad_norm": 0.22713161151803155, + "learning_rate": 7.945877040040742e-06, + "loss": 0.0546, + "step": 3804 + }, + { + "epoch": 0.32, + "grad_norm": 0.24759246726908532, + "learning_rate": 7.94477432280306e-06, + "loss": 0.0884, + "step": 3805 + }, + { + "epoch": 0.32, + "grad_norm": 0.34277318370418386, + "learning_rate": 7.94367138622143e-06, + "loss": 0.0397, + "step": 3806 + }, + { + "epoch": 0.32, + "grad_norm": 0.25914238919115246, + "learning_rate": 7.942568230378008e-06, + "loss": 0.0792, + "step": 3807 + }, + { + "epoch": 0.32, + "grad_norm": 0.5739759948778025, + "learning_rate": 7.941464855354963e-06, + "loss": 0.1555, + "step": 3808 + }, + { + "epoch": 0.32, + "grad_norm": 0.3415678364698252, + "learning_rate": 7.940361261234481e-06, + "loss": 0.0905, + "step": 3809 + }, + { + "epoch": 0.32, + "grad_norm": 0.5800951868416174, + "learning_rate": 7.939257448098762e-06, + "loss": 0.0822, + "step": 3810 + }, + { + "epoch": 0.32, + "grad_norm": 0.33258165736331224, + "learning_rate": 7.938153416030025e-06, + "loss": 0.0919, + "step": 3811 + }, + { + "epoch": 0.32, + "grad_norm": 0.513736024388914, + "learning_rate": 7.937049165110509e-06, + "loss": 0.1251, + "step": 3812 + }, + { + "epoch": 0.32, + "grad_norm": 0.4315277977310471, + "learning_rate": 7.935944695422457e-06, + "loss": 0.1006, + "step": 3813 + }, + { + "epoch": 0.32, + "grad_norm": 0.7341287051289989, + "learning_rate": 7.934840007048144e-06, + "loss": 0.1098, + "step": 3814 + }, + { + "epoch": 0.32, + "grad_norm": 0.2788014270167921, + "learning_rate": 7.93373510006985e-06, + "loss": 0.0808, + "step": 3815 + }, + { + "epoch": 0.32, + "grad_norm": 0.22348277967031732, + "learning_rate": 7.932629974569877e-06, + "loss": 0.0647, + "step": 3816 + }, + { + "epoch": 0.32, + "grad_norm": 1.0739938558570539, + "learning_rate": 7.931524630630536e-06, + "loss": 0.0765, + "step": 3817 + }, + { + "epoch": 0.32, + "grad_norm": 0.2860032630299111, + "learning_rate": 7.930419068334166e-06, + "loss": 0.0449, + "step": 3818 + }, + { + "epoch": 0.32, + "grad_norm": 0.32122209962220377, + "learning_rate": 7.929313287763112e-06, + "loss": 0.0429, + "step": 3819 + }, + { + "epoch": 0.32, + "grad_norm": 0.3780656956081498, + "learning_rate": 7.92820728899974e-06, + "loss": 0.1374, + "step": 3820 + }, + { + "epoch": 0.32, + "grad_norm": 0.4086574044775283, + "learning_rate": 7.927101072126431e-06, + "loss": 0.1014, + "step": 3821 + }, + { + "epoch": 0.32, + "grad_norm": 0.35111464101065787, + "learning_rate": 7.925994637225582e-06, + "loss": 0.0931, + "step": 3822 + }, + { + "epoch": 0.32, + "grad_norm": 0.5400846971606367, + "learning_rate": 7.924887984379608e-06, + "loss": 0.1496, + "step": 3823 + }, + { + "epoch": 0.32, + "grad_norm": 0.4326758984865909, + "learning_rate": 7.923781113670937e-06, + "loss": 0.1256, + "step": 3824 + }, + { + "epoch": 0.32, + "grad_norm": 0.2422839778474585, + "learning_rate": 7.922674025182019e-06, + "loss": 0.0645, + "step": 3825 + }, + { + "epoch": 0.32, + "grad_norm": 0.21447927382797952, + "learning_rate": 7.92156671899531e-06, + "loss": 0.046, + "step": 3826 + }, + { + "epoch": 0.32, + "grad_norm": 0.46684582091269483, + "learning_rate": 7.920459195193296e-06, + "loss": 0.1158, + "step": 3827 + }, + { + "epoch": 0.32, + "grad_norm": 0.30989675276313333, + "learning_rate": 7.919351453858466e-06, + "loss": 0.0845, + "step": 3828 + }, + { + "epoch": 0.32, + "grad_norm": 0.3688883811791314, + "learning_rate": 7.918243495073333e-06, + "loss": 0.1002, + "step": 3829 + }, + { + "epoch": 0.32, + "grad_norm": 0.234582937182848, + "learning_rate": 7.917135318920425e-06, + "loss": 0.0906, + "step": 3830 + }, + { + "epoch": 0.32, + "grad_norm": 0.3188407941578234, + "learning_rate": 7.916026925482286e-06, + "loss": 0.0826, + "step": 3831 + }, + { + "epoch": 0.32, + "grad_norm": 0.18750359060658844, + "learning_rate": 7.914918314841473e-06, + "loss": 0.0569, + "step": 3832 + }, + { + "epoch": 0.32, + "grad_norm": 0.24653489913298587, + "learning_rate": 7.913809487080564e-06, + "loss": 0.0868, + "step": 3833 + }, + { + "epoch": 0.32, + "grad_norm": 0.3870221264856809, + "learning_rate": 7.91270044228215e-06, + "loss": 0.1354, + "step": 3834 + }, + { + "epoch": 0.32, + "grad_norm": 0.5787301331289472, + "learning_rate": 7.91159118052884e-06, + "loss": 0.1025, + "step": 3835 + }, + { + "epoch": 0.32, + "grad_norm": 0.34443228388239705, + "learning_rate": 7.910481701903257e-06, + "loss": 0.061, + "step": 3836 + }, + { + "epoch": 0.32, + "grad_norm": 0.4586440360893343, + "learning_rate": 7.909372006488041e-06, + "loss": 0.1127, + "step": 3837 + }, + { + "epoch": 0.32, + "grad_norm": 0.4419320629196965, + "learning_rate": 7.908262094365851e-06, + "loss": 0.088, + "step": 3838 + }, + { + "epoch": 0.32, + "grad_norm": 0.47197517626942104, + "learning_rate": 7.907151965619358e-06, + "loss": 0.1297, + "step": 3839 + }, + { + "epoch": 0.32, + "grad_norm": 0.40702805408889603, + "learning_rate": 7.906041620331251e-06, + "loss": 0.1143, + "step": 3840 + }, + { + "epoch": 0.32, + "grad_norm": 0.22399706089817173, + "learning_rate": 7.904931058584235e-06, + "loss": 0.0755, + "step": 3841 + }, + { + "epoch": 0.32, + "grad_norm": 0.1857511816799227, + "learning_rate": 7.903820280461032e-06, + "loss": 0.0518, + "step": 3842 + }, + { + "epoch": 0.32, + "grad_norm": 0.6232628861637834, + "learning_rate": 7.902709286044378e-06, + "loss": 0.1174, + "step": 3843 + }, + { + "epoch": 0.32, + "grad_norm": 0.3791585525964029, + "learning_rate": 7.901598075417027e-06, + "loss": 0.1125, + "step": 3844 + }, + { + "epoch": 0.32, + "grad_norm": 0.21971738802441587, + "learning_rate": 7.900486648661748e-06, + "loss": 0.0606, + "step": 3845 + }, + { + "epoch": 0.32, + "grad_norm": 0.46764976769635097, + "learning_rate": 7.899375005861327e-06, + "loss": 0.1001, + "step": 3846 + }, + { + "epoch": 0.32, + "grad_norm": 0.4755886785236143, + "learning_rate": 7.898263147098566e-06, + "loss": 0.1147, + "step": 3847 + }, + { + "epoch": 0.32, + "grad_norm": 0.4853486099080179, + "learning_rate": 7.897151072456281e-06, + "loss": 0.123, + "step": 3848 + }, + { + "epoch": 0.32, + "grad_norm": 0.2547251497808277, + "learning_rate": 7.896038782017308e-06, + "loss": 0.0617, + "step": 3849 + }, + { + "epoch": 0.32, + "grad_norm": 0.2255613555877384, + "learning_rate": 7.894926275864496e-06, + "loss": 0.0726, + "step": 3850 + }, + { + "epoch": 0.32, + "grad_norm": 0.4206119141303988, + "learning_rate": 7.89381355408071e-06, + "loss": 0.0997, + "step": 3851 + }, + { + "epoch": 0.32, + "grad_norm": 0.3801132782758231, + "learning_rate": 7.892700616748834e-06, + "loss": 0.1127, + "step": 3852 + }, + { + "epoch": 0.32, + "grad_norm": 0.22865707795389592, + "learning_rate": 7.891587463951762e-06, + "loss": 0.051, + "step": 3853 + }, + { + "epoch": 0.32, + "grad_norm": 0.48544415150394044, + "learning_rate": 7.890474095772413e-06, + "loss": 0.1265, + "step": 3854 + }, + { + "epoch": 0.32, + "grad_norm": 0.3376584033847918, + "learning_rate": 7.889360512293716e-06, + "loss": 0.0895, + "step": 3855 + }, + { + "epoch": 0.32, + "grad_norm": 0.4455884081721892, + "learning_rate": 7.888246713598615e-06, + "loss": 0.112, + "step": 3856 + }, + { + "epoch": 0.32, + "grad_norm": 0.3974832738480591, + "learning_rate": 7.887132699770074e-06, + "loss": 0.0776, + "step": 3857 + }, + { + "epoch": 0.33, + "grad_norm": 0.25374421042763706, + "learning_rate": 7.88601847089107e-06, + "loss": 0.0919, + "step": 3858 + }, + { + "epoch": 0.33, + "grad_norm": 0.362099186040887, + "learning_rate": 7.884904027044597e-06, + "loss": 0.1104, + "step": 3859 + }, + { + "epoch": 0.33, + "grad_norm": 0.21619924652458278, + "learning_rate": 7.883789368313668e-06, + "loss": 0.0313, + "step": 3860 + }, + { + "epoch": 0.33, + "grad_norm": 0.23705021872196716, + "learning_rate": 7.882674494781308e-06, + "loss": 0.0894, + "step": 3861 + }, + { + "epoch": 0.33, + "grad_norm": 0.28662358042766967, + "learning_rate": 7.881559406530558e-06, + "loss": 0.0706, + "step": 3862 + }, + { + "epoch": 0.33, + "grad_norm": 0.2872241798161116, + "learning_rate": 7.880444103644478e-06, + "loss": 0.0745, + "step": 3863 + }, + { + "epoch": 0.33, + "grad_norm": 0.2556383958122709, + "learning_rate": 7.879328586206138e-06, + "loss": 0.0494, + "step": 3864 + }, + { + "epoch": 0.33, + "grad_norm": 0.40214061625545827, + "learning_rate": 7.878212854298636e-06, + "loss": 0.1176, + "step": 3865 + }, + { + "epoch": 0.33, + "grad_norm": 0.32447818837388304, + "learning_rate": 7.877096908005071e-06, + "loss": 0.0687, + "step": 3866 + }, + { + "epoch": 0.33, + "grad_norm": 0.4427484428195962, + "learning_rate": 7.87598074740857e-06, + "loss": 0.0773, + "step": 3867 + }, + { + "epoch": 0.33, + "grad_norm": 0.4785020316303284, + "learning_rate": 7.874864372592266e-06, + "loss": 0.1219, + "step": 3868 + }, + { + "epoch": 0.33, + "grad_norm": 0.27507285393148845, + "learning_rate": 7.87374778363932e-06, + "loss": 0.0485, + "step": 3869 + }, + { + "epoch": 0.33, + "grad_norm": 0.3008880564571848, + "learning_rate": 7.872630980632896e-06, + "loss": 0.0923, + "step": 3870 + }, + { + "epoch": 0.33, + "grad_norm": 0.282580828274891, + "learning_rate": 7.87151396365618e-06, + "loss": 0.0749, + "step": 3871 + }, + { + "epoch": 0.33, + "grad_norm": 0.4426910482050526, + "learning_rate": 7.87039673279238e-06, + "loss": 0.0902, + "step": 3872 + }, + { + "epoch": 0.33, + "grad_norm": 0.34774976882377323, + "learning_rate": 7.86927928812471e-06, + "loss": 0.1083, + "step": 3873 + }, + { + "epoch": 0.33, + "grad_norm": 0.32808868323170776, + "learning_rate": 7.868161629736401e-06, + "loss": 0.098, + "step": 3874 + }, + { + "epoch": 0.33, + "grad_norm": 0.20919092817141202, + "learning_rate": 7.867043757710708e-06, + "loss": 0.0754, + "step": 3875 + }, + { + "epoch": 0.33, + "grad_norm": 0.36780692192015213, + "learning_rate": 7.865925672130895e-06, + "loss": 0.1315, + "step": 3876 + }, + { + "epoch": 0.33, + "grad_norm": 0.27328051781699075, + "learning_rate": 7.86480737308024e-06, + "loss": 0.0664, + "step": 3877 + }, + { + "epoch": 0.33, + "grad_norm": 0.7532479406464252, + "learning_rate": 7.863688860642044e-06, + "loss": 0.1199, + "step": 3878 + }, + { + "epoch": 0.33, + "grad_norm": 0.267697721072923, + "learning_rate": 7.862570134899619e-06, + "loss": 0.0594, + "step": 3879 + }, + { + "epoch": 0.33, + "grad_norm": 0.2661207783712932, + "learning_rate": 7.861451195936297e-06, + "loss": 0.0665, + "step": 3880 + }, + { + "epoch": 0.33, + "grad_norm": 0.34507336322935234, + "learning_rate": 7.860332043835416e-06, + "loss": 0.074, + "step": 3881 + }, + { + "epoch": 0.33, + "grad_norm": 0.4842819332788676, + "learning_rate": 7.859212678680345e-06, + "loss": 0.1305, + "step": 3882 + }, + { + "epoch": 0.33, + "grad_norm": 0.22775452056139886, + "learning_rate": 7.858093100554459e-06, + "loss": 0.0835, + "step": 3883 + }, + { + "epoch": 0.33, + "grad_norm": 0.2649677826567675, + "learning_rate": 7.856973309541147e-06, + "loss": 0.0657, + "step": 3884 + }, + { + "epoch": 0.33, + "grad_norm": 0.39299774013880595, + "learning_rate": 7.85585330572382e-06, + "loss": 0.0936, + "step": 3885 + }, + { + "epoch": 0.33, + "grad_norm": 0.1688500630926694, + "learning_rate": 7.854733089185903e-06, + "loss": 0.0228, + "step": 3886 + }, + { + "epoch": 0.33, + "grad_norm": 0.3117916717171854, + "learning_rate": 7.853612660010834e-06, + "loss": 0.0553, + "step": 3887 + }, + { + "epoch": 0.33, + "grad_norm": 0.337920200309975, + "learning_rate": 7.852492018282072e-06, + "loss": 0.0679, + "step": 3888 + }, + { + "epoch": 0.33, + "grad_norm": 0.24995447289068923, + "learning_rate": 7.851371164083087e-06, + "loss": 0.0571, + "step": 3889 + }, + { + "epoch": 0.33, + "grad_norm": 0.3943027218458631, + "learning_rate": 7.850250097497366e-06, + "loss": 0.0815, + "step": 3890 + }, + { + "epoch": 0.33, + "grad_norm": 0.29212582309694685, + "learning_rate": 7.849128818608416e-06, + "loss": 0.0743, + "step": 3891 + }, + { + "epoch": 0.33, + "grad_norm": 0.315231818190116, + "learning_rate": 7.848007327499754e-06, + "loss": 0.065, + "step": 3892 + }, + { + "epoch": 0.33, + "grad_norm": 0.6298199122204137, + "learning_rate": 7.846885624254913e-06, + "loss": 0.1219, + "step": 3893 + }, + { + "epoch": 0.33, + "grad_norm": 0.29929562240464264, + "learning_rate": 7.845763708957448e-06, + "loss": 0.0674, + "step": 3894 + }, + { + "epoch": 0.33, + "grad_norm": 0.4851510718287001, + "learning_rate": 7.844641581690925e-06, + "loss": 0.0949, + "step": 3895 + }, + { + "epoch": 0.33, + "grad_norm": 0.24261751625604414, + "learning_rate": 7.843519242538925e-06, + "loss": 0.0578, + "step": 3896 + }, + { + "epoch": 0.33, + "grad_norm": 0.5517817474207253, + "learning_rate": 7.842396691585046e-06, + "loss": 0.1166, + "step": 3897 + }, + { + "epoch": 0.33, + "grad_norm": 0.2546134293116837, + "learning_rate": 7.841273928912905e-06, + "loss": 0.0939, + "step": 3898 + }, + { + "epoch": 0.33, + "grad_norm": 0.39441983782377865, + "learning_rate": 7.84015095460613e-06, + "loss": 0.0827, + "step": 3899 + }, + { + "epoch": 0.33, + "grad_norm": 0.3035935505016369, + "learning_rate": 7.839027768748364e-06, + "loss": 0.079, + "step": 3900 + }, + { + "epoch": 0.33, + "grad_norm": 0.30435216409594557, + "learning_rate": 7.837904371423276e-06, + "loss": 0.0754, + "step": 3901 + }, + { + "epoch": 0.33, + "grad_norm": 0.45913164376111093, + "learning_rate": 7.836780762714534e-06, + "loss": 0.1253, + "step": 3902 + }, + { + "epoch": 0.33, + "grad_norm": 0.3601705501532005, + "learning_rate": 7.835656942705839e-06, + "loss": 0.1068, + "step": 3903 + }, + { + "epoch": 0.33, + "grad_norm": 0.40586107288741385, + "learning_rate": 7.834532911480893e-06, + "loss": 0.0982, + "step": 3904 + }, + { + "epoch": 0.33, + "grad_norm": 0.3106545064412594, + "learning_rate": 7.833408669123423e-06, + "loss": 0.1153, + "step": 3905 + }, + { + "epoch": 0.33, + "grad_norm": 0.20534041734360922, + "learning_rate": 7.832284215717171e-06, + "loss": 0.0609, + "step": 3906 + }, + { + "epoch": 0.33, + "grad_norm": 0.375585410253011, + "learning_rate": 7.831159551345892e-06, + "loss": 0.1162, + "step": 3907 + }, + { + "epoch": 0.33, + "grad_norm": 0.18370658776983168, + "learning_rate": 7.830034676093356e-06, + "loss": 0.0697, + "step": 3908 + }, + { + "epoch": 0.33, + "grad_norm": 0.20099520097408763, + "learning_rate": 7.828909590043351e-06, + "loss": 0.0625, + "step": 3909 + }, + { + "epoch": 0.33, + "grad_norm": 0.5327097194989433, + "learning_rate": 7.82778429327968e-06, + "loss": 0.1214, + "step": 3910 + }, + { + "epoch": 0.33, + "grad_norm": 0.48473639875631025, + "learning_rate": 7.826658785886163e-06, + "loss": 0.1183, + "step": 3911 + }, + { + "epoch": 0.33, + "grad_norm": 0.3927484108510367, + "learning_rate": 7.825533067946632e-06, + "loss": 0.0989, + "step": 3912 + }, + { + "epoch": 0.33, + "grad_norm": 0.40168002164475985, + "learning_rate": 7.824407139544938e-06, + "loss": 0.149, + "step": 3913 + }, + { + "epoch": 0.33, + "grad_norm": 0.27247877096938683, + "learning_rate": 7.823281000764948e-06, + "loss": 0.0702, + "step": 3914 + }, + { + "epoch": 0.33, + "grad_norm": 0.24980230421511473, + "learning_rate": 7.822154651690543e-06, + "loss": 0.0418, + "step": 3915 + }, + { + "epoch": 0.33, + "grad_norm": 0.38864783944524817, + "learning_rate": 7.821028092405616e-06, + "loss": 0.0915, + "step": 3916 + }, + { + "epoch": 0.33, + "grad_norm": 0.5167848048494923, + "learning_rate": 7.819901322994085e-06, + "loss": 0.1205, + "step": 3917 + }, + { + "epoch": 0.33, + "grad_norm": 0.27163177366930685, + "learning_rate": 7.818774343539876e-06, + "loss": 0.08, + "step": 3918 + }, + { + "epoch": 0.33, + "grad_norm": 0.3866988964807973, + "learning_rate": 7.817647154126933e-06, + "loss": 0.0611, + "step": 3919 + }, + { + "epoch": 0.33, + "grad_norm": 0.3371888403759303, + "learning_rate": 7.816519754839217e-06, + "loss": 0.0944, + "step": 3920 + }, + { + "epoch": 0.33, + "grad_norm": 0.26309356733731043, + "learning_rate": 7.815392145760702e-06, + "loss": 0.089, + "step": 3921 + }, + { + "epoch": 0.33, + "grad_norm": 0.359518063411512, + "learning_rate": 7.814264326975377e-06, + "loss": 0.0878, + "step": 3922 + }, + { + "epoch": 0.33, + "grad_norm": 0.3070311216569019, + "learning_rate": 7.813136298567254e-06, + "loss": 0.0717, + "step": 3923 + }, + { + "epoch": 0.33, + "grad_norm": 0.271065586738851, + "learning_rate": 7.81200806062035e-06, + "loss": 0.0796, + "step": 3924 + }, + { + "epoch": 0.33, + "grad_norm": 0.26696739368720934, + "learning_rate": 7.810879613218705e-06, + "loss": 0.0817, + "step": 3925 + }, + { + "epoch": 0.33, + "grad_norm": 0.3020440417645833, + "learning_rate": 7.80975095644637e-06, + "loss": 0.059, + "step": 3926 + }, + { + "epoch": 0.33, + "grad_norm": 0.28228129717591405, + "learning_rate": 7.808622090387418e-06, + "loss": 0.0838, + "step": 3927 + }, + { + "epoch": 0.33, + "grad_norm": 0.2760120273273923, + "learning_rate": 7.807493015125932e-06, + "loss": 0.0572, + "step": 3928 + }, + { + "epoch": 0.33, + "grad_norm": 0.2516615265248732, + "learning_rate": 7.806363730746009e-06, + "loss": 0.0612, + "step": 3929 + }, + { + "epoch": 0.33, + "grad_norm": 0.3047945124657934, + "learning_rate": 7.805234237331767e-06, + "loss": 0.0896, + "step": 3930 + }, + { + "epoch": 0.33, + "grad_norm": 0.21608690471088784, + "learning_rate": 7.804104534967339e-06, + "loss": 0.0401, + "step": 3931 + }, + { + "epoch": 0.33, + "grad_norm": 0.32367240352112353, + "learning_rate": 7.80297462373687e-06, + "loss": 0.069, + "step": 3932 + }, + { + "epoch": 0.33, + "grad_norm": 0.24022132954773023, + "learning_rate": 7.801844503724521e-06, + "loss": 0.091, + "step": 3933 + }, + { + "epoch": 0.33, + "grad_norm": 0.4523752191940846, + "learning_rate": 7.800714175014472e-06, + "loss": 0.0842, + "step": 3934 + }, + { + "epoch": 0.33, + "grad_norm": 0.4437494070946812, + "learning_rate": 7.799583637690917e-06, + "loss": 0.1166, + "step": 3935 + }, + { + "epoch": 0.33, + "grad_norm": 0.25327393771307, + "learning_rate": 7.798452891838061e-06, + "loss": 0.0709, + "step": 3936 + }, + { + "epoch": 0.33, + "grad_norm": 0.28885642118695637, + "learning_rate": 7.797321937540134e-06, + "loss": 0.0911, + "step": 3937 + }, + { + "epoch": 0.33, + "grad_norm": 0.4397357676924909, + "learning_rate": 7.796190774881374e-06, + "loss": 0.1429, + "step": 3938 + }, + { + "epoch": 0.33, + "grad_norm": 0.5784013673537458, + "learning_rate": 7.795059403946034e-06, + "loss": 0.1314, + "step": 3939 + }, + { + "epoch": 0.33, + "grad_norm": 0.3465212507978782, + "learning_rate": 7.793927824818387e-06, + "loss": 0.0992, + "step": 3940 + }, + { + "epoch": 0.33, + "grad_norm": 0.3967270184271305, + "learning_rate": 7.792796037582721e-06, + "loss": 0.0972, + "step": 3941 + }, + { + "epoch": 0.33, + "grad_norm": 0.2423899022389975, + "learning_rate": 7.791664042323338e-06, + "loss": 0.0815, + "step": 3942 + }, + { + "epoch": 0.33, + "grad_norm": 0.3158363326405831, + "learning_rate": 7.790531839124554e-06, + "loss": 0.0933, + "step": 3943 + }, + { + "epoch": 0.33, + "grad_norm": 0.4994796120404779, + "learning_rate": 7.789399428070701e-06, + "loss": 0.1053, + "step": 3944 + }, + { + "epoch": 0.33, + "grad_norm": 0.47922147707100066, + "learning_rate": 7.78826680924613e-06, + "loss": 0.1296, + "step": 3945 + }, + { + "epoch": 0.33, + "grad_norm": 0.4446338324211169, + "learning_rate": 7.787133982735204e-06, + "loss": 0.0934, + "step": 3946 + }, + { + "epoch": 0.33, + "grad_norm": 0.642337923926641, + "learning_rate": 7.786000948622303e-06, + "loss": 0.13, + "step": 3947 + }, + { + "epoch": 0.33, + "grad_norm": 0.34137154381810103, + "learning_rate": 7.784867706991822e-06, + "loss": 0.0972, + "step": 3948 + }, + { + "epoch": 0.33, + "grad_norm": 0.3816905833071871, + "learning_rate": 7.78373425792817e-06, + "loss": 0.1206, + "step": 3949 + }, + { + "epoch": 0.33, + "grad_norm": 0.286984379845749, + "learning_rate": 7.782600601515776e-06, + "loss": 0.0627, + "step": 3950 + }, + { + "epoch": 0.33, + "grad_norm": 0.34552144315568517, + "learning_rate": 7.781466737839076e-06, + "loss": 0.1077, + "step": 3951 + }, + { + "epoch": 0.33, + "grad_norm": 0.2585494459406052, + "learning_rate": 7.780332666982533e-06, + "loss": 0.0883, + "step": 3952 + }, + { + "epoch": 0.33, + "grad_norm": 0.2722975769808025, + "learning_rate": 7.779198389030616e-06, + "loss": 0.0647, + "step": 3953 + }, + { + "epoch": 0.33, + "grad_norm": 0.17233674689960177, + "learning_rate": 7.778063904067812e-06, + "loss": 0.051, + "step": 3954 + }, + { + "epoch": 0.33, + "grad_norm": 0.2831338892258267, + "learning_rate": 7.776929212178625e-06, + "loss": 0.049, + "step": 3955 + }, + { + "epoch": 0.33, + "grad_norm": 0.18884196810759638, + "learning_rate": 7.775794313447574e-06, + "loss": 0.0677, + "step": 3956 + }, + { + "epoch": 0.33, + "grad_norm": 0.32855299250160236, + "learning_rate": 7.774659207959192e-06, + "loss": 0.0806, + "step": 3957 + }, + { + "epoch": 0.33, + "grad_norm": 0.24441013219395658, + "learning_rate": 7.773523895798029e-06, + "loss": 0.071, + "step": 3958 + }, + { + "epoch": 0.33, + "grad_norm": 0.3521650954837509, + "learning_rate": 7.77238837704865e-06, + "loss": 0.0881, + "step": 3959 + }, + { + "epoch": 0.33, + "grad_norm": 0.40023080072597395, + "learning_rate": 7.771252651795634e-06, + "loss": 0.0766, + "step": 3960 + }, + { + "epoch": 0.33, + "grad_norm": 0.4300898088255061, + "learning_rate": 7.770116720123575e-06, + "loss": 0.1034, + "step": 3961 + }, + { + "epoch": 0.33, + "grad_norm": 0.31574904581382807, + "learning_rate": 7.768980582117088e-06, + "loss": 0.0535, + "step": 3962 + }, + { + "epoch": 0.33, + "grad_norm": 0.33147446430818694, + "learning_rate": 7.767844237860796e-06, + "loss": 0.0907, + "step": 3963 + }, + { + "epoch": 0.33, + "grad_norm": 0.37455763022483446, + "learning_rate": 7.766707687439338e-06, + "loss": 0.1008, + "step": 3964 + }, + { + "epoch": 0.33, + "grad_norm": 0.21217688007194535, + "learning_rate": 7.765570930937378e-06, + "loss": 0.0562, + "step": 3965 + }, + { + "epoch": 0.33, + "grad_norm": 0.22957078114091542, + "learning_rate": 7.764433968439584e-06, + "loss": 0.0424, + "step": 3966 + }, + { + "epoch": 0.33, + "grad_norm": 0.2494239311952329, + "learning_rate": 7.763296800030645e-06, + "loss": 0.0648, + "step": 3967 + }, + { + "epoch": 0.33, + "grad_norm": 0.39853637975776846, + "learning_rate": 7.76215942579526e-06, + "loss": 0.1025, + "step": 3968 + }, + { + "epoch": 0.33, + "grad_norm": 0.47141907447822345, + "learning_rate": 7.761021845818151e-06, + "loss": 0.1081, + "step": 3969 + }, + { + "epoch": 0.33, + "grad_norm": 0.25839828207284204, + "learning_rate": 7.75988406018405e-06, + "loss": 0.0749, + "step": 3970 + }, + { + "epoch": 0.33, + "grad_norm": 0.36266493309753617, + "learning_rate": 7.75874606897771e-06, + "loss": 0.1203, + "step": 3971 + }, + { + "epoch": 0.33, + "grad_norm": 0.6177078188743766, + "learning_rate": 7.75760787228389e-06, + "loss": 0.1198, + "step": 3972 + }, + { + "epoch": 0.33, + "grad_norm": 0.4094266014929528, + "learning_rate": 7.75646947018737e-06, + "loss": 0.0995, + "step": 3973 + }, + { + "epoch": 0.33, + "grad_norm": 0.3594921905797321, + "learning_rate": 7.755330862772948e-06, + "loss": 0.0701, + "step": 3974 + }, + { + "epoch": 0.33, + "grad_norm": 0.5153371565438242, + "learning_rate": 7.754192050125432e-06, + "loss": 0.0905, + "step": 3975 + }, + { + "epoch": 0.34, + "grad_norm": 0.4524479252696568, + "learning_rate": 7.753053032329649e-06, + "loss": 0.117, + "step": 3976 + }, + { + "epoch": 0.34, + "grad_norm": 0.5007515020466712, + "learning_rate": 7.751913809470436e-06, + "loss": 0.0886, + "step": 3977 + }, + { + "epoch": 0.34, + "grad_norm": 0.2674171332182288, + "learning_rate": 7.750774381632654e-06, + "loss": 0.0546, + "step": 3978 + }, + { + "epoch": 0.34, + "grad_norm": 0.23899557220292109, + "learning_rate": 7.749634748901169e-06, + "loss": 0.0788, + "step": 3979 + }, + { + "epoch": 0.34, + "grad_norm": 0.39561194783171605, + "learning_rate": 7.74849491136087e-06, + "loss": 0.0575, + "step": 3980 + }, + { + "epoch": 0.34, + "grad_norm": 0.29513834962406893, + "learning_rate": 7.747354869096662e-06, + "loss": 0.0833, + "step": 3981 + }, + { + "epoch": 0.34, + "grad_norm": 0.25178111276156356, + "learning_rate": 7.746214622193456e-06, + "loss": 0.0729, + "step": 3982 + }, + { + "epoch": 0.34, + "grad_norm": 0.31989368785416455, + "learning_rate": 7.74507417073619e-06, + "loss": 0.0756, + "step": 3983 + }, + { + "epoch": 0.34, + "grad_norm": 0.28770204019580486, + "learning_rate": 7.743933514809806e-06, + "loss": 0.0909, + "step": 3984 + }, + { + "epoch": 0.34, + "grad_norm": 0.4168901267760503, + "learning_rate": 7.742792654499266e-06, + "loss": 0.0874, + "step": 3985 + }, + { + "epoch": 0.34, + "grad_norm": 0.36537482128960536, + "learning_rate": 7.741651589889557e-06, + "loss": 0.1088, + "step": 3986 + }, + { + "epoch": 0.34, + "grad_norm": 0.23887990002328263, + "learning_rate": 7.740510321065664e-06, + "loss": 0.0638, + "step": 3987 + }, + { + "epoch": 0.34, + "grad_norm": 0.4304363520350159, + "learning_rate": 7.739368848112599e-06, + "loss": 0.0863, + "step": 3988 + }, + { + "epoch": 0.34, + "grad_norm": 0.2692463121263935, + "learning_rate": 7.738227171115382e-06, + "loss": 0.0624, + "step": 3989 + }, + { + "epoch": 0.34, + "grad_norm": 0.34397976813151626, + "learning_rate": 7.737085290159054e-06, + "loss": 0.0945, + "step": 3990 + }, + { + "epoch": 0.34, + "grad_norm": 0.2808250520498091, + "learning_rate": 7.735943205328671e-06, + "loss": 0.0784, + "step": 3991 + }, + { + "epoch": 0.34, + "grad_norm": 0.2001538830970748, + "learning_rate": 7.734800916709297e-06, + "loss": 0.0653, + "step": 3992 + }, + { + "epoch": 0.34, + "grad_norm": 0.239184288096797, + "learning_rate": 7.733658424386023e-06, + "loss": 0.0846, + "step": 3993 + }, + { + "epoch": 0.34, + "grad_norm": 0.27395350249367867, + "learning_rate": 7.73251572844394e-06, + "loss": 0.0756, + "step": 3994 + }, + { + "epoch": 0.34, + "grad_norm": 0.309972309252477, + "learning_rate": 7.731372828968173e-06, + "loss": 0.0526, + "step": 3995 + }, + { + "epoch": 0.34, + "grad_norm": 0.6218458113463283, + "learning_rate": 7.730229726043844e-06, + "loss": 0.0847, + "step": 3996 + }, + { + "epoch": 0.34, + "grad_norm": 0.4148747261021244, + "learning_rate": 7.729086419756098e-06, + "loss": 0.101, + "step": 3997 + }, + { + "epoch": 0.34, + "grad_norm": 0.47661683101120744, + "learning_rate": 7.7279429101901e-06, + "loss": 0.1037, + "step": 3998 + }, + { + "epoch": 0.34, + "grad_norm": 0.41667334874578343, + "learning_rate": 7.726799197431023e-06, + "loss": 0.1047, + "step": 3999 + }, + { + "epoch": 0.34, + "grad_norm": 0.28002711284733733, + "learning_rate": 7.725655281564055e-06, + "loss": 0.0552, + "step": 4000 + }, + { + "epoch": 0.34, + "grad_norm": 0.42819959084073117, + "learning_rate": 7.724511162674404e-06, + "loss": 0.1029, + "step": 4001 + }, + { + "epoch": 0.34, + "grad_norm": 0.33068936577648816, + "learning_rate": 7.723366840847291e-06, + "loss": 0.0896, + "step": 4002 + }, + { + "epoch": 0.34, + "grad_norm": 0.6769773883105994, + "learning_rate": 7.722222316167952e-06, + "loss": 0.1748, + "step": 4003 + }, + { + "epoch": 0.34, + "grad_norm": 0.3447402535009358, + "learning_rate": 7.721077588721632e-06, + "loss": 0.0679, + "step": 4004 + }, + { + "epoch": 0.34, + "grad_norm": 0.48399046976434695, + "learning_rate": 7.719932658593607e-06, + "loss": 0.0698, + "step": 4005 + }, + { + "epoch": 0.34, + "grad_norm": 0.4115589996149729, + "learning_rate": 7.71878752586915e-06, + "loss": 0.0737, + "step": 4006 + }, + { + "epoch": 0.34, + "grad_norm": 0.3244341082691695, + "learning_rate": 7.71764219063356e-06, + "loss": 0.1081, + "step": 4007 + }, + { + "epoch": 0.34, + "grad_norm": 0.4752411844244107, + "learning_rate": 7.71649665297215e-06, + "loss": 0.1112, + "step": 4008 + }, + { + "epoch": 0.34, + "grad_norm": 0.42373652064661477, + "learning_rate": 7.715350912970243e-06, + "loss": 0.1088, + "step": 4009 + }, + { + "epoch": 0.34, + "grad_norm": 0.35431428663881925, + "learning_rate": 7.714204970713182e-06, + "loss": 0.0579, + "step": 4010 + }, + { + "epoch": 0.34, + "grad_norm": 0.2930407518608745, + "learning_rate": 7.713058826286324e-06, + "loss": 0.079, + "step": 4011 + }, + { + "epoch": 0.34, + "grad_norm": 0.3590825257539356, + "learning_rate": 7.711912479775041e-06, + "loss": 0.1151, + "step": 4012 + }, + { + "epoch": 0.34, + "grad_norm": 0.4574084655550777, + "learning_rate": 7.710765931264716e-06, + "loss": 0.0808, + "step": 4013 + }, + { + "epoch": 0.34, + "grad_norm": 0.2720926805606191, + "learning_rate": 7.709619180840755e-06, + "loss": 0.0669, + "step": 4014 + }, + { + "epoch": 0.34, + "grad_norm": 0.38781416518756745, + "learning_rate": 7.70847222858857e-06, + "loss": 0.1294, + "step": 4015 + }, + { + "epoch": 0.34, + "grad_norm": 0.39553677643909313, + "learning_rate": 7.707325074593599e-06, + "loss": 0.1232, + "step": 4016 + }, + { + "epoch": 0.34, + "grad_norm": 0.2944292876842651, + "learning_rate": 7.706177718941284e-06, + "loss": 0.0838, + "step": 4017 + }, + { + "epoch": 0.34, + "grad_norm": 0.2756021041853114, + "learning_rate": 7.705030161717086e-06, + "loss": 0.0786, + "step": 4018 + }, + { + "epoch": 0.34, + "grad_norm": 0.2880406976479227, + "learning_rate": 7.703882403006484e-06, + "loss": 0.0683, + "step": 4019 + }, + { + "epoch": 0.34, + "grad_norm": 0.3760143685694372, + "learning_rate": 7.702734442894971e-06, + "loss": 0.0961, + "step": 4020 + }, + { + "epoch": 0.34, + "grad_norm": 0.3210768261952558, + "learning_rate": 7.701586281468052e-06, + "loss": 0.0756, + "step": 4021 + }, + { + "epoch": 0.34, + "grad_norm": 0.20587020655063945, + "learning_rate": 7.700437918811247e-06, + "loss": 0.0603, + "step": 4022 + }, + { + "epoch": 0.34, + "grad_norm": 0.5094366062290338, + "learning_rate": 7.699289355010096e-06, + "loss": 0.1066, + "step": 4023 + }, + { + "epoch": 0.34, + "grad_norm": 0.4034650487557578, + "learning_rate": 7.69814059015015e-06, + "loss": 0.0982, + "step": 4024 + }, + { + "epoch": 0.34, + "grad_norm": 0.2741892725820665, + "learning_rate": 7.696991624316974e-06, + "loss": 0.0754, + "step": 4025 + }, + { + "epoch": 0.34, + "grad_norm": 0.3205332042832609, + "learning_rate": 7.69584245759615e-06, + "loss": 0.0964, + "step": 4026 + }, + { + "epoch": 0.34, + "grad_norm": 0.43474262630995025, + "learning_rate": 7.694693090073275e-06, + "loss": 0.11, + "step": 4027 + }, + { + "epoch": 0.34, + "grad_norm": 0.27107576963396923, + "learning_rate": 7.693543521833962e-06, + "loss": 0.0533, + "step": 4028 + }, + { + "epoch": 0.34, + "grad_norm": 0.2203427389352263, + "learning_rate": 7.692393752963837e-06, + "loss": 0.0464, + "step": 4029 + }, + { + "epoch": 0.34, + "grad_norm": 0.2533190497200509, + "learning_rate": 7.69124378354854e-06, + "loss": 0.0697, + "step": 4030 + }, + { + "epoch": 0.34, + "grad_norm": 0.2996798470197408, + "learning_rate": 7.690093613673727e-06, + "loss": 0.0858, + "step": 4031 + }, + { + "epoch": 0.34, + "grad_norm": 0.21973213558292526, + "learning_rate": 7.688943243425073e-06, + "loss": 0.0489, + "step": 4032 + }, + { + "epoch": 0.34, + "grad_norm": 0.48715801892400007, + "learning_rate": 7.687792672888261e-06, + "loss": 0.0912, + "step": 4033 + }, + { + "epoch": 0.34, + "grad_norm": 0.30703317361014504, + "learning_rate": 7.686641902148994e-06, + "loss": 0.0747, + "step": 4034 + }, + { + "epoch": 0.34, + "grad_norm": 0.23298684841048295, + "learning_rate": 7.685490931292987e-06, + "loss": 0.0632, + "step": 4035 + }, + { + "epoch": 0.34, + "grad_norm": 0.2711921061733807, + "learning_rate": 7.68433976040597e-06, + "loss": 0.0786, + "step": 4036 + }, + { + "epoch": 0.34, + "grad_norm": 0.21030724459067965, + "learning_rate": 7.683188389573692e-06, + "loss": 0.0431, + "step": 4037 + }, + { + "epoch": 0.34, + "grad_norm": 0.707414531849631, + "learning_rate": 7.682036818881913e-06, + "loss": 0.1177, + "step": 4038 + }, + { + "epoch": 0.34, + "grad_norm": 0.4889481316486415, + "learning_rate": 7.680885048416405e-06, + "loss": 0.1031, + "step": 4039 + }, + { + "epoch": 0.34, + "grad_norm": 0.3996951815634166, + "learning_rate": 7.679733078262963e-06, + "loss": 0.124, + "step": 4040 + }, + { + "epoch": 0.34, + "grad_norm": 0.37750502644961265, + "learning_rate": 7.67858090850739e-06, + "loss": 0.0997, + "step": 4041 + }, + { + "epoch": 0.34, + "grad_norm": 0.5240621574382807, + "learning_rate": 7.677428539235508e-06, + "loss": 0.1012, + "step": 4042 + }, + { + "epoch": 0.34, + "grad_norm": 0.43680227639711205, + "learning_rate": 7.67627597053315e-06, + "loss": 0.0938, + "step": 4043 + }, + { + "epoch": 0.34, + "grad_norm": 0.2689256479813027, + "learning_rate": 7.675123202486167e-06, + "loss": 0.0938, + "step": 4044 + }, + { + "epoch": 0.34, + "grad_norm": 0.20647459481454553, + "learning_rate": 7.673970235180423e-06, + "loss": 0.0592, + "step": 4045 + }, + { + "epoch": 0.34, + "grad_norm": 0.4784859020930211, + "learning_rate": 7.6728170687018e-06, + "loss": 0.1002, + "step": 4046 + }, + { + "epoch": 0.34, + "grad_norm": 0.2845623831074091, + "learning_rate": 7.671663703136193e-06, + "loss": 0.0976, + "step": 4047 + }, + { + "epoch": 0.34, + "grad_norm": 0.4091619167054101, + "learning_rate": 7.67051013856951e-06, + "loss": 0.0957, + "step": 4048 + }, + { + "epoch": 0.34, + "grad_norm": 0.43880039722637615, + "learning_rate": 7.669356375087673e-06, + "loss": 0.0811, + "step": 4049 + }, + { + "epoch": 0.34, + "grad_norm": 0.5165536880517452, + "learning_rate": 7.668202412776624e-06, + "loss": 0.064, + "step": 4050 + }, + { + "epoch": 0.34, + "grad_norm": 0.2512815741295981, + "learning_rate": 7.667048251722316e-06, + "loss": 0.0586, + "step": 4051 + }, + { + "epoch": 0.34, + "grad_norm": 0.3867109785146295, + "learning_rate": 7.665893892010717e-06, + "loss": 0.083, + "step": 4052 + }, + { + "epoch": 0.34, + "grad_norm": 0.4886579592787364, + "learning_rate": 7.66473933372781e-06, + "loss": 0.1243, + "step": 4053 + }, + { + "epoch": 0.34, + "grad_norm": 0.28646513835626664, + "learning_rate": 7.663584576959596e-06, + "loss": 0.0654, + "step": 4054 + }, + { + "epoch": 0.34, + "grad_norm": 0.3553371167488568, + "learning_rate": 7.662429621792085e-06, + "loss": 0.093, + "step": 4055 + }, + { + "epoch": 0.34, + "grad_norm": 0.3522209563233432, + "learning_rate": 7.661274468311306e-06, + "loss": 0.1049, + "step": 4056 + }, + { + "epoch": 0.34, + "grad_norm": 0.18011262515380627, + "learning_rate": 7.660119116603301e-06, + "loss": 0.0298, + "step": 4057 + }, + { + "epoch": 0.34, + "grad_norm": 0.4011966156190251, + "learning_rate": 7.65896356675413e-06, + "loss": 0.0537, + "step": 4058 + }, + { + "epoch": 0.34, + "grad_norm": 0.5989935728998027, + "learning_rate": 7.657807818849861e-06, + "loss": 0.1262, + "step": 4059 + }, + { + "epoch": 0.34, + "grad_norm": 0.3411606643183269, + "learning_rate": 7.656651872976584e-06, + "loss": 0.0974, + "step": 4060 + }, + { + "epoch": 0.34, + "grad_norm": 0.5844102571786082, + "learning_rate": 7.6554957292204e-06, + "loss": 0.1586, + "step": 4061 + }, + { + "epoch": 0.34, + "grad_norm": 0.33484023894948817, + "learning_rate": 7.654339387667423e-06, + "loss": 0.0653, + "step": 4062 + }, + { + "epoch": 0.34, + "grad_norm": 0.4843659168103004, + "learning_rate": 7.653182848403786e-06, + "loss": 0.0754, + "step": 4063 + }, + { + "epoch": 0.34, + "grad_norm": 0.39481290357573484, + "learning_rate": 7.652026111515635e-06, + "loss": 0.072, + "step": 4064 + }, + { + "epoch": 0.34, + "grad_norm": 0.2845060003189976, + "learning_rate": 7.65086917708913e-06, + "loss": 0.0725, + "step": 4065 + }, + { + "epoch": 0.34, + "grad_norm": 0.18394871914271696, + "learning_rate": 7.649712045210446e-06, + "loss": 0.0773, + "step": 4066 + }, + { + "epoch": 0.34, + "grad_norm": 0.217378176775968, + "learning_rate": 7.648554715965772e-06, + "loss": 0.0627, + "step": 4067 + }, + { + "epoch": 0.34, + "grad_norm": 0.3090135678269327, + "learning_rate": 7.647397189441315e-06, + "loss": 0.0966, + "step": 4068 + }, + { + "epoch": 0.34, + "grad_norm": 0.3604927719882226, + "learning_rate": 7.646239465723293e-06, + "loss": 0.0904, + "step": 4069 + }, + { + "epoch": 0.34, + "grad_norm": 0.3213725105667064, + "learning_rate": 7.64508154489794e-06, + "loss": 0.099, + "step": 4070 + }, + { + "epoch": 0.34, + "grad_norm": 0.27765185240490897, + "learning_rate": 7.643923427051506e-06, + "loss": 0.0717, + "step": 4071 + }, + { + "epoch": 0.34, + "grad_norm": 0.26264450806378553, + "learning_rate": 7.642765112270251e-06, + "loss": 0.0637, + "step": 4072 + }, + { + "epoch": 0.34, + "grad_norm": 0.6326742163905014, + "learning_rate": 7.641606600640458e-06, + "loss": 0.1498, + "step": 4073 + }, + { + "epoch": 0.34, + "grad_norm": 0.2166582316883081, + "learning_rate": 7.640447892248416e-06, + "loss": 0.064, + "step": 4074 + }, + { + "epoch": 0.34, + "grad_norm": 0.25109836178583345, + "learning_rate": 7.63928898718043e-06, + "loss": 0.0699, + "step": 4075 + }, + { + "epoch": 0.34, + "grad_norm": 0.3297433233606274, + "learning_rate": 7.638129885522829e-06, + "loss": 0.0866, + "step": 4076 + }, + { + "epoch": 0.34, + "grad_norm": 0.3077088854014417, + "learning_rate": 7.636970587361946e-06, + "loss": 0.0953, + "step": 4077 + }, + { + "epoch": 0.34, + "grad_norm": 0.4627542230524302, + "learning_rate": 7.63581109278413e-06, + "loss": 0.098, + "step": 4078 + }, + { + "epoch": 0.34, + "grad_norm": 0.3384242436075803, + "learning_rate": 7.63465140187575e-06, + "loss": 0.0761, + "step": 4079 + }, + { + "epoch": 0.34, + "grad_norm": 0.40829844522700826, + "learning_rate": 7.633491514723185e-06, + "loss": 0.0902, + "step": 4080 + }, + { + "epoch": 0.34, + "grad_norm": 0.6590651614844071, + "learning_rate": 7.632331431412831e-06, + "loss": 0.0725, + "step": 4081 + }, + { + "epoch": 0.34, + "grad_norm": 0.6599271711843734, + "learning_rate": 7.631171152031099e-06, + "loss": 0.13, + "step": 4082 + }, + { + "epoch": 0.34, + "grad_norm": 0.422721413179361, + "learning_rate": 7.63001067666441e-06, + "loss": 0.1149, + "step": 4083 + }, + { + "epoch": 0.34, + "grad_norm": 0.44969440773774494, + "learning_rate": 7.628850005399205e-06, + "loss": 0.1118, + "step": 4084 + }, + { + "epoch": 0.34, + "grad_norm": 0.5397707870719491, + "learning_rate": 7.627689138321939e-06, + "loss": 0.094, + "step": 4085 + }, + { + "epoch": 0.34, + "grad_norm": 0.2977252998941182, + "learning_rate": 7.626528075519076e-06, + "loss": 0.0817, + "step": 4086 + }, + { + "epoch": 0.34, + "grad_norm": 0.2182586032932128, + "learning_rate": 7.625366817077103e-06, + "loss": 0.0703, + "step": 4087 + }, + { + "epoch": 0.34, + "grad_norm": 0.23855785350685535, + "learning_rate": 7.624205363082515e-06, + "loss": 0.0713, + "step": 4088 + }, + { + "epoch": 0.34, + "grad_norm": 0.2706152001778837, + "learning_rate": 7.623043713621825e-06, + "loss": 0.0859, + "step": 4089 + }, + { + "epoch": 0.34, + "grad_norm": 0.35517180063459086, + "learning_rate": 7.621881868781559e-06, + "loss": 0.096, + "step": 4090 + }, + { + "epoch": 0.34, + "grad_norm": 0.3343674798532962, + "learning_rate": 7.620719828648257e-06, + "loss": 0.0858, + "step": 4091 + }, + { + "epoch": 0.34, + "grad_norm": 0.3385285508028679, + "learning_rate": 7.6195575933084755e-06, + "loss": 0.086, + "step": 4092 + }, + { + "epoch": 0.34, + "grad_norm": 0.3116993432037076, + "learning_rate": 7.618395162848785e-06, + "loss": 0.0954, + "step": 4093 + }, + { + "epoch": 0.34, + "grad_norm": 0.3687705844903468, + "learning_rate": 7.6172325373557685e-06, + "loss": 0.0784, + "step": 4094 + }, + { + "epoch": 0.35, + "grad_norm": 0.31312325750676134, + "learning_rate": 7.616069716916028e-06, + "loss": 0.0592, + "step": 4095 + }, + { + "epoch": 0.35, + "grad_norm": 0.31934650431237904, + "learning_rate": 7.6149067016161735e-06, + "loss": 0.089, + "step": 4096 + }, + { + "epoch": 0.35, + "grad_norm": 0.28491119834756096, + "learning_rate": 7.613743491542835e-06, + "loss": 0.0616, + "step": 4097 + }, + { + "epoch": 0.35, + "grad_norm": 0.3054768956489881, + "learning_rate": 7.612580086782655e-06, + "loss": 0.1074, + "step": 4098 + }, + { + "epoch": 0.35, + "grad_norm": 0.29584524204119894, + "learning_rate": 7.6114164874222915e-06, + "loss": 0.0613, + "step": 4099 + }, + { + "epoch": 0.35, + "grad_norm": 0.4403849222399699, + "learning_rate": 7.610252693548416e-06, + "loss": 0.1221, + "step": 4100 + }, + { + "epoch": 0.35, + "grad_norm": 0.376056319007393, + "learning_rate": 7.609088705247711e-06, + "loss": 0.091, + "step": 4101 + }, + { + "epoch": 0.35, + "grad_norm": 0.3235599324792098, + "learning_rate": 7.607924522606884e-06, + "loss": 0.1014, + "step": 4102 + }, + { + "epoch": 0.35, + "grad_norm": 0.44820619060878325, + "learning_rate": 7.606760145712644e-06, + "loss": 0.1395, + "step": 4103 + }, + { + "epoch": 0.35, + "grad_norm": 0.3740918933968217, + "learning_rate": 7.605595574651724e-06, + "loss": 0.0905, + "step": 4104 + }, + { + "epoch": 0.35, + "grad_norm": 0.3529507226498215, + "learning_rate": 7.604430809510866e-06, + "loss": 0.1305, + "step": 4105 + }, + { + "epoch": 0.35, + "grad_norm": 0.20612356771082319, + "learning_rate": 7.603265850376831e-06, + "loss": 0.0576, + "step": 4106 + }, + { + "epoch": 0.35, + "grad_norm": 0.3253870556899657, + "learning_rate": 7.602100697336387e-06, + "loss": 0.0977, + "step": 4107 + }, + { + "epoch": 0.35, + "grad_norm": 0.25916445341616645, + "learning_rate": 7.600935350476328e-06, + "loss": 0.0912, + "step": 4108 + }, + { + "epoch": 0.35, + "grad_norm": 0.33828030855006297, + "learning_rate": 7.599769809883451e-06, + "loss": 0.0554, + "step": 4109 + }, + { + "epoch": 0.35, + "grad_norm": 0.32881785971908845, + "learning_rate": 7.598604075644574e-06, + "loss": 0.0959, + "step": 4110 + }, + { + "epoch": 0.35, + "grad_norm": 0.3222898441562243, + "learning_rate": 7.597438147846527e-06, + "loss": 0.0684, + "step": 4111 + }, + { + "epoch": 0.35, + "grad_norm": 0.1910580485649205, + "learning_rate": 7.596272026576157e-06, + "loss": 0.064, + "step": 4112 + }, + { + "epoch": 0.35, + "grad_norm": 0.27142651348028524, + "learning_rate": 7.595105711920321e-06, + "loss": 0.0789, + "step": 4113 + }, + { + "epoch": 0.35, + "grad_norm": 0.2801995910893233, + "learning_rate": 7.593939203965894e-06, + "loss": 0.1186, + "step": 4114 + }, + { + "epoch": 0.35, + "grad_norm": 0.3548840279446117, + "learning_rate": 7.592772502799763e-06, + "loss": 0.0726, + "step": 4115 + }, + { + "epoch": 0.35, + "grad_norm": 0.41259025342836914, + "learning_rate": 7.591605608508834e-06, + "loss": 0.1342, + "step": 4116 + }, + { + "epoch": 0.35, + "grad_norm": 0.33476649294004124, + "learning_rate": 7.590438521180021e-06, + "loss": 0.0493, + "step": 4117 + }, + { + "epoch": 0.35, + "grad_norm": 0.3191818976659879, + "learning_rate": 7.589271240900258e-06, + "loss": 0.065, + "step": 4118 + }, + { + "epoch": 0.35, + "grad_norm": 0.3365665200953937, + "learning_rate": 7.5881037677564886e-06, + "loss": 0.0693, + "step": 4119 + }, + { + "epoch": 0.35, + "grad_norm": 0.3216148159309258, + "learning_rate": 7.586936101835672e-06, + "loss": 0.0837, + "step": 4120 + }, + { + "epoch": 0.35, + "grad_norm": 0.3098741407220086, + "learning_rate": 7.585768243224787e-06, + "loss": 0.1195, + "step": 4121 + }, + { + "epoch": 0.35, + "grad_norm": 0.2511957876148925, + "learning_rate": 7.584600192010818e-06, + "loss": 0.0707, + "step": 4122 + }, + { + "epoch": 0.35, + "grad_norm": 0.3403499917598926, + "learning_rate": 7.5834319482807715e-06, + "loss": 0.1108, + "step": 4123 + }, + { + "epoch": 0.35, + "grad_norm": 0.45511733623025663, + "learning_rate": 7.582263512121664e-06, + "loss": 0.0957, + "step": 4124 + }, + { + "epoch": 0.35, + "grad_norm": 0.3459484540901208, + "learning_rate": 7.5810948836205265e-06, + "loss": 0.0973, + "step": 4125 + }, + { + "epoch": 0.35, + "grad_norm": 0.3043133381206861, + "learning_rate": 7.579926062864406e-06, + "loss": 0.0492, + "step": 4126 + }, + { + "epoch": 0.35, + "grad_norm": 0.2938463178085005, + "learning_rate": 7.578757049940364e-06, + "loss": 0.1048, + "step": 4127 + }, + { + "epoch": 0.35, + "grad_norm": 0.27691768795903104, + "learning_rate": 7.577587844935475e-06, + "loss": 0.0684, + "step": 4128 + }, + { + "epoch": 0.35, + "grad_norm": 0.21021584985902717, + "learning_rate": 7.576418447936827e-06, + "loss": 0.0363, + "step": 4129 + }, + { + "epoch": 0.35, + "grad_norm": 0.30641278176073256, + "learning_rate": 7.575248859031523e-06, + "loss": 0.0807, + "step": 4130 + }, + { + "epoch": 0.35, + "grad_norm": 0.36701046998657166, + "learning_rate": 7.574079078306684e-06, + "loss": 0.1126, + "step": 4131 + }, + { + "epoch": 0.35, + "grad_norm": 0.43908391053022533, + "learning_rate": 7.57290910584944e-06, + "loss": 0.1269, + "step": 4132 + }, + { + "epoch": 0.35, + "grad_norm": 0.44113010548333725, + "learning_rate": 7.571738941746935e-06, + "loss": 0.1378, + "step": 4133 + }, + { + "epoch": 0.35, + "grad_norm": 0.5012581841078975, + "learning_rate": 7.570568586086336e-06, + "loss": 0.1007, + "step": 4134 + }, + { + "epoch": 0.35, + "grad_norm": 0.40304205665830417, + "learning_rate": 7.569398038954812e-06, + "loss": 0.0718, + "step": 4135 + }, + { + "epoch": 0.35, + "grad_norm": 0.2966017866581755, + "learning_rate": 7.568227300439554e-06, + "loss": 0.0842, + "step": 4136 + }, + { + "epoch": 0.35, + "grad_norm": 0.5568593402814396, + "learning_rate": 7.567056370627766e-06, + "loss": 0.1111, + "step": 4137 + }, + { + "epoch": 0.35, + "grad_norm": 0.26139722336636106, + "learning_rate": 7.565885249606666e-06, + "loss": 0.0727, + "step": 4138 + }, + { + "epoch": 0.35, + "grad_norm": 0.7530181094940503, + "learning_rate": 7.5647139374634835e-06, + "loss": 0.0765, + "step": 4139 + }, + { + "epoch": 0.35, + "grad_norm": 0.3267124310184364, + "learning_rate": 7.563542434285467e-06, + "loss": 0.0776, + "step": 4140 + }, + { + "epoch": 0.35, + "grad_norm": 0.8516435835676295, + "learning_rate": 7.562370740159877e-06, + "loss": 0.0573, + "step": 4141 + }, + { + "epoch": 0.35, + "grad_norm": 0.4116861578214954, + "learning_rate": 7.561198855173988e-06, + "loss": 0.0894, + "step": 4142 + }, + { + "epoch": 0.35, + "grad_norm": 0.5749317862054534, + "learning_rate": 7.560026779415086e-06, + "loss": 0.0985, + "step": 4143 + }, + { + "epoch": 0.35, + "grad_norm": 0.31547999819508743, + "learning_rate": 7.558854512970477e-06, + "loss": 0.1059, + "step": 4144 + }, + { + "epoch": 0.35, + "grad_norm": 0.41689606429873405, + "learning_rate": 7.557682055927478e-06, + "loss": 0.1119, + "step": 4145 + }, + { + "epoch": 0.35, + "grad_norm": 0.4386684940280412, + "learning_rate": 7.55650940837342e-06, + "loss": 0.067, + "step": 4146 + }, + { + "epoch": 0.35, + "grad_norm": 0.397457858997808, + "learning_rate": 7.555336570395648e-06, + "loss": 0.0872, + "step": 4147 + }, + { + "epoch": 0.35, + "grad_norm": 0.37713801683202114, + "learning_rate": 7.554163542081523e-06, + "loss": 0.1082, + "step": 4148 + }, + { + "epoch": 0.35, + "grad_norm": 0.3098048010208587, + "learning_rate": 7.552990323518417e-06, + "loss": 0.061, + "step": 4149 + }, + { + "epoch": 0.35, + "grad_norm": 0.37481142438343534, + "learning_rate": 7.551816914793721e-06, + "loss": 0.0884, + "step": 4150 + }, + { + "epoch": 0.35, + "grad_norm": 0.23284921278655388, + "learning_rate": 7.550643315994834e-06, + "loss": 0.0543, + "step": 4151 + }, + { + "epoch": 0.35, + "grad_norm": 0.4292008093071825, + "learning_rate": 7.549469527209175e-06, + "loss": 0.1323, + "step": 4152 + }, + { + "epoch": 0.35, + "grad_norm": 0.3233260079947718, + "learning_rate": 7.548295548524175e-06, + "loss": 0.0974, + "step": 4153 + }, + { + "epoch": 0.35, + "grad_norm": 0.4350542027997086, + "learning_rate": 7.5471213800272755e-06, + "loss": 0.0448, + "step": 4154 + }, + { + "epoch": 0.35, + "grad_norm": 0.38890532705287545, + "learning_rate": 7.545947021805939e-06, + "loss": 0.092, + "step": 4155 + }, + { + "epoch": 0.35, + "grad_norm": 0.24481828590691174, + "learning_rate": 7.5447724739476365e-06, + "loss": 0.0522, + "step": 4156 + }, + { + "epoch": 0.35, + "grad_norm": 0.3660009746965549, + "learning_rate": 7.543597736539856e-06, + "loss": 0.0807, + "step": 4157 + }, + { + "epoch": 0.35, + "grad_norm": 0.4262702730060066, + "learning_rate": 7.542422809670099e-06, + "loss": 0.1323, + "step": 4158 + }, + { + "epoch": 0.35, + "grad_norm": 0.24779812890943945, + "learning_rate": 7.54124769342588e-06, + "loss": 0.0762, + "step": 4159 + }, + { + "epoch": 0.35, + "grad_norm": 0.34417899737674845, + "learning_rate": 7.540072387894729e-06, + "loss": 0.1008, + "step": 4160 + }, + { + "epoch": 0.35, + "grad_norm": 0.2562297954923462, + "learning_rate": 7.538896893164189e-06, + "loss": 0.0764, + "step": 4161 + }, + { + "epoch": 0.35, + "grad_norm": 0.22245925678477532, + "learning_rate": 7.5377212093218175e-06, + "loss": 0.0767, + "step": 4162 + }, + { + "epoch": 0.35, + "grad_norm": 0.34927215054126065, + "learning_rate": 7.53654533645519e-06, + "loss": 0.1014, + "step": 4163 + }, + { + "epoch": 0.35, + "grad_norm": 0.33852001573023194, + "learning_rate": 7.535369274651887e-06, + "loss": 0.1089, + "step": 4164 + }, + { + "epoch": 0.35, + "grad_norm": 0.3606040794537733, + "learning_rate": 7.534193023999512e-06, + "loss": 0.0818, + "step": 4165 + }, + { + "epoch": 0.35, + "grad_norm": 0.2653663237601913, + "learning_rate": 7.533016584585676e-06, + "loss": 0.0723, + "step": 4166 + }, + { + "epoch": 0.35, + "grad_norm": 0.23641807673459653, + "learning_rate": 7.531839956498011e-06, + "loss": 0.0932, + "step": 4167 + }, + { + "epoch": 0.35, + "grad_norm": 0.2910779870897051, + "learning_rate": 7.5306631398241546e-06, + "loss": 0.0717, + "step": 4168 + }, + { + "epoch": 0.35, + "grad_norm": 0.2876788304586991, + "learning_rate": 7.529486134651767e-06, + "loss": 0.0909, + "step": 4169 + }, + { + "epoch": 0.35, + "grad_norm": 0.4687252274879929, + "learning_rate": 7.528308941068517e-06, + "loss": 0.1216, + "step": 4170 + }, + { + "epoch": 0.35, + "grad_norm": 0.40151991465859976, + "learning_rate": 7.5271315591620875e-06, + "loss": 0.087, + "step": 4171 + }, + { + "epoch": 0.35, + "grad_norm": 0.32262277655879124, + "learning_rate": 7.525953989020179e-06, + "loss": 0.1034, + "step": 4172 + }, + { + "epoch": 0.35, + "grad_norm": 0.2137020105433216, + "learning_rate": 7.5247762307305014e-06, + "loss": 0.0558, + "step": 4173 + }, + { + "epoch": 0.35, + "grad_norm": 0.6679533827417082, + "learning_rate": 7.523598284380784e-06, + "loss": 0.152, + "step": 4174 + }, + { + "epoch": 0.35, + "grad_norm": 0.2506628711585966, + "learning_rate": 7.5224201500587636e-06, + "loss": 0.0871, + "step": 4175 + }, + { + "epoch": 0.35, + "grad_norm": 0.28187695699823245, + "learning_rate": 7.521241827852197e-06, + "loss": 0.0985, + "step": 4176 + }, + { + "epoch": 0.35, + "grad_norm": 0.36163269822089655, + "learning_rate": 7.5200633178488516e-06, + "loss": 0.07, + "step": 4177 + }, + { + "epoch": 0.35, + "grad_norm": 0.40897357867534684, + "learning_rate": 7.518884620136511e-06, + "loss": 0.0713, + "step": 4178 + }, + { + "epoch": 0.35, + "grad_norm": 0.26324978767307883, + "learning_rate": 7.5177057348029685e-06, + "loss": 0.0911, + "step": 4179 + }, + { + "epoch": 0.35, + "grad_norm": 0.34255479554418916, + "learning_rate": 7.516526661936037e-06, + "loss": 0.0737, + "step": 4180 + }, + { + "epoch": 0.35, + "grad_norm": 0.3927609344767061, + "learning_rate": 7.515347401623538e-06, + "loss": 0.0673, + "step": 4181 + }, + { + "epoch": 0.35, + "grad_norm": 0.3154516670476979, + "learning_rate": 7.514167953953315e-06, + "loss": 0.0429, + "step": 4182 + }, + { + "epoch": 0.35, + "grad_norm": 0.39799054981395265, + "learning_rate": 7.512988319013215e-06, + "loss": 0.0885, + "step": 4183 + }, + { + "epoch": 0.35, + "grad_norm": 0.2823373720810965, + "learning_rate": 7.511808496891105e-06, + "loss": 0.0797, + "step": 4184 + }, + { + "epoch": 0.35, + "grad_norm": 0.25413186905328616, + "learning_rate": 7.510628487674866e-06, + "loss": 0.0943, + "step": 4185 + }, + { + "epoch": 0.35, + "grad_norm": 0.5179136387412128, + "learning_rate": 7.509448291452392e-06, + "loss": 0.1069, + "step": 4186 + }, + { + "epoch": 0.35, + "grad_norm": 0.22786725529163349, + "learning_rate": 7.50826790831159e-06, + "loss": 0.0449, + "step": 4187 + }, + { + "epoch": 0.35, + "grad_norm": 0.44288703961664744, + "learning_rate": 7.5070873383403845e-06, + "loss": 0.1184, + "step": 4188 + }, + { + "epoch": 0.35, + "grad_norm": 0.5032653617230661, + "learning_rate": 7.5059065816267074e-06, + "loss": 0.1053, + "step": 4189 + }, + { + "epoch": 0.35, + "grad_norm": 0.4173607717122207, + "learning_rate": 7.50472563825851e-06, + "loss": 0.0983, + "step": 4190 + }, + { + "epoch": 0.35, + "grad_norm": 0.6177052969813429, + "learning_rate": 7.503544508323757e-06, + "loss": 0.1335, + "step": 4191 + }, + { + "epoch": 0.35, + "grad_norm": 0.3419605783288976, + "learning_rate": 7.502363191910424e-06, + "loss": 0.085, + "step": 4192 + }, + { + "epoch": 0.35, + "grad_norm": 0.38536667607078806, + "learning_rate": 7.501181689106504e-06, + "loss": 0.083, + "step": 4193 + }, + { + "epoch": 0.35, + "grad_norm": 0.382908617202299, + "learning_rate": 7.500000000000001e-06, + "loss": 0.1015, + "step": 4194 + }, + { + "epoch": 0.35, + "grad_norm": 0.2793272672166173, + "learning_rate": 7.4988181246789335e-06, + "loss": 0.0706, + "step": 4195 + }, + { + "epoch": 0.35, + "grad_norm": 0.3085585836213908, + "learning_rate": 7.497636063231336e-06, + "loss": 0.0896, + "step": 4196 + }, + { + "epoch": 0.35, + "grad_norm": 0.4486469496712699, + "learning_rate": 7.496453815745255e-06, + "loss": 0.0776, + "step": 4197 + }, + { + "epoch": 0.35, + "grad_norm": 0.4723629341062975, + "learning_rate": 7.495271382308751e-06, + "loss": 0.1314, + "step": 4198 + }, + { + "epoch": 0.35, + "grad_norm": 0.3067639473574543, + "learning_rate": 7.494088763009899e-06, + "loss": 0.0961, + "step": 4199 + }, + { + "epoch": 0.35, + "grad_norm": 0.30803491687027423, + "learning_rate": 7.492905957936785e-06, + "loss": 0.0957, + "step": 4200 + }, + { + "epoch": 0.35, + "grad_norm": 0.30490982915499015, + "learning_rate": 7.491722967177515e-06, + "loss": 0.0973, + "step": 4201 + }, + { + "epoch": 0.35, + "grad_norm": 0.3602679208411932, + "learning_rate": 7.490539790820202e-06, + "loss": 0.0892, + "step": 4202 + }, + { + "epoch": 0.35, + "grad_norm": 0.3453827824664803, + "learning_rate": 7.489356428952975e-06, + "loss": 0.0817, + "step": 4203 + }, + { + "epoch": 0.35, + "grad_norm": 0.3200416659825833, + "learning_rate": 7.488172881663982e-06, + "loss": 0.0572, + "step": 4204 + }, + { + "epoch": 0.35, + "grad_norm": 0.4568010529951095, + "learning_rate": 7.486989149041377e-06, + "loss": 0.1035, + "step": 4205 + }, + { + "epoch": 0.35, + "grad_norm": 0.49987374517798855, + "learning_rate": 7.485805231173334e-06, + "loss": 0.0895, + "step": 4206 + }, + { + "epoch": 0.35, + "grad_norm": 0.4525307472605828, + "learning_rate": 7.484621128148035e-06, + "loss": 0.0923, + "step": 4207 + }, + { + "epoch": 0.35, + "grad_norm": 0.271804623528407, + "learning_rate": 7.483436840053682e-06, + "loss": 0.0587, + "step": 4208 + }, + { + "epoch": 0.35, + "grad_norm": 0.29785231539836793, + "learning_rate": 7.482252366978484e-06, + "loss": 0.0713, + "step": 4209 + }, + { + "epoch": 0.35, + "grad_norm": 0.1617458282059297, + "learning_rate": 7.481067709010671e-06, + "loss": 0.0488, + "step": 4210 + }, + { + "epoch": 0.35, + "grad_norm": 0.43792823878787984, + "learning_rate": 7.479882866238482e-06, + "loss": 0.0983, + "step": 4211 + }, + { + "epoch": 0.35, + "grad_norm": 0.21414884785832047, + "learning_rate": 7.47869783875017e-06, + "loss": 0.0666, + "step": 4212 + }, + { + "epoch": 0.35, + "grad_norm": 0.27697379653093857, + "learning_rate": 7.477512626634002e-06, + "loss": 0.0701, + "step": 4213 + }, + { + "epoch": 0.36, + "grad_norm": 0.4521924906831223, + "learning_rate": 7.476327229978263e-06, + "loss": 0.0977, + "step": 4214 + }, + { + "epoch": 0.36, + "grad_norm": 0.3222599408085639, + "learning_rate": 7.4751416488712455e-06, + "loss": 0.1198, + "step": 4215 + }, + { + "epoch": 0.36, + "grad_norm": 0.2772283639148759, + "learning_rate": 7.473955883401261e-06, + "loss": 0.0602, + "step": 4216 + }, + { + "epoch": 0.36, + "grad_norm": 0.2596234663136592, + "learning_rate": 7.4727699336566275e-06, + "loss": 0.0611, + "step": 4217 + }, + { + "epoch": 0.36, + "grad_norm": 0.3653756763184814, + "learning_rate": 7.471583799725686e-06, + "loss": 0.0911, + "step": 4218 + }, + { + "epoch": 0.36, + "grad_norm": 0.3515642998130048, + "learning_rate": 7.470397481696784e-06, + "loss": 0.1062, + "step": 4219 + }, + { + "epoch": 0.36, + "grad_norm": 0.28273921880289354, + "learning_rate": 7.469210979658287e-06, + "loss": 0.0743, + "step": 4220 + }, + { + "epoch": 0.36, + "grad_norm": 0.23584351591631172, + "learning_rate": 7.468024293698573e-06, + "loss": 0.0712, + "step": 4221 + }, + { + "epoch": 0.36, + "grad_norm": 0.2438092906701349, + "learning_rate": 7.466837423906031e-06, + "loss": 0.0823, + "step": 4222 + }, + { + "epoch": 0.36, + "grad_norm": 0.2979226629421547, + "learning_rate": 7.465650370369069e-06, + "loss": 0.0757, + "step": 4223 + }, + { + "epoch": 0.36, + "grad_norm": 0.36829834848447557, + "learning_rate": 7.464463133176101e-06, + "loss": 0.0895, + "step": 4224 + }, + { + "epoch": 0.36, + "grad_norm": 0.7047535146889492, + "learning_rate": 7.463275712415564e-06, + "loss": 0.134, + "step": 4225 + }, + { + "epoch": 0.36, + "grad_norm": 0.6487316211899284, + "learning_rate": 7.462088108175902e-06, + "loss": 0.1576, + "step": 4226 + }, + { + "epoch": 0.36, + "grad_norm": 0.3763779670163988, + "learning_rate": 7.460900320545575e-06, + "loss": 0.0957, + "step": 4227 + }, + { + "epoch": 0.36, + "grad_norm": 0.317205523517601, + "learning_rate": 7.459712349613058e-06, + "loss": 0.0517, + "step": 4228 + }, + { + "epoch": 0.36, + "grad_norm": 0.3522019042803247, + "learning_rate": 7.458524195466834e-06, + "loss": 0.0896, + "step": 4229 + }, + { + "epoch": 0.36, + "grad_norm": 0.290961807312452, + "learning_rate": 7.457335858195408e-06, + "loss": 0.0848, + "step": 4230 + }, + { + "epoch": 0.36, + "grad_norm": 0.22823859281522973, + "learning_rate": 7.4561473378872905e-06, + "loss": 0.0655, + "step": 4231 + }, + { + "epoch": 0.36, + "grad_norm": 0.15120987780472853, + "learning_rate": 7.454958634631011e-06, + "loss": 0.0255, + "step": 4232 + }, + { + "epoch": 0.36, + "grad_norm": 0.4257866707613998, + "learning_rate": 7.4537697485151126e-06, + "loss": 0.084, + "step": 4233 + }, + { + "epoch": 0.36, + "grad_norm": 0.4059194992600163, + "learning_rate": 7.452580679628149e-06, + "loss": 0.0864, + "step": 4234 + }, + { + "epoch": 0.36, + "grad_norm": 0.4420201394157988, + "learning_rate": 7.4513914280586885e-06, + "loss": 0.109, + "step": 4235 + }, + { + "epoch": 0.36, + "grad_norm": 0.38939669023136964, + "learning_rate": 7.450201993895314e-06, + "loss": 0.0983, + "step": 4236 + }, + { + "epoch": 0.36, + "grad_norm": 0.37014027955169737, + "learning_rate": 7.449012377226621e-06, + "loss": 0.0963, + "step": 4237 + }, + { + "epoch": 0.36, + "grad_norm": 0.25991086053387835, + "learning_rate": 7.44782257814122e-06, + "loss": 0.0527, + "step": 4238 + }, + { + "epoch": 0.36, + "grad_norm": 0.30986282707186885, + "learning_rate": 7.446632596727734e-06, + "loss": 0.1031, + "step": 4239 + }, + { + "epoch": 0.36, + "grad_norm": 0.4952863843892052, + "learning_rate": 7.4454424330748e-06, + "loss": 0.0638, + "step": 4240 + }, + { + "epoch": 0.36, + "grad_norm": 0.24812752302684063, + "learning_rate": 7.444252087271067e-06, + "loss": 0.0633, + "step": 4241 + }, + { + "epoch": 0.36, + "grad_norm": 0.3790164482487376, + "learning_rate": 7.4430615594052e-06, + "loss": 0.0655, + "step": 4242 + }, + { + "epoch": 0.36, + "grad_norm": 0.2841771283441297, + "learning_rate": 7.441870849565875e-06, + "loss": 0.0684, + "step": 4243 + }, + { + "epoch": 0.36, + "grad_norm": 0.5213306147747089, + "learning_rate": 7.440679957841786e-06, + "loss": 0.1346, + "step": 4244 + }, + { + "epoch": 0.36, + "grad_norm": 0.3833954140037391, + "learning_rate": 7.439488884321635e-06, + "loss": 0.0853, + "step": 4245 + }, + { + "epoch": 0.36, + "grad_norm": 0.28636090140439124, + "learning_rate": 7.438297629094141e-06, + "loss": 0.071, + "step": 4246 + }, + { + "epoch": 0.36, + "grad_norm": 0.260601557035118, + "learning_rate": 7.437106192248036e-06, + "loss": 0.0804, + "step": 4247 + }, + { + "epoch": 0.36, + "grad_norm": 0.18756272795851933, + "learning_rate": 7.435914573872064e-06, + "loss": 0.043, + "step": 4248 + }, + { + "epoch": 0.36, + "grad_norm": 0.25927868537453286, + "learning_rate": 7.434722774054983e-06, + "loss": 0.0789, + "step": 4249 + }, + { + "epoch": 0.36, + "grad_norm": 0.4349795420956213, + "learning_rate": 7.433530792885568e-06, + "loss": 0.1236, + "step": 4250 + }, + { + "epoch": 0.36, + "grad_norm": 0.3836609119259851, + "learning_rate": 7.4323386304526024e-06, + "loss": 0.0997, + "step": 4251 + }, + { + "epoch": 0.36, + "grad_norm": 0.5399210676146076, + "learning_rate": 7.431146286844888e-06, + "loss": 0.1203, + "step": 4252 + }, + { + "epoch": 0.36, + "grad_norm": 0.42750565960692893, + "learning_rate": 7.429953762151234e-06, + "loss": 0.0996, + "step": 4253 + }, + { + "epoch": 0.36, + "grad_norm": 0.24300432508148984, + "learning_rate": 7.4287610564604675e-06, + "loss": 0.0654, + "step": 4254 + }, + { + "epoch": 0.36, + "grad_norm": 0.2445315404402922, + "learning_rate": 7.427568169861431e-06, + "loss": 0.0731, + "step": 4255 + }, + { + "epoch": 0.36, + "grad_norm": 0.3099650758809224, + "learning_rate": 7.426375102442974e-06, + "loss": 0.0811, + "step": 4256 + }, + { + "epoch": 0.36, + "grad_norm": 0.27910099420005163, + "learning_rate": 7.4251818542939654e-06, + "loss": 0.0848, + "step": 4257 + }, + { + "epoch": 0.36, + "grad_norm": 0.4491837183840561, + "learning_rate": 7.423988425503284e-06, + "loss": 0.1483, + "step": 4258 + }, + { + "epoch": 0.36, + "grad_norm": 0.3619488778756981, + "learning_rate": 7.422794816159824e-06, + "loss": 0.0577, + "step": 4259 + }, + { + "epoch": 0.36, + "grad_norm": 0.2378925630263716, + "learning_rate": 7.421601026352494e-06, + "loss": 0.0605, + "step": 4260 + }, + { + "epoch": 0.36, + "grad_norm": 0.38285524554311845, + "learning_rate": 7.42040705617021e-06, + "loss": 0.1405, + "step": 4261 + }, + { + "epoch": 0.36, + "grad_norm": 0.4694541219337153, + "learning_rate": 7.419212905701911e-06, + "loss": 0.0977, + "step": 4262 + }, + { + "epoch": 0.36, + "grad_norm": 0.34160608264501197, + "learning_rate": 7.418018575036539e-06, + "loss": 0.102, + "step": 4263 + }, + { + "epoch": 0.36, + "grad_norm": 0.2919471505769069, + "learning_rate": 7.41682406426306e-06, + "loss": 0.0541, + "step": 4264 + }, + { + "epoch": 0.36, + "grad_norm": 0.3673163091884878, + "learning_rate": 7.415629373470446e-06, + "loss": 0.1073, + "step": 4265 + }, + { + "epoch": 0.36, + "grad_norm": 0.2537527465018049, + "learning_rate": 7.414434502747682e-06, + "loss": 0.0745, + "step": 4266 + }, + { + "epoch": 0.36, + "grad_norm": 0.5081120227541285, + "learning_rate": 7.413239452183772e-06, + "loss": 0.1233, + "step": 4267 + }, + { + "epoch": 0.36, + "grad_norm": 0.2717066177084876, + "learning_rate": 7.412044221867729e-06, + "loss": 0.06, + "step": 4268 + }, + { + "epoch": 0.36, + "grad_norm": 0.32611557622984544, + "learning_rate": 7.410848811888582e-06, + "loss": 0.1074, + "step": 4269 + }, + { + "epoch": 0.36, + "grad_norm": 0.5820155301713758, + "learning_rate": 7.409653222335371e-06, + "loss": 0.1007, + "step": 4270 + }, + { + "epoch": 0.36, + "grad_norm": 0.38054317632231843, + "learning_rate": 7.408457453297151e-06, + "loss": 0.0993, + "step": 4271 + }, + { + "epoch": 0.36, + "grad_norm": 0.3344910941148318, + "learning_rate": 7.407261504862989e-06, + "loss": 0.0951, + "step": 4272 + }, + { + "epoch": 0.36, + "grad_norm": 0.2990307399820234, + "learning_rate": 7.406065377121967e-06, + "loss": 0.0638, + "step": 4273 + }, + { + "epoch": 0.36, + "grad_norm": 0.3346970639531175, + "learning_rate": 7.404869070163181e-06, + "loss": 0.1017, + "step": 4274 + }, + { + "epoch": 0.36, + "grad_norm": 0.7723632660353381, + "learning_rate": 7.403672584075734e-06, + "loss": 0.1693, + "step": 4275 + }, + { + "epoch": 0.36, + "grad_norm": 0.25662223130860173, + "learning_rate": 7.402475918948753e-06, + "loss": 0.0633, + "step": 4276 + }, + { + "epoch": 0.36, + "grad_norm": 0.26781687018007666, + "learning_rate": 7.401279074871369e-06, + "loss": 0.0642, + "step": 4277 + }, + { + "epoch": 0.36, + "grad_norm": 0.2852689940981586, + "learning_rate": 7.400082051932732e-06, + "loss": 0.1095, + "step": 4278 + }, + { + "epoch": 0.36, + "grad_norm": 0.33201343511126774, + "learning_rate": 7.398884850222001e-06, + "loss": 0.0961, + "step": 4279 + }, + { + "epoch": 0.36, + "grad_norm": 0.2800210007439544, + "learning_rate": 7.397687469828353e-06, + "loss": 0.0562, + "step": 4280 + }, + { + "epoch": 0.36, + "grad_norm": 0.26111695759027387, + "learning_rate": 7.3964899108409736e-06, + "loss": 0.0699, + "step": 4281 + }, + { + "epoch": 0.36, + "grad_norm": 0.33090038162953533, + "learning_rate": 7.395292173349067e-06, + "loss": 0.0861, + "step": 4282 + }, + { + "epoch": 0.36, + "grad_norm": 0.2377455497978928, + "learning_rate": 7.394094257441843e-06, + "loss": 0.0788, + "step": 4283 + }, + { + "epoch": 0.36, + "grad_norm": 0.1911629932700685, + "learning_rate": 7.392896163208534e-06, + "loss": 0.0626, + "step": 4284 + }, + { + "epoch": 0.36, + "grad_norm": 0.32592342801297475, + "learning_rate": 7.391697890738379e-06, + "loss": 0.0989, + "step": 4285 + }, + { + "epoch": 0.36, + "grad_norm": 0.43014910282404156, + "learning_rate": 7.3904994401206305e-06, + "loss": 0.0727, + "step": 4286 + }, + { + "epoch": 0.36, + "grad_norm": 0.32603578617581186, + "learning_rate": 7.3893008114445595e-06, + "loss": 0.0859, + "step": 4287 + }, + { + "epoch": 0.36, + "grad_norm": 0.47791746023235065, + "learning_rate": 7.388102004799445e-06, + "loss": 0.102, + "step": 4288 + }, + { + "epoch": 0.36, + "grad_norm": 0.2215172017050562, + "learning_rate": 7.3869030202745826e-06, + "loss": 0.0843, + "step": 4289 + }, + { + "epoch": 0.36, + "grad_norm": 0.205709306705733, + "learning_rate": 7.385703857959277e-06, + "loss": 0.0528, + "step": 4290 + }, + { + "epoch": 0.36, + "grad_norm": 0.26121843163748204, + "learning_rate": 7.384504517942852e-06, + "loss": 0.0758, + "step": 4291 + }, + { + "epoch": 0.36, + "grad_norm": 0.20342928259837592, + "learning_rate": 7.383305000314638e-06, + "loss": 0.0403, + "step": 4292 + }, + { + "epoch": 0.36, + "grad_norm": 0.4642664675965222, + "learning_rate": 7.382105305163985e-06, + "loss": 0.131, + "step": 4293 + }, + { + "epoch": 0.36, + "grad_norm": 0.31492577233828084, + "learning_rate": 7.380905432580251e-06, + "loss": 0.1056, + "step": 4294 + }, + { + "epoch": 0.36, + "grad_norm": 0.30987995677540575, + "learning_rate": 7.379705382652811e-06, + "loss": 0.0901, + "step": 4295 + }, + { + "epoch": 0.36, + "grad_norm": 0.45601195233350356, + "learning_rate": 7.378505155471052e-06, + "loss": 0.0946, + "step": 4296 + }, + { + "epoch": 0.36, + "grad_norm": 0.2612468015373667, + "learning_rate": 7.377304751124372e-06, + "loss": 0.0659, + "step": 4297 + }, + { + "epoch": 0.36, + "grad_norm": 0.37997070327040067, + "learning_rate": 7.376104169702186e-06, + "loss": 0.0845, + "step": 4298 + }, + { + "epoch": 0.36, + "grad_norm": 0.340355623409111, + "learning_rate": 7.374903411293919e-06, + "loss": 0.1022, + "step": 4299 + }, + { + "epoch": 0.36, + "grad_norm": 0.19625677991097032, + "learning_rate": 7.373702475989011e-06, + "loss": 0.0554, + "step": 4300 + }, + { + "epoch": 0.36, + "grad_norm": 0.28547970499009795, + "learning_rate": 7.372501363876916e-06, + "loss": 0.0672, + "step": 4301 + }, + { + "epoch": 0.36, + "grad_norm": 0.478412425951132, + "learning_rate": 7.371300075047097e-06, + "loss": 0.155, + "step": 4302 + }, + { + "epoch": 0.36, + "grad_norm": 0.40425070243568867, + "learning_rate": 7.370098609589037e-06, + "loss": 0.1026, + "step": 4303 + }, + { + "epoch": 0.36, + "grad_norm": 0.22174731517684848, + "learning_rate": 7.368896967592224e-06, + "loss": 0.0521, + "step": 4304 + }, + { + "epoch": 0.36, + "grad_norm": 0.3201767814085296, + "learning_rate": 7.367695149146165e-06, + "loss": 0.0784, + "step": 4305 + }, + { + "epoch": 0.36, + "grad_norm": 0.4657728150406824, + "learning_rate": 7.3664931543403786e-06, + "loss": 0.0697, + "step": 4306 + }, + { + "epoch": 0.36, + "grad_norm": 0.3392540445899379, + "learning_rate": 7.365290983264396e-06, + "loss": 0.09, + "step": 4307 + }, + { + "epoch": 0.36, + "grad_norm": 0.466388277451937, + "learning_rate": 7.364088636007763e-06, + "loss": 0.1246, + "step": 4308 + }, + { + "epoch": 0.36, + "grad_norm": 0.3090290866025594, + "learning_rate": 7.362886112660036e-06, + "loss": 0.0554, + "step": 4309 + }, + { + "epoch": 0.36, + "grad_norm": 0.2535047477503763, + "learning_rate": 7.3616834133107874e-06, + "loss": 0.0997, + "step": 4310 + }, + { + "epoch": 0.36, + "grad_norm": 0.35809566285902594, + "learning_rate": 7.3604805380495984e-06, + "loss": 0.0873, + "step": 4311 + }, + { + "epoch": 0.36, + "grad_norm": 0.27652032160288487, + "learning_rate": 7.35927748696607e-06, + "loss": 0.0862, + "step": 4312 + }, + { + "epoch": 0.36, + "grad_norm": 0.3537699966785868, + "learning_rate": 7.358074260149809e-06, + "loss": 0.0737, + "step": 4313 + }, + { + "epoch": 0.36, + "grad_norm": 0.35875827955352657, + "learning_rate": 7.356870857690441e-06, + "loss": 0.0882, + "step": 4314 + }, + { + "epoch": 0.36, + "grad_norm": 0.2705893899795068, + "learning_rate": 7.355667279677602e-06, + "loss": 0.1007, + "step": 4315 + }, + { + "epoch": 0.36, + "grad_norm": 0.2438026333145513, + "learning_rate": 7.354463526200941e-06, + "loss": 0.0842, + "step": 4316 + }, + { + "epoch": 0.36, + "grad_norm": 0.3616875225960766, + "learning_rate": 7.3532595973501205e-06, + "loss": 0.0677, + "step": 4317 + }, + { + "epoch": 0.36, + "grad_norm": 0.4095175612627275, + "learning_rate": 7.352055493214818e-06, + "loss": 0.0612, + "step": 4318 + }, + { + "epoch": 0.36, + "grad_norm": 0.2719831860414376, + "learning_rate": 7.3508512138847185e-06, + "loss": 0.0916, + "step": 4319 + }, + { + "epoch": 0.36, + "grad_norm": 0.27334555564793167, + "learning_rate": 7.349646759449527e-06, + "loss": 0.0695, + "step": 4320 + }, + { + "epoch": 0.36, + "grad_norm": 0.24949931427990701, + "learning_rate": 7.348442129998956e-06, + "loss": 0.0705, + "step": 4321 + }, + { + "epoch": 0.36, + "grad_norm": 0.6742848721421911, + "learning_rate": 7.3472373256227356e-06, + "loss": 0.1002, + "step": 4322 + }, + { + "epoch": 0.36, + "grad_norm": 0.28512955865307865, + "learning_rate": 7.346032346410605e-06, + "loss": 0.0665, + "step": 4323 + }, + { + "epoch": 0.36, + "grad_norm": 0.2801798710924794, + "learning_rate": 7.34482719245232e-06, + "loss": 0.0844, + "step": 4324 + }, + { + "epoch": 0.36, + "grad_norm": 0.2343678444683059, + "learning_rate": 7.343621863837644e-06, + "loss": 0.0839, + "step": 4325 + }, + { + "epoch": 0.36, + "grad_norm": 0.4005533387618653, + "learning_rate": 7.34241636065636e-06, + "loss": 0.1362, + "step": 4326 + }, + { + "epoch": 0.36, + "grad_norm": 0.26963481204494405, + "learning_rate": 7.341210682998261e-06, + "loss": 0.0931, + "step": 4327 + }, + { + "epoch": 0.36, + "grad_norm": 0.42451083675964074, + "learning_rate": 7.340004830953149e-06, + "loss": 0.126, + "step": 4328 + }, + { + "epoch": 0.36, + "grad_norm": 0.3573707140213417, + "learning_rate": 7.338798804610847e-06, + "loss": 0.055, + "step": 4329 + }, + { + "epoch": 0.36, + "grad_norm": 0.32194781214678536, + "learning_rate": 7.337592604061187e-06, + "loss": 0.0811, + "step": 4330 + }, + { + "epoch": 0.36, + "grad_norm": 0.29655329621819715, + "learning_rate": 7.336386229394011e-06, + "loss": 0.0784, + "step": 4331 + }, + { + "epoch": 0.37, + "grad_norm": 0.21850906753868482, + "learning_rate": 7.335179680699178e-06, + "loss": 0.0612, + "step": 4332 + }, + { + "epoch": 0.37, + "grad_norm": 0.49927903593591255, + "learning_rate": 7.33397295806656e-06, + "loss": 0.1366, + "step": 4333 + }, + { + "epoch": 0.37, + "grad_norm": 0.3577708260538142, + "learning_rate": 7.3327660615860375e-06, + "loss": 0.0484, + "step": 4334 + }, + { + "epoch": 0.37, + "grad_norm": 0.4454095433442848, + "learning_rate": 7.331558991347512e-06, + "loss": 0.1086, + "step": 4335 + }, + { + "epoch": 0.37, + "grad_norm": 0.23999068555625297, + "learning_rate": 7.330351747440888e-06, + "loss": 0.0809, + "step": 4336 + }, + { + "epoch": 0.37, + "grad_norm": 0.4103516917991739, + "learning_rate": 7.329144329956093e-06, + "loss": 0.1051, + "step": 4337 + }, + { + "epoch": 0.37, + "grad_norm": 0.4162759438458064, + "learning_rate": 7.327936738983058e-06, + "loss": 0.1078, + "step": 4338 + }, + { + "epoch": 0.37, + "grad_norm": 0.3275332949992294, + "learning_rate": 7.326728974611736e-06, + "loss": 0.0884, + "step": 4339 + }, + { + "epoch": 0.37, + "grad_norm": 0.28251117058098785, + "learning_rate": 7.325521036932083e-06, + "loss": 0.0728, + "step": 4340 + }, + { + "epoch": 0.37, + "grad_norm": 0.18909284090936726, + "learning_rate": 7.324312926034076e-06, + "loss": 0.0597, + "step": 4341 + }, + { + "epoch": 0.37, + "grad_norm": 0.3738375139878811, + "learning_rate": 7.323104642007704e-06, + "loss": 0.1018, + "step": 4342 + }, + { + "epoch": 0.37, + "grad_norm": 0.19427880971931988, + "learning_rate": 7.3218961849429635e-06, + "loss": 0.0508, + "step": 4343 + }, + { + "epoch": 0.37, + "grad_norm": 0.40532333884198357, + "learning_rate": 7.320687554929871e-06, + "loss": 0.1016, + "step": 4344 + }, + { + "epoch": 0.37, + "grad_norm": 0.3229727574160323, + "learning_rate": 7.319478752058449e-06, + "loss": 0.0726, + "step": 4345 + }, + { + "epoch": 0.37, + "grad_norm": 0.38166721496828687, + "learning_rate": 7.318269776418738e-06, + "loss": 0.0945, + "step": 4346 + }, + { + "epoch": 0.37, + "grad_norm": 0.3453011602507455, + "learning_rate": 7.31706062810079e-06, + "loss": 0.0768, + "step": 4347 + }, + { + "epoch": 0.37, + "grad_norm": 0.45076466141969534, + "learning_rate": 7.315851307194668e-06, + "loss": 0.0914, + "step": 4348 + }, + { + "epoch": 0.37, + "grad_norm": 0.6466746095245473, + "learning_rate": 7.314641813790449e-06, + "loss": 0.1331, + "step": 4349 + }, + { + "epoch": 0.37, + "grad_norm": 0.3392762434734487, + "learning_rate": 7.313432147978225e-06, + "loss": 0.0998, + "step": 4350 + }, + { + "epoch": 0.37, + "grad_norm": 0.27683348733956964, + "learning_rate": 7.312222309848096e-06, + "loss": 0.0758, + "step": 4351 + }, + { + "epoch": 0.37, + "grad_norm": 0.3197989311735069, + "learning_rate": 7.311012299490182e-06, + "loss": 0.1033, + "step": 4352 + }, + { + "epoch": 0.37, + "grad_norm": 0.49980893745225385, + "learning_rate": 7.309802116994608e-06, + "loss": 0.0809, + "step": 4353 + }, + { + "epoch": 0.37, + "grad_norm": 0.5765369405183879, + "learning_rate": 7.308591762451518e-06, + "loss": 0.139, + "step": 4354 + }, + { + "epoch": 0.37, + "grad_norm": 0.30240568795392647, + "learning_rate": 7.307381235951063e-06, + "loss": 0.0495, + "step": 4355 + }, + { + "epoch": 0.37, + "grad_norm": 0.27540500074467517, + "learning_rate": 7.306170537583413e-06, + "loss": 0.0934, + "step": 4356 + }, + { + "epoch": 0.37, + "grad_norm": 0.335836855952277, + "learning_rate": 7.304959667438747e-06, + "loss": 0.0723, + "step": 4357 + }, + { + "epoch": 0.37, + "grad_norm": 0.23020509653818272, + "learning_rate": 7.303748625607256e-06, + "loss": 0.0607, + "step": 4358 + }, + { + "epoch": 0.37, + "grad_norm": 0.21433700953282805, + "learning_rate": 7.302537412179151e-06, + "loss": 0.066, + "step": 4359 + }, + { + "epoch": 0.37, + "grad_norm": 0.3642182633304163, + "learning_rate": 7.301326027244643e-06, + "loss": 0.1029, + "step": 4360 + }, + { + "epoch": 0.37, + "grad_norm": 0.2882530126919763, + "learning_rate": 7.300114470893968e-06, + "loss": 0.0857, + "step": 4361 + }, + { + "epoch": 0.37, + "grad_norm": 0.20496859589516553, + "learning_rate": 7.2989027432173665e-06, + "loss": 0.0479, + "step": 4362 + }, + { + "epoch": 0.37, + "grad_norm": 0.20590779088106023, + "learning_rate": 7.297690844305096e-06, + "loss": 0.0578, + "step": 4363 + }, + { + "epoch": 0.37, + "grad_norm": 1.077340475566252, + "learning_rate": 7.296478774247428e-06, + "loss": 0.1395, + "step": 4364 + }, + { + "epoch": 0.37, + "grad_norm": 0.2673874528426199, + "learning_rate": 7.2952665331346426e-06, + "loss": 0.0687, + "step": 4365 + }, + { + "epoch": 0.37, + "grad_norm": 0.2576477308017653, + "learning_rate": 7.294054121057036e-06, + "loss": 0.0744, + "step": 4366 + }, + { + "epoch": 0.37, + "grad_norm": 0.46834896153213434, + "learning_rate": 7.292841538104913e-06, + "loss": 0.13, + "step": 4367 + }, + { + "epoch": 0.37, + "grad_norm": 0.17731293448854304, + "learning_rate": 7.291628784368595e-06, + "loss": 0.0616, + "step": 4368 + }, + { + "epoch": 0.37, + "grad_norm": 0.32469769551803657, + "learning_rate": 7.2904158599384165e-06, + "loss": 0.114, + "step": 4369 + }, + { + "epoch": 0.37, + "grad_norm": 0.3390900253002126, + "learning_rate": 7.289202764904722e-06, + "loss": 0.0889, + "step": 4370 + }, + { + "epoch": 0.37, + "grad_norm": 0.33692052488196417, + "learning_rate": 7.28798949935787e-06, + "loss": 0.08, + "step": 4371 + }, + { + "epoch": 0.37, + "grad_norm": 0.35201374552178705, + "learning_rate": 7.286776063388231e-06, + "loss": 0.0831, + "step": 4372 + }, + { + "epoch": 0.37, + "grad_norm": 0.2824729277993885, + "learning_rate": 7.2855624570861905e-06, + "loss": 0.074, + "step": 4373 + }, + { + "epoch": 0.37, + "grad_norm": 0.3707127187786412, + "learning_rate": 7.284348680542144e-06, + "loss": 0.0881, + "step": 4374 + }, + { + "epoch": 0.37, + "grad_norm": 0.26636636851544493, + "learning_rate": 7.283134733846499e-06, + "loss": 0.0862, + "step": 4375 + }, + { + "epoch": 0.37, + "grad_norm": 0.31642570045095875, + "learning_rate": 7.28192061708968e-06, + "loss": 0.0648, + "step": 4376 + }, + { + "epoch": 0.37, + "grad_norm": 0.2592228677425772, + "learning_rate": 7.280706330362119e-06, + "loss": 0.0848, + "step": 4377 + }, + { + "epoch": 0.37, + "grad_norm": 0.3719727459868276, + "learning_rate": 7.279491873754265e-06, + "loss": 0.1168, + "step": 4378 + }, + { + "epoch": 0.37, + "grad_norm": 0.3843527606252072, + "learning_rate": 7.278277247356577e-06, + "loss": 0.0999, + "step": 4379 + }, + { + "epoch": 0.37, + "grad_norm": 0.2809575807692699, + "learning_rate": 7.277062451259529e-06, + "loss": 0.0909, + "step": 4380 + }, + { + "epoch": 0.37, + "grad_norm": 0.28417691026021047, + "learning_rate": 7.275847485553602e-06, + "loss": 0.0695, + "step": 4381 + }, + { + "epoch": 0.37, + "grad_norm": 0.40211499905870285, + "learning_rate": 7.274632350329299e-06, + "loss": 0.1088, + "step": 4382 + }, + { + "epoch": 0.37, + "grad_norm": 0.1991233353544238, + "learning_rate": 7.273417045677127e-06, + "loss": 0.0838, + "step": 4383 + }, + { + "epoch": 0.37, + "grad_norm": 0.2734156955822557, + "learning_rate": 7.272201571687611e-06, + "loss": 0.0824, + "step": 4384 + }, + { + "epoch": 0.37, + "grad_norm": 0.3403644439128153, + "learning_rate": 7.270985928451283e-06, + "loss": 0.0688, + "step": 4385 + }, + { + "epoch": 0.37, + "grad_norm": 0.34701787925141103, + "learning_rate": 7.269770116058695e-06, + "loss": 0.0793, + "step": 4386 + }, + { + "epoch": 0.37, + "grad_norm": 0.23436470402478968, + "learning_rate": 7.268554134600407e-06, + "loss": 0.0519, + "step": 4387 + }, + { + "epoch": 0.37, + "grad_norm": 0.35342647884045747, + "learning_rate": 7.267337984166991e-06, + "loss": 0.0695, + "step": 4388 + }, + { + "epoch": 0.37, + "grad_norm": 0.22537326706406283, + "learning_rate": 7.266121664849033e-06, + "loss": 0.0695, + "step": 4389 + }, + { + "epoch": 0.37, + "grad_norm": 0.39240502785855536, + "learning_rate": 7.264905176737134e-06, + "loss": 0.0934, + "step": 4390 + }, + { + "epoch": 0.37, + "grad_norm": 0.43820444685264803, + "learning_rate": 7.263688519921901e-06, + "loss": 0.1176, + "step": 4391 + }, + { + "epoch": 0.37, + "grad_norm": 0.3230419627260889, + "learning_rate": 7.262471694493961e-06, + "loss": 0.107, + "step": 4392 + }, + { + "epoch": 0.37, + "grad_norm": 0.23885547011906072, + "learning_rate": 7.261254700543951e-06, + "loss": 0.1052, + "step": 4393 + }, + { + "epoch": 0.37, + "grad_norm": 0.22362323047075436, + "learning_rate": 7.2600375381625155e-06, + "loss": 0.0578, + "step": 4394 + }, + { + "epoch": 0.37, + "grad_norm": 0.4307300132716547, + "learning_rate": 7.25882020744032e-06, + "loss": 0.1062, + "step": 4395 + }, + { + "epoch": 0.37, + "grad_norm": 0.37070712588669785, + "learning_rate": 7.257602708468036e-06, + "loss": 0.083, + "step": 4396 + }, + { + "epoch": 0.37, + "grad_norm": 0.25962095105683713, + "learning_rate": 7.256385041336352e-06, + "loss": 0.0461, + "step": 4397 + }, + { + "epoch": 0.37, + "grad_norm": 0.29170078736808197, + "learning_rate": 7.255167206135964e-06, + "loss": 0.0813, + "step": 4398 + }, + { + "epoch": 0.37, + "grad_norm": 0.5257765925315406, + "learning_rate": 7.2539492029575865e-06, + "loss": 0.1292, + "step": 4399 + }, + { + "epoch": 0.37, + "grad_norm": 0.3304763287260301, + "learning_rate": 7.252731031891943e-06, + "loss": 0.0971, + "step": 4400 + }, + { + "epoch": 0.37, + "grad_norm": 0.3190937303575025, + "learning_rate": 7.251512693029767e-06, + "loss": 0.0786, + "step": 4401 + }, + { + "epoch": 0.37, + "grad_norm": 0.29075318897506036, + "learning_rate": 7.250294186461811e-06, + "loss": 0.0771, + "step": 4402 + }, + { + "epoch": 0.37, + "grad_norm": 0.315261714070033, + "learning_rate": 7.249075512278835e-06, + "loss": 0.0906, + "step": 4403 + }, + { + "epoch": 0.37, + "grad_norm": 0.25880370681209053, + "learning_rate": 7.247856670571613e-06, + "loss": 0.0672, + "step": 4404 + }, + { + "epoch": 0.37, + "grad_norm": 0.24168920872624683, + "learning_rate": 7.246637661430931e-06, + "loss": 0.0814, + "step": 4405 + }, + { + "epoch": 0.37, + "grad_norm": 0.24182136866013298, + "learning_rate": 7.245418484947588e-06, + "loss": 0.0587, + "step": 4406 + }, + { + "epoch": 0.37, + "grad_norm": 0.2524587110529705, + "learning_rate": 7.244199141212397e-06, + "loss": 0.0464, + "step": 4407 + }, + { + "epoch": 0.37, + "grad_norm": 0.29652535553248915, + "learning_rate": 7.242979630316178e-06, + "loss": 0.0886, + "step": 4408 + }, + { + "epoch": 0.37, + "grad_norm": 0.3655872846260919, + "learning_rate": 7.2417599523497715e-06, + "loss": 0.0736, + "step": 4409 + }, + { + "epoch": 0.37, + "grad_norm": 0.5003253098559661, + "learning_rate": 7.240540107404026e-06, + "loss": 0.1424, + "step": 4410 + }, + { + "epoch": 0.37, + "grad_norm": 0.24440258021406092, + "learning_rate": 7.239320095569798e-06, + "loss": 0.0711, + "step": 4411 + }, + { + "epoch": 0.37, + "grad_norm": 0.29849592555062504, + "learning_rate": 7.238099916937967e-06, + "loss": 0.0678, + "step": 4412 + }, + { + "epoch": 0.37, + "grad_norm": 0.1933788462165639, + "learning_rate": 7.236879571599415e-06, + "loss": 0.0603, + "step": 4413 + }, + { + "epoch": 0.37, + "grad_norm": 0.2793616599522201, + "learning_rate": 7.2356590596450425e-06, + "loss": 0.0849, + "step": 4414 + }, + { + "epoch": 0.37, + "grad_norm": 0.44358257401033724, + "learning_rate": 7.234438381165759e-06, + "loss": 0.1118, + "step": 4415 + }, + { + "epoch": 0.37, + "grad_norm": 0.32619148622601785, + "learning_rate": 7.233217536252489e-06, + "loss": 0.0557, + "step": 4416 + }, + { + "epoch": 0.37, + "grad_norm": 0.38300263098741666, + "learning_rate": 7.231996524996167e-06, + "loss": 0.1245, + "step": 4417 + }, + { + "epoch": 0.37, + "grad_norm": 0.32762409360761097, + "learning_rate": 7.230775347487742e-06, + "loss": 0.0933, + "step": 4418 + }, + { + "epoch": 0.37, + "grad_norm": 0.3291932808459943, + "learning_rate": 7.229554003818172e-06, + "loss": 0.0731, + "step": 4419 + }, + { + "epoch": 0.37, + "grad_norm": 0.4863714235244281, + "learning_rate": 7.228332494078434e-06, + "loss": 0.0798, + "step": 4420 + }, + { + "epoch": 0.37, + "grad_norm": 0.24056915976906, + "learning_rate": 7.227110818359509e-06, + "loss": 0.053, + "step": 4421 + }, + { + "epoch": 0.37, + "grad_norm": 0.36118812232274367, + "learning_rate": 7.225888976752398e-06, + "loss": 0.0513, + "step": 4422 + }, + { + "epoch": 0.37, + "grad_norm": 0.4606595015375231, + "learning_rate": 7.224666969348107e-06, + "loss": 0.1252, + "step": 4423 + }, + { + "epoch": 0.37, + "grad_norm": 0.38433818350830407, + "learning_rate": 7.223444796237663e-06, + "loss": 0.085, + "step": 4424 + }, + { + "epoch": 0.37, + "grad_norm": 0.31759313152307744, + "learning_rate": 7.2222224575120955e-06, + "loss": 0.0989, + "step": 4425 + }, + { + "epoch": 0.37, + "grad_norm": 0.4305259783160022, + "learning_rate": 7.220999953262455e-06, + "loss": 0.0931, + "step": 4426 + }, + { + "epoch": 0.37, + "grad_norm": 0.3379742170354169, + "learning_rate": 7.219777283579801e-06, + "loss": 0.0802, + "step": 4427 + }, + { + "epoch": 0.37, + "grad_norm": 0.33606016540800654, + "learning_rate": 7.218554448555203e-06, + "loss": 0.0734, + "step": 4428 + }, + { + "epoch": 0.37, + "grad_norm": 0.19773440207354814, + "learning_rate": 7.217331448279745e-06, + "loss": 0.0371, + "step": 4429 + }, + { + "epoch": 0.37, + "grad_norm": 0.23918815767867072, + "learning_rate": 7.216108282844525e-06, + "loss": 0.0604, + "step": 4430 + }, + { + "epoch": 0.37, + "grad_norm": 0.4463802928039175, + "learning_rate": 7.2148849523406485e-06, + "loss": 0.0807, + "step": 4431 + }, + { + "epoch": 0.37, + "grad_norm": 0.1840464037980593, + "learning_rate": 7.21366145685924e-06, + "loss": 0.0557, + "step": 4432 + }, + { + "epoch": 0.37, + "grad_norm": 0.24062627832840017, + "learning_rate": 7.212437796491429e-06, + "loss": 0.0638, + "step": 4433 + }, + { + "epoch": 0.37, + "grad_norm": 0.23466465731384345, + "learning_rate": 7.211213971328364e-06, + "loss": 0.0638, + "step": 4434 + }, + { + "epoch": 0.37, + "grad_norm": 0.2004013942249298, + "learning_rate": 7.2099899814612005e-06, + "loss": 0.0502, + "step": 4435 + }, + { + "epoch": 0.37, + "grad_norm": 0.3551902406961578, + "learning_rate": 7.208765826981109e-06, + "loss": 0.101, + "step": 4436 + }, + { + "epoch": 0.37, + "grad_norm": 0.2925379078798644, + "learning_rate": 7.2075415079792725e-06, + "loss": 0.0796, + "step": 4437 + }, + { + "epoch": 0.37, + "grad_norm": 0.24888825117853458, + "learning_rate": 7.2063170245468824e-06, + "loss": 0.0873, + "step": 4438 + }, + { + "epoch": 0.37, + "grad_norm": 0.628007136146665, + "learning_rate": 7.205092376775149e-06, + "loss": 0.1327, + "step": 4439 + }, + { + "epoch": 0.37, + "grad_norm": 0.3756592008196272, + "learning_rate": 7.203867564755289e-06, + "loss": 0.0739, + "step": 4440 + }, + { + "epoch": 0.37, + "grad_norm": 0.3217638584536869, + "learning_rate": 7.202642588578536e-06, + "loss": 0.1105, + "step": 4441 + }, + { + "epoch": 0.37, + "grad_norm": 0.47164003380577274, + "learning_rate": 7.201417448336131e-06, + "loss": 0.0904, + "step": 4442 + }, + { + "epoch": 0.37, + "grad_norm": 0.3271624543576845, + "learning_rate": 7.200192144119329e-06, + "loss": 0.1075, + "step": 4443 + }, + { + "epoch": 0.37, + "grad_norm": 0.38859696516958403, + "learning_rate": 7.198966676019401e-06, + "loss": 0.1192, + "step": 4444 + }, + { + "epoch": 0.37, + "grad_norm": 0.4714428443026246, + "learning_rate": 7.197741044127623e-06, + "loss": 0.0718, + "step": 4445 + }, + { + "epoch": 0.37, + "grad_norm": 0.30058585516599096, + "learning_rate": 7.196515248535292e-06, + "loss": 0.0974, + "step": 4446 + }, + { + "epoch": 0.37, + "grad_norm": 0.45191510979623795, + "learning_rate": 7.195289289333706e-06, + "loss": 0.0863, + "step": 4447 + }, + { + "epoch": 0.37, + "grad_norm": 0.23339346480264778, + "learning_rate": 7.194063166614188e-06, + "loss": 0.0802, + "step": 4448 + }, + { + "epoch": 0.37, + "grad_norm": 0.24143093250850814, + "learning_rate": 7.1928368804680636e-06, + "loss": 0.0328, + "step": 4449 + }, + { + "epoch": 0.37, + "grad_norm": 0.3800915660306654, + "learning_rate": 7.191610430986673e-06, + "loss": 0.1163, + "step": 4450 + }, + { + "epoch": 0.38, + "grad_norm": 0.3815436742845, + "learning_rate": 7.190383818261372e-06, + "loss": 0.1036, + "step": 4451 + }, + { + "epoch": 0.38, + "grad_norm": 0.5924616699354528, + "learning_rate": 7.189157042383523e-06, + "loss": 0.1707, + "step": 4452 + }, + { + "epoch": 0.38, + "grad_norm": 0.40447090231590643, + "learning_rate": 7.187930103444505e-06, + "loss": 0.0697, + "step": 4453 + }, + { + "epoch": 0.38, + "grad_norm": 0.31096012007847545, + "learning_rate": 7.186703001535707e-06, + "loss": 0.1008, + "step": 4454 + }, + { + "epoch": 0.38, + "grad_norm": 0.2956273659078011, + "learning_rate": 7.185475736748532e-06, + "loss": 0.0791, + "step": 4455 + }, + { + "epoch": 0.38, + "grad_norm": 0.2687782417486176, + "learning_rate": 7.184248309174392e-06, + "loss": 0.0347, + "step": 4456 + }, + { + "epoch": 0.38, + "grad_norm": 0.39311432015158854, + "learning_rate": 7.183020718904714e-06, + "loss": 0.0656, + "step": 4457 + }, + { + "epoch": 0.38, + "grad_norm": 0.3604342669181342, + "learning_rate": 7.181792966030936e-06, + "loss": 0.1247, + "step": 4458 + }, + { + "epoch": 0.38, + "grad_norm": 0.32957690369770604, + "learning_rate": 7.180565050644507e-06, + "loss": 0.0655, + "step": 4459 + }, + { + "epoch": 0.38, + "grad_norm": 0.3078306724532581, + "learning_rate": 7.17933697283689e-06, + "loss": 0.0777, + "step": 4460 + }, + { + "epoch": 0.38, + "grad_norm": 0.41934440998114414, + "learning_rate": 7.178108732699563e-06, + "loss": 0.049, + "step": 4461 + }, + { + "epoch": 0.38, + "grad_norm": 0.33856948453393737, + "learning_rate": 7.176880330324006e-06, + "loss": 0.089, + "step": 4462 + }, + { + "epoch": 0.38, + "grad_norm": 0.1978172132743615, + "learning_rate": 7.175651765801721e-06, + "loss": 0.0492, + "step": 4463 + }, + { + "epoch": 0.38, + "grad_norm": 0.3435390649805543, + "learning_rate": 7.174423039224219e-06, + "loss": 0.095, + "step": 4464 + }, + { + "epoch": 0.38, + "grad_norm": 0.18487806934137355, + "learning_rate": 7.173194150683022e-06, + "loss": 0.0384, + "step": 4465 + }, + { + "epoch": 0.38, + "grad_norm": 0.45692108777722396, + "learning_rate": 7.171965100269664e-06, + "loss": 0.1263, + "step": 4466 + }, + { + "epoch": 0.38, + "grad_norm": 0.18543258613937713, + "learning_rate": 7.170735888075694e-06, + "loss": 0.0393, + "step": 4467 + }, + { + "epoch": 0.38, + "grad_norm": 0.29826467306355947, + "learning_rate": 7.16950651419267e-06, + "loss": 0.0503, + "step": 4468 + }, + { + "epoch": 0.38, + "grad_norm": 0.390754866364476, + "learning_rate": 7.1682769787121605e-06, + "loss": 0.1058, + "step": 4469 + }, + { + "epoch": 0.38, + "grad_norm": 0.2529586669336332, + "learning_rate": 7.167047281725752e-06, + "loss": 0.0469, + "step": 4470 + }, + { + "epoch": 0.38, + "grad_norm": 0.27740849612526397, + "learning_rate": 7.165817423325037e-06, + "loss": 0.0678, + "step": 4471 + }, + { + "epoch": 0.38, + "grad_norm": 0.3238305300639644, + "learning_rate": 7.164587403601624e-06, + "loss": 0.0897, + "step": 4472 + }, + { + "epoch": 0.38, + "grad_norm": 0.40256805226178577, + "learning_rate": 7.163357222647132e-06, + "loss": 0.1159, + "step": 4473 + }, + { + "epoch": 0.38, + "grad_norm": 0.27320217497394783, + "learning_rate": 7.162126880553191e-06, + "loss": 0.0703, + "step": 4474 + }, + { + "epoch": 0.38, + "grad_norm": 0.3012757814252815, + "learning_rate": 7.160896377411446e-06, + "loss": 0.0778, + "step": 4475 + }, + { + "epoch": 0.38, + "grad_norm": 0.33930309759192934, + "learning_rate": 7.159665713313549e-06, + "loss": 0.0905, + "step": 4476 + }, + { + "epoch": 0.38, + "grad_norm": 0.43918828338493454, + "learning_rate": 7.158434888351169e-06, + "loss": 0.0934, + "step": 4477 + }, + { + "epoch": 0.38, + "grad_norm": 0.3143226321284182, + "learning_rate": 7.157203902615984e-06, + "loss": 0.1043, + "step": 4478 + }, + { + "epoch": 0.38, + "grad_norm": 0.28527351755703373, + "learning_rate": 7.155972756199688e-06, + "loss": 0.1093, + "step": 4479 + }, + { + "epoch": 0.38, + "grad_norm": 0.24373037037810458, + "learning_rate": 7.154741449193979e-06, + "loss": 0.0503, + "step": 4480 + }, + { + "epoch": 0.38, + "grad_norm": 0.5836879325180363, + "learning_rate": 7.1535099816905765e-06, + "loss": 0.1217, + "step": 4481 + }, + { + "epoch": 0.38, + "grad_norm": 0.3293130545818517, + "learning_rate": 7.152278353781205e-06, + "loss": 0.05, + "step": 4482 + }, + { + "epoch": 0.38, + "grad_norm": 0.40905106472643826, + "learning_rate": 7.151046565557602e-06, + "loss": 0.0788, + "step": 4483 + }, + { + "epoch": 0.38, + "grad_norm": 0.2786103898926736, + "learning_rate": 7.1498146171115234e-06, + "loss": 0.0898, + "step": 4484 + }, + { + "epoch": 0.38, + "grad_norm": 0.23149565052398227, + "learning_rate": 7.148582508534726e-06, + "loss": 0.0727, + "step": 4485 + }, + { + "epoch": 0.38, + "grad_norm": 0.22047645770055957, + "learning_rate": 7.1473502399189875e-06, + "loss": 0.0743, + "step": 4486 + }, + { + "epoch": 0.38, + "grad_norm": 0.5297343025317314, + "learning_rate": 7.146117811356094e-06, + "loss": 0.111, + "step": 4487 + }, + { + "epoch": 0.38, + "grad_norm": 0.30991843567162825, + "learning_rate": 7.144885222937844e-06, + "loss": 0.0933, + "step": 4488 + }, + { + "epoch": 0.38, + "grad_norm": 0.30575832293479766, + "learning_rate": 7.143652474756047e-06, + "loss": 0.0793, + "step": 4489 + }, + { + "epoch": 0.38, + "grad_norm": 0.24623635944148872, + "learning_rate": 7.142419566902528e-06, + "loss": 0.0384, + "step": 4490 + }, + { + "epoch": 0.38, + "grad_norm": 0.3756769459497274, + "learning_rate": 7.141186499469117e-06, + "loss": 0.0846, + "step": 4491 + }, + { + "epoch": 0.38, + "grad_norm": 0.4207607937695023, + "learning_rate": 7.139953272547663e-06, + "loss": 0.1283, + "step": 4492 + }, + { + "epoch": 0.38, + "grad_norm": 0.25843885632434227, + "learning_rate": 7.138719886230022e-06, + "loss": 0.0635, + "step": 4493 + }, + { + "epoch": 0.38, + "grad_norm": 0.28195104279593974, + "learning_rate": 7.137486340608066e-06, + "loss": 0.094, + "step": 4494 + }, + { + "epoch": 0.38, + "grad_norm": 0.28088401620762926, + "learning_rate": 7.1362526357736765e-06, + "loss": 0.1129, + "step": 4495 + }, + { + "epoch": 0.38, + "grad_norm": 0.2892608047474482, + "learning_rate": 7.135018771818744e-06, + "loss": 0.0871, + "step": 4496 + }, + { + "epoch": 0.38, + "grad_norm": 0.5429554132612583, + "learning_rate": 7.133784748835177e-06, + "loss": 0.1375, + "step": 4497 + }, + { + "epoch": 0.38, + "grad_norm": 0.31689206404312115, + "learning_rate": 7.132550566914892e-06, + "loss": 0.0714, + "step": 4498 + }, + { + "epoch": 0.38, + "grad_norm": 0.36989331167756057, + "learning_rate": 7.131316226149818e-06, + "loss": 0.0826, + "step": 4499 + }, + { + "epoch": 0.38, + "grad_norm": 0.32107091480601957, + "learning_rate": 7.130081726631895e-06, + "loss": 0.0713, + "step": 4500 + }, + { + "epoch": 0.38, + "grad_norm": 0.29354001193379936, + "learning_rate": 7.128847068453076e-06, + "loss": 0.0849, + "step": 4501 + }, + { + "epoch": 0.38, + "grad_norm": 0.3026613125015102, + "learning_rate": 7.127612251705326e-06, + "loss": 0.0492, + "step": 4502 + }, + { + "epoch": 0.38, + "grad_norm": 0.3404872637488946, + "learning_rate": 7.126377276480622e-06, + "loss": 0.1008, + "step": 4503 + }, + { + "epoch": 0.38, + "grad_norm": 0.43239200291067, + "learning_rate": 7.125142142870951e-06, + "loss": 0.1408, + "step": 4504 + }, + { + "epoch": 0.38, + "grad_norm": 0.2140232836728838, + "learning_rate": 7.1239068509683126e-06, + "loss": 0.0347, + "step": 4505 + }, + { + "epoch": 0.38, + "grad_norm": 0.5432685958706636, + "learning_rate": 7.12267140086472e-06, + "loss": 0.075, + "step": 4506 + }, + { + "epoch": 0.38, + "grad_norm": 0.40960067802505173, + "learning_rate": 7.121435792652196e-06, + "loss": 0.1218, + "step": 4507 + }, + { + "epoch": 0.38, + "grad_norm": 0.3903609894798665, + "learning_rate": 7.120200026422775e-06, + "loss": 0.0878, + "step": 4508 + }, + { + "epoch": 0.38, + "grad_norm": 0.5993004614364129, + "learning_rate": 7.1189641022685064e-06, + "loss": 0.1654, + "step": 4509 + }, + { + "epoch": 0.38, + "grad_norm": 0.29129755236178356, + "learning_rate": 7.117728020281447e-06, + "loss": 0.064, + "step": 4510 + }, + { + "epoch": 0.38, + "grad_norm": 0.48661698619386207, + "learning_rate": 7.116491780553668e-06, + "loss": 0.1218, + "step": 4511 + }, + { + "epoch": 0.38, + "grad_norm": 0.3990283108548014, + "learning_rate": 7.115255383177252e-06, + "loss": 0.1032, + "step": 4512 + }, + { + "epoch": 0.38, + "grad_norm": 0.26757744786699955, + "learning_rate": 7.114018828244291e-06, + "loss": 0.0943, + "step": 4513 + }, + { + "epoch": 0.38, + "grad_norm": 0.5742791392232209, + "learning_rate": 7.112782115846894e-06, + "loss": 0.1582, + "step": 4514 + }, + { + "epoch": 0.38, + "grad_norm": 0.286803319433409, + "learning_rate": 7.111545246077179e-06, + "loss": 0.0701, + "step": 4515 + }, + { + "epoch": 0.38, + "grad_norm": 0.21214810768790418, + "learning_rate": 7.110308219027271e-06, + "loss": 0.0498, + "step": 4516 + }, + { + "epoch": 0.38, + "grad_norm": 0.3503200773857527, + "learning_rate": 7.109071034789314e-06, + "loss": 0.0654, + "step": 4517 + }, + { + "epoch": 0.38, + "grad_norm": 0.4210993995830312, + "learning_rate": 7.107833693455461e-06, + "loss": 0.1185, + "step": 4518 + }, + { + "epoch": 0.38, + "grad_norm": 0.372882918440158, + "learning_rate": 7.106596195117873e-06, + "loss": 0.0766, + "step": 4519 + }, + { + "epoch": 0.38, + "grad_norm": 0.25138807029501375, + "learning_rate": 7.105358539868731e-06, + "loss": 0.0768, + "step": 4520 + }, + { + "epoch": 0.38, + "grad_norm": 0.33535327460005016, + "learning_rate": 7.104120727800219e-06, + "loss": 0.0756, + "step": 4521 + }, + { + "epoch": 0.38, + "grad_norm": 0.23399303567254817, + "learning_rate": 7.102882759004539e-06, + "loss": 0.0493, + "step": 4522 + }, + { + "epoch": 0.38, + "grad_norm": 0.516439559244914, + "learning_rate": 7.101644633573899e-06, + "loss": 0.125, + "step": 4523 + }, + { + "epoch": 0.38, + "grad_norm": 0.32215312927783285, + "learning_rate": 7.1004063516005265e-06, + "loss": 0.0828, + "step": 4524 + }, + { + "epoch": 0.38, + "grad_norm": 0.34557942017290244, + "learning_rate": 7.09916791317665e-06, + "loss": 0.105, + "step": 4525 + }, + { + "epoch": 0.38, + "grad_norm": 0.4373268497909671, + "learning_rate": 7.097929318394521e-06, + "loss": 0.1342, + "step": 4526 + }, + { + "epoch": 0.38, + "grad_norm": 0.3376293974335842, + "learning_rate": 7.096690567346394e-06, + "loss": 0.0969, + "step": 4527 + }, + { + "epoch": 0.38, + "grad_norm": 0.3187422636359759, + "learning_rate": 7.095451660124541e-06, + "loss": 0.0744, + "step": 4528 + }, + { + "epoch": 0.38, + "grad_norm": 0.2693246089976058, + "learning_rate": 7.094212596821239e-06, + "loss": 0.069, + "step": 4529 + }, + { + "epoch": 0.38, + "grad_norm": 0.4123271220523029, + "learning_rate": 7.092973377528783e-06, + "loss": 0.1032, + "step": 4530 + }, + { + "epoch": 0.38, + "grad_norm": 0.2670640501216721, + "learning_rate": 7.0917340023394795e-06, + "loss": 0.0602, + "step": 4531 + }, + { + "epoch": 0.38, + "grad_norm": 0.19299859310112086, + "learning_rate": 7.090494471345641e-06, + "loss": 0.0679, + "step": 4532 + }, + { + "epoch": 0.38, + "grad_norm": 0.4056009924985113, + "learning_rate": 7.0892547846395975e-06, + "loss": 0.0912, + "step": 4533 + }, + { + "epoch": 0.38, + "grad_norm": 0.24943599537177996, + "learning_rate": 7.088014942313686e-06, + "loss": 0.0966, + "step": 4534 + }, + { + "epoch": 0.38, + "grad_norm": 0.3231146495877769, + "learning_rate": 7.086774944460258e-06, + "loss": 0.0792, + "step": 4535 + }, + { + "epoch": 0.38, + "grad_norm": 0.20979593245268607, + "learning_rate": 7.085534791171676e-06, + "loss": 0.0599, + "step": 4536 + }, + { + "epoch": 0.38, + "grad_norm": 0.40319949068699457, + "learning_rate": 7.084294482540312e-06, + "loss": 0.0929, + "step": 4537 + }, + { + "epoch": 0.38, + "grad_norm": 0.28956685189967385, + "learning_rate": 7.083054018658555e-06, + "loss": 0.0809, + "step": 4538 + }, + { + "epoch": 0.38, + "grad_norm": 0.3446692870375518, + "learning_rate": 7.081813399618799e-06, + "loss": 0.1038, + "step": 4539 + }, + { + "epoch": 0.38, + "grad_norm": 0.38601453513696404, + "learning_rate": 7.080572625513453e-06, + "loss": 0.1035, + "step": 4540 + }, + { + "epoch": 0.38, + "grad_norm": 0.6281817949263598, + "learning_rate": 7.079331696434939e-06, + "loss": 0.149, + "step": 4541 + }, + { + "epoch": 0.38, + "grad_norm": 0.20665617127371372, + "learning_rate": 7.078090612475686e-06, + "loss": 0.0395, + "step": 4542 + }, + { + "epoch": 0.38, + "grad_norm": 0.2790853537886932, + "learning_rate": 7.076849373728141e-06, + "loss": 0.0701, + "step": 4543 + }, + { + "epoch": 0.38, + "grad_norm": 0.302877243489361, + "learning_rate": 7.075607980284753e-06, + "loss": 0.063, + "step": 4544 + }, + { + "epoch": 0.38, + "grad_norm": 0.35800164546769353, + "learning_rate": 7.074366432237993e-06, + "loss": 0.0896, + "step": 4545 + }, + { + "epoch": 0.38, + "grad_norm": 0.2926497949919803, + "learning_rate": 7.073124729680336e-06, + "loss": 0.0911, + "step": 4546 + }, + { + "epoch": 0.38, + "grad_norm": 0.2406824708776851, + "learning_rate": 7.071882872704273e-06, + "loss": 0.0684, + "step": 4547 + }, + { + "epoch": 0.38, + "grad_norm": 0.25559412312241453, + "learning_rate": 7.070640861402305e-06, + "loss": 0.0688, + "step": 4548 + }, + { + "epoch": 0.38, + "grad_norm": 0.5451921562423949, + "learning_rate": 7.069398695866941e-06, + "loss": 0.1338, + "step": 4549 + }, + { + "epoch": 0.38, + "grad_norm": 0.2991058186136316, + "learning_rate": 7.068156376190709e-06, + "loss": 0.0761, + "step": 4550 + }, + { + "epoch": 0.38, + "grad_norm": 0.2907521569292752, + "learning_rate": 7.066913902466142e-06, + "loss": 0.1072, + "step": 4551 + }, + { + "epoch": 0.38, + "grad_norm": 0.25959314747313417, + "learning_rate": 7.065671274785786e-06, + "loss": 0.0848, + "step": 4552 + }, + { + "epoch": 0.38, + "grad_norm": 0.4980979468877188, + "learning_rate": 7.0644284932422e-06, + "loss": 0.0899, + "step": 4553 + }, + { + "epoch": 0.38, + "grad_norm": 0.2815966474336918, + "learning_rate": 7.063185557927954e-06, + "loss": 0.0599, + "step": 4554 + }, + { + "epoch": 0.38, + "grad_norm": 0.29621962435486915, + "learning_rate": 7.061942468935629e-06, + "loss": 0.0804, + "step": 4555 + }, + { + "epoch": 0.38, + "grad_norm": 0.3709539384068646, + "learning_rate": 7.060699226357817e-06, + "loss": 0.1021, + "step": 4556 + }, + { + "epoch": 0.38, + "grad_norm": 0.20716549058724973, + "learning_rate": 7.059455830287122e-06, + "loss": 0.0737, + "step": 4557 + }, + { + "epoch": 0.38, + "grad_norm": 0.266064273887425, + "learning_rate": 7.05821228081616e-06, + "loss": 0.0626, + "step": 4558 + }, + { + "epoch": 0.38, + "grad_norm": 0.2606200649832158, + "learning_rate": 7.056968578037556e-06, + "loss": 0.0519, + "step": 4559 + }, + { + "epoch": 0.38, + "grad_norm": 0.3190087466986366, + "learning_rate": 7.0557247220439514e-06, + "loss": 0.0846, + "step": 4560 + }, + { + "epoch": 0.38, + "grad_norm": 0.2474997357838211, + "learning_rate": 7.0544807129279935e-06, + "loss": 0.0533, + "step": 4561 + }, + { + "epoch": 0.38, + "grad_norm": 0.2813268579343367, + "learning_rate": 7.053236550782343e-06, + "loss": 0.056, + "step": 4562 + }, + { + "epoch": 0.38, + "grad_norm": 0.2038261062017343, + "learning_rate": 7.051992235699674e-06, + "loss": 0.0522, + "step": 4563 + }, + { + "epoch": 0.38, + "grad_norm": 0.45924830398045974, + "learning_rate": 7.050747767772669e-06, + "loss": 0.1049, + "step": 4564 + }, + { + "epoch": 0.38, + "grad_norm": 0.2599416720240521, + "learning_rate": 7.0495031470940246e-06, + "loss": 0.0945, + "step": 4565 + }, + { + "epoch": 0.38, + "grad_norm": 0.26395620351457916, + "learning_rate": 7.048258373756445e-06, + "loss": 0.0557, + "step": 4566 + }, + { + "epoch": 0.38, + "grad_norm": 0.21162293152280376, + "learning_rate": 7.047013447852651e-06, + "loss": 0.0629, + "step": 4567 + }, + { + "epoch": 0.38, + "grad_norm": 0.48992224361938397, + "learning_rate": 7.04576836947537e-06, + "loss": 0.1174, + "step": 4568 + }, + { + "epoch": 0.38, + "grad_norm": 0.38844309636071467, + "learning_rate": 7.044523138717344e-06, + "loss": 0.0927, + "step": 4569 + }, + { + "epoch": 0.39, + "grad_norm": 0.2740228355056441, + "learning_rate": 7.043277755671325e-06, + "loss": 0.0674, + "step": 4570 + }, + { + "epoch": 0.39, + "grad_norm": 0.24087935510242203, + "learning_rate": 7.042032220430074e-06, + "loss": 0.0657, + "step": 4571 + }, + { + "epoch": 0.39, + "grad_norm": 0.32338038399839775, + "learning_rate": 7.040786533086369e-06, + "loss": 0.0821, + "step": 4572 + }, + { + "epoch": 0.39, + "grad_norm": 0.31970494766109964, + "learning_rate": 7.039540693732994e-06, + "loss": 0.0747, + "step": 4573 + }, + { + "epoch": 0.39, + "grad_norm": 0.2981261456014911, + "learning_rate": 7.0382947024627465e-06, + "loss": 0.0943, + "step": 4574 + }, + { + "epoch": 0.39, + "grad_norm": 0.31875198386949877, + "learning_rate": 7.037048559368436e-06, + "loss": 0.0818, + "step": 4575 + }, + { + "epoch": 0.39, + "grad_norm": 0.469074319457, + "learning_rate": 7.035802264542881e-06, + "loss": 0.1001, + "step": 4576 + }, + { + "epoch": 0.39, + "grad_norm": 0.3937155124458543, + "learning_rate": 7.034555818078916e-06, + "loss": 0.0611, + "step": 4577 + }, + { + "epoch": 0.39, + "grad_norm": 0.21983044139229677, + "learning_rate": 7.0333092200693805e-06, + "loss": 0.0573, + "step": 4578 + }, + { + "epoch": 0.39, + "grad_norm": 0.3320530314303248, + "learning_rate": 7.032062470607131e-06, + "loss": 0.1016, + "step": 4579 + }, + { + "epoch": 0.39, + "grad_norm": 0.35522823385943625, + "learning_rate": 7.030815569785028e-06, + "loss": 0.0996, + "step": 4580 + }, + { + "epoch": 0.39, + "grad_norm": 0.2853870682707382, + "learning_rate": 7.029568517695952e-06, + "loss": 0.0828, + "step": 4581 + }, + { + "epoch": 0.39, + "grad_norm": 0.2533469884707315, + "learning_rate": 7.028321314432791e-06, + "loss": 0.0741, + "step": 4582 + }, + { + "epoch": 0.39, + "grad_norm": 0.3460787922153535, + "learning_rate": 7.027073960088442e-06, + "loss": 0.0978, + "step": 4583 + }, + { + "epoch": 0.39, + "grad_norm": 0.23104150361551903, + "learning_rate": 7.025826454755815e-06, + "loss": 0.0694, + "step": 4584 + }, + { + "epoch": 0.39, + "grad_norm": 0.5683729296573715, + "learning_rate": 7.024578798527834e-06, + "loss": 0.1005, + "step": 4585 + }, + { + "epoch": 0.39, + "grad_norm": 0.2313644119344922, + "learning_rate": 7.023330991497429e-06, + "loss": 0.0736, + "step": 4586 + }, + { + "epoch": 0.39, + "grad_norm": 0.24560630355133334, + "learning_rate": 7.0220830337575454e-06, + "loss": 0.0843, + "step": 4587 + }, + { + "epoch": 0.39, + "grad_norm": 0.375194937879267, + "learning_rate": 7.020834925401138e-06, + "loss": 0.1012, + "step": 4588 + }, + { + "epoch": 0.39, + "grad_norm": 0.2802463715205821, + "learning_rate": 7.019586666521172e-06, + "loss": 0.0879, + "step": 4589 + }, + { + "epoch": 0.39, + "grad_norm": 0.3238147135815922, + "learning_rate": 7.018338257210627e-06, + "loss": 0.0814, + "step": 4590 + }, + { + "epoch": 0.39, + "grad_norm": 0.7947401677785902, + "learning_rate": 7.01708969756249e-06, + "loss": 0.1503, + "step": 4591 + }, + { + "epoch": 0.39, + "grad_norm": 0.19029595064928856, + "learning_rate": 7.0158409876697646e-06, + "loss": 0.0717, + "step": 4592 + }, + { + "epoch": 0.39, + "grad_norm": 0.24476472428894042, + "learning_rate": 7.0145921276254555e-06, + "loss": 0.0683, + "step": 4593 + }, + { + "epoch": 0.39, + "grad_norm": 0.25057248777236724, + "learning_rate": 7.013343117522592e-06, + "loss": 0.0787, + "step": 4594 + }, + { + "epoch": 0.39, + "grad_norm": 0.5204783130474905, + "learning_rate": 7.012093957454202e-06, + "loss": 0.0944, + "step": 4595 + }, + { + "epoch": 0.39, + "grad_norm": 0.5037810067795928, + "learning_rate": 7.0108446475133356e-06, + "loss": 0.1523, + "step": 4596 + }, + { + "epoch": 0.39, + "grad_norm": 0.24075357476484868, + "learning_rate": 7.009595187793043e-06, + "loss": 0.0663, + "step": 4597 + }, + { + "epoch": 0.39, + "grad_norm": 0.6943028078535718, + "learning_rate": 7.0083455783863955e-06, + "loss": 0.1712, + "step": 4598 + }, + { + "epoch": 0.39, + "grad_norm": 0.37186329901988213, + "learning_rate": 7.00709581938647e-06, + "loss": 0.0701, + "step": 4599 + }, + { + "epoch": 0.39, + "grad_norm": 0.15270887004180989, + "learning_rate": 7.005845910886355e-06, + "loss": 0.0571, + "step": 4600 + }, + { + "epoch": 0.39, + "grad_norm": 0.4636467211431442, + "learning_rate": 7.004595852979153e-06, + "loss": 0.1013, + "step": 4601 + }, + { + "epoch": 0.39, + "grad_norm": 0.27646831570849695, + "learning_rate": 7.0033456457579715e-06, + "loss": 0.0667, + "step": 4602 + }, + { + "epoch": 0.39, + "grad_norm": 0.3197429005898988, + "learning_rate": 7.002095289315938e-06, + "loss": 0.0797, + "step": 4603 + }, + { + "epoch": 0.39, + "grad_norm": 0.4413457738987146, + "learning_rate": 7.000844783746185e-06, + "loss": 0.1036, + "step": 4604 + }, + { + "epoch": 0.39, + "grad_norm": 0.2531764616328079, + "learning_rate": 6.999594129141854e-06, + "loss": 0.0751, + "step": 4605 + }, + { + "epoch": 0.39, + "grad_norm": 0.15963499278391408, + "learning_rate": 6.998343325596106e-06, + "loss": 0.0578, + "step": 4606 + }, + { + "epoch": 0.39, + "grad_norm": 0.6642626830466372, + "learning_rate": 6.997092373202106e-06, + "loss": 0.1174, + "step": 4607 + }, + { + "epoch": 0.39, + "grad_norm": 0.3708441577381597, + "learning_rate": 6.995841272053031e-06, + "loss": 0.0961, + "step": 4608 + }, + { + "epoch": 0.39, + "grad_norm": 0.40781277400026017, + "learning_rate": 6.994590022242071e-06, + "loss": 0.0953, + "step": 4609 + }, + { + "epoch": 0.39, + "grad_norm": 0.4401928671007144, + "learning_rate": 6.993338623862428e-06, + "loss": 0.1091, + "step": 4610 + }, + { + "epoch": 0.39, + "grad_norm": 0.22573047832048782, + "learning_rate": 6.992087077007312e-06, + "loss": 0.0809, + "step": 4611 + }, + { + "epoch": 0.39, + "grad_norm": 0.37084836568178636, + "learning_rate": 6.990835381769946e-06, + "loss": 0.1166, + "step": 4612 + }, + { + "epoch": 0.39, + "grad_norm": 0.27369582024088684, + "learning_rate": 6.989583538243563e-06, + "loss": 0.0911, + "step": 4613 + }, + { + "epoch": 0.39, + "grad_norm": 0.21588924184149455, + "learning_rate": 6.988331546521408e-06, + "loss": 0.0676, + "step": 4614 + }, + { + "epoch": 0.39, + "grad_norm": 0.3194038077317392, + "learning_rate": 6.987079406696735e-06, + "loss": 0.075, + "step": 4615 + }, + { + "epoch": 0.39, + "grad_norm": 0.18295596406516973, + "learning_rate": 6.9858271188628144e-06, + "loss": 0.039, + "step": 4616 + }, + { + "epoch": 0.39, + "grad_norm": 0.3164521029956316, + "learning_rate": 6.98457468311292e-06, + "loss": 0.106, + "step": 4617 + }, + { + "epoch": 0.39, + "grad_norm": 0.3496827451053549, + "learning_rate": 6.983322099540344e-06, + "loss": 0.1049, + "step": 4618 + }, + { + "epoch": 0.39, + "grad_norm": 0.29804031091719513, + "learning_rate": 6.9820693682383825e-06, + "loss": 0.0629, + "step": 4619 + }, + { + "epoch": 0.39, + "grad_norm": 0.31747830851939596, + "learning_rate": 6.9808164893003496e-06, + "loss": 0.0869, + "step": 4620 + }, + { + "epoch": 0.39, + "grad_norm": 0.4211134714581951, + "learning_rate": 6.979563462819564e-06, + "loss": 0.1098, + "step": 4621 + }, + { + "epoch": 0.39, + "grad_norm": 0.2849165694934356, + "learning_rate": 6.9783102888893615e-06, + "loss": 0.0915, + "step": 4622 + }, + { + "epoch": 0.39, + "grad_norm": 0.4936751434421673, + "learning_rate": 6.977056967603083e-06, + "loss": 0.0908, + "step": 4623 + }, + { + "epoch": 0.39, + "grad_norm": 0.42582998041447295, + "learning_rate": 6.975803499054086e-06, + "loss": 0.1239, + "step": 4624 + }, + { + "epoch": 0.39, + "grad_norm": 0.26669509073853215, + "learning_rate": 6.974549883335731e-06, + "loss": 0.0926, + "step": 4625 + }, + { + "epoch": 0.39, + "grad_norm": 0.19023749707421994, + "learning_rate": 6.9732961205414016e-06, + "loss": 0.053, + "step": 4626 + }, + { + "epoch": 0.39, + "grad_norm": 0.39579521014249525, + "learning_rate": 6.97204221076448e-06, + "loss": 0.1199, + "step": 4627 + }, + { + "epoch": 0.39, + "grad_norm": 0.38001402498911424, + "learning_rate": 6.970788154098368e-06, + "loss": 0.0834, + "step": 4628 + }, + { + "epoch": 0.39, + "grad_norm": 0.35363927317472754, + "learning_rate": 6.969533950636471e-06, + "loss": 0.1259, + "step": 4629 + }, + { + "epoch": 0.39, + "grad_norm": 0.35084582737219505, + "learning_rate": 6.9682796004722145e-06, + "loss": 0.0949, + "step": 4630 + }, + { + "epoch": 0.39, + "grad_norm": 0.30235315681779523, + "learning_rate": 6.967025103699026e-06, + "loss": 0.0793, + "step": 4631 + }, + { + "epoch": 0.39, + "grad_norm": 0.49815176226630214, + "learning_rate": 6.96577046041035e-06, + "loss": 0.0801, + "step": 4632 + }, + { + "epoch": 0.39, + "grad_norm": 0.36047701743173827, + "learning_rate": 6.964515670699638e-06, + "loss": 0.0667, + "step": 4633 + }, + { + "epoch": 0.39, + "grad_norm": 0.5693876994954974, + "learning_rate": 6.9632607346603545e-06, + "loss": 0.1272, + "step": 4634 + }, + { + "epoch": 0.39, + "grad_norm": 0.3551048192312459, + "learning_rate": 6.9620056523859745e-06, + "loss": 0.0987, + "step": 4635 + }, + { + "epoch": 0.39, + "grad_norm": 0.3176765401358284, + "learning_rate": 6.960750423969984e-06, + "loss": 0.0948, + "step": 4636 + }, + { + "epoch": 0.39, + "grad_norm": 0.32885626929970263, + "learning_rate": 6.959495049505881e-06, + "loss": 0.0862, + "step": 4637 + }, + { + "epoch": 0.39, + "grad_norm": 0.33441706535845866, + "learning_rate": 6.958239529087171e-06, + "loss": 0.0904, + "step": 4638 + }, + { + "epoch": 0.39, + "grad_norm": 0.3664674076428767, + "learning_rate": 6.956983862807371e-06, + "loss": 0.1395, + "step": 4639 + }, + { + "epoch": 0.39, + "grad_norm": 0.3643029000004587, + "learning_rate": 6.9557280507600165e-06, + "loss": 0.0944, + "step": 4640 + }, + { + "epoch": 0.39, + "grad_norm": 0.39764525507965365, + "learning_rate": 6.954472093038641e-06, + "loss": 0.1085, + "step": 4641 + }, + { + "epoch": 0.39, + "grad_norm": 0.2462272943416633, + "learning_rate": 6.9532159897368e-06, + "loss": 0.0902, + "step": 4642 + }, + { + "epoch": 0.39, + "grad_norm": 0.6422862977200203, + "learning_rate": 6.951959740948053e-06, + "loss": 0.0977, + "step": 4643 + }, + { + "epoch": 0.39, + "grad_norm": 0.35167146544361116, + "learning_rate": 6.950703346765974e-06, + "loss": 0.061, + "step": 4644 + }, + { + "epoch": 0.39, + "grad_norm": 0.32838269354569666, + "learning_rate": 6.949446807284146e-06, + "loss": 0.0532, + "step": 4645 + }, + { + "epoch": 0.39, + "grad_norm": 0.35395532582212236, + "learning_rate": 6.948190122596162e-06, + "loss": 0.1, + "step": 4646 + }, + { + "epoch": 0.39, + "grad_norm": 0.34899245805070883, + "learning_rate": 6.946933292795631e-06, + "loss": 0.0911, + "step": 4647 + }, + { + "epoch": 0.39, + "grad_norm": 0.6800209985556319, + "learning_rate": 6.945676317976165e-06, + "loss": 0.0903, + "step": 4648 + }, + { + "epoch": 0.39, + "grad_norm": 0.24444936195849173, + "learning_rate": 6.944419198231393e-06, + "loss": 0.0709, + "step": 4649 + }, + { + "epoch": 0.39, + "grad_norm": 0.3572113125000436, + "learning_rate": 6.943161933654952e-06, + "loss": 0.093, + "step": 4650 + }, + { + "epoch": 0.39, + "grad_norm": 0.4835815259553198, + "learning_rate": 6.941904524340489e-06, + "loss": 0.1242, + "step": 4651 + }, + { + "epoch": 0.39, + "grad_norm": 0.3221147294056924, + "learning_rate": 6.940646970381667e-06, + "loss": 0.1001, + "step": 4652 + }, + { + "epoch": 0.39, + "grad_norm": 0.43774700711000897, + "learning_rate": 6.939389271872152e-06, + "loss": 0.1082, + "step": 4653 + }, + { + "epoch": 0.39, + "grad_norm": 0.36203633198312873, + "learning_rate": 6.938131428905628e-06, + "loss": 0.0902, + "step": 4654 + }, + { + "epoch": 0.39, + "grad_norm": 0.2744331205096702, + "learning_rate": 6.936873441575782e-06, + "loss": 0.075, + "step": 4655 + }, + { + "epoch": 0.39, + "grad_norm": 0.26726734788171613, + "learning_rate": 6.935615309976321e-06, + "loss": 0.0493, + "step": 4656 + }, + { + "epoch": 0.39, + "grad_norm": 0.1789680585102601, + "learning_rate": 6.934357034200955e-06, + "loss": 0.0452, + "step": 4657 + }, + { + "epoch": 0.39, + "grad_norm": 0.6429217807588916, + "learning_rate": 6.93309861434341e-06, + "loss": 0.1236, + "step": 4658 + }, + { + "epoch": 0.39, + "grad_norm": 0.4419662043680832, + "learning_rate": 6.931840050497417e-06, + "loss": 0.1288, + "step": 4659 + }, + { + "epoch": 0.39, + "grad_norm": 0.3736932881348034, + "learning_rate": 6.9305813427567245e-06, + "loss": 0.1074, + "step": 4660 + }, + { + "epoch": 0.39, + "grad_norm": 0.35688866018510507, + "learning_rate": 6.929322491215087e-06, + "loss": 0.0931, + "step": 4661 + }, + { + "epoch": 0.39, + "grad_norm": 0.2131496593756044, + "learning_rate": 6.9280634959662715e-06, + "loss": 0.0718, + "step": 4662 + }, + { + "epoch": 0.39, + "grad_norm": 0.235904323384669, + "learning_rate": 6.9268043571040545e-06, + "loss": 0.0737, + "step": 4663 + }, + { + "epoch": 0.39, + "grad_norm": 0.3035957461968339, + "learning_rate": 6.925545074722225e-06, + "loss": 0.0871, + "step": 4664 + }, + { + "epoch": 0.39, + "grad_norm": 0.4489333302941714, + "learning_rate": 6.924285648914581e-06, + "loss": 0.0902, + "step": 4665 + }, + { + "epoch": 0.39, + "grad_norm": 0.19537644175939278, + "learning_rate": 6.923026079774931e-06, + "loss": 0.054, + "step": 4666 + }, + { + "epoch": 0.39, + "grad_norm": 0.32063151889596025, + "learning_rate": 6.921766367397097e-06, + "loss": 0.0573, + "step": 4667 + }, + { + "epoch": 0.39, + "grad_norm": 0.34851712821721464, + "learning_rate": 6.920506511874909e-06, + "loss": 0.0728, + "step": 4668 + }, + { + "epoch": 0.39, + "grad_norm": 0.8600880905101923, + "learning_rate": 6.919246513302208e-06, + "loss": 0.1307, + "step": 4669 + }, + { + "epoch": 0.39, + "grad_norm": 0.2981160393683744, + "learning_rate": 6.917986371772847e-06, + "loss": 0.047, + "step": 4670 + }, + { + "epoch": 0.39, + "grad_norm": 0.26524166829265733, + "learning_rate": 6.916726087380688e-06, + "loss": 0.0594, + "step": 4671 + }, + { + "epoch": 0.39, + "grad_norm": 0.29683952650589285, + "learning_rate": 6.915465660219602e-06, + "loss": 0.0566, + "step": 4672 + }, + { + "epoch": 0.39, + "grad_norm": 0.4313234298898825, + "learning_rate": 6.914205090383477e-06, + "loss": 0.1119, + "step": 4673 + }, + { + "epoch": 0.39, + "grad_norm": 0.23507208346845698, + "learning_rate": 6.912944377966205e-06, + "loss": 0.0736, + "step": 4674 + }, + { + "epoch": 0.39, + "grad_norm": 0.24954692421620062, + "learning_rate": 6.911683523061693e-06, + "loss": 0.0807, + "step": 4675 + }, + { + "epoch": 0.39, + "grad_norm": 0.3524491627346, + "learning_rate": 6.910422525763855e-06, + "loss": 0.0712, + "step": 4676 + }, + { + "epoch": 0.39, + "grad_norm": 0.3509261483595118, + "learning_rate": 6.909161386166618e-06, + "loss": 0.06, + "step": 4677 + }, + { + "epoch": 0.39, + "grad_norm": 0.2331403126488029, + "learning_rate": 6.907900104363921e-06, + "loss": 0.0819, + "step": 4678 + }, + { + "epoch": 0.39, + "grad_norm": 0.3753832734274987, + "learning_rate": 6.906638680449709e-06, + "loss": 0.0831, + "step": 4679 + }, + { + "epoch": 0.39, + "grad_norm": 0.3123251492408427, + "learning_rate": 6.905377114517939e-06, + "loss": 0.076, + "step": 4680 + }, + { + "epoch": 0.39, + "grad_norm": 0.3943890690140778, + "learning_rate": 6.904115406662585e-06, + "loss": 0.1177, + "step": 4681 + }, + { + "epoch": 0.39, + "grad_norm": 0.2512982884076459, + "learning_rate": 6.902853556977623e-06, + "loss": 0.0431, + "step": 4682 + }, + { + "epoch": 0.39, + "grad_norm": 0.5464086302395986, + "learning_rate": 6.90159156555704e-06, + "loss": 0.1304, + "step": 4683 + }, + { + "epoch": 0.39, + "grad_norm": 0.25476762630256355, + "learning_rate": 6.9003294324948424e-06, + "loss": 0.0611, + "step": 4684 + }, + { + "epoch": 0.39, + "grad_norm": 0.4199267752505593, + "learning_rate": 6.8990671578850375e-06, + "loss": 0.11, + "step": 4685 + }, + { + "epoch": 0.39, + "grad_norm": 0.30638006343768687, + "learning_rate": 6.897804741821649e-06, + "loss": 0.0669, + "step": 4686 + }, + { + "epoch": 0.39, + "grad_norm": 0.27595651815173466, + "learning_rate": 6.896542184398706e-06, + "loss": 0.0776, + "step": 4687 + }, + { + "epoch": 0.4, + "grad_norm": 0.38570481456423733, + "learning_rate": 6.895279485710255e-06, + "loss": 0.0939, + "step": 4688 + }, + { + "epoch": 0.4, + "grad_norm": 0.8115734711332021, + "learning_rate": 6.8940166458503445e-06, + "loss": 0.0777, + "step": 4689 + }, + { + "epoch": 0.4, + "grad_norm": 0.23435058375226153, + "learning_rate": 6.892753664913043e-06, + "loss": 0.0477, + "step": 4690 + }, + { + "epoch": 0.4, + "grad_norm": 0.2515452964404833, + "learning_rate": 6.891490542992421e-06, + "loss": 0.0651, + "step": 4691 + }, + { + "epoch": 0.4, + "grad_norm": 0.2996416988695253, + "learning_rate": 6.890227280182566e-06, + "loss": 0.0805, + "step": 4692 + }, + { + "epoch": 0.4, + "grad_norm": 0.43598470353532426, + "learning_rate": 6.88896387657757e-06, + "loss": 0.1007, + "step": 4693 + }, + { + "epoch": 0.4, + "grad_norm": 0.5056086143166686, + "learning_rate": 6.887700332271542e-06, + "loss": 0.1029, + "step": 4694 + }, + { + "epoch": 0.4, + "grad_norm": 0.47204727655141326, + "learning_rate": 6.886436647358595e-06, + "loss": 0.106, + "step": 4695 + }, + { + "epoch": 0.4, + "grad_norm": 0.21778906506820928, + "learning_rate": 6.8851728219328596e-06, + "loss": 0.0563, + "step": 4696 + }, + { + "epoch": 0.4, + "grad_norm": 0.23184222218221479, + "learning_rate": 6.8839088560884686e-06, + "loss": 0.0545, + "step": 4697 + }, + { + "epoch": 0.4, + "grad_norm": 0.28575992476799483, + "learning_rate": 6.8826447499195716e-06, + "loss": 0.0779, + "step": 4698 + }, + { + "epoch": 0.4, + "grad_norm": 0.23144626952205274, + "learning_rate": 6.881380503520326e-06, + "loss": 0.0483, + "step": 4699 + }, + { + "epoch": 0.4, + "grad_norm": 0.4076725202702417, + "learning_rate": 6.880116116984903e-06, + "loss": 0.1182, + "step": 4700 + }, + { + "epoch": 0.4, + "grad_norm": 0.2603814980769807, + "learning_rate": 6.878851590407476e-06, + "loss": 0.0636, + "step": 4701 + }, + { + "epoch": 0.4, + "grad_norm": 0.39781841578384847, + "learning_rate": 6.877586923882239e-06, + "loss": 0.0959, + "step": 4702 + }, + { + "epoch": 0.4, + "grad_norm": 0.4672791986229205, + "learning_rate": 6.87632211750339e-06, + "loss": 0.118, + "step": 4703 + }, + { + "epoch": 0.4, + "grad_norm": 0.2774724526637543, + "learning_rate": 6.875057171365139e-06, + "loss": 0.0601, + "step": 4704 + }, + { + "epoch": 0.4, + "grad_norm": 0.3331048885944759, + "learning_rate": 6.873792085561708e-06, + "loss": 0.0908, + "step": 4705 + }, + { + "epoch": 0.4, + "grad_norm": 0.4808269838372655, + "learning_rate": 6.872526860187325e-06, + "loss": 0.0816, + "step": 4706 + }, + { + "epoch": 0.4, + "grad_norm": 0.2832218047384847, + "learning_rate": 6.871261495336234e-06, + "loss": 0.0793, + "step": 4707 + }, + { + "epoch": 0.4, + "grad_norm": 0.3558092756025122, + "learning_rate": 6.869995991102687e-06, + "loss": 0.0932, + "step": 4708 + }, + { + "epoch": 0.4, + "grad_norm": 0.597405732365022, + "learning_rate": 6.868730347580943e-06, + "loss": 0.1281, + "step": 4709 + }, + { + "epoch": 0.4, + "grad_norm": 0.35925914959064764, + "learning_rate": 6.8674645648652786e-06, + "loss": 0.1169, + "step": 4710 + }, + { + "epoch": 0.4, + "grad_norm": 0.29376236223766683, + "learning_rate": 6.866198643049973e-06, + "loss": 0.0501, + "step": 4711 + }, + { + "epoch": 0.4, + "grad_norm": 0.34245062628754164, + "learning_rate": 6.864932582229322e-06, + "loss": 0.0782, + "step": 4712 + }, + { + "epoch": 0.4, + "grad_norm": 0.3310320703503459, + "learning_rate": 6.863666382497628e-06, + "loss": 0.0968, + "step": 4713 + }, + { + "epoch": 0.4, + "grad_norm": 0.3239906387219654, + "learning_rate": 6.862400043949205e-06, + "loss": 0.0885, + "step": 4714 + }, + { + "epoch": 0.4, + "grad_norm": 0.2830266920446376, + "learning_rate": 6.861133566678379e-06, + "loss": 0.0783, + "step": 4715 + }, + { + "epoch": 0.4, + "grad_norm": 0.2570795726168445, + "learning_rate": 6.85986695077948e-06, + "loss": 0.0615, + "step": 4716 + }, + { + "epoch": 0.4, + "grad_norm": 0.3263729528481848, + "learning_rate": 6.8586001963468584e-06, + "loss": 0.1163, + "step": 4717 + }, + { + "epoch": 0.4, + "grad_norm": 0.28581196559547745, + "learning_rate": 6.857333303474866e-06, + "loss": 0.0767, + "step": 4718 + }, + { + "epoch": 0.4, + "grad_norm": 0.2784226396919977, + "learning_rate": 6.85606627225787e-06, + "loss": 0.0893, + "step": 4719 + }, + { + "epoch": 0.4, + "grad_norm": 0.37994558082634006, + "learning_rate": 6.854799102790245e-06, + "loss": 0.0695, + "step": 4720 + }, + { + "epoch": 0.4, + "grad_norm": 0.2534278134319433, + "learning_rate": 6.853531795166378e-06, + "loss": 0.0593, + "step": 4721 + }, + { + "epoch": 0.4, + "grad_norm": 0.3409146914360919, + "learning_rate": 6.852264349480666e-06, + "loss": 0.0768, + "step": 4722 + }, + { + "epoch": 0.4, + "grad_norm": 0.29256550314214086, + "learning_rate": 6.850996765827513e-06, + "loss": 0.0809, + "step": 4723 + }, + { + "epoch": 0.4, + "grad_norm": 0.16423458387963485, + "learning_rate": 6.849729044301341e-06, + "loss": 0.0554, + "step": 4724 + }, + { + "epoch": 0.4, + "grad_norm": 0.19693369960416923, + "learning_rate": 6.848461184996572e-06, + "loss": 0.048, + "step": 4725 + }, + { + "epoch": 0.4, + "grad_norm": 0.22390850530869377, + "learning_rate": 6.847193188007648e-06, + "loss": 0.0557, + "step": 4726 + }, + { + "epoch": 0.4, + "grad_norm": 0.35208414274819, + "learning_rate": 6.845925053429012e-06, + "loss": 0.1081, + "step": 4727 + }, + { + "epoch": 0.4, + "grad_norm": 0.34675431870085194, + "learning_rate": 6.844656781355127e-06, + "loss": 0.0722, + "step": 4728 + }, + { + "epoch": 0.4, + "grad_norm": 0.7770918014936488, + "learning_rate": 6.8433883718804576e-06, + "loss": 0.1634, + "step": 4729 + }, + { + "epoch": 0.4, + "grad_norm": 0.3428169366256438, + "learning_rate": 6.842119825099484e-06, + "loss": 0.0777, + "step": 4730 + }, + { + "epoch": 0.4, + "grad_norm": 0.4440970287684867, + "learning_rate": 6.8408511411066945e-06, + "loss": 0.131, + "step": 4731 + }, + { + "epoch": 0.4, + "grad_norm": 0.39000181323981087, + "learning_rate": 6.83958231999659e-06, + "loss": 0.1129, + "step": 4732 + }, + { + "epoch": 0.4, + "grad_norm": 0.3463804017090998, + "learning_rate": 6.838313361863675e-06, + "loss": 0.0904, + "step": 4733 + }, + { + "epoch": 0.4, + "grad_norm": 0.4025518594208263, + "learning_rate": 6.837044266802475e-06, + "loss": 0.1163, + "step": 4734 + }, + { + "epoch": 0.4, + "grad_norm": 0.31225765529396166, + "learning_rate": 6.835775034907515e-06, + "loss": 0.0423, + "step": 4735 + }, + { + "epoch": 0.4, + "grad_norm": 0.24187770221824845, + "learning_rate": 6.8345056662733345e-06, + "loss": 0.0823, + "step": 4736 + }, + { + "epoch": 0.4, + "grad_norm": 0.29409086590599975, + "learning_rate": 6.833236160994488e-06, + "loss": 0.0814, + "step": 4737 + }, + { + "epoch": 0.4, + "grad_norm": 0.4236879142949109, + "learning_rate": 6.831966519165531e-06, + "loss": 0.1076, + "step": 4738 + }, + { + "epoch": 0.4, + "grad_norm": 0.32556564861773535, + "learning_rate": 6.830696740881036e-06, + "loss": 0.079, + "step": 4739 + }, + { + "epoch": 0.4, + "grad_norm": 0.2843365415641183, + "learning_rate": 6.8294268262355835e-06, + "loss": 0.0572, + "step": 4740 + }, + { + "epoch": 0.4, + "grad_norm": 0.3185509694895841, + "learning_rate": 6.828156775323763e-06, + "loss": 0.0999, + "step": 4741 + }, + { + "epoch": 0.4, + "grad_norm": 0.3337265724468897, + "learning_rate": 6.826886588240177e-06, + "loss": 0.1151, + "step": 4742 + }, + { + "epoch": 0.4, + "grad_norm": 0.280392680161525, + "learning_rate": 6.825616265079435e-06, + "loss": 0.0806, + "step": 4743 + }, + { + "epoch": 0.4, + "grad_norm": 0.26640941899131354, + "learning_rate": 6.824345805936159e-06, + "loss": 0.1016, + "step": 4744 + }, + { + "epoch": 0.4, + "grad_norm": 0.2886765840287694, + "learning_rate": 6.823075210904979e-06, + "loss": 0.0699, + "step": 4745 + }, + { + "epoch": 0.4, + "grad_norm": 0.3291778913934836, + "learning_rate": 6.821804480080538e-06, + "loss": 0.0838, + "step": 4746 + }, + { + "epoch": 0.4, + "grad_norm": 0.2792069447540215, + "learning_rate": 6.820533613557487e-06, + "loss": 0.074, + "step": 4747 + }, + { + "epoch": 0.4, + "grad_norm": 0.3678678595464938, + "learning_rate": 6.819262611430485e-06, + "loss": 0.0691, + "step": 4748 + }, + { + "epoch": 0.4, + "grad_norm": 0.44540356711689905, + "learning_rate": 6.817991473794207e-06, + "loss": 0.0873, + "step": 4749 + }, + { + "epoch": 0.4, + "grad_norm": 0.28630218956333225, + "learning_rate": 6.816720200743334e-06, + "loss": 0.0597, + "step": 4750 + }, + { + "epoch": 0.4, + "grad_norm": 0.36136892266446796, + "learning_rate": 6.815448792372556e-06, + "loss": 0.0924, + "step": 4751 + }, + { + "epoch": 0.4, + "grad_norm": 0.29983071113530757, + "learning_rate": 6.814177248776578e-06, + "loss": 0.0986, + "step": 4752 + }, + { + "epoch": 0.4, + "grad_norm": 0.6994318226042349, + "learning_rate": 6.812905570050108e-06, + "loss": 0.0643, + "step": 4753 + }, + { + "epoch": 0.4, + "grad_norm": 0.313902942330313, + "learning_rate": 6.811633756287872e-06, + "loss": 0.0593, + "step": 4754 + }, + { + "epoch": 0.4, + "grad_norm": 0.4163270977390757, + "learning_rate": 6.810361807584599e-06, + "loss": 0.0756, + "step": 4755 + }, + { + "epoch": 0.4, + "grad_norm": 0.287752436939843, + "learning_rate": 6.809089724035034e-06, + "loss": 0.0768, + "step": 4756 + }, + { + "epoch": 0.4, + "grad_norm": 0.32523250985216284, + "learning_rate": 6.807817505733926e-06, + "loss": 0.087, + "step": 4757 + }, + { + "epoch": 0.4, + "grad_norm": 0.2858516588349136, + "learning_rate": 6.806545152776039e-06, + "loss": 0.071, + "step": 4758 + }, + { + "epoch": 0.4, + "grad_norm": 0.24995676174002754, + "learning_rate": 6.8052726652561454e-06, + "loss": 0.0695, + "step": 4759 + }, + { + "epoch": 0.4, + "grad_norm": 0.6729513509788052, + "learning_rate": 6.804000043269028e-06, + "loss": 0.1223, + "step": 4760 + }, + { + "epoch": 0.4, + "grad_norm": 0.4392571685416266, + "learning_rate": 6.802727286909477e-06, + "loss": 0.1077, + "step": 4761 + }, + { + "epoch": 0.4, + "grad_norm": 0.21284167256967465, + "learning_rate": 6.8014543962722976e-06, + "loss": 0.0697, + "step": 4762 + }, + { + "epoch": 0.4, + "grad_norm": 0.38992822451407055, + "learning_rate": 6.8001813714523e-06, + "loss": 0.1006, + "step": 4763 + }, + { + "epoch": 0.4, + "grad_norm": 0.15994176847069455, + "learning_rate": 6.798908212544306e-06, + "loss": 0.0295, + "step": 4764 + }, + { + "epoch": 0.4, + "grad_norm": 0.20832386338669084, + "learning_rate": 6.797634919643149e-06, + "loss": 0.0717, + "step": 4765 + }, + { + "epoch": 0.4, + "grad_norm": 0.24471129132564223, + "learning_rate": 6.796361492843673e-06, + "loss": 0.0699, + "step": 4766 + }, + { + "epoch": 0.4, + "grad_norm": 0.35018391640569246, + "learning_rate": 6.795087932240726e-06, + "loss": 0.0686, + "step": 4767 + }, + { + "epoch": 0.4, + "grad_norm": 0.3019177341286526, + "learning_rate": 6.793814237929175e-06, + "loss": 0.0773, + "step": 4768 + }, + { + "epoch": 0.4, + "grad_norm": 0.39358351764995964, + "learning_rate": 6.7925404100038875e-06, + "loss": 0.1116, + "step": 4769 + }, + { + "epoch": 0.4, + "grad_norm": 0.3347301918002869, + "learning_rate": 6.791266448559748e-06, + "loss": 0.101, + "step": 4770 + }, + { + "epoch": 0.4, + "grad_norm": 0.2614639320898981, + "learning_rate": 6.78999235369165e-06, + "loss": 0.061, + "step": 4771 + }, + { + "epoch": 0.4, + "grad_norm": 0.3479392756218102, + "learning_rate": 6.788718125494493e-06, + "loss": 0.0752, + "step": 4772 + }, + { + "epoch": 0.4, + "grad_norm": 0.35653337709757765, + "learning_rate": 6.7874437640631916e-06, + "loss": 0.0966, + "step": 4773 + }, + { + "epoch": 0.4, + "grad_norm": 0.6706222577931181, + "learning_rate": 6.7861692694926654e-06, + "loss": 0.0924, + "step": 4774 + }, + { + "epoch": 0.4, + "grad_norm": 0.28582327592269996, + "learning_rate": 6.784894641877845e-06, + "loss": 0.0827, + "step": 4775 + }, + { + "epoch": 0.4, + "grad_norm": 0.2989640083638522, + "learning_rate": 6.783619881313677e-06, + "loss": 0.0927, + "step": 4776 + }, + { + "epoch": 0.4, + "grad_norm": 0.30277120949898, + "learning_rate": 6.782344987895106e-06, + "loss": 0.0742, + "step": 4777 + }, + { + "epoch": 0.4, + "grad_norm": 0.2969396982049214, + "learning_rate": 6.781069961717101e-06, + "loss": 0.0664, + "step": 4778 + }, + { + "epoch": 0.4, + "grad_norm": 0.2530215928439572, + "learning_rate": 6.779794802874627e-06, + "loss": 0.1033, + "step": 4779 + }, + { + "epoch": 0.4, + "grad_norm": 0.30230400004630836, + "learning_rate": 6.77851951146267e-06, + "loss": 0.0486, + "step": 4780 + }, + { + "epoch": 0.4, + "grad_norm": 0.23455831225433627, + "learning_rate": 6.777244087576218e-06, + "loss": 0.073, + "step": 4781 + }, + { + "epoch": 0.4, + "grad_norm": 0.3704920470617132, + "learning_rate": 6.775968531310273e-06, + "loss": 0.1362, + "step": 4782 + }, + { + "epoch": 0.4, + "grad_norm": 0.3942548124910074, + "learning_rate": 6.774692842759848e-06, + "loss": 0.1314, + "step": 4783 + }, + { + "epoch": 0.4, + "grad_norm": 0.4250598129507288, + "learning_rate": 6.773417022019959e-06, + "loss": 0.1248, + "step": 4784 + }, + { + "epoch": 0.4, + "grad_norm": 0.431864465602126, + "learning_rate": 6.772141069185641e-06, + "loss": 0.1218, + "step": 4785 + }, + { + "epoch": 0.4, + "grad_norm": 0.2882545789030885, + "learning_rate": 6.770864984351933e-06, + "loss": 0.114, + "step": 4786 + }, + { + "epoch": 0.4, + "grad_norm": 0.27376283045821603, + "learning_rate": 6.769588767613883e-06, + "loss": 0.0837, + "step": 4787 + }, + { + "epoch": 0.4, + "grad_norm": 0.3278122185874359, + "learning_rate": 6.768312419066555e-06, + "loss": 0.0889, + "step": 4788 + }, + { + "epoch": 0.4, + "grad_norm": 0.40925665285111323, + "learning_rate": 6.767035938805015e-06, + "loss": 0.0938, + "step": 4789 + }, + { + "epoch": 0.4, + "grad_norm": 0.3091367832209835, + "learning_rate": 6.765759326924346e-06, + "loss": 0.0879, + "step": 4790 + }, + { + "epoch": 0.4, + "grad_norm": 0.35413264990721044, + "learning_rate": 6.764482583519636e-06, + "loss": 0.0969, + "step": 4791 + }, + { + "epoch": 0.4, + "grad_norm": 0.2910013166318915, + "learning_rate": 6.763205708685984e-06, + "loss": 0.0878, + "step": 4792 + }, + { + "epoch": 0.4, + "grad_norm": 0.25997263650834695, + "learning_rate": 6.761928702518499e-06, + "loss": 0.0806, + "step": 4793 + }, + { + "epoch": 0.4, + "grad_norm": 0.6882219264756096, + "learning_rate": 6.7606515651123e-06, + "loss": 0.1596, + "step": 4794 + }, + { + "epoch": 0.4, + "grad_norm": 0.4329014209363807, + "learning_rate": 6.759374296562516e-06, + "loss": 0.0897, + "step": 4795 + }, + { + "epoch": 0.4, + "grad_norm": 0.2444925578978857, + "learning_rate": 6.758096896964287e-06, + "loss": 0.0692, + "step": 4796 + }, + { + "epoch": 0.4, + "grad_norm": 0.35319776297209743, + "learning_rate": 6.756819366412758e-06, + "loss": 0.115, + "step": 4797 + }, + { + "epoch": 0.4, + "grad_norm": 0.32888963693556517, + "learning_rate": 6.755541705003088e-06, + "loss": 0.102, + "step": 4798 + }, + { + "epoch": 0.4, + "grad_norm": 0.23233317529481087, + "learning_rate": 6.754263912830446e-06, + "loss": 0.0277, + "step": 4799 + }, + { + "epoch": 0.4, + "grad_norm": 0.3159533062090278, + "learning_rate": 6.752985989990007e-06, + "loss": 0.1159, + "step": 4800 + }, + { + "epoch": 0.4, + "grad_norm": 0.3801680857696001, + "learning_rate": 6.751707936576961e-06, + "loss": 0.0898, + "step": 4801 + }, + { + "epoch": 0.4, + "grad_norm": 0.3065403007931138, + "learning_rate": 6.7504297526865025e-06, + "loss": 0.0513, + "step": 4802 + }, + { + "epoch": 0.4, + "grad_norm": 0.3265341824875987, + "learning_rate": 6.749151438413839e-06, + "loss": 0.075, + "step": 4803 + }, + { + "epoch": 0.4, + "grad_norm": 0.6896437665252437, + "learning_rate": 6.747872993854188e-06, + "loss": 0.1378, + "step": 4804 + }, + { + "epoch": 0.4, + "grad_norm": 0.35200937125392945, + "learning_rate": 6.746594419102773e-06, + "loss": 0.1084, + "step": 4805 + }, + { + "epoch": 0.4, + "grad_norm": 0.2811008958221108, + "learning_rate": 6.745315714254832e-06, + "loss": 0.1101, + "step": 4806 + }, + { + "epoch": 0.41, + "grad_norm": 0.42188344891479634, + "learning_rate": 6.744036879405609e-06, + "loss": 0.126, + "step": 4807 + }, + { + "epoch": 0.41, + "grad_norm": 0.29323847332343006, + "learning_rate": 6.742757914650358e-06, + "loss": 0.055, + "step": 4808 + }, + { + "epoch": 0.41, + "grad_norm": 0.37034951264801164, + "learning_rate": 6.741478820084346e-06, + "loss": 0.0969, + "step": 4809 + }, + { + "epoch": 0.41, + "grad_norm": 0.3477401866129879, + "learning_rate": 6.740199595802849e-06, + "loss": 0.1133, + "step": 4810 + }, + { + "epoch": 0.41, + "grad_norm": 0.2169235414354096, + "learning_rate": 6.738920241901146e-06, + "loss": 0.0751, + "step": 4811 + }, + { + "epoch": 0.41, + "grad_norm": 0.31316634728156545, + "learning_rate": 6.737640758474535e-06, + "loss": 0.1042, + "step": 4812 + }, + { + "epoch": 0.41, + "grad_norm": 0.4385252160361452, + "learning_rate": 6.736361145618317e-06, + "loss": 0.1106, + "step": 4813 + }, + { + "epoch": 0.41, + "grad_norm": 0.2078727550688456, + "learning_rate": 6.7350814034278055e-06, + "loss": 0.0545, + "step": 4814 + }, + { + "epoch": 0.41, + "grad_norm": 0.2883651508280866, + "learning_rate": 6.733801531998324e-06, + "loss": 0.0625, + "step": 4815 + }, + { + "epoch": 0.41, + "grad_norm": 0.3887164658302971, + "learning_rate": 6.7325215314252035e-06, + "loss": 0.114, + "step": 4816 + }, + { + "epoch": 0.41, + "grad_norm": 0.19195405583857708, + "learning_rate": 6.7312414018037874e-06, + "loss": 0.0418, + "step": 4817 + }, + { + "epoch": 0.41, + "grad_norm": 0.332848700826065, + "learning_rate": 6.729961143229427e-06, + "loss": 0.0648, + "step": 4818 + }, + { + "epoch": 0.41, + "grad_norm": 0.3977229443763912, + "learning_rate": 6.728680755797482e-06, + "loss": 0.1065, + "step": 4819 + }, + { + "epoch": 0.41, + "grad_norm": 0.34984364624294406, + "learning_rate": 6.727400239603324e-06, + "loss": 0.0673, + "step": 4820 + }, + { + "epoch": 0.41, + "grad_norm": 0.253708797316539, + "learning_rate": 6.726119594742333e-06, + "loss": 0.0745, + "step": 4821 + }, + { + "epoch": 0.41, + "grad_norm": 0.39043095701220837, + "learning_rate": 6.7248388213099e-06, + "loss": 0.1121, + "step": 4822 + }, + { + "epoch": 0.41, + "grad_norm": 0.22982872737022098, + "learning_rate": 6.723557919401423e-06, + "loss": 0.0486, + "step": 4823 + }, + { + "epoch": 0.41, + "grad_norm": 0.3204487931305174, + "learning_rate": 6.722276889112313e-06, + "loss": 0.0929, + "step": 4824 + }, + { + "epoch": 0.41, + "grad_norm": 0.20821716975417381, + "learning_rate": 6.7209957305379854e-06, + "loss": 0.0571, + "step": 4825 + }, + { + "epoch": 0.41, + "grad_norm": 0.2794845191851916, + "learning_rate": 6.719714443773872e-06, + "loss": 0.0965, + "step": 4826 + }, + { + "epoch": 0.41, + "grad_norm": 0.31605169667744376, + "learning_rate": 6.718433028915408e-06, + "loss": 0.072, + "step": 4827 + }, + { + "epoch": 0.41, + "grad_norm": 0.4475704121466054, + "learning_rate": 6.717151486058042e-06, + "loss": 0.1026, + "step": 4828 + }, + { + "epoch": 0.41, + "grad_norm": 0.32852973687900044, + "learning_rate": 6.715869815297229e-06, + "loss": 0.0596, + "step": 4829 + }, + { + "epoch": 0.41, + "grad_norm": 0.31312807089346767, + "learning_rate": 6.714588016728437e-06, + "loss": 0.0953, + "step": 4830 + }, + { + "epoch": 0.41, + "grad_norm": 0.3891474223614072, + "learning_rate": 6.713306090447142e-06, + "loss": 0.0978, + "step": 4831 + }, + { + "epoch": 0.41, + "grad_norm": 0.4289942857549009, + "learning_rate": 6.712024036548828e-06, + "loss": 0.094, + "step": 4832 + }, + { + "epoch": 0.41, + "grad_norm": 0.36745781484813717, + "learning_rate": 6.7107418551289894e-06, + "loss": 0.0853, + "step": 4833 + }, + { + "epoch": 0.41, + "grad_norm": 0.3724514019305569, + "learning_rate": 6.709459546283134e-06, + "loss": 0.1004, + "step": 4834 + }, + { + "epoch": 0.41, + "grad_norm": 0.3683050927059635, + "learning_rate": 6.708177110106771e-06, + "loss": 0.0931, + "step": 4835 + }, + { + "epoch": 0.41, + "grad_norm": 0.323883555395736, + "learning_rate": 6.706894546695427e-06, + "loss": 0.0881, + "step": 4836 + }, + { + "epoch": 0.41, + "grad_norm": 0.5019578078760347, + "learning_rate": 6.705611856144634e-06, + "loss": 0.1398, + "step": 4837 + }, + { + "epoch": 0.41, + "grad_norm": 0.6646548227223141, + "learning_rate": 6.704329038549933e-06, + "loss": 0.1351, + "step": 4838 + }, + { + "epoch": 0.41, + "grad_norm": 0.20279640774519977, + "learning_rate": 6.703046094006878e-06, + "loss": 0.081, + "step": 4839 + }, + { + "epoch": 0.41, + "grad_norm": 0.16055324276911118, + "learning_rate": 6.701763022611027e-06, + "loss": 0.0455, + "step": 4840 + }, + { + "epoch": 0.41, + "grad_norm": 0.40491465301659757, + "learning_rate": 6.700479824457955e-06, + "loss": 0.1257, + "step": 4841 + }, + { + "epoch": 0.41, + "grad_norm": 0.2901370396281359, + "learning_rate": 6.699196499643239e-06, + "loss": 0.0826, + "step": 4842 + }, + { + "epoch": 0.41, + "grad_norm": 0.35693484914402285, + "learning_rate": 6.697913048262469e-06, + "loss": 0.0934, + "step": 4843 + }, + { + "epoch": 0.41, + "grad_norm": 0.43464986668649397, + "learning_rate": 6.696629470411244e-06, + "loss": 0.0594, + "step": 4844 + }, + { + "epoch": 0.41, + "grad_norm": 0.2695823756281699, + "learning_rate": 6.695345766185171e-06, + "loss": 0.0794, + "step": 4845 + }, + { + "epoch": 0.41, + "grad_norm": 0.2887731969791938, + "learning_rate": 6.694061935679871e-06, + "loss": 0.0772, + "step": 4846 + }, + { + "epoch": 0.41, + "grad_norm": 0.4428833451363169, + "learning_rate": 6.6927779789909685e-06, + "loss": 0.1118, + "step": 4847 + }, + { + "epoch": 0.41, + "grad_norm": 0.314534917455563, + "learning_rate": 6.691493896214102e-06, + "loss": 0.0691, + "step": 4848 + }, + { + "epoch": 0.41, + "grad_norm": 0.22681864268721977, + "learning_rate": 6.690209687444915e-06, + "loss": 0.0489, + "step": 4849 + }, + { + "epoch": 0.41, + "grad_norm": 0.4072737600006994, + "learning_rate": 6.688925352779065e-06, + "loss": 0.1084, + "step": 4850 + }, + { + "epoch": 0.41, + "grad_norm": 0.4008894558535504, + "learning_rate": 6.687640892312217e-06, + "loss": 0.1073, + "step": 4851 + }, + { + "epoch": 0.41, + "grad_norm": 0.3610936045362011, + "learning_rate": 6.686356306140043e-06, + "loss": 0.096, + "step": 4852 + }, + { + "epoch": 0.41, + "grad_norm": 0.5640241505132098, + "learning_rate": 6.685071594358228e-06, + "loss": 0.1044, + "step": 4853 + }, + { + "epoch": 0.41, + "grad_norm": 0.4610293347749703, + "learning_rate": 6.683786757062464e-06, + "loss": 0.1229, + "step": 4854 + }, + { + "epoch": 0.41, + "grad_norm": 0.2698101289616672, + "learning_rate": 6.682501794348453e-06, + "loss": 0.0804, + "step": 4855 + }, + { + "epoch": 0.41, + "grad_norm": 0.42712417635640954, + "learning_rate": 6.681216706311908e-06, + "loss": 0.091, + "step": 4856 + }, + { + "epoch": 0.41, + "grad_norm": 0.29674661186414414, + "learning_rate": 6.6799314930485485e-06, + "loss": 0.066, + "step": 4857 + }, + { + "epoch": 0.41, + "grad_norm": 0.32561350124288796, + "learning_rate": 6.678646154654106e-06, + "loss": 0.0914, + "step": 4858 + }, + { + "epoch": 0.41, + "grad_norm": 0.22562405702638078, + "learning_rate": 6.677360691224317e-06, + "loss": 0.0692, + "step": 4859 + }, + { + "epoch": 0.41, + "grad_norm": 0.5124364880383158, + "learning_rate": 6.676075102854934e-06, + "loss": 0.1002, + "step": 4860 + }, + { + "epoch": 0.41, + "grad_norm": 0.3116331878894401, + "learning_rate": 6.674789389641713e-06, + "loss": 0.0989, + "step": 4861 + }, + { + "epoch": 0.41, + "grad_norm": 0.3171396490608389, + "learning_rate": 6.673503551680423e-06, + "loss": 0.0901, + "step": 4862 + }, + { + "epoch": 0.41, + "grad_norm": 0.2031925588133999, + "learning_rate": 6.672217589066838e-06, + "loss": 0.0647, + "step": 4863 + }, + { + "epoch": 0.41, + "grad_norm": 0.2776175363039191, + "learning_rate": 6.670931501896747e-06, + "loss": 0.0649, + "step": 4864 + }, + { + "epoch": 0.41, + "grad_norm": 0.16617942320294782, + "learning_rate": 6.669645290265944e-06, + "loss": 0.0359, + "step": 4865 + }, + { + "epoch": 0.41, + "grad_norm": 0.3253203596320841, + "learning_rate": 6.668358954270231e-06, + "loss": 0.0837, + "step": 4866 + }, + { + "epoch": 0.41, + "grad_norm": 0.37950736762392717, + "learning_rate": 6.667072494005426e-06, + "loss": 0.102, + "step": 4867 + }, + { + "epoch": 0.41, + "grad_norm": 0.30834604578980357, + "learning_rate": 6.6657859095673505e-06, + "loss": 0.0633, + "step": 4868 + }, + { + "epoch": 0.41, + "grad_norm": 0.27392371282424016, + "learning_rate": 6.664499201051835e-06, + "loss": 0.0706, + "step": 4869 + }, + { + "epoch": 0.41, + "grad_norm": 0.29401399993364213, + "learning_rate": 6.6632123685547235e-06, + "loss": 0.0627, + "step": 4870 + }, + { + "epoch": 0.41, + "grad_norm": 0.2792621981335468, + "learning_rate": 6.6619254121718655e-06, + "loss": 0.0607, + "step": 4871 + }, + { + "epoch": 0.41, + "grad_norm": 0.28960157944553366, + "learning_rate": 6.660638331999123e-06, + "loss": 0.0813, + "step": 4872 + }, + { + "epoch": 0.41, + "grad_norm": 0.2604280537471189, + "learning_rate": 6.659351128132361e-06, + "loss": 0.0907, + "step": 4873 + }, + { + "epoch": 0.41, + "grad_norm": 0.23430025588280498, + "learning_rate": 6.658063800667462e-06, + "loss": 0.0703, + "step": 4874 + }, + { + "epoch": 0.41, + "grad_norm": 0.28789725622956963, + "learning_rate": 6.6567763497003125e-06, + "loss": 0.0696, + "step": 4875 + }, + { + "epoch": 0.41, + "grad_norm": 0.3475314181852648, + "learning_rate": 6.655488775326808e-06, + "loss": 0.1162, + "step": 4876 + }, + { + "epoch": 0.41, + "grad_norm": 0.3232197410406932, + "learning_rate": 6.654201077642857e-06, + "loss": 0.1097, + "step": 4877 + }, + { + "epoch": 0.41, + "grad_norm": 0.4052611086314303, + "learning_rate": 6.652913256744371e-06, + "loss": 0.1004, + "step": 4878 + }, + { + "epoch": 0.41, + "grad_norm": 0.5527571619649555, + "learning_rate": 6.651625312727278e-06, + "loss": 0.1314, + "step": 4879 + }, + { + "epoch": 0.41, + "grad_norm": 0.28222193790591804, + "learning_rate": 6.650337245687511e-06, + "loss": 0.0683, + "step": 4880 + }, + { + "epoch": 0.41, + "grad_norm": 0.22896453687477603, + "learning_rate": 6.6490490557210105e-06, + "loss": 0.0948, + "step": 4881 + }, + { + "epoch": 0.41, + "grad_norm": 0.34335315066405003, + "learning_rate": 6.647760742923732e-06, + "loss": 0.1265, + "step": 4882 + }, + { + "epoch": 0.41, + "grad_norm": 0.21088929704846662, + "learning_rate": 6.646472307391633e-06, + "loss": 0.0589, + "step": 4883 + }, + { + "epoch": 0.41, + "grad_norm": 0.3305695649921629, + "learning_rate": 6.645183749220685e-06, + "loss": 0.087, + "step": 4884 + }, + { + "epoch": 0.41, + "grad_norm": 0.3049404180221026, + "learning_rate": 6.643895068506868e-06, + "loss": 0.0835, + "step": 4885 + }, + { + "epoch": 0.41, + "grad_norm": 0.3026572288903209, + "learning_rate": 6.6426062653461696e-06, + "loss": 0.0999, + "step": 4886 + }, + { + "epoch": 0.41, + "grad_norm": 0.48376280450561193, + "learning_rate": 6.641317339834589e-06, + "loss": 0.1293, + "step": 4887 + }, + { + "epoch": 0.41, + "grad_norm": 0.30927416150747383, + "learning_rate": 6.640028292068129e-06, + "loss": 0.0769, + "step": 4888 + }, + { + "epoch": 0.41, + "grad_norm": 0.421880856180528, + "learning_rate": 6.63873912214281e-06, + "loss": 0.1079, + "step": 4889 + }, + { + "epoch": 0.41, + "grad_norm": 0.1830477950654268, + "learning_rate": 6.6374498301546546e-06, + "loss": 0.0537, + "step": 4890 + }, + { + "epoch": 0.41, + "grad_norm": 0.3088043635102654, + "learning_rate": 6.636160416199695e-06, + "loss": 0.0948, + "step": 4891 + }, + { + "epoch": 0.41, + "grad_norm": 0.2282217431261135, + "learning_rate": 6.6348708803739795e-06, + "loss": 0.0571, + "step": 4892 + }, + { + "epoch": 0.41, + "grad_norm": 0.8078319267244853, + "learning_rate": 6.633581222773554e-06, + "loss": 0.1469, + "step": 4893 + }, + { + "epoch": 0.41, + "grad_norm": 0.4184935824904322, + "learning_rate": 6.6322914434944855e-06, + "loss": 0.1187, + "step": 4894 + }, + { + "epoch": 0.41, + "grad_norm": 0.2708270831306791, + "learning_rate": 6.631001542632839e-06, + "loss": 0.0752, + "step": 4895 + }, + { + "epoch": 0.41, + "grad_norm": 0.4090851538482934, + "learning_rate": 6.6297115202846985e-06, + "loss": 0.0764, + "step": 4896 + }, + { + "epoch": 0.41, + "grad_norm": 0.3323025764577, + "learning_rate": 6.628421376546148e-06, + "loss": 0.0969, + "step": 4897 + }, + { + "epoch": 0.41, + "grad_norm": 0.3903908275663629, + "learning_rate": 6.627131111513289e-06, + "loss": 0.0918, + "step": 4898 + }, + { + "epoch": 0.41, + "grad_norm": 0.384314944911904, + "learning_rate": 6.625840725282226e-06, + "loss": 0.0934, + "step": 4899 + }, + { + "epoch": 0.41, + "grad_norm": 0.20659339269110363, + "learning_rate": 6.624550217949073e-06, + "loss": 0.071, + "step": 4900 + }, + { + "epoch": 0.41, + "grad_norm": 0.362945188154971, + "learning_rate": 6.6232595896099565e-06, + "loss": 0.1113, + "step": 4901 + }, + { + "epoch": 0.41, + "grad_norm": 0.27344255774067555, + "learning_rate": 6.6219688403610104e-06, + "loss": 0.0852, + "step": 4902 + }, + { + "epoch": 0.41, + "grad_norm": 0.2732616025996568, + "learning_rate": 6.6206779702983745e-06, + "loss": 0.0826, + "step": 4903 + }, + { + "epoch": 0.41, + "grad_norm": 0.41497504940215346, + "learning_rate": 6.619386979518205e-06, + "loss": 0.0877, + "step": 4904 + }, + { + "epoch": 0.41, + "grad_norm": 0.28920091835835127, + "learning_rate": 6.6180958681166585e-06, + "loss": 0.087, + "step": 4905 + }, + { + "epoch": 0.41, + "grad_norm": 0.32556084547313535, + "learning_rate": 6.616804636189905e-06, + "loss": 0.0404, + "step": 4906 + }, + { + "epoch": 0.41, + "grad_norm": 0.28324056672292114, + "learning_rate": 6.615513283834124e-06, + "loss": 0.0872, + "step": 4907 + }, + { + "epoch": 0.41, + "grad_norm": 0.25415296747053184, + "learning_rate": 6.614221811145502e-06, + "loss": 0.0476, + "step": 4908 + }, + { + "epoch": 0.41, + "grad_norm": 0.2595607287247803, + "learning_rate": 6.612930218220237e-06, + "loss": 0.0862, + "step": 4909 + }, + { + "epoch": 0.41, + "grad_norm": 0.5484992492841477, + "learning_rate": 6.611638505154533e-06, + "loss": 0.0941, + "step": 4910 + }, + { + "epoch": 0.41, + "grad_norm": 0.23320228301337387, + "learning_rate": 6.6103466720446055e-06, + "loss": 0.0625, + "step": 4911 + }, + { + "epoch": 0.41, + "grad_norm": 0.3998521546899515, + "learning_rate": 6.609054718986675e-06, + "loss": 0.1315, + "step": 4912 + }, + { + "epoch": 0.41, + "grad_norm": 0.269050449869328, + "learning_rate": 6.6077626460769765e-06, + "loss": 0.0974, + "step": 4913 + }, + { + "epoch": 0.41, + "grad_norm": 0.26866290209082827, + "learning_rate": 6.606470453411751e-06, + "loss": 0.0923, + "step": 4914 + }, + { + "epoch": 0.41, + "grad_norm": 0.3542843637335846, + "learning_rate": 6.605178141087246e-06, + "loss": 0.0739, + "step": 4915 + }, + { + "epoch": 0.41, + "grad_norm": 0.6122403980574995, + "learning_rate": 6.603885709199723e-06, + "loss": 0.1405, + "step": 4916 + }, + { + "epoch": 0.41, + "grad_norm": 0.5077689164686657, + "learning_rate": 6.60259315784545e-06, + "loss": 0.1054, + "step": 4917 + }, + { + "epoch": 0.41, + "grad_norm": 0.2046183785340652, + "learning_rate": 6.601300487120701e-06, + "loss": 0.0426, + "step": 4918 + }, + { + "epoch": 0.41, + "grad_norm": 0.5727733455061874, + "learning_rate": 6.600007697121765e-06, + "loss": 0.0974, + "step": 4919 + }, + { + "epoch": 0.41, + "grad_norm": 0.23832815695287685, + "learning_rate": 6.598714787944934e-06, + "loss": 0.0869, + "step": 4920 + }, + { + "epoch": 0.41, + "grad_norm": 0.43052502571506923, + "learning_rate": 6.597421759686513e-06, + "loss": 0.1008, + "step": 4921 + }, + { + "epoch": 0.41, + "grad_norm": 0.4695517326619301, + "learning_rate": 6.596128612442814e-06, + "loss": 0.0926, + "step": 4922 + }, + { + "epoch": 0.41, + "grad_norm": 0.795711712940724, + "learning_rate": 6.594835346310158e-06, + "loss": 0.1327, + "step": 4923 + }, + { + "epoch": 0.41, + "grad_norm": 0.21722893584109934, + "learning_rate": 6.593541961384874e-06, + "loss": 0.0531, + "step": 4924 + }, + { + "epoch": 0.41, + "grad_norm": 0.18426186710451573, + "learning_rate": 6.5922484577633004e-06, + "loss": 0.0556, + "step": 4925 + }, + { + "epoch": 0.42, + "grad_norm": 0.19348824410343896, + "learning_rate": 6.590954835541788e-06, + "loss": 0.065, + "step": 4926 + }, + { + "epoch": 0.42, + "grad_norm": 0.6738626166901616, + "learning_rate": 6.589661094816691e-06, + "loss": 0.1399, + "step": 4927 + }, + { + "epoch": 0.42, + "grad_norm": 0.3371391942009781, + "learning_rate": 6.5883672356843765e-06, + "loss": 0.1053, + "step": 4928 + }, + { + "epoch": 0.42, + "grad_norm": 0.2866511261628598, + "learning_rate": 6.587073258241215e-06, + "loss": 0.0638, + "step": 4929 + }, + { + "epoch": 0.42, + "grad_norm": 0.5065218711763665, + "learning_rate": 6.585779162583593e-06, + "loss": 0.0867, + "step": 4930 + }, + { + "epoch": 0.42, + "grad_norm": 0.2754065165449605, + "learning_rate": 6.584484948807901e-06, + "loss": 0.0445, + "step": 4931 + }, + { + "epoch": 0.42, + "grad_norm": 0.2068035734573744, + "learning_rate": 6.5831906170105406e-06, + "loss": 0.0442, + "step": 4932 + }, + { + "epoch": 0.42, + "grad_norm": 0.3332628508681216, + "learning_rate": 6.58189616728792e-06, + "loss": 0.0477, + "step": 4933 + }, + { + "epoch": 0.42, + "grad_norm": 0.38297100287362024, + "learning_rate": 6.580601599736456e-06, + "loss": 0.0713, + "step": 4934 + }, + { + "epoch": 0.42, + "grad_norm": 0.21283155505754833, + "learning_rate": 6.579306914452579e-06, + "loss": 0.0856, + "step": 4935 + }, + { + "epoch": 0.42, + "grad_norm": 0.4201297106188521, + "learning_rate": 6.578012111532722e-06, + "loss": 0.1071, + "step": 4936 + }, + { + "epoch": 0.42, + "grad_norm": 0.3810822275868338, + "learning_rate": 6.57671719107333e-06, + "loss": 0.0955, + "step": 4937 + }, + { + "epoch": 0.42, + "grad_norm": 0.2642408145797131, + "learning_rate": 6.5754221531708575e-06, + "loss": 0.0582, + "step": 4938 + }, + { + "epoch": 0.42, + "grad_norm": 0.3506623595730322, + "learning_rate": 6.574126997921765e-06, + "loss": 0.1052, + "step": 4939 + }, + { + "epoch": 0.42, + "grad_norm": 0.2785563172419917, + "learning_rate": 6.5728317254225235e-06, + "loss": 0.0881, + "step": 4940 + }, + { + "epoch": 0.42, + "grad_norm": 0.2702326913968486, + "learning_rate": 6.571536335769612e-06, + "loss": 0.054, + "step": 4941 + }, + { + "epoch": 0.42, + "grad_norm": 0.4163297981989955, + "learning_rate": 6.57024082905952e-06, + "loss": 0.0758, + "step": 4942 + }, + { + "epoch": 0.42, + "grad_norm": 0.15162302990231197, + "learning_rate": 6.568945205388745e-06, + "loss": 0.0387, + "step": 4943 + }, + { + "epoch": 0.42, + "grad_norm": 0.3442304335066295, + "learning_rate": 6.567649464853789e-06, + "loss": 0.0801, + "step": 4944 + }, + { + "epoch": 0.42, + "grad_norm": 0.5387126750987744, + "learning_rate": 6.566353607551171e-06, + "loss": 0.0875, + "step": 4945 + }, + { + "epoch": 0.42, + "grad_norm": 0.27601361889855264, + "learning_rate": 6.5650576335774105e-06, + "loss": 0.0793, + "step": 4946 + }, + { + "epoch": 0.42, + "grad_norm": 0.2503927548068538, + "learning_rate": 6.563761543029039e-06, + "loss": 0.0518, + "step": 4947 + }, + { + "epoch": 0.42, + "grad_norm": 0.25470950084284744, + "learning_rate": 6.562465336002601e-06, + "loss": 0.0843, + "step": 4948 + }, + { + "epoch": 0.42, + "grad_norm": 0.3333534344818437, + "learning_rate": 6.561169012594641e-06, + "loss": 0.0941, + "step": 4949 + }, + { + "epoch": 0.42, + "grad_norm": 0.39105577873215336, + "learning_rate": 6.559872572901719e-06, + "loss": 0.1027, + "step": 4950 + }, + { + "epoch": 0.42, + "grad_norm": 0.47511744146933216, + "learning_rate": 6.558576017020401e-06, + "loss": 0.1022, + "step": 4951 + }, + { + "epoch": 0.42, + "grad_norm": 0.3351323068823734, + "learning_rate": 6.557279345047261e-06, + "loss": 0.0879, + "step": 4952 + }, + { + "epoch": 0.42, + "grad_norm": 0.4975771182781274, + "learning_rate": 6.555982557078884e-06, + "loss": 0.0836, + "step": 4953 + }, + { + "epoch": 0.42, + "grad_norm": 0.3179978187406493, + "learning_rate": 6.554685653211861e-06, + "loss": 0.0637, + "step": 4954 + }, + { + "epoch": 0.42, + "grad_norm": 0.2320710301638781, + "learning_rate": 6.553388633542795e-06, + "loss": 0.0694, + "step": 4955 + }, + { + "epoch": 0.42, + "grad_norm": 0.400426818784265, + "learning_rate": 6.552091498168293e-06, + "loss": 0.1153, + "step": 4956 + }, + { + "epoch": 0.42, + "grad_norm": 0.40578794588133793, + "learning_rate": 6.550794247184978e-06, + "loss": 0.1094, + "step": 4957 + }, + { + "epoch": 0.42, + "grad_norm": 0.3112202679021108, + "learning_rate": 6.54949688068947e-06, + "loss": 0.0905, + "step": 4958 + }, + { + "epoch": 0.42, + "grad_norm": 0.39327923878477794, + "learning_rate": 6.548199398778409e-06, + "loss": 0.091, + "step": 4959 + }, + { + "epoch": 0.42, + "grad_norm": 0.25205357815626855, + "learning_rate": 6.546901801548438e-06, + "loss": 0.075, + "step": 4960 + }, + { + "epoch": 0.42, + "grad_norm": 0.44146306484474024, + "learning_rate": 6.545604089096209e-06, + "loss": 0.1304, + "step": 4961 + }, + { + "epoch": 0.42, + "grad_norm": 0.2623672458765783, + "learning_rate": 6.544306261518385e-06, + "loss": 0.0793, + "step": 4962 + }, + { + "epoch": 0.42, + "grad_norm": 0.3722855822940689, + "learning_rate": 6.5430083189116325e-06, + "loss": 0.0901, + "step": 4963 + }, + { + "epoch": 0.42, + "grad_norm": 0.35561858737809965, + "learning_rate": 6.541710261372634e-06, + "loss": 0.0499, + "step": 4964 + }, + { + "epoch": 0.42, + "grad_norm": 0.36620689849902505, + "learning_rate": 6.5404120889980715e-06, + "loss": 0.0997, + "step": 4965 + }, + { + "epoch": 0.42, + "grad_norm": 0.2778471532545692, + "learning_rate": 6.539113801884645e-06, + "loss": 0.0809, + "step": 4966 + }, + { + "epoch": 0.42, + "grad_norm": 0.29347253779623605, + "learning_rate": 6.537815400129056e-06, + "loss": 0.0821, + "step": 4967 + }, + { + "epoch": 0.42, + "grad_norm": 0.3896457159713344, + "learning_rate": 6.536516883828017e-06, + "loss": 0.1091, + "step": 4968 + }, + { + "epoch": 0.42, + "grad_norm": 0.214039369518944, + "learning_rate": 6.53521825307825e-06, + "loss": 0.0703, + "step": 4969 + }, + { + "epoch": 0.42, + "grad_norm": 0.28368249130384643, + "learning_rate": 6.533919507976484e-06, + "loss": 0.0861, + "step": 4970 + }, + { + "epoch": 0.42, + "grad_norm": 0.26842093576754844, + "learning_rate": 6.532620648619457e-06, + "loss": 0.0848, + "step": 4971 + }, + { + "epoch": 0.42, + "grad_norm": 0.2625932227753389, + "learning_rate": 6.531321675103917e-06, + "loss": 0.0433, + "step": 4972 + }, + { + "epoch": 0.42, + "grad_norm": 0.6763495141258672, + "learning_rate": 6.5300225875266165e-06, + "loss": 0.1401, + "step": 4973 + }, + { + "epoch": 0.42, + "grad_norm": 0.24049173200348947, + "learning_rate": 6.528723385984322e-06, + "loss": 0.0758, + "step": 4974 + }, + { + "epoch": 0.42, + "grad_norm": 0.4541796484774277, + "learning_rate": 6.527424070573804e-06, + "loss": 0.0931, + "step": 4975 + }, + { + "epoch": 0.42, + "grad_norm": 0.3592578586107002, + "learning_rate": 6.5261246413918425e-06, + "loss": 0.0684, + "step": 4976 + }, + { + "epoch": 0.42, + "grad_norm": 0.23034998026534714, + "learning_rate": 6.524825098535228e-06, + "loss": 0.0528, + "step": 4977 + }, + { + "epoch": 0.42, + "grad_norm": 0.3175670333424844, + "learning_rate": 6.523525442100758e-06, + "loss": 0.088, + "step": 4978 + }, + { + "epoch": 0.42, + "grad_norm": 0.44402896122943847, + "learning_rate": 6.522225672185238e-06, + "loss": 0.1178, + "step": 4979 + }, + { + "epoch": 0.42, + "grad_norm": 0.23563041358295203, + "learning_rate": 6.52092578888548e-06, + "loss": 0.05, + "step": 4980 + }, + { + "epoch": 0.42, + "grad_norm": 0.2544305556475782, + "learning_rate": 6.519625792298311e-06, + "loss": 0.086, + "step": 4981 + }, + { + "epoch": 0.42, + "grad_norm": 0.30245869858540275, + "learning_rate": 6.518325682520561e-06, + "loss": 0.0565, + "step": 4982 + }, + { + "epoch": 0.42, + "grad_norm": 0.5057301390307193, + "learning_rate": 6.5170254596490675e-06, + "loss": 0.1195, + "step": 4983 + }, + { + "epoch": 0.42, + "grad_norm": 0.2504988483983231, + "learning_rate": 6.515725123780681e-06, + "loss": 0.0701, + "step": 4984 + }, + { + "epoch": 0.42, + "grad_norm": 0.31464784055165174, + "learning_rate": 6.514424675012259e-06, + "loss": 0.0749, + "step": 4985 + }, + { + "epoch": 0.42, + "grad_norm": 0.3384801464424417, + "learning_rate": 6.513124113440664e-06, + "loss": 0.1057, + "step": 4986 + }, + { + "epoch": 0.42, + "grad_norm": 0.3354224291961883, + "learning_rate": 6.51182343916277e-06, + "loss": 0.0791, + "step": 4987 + }, + { + "epoch": 0.42, + "grad_norm": 0.4848671341745712, + "learning_rate": 6.5105226522754595e-06, + "loss": 0.134, + "step": 4988 + }, + { + "epoch": 0.42, + "grad_norm": 0.2591184890045394, + "learning_rate": 6.509221752875623e-06, + "loss": 0.0563, + "step": 4989 + }, + { + "epoch": 0.42, + "grad_norm": 0.4125943355897507, + "learning_rate": 6.507920741060158e-06, + "loss": 0.097, + "step": 4990 + }, + { + "epoch": 0.42, + "grad_norm": 0.22542728972058484, + "learning_rate": 6.506619616925972e-06, + "loss": 0.0595, + "step": 4991 + }, + { + "epoch": 0.42, + "grad_norm": 0.21044219942430617, + "learning_rate": 6.505318380569981e-06, + "loss": 0.0631, + "step": 4992 + }, + { + "epoch": 0.42, + "grad_norm": 0.3066235737812797, + "learning_rate": 6.504017032089106e-06, + "loss": 0.0669, + "step": 4993 + }, + { + "epoch": 0.42, + "grad_norm": 0.34139031567789946, + "learning_rate": 6.502715571580282e-06, + "loss": 0.0777, + "step": 4994 + }, + { + "epoch": 0.42, + "grad_norm": 0.6212185286482573, + "learning_rate": 6.501413999140448e-06, + "loss": 0.1636, + "step": 4995 + }, + { + "epoch": 0.42, + "grad_norm": 0.3045151125150769, + "learning_rate": 6.500112314866553e-06, + "loss": 0.0769, + "step": 4996 + }, + { + "epoch": 0.42, + "grad_norm": 0.23479195078238124, + "learning_rate": 6.498810518855555e-06, + "loss": 0.0658, + "step": 4997 + }, + { + "epoch": 0.42, + "grad_norm": 0.2935654694196785, + "learning_rate": 6.497508611204417e-06, + "loss": 0.0748, + "step": 4998 + }, + { + "epoch": 0.42, + "grad_norm": 0.3815338718524593, + "learning_rate": 6.496206592010114e-06, + "loss": 0.1042, + "step": 4999 + }, + { + "epoch": 0.42, + "grad_norm": 0.31586932692974995, + "learning_rate": 6.49490446136963e-06, + "loss": 0.0986, + "step": 5000 + }, + { + "epoch": 0.42, + "grad_norm": 0.26299610805225904, + "learning_rate": 6.493602219379953e-06, + "loss": 0.0696, + "step": 5001 + }, + { + "epoch": 0.42, + "grad_norm": 0.21825462729415152, + "learning_rate": 6.49229986613808e-06, + "loss": 0.0501, + "step": 5002 + }, + { + "epoch": 0.42, + "grad_norm": 0.4494672288061924, + "learning_rate": 6.49099740174102e-06, + "loss": 0.1074, + "step": 5003 + }, + { + "epoch": 0.42, + "grad_norm": 0.27087500929515734, + "learning_rate": 6.4896948262857885e-06, + "loss": 0.0669, + "step": 5004 + }, + { + "epoch": 0.42, + "grad_norm": 0.24618049818754711, + "learning_rate": 6.488392139869407e-06, + "loss": 0.0919, + "step": 5005 + }, + { + "epoch": 0.42, + "grad_norm": 0.28070133798760527, + "learning_rate": 6.4870893425889104e-06, + "loss": 0.0921, + "step": 5006 + }, + { + "epoch": 0.42, + "grad_norm": 0.3287066414001967, + "learning_rate": 6.485786434541335e-06, + "loss": 0.0805, + "step": 5007 + }, + { + "epoch": 0.42, + "grad_norm": 0.26893340573755653, + "learning_rate": 6.484483415823731e-06, + "loss": 0.067, + "step": 5008 + }, + { + "epoch": 0.42, + "grad_norm": 0.26221526526767114, + "learning_rate": 6.483180286533155e-06, + "loss": 0.0676, + "step": 5009 + }, + { + "epoch": 0.42, + "grad_norm": 0.4930738325877774, + "learning_rate": 6.4818770467666695e-06, + "loss": 0.0992, + "step": 5010 + }, + { + "epoch": 0.42, + "grad_norm": 0.31196998865454895, + "learning_rate": 6.480573696621351e-06, + "loss": 0.0682, + "step": 5011 + }, + { + "epoch": 0.42, + "grad_norm": 0.3381343640589562, + "learning_rate": 6.479270236194277e-06, + "loss": 0.1295, + "step": 5012 + }, + { + "epoch": 0.42, + "grad_norm": 0.28425764608852916, + "learning_rate": 6.47796666558254e-06, + "loss": 0.054, + "step": 5013 + }, + { + "epoch": 0.42, + "grad_norm": 0.3619156883705878, + "learning_rate": 6.476662984883234e-06, + "loss": 0.0941, + "step": 5014 + }, + { + "epoch": 0.42, + "grad_norm": 0.33919449563345655, + "learning_rate": 6.475359194193469e-06, + "loss": 0.097, + "step": 5015 + }, + { + "epoch": 0.42, + "grad_norm": 0.29035217697509946, + "learning_rate": 6.474055293610355e-06, + "loss": 0.0858, + "step": 5016 + }, + { + "epoch": 0.42, + "grad_norm": 0.3070955024663146, + "learning_rate": 6.472751283231016e-06, + "loss": 0.08, + "step": 5017 + }, + { + "epoch": 0.42, + "grad_norm": 0.2767586795416823, + "learning_rate": 6.4714471631525825e-06, + "loss": 0.0788, + "step": 5018 + }, + { + "epoch": 0.42, + "grad_norm": 0.37509409073377437, + "learning_rate": 6.470142933472191e-06, + "loss": 0.083, + "step": 5019 + }, + { + "epoch": 0.42, + "grad_norm": 0.3364692149702103, + "learning_rate": 6.4688385942869915e-06, + "loss": 0.0933, + "step": 5020 + }, + { + "epoch": 0.42, + "grad_norm": 0.2771744733565043, + "learning_rate": 6.467534145694135e-06, + "loss": 0.0749, + "step": 5021 + }, + { + "epoch": 0.42, + "grad_norm": 0.5487536079881584, + "learning_rate": 6.4662295877907865e-06, + "loss": 0.1075, + "step": 5022 + }, + { + "epoch": 0.42, + "grad_norm": 0.3466256618887153, + "learning_rate": 6.464924920674119e-06, + "loss": 0.0991, + "step": 5023 + }, + { + "epoch": 0.42, + "grad_norm": 0.25854092705346277, + "learning_rate": 6.463620144441308e-06, + "loss": 0.0864, + "step": 5024 + }, + { + "epoch": 0.42, + "grad_norm": 0.2954208713445148, + "learning_rate": 6.4623152591895434e-06, + "loss": 0.0731, + "step": 5025 + }, + { + "epoch": 0.42, + "grad_norm": 0.3121212028468487, + "learning_rate": 6.461010265016019e-06, + "loss": 0.0706, + "step": 5026 + }, + { + "epoch": 0.42, + "grad_norm": 0.3138915821341992, + "learning_rate": 6.4597051620179385e-06, + "loss": 0.0888, + "step": 5027 + }, + { + "epoch": 0.42, + "grad_norm": 0.2329737267059408, + "learning_rate": 6.458399950292515e-06, + "loss": 0.0479, + "step": 5028 + }, + { + "epoch": 0.42, + "grad_norm": 0.2588992022431299, + "learning_rate": 6.457094629936966e-06, + "loss": 0.0828, + "step": 5029 + }, + { + "epoch": 0.42, + "grad_norm": 0.52556991309081, + "learning_rate": 6.455789201048523e-06, + "loss": 0.1128, + "step": 5030 + }, + { + "epoch": 0.42, + "grad_norm": 0.2768393560829019, + "learning_rate": 6.454483663724418e-06, + "loss": 0.0515, + "step": 5031 + }, + { + "epoch": 0.42, + "grad_norm": 0.6221289764629585, + "learning_rate": 6.453178018061899e-06, + "loss": 0.1123, + "step": 5032 + }, + { + "epoch": 0.42, + "grad_norm": 0.2005402435005419, + "learning_rate": 6.451872264158213e-06, + "loss": 0.0913, + "step": 5033 + }, + { + "epoch": 0.42, + "grad_norm": 0.36300848899648425, + "learning_rate": 6.4505664021106255e-06, + "loss": 0.0944, + "step": 5034 + }, + { + "epoch": 0.42, + "grad_norm": 0.30923208105798594, + "learning_rate": 6.4492604320164e-06, + "loss": 0.0933, + "step": 5035 + }, + { + "epoch": 0.42, + "grad_norm": 0.6123579343408642, + "learning_rate": 6.447954353972816e-06, + "loss": 0.1626, + "step": 5036 + }, + { + "epoch": 0.42, + "grad_norm": 0.29662012201504695, + "learning_rate": 6.446648168077157e-06, + "loss": 0.0982, + "step": 5037 + }, + { + "epoch": 0.42, + "grad_norm": 0.34551469150360464, + "learning_rate": 6.445341874426714e-06, + "loss": 0.098, + "step": 5038 + }, + { + "epoch": 0.42, + "grad_norm": 0.3596943507309376, + "learning_rate": 6.444035473118788e-06, + "loss": 0.095, + "step": 5039 + }, + { + "epoch": 0.42, + "grad_norm": 0.3355914464076175, + "learning_rate": 6.442728964250691e-06, + "loss": 0.0948, + "step": 5040 + }, + { + "epoch": 0.42, + "grad_norm": 0.4124424022259644, + "learning_rate": 6.441422347919733e-06, + "loss": 0.1159, + "step": 5041 + }, + { + "epoch": 0.42, + "grad_norm": 0.27819871482231173, + "learning_rate": 6.4401156242232435e-06, + "loss": 0.069, + "step": 5042 + }, + { + "epoch": 0.42, + "grad_norm": 0.32939357764806776, + "learning_rate": 6.438808793258552e-06, + "loss": 0.0807, + "step": 5043 + }, + { + "epoch": 0.43, + "grad_norm": 0.42129641238343774, + "learning_rate": 6.437501855123e-06, + "loss": 0.0913, + "step": 5044 + }, + { + "epoch": 0.43, + "grad_norm": 0.2686422173490415, + "learning_rate": 6.436194809913936e-06, + "loss": 0.0819, + "step": 5045 + }, + { + "epoch": 0.43, + "grad_norm": 0.2581328510174416, + "learning_rate": 6.434887657728716e-06, + "loss": 0.0579, + "step": 5046 + }, + { + "epoch": 0.43, + "grad_norm": 0.4023706940830099, + "learning_rate": 6.433580398664705e-06, + "loss": 0.0999, + "step": 5047 + }, + { + "epoch": 0.43, + "grad_norm": 0.27784003810290475, + "learning_rate": 6.432273032819273e-06, + "loss": 0.0681, + "step": 5048 + }, + { + "epoch": 0.43, + "grad_norm": 0.21007825624320725, + "learning_rate": 6.430965560289803e-06, + "loss": 0.0691, + "step": 5049 + }, + { + "epoch": 0.43, + "grad_norm": 0.30586526560767807, + "learning_rate": 6.429657981173683e-06, + "loss": 0.1043, + "step": 5050 + }, + { + "epoch": 0.43, + "grad_norm": 0.307201982009912, + "learning_rate": 6.4283502955683065e-06, + "loss": 0.0707, + "step": 5051 + }, + { + "epoch": 0.43, + "grad_norm": 0.31925414456324913, + "learning_rate": 6.427042503571081e-06, + "loss": 0.081, + "step": 5052 + }, + { + "epoch": 0.43, + "grad_norm": 0.22836829621777202, + "learning_rate": 6.425734605279414e-06, + "loss": 0.0809, + "step": 5053 + }, + { + "epoch": 0.43, + "grad_norm": 0.346242521474331, + "learning_rate": 6.4244266007907304e-06, + "loss": 0.0901, + "step": 5054 + }, + { + "epoch": 0.43, + "grad_norm": 0.30662731802114246, + "learning_rate": 6.423118490202456e-06, + "loss": 0.0734, + "step": 5055 + }, + { + "epoch": 0.43, + "grad_norm": 0.28587163179015795, + "learning_rate": 6.421810273612023e-06, + "loss": 0.0806, + "step": 5056 + }, + { + "epoch": 0.43, + "grad_norm": 0.25576872866339184, + "learning_rate": 6.420501951116882e-06, + "loss": 0.0931, + "step": 5057 + }, + { + "epoch": 0.43, + "grad_norm": 0.23019490471316545, + "learning_rate": 6.419193522814478e-06, + "loss": 0.0698, + "step": 5058 + }, + { + "epoch": 0.43, + "grad_norm": 0.2987212789799524, + "learning_rate": 6.417884988802275e-06, + "loss": 0.0646, + "step": 5059 + }, + { + "epoch": 0.43, + "grad_norm": 0.1940393800406046, + "learning_rate": 6.4165763491777365e-06, + "loss": 0.0524, + "step": 5060 + }, + { + "epoch": 0.43, + "grad_norm": 0.4575970419058964, + "learning_rate": 6.415267604038342e-06, + "loss": 0.1352, + "step": 5061 + }, + { + "epoch": 0.43, + "grad_norm": 0.22992657309965417, + "learning_rate": 6.413958753481569e-06, + "loss": 0.0842, + "step": 5062 + }, + { + "epoch": 0.43, + "grad_norm": 0.49487432506669243, + "learning_rate": 6.4126497976049115e-06, + "loss": 0.1274, + "step": 5063 + }, + { + "epoch": 0.43, + "grad_norm": 0.22582812423344603, + "learning_rate": 6.411340736505869e-06, + "loss": 0.0581, + "step": 5064 + }, + { + "epoch": 0.43, + "grad_norm": 0.33076147771303804, + "learning_rate": 6.410031570281946e-06, + "loss": 0.0864, + "step": 5065 + }, + { + "epoch": 0.43, + "grad_norm": 0.3878302169174842, + "learning_rate": 6.408722299030658e-06, + "loss": 0.1248, + "step": 5066 + }, + { + "epoch": 0.43, + "grad_norm": 0.2896374240751767, + "learning_rate": 6.4074129228495265e-06, + "loss": 0.0693, + "step": 5067 + }, + { + "epoch": 0.43, + "grad_norm": 0.2799746839039772, + "learning_rate": 6.406103441836082e-06, + "loss": 0.0854, + "step": 5068 + }, + { + "epoch": 0.43, + "grad_norm": 0.24553549859245666, + "learning_rate": 6.404793856087863e-06, + "loss": 0.074, + "step": 5069 + }, + { + "epoch": 0.43, + "grad_norm": 0.2873029701818784, + "learning_rate": 6.403484165702411e-06, + "loss": 0.0845, + "step": 5070 + }, + { + "epoch": 0.43, + "grad_norm": 0.43435135945478764, + "learning_rate": 6.402174370777285e-06, + "loss": 0.1004, + "step": 5071 + }, + { + "epoch": 0.43, + "grad_norm": 0.39628190239385086, + "learning_rate": 6.400864471410043e-06, + "loss": 0.0876, + "step": 5072 + }, + { + "epoch": 0.43, + "grad_norm": 0.35314246297869617, + "learning_rate": 6.399554467698255e-06, + "loss": 0.064, + "step": 5073 + }, + { + "epoch": 0.43, + "grad_norm": 0.3910024156789632, + "learning_rate": 6.398244359739496e-06, + "loss": 0.098, + "step": 5074 + }, + { + "epoch": 0.43, + "grad_norm": 0.2535591210565408, + "learning_rate": 6.396934147631352e-06, + "loss": 0.0592, + "step": 5075 + }, + { + "epoch": 0.43, + "grad_norm": 0.3224967862438788, + "learning_rate": 6.395623831471416e-06, + "loss": 0.0664, + "step": 5076 + }, + { + "epoch": 0.43, + "grad_norm": 0.3907403649438304, + "learning_rate": 6.394313411357286e-06, + "loss": 0.0742, + "step": 5077 + }, + { + "epoch": 0.43, + "grad_norm": 0.37685386758012246, + "learning_rate": 6.393002887386571e-06, + "loss": 0.0754, + "step": 5078 + }, + { + "epoch": 0.43, + "grad_norm": 0.29114945029243683, + "learning_rate": 6.391692259656885e-06, + "loss": 0.0863, + "step": 5079 + }, + { + "epoch": 0.43, + "grad_norm": 0.4426190922804583, + "learning_rate": 6.390381528265853e-06, + "loss": 0.1026, + "step": 5080 + }, + { + "epoch": 0.43, + "grad_norm": 0.33878778832001055, + "learning_rate": 6.389070693311106e-06, + "loss": 0.1073, + "step": 5081 + }, + { + "epoch": 0.43, + "grad_norm": 0.4866856531092809, + "learning_rate": 6.387759754890281e-06, + "loss": 0.1023, + "step": 5082 + }, + { + "epoch": 0.43, + "grad_norm": 0.21902928550672768, + "learning_rate": 6.386448713101025e-06, + "loss": 0.0593, + "step": 5083 + }, + { + "epoch": 0.43, + "grad_norm": 0.35981973601339395, + "learning_rate": 6.385137568040993e-06, + "loss": 0.0885, + "step": 5084 + }, + { + "epoch": 0.43, + "grad_norm": 0.26959102786023414, + "learning_rate": 6.383826319807845e-06, + "loss": 0.078, + "step": 5085 + }, + { + "epoch": 0.43, + "grad_norm": 0.3996308244919742, + "learning_rate": 6.382514968499253e-06, + "loss": 0.1414, + "step": 5086 + }, + { + "epoch": 0.43, + "grad_norm": 0.3732410680366653, + "learning_rate": 6.3812035142128905e-06, + "loss": 0.0762, + "step": 5087 + }, + { + "epoch": 0.43, + "grad_norm": 0.32304479548192916, + "learning_rate": 6.379891957046445e-06, + "loss": 0.0774, + "step": 5088 + }, + { + "epoch": 0.43, + "grad_norm": 0.16943076262752108, + "learning_rate": 6.378580297097607e-06, + "loss": 0.0526, + "step": 5089 + }, + { + "epoch": 0.43, + "grad_norm": 0.2990947234421667, + "learning_rate": 6.3772685344640795e-06, + "loss": 0.0782, + "step": 5090 + }, + { + "epoch": 0.43, + "grad_norm": 0.3447900748068334, + "learning_rate": 6.375956669243568e-06, + "loss": 0.0893, + "step": 5091 + }, + { + "epoch": 0.43, + "grad_norm": 0.3455460735605349, + "learning_rate": 6.374644701533788e-06, + "loss": 0.0728, + "step": 5092 + }, + { + "epoch": 0.43, + "grad_norm": 0.30185887116342336, + "learning_rate": 6.373332631432462e-06, + "loss": 0.0859, + "step": 5093 + }, + { + "epoch": 0.43, + "grad_norm": 0.4259665868930272, + "learning_rate": 6.372020459037321e-06, + "loss": 0.0833, + "step": 5094 + }, + { + "epoch": 0.43, + "grad_norm": 0.2919816365827332, + "learning_rate": 6.370708184446105e-06, + "loss": 0.0606, + "step": 5095 + }, + { + "epoch": 0.43, + "grad_norm": 0.33598112225093646, + "learning_rate": 6.369395807756557e-06, + "loss": 0.0826, + "step": 5096 + }, + { + "epoch": 0.43, + "grad_norm": 0.22720882870985729, + "learning_rate": 6.368083329066432e-06, + "loss": 0.0661, + "step": 5097 + }, + { + "epoch": 0.43, + "grad_norm": 0.9077888446790929, + "learning_rate": 6.366770748473493e-06, + "loss": 0.1163, + "step": 5098 + }, + { + "epoch": 0.43, + "grad_norm": 0.48209921357093444, + "learning_rate": 6.365458066075505e-06, + "loss": 0.1113, + "step": 5099 + }, + { + "epoch": 0.43, + "grad_norm": 0.27089999608786985, + "learning_rate": 6.364145281970247e-06, + "loss": 0.073, + "step": 5100 + }, + { + "epoch": 0.43, + "grad_norm": 0.2325847909295708, + "learning_rate": 6.3628323962555e-06, + "loss": 0.0691, + "step": 5101 + }, + { + "epoch": 0.43, + "grad_norm": 0.4536122346623705, + "learning_rate": 6.361519409029059e-06, + "loss": 0.1187, + "step": 5102 + }, + { + "epoch": 0.43, + "grad_norm": 0.27630816563069316, + "learning_rate": 6.36020632038872e-06, + "loss": 0.0728, + "step": 5103 + }, + { + "epoch": 0.43, + "grad_norm": 0.253075229197033, + "learning_rate": 6.3588931304322905e-06, + "loss": 0.0596, + "step": 5104 + }, + { + "epoch": 0.43, + "grad_norm": 0.39107903850366826, + "learning_rate": 6.3575798392575846e-06, + "loss": 0.0904, + "step": 5105 + }, + { + "epoch": 0.43, + "grad_norm": 0.37701026116821723, + "learning_rate": 6.3562664469624235e-06, + "loss": 0.0991, + "step": 5106 + }, + { + "epoch": 0.43, + "grad_norm": 0.487250291271705, + "learning_rate": 6.354952953644636e-06, + "loss": 0.1233, + "step": 5107 + }, + { + "epoch": 0.43, + "grad_norm": 0.5460562821845559, + "learning_rate": 6.353639359402061e-06, + "loss": 0.1069, + "step": 5108 + }, + { + "epoch": 0.43, + "grad_norm": 0.39750831422381533, + "learning_rate": 6.352325664332539e-06, + "loss": 0.1039, + "step": 5109 + }, + { + "epoch": 0.43, + "grad_norm": 0.638983494765347, + "learning_rate": 6.351011868533925e-06, + "loss": 0.1118, + "step": 5110 + }, + { + "epoch": 0.43, + "grad_norm": 0.5905910254166707, + "learning_rate": 6.349697972104076e-06, + "loss": 0.0851, + "step": 5111 + }, + { + "epoch": 0.43, + "grad_norm": 0.24767387021109064, + "learning_rate": 6.348383975140862e-06, + "loss": 0.073, + "step": 5112 + }, + { + "epoch": 0.43, + "grad_norm": 0.294039794615177, + "learning_rate": 6.347069877742151e-06, + "loss": 0.0716, + "step": 5113 + }, + { + "epoch": 0.43, + "grad_norm": 0.2886262464525754, + "learning_rate": 6.34575568000583e-06, + "loss": 0.0808, + "step": 5114 + }, + { + "epoch": 0.43, + "grad_norm": 0.4128316445687048, + "learning_rate": 6.344441382029787e-06, + "loss": 0.1062, + "step": 5115 + }, + { + "epoch": 0.43, + "grad_norm": 0.38302340055662404, + "learning_rate": 6.343126983911916e-06, + "loss": 0.0892, + "step": 5116 + }, + { + "epoch": 0.43, + "grad_norm": 0.3411276395220711, + "learning_rate": 6.341812485750124e-06, + "loss": 0.0882, + "step": 5117 + }, + { + "epoch": 0.43, + "grad_norm": 0.26660827309731244, + "learning_rate": 6.3404978876423205e-06, + "loss": 0.0649, + "step": 5118 + }, + { + "epoch": 0.43, + "grad_norm": 0.4145834756708014, + "learning_rate": 6.339183189686426e-06, + "loss": 0.1093, + "step": 5119 + }, + { + "epoch": 0.43, + "grad_norm": 0.25612640293289035, + "learning_rate": 6.337868391980367e-06, + "loss": 0.0902, + "step": 5120 + }, + { + "epoch": 0.43, + "grad_norm": 0.4237678002107076, + "learning_rate": 6.336553494622077e-06, + "loss": 0.0982, + "step": 5121 + }, + { + "epoch": 0.43, + "grad_norm": 0.4361783140165895, + "learning_rate": 6.335238497709495e-06, + "loss": 0.1223, + "step": 5122 + }, + { + "epoch": 0.43, + "grad_norm": 0.27802344553059266, + "learning_rate": 6.333923401340572e-06, + "loss": 0.0786, + "step": 5123 + }, + { + "epoch": 0.43, + "grad_norm": 0.2993314997940703, + "learning_rate": 6.332608205613264e-06, + "loss": 0.1006, + "step": 5124 + }, + { + "epoch": 0.43, + "grad_norm": 0.35561042437654855, + "learning_rate": 6.331292910625533e-06, + "loss": 0.0536, + "step": 5125 + }, + { + "epoch": 0.43, + "grad_norm": 0.3949167641028964, + "learning_rate": 6.3299775164753495e-06, + "loss": 0.1104, + "step": 5126 + }, + { + "epoch": 0.43, + "grad_norm": 0.49954067696945664, + "learning_rate": 6.328662023260696e-06, + "loss": 0.0854, + "step": 5127 + }, + { + "epoch": 0.43, + "grad_norm": 0.2538857532554658, + "learning_rate": 6.327346431079553e-06, + "loss": 0.059, + "step": 5128 + }, + { + "epoch": 0.43, + "grad_norm": 0.23857050563486423, + "learning_rate": 6.326030740029916e-06, + "loss": 0.1012, + "step": 5129 + }, + { + "epoch": 0.43, + "grad_norm": 0.28855503017043477, + "learning_rate": 6.324714950209784e-06, + "loss": 0.0741, + "step": 5130 + }, + { + "epoch": 0.43, + "grad_norm": 0.3936977739714567, + "learning_rate": 6.323399061717165e-06, + "loss": 0.0918, + "step": 5131 + }, + { + "epoch": 0.43, + "grad_norm": 0.7527307133192241, + "learning_rate": 6.322083074650076e-06, + "loss": 0.137, + "step": 5132 + }, + { + "epoch": 0.43, + "grad_norm": 0.4874735487182669, + "learning_rate": 6.320766989106537e-06, + "loss": 0.1113, + "step": 5133 + }, + { + "epoch": 0.43, + "grad_norm": 0.2870276148346911, + "learning_rate": 6.319450805184579e-06, + "loss": 0.0674, + "step": 5134 + }, + { + "epoch": 0.43, + "grad_norm": 0.4325594125606785, + "learning_rate": 6.318134522982239e-06, + "loss": 0.1096, + "step": 5135 + }, + { + "epoch": 0.43, + "grad_norm": 0.18184279655658253, + "learning_rate": 6.316818142597562e-06, + "loss": 0.0693, + "step": 5136 + }, + { + "epoch": 0.43, + "grad_norm": 0.289487516660886, + "learning_rate": 6.315501664128596e-06, + "loss": 0.0764, + "step": 5137 + }, + { + "epoch": 0.43, + "grad_norm": 0.2621708938617668, + "learning_rate": 6.314185087673404e-06, + "loss": 0.0833, + "step": 5138 + }, + { + "epoch": 0.43, + "grad_norm": 0.33076434156815726, + "learning_rate": 6.312868413330052e-06, + "loss": 0.1065, + "step": 5139 + }, + { + "epoch": 0.43, + "grad_norm": 0.4055905024743049, + "learning_rate": 6.311551641196612e-06, + "loss": 0.1007, + "step": 5140 + }, + { + "epoch": 0.43, + "grad_norm": 0.5533421547127563, + "learning_rate": 6.310234771371165e-06, + "loss": 0.1889, + "step": 5141 + }, + { + "epoch": 0.43, + "grad_norm": 0.38411423123472727, + "learning_rate": 6.308917803951799e-06, + "loss": 0.1193, + "step": 5142 + }, + { + "epoch": 0.43, + "grad_norm": 0.3172037829610849, + "learning_rate": 6.30760073903661e-06, + "loss": 0.0848, + "step": 5143 + }, + { + "epoch": 0.43, + "grad_norm": 0.29420732700167507, + "learning_rate": 6.306283576723702e-06, + "loss": 0.0852, + "step": 5144 + }, + { + "epoch": 0.43, + "grad_norm": 0.33592250478861607, + "learning_rate": 6.304966317111183e-06, + "loss": 0.097, + "step": 5145 + }, + { + "epoch": 0.43, + "grad_norm": 0.7618676670534758, + "learning_rate": 6.30364896029717e-06, + "loss": 0.1775, + "step": 5146 + }, + { + "epoch": 0.43, + "grad_norm": 0.3046605235732828, + "learning_rate": 6.302331506379789e-06, + "loss": 0.0601, + "step": 5147 + }, + { + "epoch": 0.43, + "grad_norm": 0.3172669658320801, + "learning_rate": 6.301013955457169e-06, + "loss": 0.0835, + "step": 5148 + }, + { + "epoch": 0.43, + "grad_norm": 0.21588165888739544, + "learning_rate": 6.2996963076274535e-06, + "loss": 0.053, + "step": 5149 + }, + { + "epoch": 0.43, + "grad_norm": 0.3379512095788092, + "learning_rate": 6.298378562988783e-06, + "loss": 0.1037, + "step": 5150 + }, + { + "epoch": 0.43, + "grad_norm": 0.32430884701003304, + "learning_rate": 6.2970607216393154e-06, + "loss": 0.0842, + "step": 5151 + }, + { + "epoch": 0.43, + "grad_norm": 0.34240021569397755, + "learning_rate": 6.2957427836772075e-06, + "loss": 0.0836, + "step": 5152 + }, + { + "epoch": 0.43, + "grad_norm": 0.3437104889896471, + "learning_rate": 6.294424749200629e-06, + "loss": 0.127, + "step": 5153 + }, + { + "epoch": 0.43, + "grad_norm": 0.3417632146071153, + "learning_rate": 6.293106618307757e-06, + "loss": 0.0921, + "step": 5154 + }, + { + "epoch": 0.43, + "grad_norm": 0.18062797482194565, + "learning_rate": 6.291788391096769e-06, + "loss": 0.0638, + "step": 5155 + }, + { + "epoch": 0.43, + "grad_norm": 0.3082937560199461, + "learning_rate": 6.290470067665858e-06, + "loss": 0.0757, + "step": 5156 + }, + { + "epoch": 0.43, + "grad_norm": 0.5238024527913562, + "learning_rate": 6.289151648113217e-06, + "loss": 0.1118, + "step": 5157 + }, + { + "epoch": 0.43, + "grad_norm": 0.2651797135705574, + "learning_rate": 6.287833132537053e-06, + "loss": 0.0627, + "step": 5158 + }, + { + "epoch": 0.43, + "grad_norm": 0.5273501862063448, + "learning_rate": 6.286514521035574e-06, + "loss": 0.1159, + "step": 5159 + }, + { + "epoch": 0.43, + "grad_norm": 0.2637778275733897, + "learning_rate": 6.285195813706999e-06, + "loss": 0.0912, + "step": 5160 + }, + { + "epoch": 0.43, + "grad_norm": 0.22590963850321447, + "learning_rate": 6.2838770106495535e-06, + "loss": 0.0678, + "step": 5161 + }, + { + "epoch": 0.43, + "grad_norm": 0.29320572091763986, + "learning_rate": 6.282558111961469e-06, + "loss": 0.0925, + "step": 5162 + }, + { + "epoch": 0.44, + "grad_norm": 0.5279643431001139, + "learning_rate": 6.281239117740985e-06, + "loss": 0.1056, + "step": 5163 + }, + { + "epoch": 0.44, + "grad_norm": 0.6136431513238707, + "learning_rate": 6.279920028086347e-06, + "loss": 0.1705, + "step": 5164 + }, + { + "epoch": 0.44, + "grad_norm": 0.19863772121577403, + "learning_rate": 6.278600843095809e-06, + "loss": 0.0699, + "step": 5165 + }, + { + "epoch": 0.44, + "grad_norm": 0.25559180686480315, + "learning_rate": 6.2772815628676335e-06, + "loss": 0.0986, + "step": 5166 + }, + { + "epoch": 0.44, + "grad_norm": 0.36503701428093804, + "learning_rate": 6.275962187500084e-06, + "loss": 0.1019, + "step": 5167 + }, + { + "epoch": 0.44, + "grad_norm": 0.32045961123847094, + "learning_rate": 6.2746427170914405e-06, + "loss": 0.0555, + "step": 5168 + }, + { + "epoch": 0.44, + "grad_norm": 0.26837908560880663, + "learning_rate": 6.27332315173998e-06, + "loss": 0.1003, + "step": 5169 + }, + { + "epoch": 0.44, + "grad_norm": 0.26231090290125614, + "learning_rate": 6.272003491543995e-06, + "loss": 0.0681, + "step": 5170 + }, + { + "epoch": 0.44, + "grad_norm": 0.34422761593659307, + "learning_rate": 6.270683736601779e-06, + "loss": 0.1146, + "step": 5171 + }, + { + "epoch": 0.44, + "grad_norm": 0.37413142457805704, + "learning_rate": 6.269363887011637e-06, + "loss": 0.0924, + "step": 5172 + }, + { + "epoch": 0.44, + "grad_norm": 0.5052662802453262, + "learning_rate": 6.2680439428718784e-06, + "loss": 0.073, + "step": 5173 + }, + { + "epoch": 0.44, + "grad_norm": 0.48362656473537935, + "learning_rate": 6.2667239042808185e-06, + "loss": 0.0894, + "step": 5174 + }, + { + "epoch": 0.44, + "grad_norm": 0.22522435208419184, + "learning_rate": 6.265403771336784e-06, + "loss": 0.0557, + "step": 5175 + }, + { + "epoch": 0.44, + "grad_norm": 0.30394007244896476, + "learning_rate": 6.264083544138104e-06, + "loss": 0.0644, + "step": 5176 + }, + { + "epoch": 0.44, + "grad_norm": 0.2664992122379471, + "learning_rate": 6.2627632227831194e-06, + "loss": 0.0848, + "step": 5177 + }, + { + "epoch": 0.44, + "grad_norm": 0.19992770427447581, + "learning_rate": 6.261442807370173e-06, + "loss": 0.0518, + "step": 5178 + }, + { + "epoch": 0.44, + "grad_norm": 0.4142092988716273, + "learning_rate": 6.260122297997617e-06, + "loss": 0.0907, + "step": 5179 + }, + { + "epoch": 0.44, + "grad_norm": 0.41650049081107376, + "learning_rate": 6.258801694763813e-06, + "loss": 0.113, + "step": 5180 + }, + { + "epoch": 0.44, + "grad_norm": 0.41747702078138804, + "learning_rate": 6.257480997767125e-06, + "loss": 0.1082, + "step": 5181 + }, + { + "epoch": 0.44, + "grad_norm": 0.4009676149867442, + "learning_rate": 6.256160207105925e-06, + "loss": 0.0788, + "step": 5182 + }, + { + "epoch": 0.44, + "grad_norm": 0.31045376154767057, + "learning_rate": 6.254839322878598e-06, + "loss": 0.0988, + "step": 5183 + }, + { + "epoch": 0.44, + "grad_norm": 0.3322045467570745, + "learning_rate": 6.253518345183527e-06, + "loss": 0.0678, + "step": 5184 + }, + { + "epoch": 0.44, + "grad_norm": 0.7139718357199312, + "learning_rate": 6.252197274119108e-06, + "loss": 0.0785, + "step": 5185 + }, + { + "epoch": 0.44, + "grad_norm": 0.3674088151430725, + "learning_rate": 6.250876109783741e-06, + "loss": 0.0972, + "step": 5186 + }, + { + "epoch": 0.44, + "grad_norm": 0.1996350170747824, + "learning_rate": 6.249554852275835e-06, + "loss": 0.0587, + "step": 5187 + }, + { + "epoch": 0.44, + "grad_norm": 0.21907129191713076, + "learning_rate": 6.248233501693803e-06, + "loss": 0.0592, + "step": 5188 + }, + { + "epoch": 0.44, + "grad_norm": 0.34069128790062725, + "learning_rate": 6.24691205813607e-06, + "loss": 0.0913, + "step": 5189 + }, + { + "epoch": 0.44, + "grad_norm": 0.26382432394386207, + "learning_rate": 6.245590521701063e-06, + "loss": 0.076, + "step": 5190 + }, + { + "epoch": 0.44, + "grad_norm": 0.3866148334623173, + "learning_rate": 6.244268892487217e-06, + "loss": 0.1073, + "step": 5191 + }, + { + "epoch": 0.44, + "grad_norm": 0.6201201587266143, + "learning_rate": 6.242947170592975e-06, + "loss": 0.1169, + "step": 5192 + }, + { + "epoch": 0.44, + "grad_norm": 0.30369501699517293, + "learning_rate": 6.241625356116787e-06, + "loss": 0.0589, + "step": 5193 + }, + { + "epoch": 0.44, + "grad_norm": 0.44604671621660175, + "learning_rate": 6.2403034491571105e-06, + "loss": 0.0878, + "step": 5194 + }, + { + "epoch": 0.44, + "grad_norm": 0.2179603961583201, + "learning_rate": 6.238981449812408e-06, + "loss": 0.0653, + "step": 5195 + }, + { + "epoch": 0.44, + "grad_norm": 0.2624839034369519, + "learning_rate": 6.237659358181149e-06, + "loss": 0.0784, + "step": 5196 + }, + { + "epoch": 0.44, + "grad_norm": 0.34287357104560556, + "learning_rate": 6.23633717436181e-06, + "loss": 0.0779, + "step": 5197 + }, + { + "epoch": 0.44, + "grad_norm": 0.3008478494192351, + "learning_rate": 6.235014898452878e-06, + "loss": 0.077, + "step": 5198 + }, + { + "epoch": 0.44, + "grad_norm": 0.24984795210907965, + "learning_rate": 6.23369253055284e-06, + "loss": 0.0597, + "step": 5199 + }, + { + "epoch": 0.44, + "grad_norm": 0.35262018409534923, + "learning_rate": 6.2323700707601976e-06, + "loss": 0.1057, + "step": 5200 + }, + { + "epoch": 0.44, + "grad_norm": 0.33599549759870734, + "learning_rate": 6.231047519173451e-06, + "loss": 0.0979, + "step": 5201 + }, + { + "epoch": 0.44, + "grad_norm": 0.43934572836200547, + "learning_rate": 6.2297248758911155e-06, + "loss": 0.085, + "step": 5202 + }, + { + "epoch": 0.44, + "grad_norm": 0.2707930690238642, + "learning_rate": 6.228402141011706e-06, + "loss": 0.0363, + "step": 5203 + }, + { + "epoch": 0.44, + "grad_norm": 0.2857447055533513, + "learning_rate": 6.227079314633752e-06, + "loss": 0.0743, + "step": 5204 + }, + { + "epoch": 0.44, + "grad_norm": 0.35087000536764146, + "learning_rate": 6.22575639685578e-06, + "loss": 0.0652, + "step": 5205 + }, + { + "epoch": 0.44, + "grad_norm": 0.29241830136015806, + "learning_rate": 6.224433387776332e-06, + "loss": 0.0875, + "step": 5206 + }, + { + "epoch": 0.44, + "grad_norm": 0.4078855011623417, + "learning_rate": 6.223110287493952e-06, + "loss": 0.0724, + "step": 5207 + }, + { + "epoch": 0.44, + "grad_norm": 0.32130080820204526, + "learning_rate": 6.221787096107192e-06, + "loss": 0.0836, + "step": 5208 + }, + { + "epoch": 0.44, + "grad_norm": 0.32063810201623266, + "learning_rate": 6.220463813714614e-06, + "loss": 0.0632, + "step": 5209 + }, + { + "epoch": 0.44, + "grad_norm": 0.3177714705667108, + "learning_rate": 6.219140440414778e-06, + "loss": 0.0957, + "step": 5210 + }, + { + "epoch": 0.44, + "grad_norm": 0.24373807562540997, + "learning_rate": 6.2178169763062614e-06, + "loss": 0.0536, + "step": 5211 + }, + { + "epoch": 0.44, + "grad_norm": 0.4192546314142608, + "learning_rate": 6.216493421487643e-06, + "loss": 0.1231, + "step": 5212 + }, + { + "epoch": 0.44, + "grad_norm": 0.455772876853517, + "learning_rate": 6.215169776057507e-06, + "loss": 0.084, + "step": 5213 + }, + { + "epoch": 0.44, + "grad_norm": 0.33006179647951395, + "learning_rate": 6.2138460401144475e-06, + "loss": 0.0295, + "step": 5214 + }, + { + "epoch": 0.44, + "grad_norm": 0.8422524354794191, + "learning_rate": 6.212522213757064e-06, + "loss": 0.1792, + "step": 5215 + }, + { + "epoch": 0.44, + "grad_norm": 0.18670697645153825, + "learning_rate": 6.211198297083961e-06, + "loss": 0.038, + "step": 5216 + }, + { + "epoch": 0.44, + "grad_norm": 0.2764012360566822, + "learning_rate": 6.209874290193755e-06, + "loss": 0.093, + "step": 5217 + }, + { + "epoch": 0.44, + "grad_norm": 0.4644613658355343, + "learning_rate": 6.208550193185063e-06, + "loss": 0.1174, + "step": 5218 + }, + { + "epoch": 0.44, + "grad_norm": 0.16372993976411004, + "learning_rate": 6.207226006156512e-06, + "loss": 0.0455, + "step": 5219 + }, + { + "epoch": 0.44, + "grad_norm": 0.268957705241708, + "learning_rate": 6.205901729206736e-06, + "loss": 0.0723, + "step": 5220 + }, + { + "epoch": 0.44, + "grad_norm": 0.1879080371373582, + "learning_rate": 6.204577362434375e-06, + "loss": 0.0395, + "step": 5221 + }, + { + "epoch": 0.44, + "grad_norm": 0.18541568260756455, + "learning_rate": 6.203252905938074e-06, + "loss": 0.0589, + "step": 5222 + }, + { + "epoch": 0.44, + "grad_norm": 0.21184262704789444, + "learning_rate": 6.201928359816487e-06, + "loss": 0.0651, + "step": 5223 + }, + { + "epoch": 0.44, + "grad_norm": 0.24888988392936323, + "learning_rate": 6.200603724168275e-06, + "loss": 0.0859, + "step": 5224 + }, + { + "epoch": 0.44, + "grad_norm": 0.20855703975492812, + "learning_rate": 6.199278999092102e-06, + "loss": 0.053, + "step": 5225 + }, + { + "epoch": 0.44, + "grad_norm": 0.32001081232409684, + "learning_rate": 6.197954184686644e-06, + "loss": 0.0718, + "step": 5226 + }, + { + "epoch": 0.44, + "grad_norm": 0.40238514890779303, + "learning_rate": 6.196629281050579e-06, + "loss": 0.0674, + "step": 5227 + }, + { + "epoch": 0.44, + "grad_norm": 0.34154181015128565, + "learning_rate": 6.195304288282594e-06, + "loss": 0.0895, + "step": 5228 + }, + { + "epoch": 0.44, + "grad_norm": 0.23415557798416675, + "learning_rate": 6.1939792064813846e-06, + "loss": 0.0898, + "step": 5229 + }, + { + "epoch": 0.44, + "grad_norm": 0.374382807663893, + "learning_rate": 6.192654035745646e-06, + "loss": 0.1142, + "step": 5230 + }, + { + "epoch": 0.44, + "grad_norm": 0.28467842116791237, + "learning_rate": 6.191328776174088e-06, + "loss": 0.0915, + "step": 5231 + }, + { + "epoch": 0.44, + "grad_norm": 0.3292240351475248, + "learning_rate": 6.190003427865422e-06, + "loss": 0.1224, + "step": 5232 + }, + { + "epoch": 0.44, + "grad_norm": 0.25746160314004013, + "learning_rate": 6.188677990918369e-06, + "loss": 0.0737, + "step": 5233 + }, + { + "epoch": 0.44, + "grad_norm": 0.18337006351164734, + "learning_rate": 6.187352465431655e-06, + "loss": 0.0237, + "step": 5234 + }, + { + "epoch": 0.44, + "grad_norm": 0.16741479630572878, + "learning_rate": 6.186026851504012e-06, + "loss": 0.046, + "step": 5235 + }, + { + "epoch": 0.44, + "grad_norm": 0.31670212416517907, + "learning_rate": 6.184701149234181e-06, + "loss": 0.1054, + "step": 5236 + }, + { + "epoch": 0.44, + "grad_norm": 0.3256155635478663, + "learning_rate": 6.183375358720907e-06, + "loss": 0.0973, + "step": 5237 + }, + { + "epoch": 0.44, + "grad_norm": 0.3372282515677099, + "learning_rate": 6.1820494800629415e-06, + "loss": 0.0573, + "step": 5238 + }, + { + "epoch": 0.44, + "grad_norm": 0.14162739478644318, + "learning_rate": 6.180723513359045e-06, + "loss": 0.0258, + "step": 5239 + }, + { + "epoch": 0.44, + "grad_norm": 0.2614610261841953, + "learning_rate": 6.179397458707984e-06, + "loss": 0.0743, + "step": 5240 + }, + { + "epoch": 0.44, + "grad_norm": 0.45798692754897236, + "learning_rate": 6.178071316208529e-06, + "loss": 0.1214, + "step": 5241 + }, + { + "epoch": 0.44, + "grad_norm": 0.2606433617573775, + "learning_rate": 6.17674508595946e-06, + "loss": 0.0687, + "step": 5242 + }, + { + "epoch": 0.44, + "grad_norm": 0.3858788305461409, + "learning_rate": 6.175418768059562e-06, + "loss": 0.0864, + "step": 5243 + }, + { + "epoch": 0.44, + "grad_norm": 0.22879257836160302, + "learning_rate": 6.174092362607627e-06, + "loss": 0.0567, + "step": 5244 + }, + { + "epoch": 0.44, + "grad_norm": 0.31052984314295756, + "learning_rate": 6.1727658697024505e-06, + "loss": 0.0564, + "step": 5245 + }, + { + "epoch": 0.44, + "grad_norm": 0.3993084983310892, + "learning_rate": 6.171439289442843e-06, + "loss": 0.0819, + "step": 5246 + }, + { + "epoch": 0.44, + "grad_norm": 0.21731570647463253, + "learning_rate": 6.170112621927613e-06, + "loss": 0.0482, + "step": 5247 + }, + { + "epoch": 0.44, + "grad_norm": 0.4334208200535504, + "learning_rate": 6.168785867255578e-06, + "loss": 0.1036, + "step": 5248 + }, + { + "epoch": 0.44, + "grad_norm": 0.2757908625035082, + "learning_rate": 6.1674590255255625e-06, + "loss": 0.0694, + "step": 5249 + }, + { + "epoch": 0.44, + "grad_norm": 0.29254399750667776, + "learning_rate": 6.166132096836399e-06, + "loss": 0.0488, + "step": 5250 + }, + { + "epoch": 0.44, + "grad_norm": 0.36610907681238747, + "learning_rate": 6.164805081286922e-06, + "loss": 0.0788, + "step": 5251 + }, + { + "epoch": 0.44, + "grad_norm": 0.6990823988400623, + "learning_rate": 6.163477978975978e-06, + "loss": 0.0912, + "step": 5252 + }, + { + "epoch": 0.44, + "grad_norm": 0.3985405688150222, + "learning_rate": 6.162150790002417e-06, + "loss": 0.0715, + "step": 5253 + }, + { + "epoch": 0.44, + "grad_norm": 0.32863635148769466, + "learning_rate": 6.160823514465093e-06, + "loss": 0.0805, + "step": 5254 + }, + { + "epoch": 0.44, + "grad_norm": 0.1964920384892367, + "learning_rate": 6.1594961524628735e-06, + "loss": 0.0396, + "step": 5255 + }, + { + "epoch": 0.44, + "grad_norm": 0.3291273929905797, + "learning_rate": 6.158168704094625e-06, + "loss": 0.0524, + "step": 5256 + }, + { + "epoch": 0.44, + "grad_norm": 0.39900875170454553, + "learning_rate": 6.156841169459225e-06, + "loss": 0.1108, + "step": 5257 + }, + { + "epoch": 0.44, + "grad_norm": 0.34635101732129814, + "learning_rate": 6.155513548655556e-06, + "loss": 0.115, + "step": 5258 + }, + { + "epoch": 0.44, + "grad_norm": 0.5700847961066906, + "learning_rate": 6.154185841782505e-06, + "loss": 0.1235, + "step": 5259 + }, + { + "epoch": 0.44, + "grad_norm": 0.5093442194896075, + "learning_rate": 6.152858048938971e-06, + "loss": 0.0983, + "step": 5260 + }, + { + "epoch": 0.44, + "grad_norm": 0.40187369354248026, + "learning_rate": 6.151530170223852e-06, + "loss": 0.0716, + "step": 5261 + }, + { + "epoch": 0.44, + "grad_norm": 0.23064170997759925, + "learning_rate": 6.1502022057360575e-06, + "loss": 0.0573, + "step": 5262 + }, + { + "epoch": 0.44, + "grad_norm": 0.4203572895242865, + "learning_rate": 6.1488741555745036e-06, + "loss": 0.0931, + "step": 5263 + }, + { + "epoch": 0.44, + "grad_norm": 0.5294580157460433, + "learning_rate": 6.147546019838109e-06, + "loss": 0.1184, + "step": 5264 + }, + { + "epoch": 0.44, + "grad_norm": 0.307256022814954, + "learning_rate": 6.146217798625803e-06, + "loss": 0.1003, + "step": 5265 + }, + { + "epoch": 0.44, + "grad_norm": 0.26042322270279156, + "learning_rate": 6.144889492036516e-06, + "loss": 0.051, + "step": 5266 + }, + { + "epoch": 0.44, + "grad_norm": 0.29410465015178416, + "learning_rate": 6.143561100169193e-06, + "loss": 0.0669, + "step": 5267 + }, + { + "epoch": 0.44, + "grad_norm": 0.5136536547199391, + "learning_rate": 6.142232623122775e-06, + "loss": 0.1134, + "step": 5268 + }, + { + "epoch": 0.44, + "grad_norm": 0.20144584319530556, + "learning_rate": 6.140904060996218e-06, + "loss": 0.0462, + "step": 5269 + }, + { + "epoch": 0.44, + "grad_norm": 1.1016368380696153, + "learning_rate": 6.139575413888481e-06, + "loss": 0.136, + "step": 5270 + }, + { + "epoch": 0.44, + "grad_norm": 0.33286888319463154, + "learning_rate": 6.138246681898528e-06, + "loss": 0.0805, + "step": 5271 + }, + { + "epoch": 0.44, + "grad_norm": 0.30880960494374793, + "learning_rate": 6.1369178651253325e-06, + "loss": 0.0616, + "step": 5272 + }, + { + "epoch": 0.44, + "grad_norm": 0.25163701847294884, + "learning_rate": 6.13558896366787e-06, + "loss": 0.0708, + "step": 5273 + }, + { + "epoch": 0.44, + "grad_norm": 0.5165211940623574, + "learning_rate": 6.134259977625128e-06, + "loss": 0.1065, + "step": 5274 + }, + { + "epoch": 0.44, + "grad_norm": 0.3129297548190298, + "learning_rate": 6.132930907096094e-06, + "loss": 0.0555, + "step": 5275 + }, + { + "epoch": 0.44, + "grad_norm": 0.27842362431688267, + "learning_rate": 6.131601752179765e-06, + "loss": 0.0585, + "step": 5276 + }, + { + "epoch": 0.44, + "grad_norm": 0.2918712735621903, + "learning_rate": 6.130272512975147e-06, + "loss": 0.0741, + "step": 5277 + }, + { + "epoch": 0.44, + "grad_norm": 0.330251987805454, + "learning_rate": 6.128943189581248e-06, + "loss": 0.1309, + "step": 5278 + }, + { + "epoch": 0.44, + "grad_norm": 0.23201982937692855, + "learning_rate": 6.1276137820970825e-06, + "loss": 0.0666, + "step": 5279 + }, + { + "epoch": 0.44, + "grad_norm": 0.25492387900104096, + "learning_rate": 6.126284290621674e-06, + "loss": 0.0933, + "step": 5280 + }, + { + "epoch": 0.44, + "grad_norm": 0.2718308543836977, + "learning_rate": 6.124954715254051e-06, + "loss": 0.1145, + "step": 5281 + }, + { + "epoch": 0.45, + "grad_norm": 0.47840132991708734, + "learning_rate": 6.123625056093248e-06, + "loss": 0.0779, + "step": 5282 + }, + { + "epoch": 0.45, + "grad_norm": 0.2403083600987502, + "learning_rate": 6.122295313238304e-06, + "loss": 0.0731, + "step": 5283 + }, + { + "epoch": 0.45, + "grad_norm": 0.2322454846665232, + "learning_rate": 6.120965486788268e-06, + "loss": 0.0516, + "step": 5284 + }, + { + "epoch": 0.45, + "grad_norm": 0.20172763897515666, + "learning_rate": 6.119635576842192e-06, + "loss": 0.0615, + "step": 5285 + }, + { + "epoch": 0.45, + "grad_norm": 0.1665346362962438, + "learning_rate": 6.118305583499135e-06, + "loss": 0.0635, + "step": 5286 + }, + { + "epoch": 0.45, + "grad_norm": 0.2628097734887062, + "learning_rate": 6.116975506858166e-06, + "loss": 0.0616, + "step": 5287 + }, + { + "epoch": 0.45, + "grad_norm": 0.30032685559774974, + "learning_rate": 6.115645347018352e-06, + "loss": 0.0531, + "step": 5288 + }, + { + "epoch": 0.45, + "grad_norm": 0.4209492939459478, + "learning_rate": 6.1143151040787755e-06, + "loss": 0.1033, + "step": 5289 + }, + { + "epoch": 0.45, + "grad_norm": 0.3261313461188077, + "learning_rate": 6.112984778138516e-06, + "loss": 0.0854, + "step": 5290 + }, + { + "epoch": 0.45, + "grad_norm": 0.13389376858375338, + "learning_rate": 6.111654369296669e-06, + "loss": 0.0358, + "step": 5291 + }, + { + "epoch": 0.45, + "grad_norm": 0.3445051395951685, + "learning_rate": 6.110323877652328e-06, + "loss": 0.1095, + "step": 5292 + }, + { + "epoch": 0.45, + "grad_norm": 0.3742259378281943, + "learning_rate": 6.108993303304596e-06, + "loss": 0.1113, + "step": 5293 + }, + { + "epoch": 0.45, + "grad_norm": 0.3218806643812006, + "learning_rate": 6.107662646352584e-06, + "loss": 0.0993, + "step": 5294 + }, + { + "epoch": 0.45, + "grad_norm": 0.25177948362326225, + "learning_rate": 6.106331906895403e-06, + "loss": 0.0786, + "step": 5295 + }, + { + "epoch": 0.45, + "grad_norm": 0.316242319175783, + "learning_rate": 6.105001085032178e-06, + "loss": 0.083, + "step": 5296 + }, + { + "epoch": 0.45, + "grad_norm": 0.20377348322281413, + "learning_rate": 6.103670180862036e-06, + "loss": 0.0405, + "step": 5297 + }, + { + "epoch": 0.45, + "grad_norm": 0.2710851318933867, + "learning_rate": 6.102339194484107e-06, + "loss": 0.0645, + "step": 5298 + }, + { + "epoch": 0.45, + "grad_norm": 0.3639422896700517, + "learning_rate": 6.1010081259975366e-06, + "loss": 0.1263, + "step": 5299 + }, + { + "epoch": 0.45, + "grad_norm": 0.41266385451860965, + "learning_rate": 6.099676975501464e-06, + "loss": 0.1056, + "step": 5300 + }, + { + "epoch": 0.45, + "grad_norm": 0.5356794486774135, + "learning_rate": 6.0983457430950466e-06, + "loss": 0.1075, + "step": 5301 + }, + { + "epoch": 0.45, + "grad_norm": 0.34534845977928014, + "learning_rate": 6.097014428877438e-06, + "loss": 0.1018, + "step": 5302 + }, + { + "epoch": 0.45, + "grad_norm": 0.3434918712563387, + "learning_rate": 6.095683032947805e-06, + "loss": 0.0964, + "step": 5303 + }, + { + "epoch": 0.45, + "grad_norm": 0.45976494885796537, + "learning_rate": 6.0943515554053165e-06, + "loss": 0.1267, + "step": 5304 + }, + { + "epoch": 0.45, + "grad_norm": 0.33969733743416625, + "learning_rate": 6.0930199963491496e-06, + "loss": 0.0755, + "step": 5305 + }, + { + "epoch": 0.45, + "grad_norm": 0.2712116293292752, + "learning_rate": 6.0916883558784865e-06, + "loss": 0.0781, + "step": 5306 + }, + { + "epoch": 0.45, + "grad_norm": 0.3796384927373055, + "learning_rate": 6.090356634092513e-06, + "loss": 0.074, + "step": 5307 + }, + { + "epoch": 0.45, + "grad_norm": 0.3556636637485662, + "learning_rate": 6.089024831090429e-06, + "loss": 0.0584, + "step": 5308 + }, + { + "epoch": 0.45, + "grad_norm": 0.19135626383308632, + "learning_rate": 6.08769294697143e-06, + "loss": 0.0418, + "step": 5309 + }, + { + "epoch": 0.45, + "grad_norm": 0.442098643807956, + "learning_rate": 6.0863609818347235e-06, + "loss": 0.1043, + "step": 5310 + }, + { + "epoch": 0.45, + "grad_norm": 0.20451789830048572, + "learning_rate": 6.085028935779523e-06, + "loss": 0.0589, + "step": 5311 + }, + { + "epoch": 0.45, + "grad_norm": 0.4664615389241614, + "learning_rate": 6.083696808905046e-06, + "loss": 0.127, + "step": 5312 + }, + { + "epoch": 0.45, + "grad_norm": 0.36286836154522245, + "learning_rate": 6.082364601310518e-06, + "loss": 0.0945, + "step": 5313 + }, + { + "epoch": 0.45, + "grad_norm": 0.34145616084273633, + "learning_rate": 6.08103231309517e-06, + "loss": 0.0832, + "step": 5314 + }, + { + "epoch": 0.45, + "grad_norm": 0.24564504275421514, + "learning_rate": 6.079699944358236e-06, + "loss": 0.0679, + "step": 5315 + }, + { + "epoch": 0.45, + "grad_norm": 0.2836701054721182, + "learning_rate": 6.078367495198963e-06, + "loss": 0.0812, + "step": 5316 + }, + { + "epoch": 0.45, + "grad_norm": 0.3013302779259131, + "learning_rate": 6.077034965716594e-06, + "loss": 0.123, + "step": 5317 + }, + { + "epoch": 0.45, + "grad_norm": 0.2398990555826029, + "learning_rate": 6.075702356010389e-06, + "loss": 0.0607, + "step": 5318 + }, + { + "epoch": 0.45, + "grad_norm": 0.3291590887858968, + "learning_rate": 6.0743696661796045e-06, + "loss": 0.0703, + "step": 5319 + }, + { + "epoch": 0.45, + "grad_norm": 0.20390315647094814, + "learning_rate": 6.073036896323508e-06, + "loss": 0.0903, + "step": 5320 + }, + { + "epoch": 0.45, + "grad_norm": 0.30712055957935314, + "learning_rate": 6.0717040465413755e-06, + "loss": 0.1032, + "step": 5321 + }, + { + "epoch": 0.45, + "grad_norm": 0.3077468754094887, + "learning_rate": 6.070371116932481e-06, + "loss": 0.0854, + "step": 5322 + }, + { + "epoch": 0.45, + "grad_norm": 0.33880899280042637, + "learning_rate": 6.0690381075961106e-06, + "loss": 0.0722, + "step": 5323 + }, + { + "epoch": 0.45, + "grad_norm": 0.327678207447578, + "learning_rate": 6.067705018631553e-06, + "loss": 0.0594, + "step": 5324 + }, + { + "epoch": 0.45, + "grad_norm": 0.35960992797103836, + "learning_rate": 6.066371850138106e-06, + "loss": 0.105, + "step": 5325 + }, + { + "epoch": 0.45, + "grad_norm": 0.5613633055349678, + "learning_rate": 6.065038602215073e-06, + "loss": 0.128, + "step": 5326 + }, + { + "epoch": 0.45, + "grad_norm": 0.45614752188697993, + "learning_rate": 6.06370527496176e-06, + "loss": 0.0599, + "step": 5327 + }, + { + "epoch": 0.45, + "grad_norm": 0.344320021583728, + "learning_rate": 6.0623718684774816e-06, + "loss": 0.1022, + "step": 5328 + }, + { + "epoch": 0.45, + "grad_norm": 0.2617636852821932, + "learning_rate": 6.061038382861557e-06, + "loss": 0.0788, + "step": 5329 + }, + { + "epoch": 0.45, + "grad_norm": 0.531924953798233, + "learning_rate": 6.0597048182133135e-06, + "loss": 0.1478, + "step": 5330 + }, + { + "epoch": 0.45, + "grad_norm": 0.2865681723873686, + "learning_rate": 6.058371174632081e-06, + "loss": 0.0645, + "step": 5331 + }, + { + "epoch": 0.45, + "grad_norm": 0.5234901913648033, + "learning_rate": 6.057037452217198e-06, + "loss": 0.0883, + "step": 5332 + }, + { + "epoch": 0.45, + "grad_norm": 0.1575635283649197, + "learning_rate": 6.055703651068009e-06, + "loss": 0.0469, + "step": 5333 + }, + { + "epoch": 0.45, + "grad_norm": 0.3830883326555604, + "learning_rate": 6.054369771283861e-06, + "loss": 0.11, + "step": 5334 + }, + { + "epoch": 0.45, + "grad_norm": 0.2565821085185686, + "learning_rate": 6.053035812964111e-06, + "loss": 0.0677, + "step": 5335 + }, + { + "epoch": 0.45, + "grad_norm": 0.28226342991490844, + "learning_rate": 6.051701776208119e-06, + "loss": 0.0742, + "step": 5336 + }, + { + "epoch": 0.45, + "grad_norm": 0.284647734978893, + "learning_rate": 6.050367661115251e-06, + "loss": 0.0705, + "step": 5337 + }, + { + "epoch": 0.45, + "grad_norm": 0.35946088997305786, + "learning_rate": 6.049033467784883e-06, + "loss": 0.0903, + "step": 5338 + }, + { + "epoch": 0.45, + "grad_norm": 0.20383545277218626, + "learning_rate": 6.047699196316389e-06, + "loss": 0.0753, + "step": 5339 + }, + { + "epoch": 0.45, + "grad_norm": 0.4639088175511074, + "learning_rate": 6.046364846809157e-06, + "loss": 0.0562, + "step": 5340 + }, + { + "epoch": 0.45, + "grad_norm": 0.2276505730846899, + "learning_rate": 6.045030419362575e-06, + "loss": 0.0656, + "step": 5341 + }, + { + "epoch": 0.45, + "grad_norm": 0.44853535073360584, + "learning_rate": 6.043695914076041e-06, + "loss": 0.1008, + "step": 5342 + }, + { + "epoch": 0.45, + "grad_norm": 0.2719509404814123, + "learning_rate": 6.042361331048955e-06, + "loss": 0.0903, + "step": 5343 + }, + { + "epoch": 0.45, + "grad_norm": 0.4498368931815228, + "learning_rate": 6.041026670380723e-06, + "loss": 0.0878, + "step": 5344 + }, + { + "epoch": 0.45, + "grad_norm": 0.42161491126361156, + "learning_rate": 6.039691932170762e-06, + "loss": 0.0938, + "step": 5345 + }, + { + "epoch": 0.45, + "grad_norm": 0.45854043176362996, + "learning_rate": 6.038357116518489e-06, + "loss": 0.1193, + "step": 5346 + }, + { + "epoch": 0.45, + "grad_norm": 0.26355164424531924, + "learning_rate": 6.037022223523328e-06, + "loss": 0.0837, + "step": 5347 + }, + { + "epoch": 0.45, + "grad_norm": 0.28987998376162316, + "learning_rate": 6.035687253284711e-06, + "loss": 0.0949, + "step": 5348 + }, + { + "epoch": 0.45, + "grad_norm": 0.5031540499759544, + "learning_rate": 6.034352205902074e-06, + "loss": 0.113, + "step": 5349 + }, + { + "epoch": 0.45, + "grad_norm": 0.3088277986263113, + "learning_rate": 6.03301708147486e-06, + "loss": 0.0788, + "step": 5350 + }, + { + "epoch": 0.45, + "grad_norm": 0.5342678592105653, + "learning_rate": 6.031681880102515e-06, + "loss": 0.1196, + "step": 5351 + }, + { + "epoch": 0.45, + "grad_norm": 0.17505078876655264, + "learning_rate": 6.030346601884494e-06, + "loss": 0.0412, + "step": 5352 + }, + { + "epoch": 0.45, + "grad_norm": 0.23158006036776607, + "learning_rate": 6.029011246920257e-06, + "loss": 0.0669, + "step": 5353 + }, + { + "epoch": 0.45, + "grad_norm": 0.3866824063089273, + "learning_rate": 6.027675815309265e-06, + "loss": 0.068, + "step": 5354 + }, + { + "epoch": 0.45, + "grad_norm": 0.507954179326806, + "learning_rate": 6.0263403071509955e-06, + "loss": 0.1028, + "step": 5355 + }, + { + "epoch": 0.45, + "grad_norm": 0.201664224253523, + "learning_rate": 6.025004722544918e-06, + "loss": 0.0621, + "step": 5356 + }, + { + "epoch": 0.45, + "grad_norm": 0.2728142522919522, + "learning_rate": 6.023669061590519e-06, + "loss": 0.0799, + "step": 5357 + }, + { + "epoch": 0.45, + "grad_norm": 0.4242176150960372, + "learning_rate": 6.022333324387284e-06, + "loss": 0.094, + "step": 5358 + }, + { + "epoch": 0.45, + "grad_norm": 0.253479321393804, + "learning_rate": 6.020997511034709e-06, + "loss": 0.083, + "step": 5359 + }, + { + "epoch": 0.45, + "grad_norm": 0.34327452023089416, + "learning_rate": 6.01966162163229e-06, + "loss": 0.1044, + "step": 5360 + }, + { + "epoch": 0.45, + "grad_norm": 0.419842383178155, + "learning_rate": 6.018325656279535e-06, + "loss": 0.1473, + "step": 5361 + }, + { + "epoch": 0.45, + "grad_norm": 0.21744188842082793, + "learning_rate": 6.016989615075952e-06, + "loss": 0.0761, + "step": 5362 + }, + { + "epoch": 0.45, + "grad_norm": 0.38278722044476043, + "learning_rate": 6.015653498121058e-06, + "loss": 0.1036, + "step": 5363 + }, + { + "epoch": 0.45, + "grad_norm": 0.2568964799739123, + "learning_rate": 6.014317305514376e-06, + "loss": 0.0578, + "step": 5364 + }, + { + "epoch": 0.45, + "grad_norm": 0.3228270671877277, + "learning_rate": 6.01298103735543e-06, + "loss": 0.0428, + "step": 5365 + }, + { + "epoch": 0.45, + "grad_norm": 0.4603803037031919, + "learning_rate": 6.011644693743756e-06, + "loss": 0.1034, + "step": 5366 + }, + { + "epoch": 0.45, + "grad_norm": 0.29767446653696644, + "learning_rate": 6.010308274778893e-06, + "loss": 0.0595, + "step": 5367 + }, + { + "epoch": 0.45, + "grad_norm": 0.24804175699966854, + "learning_rate": 6.008971780560384e-06, + "loss": 0.0615, + "step": 5368 + }, + { + "epoch": 0.45, + "grad_norm": 0.9415976368171958, + "learning_rate": 6.0076352111877785e-06, + "loss": 0.1487, + "step": 5369 + }, + { + "epoch": 0.45, + "grad_norm": 0.4642217571132315, + "learning_rate": 6.0062985667606336e-06, + "loss": 0.1361, + "step": 5370 + }, + { + "epoch": 0.45, + "grad_norm": 0.24236977539073618, + "learning_rate": 6.004961847378508e-06, + "loss": 0.0696, + "step": 5371 + }, + { + "epoch": 0.45, + "grad_norm": 0.31704934013023744, + "learning_rate": 6.003625053140971e-06, + "loss": 0.0877, + "step": 5372 + }, + { + "epoch": 0.45, + "grad_norm": 0.20176125855510404, + "learning_rate": 6.0022881841475926e-06, + "loss": 0.0282, + "step": 5373 + }, + { + "epoch": 0.45, + "grad_norm": 0.4372488176025192, + "learning_rate": 6.000951240497952e-06, + "loss": 0.1032, + "step": 5374 + }, + { + "epoch": 0.45, + "grad_norm": 0.38813171225862125, + "learning_rate": 5.999614222291631e-06, + "loss": 0.0944, + "step": 5375 + }, + { + "epoch": 0.45, + "grad_norm": 0.20913893591875704, + "learning_rate": 5.998277129628221e-06, + "loss": 0.0531, + "step": 5376 + }, + { + "epoch": 0.45, + "grad_norm": 0.36742366619963834, + "learning_rate": 5.996939962607314e-06, + "loss": 0.0854, + "step": 5377 + }, + { + "epoch": 0.45, + "grad_norm": 0.3281824636655971, + "learning_rate": 5.995602721328511e-06, + "loss": 0.1037, + "step": 5378 + }, + { + "epoch": 0.45, + "grad_norm": 0.4207613209814052, + "learning_rate": 5.9942654058914184e-06, + "loss": 0.0579, + "step": 5379 + }, + { + "epoch": 0.45, + "grad_norm": 0.3772755919900469, + "learning_rate": 5.992928016395646e-06, + "loss": 0.0953, + "step": 5380 + }, + { + "epoch": 0.45, + "grad_norm": 0.3206313646534068, + "learning_rate": 5.9915905529408105e-06, + "loss": 0.1031, + "step": 5381 + }, + { + "epoch": 0.45, + "grad_norm": 0.1735927131276033, + "learning_rate": 5.990253015626534e-06, + "loss": 0.0428, + "step": 5382 + }, + { + "epoch": 0.45, + "grad_norm": 0.35059837178469994, + "learning_rate": 5.988915404552443e-06, + "loss": 0.1164, + "step": 5383 + }, + { + "epoch": 0.45, + "grad_norm": 0.22522239498583646, + "learning_rate": 5.987577719818174e-06, + "loss": 0.0467, + "step": 5384 + }, + { + "epoch": 0.45, + "grad_norm": 0.30030009583985073, + "learning_rate": 5.986239961523362e-06, + "loss": 0.0923, + "step": 5385 + }, + { + "epoch": 0.45, + "grad_norm": 0.2846834646501469, + "learning_rate": 5.984902129767654e-06, + "loss": 0.0869, + "step": 5386 + }, + { + "epoch": 0.45, + "grad_norm": 0.23855077691604534, + "learning_rate": 5.983564224650696e-06, + "loss": 0.0715, + "step": 5387 + }, + { + "epoch": 0.45, + "grad_norm": 0.2828577573257001, + "learning_rate": 5.982226246272145e-06, + "loss": 0.0609, + "step": 5388 + }, + { + "epoch": 0.45, + "grad_norm": 0.32854683418853087, + "learning_rate": 5.980888194731664e-06, + "loss": 0.0906, + "step": 5389 + }, + { + "epoch": 0.45, + "grad_norm": 0.24760203766003816, + "learning_rate": 5.979550070128912e-06, + "loss": 0.0753, + "step": 5390 + }, + { + "epoch": 0.45, + "grad_norm": 0.18559918192886488, + "learning_rate": 5.978211872563567e-06, + "loss": 0.0629, + "step": 5391 + }, + { + "epoch": 0.45, + "grad_norm": 0.4420638735545099, + "learning_rate": 5.9768736021353026e-06, + "loss": 0.1124, + "step": 5392 + }, + { + "epoch": 0.45, + "grad_norm": 0.40955422044635365, + "learning_rate": 5.9755352589438035e-06, + "loss": 0.1171, + "step": 5393 + }, + { + "epoch": 0.45, + "grad_norm": 0.36434886894430873, + "learning_rate": 5.974196843088752e-06, + "loss": 0.0861, + "step": 5394 + }, + { + "epoch": 0.45, + "grad_norm": 0.35350580419269484, + "learning_rate": 5.972858354669846e-06, + "loss": 0.0844, + "step": 5395 + }, + { + "epoch": 0.45, + "grad_norm": 0.314795612280872, + "learning_rate": 5.971519793786783e-06, + "loss": 0.1084, + "step": 5396 + }, + { + "epoch": 0.45, + "grad_norm": 0.28266219250668395, + "learning_rate": 5.9701811605392655e-06, + "loss": 0.0904, + "step": 5397 + }, + { + "epoch": 0.45, + "grad_norm": 0.37303693149487516, + "learning_rate": 5.968842455027004e-06, + "loss": 0.1007, + "step": 5398 + }, + { + "epoch": 0.45, + "grad_norm": 0.292483999253837, + "learning_rate": 5.967503677349713e-06, + "loss": 0.0819, + "step": 5399 + }, + { + "epoch": 0.46, + "grad_norm": 0.384219057211456, + "learning_rate": 5.96616482760711e-06, + "loss": 0.0672, + "step": 5400 + }, + { + "epoch": 0.46, + "grad_norm": 0.271373697367901, + "learning_rate": 5.964825905898924e-06, + "loss": 0.0663, + "step": 5401 + }, + { + "epoch": 0.46, + "grad_norm": 0.28328150845718114, + "learning_rate": 5.963486912324884e-06, + "loss": 0.0832, + "step": 5402 + }, + { + "epoch": 0.46, + "grad_norm": 0.19124636918266488, + "learning_rate": 5.962147846984726e-06, + "loss": 0.0711, + "step": 5403 + }, + { + "epoch": 0.46, + "grad_norm": 0.2866461625574067, + "learning_rate": 5.960808709978191e-06, + "loss": 0.0808, + "step": 5404 + }, + { + "epoch": 0.46, + "grad_norm": 0.24223384690506577, + "learning_rate": 5.959469501405025e-06, + "loss": 0.0645, + "step": 5405 + }, + { + "epoch": 0.46, + "grad_norm": 0.17515017669779412, + "learning_rate": 5.958130221364984e-06, + "loss": 0.0454, + "step": 5406 + }, + { + "epoch": 0.46, + "grad_norm": 0.44491585820785906, + "learning_rate": 5.956790869957821e-06, + "loss": 0.0672, + "step": 5407 + }, + { + "epoch": 0.46, + "grad_norm": 0.3001242145935283, + "learning_rate": 5.9554514472833e-06, + "loss": 0.0695, + "step": 5408 + }, + { + "epoch": 0.46, + "grad_norm": 0.522825204285594, + "learning_rate": 5.95411195344119e-06, + "loss": 0.0952, + "step": 5409 + }, + { + "epoch": 0.46, + "grad_norm": 0.29118676662374365, + "learning_rate": 5.952772388531263e-06, + "loss": 0.1118, + "step": 5410 + }, + { + "epoch": 0.46, + "grad_norm": 0.34546595544419983, + "learning_rate": 5.951432752653299e-06, + "loss": 0.1014, + "step": 5411 + }, + { + "epoch": 0.46, + "grad_norm": 0.2479246892137602, + "learning_rate": 5.950093045907081e-06, + "loss": 0.1029, + "step": 5412 + }, + { + "epoch": 0.46, + "grad_norm": 0.2801591137502635, + "learning_rate": 5.948753268392397e-06, + "loss": 0.0699, + "step": 5413 + }, + { + "epoch": 0.46, + "grad_norm": 0.25349991222741874, + "learning_rate": 5.9474134202090435e-06, + "loss": 0.0557, + "step": 5414 + }, + { + "epoch": 0.46, + "grad_norm": 0.4200883843164058, + "learning_rate": 5.946073501456819e-06, + "loss": 0.1118, + "step": 5415 + }, + { + "epoch": 0.46, + "grad_norm": 0.32163358365090916, + "learning_rate": 5.94473351223553e-06, + "loss": 0.086, + "step": 5416 + }, + { + "epoch": 0.46, + "grad_norm": 0.38542617248132943, + "learning_rate": 5.9433934526449835e-06, + "loss": 0.1317, + "step": 5417 + }, + { + "epoch": 0.46, + "grad_norm": 0.15562172662486587, + "learning_rate": 5.942053322784999e-06, + "loss": 0.0415, + "step": 5418 + }, + { + "epoch": 0.46, + "grad_norm": 0.34882786063223997, + "learning_rate": 5.940713122755394e-06, + "loss": 0.0768, + "step": 5419 + }, + { + "epoch": 0.46, + "grad_norm": 0.6119491340846164, + "learning_rate": 5.939372852655996e-06, + "loss": 0.1381, + "step": 5420 + }, + { + "epoch": 0.46, + "grad_norm": 0.5051897299308056, + "learning_rate": 5.9380325125866355e-06, + "loss": 0.1367, + "step": 5421 + }, + { + "epoch": 0.46, + "grad_norm": 0.33105786807140036, + "learning_rate": 5.936692102647149e-06, + "loss": 0.0863, + "step": 5422 + }, + { + "epoch": 0.46, + "grad_norm": 0.24647067650994745, + "learning_rate": 5.9353516229373775e-06, + "loss": 0.0679, + "step": 5423 + }, + { + "epoch": 0.46, + "grad_norm": 0.26741282140702005, + "learning_rate": 5.934011073557169e-06, + "loss": 0.0577, + "step": 5424 + }, + { + "epoch": 0.46, + "grad_norm": 0.2973598602973295, + "learning_rate": 5.932670454606375e-06, + "loss": 0.0761, + "step": 5425 + }, + { + "epoch": 0.46, + "grad_norm": 0.25193519769041045, + "learning_rate": 5.9313297661848514e-06, + "loss": 0.0483, + "step": 5426 + }, + { + "epoch": 0.46, + "grad_norm": 0.20721343235490933, + "learning_rate": 5.9299890083924625e-06, + "loss": 0.0633, + "step": 5427 + }, + { + "epoch": 0.46, + "grad_norm": 0.17814131641387448, + "learning_rate": 5.9286481813290745e-06, + "loss": 0.0608, + "step": 5428 + }, + { + "epoch": 0.46, + "grad_norm": 0.2612157632117362, + "learning_rate": 5.927307285094559e-06, + "loss": 0.0432, + "step": 5429 + }, + { + "epoch": 0.46, + "grad_norm": 0.43869738915439743, + "learning_rate": 5.925966319788796e-06, + "loss": 0.1045, + "step": 5430 + }, + { + "epoch": 0.46, + "grad_norm": 0.33942885639881754, + "learning_rate": 5.924625285511667e-06, + "loss": 0.092, + "step": 5431 + }, + { + "epoch": 0.46, + "grad_norm": 0.2767786740655042, + "learning_rate": 5.9232841823630605e-06, + "loss": 0.0879, + "step": 5432 + }, + { + "epoch": 0.46, + "grad_norm": 0.2264272886620654, + "learning_rate": 5.92194301044287e-06, + "loss": 0.052, + "step": 5433 + }, + { + "epoch": 0.46, + "grad_norm": 0.24241049628349756, + "learning_rate": 5.9206017698509925e-06, + "loss": 0.0737, + "step": 5434 + }, + { + "epoch": 0.46, + "grad_norm": 0.2693447988420022, + "learning_rate": 5.919260460687333e-06, + "loss": 0.072, + "step": 5435 + }, + { + "epoch": 0.46, + "grad_norm": 0.3303712941790021, + "learning_rate": 5.917919083051798e-06, + "loss": 0.0873, + "step": 5436 + }, + { + "epoch": 0.46, + "grad_norm": 0.34635469598794016, + "learning_rate": 5.916577637044304e-06, + "loss": 0.0756, + "step": 5437 + }, + { + "epoch": 0.46, + "grad_norm": 0.3020661412990145, + "learning_rate": 5.915236122764767e-06, + "loss": 0.1049, + "step": 5438 + }, + { + "epoch": 0.46, + "grad_norm": 0.3609340741305951, + "learning_rate": 5.913894540313113e-06, + "loss": 0.0979, + "step": 5439 + }, + { + "epoch": 0.46, + "grad_norm": 0.3188562655569674, + "learning_rate": 5.912552889789269e-06, + "loss": 0.0553, + "step": 5440 + }, + { + "epoch": 0.46, + "grad_norm": 0.2735123421782616, + "learning_rate": 5.911211171293169e-06, + "loss": 0.0649, + "step": 5441 + }, + { + "epoch": 0.46, + "grad_norm": 0.23665961725644732, + "learning_rate": 5.909869384924756e-06, + "loss": 0.0521, + "step": 5442 + }, + { + "epoch": 0.46, + "grad_norm": 0.45275528368388723, + "learning_rate": 5.908527530783967e-06, + "loss": 0.0726, + "step": 5443 + }, + { + "epoch": 0.46, + "grad_norm": 0.39781500716977575, + "learning_rate": 5.907185608970758e-06, + "loss": 0.0866, + "step": 5444 + }, + { + "epoch": 0.46, + "grad_norm": 0.9431804063748954, + "learning_rate": 5.905843619585079e-06, + "loss": 0.1445, + "step": 5445 + }, + { + "epoch": 0.46, + "grad_norm": 0.23786262645916229, + "learning_rate": 5.90450156272689e-06, + "loss": 0.0646, + "step": 5446 + }, + { + "epoch": 0.46, + "grad_norm": 0.29672031518352865, + "learning_rate": 5.903159438496156e-06, + "loss": 0.08, + "step": 5447 + }, + { + "epoch": 0.46, + "grad_norm": 0.27118386912964354, + "learning_rate": 5.901817246992845e-06, + "loss": 0.0862, + "step": 5448 + }, + { + "epoch": 0.46, + "grad_norm": 0.5252336068494479, + "learning_rate": 5.900474988316932e-06, + "loss": 0.1175, + "step": 5449 + }, + { + "epoch": 0.46, + "grad_norm": 0.30824635135407047, + "learning_rate": 5.8991326625683965e-06, + "loss": 0.0831, + "step": 5450 + }, + { + "epoch": 0.46, + "grad_norm": 0.2939412791118779, + "learning_rate": 5.897790269847221e-06, + "loss": 0.1046, + "step": 5451 + }, + { + "epoch": 0.46, + "grad_norm": 0.4516006533442279, + "learning_rate": 5.896447810253399e-06, + "loss": 0.0836, + "step": 5452 + }, + { + "epoch": 0.46, + "grad_norm": 0.2632207720016078, + "learning_rate": 5.895105283886919e-06, + "loss": 0.0886, + "step": 5453 + }, + { + "epoch": 0.46, + "grad_norm": 0.3941103841087465, + "learning_rate": 5.893762690847786e-06, + "loss": 0.0782, + "step": 5454 + }, + { + "epoch": 0.46, + "grad_norm": 0.3606122750644074, + "learning_rate": 5.8924200312359995e-06, + "loss": 0.0976, + "step": 5455 + }, + { + "epoch": 0.46, + "grad_norm": 0.18572721386996988, + "learning_rate": 5.891077305151571e-06, + "loss": 0.0506, + "step": 5456 + }, + { + "epoch": 0.46, + "grad_norm": 0.27288165293169897, + "learning_rate": 5.889734512694512e-06, + "loss": 0.063, + "step": 5457 + }, + { + "epoch": 0.46, + "grad_norm": 0.46128066092372755, + "learning_rate": 5.888391653964845e-06, + "loss": 0.0736, + "step": 5458 + }, + { + "epoch": 0.46, + "grad_norm": 0.4810538279899771, + "learning_rate": 5.887048729062593e-06, + "loss": 0.1153, + "step": 5459 + }, + { + "epoch": 0.46, + "grad_norm": 0.2792174271140684, + "learning_rate": 5.885705738087784e-06, + "loss": 0.1043, + "step": 5460 + }, + { + "epoch": 0.46, + "grad_norm": 0.38596880016960144, + "learning_rate": 5.884362681140451e-06, + "loss": 0.0692, + "step": 5461 + }, + { + "epoch": 0.46, + "grad_norm": 0.36190712670968567, + "learning_rate": 5.8830195583206336e-06, + "loss": 0.0876, + "step": 5462 + }, + { + "epoch": 0.46, + "grad_norm": 0.3695955082362342, + "learning_rate": 5.8816763697283764e-06, + "loss": 0.075, + "step": 5463 + }, + { + "epoch": 0.46, + "grad_norm": 0.2345099539619166, + "learning_rate": 5.880333115463727e-06, + "loss": 0.0604, + "step": 5464 + }, + { + "epoch": 0.46, + "grad_norm": 0.3954974425213387, + "learning_rate": 5.878989795626739e-06, + "loss": 0.0826, + "step": 5465 + }, + { + "epoch": 0.46, + "grad_norm": 0.42798817688051183, + "learning_rate": 5.87764641031747e-06, + "loss": 0.0752, + "step": 5466 + }, + { + "epoch": 0.46, + "grad_norm": 0.43157879208887184, + "learning_rate": 5.876302959635983e-06, + "loss": 0.0783, + "step": 5467 + }, + { + "epoch": 0.46, + "grad_norm": 0.23685597293540014, + "learning_rate": 5.874959443682347e-06, + "loss": 0.0688, + "step": 5468 + }, + { + "epoch": 0.46, + "grad_norm": 0.27552423101129225, + "learning_rate": 5.873615862556636e-06, + "loss": 0.0674, + "step": 5469 + }, + { + "epoch": 0.46, + "grad_norm": 0.3357522739684893, + "learning_rate": 5.872272216358925e-06, + "loss": 0.0631, + "step": 5470 + }, + { + "epoch": 0.46, + "grad_norm": 0.27823073306115076, + "learning_rate": 5.8709285051892995e-06, + "loss": 0.0589, + "step": 5471 + }, + { + "epoch": 0.46, + "grad_norm": 0.29433978868119975, + "learning_rate": 5.869584729147843e-06, + "loss": 0.0481, + "step": 5472 + }, + { + "epoch": 0.46, + "grad_norm": 0.3932134499635916, + "learning_rate": 5.8682408883346535e-06, + "loss": 0.097, + "step": 5473 + }, + { + "epoch": 0.46, + "grad_norm": 0.32592081166041487, + "learning_rate": 5.866896982849821e-06, + "loss": 0.0976, + "step": 5474 + }, + { + "epoch": 0.46, + "grad_norm": 0.4376139618061136, + "learning_rate": 5.865553012793453e-06, + "loss": 0.0981, + "step": 5475 + }, + { + "epoch": 0.46, + "grad_norm": 0.2688216931576172, + "learning_rate": 5.864208978265656e-06, + "loss": 0.0774, + "step": 5476 + }, + { + "epoch": 0.46, + "grad_norm": 0.6363658614798618, + "learning_rate": 5.862864879366537e-06, + "loss": 0.1516, + "step": 5477 + }, + { + "epoch": 0.46, + "grad_norm": 0.2935507933750343, + "learning_rate": 5.861520716196218e-06, + "loss": 0.0774, + "step": 5478 + }, + { + "epoch": 0.46, + "grad_norm": 0.27191425804936137, + "learning_rate": 5.860176488854815e-06, + "loss": 0.063, + "step": 5479 + }, + { + "epoch": 0.46, + "grad_norm": 0.3581244371332679, + "learning_rate": 5.8588321974424565e-06, + "loss": 0.0825, + "step": 5480 + }, + { + "epoch": 0.46, + "grad_norm": 0.456551247047345, + "learning_rate": 5.857487842059274e-06, + "loss": 0.0488, + "step": 5481 + }, + { + "epoch": 0.46, + "grad_norm": 0.4851685836683691, + "learning_rate": 5.856143422805401e-06, + "loss": 0.1165, + "step": 5482 + }, + { + "epoch": 0.46, + "grad_norm": 0.4050814229820206, + "learning_rate": 5.854798939780979e-06, + "loss": 0.1029, + "step": 5483 + }, + { + "epoch": 0.46, + "grad_norm": 0.30698153578493365, + "learning_rate": 5.853454393086152e-06, + "loss": 0.0889, + "step": 5484 + }, + { + "epoch": 0.46, + "grad_norm": 0.25032996767820265, + "learning_rate": 5.85210978282107e-06, + "loss": 0.0831, + "step": 5485 + }, + { + "epoch": 0.46, + "grad_norm": 0.42936133407711347, + "learning_rate": 5.850765109085888e-06, + "loss": 0.1057, + "step": 5486 + }, + { + "epoch": 0.46, + "grad_norm": 0.3808180925542593, + "learning_rate": 5.849420371980764e-06, + "loss": 0.1183, + "step": 5487 + }, + { + "epoch": 0.46, + "grad_norm": 0.3070775081588745, + "learning_rate": 5.848075571605863e-06, + "loss": 0.0858, + "step": 5488 + }, + { + "epoch": 0.46, + "grad_norm": 0.31755065766205454, + "learning_rate": 5.846730708061353e-06, + "loss": 0.0594, + "step": 5489 + }, + { + "epoch": 0.46, + "grad_norm": 0.2731228189169844, + "learning_rate": 5.845385781447407e-06, + "loss": 0.0636, + "step": 5490 + }, + { + "epoch": 0.46, + "grad_norm": 0.5978250146115706, + "learning_rate": 5.844040791864203e-06, + "loss": 0.1253, + "step": 5491 + }, + { + "epoch": 0.46, + "grad_norm": 0.2935693534547744, + "learning_rate": 5.842695739411923e-06, + "loss": 0.0783, + "step": 5492 + }, + { + "epoch": 0.46, + "grad_norm": 0.25102607364980223, + "learning_rate": 5.841350624190757e-06, + "loss": 0.054, + "step": 5493 + }, + { + "epoch": 0.46, + "grad_norm": 0.21224210574241656, + "learning_rate": 5.8400054463008945e-06, + "loss": 0.0554, + "step": 5494 + }, + { + "epoch": 0.46, + "grad_norm": 0.22159124127410115, + "learning_rate": 5.8386602058425336e-06, + "loss": 0.0735, + "step": 5495 + }, + { + "epoch": 0.46, + "grad_norm": 0.2708569485746134, + "learning_rate": 5.837314902915874e-06, + "loss": 0.0427, + "step": 5496 + }, + { + "epoch": 0.46, + "grad_norm": 0.3022036308764928, + "learning_rate": 5.835969537621124e-06, + "loss": 0.1, + "step": 5497 + }, + { + "epoch": 0.46, + "grad_norm": 0.34493755313254176, + "learning_rate": 5.834624110058492e-06, + "loss": 0.097, + "step": 5498 + }, + { + "epoch": 0.46, + "grad_norm": 0.3102325199305184, + "learning_rate": 5.8332786203281936e-06, + "loss": 0.0965, + "step": 5499 + }, + { + "epoch": 0.46, + "grad_norm": 0.3379664814601461, + "learning_rate": 5.831933068530451e-06, + "loss": 0.0751, + "step": 5500 + }, + { + "epoch": 0.46, + "grad_norm": 0.33880655909892915, + "learning_rate": 5.830587454765486e-06, + "loss": 0.0912, + "step": 5501 + }, + { + "epoch": 0.46, + "grad_norm": 0.289699079667515, + "learning_rate": 5.82924177913353e-06, + "loss": 0.0713, + "step": 5502 + }, + { + "epoch": 0.46, + "grad_norm": 0.28724861029267074, + "learning_rate": 5.827896041734814e-06, + "loss": 0.0871, + "step": 5503 + }, + { + "epoch": 0.46, + "grad_norm": 0.4289693560624725, + "learning_rate": 5.826550242669579e-06, + "loss": 0.0723, + "step": 5504 + }, + { + "epoch": 0.46, + "grad_norm": 0.48773536208127166, + "learning_rate": 5.825204382038066e-06, + "loss": 0.1196, + "step": 5505 + }, + { + "epoch": 0.46, + "grad_norm": 0.2262072501009228, + "learning_rate": 5.823858459940524e-06, + "loss": 0.046, + "step": 5506 + }, + { + "epoch": 0.46, + "grad_norm": 0.5949291079928839, + "learning_rate": 5.822512476477205e-06, + "loss": 0.1239, + "step": 5507 + }, + { + "epoch": 0.46, + "grad_norm": 0.36735769816055003, + "learning_rate": 5.821166431748365e-06, + "loss": 0.0783, + "step": 5508 + }, + { + "epoch": 0.46, + "grad_norm": 0.304974037590092, + "learning_rate": 5.8198203258542654e-06, + "loss": 0.1056, + "step": 5509 + }, + { + "epoch": 0.46, + "grad_norm": 0.2879068494404196, + "learning_rate": 5.818474158895172e-06, + "loss": 0.0689, + "step": 5510 + }, + { + "epoch": 0.46, + "grad_norm": 0.3358417956913452, + "learning_rate": 5.8171279309713546e-06, + "loss": 0.1044, + "step": 5511 + }, + { + "epoch": 0.46, + "grad_norm": 0.316950721634035, + "learning_rate": 5.81578164218309e-06, + "loss": 0.0913, + "step": 5512 + }, + { + "epoch": 0.46, + "grad_norm": 0.30487195742167206, + "learning_rate": 5.814435292630658e-06, + "loss": 0.0874, + "step": 5513 + }, + { + "epoch": 0.46, + "grad_norm": 0.301986113268227, + "learning_rate": 5.8130888824143384e-06, + "loss": 0.0808, + "step": 5514 + }, + { + "epoch": 0.46, + "grad_norm": 0.25533843297048364, + "learning_rate": 5.811742411634424e-06, + "loss": 0.0735, + "step": 5515 + }, + { + "epoch": 0.46, + "grad_norm": 0.33011822574590644, + "learning_rate": 5.810395880391203e-06, + "loss": 0.0663, + "step": 5516 + }, + { + "epoch": 0.46, + "grad_norm": 0.4078296517395853, + "learning_rate": 5.809049288784979e-06, + "loss": 0.1183, + "step": 5517 + }, + { + "epoch": 0.46, + "grad_norm": 0.3808957534783275, + "learning_rate": 5.807702636916049e-06, + "loss": 0.0823, + "step": 5518 + }, + { + "epoch": 0.47, + "grad_norm": 0.2430180721782435, + "learning_rate": 5.806355924884722e-06, + "loss": 0.0499, + "step": 5519 + }, + { + "epoch": 0.47, + "grad_norm": 0.262472133568686, + "learning_rate": 5.805009152791309e-06, + "loss": 0.0864, + "step": 5520 + }, + { + "epoch": 0.47, + "grad_norm": 0.2514964345410544, + "learning_rate": 5.803662320736123e-06, + "loss": 0.0415, + "step": 5521 + }, + { + "epoch": 0.47, + "grad_norm": 0.40876828981550006, + "learning_rate": 5.802315428819488e-06, + "loss": 0.1013, + "step": 5522 + }, + { + "epoch": 0.47, + "grad_norm": 0.39789771047624384, + "learning_rate": 5.800968477141724e-06, + "loss": 0.0935, + "step": 5523 + }, + { + "epoch": 0.47, + "grad_norm": 0.33977255097863057, + "learning_rate": 5.799621465803163e-06, + "loss": 0.1011, + "step": 5524 + }, + { + "epoch": 0.47, + "grad_norm": 0.2873917561215367, + "learning_rate": 5.7982743949041375e-06, + "loss": 0.0613, + "step": 5525 + }, + { + "epoch": 0.47, + "grad_norm": 0.23837984934378806, + "learning_rate": 5.796927264544984e-06, + "loss": 0.062, + "step": 5526 + }, + { + "epoch": 0.47, + "grad_norm": 0.19161710253473258, + "learning_rate": 5.795580074826045e-06, + "loss": 0.0238, + "step": 5527 + }, + { + "epoch": 0.47, + "grad_norm": 0.21426544929669433, + "learning_rate": 5.794232825847668e-06, + "loss": 0.075, + "step": 5528 + }, + { + "epoch": 0.47, + "grad_norm": 0.2555229898399971, + "learning_rate": 5.792885517710204e-06, + "loss": 0.0554, + "step": 5529 + }, + { + "epoch": 0.47, + "grad_norm": 0.3436322031031592, + "learning_rate": 5.791538150514007e-06, + "loss": 0.0688, + "step": 5530 + }, + { + "epoch": 0.47, + "grad_norm": 0.21500653929996094, + "learning_rate": 5.790190724359437e-06, + "loss": 0.0573, + "step": 5531 + }, + { + "epoch": 0.47, + "grad_norm": 0.3618648236786172, + "learning_rate": 5.788843239346859e-06, + "loss": 0.0645, + "step": 5532 + }, + { + "epoch": 0.47, + "grad_norm": 0.23069586447728047, + "learning_rate": 5.787495695576642e-06, + "loss": 0.0435, + "step": 5533 + }, + { + "epoch": 0.47, + "grad_norm": 0.2910665008102202, + "learning_rate": 5.786148093149158e-06, + "loss": 0.0678, + "step": 5534 + }, + { + "epoch": 0.47, + "grad_norm": 0.39998732650592017, + "learning_rate": 5.7848004321647834e-06, + "loss": 0.0874, + "step": 5535 + }, + { + "epoch": 0.47, + "grad_norm": 0.4703277204959219, + "learning_rate": 5.783452712723902e-06, + "loss": 0.0675, + "step": 5536 + }, + { + "epoch": 0.47, + "grad_norm": 0.5372705539226282, + "learning_rate": 5.782104934926897e-06, + "loss": 0.1022, + "step": 5537 + }, + { + "epoch": 0.47, + "grad_norm": 0.47481471149161253, + "learning_rate": 5.78075709887416e-06, + "loss": 0.1082, + "step": 5538 + }, + { + "epoch": 0.47, + "grad_norm": 0.3450454044999949, + "learning_rate": 5.779409204666089e-06, + "loss": 0.077, + "step": 5539 + }, + { + "epoch": 0.47, + "grad_norm": 0.35855623659021096, + "learning_rate": 5.778061252403077e-06, + "loss": 0.0936, + "step": 5540 + }, + { + "epoch": 0.47, + "grad_norm": 0.2721416815134429, + "learning_rate": 5.776713242185532e-06, + "loss": 0.0699, + "step": 5541 + }, + { + "epoch": 0.47, + "grad_norm": 0.2392716736204212, + "learning_rate": 5.77536517411386e-06, + "loss": 0.0607, + "step": 5542 + }, + { + "epoch": 0.47, + "grad_norm": 0.5508789040434487, + "learning_rate": 5.774017048288472e-06, + "loss": 0.1035, + "step": 5543 + }, + { + "epoch": 0.47, + "grad_norm": 0.2899073107527558, + "learning_rate": 5.772668864809786e-06, + "loss": 0.0909, + "step": 5544 + }, + { + "epoch": 0.47, + "grad_norm": 0.3312688244821426, + "learning_rate": 5.771320623778222e-06, + "loss": 0.1091, + "step": 5545 + }, + { + "epoch": 0.47, + "grad_norm": 0.44022967666575574, + "learning_rate": 5.769972325294204e-06, + "loss": 0.0891, + "step": 5546 + }, + { + "epoch": 0.47, + "grad_norm": 0.17073865233776703, + "learning_rate": 5.768623969458164e-06, + "loss": 0.0562, + "step": 5547 + }, + { + "epoch": 0.47, + "grad_norm": 0.25083429444298777, + "learning_rate": 5.767275556370531e-06, + "loss": 0.1015, + "step": 5548 + }, + { + "epoch": 0.47, + "grad_norm": 0.18966072650472876, + "learning_rate": 5.765927086131745e-06, + "loss": 0.0633, + "step": 5549 + }, + { + "epoch": 0.47, + "grad_norm": 0.29976787787873366, + "learning_rate": 5.764578558842247e-06, + "loss": 0.0702, + "step": 5550 + }, + { + "epoch": 0.47, + "grad_norm": 0.6077257866343774, + "learning_rate": 5.763229974602487e-06, + "loss": 0.1474, + "step": 5551 + }, + { + "epoch": 0.47, + "grad_norm": 0.47220680770563483, + "learning_rate": 5.7618813335129096e-06, + "loss": 0.1059, + "step": 5552 + }, + { + "epoch": 0.47, + "grad_norm": 0.2428644419018441, + "learning_rate": 5.760532635673973e-06, + "loss": 0.0685, + "step": 5553 + }, + { + "epoch": 0.47, + "grad_norm": 0.3041494908645644, + "learning_rate": 5.759183881186136e-06, + "loss": 0.0855, + "step": 5554 + }, + { + "epoch": 0.47, + "grad_norm": 0.30260185328118305, + "learning_rate": 5.7578350701498585e-06, + "loss": 0.0972, + "step": 5555 + }, + { + "epoch": 0.47, + "grad_norm": 0.3272026213165436, + "learning_rate": 5.756486202665613e-06, + "loss": 0.0839, + "step": 5556 + }, + { + "epoch": 0.47, + "grad_norm": 0.23151635629596667, + "learning_rate": 5.755137278833867e-06, + "loss": 0.0641, + "step": 5557 + }, + { + "epoch": 0.47, + "grad_norm": 0.281132565847536, + "learning_rate": 5.753788298755097e-06, + "loss": 0.0776, + "step": 5558 + }, + { + "epoch": 0.47, + "grad_norm": 0.329153405046996, + "learning_rate": 5.752439262529784e-06, + "loss": 0.0878, + "step": 5559 + }, + { + "epoch": 0.47, + "grad_norm": 0.21577696935457444, + "learning_rate": 5.751090170258411e-06, + "loss": 0.0473, + "step": 5560 + }, + { + "epoch": 0.47, + "grad_norm": 0.30112588880134533, + "learning_rate": 5.749741022041468e-06, + "loss": 0.0555, + "step": 5561 + }, + { + "epoch": 0.47, + "grad_norm": 0.23695833935562333, + "learning_rate": 5.7483918179794455e-06, + "loss": 0.0535, + "step": 5562 + }, + { + "epoch": 0.47, + "grad_norm": 0.3717634800904838, + "learning_rate": 5.74704255817284e-06, + "loss": 0.0817, + "step": 5563 + }, + { + "epoch": 0.47, + "grad_norm": 0.37744465779504965, + "learning_rate": 5.745693242722154e-06, + "loss": 0.0724, + "step": 5564 + }, + { + "epoch": 0.47, + "grad_norm": 0.22368476125756517, + "learning_rate": 5.744343871727891e-06, + "loss": 0.0633, + "step": 5565 + }, + { + "epoch": 0.47, + "grad_norm": 0.5010375452613077, + "learning_rate": 5.742994445290559e-06, + "loss": 0.1203, + "step": 5566 + }, + { + "epoch": 0.47, + "grad_norm": 0.2204684839660284, + "learning_rate": 5.741644963510673e-06, + "loss": 0.0738, + "step": 5567 + }, + { + "epoch": 0.47, + "grad_norm": 0.3736840192163062, + "learning_rate": 5.740295426488751e-06, + "loss": 0.103, + "step": 5568 + }, + { + "epoch": 0.47, + "grad_norm": 0.2882556582912704, + "learning_rate": 5.738945834325311e-06, + "loss": 0.0826, + "step": 5569 + }, + { + "epoch": 0.47, + "grad_norm": 0.34886051486822206, + "learning_rate": 5.737596187120883e-06, + "loss": 0.0871, + "step": 5570 + }, + { + "epoch": 0.47, + "grad_norm": 0.3522572809195416, + "learning_rate": 5.736246484975993e-06, + "loss": 0.0833, + "step": 5571 + }, + { + "epoch": 0.47, + "grad_norm": 0.3353737834337096, + "learning_rate": 5.734896727991175e-06, + "loss": 0.0953, + "step": 5572 + }, + { + "epoch": 0.47, + "grad_norm": 0.5509551293991473, + "learning_rate": 5.7335469162669684e-06, + "loss": 0.0995, + "step": 5573 + }, + { + "epoch": 0.47, + "grad_norm": 0.2464567524491927, + "learning_rate": 5.732197049903915e-06, + "loss": 0.0645, + "step": 5574 + }, + { + "epoch": 0.47, + "grad_norm": 0.2483681554925709, + "learning_rate": 5.730847129002559e-06, + "loss": 0.0619, + "step": 5575 + }, + { + "epoch": 0.47, + "grad_norm": 0.2610507995413378, + "learning_rate": 5.729497153663451e-06, + "loss": 0.0642, + "step": 5576 + }, + { + "epoch": 0.47, + "grad_norm": 0.3006625763711666, + "learning_rate": 5.728147123987147e-06, + "loss": 0.0703, + "step": 5577 + }, + { + "epoch": 0.47, + "grad_norm": 0.3182189379090218, + "learning_rate": 5.726797040074202e-06, + "loss": 0.0992, + "step": 5578 + }, + { + "epoch": 0.47, + "grad_norm": 0.3266703415070439, + "learning_rate": 5.72544690202518e-06, + "loss": 0.0549, + "step": 5579 + }, + { + "epoch": 0.47, + "grad_norm": 0.2233672082402826, + "learning_rate": 5.724096709940646e-06, + "loss": 0.0681, + "step": 5580 + }, + { + "epoch": 0.47, + "grad_norm": 0.34636489326505143, + "learning_rate": 5.722746463921173e-06, + "loss": 0.0523, + "step": 5581 + }, + { + "epoch": 0.47, + "grad_norm": 0.4627520039721311, + "learning_rate": 5.721396164067331e-06, + "loss": 0.1276, + "step": 5582 + }, + { + "epoch": 0.47, + "grad_norm": 0.2597117947591045, + "learning_rate": 5.720045810479702e-06, + "loss": 0.0683, + "step": 5583 + }, + { + "epoch": 0.47, + "grad_norm": 0.28018862022772884, + "learning_rate": 5.718695403258865e-06, + "loss": 0.0694, + "step": 5584 + }, + { + "epoch": 0.47, + "grad_norm": 0.1873632986795777, + "learning_rate": 5.717344942505408e-06, + "loss": 0.0582, + "step": 5585 + }, + { + "epoch": 0.47, + "grad_norm": 0.47741410793170386, + "learning_rate": 5.71599442831992e-06, + "loss": 0.1212, + "step": 5586 + }, + { + "epoch": 0.47, + "grad_norm": 0.18485283410659728, + "learning_rate": 5.714643860802997e-06, + "loss": 0.0579, + "step": 5587 + }, + { + "epoch": 0.47, + "grad_norm": 0.31753321833564613, + "learning_rate": 5.713293240055235e-06, + "loss": 0.1015, + "step": 5588 + }, + { + "epoch": 0.47, + "grad_norm": 0.4289710166430089, + "learning_rate": 5.711942566177239e-06, + "loss": 0.1073, + "step": 5589 + }, + { + "epoch": 0.47, + "grad_norm": 0.20871209154545425, + "learning_rate": 5.710591839269613e-06, + "loss": 0.0683, + "step": 5590 + }, + { + "epoch": 0.47, + "grad_norm": 0.32200975472776333, + "learning_rate": 5.709241059432966e-06, + "loss": 0.1044, + "step": 5591 + }, + { + "epoch": 0.47, + "grad_norm": 0.3069164887541522, + "learning_rate": 5.707890226767915e-06, + "loss": 0.1243, + "step": 5592 + }, + { + "epoch": 0.47, + "grad_norm": 0.24478970836371816, + "learning_rate": 5.7065393413750745e-06, + "loss": 0.0785, + "step": 5593 + }, + { + "epoch": 0.47, + "grad_norm": 0.4204799714300244, + "learning_rate": 5.705188403355069e-06, + "loss": 0.0936, + "step": 5594 + }, + { + "epoch": 0.47, + "grad_norm": 0.39321693575458266, + "learning_rate": 5.703837412808523e-06, + "loss": 0.0585, + "step": 5595 + }, + { + "epoch": 0.47, + "grad_norm": 0.3816088083027971, + "learning_rate": 5.702486369836066e-06, + "loss": 0.1096, + "step": 5596 + }, + { + "epoch": 0.47, + "grad_norm": 0.3420344999923336, + "learning_rate": 5.701135274538332e-06, + "loss": 0.078, + "step": 5597 + }, + { + "epoch": 0.47, + "grad_norm": 0.3588946040476363, + "learning_rate": 5.699784127015958e-06, + "loss": 0.0934, + "step": 5598 + }, + { + "epoch": 0.47, + "grad_norm": 0.390002747615039, + "learning_rate": 5.698432927369586e-06, + "loss": 0.0968, + "step": 5599 + }, + { + "epoch": 0.47, + "grad_norm": 0.2861890574058236, + "learning_rate": 5.697081675699861e-06, + "loss": 0.0673, + "step": 5600 + }, + { + "epoch": 0.47, + "grad_norm": 0.2606084938917433, + "learning_rate": 5.695730372107431e-06, + "loss": 0.0636, + "step": 5601 + }, + { + "epoch": 0.47, + "grad_norm": 0.2116570568524385, + "learning_rate": 5.694379016692953e-06, + "loss": 0.0641, + "step": 5602 + }, + { + "epoch": 0.47, + "grad_norm": 0.32236574141529023, + "learning_rate": 5.693027609557077e-06, + "loss": 0.0676, + "step": 5603 + }, + { + "epoch": 0.47, + "grad_norm": 0.23067120768289576, + "learning_rate": 5.69167615080047e-06, + "loss": 0.0574, + "step": 5604 + }, + { + "epoch": 0.47, + "grad_norm": 0.3479213003383328, + "learning_rate": 5.6903246405237925e-06, + "loss": 0.0477, + "step": 5605 + }, + { + "epoch": 0.47, + "grad_norm": 0.2760653672522455, + "learning_rate": 5.688973078827714e-06, + "loss": 0.0679, + "step": 5606 + }, + { + "epoch": 0.47, + "grad_norm": 0.4573203780161493, + "learning_rate": 5.687621465812909e-06, + "loss": 0.1129, + "step": 5607 + }, + { + "epoch": 0.47, + "grad_norm": 0.5464482398141809, + "learning_rate": 5.68626980158005e-06, + "loss": 0.1591, + "step": 5608 + }, + { + "epoch": 0.47, + "grad_norm": 0.27751221135060417, + "learning_rate": 5.6849180862298205e-06, + "loss": 0.0678, + "step": 5609 + }, + { + "epoch": 0.47, + "grad_norm": 0.6131120547521957, + "learning_rate": 5.683566319862899e-06, + "loss": 0.1489, + "step": 5610 + }, + { + "epoch": 0.47, + "grad_norm": 0.3840083354820757, + "learning_rate": 5.682214502579978e-06, + "loss": 0.0922, + "step": 5611 + }, + { + "epoch": 0.47, + "grad_norm": 0.504500977166992, + "learning_rate": 5.680862634481746e-06, + "loss": 0.1234, + "step": 5612 + }, + { + "epoch": 0.47, + "grad_norm": 0.4544333815305592, + "learning_rate": 5.679510715668898e-06, + "loss": 0.0949, + "step": 5613 + }, + { + "epoch": 0.47, + "grad_norm": 0.5624046200684606, + "learning_rate": 5.678158746242134e-06, + "loss": 0.0745, + "step": 5614 + }, + { + "epoch": 0.47, + "grad_norm": 0.3124316296871797, + "learning_rate": 5.6768067263021575e-06, + "loss": 0.0962, + "step": 5615 + }, + { + "epoch": 0.47, + "grad_norm": 0.4099163638457547, + "learning_rate": 5.675454655949672e-06, + "loss": 0.074, + "step": 5616 + }, + { + "epoch": 0.47, + "grad_norm": 0.34444542518798293, + "learning_rate": 5.674102535285388e-06, + "loss": 0.0832, + "step": 5617 + }, + { + "epoch": 0.47, + "grad_norm": 0.3876046215038411, + "learning_rate": 5.672750364410022e-06, + "loss": 0.1015, + "step": 5618 + }, + { + "epoch": 0.47, + "grad_norm": 0.29850263005369165, + "learning_rate": 5.671398143424288e-06, + "loss": 0.1032, + "step": 5619 + }, + { + "epoch": 0.47, + "grad_norm": 0.37656503515646045, + "learning_rate": 5.67004587242891e-06, + "loss": 0.1134, + "step": 5620 + }, + { + "epoch": 0.47, + "grad_norm": 0.7099567169699808, + "learning_rate": 5.668693551524614e-06, + "loss": 0.1365, + "step": 5621 + }, + { + "epoch": 0.47, + "grad_norm": 0.3362720418653712, + "learning_rate": 5.667341180812122e-06, + "loss": 0.0518, + "step": 5622 + }, + { + "epoch": 0.47, + "grad_norm": 0.7227858759375333, + "learning_rate": 5.665988760392175e-06, + "loss": 0.1635, + "step": 5623 + }, + { + "epoch": 0.47, + "grad_norm": 0.34420597132915093, + "learning_rate": 5.664636290365505e-06, + "loss": 0.1058, + "step": 5624 + }, + { + "epoch": 0.47, + "grad_norm": 0.26579031508845113, + "learning_rate": 5.663283770832852e-06, + "loss": 0.0574, + "step": 5625 + }, + { + "epoch": 0.47, + "grad_norm": 0.2803913870350494, + "learning_rate": 5.6619312018949606e-06, + "loss": 0.0628, + "step": 5626 + }, + { + "epoch": 0.47, + "grad_norm": 0.21091289948698302, + "learning_rate": 5.660578583652576e-06, + "loss": 0.042, + "step": 5627 + }, + { + "epoch": 0.47, + "grad_norm": 0.3534792353523456, + "learning_rate": 5.659225916206451e-06, + "loss": 0.1189, + "step": 5628 + }, + { + "epoch": 0.47, + "grad_norm": 0.4752144203220441, + "learning_rate": 5.657873199657339e-06, + "loss": 0.0669, + "step": 5629 + }, + { + "epoch": 0.47, + "grad_norm": 0.2414156303105738, + "learning_rate": 5.656520434105999e-06, + "loss": 0.062, + "step": 5630 + }, + { + "epoch": 0.47, + "grad_norm": 0.44630109384612976, + "learning_rate": 5.655167619653192e-06, + "loss": 0.108, + "step": 5631 + }, + { + "epoch": 0.47, + "grad_norm": 0.3440341420918822, + "learning_rate": 5.653814756399685e-06, + "loss": 0.0891, + "step": 5632 + }, + { + "epoch": 0.47, + "grad_norm": 0.30230005052701264, + "learning_rate": 5.652461844446246e-06, + "loss": 0.1032, + "step": 5633 + }, + { + "epoch": 0.47, + "grad_norm": 0.26432678382436975, + "learning_rate": 5.651108883893648e-06, + "loss": 0.0679, + "step": 5634 + }, + { + "epoch": 0.47, + "grad_norm": 0.3292432517057025, + "learning_rate": 5.649755874842666e-06, + "loss": 0.0681, + "step": 5635 + }, + { + "epoch": 0.47, + "grad_norm": 0.3226872765953513, + "learning_rate": 5.648402817394083e-06, + "loss": 0.0992, + "step": 5636 + }, + { + "epoch": 0.47, + "grad_norm": 0.6761321770399754, + "learning_rate": 5.64704971164868e-06, + "loss": 0.1163, + "step": 5637 + }, + { + "epoch": 0.48, + "grad_norm": 0.36251828083933924, + "learning_rate": 5.6456965577072455e-06, + "loss": 0.1079, + "step": 5638 + }, + { + "epoch": 0.48, + "grad_norm": 0.36111582223424094, + "learning_rate": 5.64434335567057e-06, + "loss": 0.0806, + "step": 5639 + }, + { + "epoch": 0.48, + "grad_norm": 0.3461952598619096, + "learning_rate": 5.642990105639447e-06, + "loss": 0.073, + "step": 5640 + }, + { + "epoch": 0.48, + "grad_norm": 0.2749002554082094, + "learning_rate": 5.641636807714677e-06, + "loss": 0.0819, + "step": 5641 + }, + { + "epoch": 0.48, + "grad_norm": 0.29281198780191914, + "learning_rate": 5.640283461997058e-06, + "loss": 0.0883, + "step": 5642 + }, + { + "epoch": 0.48, + "grad_norm": 0.3111774504830054, + "learning_rate": 5.6389300685873984e-06, + "loss": 0.1034, + "step": 5643 + }, + { + "epoch": 0.48, + "grad_norm": 0.2151425102716583, + "learning_rate": 5.637576627586503e-06, + "loss": 0.067, + "step": 5644 + }, + { + "epoch": 0.48, + "grad_norm": 0.2545000311954673, + "learning_rate": 5.636223139095188e-06, + "loss": 0.0803, + "step": 5645 + }, + { + "epoch": 0.48, + "grad_norm": 0.4608188534182907, + "learning_rate": 5.634869603214267e-06, + "loss": 0.0718, + "step": 5646 + }, + { + "epoch": 0.48, + "grad_norm": 0.37046394131674043, + "learning_rate": 5.6335160200445595e-06, + "loss": 0.1085, + "step": 5647 + }, + { + "epoch": 0.48, + "grad_norm": 0.3740440366772873, + "learning_rate": 5.6321623896868885e-06, + "loss": 0.0583, + "step": 5648 + }, + { + "epoch": 0.48, + "grad_norm": 0.27509748450576443, + "learning_rate": 5.630808712242081e-06, + "loss": 0.0765, + "step": 5649 + }, + { + "epoch": 0.48, + "grad_norm": 0.482930793489925, + "learning_rate": 5.629454987810964e-06, + "loss": 0.1509, + "step": 5650 + }, + { + "epoch": 0.48, + "grad_norm": 0.26073556383640606, + "learning_rate": 5.628101216494374e-06, + "loss": 0.0925, + "step": 5651 + }, + { + "epoch": 0.48, + "grad_norm": 0.22663094642167075, + "learning_rate": 5.626747398393145e-06, + "loss": 0.065, + "step": 5652 + }, + { + "epoch": 0.48, + "grad_norm": 0.5500517239429134, + "learning_rate": 5.62539353360812e-06, + "loss": 0.1789, + "step": 5653 + }, + { + "epoch": 0.48, + "grad_norm": 0.21734146046063216, + "learning_rate": 5.62403962224014e-06, + "loss": 0.0731, + "step": 5654 + }, + { + "epoch": 0.48, + "grad_norm": 0.4658442455755646, + "learning_rate": 5.622685664390056e-06, + "loss": 0.1222, + "step": 5655 + }, + { + "epoch": 0.48, + "grad_norm": 0.4624750731992381, + "learning_rate": 5.6213316601587144e-06, + "loss": 0.0882, + "step": 5656 + }, + { + "epoch": 0.48, + "grad_norm": 0.2561310086827649, + "learning_rate": 5.619977609646971e-06, + "loss": 0.0722, + "step": 5657 + }, + { + "epoch": 0.48, + "grad_norm": 0.44924519934666585, + "learning_rate": 5.618623512955686e-06, + "loss": 0.1068, + "step": 5658 + }, + { + "epoch": 0.48, + "grad_norm": 0.23712888561973036, + "learning_rate": 5.617269370185715e-06, + "loss": 0.066, + "step": 5659 + }, + { + "epoch": 0.48, + "grad_norm": 0.47920327494316406, + "learning_rate": 5.615915181437928e-06, + "loss": 0.1066, + "step": 5660 + }, + { + "epoch": 0.48, + "grad_norm": 0.33331104594023203, + "learning_rate": 5.614560946813188e-06, + "loss": 0.0517, + "step": 5661 + }, + { + "epoch": 0.48, + "grad_norm": 0.44260127535128885, + "learning_rate": 5.6132066664123705e-06, + "loss": 0.1171, + "step": 5662 + }, + { + "epoch": 0.48, + "grad_norm": 0.34322407566172985, + "learning_rate": 5.611852340336348e-06, + "loss": 0.0986, + "step": 5663 + }, + { + "epoch": 0.48, + "grad_norm": 0.36644487013452426, + "learning_rate": 5.610497968685997e-06, + "loss": 0.0796, + "step": 5664 + }, + { + "epoch": 0.48, + "grad_norm": 0.25972183438657026, + "learning_rate": 5.609143551562203e-06, + "loss": 0.0649, + "step": 5665 + }, + { + "epoch": 0.48, + "grad_norm": 0.39474931569518107, + "learning_rate": 5.607789089065847e-06, + "loss": 0.1068, + "step": 5666 + }, + { + "epoch": 0.48, + "grad_norm": 0.31139319987199193, + "learning_rate": 5.606434581297821e-06, + "loss": 0.0961, + "step": 5667 + }, + { + "epoch": 0.48, + "grad_norm": 0.37227392553888705, + "learning_rate": 5.605080028359013e-06, + "loss": 0.0799, + "step": 5668 + }, + { + "epoch": 0.48, + "grad_norm": 0.37354813746092325, + "learning_rate": 5.6037254303503205e-06, + "loss": 0.094, + "step": 5669 + }, + { + "epoch": 0.48, + "grad_norm": 0.4497863241086802, + "learning_rate": 5.602370787372642e-06, + "loss": 0.1098, + "step": 5670 + }, + { + "epoch": 0.48, + "grad_norm": 0.413406392049191, + "learning_rate": 5.601016099526876e-06, + "loss": 0.1301, + "step": 5671 + }, + { + "epoch": 0.48, + "grad_norm": 0.22696516091607688, + "learning_rate": 5.599661366913931e-06, + "loss": 0.0702, + "step": 5672 + }, + { + "epoch": 0.48, + "grad_norm": 0.3063495252430633, + "learning_rate": 5.5983065896347135e-06, + "loss": 0.084, + "step": 5673 + }, + { + "epoch": 0.48, + "grad_norm": 0.39186784208687114, + "learning_rate": 5.596951767790136e-06, + "loss": 0.0636, + "step": 5674 + }, + { + "epoch": 0.48, + "grad_norm": 0.22955859999304504, + "learning_rate": 5.595596901481114e-06, + "loss": 0.0969, + "step": 5675 + }, + { + "epoch": 0.48, + "grad_norm": 0.3600930975295021, + "learning_rate": 5.594241990808565e-06, + "loss": 0.0837, + "step": 5676 + }, + { + "epoch": 0.48, + "grad_norm": 0.23089071127195185, + "learning_rate": 5.59288703587341e-06, + "loss": 0.0568, + "step": 5677 + }, + { + "epoch": 0.48, + "grad_norm": 0.423516362733909, + "learning_rate": 5.5915320367765745e-06, + "loss": 0.123, + "step": 5678 + }, + { + "epoch": 0.48, + "grad_norm": 0.3340064185986187, + "learning_rate": 5.590176993618989e-06, + "loss": 0.1019, + "step": 5679 + }, + { + "epoch": 0.48, + "grad_norm": 0.1413243354166399, + "learning_rate": 5.58882190650158e-06, + "loss": 0.0393, + "step": 5680 + }, + { + "epoch": 0.48, + "grad_norm": 0.2028496639971009, + "learning_rate": 5.587466775525287e-06, + "loss": 0.0654, + "step": 5681 + }, + { + "epoch": 0.48, + "grad_norm": 0.18628100996373606, + "learning_rate": 5.586111600791047e-06, + "loss": 0.0629, + "step": 5682 + }, + { + "epoch": 0.48, + "grad_norm": 0.42744836615686443, + "learning_rate": 5.584756382399801e-06, + "loss": 0.0956, + "step": 5683 + }, + { + "epoch": 0.48, + "grad_norm": 0.4940653348606991, + "learning_rate": 5.5834011204524905e-06, + "loss": 0.0959, + "step": 5684 + }, + { + "epoch": 0.48, + "grad_norm": 0.30569883879524296, + "learning_rate": 5.582045815050069e-06, + "loss": 0.0884, + "step": 5685 + }, + { + "epoch": 0.48, + "grad_norm": 0.40931278459331844, + "learning_rate": 5.580690466293482e-06, + "loss": 0.0999, + "step": 5686 + }, + { + "epoch": 0.48, + "grad_norm": 0.6138589350260053, + "learning_rate": 5.579335074283688e-06, + "loss": 0.1126, + "step": 5687 + }, + { + "epoch": 0.48, + "grad_norm": 0.7521137790545585, + "learning_rate": 5.5779796391216426e-06, + "loss": 0.1017, + "step": 5688 + }, + { + "epoch": 0.48, + "grad_norm": 0.47098228306483425, + "learning_rate": 5.576624160908306e-06, + "loss": 0.107, + "step": 5689 + }, + { + "epoch": 0.48, + "grad_norm": 0.427566993867442, + "learning_rate": 5.575268639744643e-06, + "loss": 0.0871, + "step": 5690 + }, + { + "epoch": 0.48, + "grad_norm": 0.29042767680987536, + "learning_rate": 5.573913075731621e-06, + "loss": 0.0627, + "step": 5691 + }, + { + "epoch": 0.48, + "grad_norm": 0.4916901087083386, + "learning_rate": 5.572557468970209e-06, + "loss": 0.1025, + "step": 5692 + }, + { + "epoch": 0.48, + "grad_norm": 0.32846475135583586, + "learning_rate": 5.571201819561381e-06, + "loss": 0.0726, + "step": 5693 + }, + { + "epoch": 0.48, + "grad_norm": 0.3223533666151447, + "learning_rate": 5.569846127606115e-06, + "loss": 0.0764, + "step": 5694 + }, + { + "epoch": 0.48, + "grad_norm": 0.3162263028394242, + "learning_rate": 5.568490393205389e-06, + "loss": 0.0625, + "step": 5695 + }, + { + "epoch": 0.48, + "grad_norm": 0.3597534976214187, + "learning_rate": 5.567134616460188e-06, + "loss": 0.0938, + "step": 5696 + }, + { + "epoch": 0.48, + "grad_norm": 0.3491695240286068, + "learning_rate": 5.565778797471496e-06, + "loss": 0.0806, + "step": 5697 + }, + { + "epoch": 0.48, + "grad_norm": 0.5666901458981042, + "learning_rate": 5.564422936340304e-06, + "loss": 0.0943, + "step": 5698 + }, + { + "epoch": 0.48, + "grad_norm": 0.30673995229933726, + "learning_rate": 5.563067033167603e-06, + "loss": 0.064, + "step": 5699 + }, + { + "epoch": 0.48, + "grad_norm": 0.32408433337671116, + "learning_rate": 5.561711088054389e-06, + "loss": 0.0666, + "step": 5700 + }, + { + "epoch": 0.48, + "grad_norm": 0.2766375171785388, + "learning_rate": 5.560355101101662e-06, + "loss": 0.1018, + "step": 5701 + }, + { + "epoch": 0.48, + "grad_norm": 0.551243941580642, + "learning_rate": 5.558999072410423e-06, + "loss": 0.0745, + "step": 5702 + }, + { + "epoch": 0.48, + "grad_norm": 0.3241903474123637, + "learning_rate": 5.557643002081674e-06, + "loss": 0.0637, + "step": 5703 + }, + { + "epoch": 0.48, + "grad_norm": 0.3284709302287849, + "learning_rate": 5.556286890216429e-06, + "loss": 0.0698, + "step": 5704 + }, + { + "epoch": 0.48, + "grad_norm": 0.3348684344022521, + "learning_rate": 5.554930736915694e-06, + "loss": 0.0985, + "step": 5705 + }, + { + "epoch": 0.48, + "grad_norm": 0.2583363734599848, + "learning_rate": 5.5535745422804855e-06, + "loss": 0.0838, + "step": 5706 + }, + { + "epoch": 0.48, + "grad_norm": 0.5085799631078692, + "learning_rate": 5.55221830641182e-06, + "loss": 0.1432, + "step": 5707 + }, + { + "epoch": 0.48, + "grad_norm": 0.42168498473145294, + "learning_rate": 5.5508620294107185e-06, + "loss": 0.1154, + "step": 5708 + }, + { + "epoch": 0.48, + "grad_norm": 0.4416564343069031, + "learning_rate": 5.549505711378204e-06, + "loss": 0.11, + "step": 5709 + }, + { + "epoch": 0.48, + "grad_norm": 0.32444438881896137, + "learning_rate": 5.548149352415302e-06, + "loss": 0.059, + "step": 5710 + }, + { + "epoch": 0.48, + "grad_norm": 0.22523026473208216, + "learning_rate": 5.546792952623044e-06, + "loss": 0.0638, + "step": 5711 + }, + { + "epoch": 0.48, + "grad_norm": 0.3886911800859997, + "learning_rate": 5.545436512102461e-06, + "loss": 0.0818, + "step": 5712 + }, + { + "epoch": 0.48, + "grad_norm": 0.31775115692365624, + "learning_rate": 5.544080030954591e-06, + "loss": 0.0964, + "step": 5713 + }, + { + "epoch": 0.48, + "grad_norm": 0.21575205617215942, + "learning_rate": 5.542723509280469e-06, + "loss": 0.0502, + "step": 5714 + }, + { + "epoch": 0.48, + "grad_norm": 0.27910642023359394, + "learning_rate": 5.5413669471811385e-06, + "loss": 0.0569, + "step": 5715 + }, + { + "epoch": 0.48, + "grad_norm": 0.45358836130889546, + "learning_rate": 5.5400103447576444e-06, + "loss": 0.1172, + "step": 5716 + }, + { + "epoch": 0.48, + "grad_norm": 0.23814668839737924, + "learning_rate": 5.538653702111035e-06, + "loss": 0.0852, + "step": 5717 + }, + { + "epoch": 0.48, + "grad_norm": 0.22656444488042088, + "learning_rate": 5.537297019342358e-06, + "loss": 0.0537, + "step": 5718 + }, + { + "epoch": 0.48, + "grad_norm": 0.296385686962184, + "learning_rate": 5.535940296552671e-06, + "loss": 0.0927, + "step": 5719 + }, + { + "epoch": 0.48, + "grad_norm": 0.2700843957690607, + "learning_rate": 5.534583533843028e-06, + "loss": 0.0609, + "step": 5720 + }, + { + "epoch": 0.48, + "grad_norm": 0.24369352964737995, + "learning_rate": 5.533226731314489e-06, + "loss": 0.0722, + "step": 5721 + }, + { + "epoch": 0.48, + "grad_norm": 0.7172085429352948, + "learning_rate": 5.5318698890681166e-06, + "loss": 0.1509, + "step": 5722 + }, + { + "epoch": 0.48, + "grad_norm": 0.37429696243513727, + "learning_rate": 5.530513007204977e-06, + "loss": 0.1158, + "step": 5723 + }, + { + "epoch": 0.48, + "grad_norm": 0.20434959965219343, + "learning_rate": 5.529156085826138e-06, + "loss": 0.0518, + "step": 5724 + }, + { + "epoch": 0.48, + "grad_norm": 0.23170871561108292, + "learning_rate": 5.527799125032671e-06, + "loss": 0.0442, + "step": 5725 + }, + { + "epoch": 0.48, + "grad_norm": 0.4605852887510632, + "learning_rate": 5.5264421249256495e-06, + "loss": 0.1091, + "step": 5726 + }, + { + "epoch": 0.48, + "grad_norm": 0.3543527114197729, + "learning_rate": 5.525085085606152e-06, + "loss": 0.0841, + "step": 5727 + }, + { + "epoch": 0.48, + "grad_norm": 0.4411884704545098, + "learning_rate": 5.523728007175259e-06, + "loss": 0.1037, + "step": 5728 + }, + { + "epoch": 0.48, + "grad_norm": 0.27820584968170453, + "learning_rate": 5.5223708897340544e-06, + "loss": 0.0615, + "step": 5729 + }, + { + "epoch": 0.48, + "grad_norm": 0.45218606064918976, + "learning_rate": 5.521013733383622e-06, + "loss": 0.1336, + "step": 5730 + }, + { + "epoch": 0.48, + "grad_norm": 0.30549844456248904, + "learning_rate": 5.519656538225051e-06, + "loss": 0.0695, + "step": 5731 + }, + { + "epoch": 0.48, + "grad_norm": 0.4291096704631432, + "learning_rate": 5.5182993043594335e-06, + "loss": 0.0997, + "step": 5732 + }, + { + "epoch": 0.48, + "grad_norm": 0.2571889427529331, + "learning_rate": 5.516942031887866e-06, + "loss": 0.0354, + "step": 5733 + }, + { + "epoch": 0.48, + "grad_norm": 0.2757948996462117, + "learning_rate": 5.515584720911443e-06, + "loss": 0.0673, + "step": 5734 + }, + { + "epoch": 0.48, + "grad_norm": 0.5385883576334488, + "learning_rate": 5.5142273715312675e-06, + "loss": 0.1248, + "step": 5735 + }, + { + "epoch": 0.48, + "grad_norm": 0.3577852273370118, + "learning_rate": 5.512869983848441e-06, + "loss": 0.0629, + "step": 5736 + }, + { + "epoch": 0.48, + "grad_norm": 0.3682384777193351, + "learning_rate": 5.511512557964072e-06, + "loss": 0.0903, + "step": 5737 + }, + { + "epoch": 0.48, + "grad_norm": 0.27309780141062473, + "learning_rate": 5.510155093979266e-06, + "loss": 0.0631, + "step": 5738 + }, + { + "epoch": 0.48, + "grad_norm": 0.19783353713567856, + "learning_rate": 5.5087975919951374e-06, + "loss": 0.0391, + "step": 5739 + }, + { + "epoch": 0.48, + "grad_norm": 0.3012027119446076, + "learning_rate": 5.507440052112801e-06, + "loss": 0.0789, + "step": 5740 + }, + { + "epoch": 0.48, + "grad_norm": 0.19164559440211185, + "learning_rate": 5.506082474433372e-06, + "loss": 0.0532, + "step": 5741 + }, + { + "epoch": 0.48, + "grad_norm": 0.5413036723531496, + "learning_rate": 5.504724859057974e-06, + "loss": 0.1287, + "step": 5742 + }, + { + "epoch": 0.48, + "grad_norm": 0.23589077038115117, + "learning_rate": 5.503367206087727e-06, + "loss": 0.0592, + "step": 5743 + }, + { + "epoch": 0.48, + "grad_norm": 0.3436411318282866, + "learning_rate": 5.5020095156237575e-06, + "loss": 0.084, + "step": 5744 + }, + { + "epoch": 0.48, + "grad_norm": 0.24285203077891754, + "learning_rate": 5.500651787767196e-06, + "loss": 0.0535, + "step": 5745 + }, + { + "epoch": 0.48, + "grad_norm": 0.2931560683055988, + "learning_rate": 5.499294022619172e-06, + "loss": 0.0683, + "step": 5746 + }, + { + "epoch": 0.48, + "grad_norm": 0.2899218719864031, + "learning_rate": 5.497936220280822e-06, + "loss": 0.0673, + "step": 5747 + }, + { + "epoch": 0.48, + "grad_norm": 0.5093583918373007, + "learning_rate": 5.496578380853281e-06, + "loss": 0.1355, + "step": 5748 + }, + { + "epoch": 0.48, + "grad_norm": 0.6146155527407061, + "learning_rate": 5.495220504437688e-06, + "loss": 0.1031, + "step": 5749 + }, + { + "epoch": 0.48, + "grad_norm": 0.3455043133951765, + "learning_rate": 5.4938625911351885e-06, + "loss": 0.0701, + "step": 5750 + }, + { + "epoch": 0.48, + "grad_norm": 0.31276324853517523, + "learning_rate": 5.492504641046926e-06, + "loss": 0.1002, + "step": 5751 + }, + { + "epoch": 0.48, + "grad_norm": 0.5500843100576319, + "learning_rate": 5.491146654274049e-06, + "loss": 0.1221, + "step": 5752 + }, + { + "epoch": 0.48, + "grad_norm": 0.3300547141537957, + "learning_rate": 5.4897886309177085e-06, + "loss": 0.0668, + "step": 5753 + }, + { + "epoch": 0.48, + "grad_norm": 0.3974818080692812, + "learning_rate": 5.488430571079055e-06, + "loss": 0.0915, + "step": 5754 + }, + { + "epoch": 0.48, + "grad_norm": 0.1615940111343022, + "learning_rate": 5.487072474859251e-06, + "loss": 0.0387, + "step": 5755 + }, + { + "epoch": 0.49, + "grad_norm": 0.41050205838945275, + "learning_rate": 5.485714342359448e-06, + "loss": 0.0863, + "step": 5756 + }, + { + "epoch": 0.49, + "grad_norm": 0.23596620228530818, + "learning_rate": 5.4843561736808134e-06, + "loss": 0.0896, + "step": 5757 + }, + { + "epoch": 0.49, + "grad_norm": 0.3934464826452503, + "learning_rate": 5.4829979689245095e-06, + "loss": 0.1321, + "step": 5758 + }, + { + "epoch": 0.49, + "grad_norm": 0.38238645983439473, + "learning_rate": 5.481639728191703e-06, + "loss": 0.0837, + "step": 5759 + }, + { + "epoch": 0.49, + "grad_norm": 0.39471242910149673, + "learning_rate": 5.480281451583564e-06, + "loss": 0.0862, + "step": 5760 + }, + { + "epoch": 0.49, + "grad_norm": 0.2176033924452278, + "learning_rate": 5.478923139201262e-06, + "loss": 0.05, + "step": 5761 + }, + { + "epoch": 0.49, + "grad_norm": 0.3581866885092856, + "learning_rate": 5.477564791145978e-06, + "loss": 0.0608, + "step": 5762 + }, + { + "epoch": 0.49, + "grad_norm": 0.35832318353186404, + "learning_rate": 5.476206407518885e-06, + "loss": 0.0634, + "step": 5763 + }, + { + "epoch": 0.49, + "grad_norm": 0.3915307010977396, + "learning_rate": 5.474847988421165e-06, + "loss": 0.0916, + "step": 5764 + }, + { + "epoch": 0.49, + "grad_norm": 0.3376870603746107, + "learning_rate": 5.473489533954e-06, + "loss": 0.0711, + "step": 5765 + }, + { + "epoch": 0.49, + "grad_norm": 0.3554314616111118, + "learning_rate": 5.472131044218576e-06, + "loss": 0.0968, + "step": 5766 + }, + { + "epoch": 0.49, + "grad_norm": 0.4873010073319551, + "learning_rate": 5.470772519316082e-06, + "loss": 0.1127, + "step": 5767 + }, + { + "epoch": 0.49, + "grad_norm": 0.4002819925752563, + "learning_rate": 5.469413959347708e-06, + "loss": 0.105, + "step": 5768 + }, + { + "epoch": 0.49, + "grad_norm": 0.2927058346348805, + "learning_rate": 5.468055364414649e-06, + "loss": 0.0714, + "step": 5769 + }, + { + "epoch": 0.49, + "grad_norm": 0.42967571320779985, + "learning_rate": 5.4666967346180985e-06, + "loss": 0.0947, + "step": 5770 + }, + { + "epoch": 0.49, + "grad_norm": 0.35678173520469486, + "learning_rate": 5.465338070059257e-06, + "loss": 0.0978, + "step": 5771 + }, + { + "epoch": 0.49, + "grad_norm": 0.386854159437913, + "learning_rate": 5.463979370839325e-06, + "loss": 0.0788, + "step": 5772 + }, + { + "epoch": 0.49, + "grad_norm": 0.3165962410990284, + "learning_rate": 5.462620637059507e-06, + "loss": 0.0668, + "step": 5773 + }, + { + "epoch": 0.49, + "grad_norm": 0.3409480840710618, + "learning_rate": 5.461261868821009e-06, + "loss": 0.0456, + "step": 5774 + }, + { + "epoch": 0.49, + "grad_norm": 0.36521692370020237, + "learning_rate": 5.45990306622504e-06, + "loss": 0.0999, + "step": 5775 + }, + { + "epoch": 0.49, + "grad_norm": 0.302673743631347, + "learning_rate": 5.458544229372811e-06, + "loss": 0.0755, + "step": 5776 + }, + { + "epoch": 0.49, + "grad_norm": 0.34620960731028855, + "learning_rate": 5.457185358365537e-06, + "loss": 0.0961, + "step": 5777 + }, + { + "epoch": 0.49, + "grad_norm": 0.22435024972206655, + "learning_rate": 5.455826453304433e-06, + "loss": 0.0497, + "step": 5778 + }, + { + "epoch": 0.49, + "grad_norm": 0.8149195174956303, + "learning_rate": 5.454467514290721e-06, + "loss": 0.1441, + "step": 5779 + }, + { + "epoch": 0.49, + "grad_norm": 0.25484534732694647, + "learning_rate": 5.45310854142562e-06, + "loss": 0.0774, + "step": 5780 + }, + { + "epoch": 0.49, + "grad_norm": 0.7616130610160319, + "learning_rate": 5.4517495348103566e-06, + "loss": 0.1015, + "step": 5781 + }, + { + "epoch": 0.49, + "grad_norm": 0.20326276038898253, + "learning_rate": 5.450390494546155e-06, + "loss": 0.0676, + "step": 5782 + }, + { + "epoch": 0.49, + "grad_norm": 0.26004035244569695, + "learning_rate": 5.449031420734246e-06, + "loss": 0.0596, + "step": 5783 + }, + { + "epoch": 0.49, + "grad_norm": 0.3825553413830431, + "learning_rate": 5.44767231347586e-06, + "loss": 0.0639, + "step": 5784 + }, + { + "epoch": 0.49, + "grad_norm": 0.2925192137676939, + "learning_rate": 5.446313172872234e-06, + "loss": 0.0474, + "step": 5785 + }, + { + "epoch": 0.49, + "grad_norm": 0.5562649537342925, + "learning_rate": 5.444953999024602e-06, + "loss": 0.0984, + "step": 5786 + }, + { + "epoch": 0.49, + "grad_norm": 0.5448080211413895, + "learning_rate": 5.4435947920342045e-06, + "loss": 0.1255, + "step": 5787 + }, + { + "epoch": 0.49, + "grad_norm": 0.40403460525905366, + "learning_rate": 5.4422355520022814e-06, + "loss": 0.1249, + "step": 5788 + }, + { + "epoch": 0.49, + "grad_norm": 0.3862065898774094, + "learning_rate": 5.440876279030081e-06, + "loss": 0.1225, + "step": 5789 + }, + { + "epoch": 0.49, + "grad_norm": 0.3385398640118941, + "learning_rate": 5.439516973218844e-06, + "loss": 0.0981, + "step": 5790 + }, + { + "epoch": 0.49, + "grad_norm": 0.29746574279341037, + "learning_rate": 5.438157634669825e-06, + "loss": 0.066, + "step": 5791 + }, + { + "epoch": 0.49, + "grad_norm": 0.26264185043030625, + "learning_rate": 5.436798263484271e-06, + "loss": 0.0776, + "step": 5792 + }, + { + "epoch": 0.49, + "grad_norm": 0.4388424294379245, + "learning_rate": 5.4354388597634385e-06, + "loss": 0.1062, + "step": 5793 + }, + { + "epoch": 0.49, + "grad_norm": 0.35413387057772644, + "learning_rate": 5.434079423608584e-06, + "loss": 0.0922, + "step": 5794 + }, + { + "epoch": 0.49, + "grad_norm": 0.482696367897711, + "learning_rate": 5.432719955120963e-06, + "loss": 0.1144, + "step": 5795 + }, + { + "epoch": 0.49, + "grad_norm": 0.4412554583422484, + "learning_rate": 5.431360454401842e-06, + "loss": 0.1303, + "step": 5796 + }, + { + "epoch": 0.49, + "grad_norm": 0.5570839878973987, + "learning_rate": 5.430000921552479e-06, + "loss": 0.1261, + "step": 5797 + }, + { + "epoch": 0.49, + "grad_norm": 0.3142435168390734, + "learning_rate": 5.428641356674144e-06, + "loss": 0.0502, + "step": 5798 + }, + { + "epoch": 0.49, + "grad_norm": 0.23914828727147622, + "learning_rate": 5.427281759868101e-06, + "loss": 0.0817, + "step": 5799 + }, + { + "epoch": 0.49, + "grad_norm": 0.3124252898243474, + "learning_rate": 5.425922131235627e-06, + "loss": 0.1185, + "step": 5800 + }, + { + "epoch": 0.49, + "grad_norm": 0.39326557894380637, + "learning_rate": 5.42456247087799e-06, + "loss": 0.0983, + "step": 5801 + }, + { + "epoch": 0.49, + "grad_norm": 0.45163575489107505, + "learning_rate": 5.4232027788964655e-06, + "loss": 0.0904, + "step": 5802 + }, + { + "epoch": 0.49, + "grad_norm": 0.4718931757609372, + "learning_rate": 5.421843055392335e-06, + "loss": 0.1321, + "step": 5803 + }, + { + "epoch": 0.49, + "grad_norm": 0.48401073611050005, + "learning_rate": 5.420483300466874e-06, + "loss": 0.1029, + "step": 5804 + }, + { + "epoch": 0.49, + "grad_norm": 0.43973430989405965, + "learning_rate": 5.419123514221369e-06, + "loss": 0.0903, + "step": 5805 + }, + { + "epoch": 0.49, + "grad_norm": 0.2378611941770208, + "learning_rate": 5.417763696757103e-06, + "loss": 0.0659, + "step": 5806 + }, + { + "epoch": 0.49, + "grad_norm": 0.2681876612943705, + "learning_rate": 5.416403848175362e-06, + "loss": 0.0744, + "step": 5807 + }, + { + "epoch": 0.49, + "grad_norm": 0.3069016596884403, + "learning_rate": 5.415043968577439e-06, + "loss": 0.067, + "step": 5808 + }, + { + "epoch": 0.49, + "grad_norm": 0.2513450866195137, + "learning_rate": 5.413684058064621e-06, + "loss": 0.0473, + "step": 5809 + }, + { + "epoch": 0.49, + "grad_norm": 0.3838998526448848, + "learning_rate": 5.412324116738207e-06, + "loss": 0.0854, + "step": 5810 + }, + { + "epoch": 0.49, + "grad_norm": 0.2910001845774251, + "learning_rate": 5.410964144699487e-06, + "loss": 0.1049, + "step": 5811 + }, + { + "epoch": 0.49, + "grad_norm": 0.2809230421844356, + "learning_rate": 5.4096041420497656e-06, + "loss": 0.0934, + "step": 5812 + }, + { + "epoch": 0.49, + "grad_norm": 0.32864647845632317, + "learning_rate": 5.408244108890342e-06, + "loss": 0.0869, + "step": 5813 + }, + { + "epoch": 0.49, + "grad_norm": 0.4249609302752765, + "learning_rate": 5.406884045322519e-06, + "loss": 0.1318, + "step": 5814 + }, + { + "epoch": 0.49, + "grad_norm": 0.22202876283011805, + "learning_rate": 5.405523951447602e-06, + "loss": 0.0677, + "step": 5815 + }, + { + "epoch": 0.49, + "grad_norm": 0.3102489618890112, + "learning_rate": 5.404163827366898e-06, + "loss": 0.0925, + "step": 5816 + }, + { + "epoch": 0.49, + "grad_norm": 0.4326009188272736, + "learning_rate": 5.402803673181719e-06, + "loss": 0.1203, + "step": 5817 + }, + { + "epoch": 0.49, + "grad_norm": 0.27032184169262746, + "learning_rate": 5.4014434889933755e-06, + "loss": 0.0852, + "step": 5818 + }, + { + "epoch": 0.49, + "grad_norm": 0.2881934693378529, + "learning_rate": 5.400083274903181e-06, + "loss": 0.0737, + "step": 5819 + }, + { + "epoch": 0.49, + "grad_norm": 0.24999747585160442, + "learning_rate": 5.398723031012455e-06, + "loss": 0.1029, + "step": 5820 + }, + { + "epoch": 0.49, + "grad_norm": 0.3233752137246742, + "learning_rate": 5.397362757422515e-06, + "loss": 0.1, + "step": 5821 + }, + { + "epoch": 0.49, + "grad_norm": 0.2929702287674229, + "learning_rate": 5.396002454234681e-06, + "loss": 0.0827, + "step": 5822 + }, + { + "epoch": 0.49, + "grad_norm": 0.28087136090119724, + "learning_rate": 5.394642121550279e-06, + "loss": 0.0962, + "step": 5823 + }, + { + "epoch": 0.49, + "grad_norm": 0.4008858294120123, + "learning_rate": 5.39328175947063e-06, + "loss": 0.094, + "step": 5824 + }, + { + "epoch": 0.49, + "grad_norm": 0.41433994844952604, + "learning_rate": 5.391921368097067e-06, + "loss": 0.088, + "step": 5825 + }, + { + "epoch": 0.49, + "grad_norm": 0.33171759257417444, + "learning_rate": 5.3905609475309164e-06, + "loss": 0.0916, + "step": 5826 + }, + { + "epoch": 0.49, + "grad_norm": 0.2630866211488383, + "learning_rate": 5.389200497873512e-06, + "loss": 0.0671, + "step": 5827 + }, + { + "epoch": 0.49, + "grad_norm": 0.2786433332379624, + "learning_rate": 5.387840019226186e-06, + "loss": 0.0721, + "step": 5828 + }, + { + "epoch": 0.49, + "grad_norm": 0.24235499895738113, + "learning_rate": 5.386479511690276e-06, + "loss": 0.0567, + "step": 5829 + }, + { + "epoch": 0.49, + "grad_norm": 0.337008430306418, + "learning_rate": 5.385118975367122e-06, + "loss": 0.1405, + "step": 5830 + }, + { + "epoch": 0.49, + "grad_norm": 0.3284653822466512, + "learning_rate": 5.383758410358061e-06, + "loss": 0.0583, + "step": 5831 + }, + { + "epoch": 0.49, + "grad_norm": 0.338068549738393, + "learning_rate": 5.38239781676444e-06, + "loss": 0.0719, + "step": 5832 + }, + { + "epoch": 0.49, + "grad_norm": 0.2877297217286004, + "learning_rate": 5.381037194687602e-06, + "loss": 0.0725, + "step": 5833 + }, + { + "epoch": 0.49, + "grad_norm": 0.4754223947167591, + "learning_rate": 5.379676544228894e-06, + "loss": 0.1158, + "step": 5834 + }, + { + "epoch": 0.49, + "grad_norm": 0.21380103263903913, + "learning_rate": 5.378315865489664e-06, + "loss": 0.0536, + "step": 5835 + }, + { + "epoch": 0.49, + "grad_norm": 0.3600735838001535, + "learning_rate": 5.376955158571264e-06, + "loss": 0.0719, + "step": 5836 + }, + { + "epoch": 0.49, + "grad_norm": 0.3495979556173892, + "learning_rate": 5.37559442357505e-06, + "loss": 0.0818, + "step": 5837 + }, + { + "epoch": 0.49, + "grad_norm": 0.4074629636760797, + "learning_rate": 5.374233660602374e-06, + "loss": 0.1171, + "step": 5838 + }, + { + "epoch": 0.49, + "grad_norm": 0.3170574969120073, + "learning_rate": 5.372872869754595e-06, + "loss": 0.0518, + "step": 5839 + }, + { + "epoch": 0.49, + "grad_norm": 0.2757957544302347, + "learning_rate": 5.371512051133072e-06, + "loss": 0.0399, + "step": 5840 + }, + { + "epoch": 0.49, + "grad_norm": 0.26860795578830654, + "learning_rate": 5.370151204839168e-06, + "loss": 0.0818, + "step": 5841 + }, + { + "epoch": 0.49, + "grad_norm": 0.3532850669058584, + "learning_rate": 5.368790330974246e-06, + "loss": 0.0755, + "step": 5842 + }, + { + "epoch": 0.49, + "grad_norm": 0.3123370490525611, + "learning_rate": 5.367429429639672e-06, + "loss": 0.0746, + "step": 5843 + }, + { + "epoch": 0.49, + "grad_norm": 0.25344512682869874, + "learning_rate": 5.366068500936814e-06, + "loss": 0.0657, + "step": 5844 + }, + { + "epoch": 0.49, + "grad_norm": 0.2138844382270035, + "learning_rate": 5.3647075449670405e-06, + "loss": 0.083, + "step": 5845 + }, + { + "epoch": 0.49, + "grad_norm": 0.47741541450968, + "learning_rate": 5.363346561831725e-06, + "loss": 0.1028, + "step": 5846 + }, + { + "epoch": 0.49, + "grad_norm": 0.31529084173161936, + "learning_rate": 5.361985551632241e-06, + "loss": 0.0825, + "step": 5847 + }, + { + "epoch": 0.49, + "grad_norm": 0.35532618180966635, + "learning_rate": 5.360624514469963e-06, + "loss": 0.0937, + "step": 5848 + }, + { + "epoch": 0.49, + "grad_norm": 0.18780081210512914, + "learning_rate": 5.359263450446273e-06, + "loss": 0.05, + "step": 5849 + }, + { + "epoch": 0.49, + "grad_norm": 0.3027682616308561, + "learning_rate": 5.3579023596625455e-06, + "loss": 0.0728, + "step": 5850 + }, + { + "epoch": 0.49, + "grad_norm": 0.4961364988587402, + "learning_rate": 5.356541242220168e-06, + "loss": 0.1199, + "step": 5851 + }, + { + "epoch": 0.49, + "grad_norm": 0.2533616266521466, + "learning_rate": 5.3551800982205195e-06, + "loss": 0.06, + "step": 5852 + }, + { + "epoch": 0.49, + "grad_norm": 0.632176783013107, + "learning_rate": 5.35381892776499e-06, + "loss": 0.0866, + "step": 5853 + }, + { + "epoch": 0.49, + "grad_norm": 0.3864514137484007, + "learning_rate": 5.352457730954964e-06, + "loss": 0.0846, + "step": 5854 + }, + { + "epoch": 0.49, + "grad_norm": 0.37369094144340437, + "learning_rate": 5.351096507891834e-06, + "loss": 0.0923, + "step": 5855 + }, + { + "epoch": 0.49, + "grad_norm": 0.9331689389350514, + "learning_rate": 5.34973525867699e-06, + "loss": 0.1184, + "step": 5856 + }, + { + "epoch": 0.49, + "grad_norm": 0.3196874489671762, + "learning_rate": 5.348373983411829e-06, + "loss": 0.1118, + "step": 5857 + }, + { + "epoch": 0.49, + "grad_norm": 0.2525625080176364, + "learning_rate": 5.3470126821977405e-06, + "loss": 0.0862, + "step": 5858 + }, + { + "epoch": 0.49, + "grad_norm": 0.28349128317882527, + "learning_rate": 5.3456513551361285e-06, + "loss": 0.0459, + "step": 5859 + }, + { + "epoch": 0.49, + "grad_norm": 0.17846759299212983, + "learning_rate": 5.344290002328389e-06, + "loss": 0.0485, + "step": 5860 + }, + { + "epoch": 0.49, + "grad_norm": 0.27301391258555086, + "learning_rate": 5.3429286238759255e-06, + "loss": 0.0995, + "step": 5861 + }, + { + "epoch": 0.49, + "grad_norm": 0.22453682704146127, + "learning_rate": 5.34156721988014e-06, + "loss": 0.0796, + "step": 5862 + }, + { + "epoch": 0.49, + "grad_norm": 0.27045335024644945, + "learning_rate": 5.3402057904424385e-06, + "loss": 0.0814, + "step": 5863 + }, + { + "epoch": 0.49, + "grad_norm": 0.2613688011478815, + "learning_rate": 5.338844335664227e-06, + "loss": 0.0796, + "step": 5864 + }, + { + "epoch": 0.49, + "grad_norm": 0.2976030247791636, + "learning_rate": 5.337482855646915e-06, + "loss": 0.048, + "step": 5865 + }, + { + "epoch": 0.49, + "grad_norm": 0.298052194853526, + "learning_rate": 5.336121350491915e-06, + "loss": 0.0926, + "step": 5866 + }, + { + "epoch": 0.49, + "grad_norm": 0.3143718227900045, + "learning_rate": 5.334759820300639e-06, + "loss": 0.0805, + "step": 5867 + }, + { + "epoch": 0.49, + "grad_norm": 0.46996072528057004, + "learning_rate": 5.333398265174501e-06, + "loss": 0.1226, + "step": 5868 + }, + { + "epoch": 0.49, + "grad_norm": 0.2358291568869031, + "learning_rate": 5.332036685214918e-06, + "loss": 0.0614, + "step": 5869 + }, + { + "epoch": 0.49, + "grad_norm": 0.4607818536874872, + "learning_rate": 5.330675080523308e-06, + "loss": 0.0703, + "step": 5870 + }, + { + "epoch": 0.49, + "grad_norm": 0.5683169559396875, + "learning_rate": 5.329313451201092e-06, + "loss": 0.1208, + "step": 5871 + }, + { + "epoch": 0.49, + "grad_norm": 0.30224134679462283, + "learning_rate": 5.327951797349692e-06, + "loss": 0.0547, + "step": 5872 + }, + { + "epoch": 0.49, + "grad_norm": 0.4742975099063904, + "learning_rate": 5.326590119070532e-06, + "loss": 0.1124, + "step": 5873 + }, + { + "epoch": 0.49, + "grad_norm": 0.20598101488227194, + "learning_rate": 5.3252284164650355e-06, + "loss": 0.0586, + "step": 5874 + }, + { + "epoch": 0.5, + "grad_norm": 0.25352174154897184, + "learning_rate": 5.323866689634633e-06, + "loss": 0.0621, + "step": 5875 + }, + { + "epoch": 0.5, + "grad_norm": 0.3153490273424826, + "learning_rate": 5.322504938680754e-06, + "loss": 0.0884, + "step": 5876 + }, + { + "epoch": 0.5, + "grad_norm": 0.2667981001589733, + "learning_rate": 5.321143163704827e-06, + "loss": 0.0694, + "step": 5877 + }, + { + "epoch": 0.5, + "grad_norm": 0.27128628278836237, + "learning_rate": 5.319781364808287e-06, + "loss": 0.0764, + "step": 5878 + }, + { + "epoch": 0.5, + "grad_norm": 0.4193948040620459, + "learning_rate": 5.3184195420925686e-06, + "loss": 0.0983, + "step": 5879 + }, + { + "epoch": 0.5, + "grad_norm": 0.2864954945659026, + "learning_rate": 5.317057695659108e-06, + "loss": 0.0632, + "step": 5880 + }, + { + "epoch": 0.5, + "grad_norm": 0.21143711338179555, + "learning_rate": 5.315695825609341e-06, + "loss": 0.0517, + "step": 5881 + }, + { + "epoch": 0.5, + "grad_norm": 0.34212078559431086, + "learning_rate": 5.314333932044711e-06, + "loss": 0.0922, + "step": 5882 + }, + { + "epoch": 0.5, + "grad_norm": 0.5890274574199412, + "learning_rate": 5.312972015066659e-06, + "loss": 0.1088, + "step": 5883 + }, + { + "epoch": 0.5, + "grad_norm": 0.22022221024465025, + "learning_rate": 5.3116100747766284e-06, + "loss": 0.0759, + "step": 5884 + }, + { + "epoch": 0.5, + "grad_norm": 0.33730724392780426, + "learning_rate": 5.310248111276064e-06, + "loss": 0.0949, + "step": 5885 + }, + { + "epoch": 0.5, + "grad_norm": 0.38127342083392296, + "learning_rate": 5.308886124666413e-06, + "loss": 0.124, + "step": 5886 + }, + { + "epoch": 0.5, + "grad_norm": 0.4351611167669001, + "learning_rate": 5.3075241150491244e-06, + "loss": 0.0863, + "step": 5887 + }, + { + "epoch": 0.5, + "grad_norm": 0.37539160964482904, + "learning_rate": 5.306162082525646e-06, + "loss": 0.0868, + "step": 5888 + }, + { + "epoch": 0.5, + "grad_norm": 0.19796214184097738, + "learning_rate": 5.3048000271974356e-06, + "loss": 0.0634, + "step": 5889 + }, + { + "epoch": 0.5, + "grad_norm": 0.35484654187889153, + "learning_rate": 5.303437949165941e-06, + "loss": 0.099, + "step": 5890 + }, + { + "epoch": 0.5, + "grad_norm": 1.0184687645592114, + "learning_rate": 5.302075848532622e-06, + "loss": 0.1141, + "step": 5891 + }, + { + "epoch": 0.5, + "grad_norm": 0.2830622445863827, + "learning_rate": 5.300713725398933e-06, + "loss": 0.0465, + "step": 5892 + }, + { + "epoch": 0.5, + "grad_norm": 0.2509815816529308, + "learning_rate": 5.2993515798663345e-06, + "loss": 0.079, + "step": 5893 + }, + { + "epoch": 0.5, + "grad_norm": 0.3436773409881198, + "learning_rate": 5.297989412036285e-06, + "loss": 0.1019, + "step": 5894 + }, + { + "epoch": 0.5, + "grad_norm": 0.5377545672526517, + "learning_rate": 5.29662722201025e-06, + "loss": 0.0824, + "step": 5895 + }, + { + "epoch": 0.5, + "grad_norm": 0.3873789928457281, + "learning_rate": 5.295265009889691e-06, + "loss": 0.096, + "step": 5896 + }, + { + "epoch": 0.5, + "grad_norm": 0.4018760807378341, + "learning_rate": 5.293902775776074e-06, + "loss": 0.0691, + "step": 5897 + }, + { + "epoch": 0.5, + "grad_norm": 0.32093257797482977, + "learning_rate": 5.292540519770864e-06, + "loss": 0.1028, + "step": 5898 + }, + { + "epoch": 0.5, + "grad_norm": 0.281243435284015, + "learning_rate": 5.291178241975534e-06, + "loss": 0.0728, + "step": 5899 + }, + { + "epoch": 0.5, + "grad_norm": 0.5005730057655735, + "learning_rate": 5.289815942491553e-06, + "loss": 0.102, + "step": 5900 + }, + { + "epoch": 0.5, + "grad_norm": 0.36259547387027613, + "learning_rate": 5.2884536214203906e-06, + "loss": 0.0862, + "step": 5901 + }, + { + "epoch": 0.5, + "grad_norm": 0.40905649306165726, + "learning_rate": 5.287091278863524e-06, + "loss": 0.1181, + "step": 5902 + }, + { + "epoch": 0.5, + "grad_norm": 0.2893865074567721, + "learning_rate": 5.285728914922425e-06, + "loss": 0.0539, + "step": 5903 + }, + { + "epoch": 0.5, + "grad_norm": 0.3114824576784453, + "learning_rate": 5.284366529698572e-06, + "loss": 0.0905, + "step": 5904 + }, + { + "epoch": 0.5, + "grad_norm": 0.2703730092553051, + "learning_rate": 5.283004123293445e-06, + "loss": 0.0489, + "step": 5905 + }, + { + "epoch": 0.5, + "grad_norm": 0.3075813974631438, + "learning_rate": 5.281641695808519e-06, + "loss": 0.1066, + "step": 5906 + }, + { + "epoch": 0.5, + "grad_norm": 0.258751587127309, + "learning_rate": 5.280279247345283e-06, + "loss": 0.0505, + "step": 5907 + }, + { + "epoch": 0.5, + "grad_norm": 0.2828849318854819, + "learning_rate": 5.278916778005212e-06, + "loss": 0.0622, + "step": 5908 + }, + { + "epoch": 0.5, + "grad_norm": 0.38363668036834864, + "learning_rate": 5.277554287889798e-06, + "loss": 0.0935, + "step": 5909 + }, + { + "epoch": 0.5, + "grad_norm": 0.24332167433953486, + "learning_rate": 5.276191777100521e-06, + "loss": 0.0523, + "step": 5910 + }, + { + "epoch": 0.5, + "grad_norm": 0.22784704747186477, + "learning_rate": 5.274829245738872e-06, + "loss": 0.0576, + "step": 5911 + }, + { + "epoch": 0.5, + "grad_norm": 0.2959052956269438, + "learning_rate": 5.27346669390634e-06, + "loss": 0.0648, + "step": 5912 + }, + { + "epoch": 0.5, + "grad_norm": 0.45115793924552994, + "learning_rate": 5.272104121704417e-06, + "loss": 0.0996, + "step": 5913 + }, + { + "epoch": 0.5, + "grad_norm": 0.3299800514050582, + "learning_rate": 5.270741529234593e-06, + "loss": 0.1047, + "step": 5914 + }, + { + "epoch": 0.5, + "grad_norm": 0.4349666765386404, + "learning_rate": 5.269378916598362e-06, + "loss": 0.0673, + "step": 5915 + }, + { + "epoch": 0.5, + "grad_norm": 0.39744157951274733, + "learning_rate": 5.268016283897219e-06, + "loss": 0.0557, + "step": 5916 + }, + { + "epoch": 0.5, + "grad_norm": 0.5614272704360066, + "learning_rate": 5.266653631232665e-06, + "loss": 0.1439, + "step": 5917 + }, + { + "epoch": 0.5, + "grad_norm": 0.5728925466558913, + "learning_rate": 5.265290958706193e-06, + "loss": 0.0947, + "step": 5918 + }, + { + "epoch": 0.5, + "grad_norm": 0.2966067025469743, + "learning_rate": 5.263928266419306e-06, + "loss": 0.0672, + "step": 5919 + }, + { + "epoch": 0.5, + "grad_norm": 0.3315483235269579, + "learning_rate": 5.262565554473503e-06, + "loss": 0.0785, + "step": 5920 + }, + { + "epoch": 0.5, + "grad_norm": 0.28046838501260424, + "learning_rate": 5.26120282297029e-06, + "loss": 0.0964, + "step": 5921 + }, + { + "epoch": 0.5, + "grad_norm": 0.3593400502979721, + "learning_rate": 5.259840072011168e-06, + "loss": 0.1196, + "step": 5922 + }, + { + "epoch": 0.5, + "grad_norm": 0.4858668951485186, + "learning_rate": 5.258477301697643e-06, + "loss": 0.0774, + "step": 5923 + }, + { + "epoch": 0.5, + "grad_norm": 0.2852576439764612, + "learning_rate": 5.2571145121312225e-06, + "loss": 0.1049, + "step": 5924 + }, + { + "epoch": 0.5, + "grad_norm": 0.2921896146119645, + "learning_rate": 5.255751703413418e-06, + "loss": 0.0572, + "step": 5925 + }, + { + "epoch": 0.5, + "grad_norm": 0.21550888446582636, + "learning_rate": 5.254388875645734e-06, + "loss": 0.0567, + "step": 5926 + }, + { + "epoch": 0.5, + "grad_norm": 0.7359562388505169, + "learning_rate": 5.253026028929687e-06, + "loss": 0.1002, + "step": 5927 + }, + { + "epoch": 0.5, + "grad_norm": 0.30707085315385535, + "learning_rate": 5.251663163366785e-06, + "loss": 0.0568, + "step": 5928 + }, + { + "epoch": 0.5, + "grad_norm": 0.3113336742342824, + "learning_rate": 5.250300279058546e-06, + "loss": 0.0841, + "step": 5929 + }, + { + "epoch": 0.5, + "grad_norm": 0.5603573509038955, + "learning_rate": 5.248937376106483e-06, + "loss": 0.0965, + "step": 5930 + }, + { + "epoch": 0.5, + "grad_norm": 0.22265730954603144, + "learning_rate": 5.247574454612115e-06, + "loss": 0.0607, + "step": 5931 + }, + { + "epoch": 0.5, + "grad_norm": 0.366032201458281, + "learning_rate": 5.24621151467696e-06, + "loss": 0.1, + "step": 5932 + }, + { + "epoch": 0.5, + "grad_norm": 0.34885586152348336, + "learning_rate": 5.244848556402535e-06, + "loss": 0.0996, + "step": 5933 + }, + { + "epoch": 0.5, + "grad_norm": 0.2458269058914401, + "learning_rate": 5.243485579890365e-06, + "loss": 0.0786, + "step": 5934 + }, + { + "epoch": 0.5, + "grad_norm": 0.43203385607718814, + "learning_rate": 5.24212258524197e-06, + "loss": 0.1243, + "step": 5935 + }, + { + "epoch": 0.5, + "grad_norm": 0.5942081425326236, + "learning_rate": 5.240759572558874e-06, + "loss": 0.1134, + "step": 5936 + }, + { + "epoch": 0.5, + "grad_norm": 0.29506986376868666, + "learning_rate": 5.239396541942603e-06, + "loss": 0.1055, + "step": 5937 + }, + { + "epoch": 0.5, + "grad_norm": 0.3658487014076287, + "learning_rate": 5.2380334934946816e-06, + "loss": 0.0859, + "step": 5938 + }, + { + "epoch": 0.5, + "grad_norm": 0.3940361911956606, + "learning_rate": 5.236670427316641e-06, + "loss": 0.107, + "step": 5939 + }, + { + "epoch": 0.5, + "grad_norm": 0.17776821079862817, + "learning_rate": 5.235307343510008e-06, + "loss": 0.0365, + "step": 5940 + }, + { + "epoch": 0.5, + "grad_norm": 0.49507112224321226, + "learning_rate": 5.2339442421763135e-06, + "loss": 0.0937, + "step": 5941 + }, + { + "epoch": 0.5, + "grad_norm": 0.38761918451307387, + "learning_rate": 5.232581123417087e-06, + "loss": 0.0886, + "step": 5942 + }, + { + "epoch": 0.5, + "grad_norm": 0.3604602083178836, + "learning_rate": 5.231217987333865e-06, + "loss": 0.0808, + "step": 5943 + }, + { + "epoch": 0.5, + "grad_norm": 0.41391591755741614, + "learning_rate": 5.229854834028181e-06, + "loss": 0.0761, + "step": 5944 + }, + { + "epoch": 0.5, + "grad_norm": 0.34931570151947455, + "learning_rate": 5.228491663601568e-06, + "loss": 0.0905, + "step": 5945 + }, + { + "epoch": 0.5, + "grad_norm": 0.32140015880840156, + "learning_rate": 5.227128476155566e-06, + "loss": 0.0574, + "step": 5946 + }, + { + "epoch": 0.5, + "grad_norm": 0.35284860343490987, + "learning_rate": 5.2257652717917115e-06, + "loss": 0.0396, + "step": 5947 + }, + { + "epoch": 0.5, + "grad_norm": 0.3187629835346542, + "learning_rate": 5.224402050611547e-06, + "loss": 0.0537, + "step": 5948 + }, + { + "epoch": 0.5, + "grad_norm": 0.5469521529973783, + "learning_rate": 5.223038812716607e-06, + "loss": 0.1037, + "step": 5949 + }, + { + "epoch": 0.5, + "grad_norm": 0.21118980617296457, + "learning_rate": 5.221675558208438e-06, + "loss": 0.039, + "step": 5950 + }, + { + "epoch": 0.5, + "grad_norm": 0.47383463843984874, + "learning_rate": 5.220312287188583e-06, + "loss": 0.1454, + "step": 5951 + }, + { + "epoch": 0.5, + "grad_norm": 0.2717259588284945, + "learning_rate": 5.218948999758584e-06, + "loss": 0.0571, + "step": 5952 + }, + { + "epoch": 0.5, + "grad_norm": 0.29768858507025275, + "learning_rate": 5.2175856960199896e-06, + "loss": 0.0891, + "step": 5953 + }, + { + "epoch": 0.5, + "grad_norm": 0.3108257345894743, + "learning_rate": 5.216222376074343e-06, + "loss": 0.0727, + "step": 5954 + }, + { + "epoch": 0.5, + "grad_norm": 0.26790559583117884, + "learning_rate": 5.214859040023197e-06, + "loss": 0.1058, + "step": 5955 + }, + { + "epoch": 0.5, + "grad_norm": 0.1729405371315702, + "learning_rate": 5.213495687968096e-06, + "loss": 0.0454, + "step": 5956 + }, + { + "epoch": 0.5, + "grad_norm": 0.2882213341580032, + "learning_rate": 5.212132320010593e-06, + "loss": 0.0742, + "step": 5957 + }, + { + "epoch": 0.5, + "grad_norm": 0.35019410501393217, + "learning_rate": 5.21076893625224e-06, + "loss": 0.1134, + "step": 5958 + }, + { + "epoch": 0.5, + "grad_norm": 0.2590088417304105, + "learning_rate": 5.2094055367945874e-06, + "loss": 0.0512, + "step": 5959 + }, + { + "epoch": 0.5, + "grad_norm": 0.3632555304199441, + "learning_rate": 5.208042121739191e-06, + "loss": 0.135, + "step": 5960 + }, + { + "epoch": 0.5, + "grad_norm": 0.19896321479280807, + "learning_rate": 5.2066786911876055e-06, + "loss": 0.0427, + "step": 5961 + }, + { + "epoch": 0.5, + "grad_norm": 0.3718919503776485, + "learning_rate": 5.205315245241387e-06, + "loss": 0.113, + "step": 5962 + }, + { + "epoch": 0.5, + "grad_norm": 0.3040236896069676, + "learning_rate": 5.203951784002094e-06, + "loss": 0.1111, + "step": 5963 + }, + { + "epoch": 0.5, + "grad_norm": 0.26393562291799183, + "learning_rate": 5.202588307571282e-06, + "loss": 0.0666, + "step": 5964 + }, + { + "epoch": 0.5, + "grad_norm": 0.24472455738753562, + "learning_rate": 5.201224816050514e-06, + "loss": 0.0498, + "step": 5965 + }, + { + "epoch": 0.5, + "grad_norm": 0.25578600245277316, + "learning_rate": 5.199861309541348e-06, + "loss": 0.0499, + "step": 5966 + }, + { + "epoch": 0.5, + "grad_norm": 0.259787230681287, + "learning_rate": 5.1984977881453495e-06, + "loss": 0.0984, + "step": 5967 + }, + { + "epoch": 0.5, + "grad_norm": 0.27769885696048274, + "learning_rate": 5.197134251964079e-06, + "loss": 0.0841, + "step": 5968 + }, + { + "epoch": 0.5, + "grad_norm": 0.4147406018659988, + "learning_rate": 5.1957707010991e-06, + "loss": 0.115, + "step": 5969 + }, + { + "epoch": 0.5, + "grad_norm": 0.3659666632133375, + "learning_rate": 5.19440713565198e-06, + "loss": 0.0926, + "step": 5970 + }, + { + "epoch": 0.5, + "grad_norm": 0.3471676841698387, + "learning_rate": 5.1930435557242816e-06, + "loss": 0.1109, + "step": 5971 + }, + { + "epoch": 0.5, + "grad_norm": 0.33187993988037656, + "learning_rate": 5.191679961417577e-06, + "loss": 0.0767, + "step": 5972 + }, + { + "epoch": 0.5, + "grad_norm": 0.2284556792821559, + "learning_rate": 5.190316352833432e-06, + "loss": 0.0497, + "step": 5973 + }, + { + "epoch": 0.5, + "grad_norm": 0.16747608400686667, + "learning_rate": 5.188952730073416e-06, + "loss": 0.0446, + "step": 5974 + }, + { + "epoch": 0.5, + "grad_norm": 0.39378973268290185, + "learning_rate": 5.187589093239099e-06, + "loss": 0.0803, + "step": 5975 + }, + { + "epoch": 0.5, + "grad_norm": 0.3611311725092675, + "learning_rate": 5.186225442432055e-06, + "loss": 0.102, + "step": 5976 + }, + { + "epoch": 0.5, + "grad_norm": 0.5105128066014489, + "learning_rate": 5.184861777753855e-06, + "loss": 0.0953, + "step": 5977 + }, + { + "epoch": 0.5, + "grad_norm": 0.32264093418209583, + "learning_rate": 5.183498099306072e-06, + "loss": 0.0931, + "step": 5978 + }, + { + "epoch": 0.5, + "grad_norm": 0.19557256959602395, + "learning_rate": 5.182134407190282e-06, + "loss": 0.0822, + "step": 5979 + }, + { + "epoch": 0.5, + "grad_norm": 0.33454980657832073, + "learning_rate": 5.180770701508061e-06, + "loss": 0.0817, + "step": 5980 + }, + { + "epoch": 0.5, + "grad_norm": 0.6800409858291414, + "learning_rate": 5.1794069823609835e-06, + "loss": 0.121, + "step": 5981 + }, + { + "epoch": 0.5, + "grad_norm": 0.2530820712205607, + "learning_rate": 5.17804324985063e-06, + "loss": 0.0717, + "step": 5982 + }, + { + "epoch": 0.5, + "grad_norm": 0.31550944695375893, + "learning_rate": 5.176679504078578e-06, + "loss": 0.0601, + "step": 5983 + }, + { + "epoch": 0.5, + "grad_norm": 0.7737975011438912, + "learning_rate": 5.175315745146406e-06, + "loss": 0.1115, + "step": 5984 + }, + { + "epoch": 0.5, + "grad_norm": 0.3591080191243384, + "learning_rate": 5.173951973155698e-06, + "loss": 0.0851, + "step": 5985 + }, + { + "epoch": 0.5, + "grad_norm": 0.1899691479200827, + "learning_rate": 5.172588188208033e-06, + "loss": 0.0655, + "step": 5986 + }, + { + "epoch": 0.5, + "grad_norm": 0.27839218167742374, + "learning_rate": 5.1712243904049956e-06, + "loss": 0.1004, + "step": 5987 + }, + { + "epoch": 0.5, + "grad_norm": 0.315812940174292, + "learning_rate": 5.169860579848166e-06, + "loss": 0.0483, + "step": 5988 + }, + { + "epoch": 0.5, + "grad_norm": 0.31524438755974604, + "learning_rate": 5.168496756639132e-06, + "loss": 0.0958, + "step": 5989 + }, + { + "epoch": 0.5, + "grad_norm": 0.39547916985896014, + "learning_rate": 5.167132920879478e-06, + "loss": 0.076, + "step": 5990 + }, + { + "epoch": 0.5, + "grad_norm": 0.5516666827136503, + "learning_rate": 5.16576907267079e-06, + "loss": 0.0912, + "step": 5991 + }, + { + "epoch": 0.5, + "grad_norm": 0.858550522906612, + "learning_rate": 5.164405212114656e-06, + "loss": 0.1894, + "step": 5992 + }, + { + "epoch": 0.5, + "grad_norm": 0.2727058603363414, + "learning_rate": 5.163041339312664e-06, + "loss": 0.0551, + "step": 5993 + }, + { + "epoch": 0.51, + "grad_norm": 0.31847098732429524, + "learning_rate": 5.1616774543664025e-06, + "loss": 0.1041, + "step": 5994 + }, + { + "epoch": 0.51, + "grad_norm": 0.28331826469159443, + "learning_rate": 5.1603135573774645e-06, + "loss": 0.1042, + "step": 5995 + }, + { + "epoch": 0.51, + "grad_norm": 0.2662291660836843, + "learning_rate": 5.158949648447436e-06, + "loss": 0.0677, + "step": 5996 + }, + { + "epoch": 0.51, + "grad_norm": 0.22852635336766552, + "learning_rate": 5.157585727677914e-06, + "loss": 0.0462, + "step": 5997 + }, + { + "epoch": 0.51, + "grad_norm": 0.3893035322829395, + "learning_rate": 5.1562217951704885e-06, + "loss": 0.0867, + "step": 5998 + }, + { + "epoch": 0.51, + "grad_norm": 0.28045538888243887, + "learning_rate": 5.154857851026754e-06, + "loss": 0.0699, + "step": 5999 + }, + { + "epoch": 0.51, + "grad_norm": 0.4662070547928093, + "learning_rate": 5.153493895348303e-06, + "loss": 0.1064, + "step": 6000 + }, + { + "epoch": 0.51, + "grad_norm": 0.2500785431668303, + "learning_rate": 5.152129928236733e-06, + "loss": 0.0584, + "step": 6001 + }, + { + "epoch": 0.51, + "grad_norm": 0.5059242494636329, + "learning_rate": 5.150765949793641e-06, + "loss": 0.1115, + "step": 6002 + }, + { + "epoch": 0.51, + "grad_norm": 0.381436787512659, + "learning_rate": 5.149401960120621e-06, + "loss": 0.0792, + "step": 6003 + }, + { + "epoch": 0.51, + "grad_norm": 0.26742655601512416, + "learning_rate": 5.148037959319275e-06, + "loss": 0.1012, + "step": 6004 + }, + { + "epoch": 0.51, + "grad_norm": 0.439925332102308, + "learning_rate": 5.146673947491198e-06, + "loss": 0.1111, + "step": 6005 + }, + { + "epoch": 0.51, + "grad_norm": 0.29128553636157606, + "learning_rate": 5.1453099247379915e-06, + "loss": 0.0631, + "step": 6006 + }, + { + "epoch": 0.51, + "grad_norm": 0.33777613782662774, + "learning_rate": 5.143945891161254e-06, + "loss": 0.0902, + "step": 6007 + }, + { + "epoch": 0.51, + "grad_norm": 0.22804846787291416, + "learning_rate": 5.142581846862589e-06, + "loss": 0.0589, + "step": 6008 + }, + { + "epoch": 0.51, + "grad_norm": 0.2072539903007768, + "learning_rate": 5.141217791943597e-06, + "loss": 0.041, + "step": 6009 + }, + { + "epoch": 0.51, + "grad_norm": 0.2625603392078211, + "learning_rate": 5.139853726505881e-06, + "loss": 0.0679, + "step": 6010 + }, + { + "epoch": 0.51, + "grad_norm": 0.3142276861074264, + "learning_rate": 5.138489650651046e-06, + "loss": 0.062, + "step": 6011 + }, + { + "epoch": 0.51, + "grad_norm": 0.5182941206071672, + "learning_rate": 5.137125564480694e-06, + "loss": 0.1312, + "step": 6012 + }, + { + "epoch": 0.51, + "grad_norm": 0.3000767163669192, + "learning_rate": 5.135761468096431e-06, + "loss": 0.0677, + "step": 6013 + }, + { + "epoch": 0.51, + "grad_norm": 0.20578289587005374, + "learning_rate": 5.134397361599864e-06, + "loss": 0.0628, + "step": 6014 + }, + { + "epoch": 0.51, + "grad_norm": 0.5389329009272015, + "learning_rate": 5.133033245092597e-06, + "loss": 0.1489, + "step": 6015 + }, + { + "epoch": 0.51, + "grad_norm": 0.44336897162634975, + "learning_rate": 5.131669118676241e-06, + "loss": 0.1006, + "step": 6016 + }, + { + "epoch": 0.51, + "grad_norm": 0.28585383584612256, + "learning_rate": 5.1303049824524e-06, + "loss": 0.0516, + "step": 6017 + }, + { + "epoch": 0.51, + "grad_norm": 0.19821235155327224, + "learning_rate": 5.128940836522686e-06, + "loss": 0.034, + "step": 6018 + }, + { + "epoch": 0.51, + "grad_norm": 0.2646533646251551, + "learning_rate": 5.127576680988708e-06, + "loss": 0.0814, + "step": 6019 + }, + { + "epoch": 0.51, + "grad_norm": 0.25190132046275804, + "learning_rate": 5.126212515952073e-06, + "loss": 0.077, + "step": 6020 + }, + { + "epoch": 0.51, + "grad_norm": 0.2830587747496971, + "learning_rate": 5.1248483415143976e-06, + "loss": 0.0696, + "step": 6021 + }, + { + "epoch": 0.51, + "grad_norm": 0.4068967408667984, + "learning_rate": 5.1234841577772884e-06, + "loss": 0.0707, + "step": 6022 + }, + { + "epoch": 0.51, + "grad_norm": 0.3742501212046979, + "learning_rate": 5.122119964842361e-06, + "loss": 0.0882, + "step": 6023 + }, + { + "epoch": 0.51, + "grad_norm": 0.2364339289600392, + "learning_rate": 5.120755762811227e-06, + "loss": 0.073, + "step": 6024 + }, + { + "epoch": 0.51, + "grad_norm": 0.30354415010300856, + "learning_rate": 5.119391551785499e-06, + "loss": 0.0725, + "step": 6025 + }, + { + "epoch": 0.51, + "grad_norm": 0.6922659091844138, + "learning_rate": 5.118027331866795e-06, + "loss": 0.1682, + "step": 6026 + }, + { + "epoch": 0.51, + "grad_norm": 0.5031979493723242, + "learning_rate": 5.1166631031567275e-06, + "loss": 0.0987, + "step": 6027 + }, + { + "epoch": 0.51, + "grad_norm": 0.24862483290448176, + "learning_rate": 5.11529886575691e-06, + "loss": 0.0381, + "step": 6028 + }, + { + "epoch": 0.51, + "grad_norm": 0.4987042803812137, + "learning_rate": 5.113934619768963e-06, + "loss": 0.0931, + "step": 6029 + }, + { + "epoch": 0.51, + "grad_norm": 0.42348936868222986, + "learning_rate": 5.1125703652945014e-06, + "loss": 0.1061, + "step": 6030 + }, + { + "epoch": 0.51, + "grad_norm": 0.40289906708737316, + "learning_rate": 5.111206102435144e-06, + "loss": 0.0932, + "step": 6031 + }, + { + "epoch": 0.51, + "grad_norm": 0.341997785024806, + "learning_rate": 5.109841831292508e-06, + "loss": 0.1009, + "step": 6032 + }, + { + "epoch": 0.51, + "grad_norm": 0.2748591004662362, + "learning_rate": 5.108477551968213e-06, + "loss": 0.0775, + "step": 6033 + }, + { + "epoch": 0.51, + "grad_norm": 0.2860964350554465, + "learning_rate": 5.107113264563876e-06, + "loss": 0.0628, + "step": 6034 + }, + { + "epoch": 0.51, + "grad_norm": 0.6405188951230092, + "learning_rate": 5.105748969181121e-06, + "loss": 0.0952, + "step": 6035 + }, + { + "epoch": 0.51, + "grad_norm": 0.2633049428036427, + "learning_rate": 5.1043846659215654e-06, + "loss": 0.092, + "step": 6036 + }, + { + "epoch": 0.51, + "grad_norm": 0.42460474650644475, + "learning_rate": 5.103020354886832e-06, + "loss": 0.1037, + "step": 6037 + }, + { + "epoch": 0.51, + "grad_norm": 0.41450603349474446, + "learning_rate": 5.1016560361785435e-06, + "loss": 0.1129, + "step": 6038 + }, + { + "epoch": 0.51, + "grad_norm": 0.31181237576527454, + "learning_rate": 5.10029170989832e-06, + "loss": 0.0958, + "step": 6039 + }, + { + "epoch": 0.51, + "grad_norm": 0.19679489420377286, + "learning_rate": 5.0989273761477876e-06, + "loss": 0.0699, + "step": 6040 + }, + { + "epoch": 0.51, + "grad_norm": 0.35651059392153267, + "learning_rate": 5.097563035028565e-06, + "loss": 0.0824, + "step": 6041 + }, + { + "epoch": 0.51, + "grad_norm": 0.49997600730956254, + "learning_rate": 5.096198686642281e-06, + "loss": 0.1499, + "step": 6042 + }, + { + "epoch": 0.51, + "grad_norm": 0.4810115303788511, + "learning_rate": 5.094834331090559e-06, + "loss": 0.1039, + "step": 6043 + }, + { + "epoch": 0.51, + "grad_norm": 0.3926215205293296, + "learning_rate": 5.093469968475022e-06, + "loss": 0.1109, + "step": 6044 + }, + { + "epoch": 0.51, + "grad_norm": 0.20481746960437508, + "learning_rate": 5.0921055988972985e-06, + "loss": 0.0502, + "step": 6045 + }, + { + "epoch": 0.51, + "grad_norm": 0.8602901296057163, + "learning_rate": 5.090741222459014e-06, + "loss": 0.1349, + "step": 6046 + }, + { + "epoch": 0.51, + "grad_norm": 0.44396170496319914, + "learning_rate": 5.089376839261793e-06, + "loss": 0.0745, + "step": 6047 + }, + { + "epoch": 0.51, + "grad_norm": 0.5643161108627387, + "learning_rate": 5.0880124494072644e-06, + "loss": 0.1151, + "step": 6048 + }, + { + "epoch": 0.51, + "grad_norm": 0.43854214195706365, + "learning_rate": 5.086648052997056e-06, + "loss": 0.1003, + "step": 6049 + }, + { + "epoch": 0.51, + "grad_norm": 0.42559501223458013, + "learning_rate": 5.085283650132797e-06, + "loss": 0.1038, + "step": 6050 + }, + { + "epoch": 0.51, + "grad_norm": 0.4892743225990912, + "learning_rate": 5.0839192409161145e-06, + "loss": 0.1401, + "step": 6051 + }, + { + "epoch": 0.51, + "grad_norm": 0.2545673550847033, + "learning_rate": 5.082554825448638e-06, + "loss": 0.0821, + "step": 6052 + }, + { + "epoch": 0.51, + "grad_norm": 0.42593398863016335, + "learning_rate": 5.081190403831997e-06, + "loss": 0.1137, + "step": 6053 + }, + { + "epoch": 0.51, + "grad_norm": 0.2274941087951388, + "learning_rate": 5.079825976167821e-06, + "loss": 0.0625, + "step": 6054 + }, + { + "epoch": 0.51, + "grad_norm": 0.1998892508266908, + "learning_rate": 5.0784615425577435e-06, + "loss": 0.0687, + "step": 6055 + }, + { + "epoch": 0.51, + "grad_norm": 0.32022767270678354, + "learning_rate": 5.077097103103391e-06, + "loss": 0.1004, + "step": 6056 + }, + { + "epoch": 0.51, + "grad_norm": 0.2857402003596764, + "learning_rate": 5.075732657906398e-06, + "loss": 0.0595, + "step": 6057 + }, + { + "epoch": 0.51, + "grad_norm": 0.28914356400075425, + "learning_rate": 5.074368207068395e-06, + "loss": 0.0617, + "step": 6058 + }, + { + "epoch": 0.51, + "grad_norm": 0.24259193360553943, + "learning_rate": 5.073003750691016e-06, + "loss": 0.0655, + "step": 6059 + }, + { + "epoch": 0.51, + "grad_norm": 0.34322116862894014, + "learning_rate": 5.071639288875891e-06, + "loss": 0.0704, + "step": 6060 + }, + { + "epoch": 0.51, + "grad_norm": 0.4182289942633452, + "learning_rate": 5.070274821724656e-06, + "loss": 0.1355, + "step": 6061 + }, + { + "epoch": 0.51, + "grad_norm": 0.48638309639879845, + "learning_rate": 5.06891034933894e-06, + "loss": 0.0988, + "step": 6062 + }, + { + "epoch": 0.51, + "grad_norm": 0.3461678141176499, + "learning_rate": 5.067545871820383e-06, + "loss": 0.0774, + "step": 6063 + }, + { + "epoch": 0.51, + "grad_norm": 0.44029205232972474, + "learning_rate": 5.0661813892706135e-06, + "loss": 0.0978, + "step": 6064 + }, + { + "epoch": 0.51, + "grad_norm": 0.26941632569487267, + "learning_rate": 5.064816901791271e-06, + "loss": 0.0739, + "step": 6065 + }, + { + "epoch": 0.51, + "grad_norm": 0.3282484488175969, + "learning_rate": 5.063452409483986e-06, + "loss": 0.0868, + "step": 6066 + }, + { + "epoch": 0.51, + "grad_norm": 0.24144385176604088, + "learning_rate": 5.062087912450397e-06, + "loss": 0.0775, + "step": 6067 + }, + { + "epoch": 0.51, + "grad_norm": 0.3277952699966483, + "learning_rate": 5.060723410792137e-06, + "loss": 0.0792, + "step": 6068 + }, + { + "epoch": 0.51, + "grad_norm": 0.18504452230728521, + "learning_rate": 5.059358904610846e-06, + "loss": 0.028, + "step": 6069 + }, + { + "epoch": 0.51, + "grad_norm": 0.369359005896634, + "learning_rate": 5.057994394008156e-06, + "loss": 0.1059, + "step": 6070 + }, + { + "epoch": 0.51, + "grad_norm": 0.5076208991672917, + "learning_rate": 5.056629879085706e-06, + "loss": 0.0758, + "step": 6071 + }, + { + "epoch": 0.51, + "grad_norm": 0.28294199569622525, + "learning_rate": 5.055265359945134e-06, + "loss": 0.0514, + "step": 6072 + }, + { + "epoch": 0.51, + "grad_norm": 0.3744689800713614, + "learning_rate": 5.053900836688075e-06, + "loss": 0.0647, + "step": 6073 + }, + { + "epoch": 0.51, + "grad_norm": 0.3728414059460615, + "learning_rate": 5.052536309416168e-06, + "loss": 0.0957, + "step": 6074 + }, + { + "epoch": 0.51, + "grad_norm": 0.34654683058409563, + "learning_rate": 5.051171778231051e-06, + "loss": 0.1005, + "step": 6075 + }, + { + "epoch": 0.51, + "grad_norm": 0.24576838475027235, + "learning_rate": 5.04980724323436e-06, + "loss": 0.0578, + "step": 6076 + }, + { + "epoch": 0.51, + "grad_norm": 0.4943386021879641, + "learning_rate": 5.048442704527738e-06, + "loss": 0.1092, + "step": 6077 + }, + { + "epoch": 0.51, + "grad_norm": 0.5020615791784437, + "learning_rate": 5.047078162212819e-06, + "loss": 0.1106, + "step": 6078 + }, + { + "epoch": 0.51, + "grad_norm": 0.5014360441255473, + "learning_rate": 5.0457136163912454e-06, + "loss": 0.1264, + "step": 6079 + }, + { + "epoch": 0.51, + "grad_norm": 0.47226602959703157, + "learning_rate": 5.044349067164656e-06, + "loss": 0.1275, + "step": 6080 + }, + { + "epoch": 0.51, + "grad_norm": 0.291855423467308, + "learning_rate": 5.042984514634688e-06, + "loss": 0.0905, + "step": 6081 + }, + { + "epoch": 0.51, + "grad_norm": 0.26835728922279606, + "learning_rate": 5.0416199589029845e-06, + "loss": 0.0802, + "step": 6082 + }, + { + "epoch": 0.51, + "grad_norm": 0.2662254900305024, + "learning_rate": 5.040255400071182e-06, + "loss": 0.0915, + "step": 6083 + }, + { + "epoch": 0.51, + "grad_norm": 0.21033866312393795, + "learning_rate": 5.038890838240925e-06, + "loss": 0.0735, + "step": 6084 + }, + { + "epoch": 0.51, + "grad_norm": 0.4582072771153348, + "learning_rate": 5.037526273513851e-06, + "loss": 0.1122, + "step": 6085 + }, + { + "epoch": 0.51, + "grad_norm": 0.28944041072694016, + "learning_rate": 5.0361617059916025e-06, + "loss": 0.0792, + "step": 6086 + }, + { + "epoch": 0.51, + "grad_norm": 0.3898622098781108, + "learning_rate": 5.034797135775818e-06, + "loss": 0.104, + "step": 6087 + }, + { + "epoch": 0.51, + "grad_norm": 0.4119372114693691, + "learning_rate": 5.033432562968139e-06, + "loss": 0.099, + "step": 6088 + }, + { + "epoch": 0.51, + "grad_norm": 0.28604862202724446, + "learning_rate": 5.03206798767021e-06, + "loss": 0.0937, + "step": 6089 + }, + { + "epoch": 0.51, + "grad_norm": 0.4169722220453741, + "learning_rate": 5.03070340998367e-06, + "loss": 0.1129, + "step": 6090 + }, + { + "epoch": 0.51, + "grad_norm": 0.2675338684397392, + "learning_rate": 5.029338830010162e-06, + "loss": 0.0661, + "step": 6091 + }, + { + "epoch": 0.51, + "grad_norm": 0.32700418677726056, + "learning_rate": 5.027974247851326e-06, + "loss": 0.0642, + "step": 6092 + }, + { + "epoch": 0.51, + "grad_norm": 0.329996492913501, + "learning_rate": 5.026609663608806e-06, + "loss": 0.091, + "step": 6093 + }, + { + "epoch": 0.51, + "grad_norm": 0.5688733007835383, + "learning_rate": 5.025245077384241e-06, + "loss": 0.1369, + "step": 6094 + }, + { + "epoch": 0.51, + "grad_norm": 0.3344755744175377, + "learning_rate": 5.023880489279278e-06, + "loss": 0.092, + "step": 6095 + }, + { + "epoch": 0.51, + "grad_norm": 0.41448752927253607, + "learning_rate": 5.022515899395556e-06, + "loss": 0.094, + "step": 6096 + }, + { + "epoch": 0.51, + "grad_norm": 0.44026909409439197, + "learning_rate": 5.021151307834719e-06, + "loss": 0.143, + "step": 6097 + }, + { + "epoch": 0.51, + "grad_norm": 0.23724197925826088, + "learning_rate": 5.019786714698409e-06, + "loss": 0.058, + "step": 6098 + }, + { + "epoch": 0.51, + "grad_norm": 0.44323632492808446, + "learning_rate": 5.01842212008827e-06, + "loss": 0.1428, + "step": 6099 + }, + { + "epoch": 0.51, + "grad_norm": 0.29654520443007787, + "learning_rate": 5.017057524105943e-06, + "loss": 0.1426, + "step": 6100 + }, + { + "epoch": 0.51, + "grad_norm": 0.23975135627080457, + "learning_rate": 5.015692926853073e-06, + "loss": 0.0751, + "step": 6101 + }, + { + "epoch": 0.51, + "grad_norm": 0.5190032153670512, + "learning_rate": 5.014328328431301e-06, + "loss": 0.1205, + "step": 6102 + }, + { + "epoch": 0.51, + "grad_norm": 0.2606633688499103, + "learning_rate": 5.012963728942274e-06, + "loss": 0.077, + "step": 6103 + }, + { + "epoch": 0.51, + "grad_norm": 0.2896102191932014, + "learning_rate": 5.011599128487631e-06, + "loss": 0.0758, + "step": 6104 + }, + { + "epoch": 0.51, + "grad_norm": 0.42154835801245427, + "learning_rate": 5.010234527169018e-06, + "loss": 0.0911, + "step": 6105 + }, + { + "epoch": 0.51, + "grad_norm": 0.5778534701003817, + "learning_rate": 5.0088699250880775e-06, + "loss": 0.1206, + "step": 6106 + }, + { + "epoch": 0.51, + "grad_norm": 0.2923775936553259, + "learning_rate": 5.007505322346455e-06, + "loss": 0.0789, + "step": 6107 + }, + { + "epoch": 0.51, + "grad_norm": 0.6714251280893911, + "learning_rate": 5.006140719045791e-06, + "loss": 0.1082, + "step": 6108 + }, + { + "epoch": 0.51, + "grad_norm": 0.3114395221429919, + "learning_rate": 5.004776115287729e-06, + "loss": 0.0844, + "step": 6109 + }, + { + "epoch": 0.51, + "grad_norm": 0.3623220140843325, + "learning_rate": 5.0034115111739155e-06, + "loss": 0.0874, + "step": 6110 + }, + { + "epoch": 0.51, + "grad_norm": 0.3462248039122916, + "learning_rate": 5.0020469068059944e-06, + "loss": 0.0984, + "step": 6111 + }, + { + "epoch": 0.51, + "grad_norm": 0.32836708934653575, + "learning_rate": 5.000682302285606e-06, + "loss": 0.1043, + "step": 6112 + }, + { + "epoch": 0.52, + "grad_norm": 0.4518292686496185, + "learning_rate": 4.999317697714396e-06, + "loss": 0.1423, + "step": 6113 + }, + { + "epoch": 0.52, + "grad_norm": 0.18977448273876807, + "learning_rate": 4.997953093194008e-06, + "loss": 0.0325, + "step": 6114 + }, + { + "epoch": 0.52, + "grad_norm": 0.19822460095554992, + "learning_rate": 4.9965884888260845e-06, + "loss": 0.0372, + "step": 6115 + }, + { + "epoch": 0.52, + "grad_norm": 0.525613808028575, + "learning_rate": 4.9952238847122716e-06, + "loss": 0.1213, + "step": 6116 + }, + { + "epoch": 0.52, + "grad_norm": 0.224031031105772, + "learning_rate": 4.993859280954212e-06, + "loss": 0.0542, + "step": 6117 + }, + { + "epoch": 0.52, + "grad_norm": 0.2888428991375344, + "learning_rate": 4.992494677653546e-06, + "loss": 0.0748, + "step": 6118 + }, + { + "epoch": 0.52, + "grad_norm": 0.33916694725982105, + "learning_rate": 4.991130074911923e-06, + "loss": 0.0893, + "step": 6119 + }, + { + "epoch": 0.52, + "grad_norm": 0.36721548092249173, + "learning_rate": 4.9897654728309835e-06, + "loss": 0.0826, + "step": 6120 + }, + { + "epoch": 0.52, + "grad_norm": 0.25161914575843103, + "learning_rate": 4.988400871512371e-06, + "loss": 0.068, + "step": 6121 + }, + { + "epoch": 0.52, + "grad_norm": 0.21672267062809975, + "learning_rate": 4.987036271057728e-06, + "loss": 0.0863, + "step": 6122 + }, + { + "epoch": 0.52, + "grad_norm": 0.2740922523337386, + "learning_rate": 4.985671671568699e-06, + "loss": 0.0881, + "step": 6123 + }, + { + "epoch": 0.52, + "grad_norm": 0.24116254830966544, + "learning_rate": 4.984307073146929e-06, + "loss": 0.0657, + "step": 6124 + }, + { + "epoch": 0.52, + "grad_norm": 0.4553925657289683, + "learning_rate": 4.982942475894059e-06, + "loss": 0.0912, + "step": 6125 + }, + { + "epoch": 0.52, + "grad_norm": 0.19328634084111265, + "learning_rate": 4.981577879911732e-06, + "loss": 0.0524, + "step": 6126 + }, + { + "epoch": 0.52, + "grad_norm": 0.5352265551660182, + "learning_rate": 4.980213285301593e-06, + "loss": 0.0963, + "step": 6127 + }, + { + "epoch": 0.52, + "grad_norm": 0.4117215521420952, + "learning_rate": 4.978848692165282e-06, + "loss": 0.064, + "step": 6128 + }, + { + "epoch": 0.52, + "grad_norm": 0.27829909400673064, + "learning_rate": 4.977484100604446e-06, + "loss": 0.0763, + "step": 6129 + }, + { + "epoch": 0.52, + "grad_norm": 0.613680964042532, + "learning_rate": 4.976119510720723e-06, + "loss": 0.1162, + "step": 6130 + }, + { + "epoch": 0.52, + "grad_norm": 0.21436500883046947, + "learning_rate": 4.97475492261576e-06, + "loss": 0.0428, + "step": 6131 + }, + { + "epoch": 0.52, + "grad_norm": 0.2534080720507505, + "learning_rate": 4.9733903363911964e-06, + "loss": 0.0665, + "step": 6132 + }, + { + "epoch": 0.52, + "grad_norm": 0.24252724256587946, + "learning_rate": 4.972025752148676e-06, + "loss": 0.041, + "step": 6133 + }, + { + "epoch": 0.52, + "grad_norm": 0.3769287194881874, + "learning_rate": 4.970661169989839e-06, + "loss": 0.089, + "step": 6134 + }, + { + "epoch": 0.52, + "grad_norm": 0.2688266686908356, + "learning_rate": 4.9692965900163305e-06, + "loss": 0.0469, + "step": 6135 + }, + { + "epoch": 0.52, + "grad_norm": 0.2768978589757916, + "learning_rate": 4.9679320123297906e-06, + "loss": 0.0811, + "step": 6136 + }, + { + "epoch": 0.52, + "grad_norm": 0.3728127703149894, + "learning_rate": 4.966567437031862e-06, + "loss": 0.1083, + "step": 6137 + }, + { + "epoch": 0.52, + "grad_norm": 0.33874900795796165, + "learning_rate": 4.965202864224185e-06, + "loss": 0.0951, + "step": 6138 + }, + { + "epoch": 0.52, + "grad_norm": 0.36637561647606703, + "learning_rate": 4.963838294008399e-06, + "loss": 0.1024, + "step": 6139 + }, + { + "epoch": 0.52, + "grad_norm": 0.3797706313518771, + "learning_rate": 4.9624737264861504e-06, + "loss": 0.0917, + "step": 6140 + }, + { + "epoch": 0.52, + "grad_norm": 0.26124896950166415, + "learning_rate": 4.961109161759077e-06, + "loss": 0.0719, + "step": 6141 + }, + { + "epoch": 0.52, + "grad_norm": 0.36908080892762407, + "learning_rate": 4.959744599928818e-06, + "loss": 0.0612, + "step": 6142 + }, + { + "epoch": 0.52, + "grad_norm": 0.3029762992871014, + "learning_rate": 4.958380041097017e-06, + "loss": 0.0946, + "step": 6143 + }, + { + "epoch": 0.52, + "grad_norm": 0.4126229304999895, + "learning_rate": 4.957015485365314e-06, + "loss": 0.1055, + "step": 6144 + }, + { + "epoch": 0.52, + "grad_norm": 0.29787301507691455, + "learning_rate": 4.955650932835345e-06, + "loss": 0.0657, + "step": 6145 + }, + { + "epoch": 0.52, + "grad_norm": 0.3451998528500356, + "learning_rate": 4.954286383608755e-06, + "loss": 0.0824, + "step": 6146 + }, + { + "epoch": 0.52, + "grad_norm": 0.37345329058272825, + "learning_rate": 4.952921837787182e-06, + "loss": 0.0699, + "step": 6147 + }, + { + "epoch": 0.52, + "grad_norm": 0.28721887579417194, + "learning_rate": 4.951557295472265e-06, + "loss": 0.0819, + "step": 6148 + }, + { + "epoch": 0.52, + "grad_norm": 0.29254083426368205, + "learning_rate": 4.95019275676564e-06, + "loss": 0.0917, + "step": 6149 + }, + { + "epoch": 0.52, + "grad_norm": 0.23525709287225097, + "learning_rate": 4.948828221768951e-06, + "loss": 0.0835, + "step": 6150 + }, + { + "epoch": 0.52, + "grad_norm": 0.27557004524727574, + "learning_rate": 4.947463690583834e-06, + "loss": 0.0859, + "step": 6151 + }, + { + "epoch": 0.52, + "grad_norm": 0.3586414347561702, + "learning_rate": 4.946099163311925e-06, + "loss": 0.1031, + "step": 6152 + }, + { + "epoch": 0.52, + "grad_norm": 0.2614876758449999, + "learning_rate": 4.944734640054867e-06, + "loss": 0.0827, + "step": 6153 + }, + { + "epoch": 0.52, + "grad_norm": 0.31298286827836347, + "learning_rate": 4.943370120914295e-06, + "loss": 0.0757, + "step": 6154 + }, + { + "epoch": 0.52, + "grad_norm": 0.3697490641944804, + "learning_rate": 4.942005605991846e-06, + "loss": 0.0667, + "step": 6155 + }, + { + "epoch": 0.52, + "grad_norm": 0.7486924062975983, + "learning_rate": 4.940641095389155e-06, + "loss": 0.1032, + "step": 6156 + }, + { + "epoch": 0.52, + "grad_norm": 0.3381615215701721, + "learning_rate": 4.9392765892078635e-06, + "loss": 0.0507, + "step": 6157 + }, + { + "epoch": 0.52, + "grad_norm": 0.3717706200808369, + "learning_rate": 4.937912087549606e-06, + "loss": 0.0794, + "step": 6158 + }, + { + "epoch": 0.52, + "grad_norm": 0.2468357848572955, + "learning_rate": 4.936547590516016e-06, + "loss": 0.0701, + "step": 6159 + }, + { + "epoch": 0.52, + "grad_norm": 0.3120429182848458, + "learning_rate": 4.935183098208731e-06, + "loss": 0.0693, + "step": 6160 + }, + { + "epoch": 0.52, + "grad_norm": 0.320773136050333, + "learning_rate": 4.933818610729388e-06, + "loss": 0.08, + "step": 6161 + }, + { + "epoch": 0.52, + "grad_norm": 0.1693172104819401, + "learning_rate": 4.932454128179619e-06, + "loss": 0.0488, + "step": 6162 + }, + { + "epoch": 0.52, + "grad_norm": 0.34422545861133563, + "learning_rate": 4.931089650661061e-06, + "loss": 0.0597, + "step": 6163 + }, + { + "epoch": 0.52, + "grad_norm": 0.1610373497546505, + "learning_rate": 4.929725178275347e-06, + "loss": 0.0206, + "step": 6164 + }, + { + "epoch": 0.52, + "grad_norm": 0.4164289325134955, + "learning_rate": 4.92836071112411e-06, + "loss": 0.1167, + "step": 6165 + }, + { + "epoch": 0.52, + "grad_norm": 0.33136098218215754, + "learning_rate": 4.926996249308986e-06, + "loss": 0.0478, + "step": 6166 + }, + { + "epoch": 0.52, + "grad_norm": 0.7019121529586255, + "learning_rate": 4.925631792931607e-06, + "loss": 0.1094, + "step": 6167 + }, + { + "epoch": 0.52, + "grad_norm": 0.448148126412502, + "learning_rate": 4.924267342093603e-06, + "loss": 0.0971, + "step": 6168 + }, + { + "epoch": 0.52, + "grad_norm": 0.3316166854938567, + "learning_rate": 4.92290289689661e-06, + "loss": 0.0757, + "step": 6169 + }, + { + "epoch": 0.52, + "grad_norm": 0.38271703714217054, + "learning_rate": 4.921538457442259e-06, + "loss": 0.0924, + "step": 6170 + }, + { + "epoch": 0.52, + "grad_norm": 0.25306676155551033, + "learning_rate": 4.9201740238321805e-06, + "loss": 0.0693, + "step": 6171 + }, + { + "epoch": 0.52, + "grad_norm": 0.30773801579248233, + "learning_rate": 4.9188095961680055e-06, + "loss": 0.0783, + "step": 6172 + }, + { + "epoch": 0.52, + "grad_norm": 0.48992162934954697, + "learning_rate": 4.917445174551363e-06, + "loss": 0.133, + "step": 6173 + }, + { + "epoch": 0.52, + "grad_norm": 0.2861373728579443, + "learning_rate": 4.916080759083887e-06, + "loss": 0.084, + "step": 6174 + }, + { + "epoch": 0.52, + "grad_norm": 0.25371818306946403, + "learning_rate": 4.9147163498672055e-06, + "loss": 0.0633, + "step": 6175 + }, + { + "epoch": 0.52, + "grad_norm": 0.31941508410803, + "learning_rate": 4.913351947002943e-06, + "loss": 0.0778, + "step": 6176 + }, + { + "epoch": 0.52, + "grad_norm": 0.306464808475965, + "learning_rate": 4.911987550592736e-06, + "loss": 0.0566, + "step": 6177 + }, + { + "epoch": 0.52, + "grad_norm": 0.5865977308019092, + "learning_rate": 4.910623160738209e-06, + "loss": 0.1236, + "step": 6178 + }, + { + "epoch": 0.52, + "grad_norm": 0.26491957139116934, + "learning_rate": 4.909258777540987e-06, + "loss": 0.0711, + "step": 6179 + }, + { + "epoch": 0.52, + "grad_norm": 0.38791620493039386, + "learning_rate": 4.907894401102702e-06, + "loss": 0.1087, + "step": 6180 + }, + { + "epoch": 0.52, + "grad_norm": 0.19452559509793094, + "learning_rate": 4.906530031524979e-06, + "loss": 0.0525, + "step": 6181 + }, + { + "epoch": 0.52, + "grad_norm": 0.21244441790593097, + "learning_rate": 4.905165668909444e-06, + "loss": 0.0412, + "step": 6182 + }, + { + "epoch": 0.52, + "grad_norm": 0.2277006057321096, + "learning_rate": 4.903801313357719e-06, + "loss": 0.0527, + "step": 6183 + }, + { + "epoch": 0.52, + "grad_norm": 0.3114487910339468, + "learning_rate": 4.9024369649714355e-06, + "loss": 0.076, + "step": 6184 + }, + { + "epoch": 0.52, + "grad_norm": 0.33686043061717225, + "learning_rate": 4.901072623852215e-06, + "loss": 0.0867, + "step": 6185 + }, + { + "epoch": 0.52, + "grad_norm": 0.2269644153529275, + "learning_rate": 4.8997082901016805e-06, + "loss": 0.0645, + "step": 6186 + }, + { + "epoch": 0.52, + "grad_norm": 0.449950932502013, + "learning_rate": 4.898343963821457e-06, + "loss": 0.105, + "step": 6187 + }, + { + "epoch": 0.52, + "grad_norm": 0.22543888091890615, + "learning_rate": 4.896979645113169e-06, + "loss": 0.0794, + "step": 6188 + }, + { + "epoch": 0.52, + "grad_norm": 0.48887855084364, + "learning_rate": 4.895615334078437e-06, + "loss": 0.0891, + "step": 6189 + }, + { + "epoch": 0.52, + "grad_norm": 0.4087896293110071, + "learning_rate": 4.8942510308188804e-06, + "loss": 0.1072, + "step": 6190 + }, + { + "epoch": 0.52, + "grad_norm": 0.5042544020336369, + "learning_rate": 4.8928867354361254e-06, + "loss": 0.1369, + "step": 6191 + }, + { + "epoch": 0.52, + "grad_norm": 0.2910483740254676, + "learning_rate": 4.891522448031791e-06, + "loss": 0.0799, + "step": 6192 + }, + { + "epoch": 0.52, + "grad_norm": 0.26523813919658606, + "learning_rate": 4.890158168707494e-06, + "loss": 0.077, + "step": 6193 + }, + { + "epoch": 0.52, + "grad_norm": 0.30976789111563174, + "learning_rate": 4.8887938975648565e-06, + "loss": 0.1063, + "step": 6194 + }, + { + "epoch": 0.52, + "grad_norm": 0.3603648007666127, + "learning_rate": 4.887429634705499e-06, + "loss": 0.0818, + "step": 6195 + }, + { + "epoch": 0.52, + "grad_norm": 0.4682107554286099, + "learning_rate": 4.8860653802310374e-06, + "loss": 0.1163, + "step": 6196 + }, + { + "epoch": 0.52, + "grad_norm": 0.460316617414806, + "learning_rate": 4.884701134243091e-06, + "loss": 0.1076, + "step": 6197 + }, + { + "epoch": 0.52, + "grad_norm": 0.248189547260516, + "learning_rate": 4.883336896843275e-06, + "loss": 0.0759, + "step": 6198 + }, + { + "epoch": 0.52, + "grad_norm": 0.36616886101656765, + "learning_rate": 4.881972668133207e-06, + "loss": 0.1078, + "step": 6199 + }, + { + "epoch": 0.52, + "grad_norm": 0.33528772254418476, + "learning_rate": 4.880608448214501e-06, + "loss": 0.092, + "step": 6200 + }, + { + "epoch": 0.52, + "grad_norm": 0.3360783303094265, + "learning_rate": 4.879244237188775e-06, + "loss": 0.1176, + "step": 6201 + }, + { + "epoch": 0.52, + "grad_norm": 0.2585707002194257, + "learning_rate": 4.87788003515764e-06, + "loss": 0.0775, + "step": 6202 + }, + { + "epoch": 0.52, + "grad_norm": 0.20350412578856908, + "learning_rate": 4.8765158422227115e-06, + "loss": 0.0475, + "step": 6203 + }, + { + "epoch": 0.52, + "grad_norm": 0.3315548891993331, + "learning_rate": 4.875151658485604e-06, + "loss": 0.0882, + "step": 6204 + }, + { + "epoch": 0.52, + "grad_norm": 0.6881886227780356, + "learning_rate": 4.873787484047927e-06, + "loss": 0.1248, + "step": 6205 + }, + { + "epoch": 0.52, + "grad_norm": 0.21521139280178045, + "learning_rate": 4.872423319011295e-06, + "loss": 0.0717, + "step": 6206 + }, + { + "epoch": 0.52, + "grad_norm": 0.3109168715839414, + "learning_rate": 4.871059163477315e-06, + "loss": 0.0669, + "step": 6207 + }, + { + "epoch": 0.52, + "grad_norm": 0.34510866006985336, + "learning_rate": 4.869695017547601e-06, + "loss": 0.0739, + "step": 6208 + }, + { + "epoch": 0.52, + "grad_norm": 0.4109845004855507, + "learning_rate": 4.8683308813237616e-06, + "loss": 0.1078, + "step": 6209 + }, + { + "epoch": 0.52, + "grad_norm": 0.28230029370167403, + "learning_rate": 4.866966754907403e-06, + "loss": 0.0813, + "step": 6210 + }, + { + "epoch": 0.52, + "grad_norm": 0.30954225669820873, + "learning_rate": 4.865602638400138e-06, + "loss": 0.0875, + "step": 6211 + }, + { + "epoch": 0.52, + "grad_norm": 0.2646630837706112, + "learning_rate": 4.864238531903571e-06, + "loss": 0.0656, + "step": 6212 + }, + { + "epoch": 0.52, + "grad_norm": 0.3463217752013865, + "learning_rate": 4.862874435519307e-06, + "loss": 0.0581, + "step": 6213 + }, + { + "epoch": 0.52, + "grad_norm": 0.37180422015513553, + "learning_rate": 4.861510349348955e-06, + "loss": 0.0703, + "step": 6214 + }, + { + "epoch": 0.52, + "grad_norm": 0.27825131319640045, + "learning_rate": 4.86014627349412e-06, + "loss": 0.0976, + "step": 6215 + }, + { + "epoch": 0.52, + "grad_norm": 0.31088679586848195, + "learning_rate": 4.858782208056405e-06, + "loss": 0.1008, + "step": 6216 + }, + { + "epoch": 0.52, + "grad_norm": 0.271446927815984, + "learning_rate": 4.857418153137412e-06, + "loss": 0.0799, + "step": 6217 + }, + { + "epoch": 0.52, + "grad_norm": 0.21354901226878356, + "learning_rate": 4.856054108838747e-06, + "loss": 0.0594, + "step": 6218 + }, + { + "epoch": 0.52, + "grad_norm": 0.2921263439087751, + "learning_rate": 4.854690075262011e-06, + "loss": 0.065, + "step": 6219 + }, + { + "epoch": 0.52, + "grad_norm": 0.2259457387344419, + "learning_rate": 4.853326052508802e-06, + "loss": 0.0579, + "step": 6220 + }, + { + "epoch": 0.52, + "grad_norm": 0.3523429891646404, + "learning_rate": 4.851962040680726e-06, + "loss": 0.0535, + "step": 6221 + }, + { + "epoch": 0.52, + "grad_norm": 0.35054118297676823, + "learning_rate": 4.85059803987938e-06, + "loss": 0.0665, + "step": 6222 + }, + { + "epoch": 0.52, + "grad_norm": 0.26621599729415873, + "learning_rate": 4.8492340502063615e-06, + "loss": 0.083, + "step": 6223 + }, + { + "epoch": 0.52, + "grad_norm": 0.3795396936731353, + "learning_rate": 4.8478700717632675e-06, + "loss": 0.0743, + "step": 6224 + }, + { + "epoch": 0.52, + "grad_norm": 0.5025331654852412, + "learning_rate": 4.846506104651698e-06, + "loss": 0.0992, + "step": 6225 + }, + { + "epoch": 0.52, + "grad_norm": 0.37887447002997976, + "learning_rate": 4.845142148973249e-06, + "loss": 0.0858, + "step": 6226 + }, + { + "epoch": 0.52, + "grad_norm": 0.27834869580519567, + "learning_rate": 4.843778204829513e-06, + "loss": 0.0925, + "step": 6227 + }, + { + "epoch": 0.52, + "grad_norm": 0.2670083933259194, + "learning_rate": 4.8424142723220865e-06, + "loss": 0.0805, + "step": 6228 + }, + { + "epoch": 0.52, + "grad_norm": 0.4695305940520185, + "learning_rate": 4.841050351552565e-06, + "loss": 0.1178, + "step": 6229 + }, + { + "epoch": 0.52, + "grad_norm": 0.3898764340310893, + "learning_rate": 4.839686442622537e-06, + "loss": 0.0847, + "step": 6230 + }, + { + "epoch": 0.53, + "grad_norm": 0.36204248280406587, + "learning_rate": 4.838322545633598e-06, + "loss": 0.0983, + "step": 6231 + }, + { + "epoch": 0.53, + "grad_norm": 0.3426300874546533, + "learning_rate": 4.8369586606873376e-06, + "loss": 0.0819, + "step": 6232 + }, + { + "epoch": 0.53, + "grad_norm": 0.18740978688371399, + "learning_rate": 4.835594787885346e-06, + "loss": 0.0517, + "step": 6233 + }, + { + "epoch": 0.53, + "grad_norm": 0.3350686009514446, + "learning_rate": 4.8342309273292115e-06, + "loss": 0.1111, + "step": 6234 + }, + { + "epoch": 0.53, + "grad_norm": 0.26491393326760276, + "learning_rate": 4.832867079120524e-06, + "loss": 0.0782, + "step": 6235 + }, + { + "epoch": 0.53, + "grad_norm": 0.31604590788242415, + "learning_rate": 4.831503243360869e-06, + "loss": 0.0769, + "step": 6236 + }, + { + "epoch": 0.53, + "grad_norm": 0.4068779973617073, + "learning_rate": 4.830139420151835e-06, + "loss": 0.1109, + "step": 6237 + }, + { + "epoch": 0.53, + "grad_norm": 0.43931047000082313, + "learning_rate": 4.828775609595007e-06, + "loss": 0.1237, + "step": 6238 + }, + { + "epoch": 0.53, + "grad_norm": 0.26055486211230416, + "learning_rate": 4.8274118117919685e-06, + "loss": 0.0819, + "step": 6239 + }, + { + "epoch": 0.53, + "grad_norm": 0.2920040158218073, + "learning_rate": 4.826048026844304e-06, + "loss": 0.0857, + "step": 6240 + }, + { + "epoch": 0.53, + "grad_norm": 0.2061827867170571, + "learning_rate": 4.824684254853594e-06, + "loss": 0.0734, + "step": 6241 + }, + { + "epoch": 0.53, + "grad_norm": 0.4860217462059491, + "learning_rate": 4.823320495921424e-06, + "loss": 0.1377, + "step": 6242 + }, + { + "epoch": 0.53, + "grad_norm": 0.2899670608034245, + "learning_rate": 4.821956750149372e-06, + "loss": 0.0522, + "step": 6243 + }, + { + "epoch": 0.53, + "grad_norm": 0.4591723499865208, + "learning_rate": 4.820593017639017e-06, + "loss": 0.0772, + "step": 6244 + }, + { + "epoch": 0.53, + "grad_norm": 0.37750475993514787, + "learning_rate": 4.819229298491941e-06, + "loss": 0.0773, + "step": 6245 + }, + { + "epoch": 0.53, + "grad_norm": 0.2843229307029026, + "learning_rate": 4.81786559280972e-06, + "loss": 0.1086, + "step": 6246 + }, + { + "epoch": 0.53, + "grad_norm": 0.38438792982637776, + "learning_rate": 4.816501900693931e-06, + "loss": 0.0752, + "step": 6247 + }, + { + "epoch": 0.53, + "grad_norm": 0.28842356582610174, + "learning_rate": 4.815138222246147e-06, + "loss": 0.1065, + "step": 6248 + }, + { + "epoch": 0.53, + "grad_norm": 0.20391680627537037, + "learning_rate": 4.813774557567947e-06, + "loss": 0.0676, + "step": 6249 + }, + { + "epoch": 0.53, + "grad_norm": 0.26989407069220805, + "learning_rate": 4.812410906760903e-06, + "loss": 0.0749, + "step": 6250 + }, + { + "epoch": 0.53, + "grad_norm": 0.38659777572540693, + "learning_rate": 4.811047269926585e-06, + "loss": 0.1225, + "step": 6251 + }, + { + "epoch": 0.53, + "grad_norm": 0.30223685482080054, + "learning_rate": 4.809683647166569e-06, + "loss": 0.093, + "step": 6252 + }, + { + "epoch": 0.53, + "grad_norm": 0.31324597709209756, + "learning_rate": 4.808320038582424e-06, + "loss": 0.0756, + "step": 6253 + }, + { + "epoch": 0.53, + "grad_norm": 0.3323420498607163, + "learning_rate": 4.8069564442757176e-06, + "loss": 0.0838, + "step": 6254 + }, + { + "epoch": 0.53, + "grad_norm": 0.3404440843400772, + "learning_rate": 4.805592864348022e-06, + "loss": 0.0996, + "step": 6255 + }, + { + "epoch": 0.53, + "grad_norm": 0.5698618083517882, + "learning_rate": 4.8042292989009016e-06, + "loss": 0.1477, + "step": 6256 + }, + { + "epoch": 0.53, + "grad_norm": 0.17998600127109962, + "learning_rate": 4.8028657480359236e-06, + "loss": 0.0397, + "step": 6257 + }, + { + "epoch": 0.53, + "grad_norm": 0.19905570148235932, + "learning_rate": 4.801502211854651e-06, + "loss": 0.0544, + "step": 6258 + }, + { + "epoch": 0.53, + "grad_norm": 0.32578127007529, + "learning_rate": 4.800138690458653e-06, + "loss": 0.0826, + "step": 6259 + }, + { + "epoch": 0.53, + "grad_norm": 0.22892401593421635, + "learning_rate": 4.798775183949488e-06, + "loss": 0.0682, + "step": 6260 + }, + { + "epoch": 0.53, + "grad_norm": 0.2784866089187351, + "learning_rate": 4.7974116924287194e-06, + "loss": 0.0731, + "step": 6261 + }, + { + "epoch": 0.53, + "grad_norm": 0.24154303058920804, + "learning_rate": 4.796048215997908e-06, + "loss": 0.0472, + "step": 6262 + }, + { + "epoch": 0.53, + "grad_norm": 0.1882640249736183, + "learning_rate": 4.794684754758614e-06, + "loss": 0.0706, + "step": 6263 + }, + { + "epoch": 0.53, + "grad_norm": 0.32166669477369153, + "learning_rate": 4.793321308812396e-06, + "loss": 0.0504, + "step": 6264 + }, + { + "epoch": 0.53, + "grad_norm": 0.3799128565414007, + "learning_rate": 4.791957878260811e-06, + "loss": 0.0779, + "step": 6265 + }, + { + "epoch": 0.53, + "grad_norm": 0.4835770038609911, + "learning_rate": 4.790594463205413e-06, + "loss": 0.0698, + "step": 6266 + }, + { + "epoch": 0.53, + "grad_norm": 0.3239104881339727, + "learning_rate": 4.789231063747763e-06, + "loss": 0.1054, + "step": 6267 + }, + { + "epoch": 0.53, + "grad_norm": 0.4536566216534488, + "learning_rate": 4.787867679989408e-06, + "loss": 0.1383, + "step": 6268 + }, + { + "epoch": 0.53, + "grad_norm": 0.4604642955481199, + "learning_rate": 4.786504312031905e-06, + "loss": 0.1098, + "step": 6269 + }, + { + "epoch": 0.53, + "grad_norm": 0.3122401583874322, + "learning_rate": 4.7851409599768045e-06, + "loss": 0.0616, + "step": 6270 + }, + { + "epoch": 0.53, + "grad_norm": 0.5975977364413436, + "learning_rate": 4.783777623925657e-06, + "loss": 0.1334, + "step": 6271 + }, + { + "epoch": 0.53, + "grad_norm": 0.25105025721283447, + "learning_rate": 4.782414303980012e-06, + "loss": 0.099, + "step": 6272 + }, + { + "epoch": 0.53, + "grad_norm": 0.28175402175268005, + "learning_rate": 4.781051000241417e-06, + "loss": 0.0957, + "step": 6273 + }, + { + "epoch": 0.53, + "grad_norm": 0.28155934290594453, + "learning_rate": 4.779687712811419e-06, + "loss": 0.0595, + "step": 6274 + }, + { + "epoch": 0.53, + "grad_norm": 0.28438737266784225, + "learning_rate": 4.778324441791562e-06, + "loss": 0.0909, + "step": 6275 + }, + { + "epoch": 0.53, + "grad_norm": 0.2715099948536962, + "learning_rate": 4.7769611872833944e-06, + "loss": 0.0524, + "step": 6276 + }, + { + "epoch": 0.53, + "grad_norm": 0.45232396982033773, + "learning_rate": 4.775597949388457e-06, + "loss": 0.1592, + "step": 6277 + }, + { + "epoch": 0.53, + "grad_norm": 0.2592280354765477, + "learning_rate": 4.774234728208288e-06, + "loss": 0.0853, + "step": 6278 + }, + { + "epoch": 0.53, + "grad_norm": 0.30877447044210227, + "learning_rate": 4.772871523844435e-06, + "loss": 0.0825, + "step": 6279 + }, + { + "epoch": 0.53, + "grad_norm": 0.2618951953030846, + "learning_rate": 4.771508336398433e-06, + "loss": 0.0712, + "step": 6280 + }, + { + "epoch": 0.53, + "grad_norm": 0.44030031974701817, + "learning_rate": 4.770145165971823e-06, + "loss": 0.1257, + "step": 6281 + }, + { + "epoch": 0.53, + "grad_norm": 0.3478775458193837, + "learning_rate": 4.768782012666136e-06, + "loss": 0.0918, + "step": 6282 + }, + { + "epoch": 0.53, + "grad_norm": 0.26107654248503626, + "learning_rate": 4.767418876582914e-06, + "loss": 0.0586, + "step": 6283 + }, + { + "epoch": 0.53, + "grad_norm": 0.1992128709048062, + "learning_rate": 4.766055757823689e-06, + "loss": 0.0675, + "step": 6284 + }, + { + "epoch": 0.53, + "grad_norm": 0.41925562761070967, + "learning_rate": 4.764692656489992e-06, + "loss": 0.1136, + "step": 6285 + }, + { + "epoch": 0.53, + "grad_norm": 0.3246617472946094, + "learning_rate": 4.76332957268336e-06, + "loss": 0.0976, + "step": 6286 + }, + { + "epoch": 0.53, + "grad_norm": 0.37278841245889494, + "learning_rate": 4.761966506505319e-06, + "loss": 0.0709, + "step": 6287 + }, + { + "epoch": 0.53, + "grad_norm": 0.48758677019116825, + "learning_rate": 4.760603458057397e-06, + "loss": 0.1171, + "step": 6288 + }, + { + "epoch": 0.53, + "grad_norm": 0.5101353739093077, + "learning_rate": 4.759240427441127e-06, + "loss": 0.176, + "step": 6289 + }, + { + "epoch": 0.53, + "grad_norm": 0.4578674618512451, + "learning_rate": 4.757877414758032e-06, + "loss": 0.0966, + "step": 6290 + }, + { + "epoch": 0.53, + "grad_norm": 0.3082300225027582, + "learning_rate": 4.756514420109638e-06, + "loss": 0.0654, + "step": 6291 + }, + { + "epoch": 0.53, + "grad_norm": 0.24800809474449645, + "learning_rate": 4.755151443597466e-06, + "loss": 0.0899, + "step": 6292 + }, + { + "epoch": 0.53, + "grad_norm": 0.4708500818865379, + "learning_rate": 4.753788485323042e-06, + "loss": 0.1311, + "step": 6293 + }, + { + "epoch": 0.53, + "grad_norm": 0.6769839347135105, + "learning_rate": 4.752425545387887e-06, + "loss": 0.137, + "step": 6294 + }, + { + "epoch": 0.53, + "grad_norm": 0.3155863847727109, + "learning_rate": 4.751062623893518e-06, + "loss": 0.0923, + "step": 6295 + }, + { + "epoch": 0.53, + "grad_norm": 0.27708340327039194, + "learning_rate": 4.749699720941455e-06, + "loss": 0.079, + "step": 6296 + }, + { + "epoch": 0.53, + "grad_norm": 0.34628954262283923, + "learning_rate": 4.7483368366332166e-06, + "loss": 0.0974, + "step": 6297 + }, + { + "epoch": 0.53, + "grad_norm": 0.3294642223885916, + "learning_rate": 4.746973971070316e-06, + "loss": 0.1051, + "step": 6298 + }, + { + "epoch": 0.53, + "grad_norm": 0.23616371939708927, + "learning_rate": 4.745611124354267e-06, + "loss": 0.0821, + "step": 6299 + }, + { + "epoch": 0.53, + "grad_norm": 0.14198819248534972, + "learning_rate": 4.744248296586584e-06, + "loss": 0.0204, + "step": 6300 + }, + { + "epoch": 0.53, + "grad_norm": 0.4264165092607951, + "learning_rate": 4.742885487868778e-06, + "loss": 0.1042, + "step": 6301 + }, + { + "epoch": 0.53, + "grad_norm": 0.24703185208719472, + "learning_rate": 4.741522698302358e-06, + "loss": 0.0432, + "step": 6302 + }, + { + "epoch": 0.53, + "grad_norm": 0.3771141246815471, + "learning_rate": 4.740159927988835e-06, + "loss": 0.1092, + "step": 6303 + }, + { + "epoch": 0.53, + "grad_norm": 0.3329086267049419, + "learning_rate": 4.738797177029711e-06, + "loss": 0.0672, + "step": 6304 + }, + { + "epoch": 0.53, + "grad_norm": 0.19626758612884662, + "learning_rate": 4.737434445526497e-06, + "loss": 0.0532, + "step": 6305 + }, + { + "epoch": 0.53, + "grad_norm": 0.23309166058036826, + "learning_rate": 4.736071733580695e-06, + "loss": 0.0217, + "step": 6306 + }, + { + "epoch": 0.53, + "grad_norm": 0.45785665305736367, + "learning_rate": 4.7347090412938085e-06, + "loss": 0.1495, + "step": 6307 + }, + { + "epoch": 0.53, + "grad_norm": 0.30693582900821176, + "learning_rate": 4.733346368767338e-06, + "loss": 0.1128, + "step": 6308 + }, + { + "epoch": 0.53, + "grad_norm": 0.5212671926191326, + "learning_rate": 4.73198371610278e-06, + "loss": 0.0827, + "step": 6309 + }, + { + "epoch": 0.53, + "grad_norm": 0.9839800350129269, + "learning_rate": 4.73062108340164e-06, + "loss": 0.104, + "step": 6310 + }, + { + "epoch": 0.53, + "grad_norm": 0.2438334409648715, + "learning_rate": 4.7292584707654096e-06, + "loss": 0.0869, + "step": 6311 + }, + { + "epoch": 0.53, + "grad_norm": 0.33223036155517466, + "learning_rate": 4.727895878295584e-06, + "loss": 0.0658, + "step": 6312 + }, + { + "epoch": 0.53, + "grad_norm": 0.3745328593180325, + "learning_rate": 4.7265333060936605e-06, + "loss": 0.1039, + "step": 6313 + }, + { + "epoch": 0.53, + "grad_norm": 0.2989441145540522, + "learning_rate": 4.72517075426113e-06, + "loss": 0.0781, + "step": 6314 + }, + { + "epoch": 0.53, + "grad_norm": 0.44429380754758774, + "learning_rate": 4.723808222899481e-06, + "loss": 0.0742, + "step": 6315 + }, + { + "epoch": 0.53, + "grad_norm": 0.4552469348014972, + "learning_rate": 4.722445712110204e-06, + "loss": 0.1164, + "step": 6316 + }, + { + "epoch": 0.53, + "grad_norm": 0.3150480923628805, + "learning_rate": 4.721083221994789e-06, + "loss": 0.0898, + "step": 6317 + }, + { + "epoch": 0.53, + "grad_norm": 0.35756208420739405, + "learning_rate": 4.71972075265472e-06, + "loss": 0.096, + "step": 6318 + }, + { + "epoch": 0.53, + "grad_norm": 0.2967834577307035, + "learning_rate": 4.71835830419148e-06, + "loss": 0.1013, + "step": 6319 + }, + { + "epoch": 0.53, + "grad_norm": 0.23760317233505612, + "learning_rate": 4.716995876706557e-06, + "loss": 0.0404, + "step": 6320 + }, + { + "epoch": 0.53, + "grad_norm": 0.2278106666694121, + "learning_rate": 4.715633470301429e-06, + "loss": 0.0602, + "step": 6321 + }, + { + "epoch": 0.53, + "grad_norm": 0.25198557628750506, + "learning_rate": 4.7142710850775755e-06, + "loss": 0.0582, + "step": 6322 + }, + { + "epoch": 0.53, + "grad_norm": 0.2677006939004904, + "learning_rate": 4.712908721136477e-06, + "loss": 0.0701, + "step": 6323 + }, + { + "epoch": 0.53, + "grad_norm": 0.31163436756327373, + "learning_rate": 4.71154637857961e-06, + "loss": 0.0857, + "step": 6324 + }, + { + "epoch": 0.53, + "grad_norm": 0.3426430292277268, + "learning_rate": 4.71018405750845e-06, + "loss": 0.0649, + "step": 6325 + }, + { + "epoch": 0.53, + "grad_norm": 0.3352365569695946, + "learning_rate": 4.708821758024466e-06, + "loss": 0.0759, + "step": 6326 + }, + { + "epoch": 0.53, + "grad_norm": 0.6054451185818653, + "learning_rate": 4.707459480229137e-06, + "loss": 0.1483, + "step": 6327 + }, + { + "epoch": 0.53, + "grad_norm": 0.2783469794141662, + "learning_rate": 4.706097224223929e-06, + "loss": 0.0544, + "step": 6328 + }, + { + "epoch": 0.53, + "grad_norm": 0.3319860176359143, + "learning_rate": 4.704734990110311e-06, + "loss": 0.0687, + "step": 6329 + }, + { + "epoch": 0.53, + "grad_norm": 0.26565763150784566, + "learning_rate": 4.703372777989752e-06, + "loss": 0.0734, + "step": 6330 + }, + { + "epoch": 0.53, + "grad_norm": 0.35536486295116987, + "learning_rate": 4.702010587963716e-06, + "loss": 0.1124, + "step": 6331 + }, + { + "epoch": 0.53, + "grad_norm": 0.4190772687855606, + "learning_rate": 4.700648420133669e-06, + "loss": 0.1122, + "step": 6332 + }, + { + "epoch": 0.53, + "grad_norm": 0.4046026844649208, + "learning_rate": 4.699286274601069e-06, + "loss": 0.0903, + "step": 6333 + }, + { + "epoch": 0.53, + "grad_norm": 0.38589767793669816, + "learning_rate": 4.6979241514673805e-06, + "loss": 0.1003, + "step": 6334 + }, + { + "epoch": 0.53, + "grad_norm": 0.2460209757292161, + "learning_rate": 4.69656205083406e-06, + "loss": 0.0788, + "step": 6335 + }, + { + "epoch": 0.53, + "grad_norm": 0.24243466273324263, + "learning_rate": 4.695199972802566e-06, + "loss": 0.0619, + "step": 6336 + }, + { + "epoch": 0.53, + "grad_norm": 0.4275353329535424, + "learning_rate": 4.693837917474355e-06, + "loss": 0.1012, + "step": 6337 + }, + { + "epoch": 0.53, + "grad_norm": 0.3746458077203752, + "learning_rate": 4.692475884950877e-06, + "loss": 0.0989, + "step": 6338 + }, + { + "epoch": 0.53, + "grad_norm": 0.3015386618161945, + "learning_rate": 4.691113875333588e-06, + "loss": 0.0742, + "step": 6339 + }, + { + "epoch": 0.53, + "grad_norm": 0.31256212176039233, + "learning_rate": 4.6897518887239366e-06, + "loss": 0.1225, + "step": 6340 + }, + { + "epoch": 0.53, + "grad_norm": 0.2833146786825738, + "learning_rate": 4.688389925223372e-06, + "loss": 0.1078, + "step": 6341 + }, + { + "epoch": 0.53, + "grad_norm": 0.38156956934619807, + "learning_rate": 4.687027984933343e-06, + "loss": 0.0871, + "step": 6342 + }, + { + "epoch": 0.53, + "grad_norm": 0.3313935835880675, + "learning_rate": 4.685666067955289e-06, + "loss": 0.1036, + "step": 6343 + }, + { + "epoch": 0.53, + "grad_norm": 0.2203990771803701, + "learning_rate": 4.6843041743906595e-06, + "loss": 0.0545, + "step": 6344 + }, + { + "epoch": 0.53, + "grad_norm": 0.2193640648282117, + "learning_rate": 4.682942304340895e-06, + "loss": 0.0704, + "step": 6345 + }, + { + "epoch": 0.53, + "grad_norm": 0.16418517763049162, + "learning_rate": 4.681580457907432e-06, + "loss": 0.049, + "step": 6346 + }, + { + "epoch": 0.53, + "grad_norm": 0.45874778995129695, + "learning_rate": 4.680218635191714e-06, + "loss": 0.1298, + "step": 6347 + }, + { + "epoch": 0.53, + "grad_norm": 0.39042596649995576, + "learning_rate": 4.678856836295174e-06, + "loss": 0.0886, + "step": 6348 + }, + { + "epoch": 0.53, + "grad_norm": 0.86316159869915, + "learning_rate": 4.677495061319248e-06, + "loss": 0.1508, + "step": 6349 + }, + { + "epoch": 0.54, + "grad_norm": 0.3565196450777301, + "learning_rate": 4.676133310365368e-06, + "loss": 0.0775, + "step": 6350 + }, + { + "epoch": 0.54, + "grad_norm": 0.5590742391171413, + "learning_rate": 4.674771583534965e-06, + "loss": 0.0717, + "step": 6351 + }, + { + "epoch": 0.54, + "grad_norm": 0.4080409352837086, + "learning_rate": 4.673409880929471e-06, + "loss": 0.0939, + "step": 6352 + }, + { + "epoch": 0.54, + "grad_norm": 0.3170856063196134, + "learning_rate": 4.672048202650309e-06, + "loss": 0.0793, + "step": 6353 + }, + { + "epoch": 0.54, + "grad_norm": 0.6902625030187478, + "learning_rate": 4.6706865487989095e-06, + "loss": 0.1317, + "step": 6354 + }, + { + "epoch": 0.54, + "grad_norm": 0.23264450947519252, + "learning_rate": 4.669324919476695e-06, + "loss": 0.0528, + "step": 6355 + }, + { + "epoch": 0.54, + "grad_norm": 0.37434976642229184, + "learning_rate": 4.667963314785085e-06, + "loss": 0.0911, + "step": 6356 + }, + { + "epoch": 0.54, + "grad_norm": 0.4088662011726529, + "learning_rate": 4.666601734825501e-06, + "loss": 0.0772, + "step": 6357 + }, + { + "epoch": 0.54, + "grad_norm": 0.2908346145386005, + "learning_rate": 4.665240179699363e-06, + "loss": 0.0734, + "step": 6358 + }, + { + "epoch": 0.54, + "grad_norm": 0.4560207248379656, + "learning_rate": 4.663878649508087e-06, + "loss": 0.098, + "step": 6359 + }, + { + "epoch": 0.54, + "grad_norm": 0.21655318704650411, + "learning_rate": 4.662517144353085e-06, + "loss": 0.0411, + "step": 6360 + }, + { + "epoch": 0.54, + "grad_norm": 0.3130930820213886, + "learning_rate": 4.6611556643357745e-06, + "loss": 0.0842, + "step": 6361 + }, + { + "epoch": 0.54, + "grad_norm": 0.26301732524459226, + "learning_rate": 4.659794209557564e-06, + "loss": 0.0531, + "step": 6362 + }, + { + "epoch": 0.54, + "grad_norm": 0.23218131776371206, + "learning_rate": 4.658432780119861e-06, + "loss": 0.0541, + "step": 6363 + }, + { + "epoch": 0.54, + "grad_norm": 0.246466215368339, + "learning_rate": 4.657071376124075e-06, + "loss": 0.0585, + "step": 6364 + }, + { + "epoch": 0.54, + "grad_norm": 0.303471655266001, + "learning_rate": 4.655709997671612e-06, + "loss": 0.0887, + "step": 6365 + }, + { + "epoch": 0.54, + "grad_norm": 0.2096262742961997, + "learning_rate": 4.654348644863873e-06, + "loss": 0.0599, + "step": 6366 + }, + { + "epoch": 0.54, + "grad_norm": 0.3843488090242564, + "learning_rate": 4.65298731780226e-06, + "loss": 0.094, + "step": 6367 + }, + { + "epoch": 0.54, + "grad_norm": 0.3498700668141037, + "learning_rate": 4.651626016588173e-06, + "loss": 0.0947, + "step": 6368 + }, + { + "epoch": 0.54, + "grad_norm": 0.3623777144333756, + "learning_rate": 4.650264741323011e-06, + "loss": 0.1137, + "step": 6369 + }, + { + "epoch": 0.54, + "grad_norm": 0.2531786888615636, + "learning_rate": 4.648903492108167e-06, + "loss": 0.0717, + "step": 6370 + }, + { + "epoch": 0.54, + "grad_norm": 0.2930192755767202, + "learning_rate": 4.6475422690450375e-06, + "loss": 0.0752, + "step": 6371 + }, + { + "epoch": 0.54, + "grad_norm": 0.2705824940198104, + "learning_rate": 4.6461810722350114e-06, + "loss": 0.0749, + "step": 6372 + }, + { + "epoch": 0.54, + "grad_norm": 0.38572616716722485, + "learning_rate": 4.644819901779482e-06, + "loss": 0.1204, + "step": 6373 + }, + { + "epoch": 0.54, + "grad_norm": 0.3515170840366077, + "learning_rate": 4.643458757779834e-06, + "loss": 0.1058, + "step": 6374 + }, + { + "epoch": 0.54, + "grad_norm": 0.31565159039521745, + "learning_rate": 4.642097640337455e-06, + "loss": 0.0745, + "step": 6375 + }, + { + "epoch": 0.54, + "grad_norm": 0.17036273316881242, + "learning_rate": 4.640736549553731e-06, + "loss": 0.0463, + "step": 6376 + }, + { + "epoch": 0.54, + "grad_norm": 0.20259798271804128, + "learning_rate": 4.6393754855300375e-06, + "loss": 0.0614, + "step": 6377 + }, + { + "epoch": 0.54, + "grad_norm": 0.18105187501842224, + "learning_rate": 4.638014448367762e-06, + "loss": 0.07, + "step": 6378 + }, + { + "epoch": 0.54, + "grad_norm": 0.23670305452410104, + "learning_rate": 4.636653438168277e-06, + "loss": 0.0978, + "step": 6379 + }, + { + "epoch": 0.54, + "grad_norm": 0.2606657516327883, + "learning_rate": 4.63529245503296e-06, + "loss": 0.092, + "step": 6380 + }, + { + "epoch": 0.54, + "grad_norm": 0.3863081708077683, + "learning_rate": 4.633931499063188e-06, + "loss": 0.1243, + "step": 6381 + }, + { + "epoch": 0.54, + "grad_norm": 0.4810535772326138, + "learning_rate": 4.63257057036033e-06, + "loss": 0.0941, + "step": 6382 + }, + { + "epoch": 0.54, + "grad_norm": 0.30794550757641614, + "learning_rate": 4.631209669025756e-06, + "loss": 0.0851, + "step": 6383 + }, + { + "epoch": 0.54, + "grad_norm": 0.3387356568195617, + "learning_rate": 4.629848795160832e-06, + "loss": 0.0928, + "step": 6384 + }, + { + "epoch": 0.54, + "grad_norm": 0.26616877638101577, + "learning_rate": 4.628487948866929e-06, + "loss": 0.0489, + "step": 6385 + }, + { + "epoch": 0.54, + "grad_norm": 0.3209706381627441, + "learning_rate": 4.6271271302454075e-06, + "loss": 0.1002, + "step": 6386 + }, + { + "epoch": 0.54, + "grad_norm": 0.22102149492760403, + "learning_rate": 4.625766339397627e-06, + "loss": 0.0671, + "step": 6387 + }, + { + "epoch": 0.54, + "grad_norm": 0.3741838329478444, + "learning_rate": 4.624405576424951e-06, + "loss": 0.1044, + "step": 6388 + }, + { + "epoch": 0.54, + "grad_norm": 0.2127267148578572, + "learning_rate": 4.6230448414287375e-06, + "loss": 0.0518, + "step": 6389 + }, + { + "epoch": 0.54, + "grad_norm": 0.23212303153536942, + "learning_rate": 4.6216841345103395e-06, + "loss": 0.0848, + "step": 6390 + }, + { + "epoch": 0.54, + "grad_norm": 0.5229934445295567, + "learning_rate": 4.620323455771108e-06, + "loss": 0.1313, + "step": 6391 + }, + { + "epoch": 0.54, + "grad_norm": 0.382715392410636, + "learning_rate": 4.6189628053124e-06, + "loss": 0.1339, + "step": 6392 + }, + { + "epoch": 0.54, + "grad_norm": 0.28198871823808924, + "learning_rate": 4.617602183235562e-06, + "loss": 0.0637, + "step": 6393 + }, + { + "epoch": 0.54, + "grad_norm": 0.3632364371299663, + "learning_rate": 4.616241589641939e-06, + "loss": 0.0817, + "step": 6394 + }, + { + "epoch": 0.54, + "grad_norm": 0.27242411559463203, + "learning_rate": 4.61488102463288e-06, + "loss": 0.0923, + "step": 6395 + }, + { + "epoch": 0.54, + "grad_norm": 0.22680724022912901, + "learning_rate": 4.613520488309726e-06, + "loss": 0.069, + "step": 6396 + }, + { + "epoch": 0.54, + "grad_norm": 0.22214152369120563, + "learning_rate": 4.612159980773816e-06, + "loss": 0.0662, + "step": 6397 + }, + { + "epoch": 0.54, + "grad_norm": 0.2687140450924824, + "learning_rate": 4.61079950212649e-06, + "loss": 0.0683, + "step": 6398 + }, + { + "epoch": 0.54, + "grad_norm": 0.30089743828084026, + "learning_rate": 4.609439052469086e-06, + "loss": 0.0678, + "step": 6399 + }, + { + "epoch": 0.54, + "grad_norm": 0.25960469151427323, + "learning_rate": 4.6080786319029355e-06, + "loss": 0.0763, + "step": 6400 + }, + { + "epoch": 0.54, + "grad_norm": 0.5100085063236124, + "learning_rate": 4.606718240529371e-06, + "loss": 0.1151, + "step": 6401 + }, + { + "epoch": 0.54, + "grad_norm": 0.26377890049904607, + "learning_rate": 4.6053578784497235e-06, + "loss": 0.0846, + "step": 6402 + }, + { + "epoch": 0.54, + "grad_norm": 0.2964093010522445, + "learning_rate": 4.603997545765321e-06, + "loss": 0.0677, + "step": 6403 + }, + { + "epoch": 0.54, + "grad_norm": 0.30774535201730274, + "learning_rate": 4.602637242577487e-06, + "loss": 0.0716, + "step": 6404 + }, + { + "epoch": 0.54, + "grad_norm": 0.3822215528398892, + "learning_rate": 4.601276968987546e-06, + "loss": 0.109, + "step": 6405 + }, + { + "epoch": 0.54, + "grad_norm": 0.9348034496512732, + "learning_rate": 4.5999167250968195e-06, + "loss": 0.2038, + "step": 6406 + }, + { + "epoch": 0.54, + "grad_norm": 0.2978573866782475, + "learning_rate": 4.598556511006627e-06, + "loss": 0.0804, + "step": 6407 + }, + { + "epoch": 0.54, + "grad_norm": 0.23790420202030357, + "learning_rate": 4.597196326818282e-06, + "loss": 0.0644, + "step": 6408 + }, + { + "epoch": 0.54, + "grad_norm": 0.17247229028449812, + "learning_rate": 4.595836172633103e-06, + "loss": 0.0424, + "step": 6409 + }, + { + "epoch": 0.54, + "grad_norm": 0.5509906436531441, + "learning_rate": 4.5944760485523995e-06, + "loss": 0.1528, + "step": 6410 + }, + { + "epoch": 0.54, + "grad_norm": 0.2645332496176322, + "learning_rate": 4.593115954677481e-06, + "loss": 0.0741, + "step": 6411 + }, + { + "epoch": 0.54, + "grad_norm": 0.47295324139615186, + "learning_rate": 4.591755891109659e-06, + "loss": 0.1155, + "step": 6412 + }, + { + "epoch": 0.54, + "grad_norm": 0.378128363343617, + "learning_rate": 4.590395857950235e-06, + "loss": 0.1001, + "step": 6413 + }, + { + "epoch": 0.54, + "grad_norm": 0.20675316225415696, + "learning_rate": 4.589035855300512e-06, + "loss": 0.048, + "step": 6414 + }, + { + "epoch": 0.54, + "grad_norm": 0.22011757692618353, + "learning_rate": 4.587675883261795e-06, + "loss": 0.0549, + "step": 6415 + }, + { + "epoch": 0.54, + "grad_norm": 0.3848278714917212, + "learning_rate": 4.58631594193538e-06, + "loss": 0.0849, + "step": 6416 + }, + { + "epoch": 0.54, + "grad_norm": 0.31712092225589816, + "learning_rate": 4.584956031422564e-06, + "loss": 0.1066, + "step": 6417 + }, + { + "epoch": 0.54, + "grad_norm": 0.4765075750990313, + "learning_rate": 4.583596151824637e-06, + "loss": 0.1205, + "step": 6418 + }, + { + "epoch": 0.54, + "grad_norm": 0.4094440861677164, + "learning_rate": 4.5822363032428986e-06, + "loss": 0.0728, + "step": 6419 + }, + { + "epoch": 0.54, + "grad_norm": 0.23138690444833263, + "learning_rate": 4.580876485778632e-06, + "loss": 0.0739, + "step": 6420 + }, + { + "epoch": 0.54, + "grad_norm": 0.25982815308343693, + "learning_rate": 4.5795166995331256e-06, + "loss": 0.059, + "step": 6421 + }, + { + "epoch": 0.54, + "grad_norm": 0.5117836589684587, + "learning_rate": 4.578156944607667e-06, + "loss": 0.0877, + "step": 6422 + }, + { + "epoch": 0.54, + "grad_norm": 0.3046804946771742, + "learning_rate": 4.576797221103535e-06, + "loss": 0.0542, + "step": 6423 + }, + { + "epoch": 0.54, + "grad_norm": 0.2647933826997276, + "learning_rate": 4.575437529122013e-06, + "loss": 0.0893, + "step": 6424 + }, + { + "epoch": 0.54, + "grad_norm": 0.36714819555118783, + "learning_rate": 4.574077868764374e-06, + "loss": 0.1014, + "step": 6425 + }, + { + "epoch": 0.54, + "grad_norm": 0.2769774862157114, + "learning_rate": 4.5727182401318995e-06, + "loss": 0.0747, + "step": 6426 + }, + { + "epoch": 0.54, + "grad_norm": 0.24724298503471734, + "learning_rate": 4.5713586433258595e-06, + "loss": 0.05, + "step": 6427 + }, + { + "epoch": 0.54, + "grad_norm": 0.3376021179700868, + "learning_rate": 4.569999078447522e-06, + "loss": 0.1112, + "step": 6428 + }, + { + "epoch": 0.54, + "grad_norm": 0.2700747560632514, + "learning_rate": 4.568639545598161e-06, + "loss": 0.0875, + "step": 6429 + }, + { + "epoch": 0.54, + "grad_norm": 0.3183043502078776, + "learning_rate": 4.567280044879038e-06, + "loss": 0.0838, + "step": 6430 + }, + { + "epoch": 0.54, + "grad_norm": 0.34764728283430596, + "learning_rate": 4.565920576391418e-06, + "loss": 0.0768, + "step": 6431 + }, + { + "epoch": 0.54, + "grad_norm": 0.46059120026869094, + "learning_rate": 4.564561140236562e-06, + "loss": 0.1072, + "step": 6432 + }, + { + "epoch": 0.54, + "grad_norm": 0.39292585206242203, + "learning_rate": 4.5632017365157306e-06, + "loss": 0.0938, + "step": 6433 + }, + { + "epoch": 0.54, + "grad_norm": 0.3342316422773509, + "learning_rate": 4.561842365330177e-06, + "loss": 0.1136, + "step": 6434 + }, + { + "epoch": 0.54, + "grad_norm": 0.4116874585579343, + "learning_rate": 4.5604830267811565e-06, + "loss": 0.0948, + "step": 6435 + }, + { + "epoch": 0.54, + "grad_norm": 0.3689668614487559, + "learning_rate": 4.559123720969921e-06, + "loss": 0.1092, + "step": 6436 + }, + { + "epoch": 0.54, + "grad_norm": 0.5256124062907164, + "learning_rate": 4.557764447997719e-06, + "loss": 0.1104, + "step": 6437 + }, + { + "epoch": 0.54, + "grad_norm": 0.18786379736368916, + "learning_rate": 4.556405207965796e-06, + "loss": 0.0479, + "step": 6438 + }, + { + "epoch": 0.54, + "grad_norm": 0.31623415460191534, + "learning_rate": 4.5550460009753995e-06, + "loss": 0.0906, + "step": 6439 + }, + { + "epoch": 0.54, + "grad_norm": 0.29487594013215457, + "learning_rate": 4.553686827127767e-06, + "loss": 0.0541, + "step": 6440 + }, + { + "epoch": 0.54, + "grad_norm": 0.2784189143466072, + "learning_rate": 4.552327686524141e-06, + "loss": 0.1023, + "step": 6441 + }, + { + "epoch": 0.54, + "grad_norm": 0.29301999214702185, + "learning_rate": 4.5509685792657555e-06, + "loss": 0.0945, + "step": 6442 + }, + { + "epoch": 0.54, + "grad_norm": 0.21769059741990507, + "learning_rate": 4.549609505453847e-06, + "loss": 0.0589, + "step": 6443 + }, + { + "epoch": 0.54, + "grad_norm": 0.41797524544995035, + "learning_rate": 4.548250465189647e-06, + "loss": 0.09, + "step": 6444 + }, + { + "epoch": 0.54, + "grad_norm": 0.495765215316743, + "learning_rate": 4.54689145857438e-06, + "loss": 0.0771, + "step": 6445 + }, + { + "epoch": 0.54, + "grad_norm": 0.3022934581411002, + "learning_rate": 4.54553248570928e-06, + "loss": 0.1161, + "step": 6446 + }, + { + "epoch": 0.54, + "grad_norm": 0.290344045502107, + "learning_rate": 4.544173546695568e-06, + "loss": 0.0806, + "step": 6447 + }, + { + "epoch": 0.54, + "grad_norm": 0.3535851970282823, + "learning_rate": 4.542814641634464e-06, + "loss": 0.1135, + "step": 6448 + }, + { + "epoch": 0.54, + "grad_norm": 0.4108216841272214, + "learning_rate": 4.54145577062719e-06, + "loss": 0.1175, + "step": 6449 + }, + { + "epoch": 0.54, + "grad_norm": 0.2927670510392883, + "learning_rate": 4.540096933774962e-06, + "loss": 0.0761, + "step": 6450 + }, + { + "epoch": 0.54, + "grad_norm": 0.5513451454564373, + "learning_rate": 4.538738131178994e-06, + "loss": 0.0568, + "step": 6451 + }, + { + "epoch": 0.54, + "grad_norm": 0.23572640752750904, + "learning_rate": 4.537379362940494e-06, + "loss": 0.0637, + "step": 6452 + }, + { + "epoch": 0.54, + "grad_norm": 0.3779108115056334, + "learning_rate": 4.536020629160676e-06, + "loss": 0.1138, + "step": 6453 + }, + { + "epoch": 0.54, + "grad_norm": 0.2825554717489204, + "learning_rate": 4.534661929940745e-06, + "loss": 0.0779, + "step": 6454 + }, + { + "epoch": 0.54, + "grad_norm": 0.4506875740666896, + "learning_rate": 4.533303265381902e-06, + "loss": 0.0884, + "step": 6455 + }, + { + "epoch": 0.54, + "grad_norm": 0.8333110093918049, + "learning_rate": 4.531944635585352e-06, + "loss": 0.1072, + "step": 6456 + }, + { + "epoch": 0.54, + "grad_norm": 0.34097750357601764, + "learning_rate": 4.530586040652293e-06, + "loss": 0.1069, + "step": 6457 + }, + { + "epoch": 0.54, + "grad_norm": 0.32057741693119035, + "learning_rate": 4.52922748068392e-06, + "loss": 0.0808, + "step": 6458 + }, + { + "epoch": 0.54, + "grad_norm": 0.44342064654459556, + "learning_rate": 4.527868955781424e-06, + "loss": 0.079, + "step": 6459 + }, + { + "epoch": 0.54, + "grad_norm": 0.349842968722777, + "learning_rate": 4.526510466046002e-06, + "loss": 0.0758, + "step": 6460 + }, + { + "epoch": 0.54, + "grad_norm": 0.2797579875720313, + "learning_rate": 4.525152011578838e-06, + "loss": 0.0749, + "step": 6461 + }, + { + "epoch": 0.54, + "grad_norm": 0.3928411408207082, + "learning_rate": 4.523793592481116e-06, + "loss": 0.1213, + "step": 6462 + }, + { + "epoch": 0.54, + "grad_norm": 0.2895674158472946, + "learning_rate": 4.5224352088540234e-06, + "loss": 0.076, + "step": 6463 + }, + { + "epoch": 0.54, + "grad_norm": 0.20909806831362063, + "learning_rate": 4.521076860798739e-06, + "loss": 0.0655, + "step": 6464 + }, + { + "epoch": 0.54, + "grad_norm": 0.30705476267721243, + "learning_rate": 4.519718548416439e-06, + "loss": 0.0744, + "step": 6465 + }, + { + "epoch": 0.54, + "grad_norm": 0.166139207763279, + "learning_rate": 4.518360271808298e-06, + "loss": 0.0259, + "step": 6466 + }, + { + "epoch": 0.54, + "grad_norm": 0.32803136209456396, + "learning_rate": 4.517002031075492e-06, + "loss": 0.0846, + "step": 6467 + }, + { + "epoch": 0.54, + "grad_norm": 0.26106294813053654, + "learning_rate": 4.515643826319188e-06, + "loss": 0.0516, + "step": 6468 + }, + { + "epoch": 0.55, + "grad_norm": 0.48806364193452056, + "learning_rate": 4.514285657640553e-06, + "loss": 0.1128, + "step": 6469 + }, + { + "epoch": 0.55, + "grad_norm": 0.2715210618264937, + "learning_rate": 4.512927525140751e-06, + "loss": 0.0616, + "step": 6470 + }, + { + "epoch": 0.55, + "grad_norm": 0.31206443572645565, + "learning_rate": 4.511569428920946e-06, + "loss": 0.0727, + "step": 6471 + }, + { + "epoch": 0.55, + "grad_norm": 0.3000187433964627, + "learning_rate": 4.510211369082293e-06, + "loss": 0.0805, + "step": 6472 + }, + { + "epoch": 0.55, + "grad_norm": 0.2565792721089214, + "learning_rate": 4.508853345725953e-06, + "loss": 0.0687, + "step": 6473 + }, + { + "epoch": 0.55, + "grad_norm": 0.3332108846145187, + "learning_rate": 4.507495358953075e-06, + "loss": 0.1211, + "step": 6474 + }, + { + "epoch": 0.55, + "grad_norm": 0.38359092998067684, + "learning_rate": 4.506137408864813e-06, + "loss": 0.0807, + "step": 6475 + }, + { + "epoch": 0.55, + "grad_norm": 0.2772647668516554, + "learning_rate": 4.5047794955623125e-06, + "loss": 0.1117, + "step": 6476 + }, + { + "epoch": 0.55, + "grad_norm": 0.3322565166109134, + "learning_rate": 4.503421619146721e-06, + "loss": 0.0843, + "step": 6477 + }, + { + "epoch": 0.55, + "grad_norm": 0.6104521462711345, + "learning_rate": 4.502063779719181e-06, + "loss": 0.1052, + "step": 6478 + }, + { + "epoch": 0.55, + "grad_norm": 0.22461520342183397, + "learning_rate": 4.5007059773808285e-06, + "loss": 0.0679, + "step": 6479 + }, + { + "epoch": 0.55, + "grad_norm": 0.2767457628287545, + "learning_rate": 4.4993482122328056e-06, + "loss": 0.0852, + "step": 6480 + }, + { + "epoch": 0.55, + "grad_norm": 0.39831476439807795, + "learning_rate": 4.497990484376244e-06, + "loss": 0.1126, + "step": 6481 + }, + { + "epoch": 0.55, + "grad_norm": 0.24739671888086207, + "learning_rate": 4.496632793912276e-06, + "loss": 0.0486, + "step": 6482 + }, + { + "epoch": 0.55, + "grad_norm": 0.35337212918139704, + "learning_rate": 4.495275140942028e-06, + "loss": 0.1081, + "step": 6483 + }, + { + "epoch": 0.55, + "grad_norm": 0.25181653419940325, + "learning_rate": 4.493917525566629e-06, + "loss": 0.0738, + "step": 6484 + }, + { + "epoch": 0.55, + "grad_norm": 0.21264521167202877, + "learning_rate": 4.4925599478872014e-06, + "loss": 0.061, + "step": 6485 + }, + { + "epoch": 0.55, + "grad_norm": 0.4840611475951059, + "learning_rate": 4.491202408004863e-06, + "loss": 0.1189, + "step": 6486 + }, + { + "epoch": 0.55, + "grad_norm": 0.27833809987555064, + "learning_rate": 4.489844906020735e-06, + "loss": 0.0423, + "step": 6487 + }, + { + "epoch": 0.55, + "grad_norm": 0.4184173655011638, + "learning_rate": 4.4884874420359306e-06, + "loss": 0.1164, + "step": 6488 + }, + { + "epoch": 0.55, + "grad_norm": 0.3060376038248094, + "learning_rate": 4.487130016151559e-06, + "loss": 0.0954, + "step": 6489 + }, + { + "epoch": 0.55, + "grad_norm": 0.28903430923172074, + "learning_rate": 4.485772628468733e-06, + "loss": 0.0721, + "step": 6490 + }, + { + "epoch": 0.55, + "grad_norm": 0.36461604381872326, + "learning_rate": 4.484415279088558e-06, + "loss": 0.0959, + "step": 6491 + }, + { + "epoch": 0.55, + "grad_norm": 0.40045127240355227, + "learning_rate": 4.4830579681121365e-06, + "loss": 0.0793, + "step": 6492 + }, + { + "epoch": 0.55, + "grad_norm": 0.2640842342708515, + "learning_rate": 4.4817006956405664e-06, + "loss": 0.07, + "step": 6493 + }, + { + "epoch": 0.55, + "grad_norm": 0.2453090620777981, + "learning_rate": 4.4803434617749506e-06, + "loss": 0.0479, + "step": 6494 + }, + { + "epoch": 0.55, + "grad_norm": 0.2796746483975629, + "learning_rate": 4.4789862666163805e-06, + "loss": 0.0865, + "step": 6495 + }, + { + "epoch": 0.55, + "grad_norm": 0.21948158103857257, + "learning_rate": 4.477629110265946e-06, + "loss": 0.0596, + "step": 6496 + }, + { + "epoch": 0.55, + "grad_norm": 0.3709021388517017, + "learning_rate": 4.4762719928247414e-06, + "loss": 0.1073, + "step": 6497 + }, + { + "epoch": 0.55, + "grad_norm": 0.2714682208895978, + "learning_rate": 4.474914914393849e-06, + "loss": 0.0522, + "step": 6498 + }, + { + "epoch": 0.55, + "grad_norm": 0.2225930076623636, + "learning_rate": 4.473557875074353e-06, + "loss": 0.0635, + "step": 6499 + }, + { + "epoch": 0.55, + "grad_norm": 0.36936040799266867, + "learning_rate": 4.472200874967331e-06, + "loss": 0.0879, + "step": 6500 + }, + { + "epoch": 0.55, + "grad_norm": 0.36763221877689395, + "learning_rate": 4.470843914173864e-06, + "loss": 0.0594, + "step": 6501 + }, + { + "epoch": 0.55, + "grad_norm": 0.292314120452045, + "learning_rate": 4.4694869927950255e-06, + "loss": 0.0742, + "step": 6502 + }, + { + "epoch": 0.55, + "grad_norm": 0.3214006042271544, + "learning_rate": 4.468130110931885e-06, + "loss": 0.0809, + "step": 6503 + }, + { + "epoch": 0.55, + "grad_norm": 0.4908728792498285, + "learning_rate": 4.466773268685512e-06, + "loss": 0.0838, + "step": 6504 + }, + { + "epoch": 0.55, + "grad_norm": 1.0177633212010726, + "learning_rate": 4.465416466156974e-06, + "loss": 0.137, + "step": 6505 + }, + { + "epoch": 0.55, + "grad_norm": 0.33014505165767494, + "learning_rate": 4.46405970344733e-06, + "loss": 0.1096, + "step": 6506 + }, + { + "epoch": 0.55, + "grad_norm": 0.3130259957586664, + "learning_rate": 4.462702980657643e-06, + "loss": 0.1128, + "step": 6507 + }, + { + "epoch": 0.55, + "grad_norm": 0.190959551758352, + "learning_rate": 4.461346297888967e-06, + "loss": 0.0389, + "step": 6508 + }, + { + "epoch": 0.55, + "grad_norm": 0.29811166127780836, + "learning_rate": 4.459989655242356e-06, + "loss": 0.0761, + "step": 6509 + }, + { + "epoch": 0.55, + "grad_norm": 0.20021881324505128, + "learning_rate": 4.458633052818862e-06, + "loss": 0.0299, + "step": 6510 + }, + { + "epoch": 0.55, + "grad_norm": 0.16719373848909286, + "learning_rate": 4.457276490719533e-06, + "loss": 0.0476, + "step": 6511 + }, + { + "epoch": 0.55, + "grad_norm": 0.3118397805654914, + "learning_rate": 4.455919969045412e-06, + "loss": 0.0463, + "step": 6512 + }, + { + "epoch": 0.55, + "grad_norm": 0.2966646351038485, + "learning_rate": 4.454563487897539e-06, + "loss": 0.0661, + "step": 6513 + }, + { + "epoch": 0.55, + "grad_norm": 0.4898708810700045, + "learning_rate": 4.453207047376957e-06, + "loss": 0.0874, + "step": 6514 + }, + { + "epoch": 0.55, + "grad_norm": 0.2911432838287848, + "learning_rate": 4.451850647584699e-06, + "loss": 0.0905, + "step": 6515 + }, + { + "epoch": 0.55, + "grad_norm": 0.2525551286673735, + "learning_rate": 4.450494288621799e-06, + "loss": 0.0625, + "step": 6516 + }, + { + "epoch": 0.55, + "grad_norm": 0.41991257024161677, + "learning_rate": 4.449137970589282e-06, + "loss": 0.0839, + "step": 6517 + }, + { + "epoch": 0.55, + "grad_norm": 0.28389265194281565, + "learning_rate": 4.447781693588182e-06, + "loss": 0.0686, + "step": 6518 + }, + { + "epoch": 0.55, + "grad_norm": 0.20028449902639217, + "learning_rate": 4.446425457719517e-06, + "loss": 0.0549, + "step": 6519 + }, + { + "epoch": 0.55, + "grad_norm": 0.29375230328365315, + "learning_rate": 4.445069263084307e-06, + "loss": 0.0689, + "step": 6520 + }, + { + "epoch": 0.55, + "grad_norm": 0.3407383594957745, + "learning_rate": 4.443713109783573e-06, + "loss": 0.1036, + "step": 6521 + }, + { + "epoch": 0.55, + "grad_norm": 0.3821946864149125, + "learning_rate": 4.442356997918327e-06, + "loss": 0.1086, + "step": 6522 + }, + { + "epoch": 0.55, + "grad_norm": 0.4386480130906768, + "learning_rate": 4.441000927589578e-06, + "loss": 0.1016, + "step": 6523 + }, + { + "epoch": 0.55, + "grad_norm": 0.2991156377120141, + "learning_rate": 4.439644898898339e-06, + "loss": 0.0735, + "step": 6524 + }, + { + "epoch": 0.55, + "grad_norm": 0.20814358976718328, + "learning_rate": 4.438288911945613e-06, + "loss": 0.0292, + "step": 6525 + }, + { + "epoch": 0.55, + "grad_norm": 0.4082484089460323, + "learning_rate": 4.436932966832399e-06, + "loss": 0.0928, + "step": 6526 + }, + { + "epoch": 0.55, + "grad_norm": 0.2842921992312938, + "learning_rate": 4.435577063659697e-06, + "loss": 0.0743, + "step": 6527 + }, + { + "epoch": 0.55, + "grad_norm": 0.5379947620468265, + "learning_rate": 4.434221202528506e-06, + "loss": 0.0631, + "step": 6528 + }, + { + "epoch": 0.55, + "grad_norm": 0.46187590798492384, + "learning_rate": 4.432865383539814e-06, + "loss": 0.1245, + "step": 6529 + }, + { + "epoch": 0.55, + "grad_norm": 0.29544556373717507, + "learning_rate": 4.43150960679461e-06, + "loss": 0.0725, + "step": 6530 + }, + { + "epoch": 0.55, + "grad_norm": 0.4520433736703095, + "learning_rate": 4.4301538723938855e-06, + "loss": 0.075, + "step": 6531 + }, + { + "epoch": 0.55, + "grad_norm": 0.5203593556740789, + "learning_rate": 4.42879818043862e-06, + "loss": 0.0905, + "step": 6532 + }, + { + "epoch": 0.55, + "grad_norm": 0.5199286723332162, + "learning_rate": 4.427442531029794e-06, + "loss": 0.1251, + "step": 6533 + }, + { + "epoch": 0.55, + "grad_norm": 0.25428779738881147, + "learning_rate": 4.42608692426838e-06, + "loss": 0.032, + "step": 6534 + }, + { + "epoch": 0.55, + "grad_norm": 0.2693632599447435, + "learning_rate": 4.424731360255359e-06, + "loss": 0.074, + "step": 6535 + }, + { + "epoch": 0.55, + "grad_norm": 0.5095644037596729, + "learning_rate": 4.423375839091696e-06, + "loss": 0.0908, + "step": 6536 + }, + { + "epoch": 0.55, + "grad_norm": 0.3008626265323932, + "learning_rate": 4.42202036087836e-06, + "loss": 0.0973, + "step": 6537 + }, + { + "epoch": 0.55, + "grad_norm": 0.382540018211114, + "learning_rate": 4.420664925716313e-06, + "loss": 0.058, + "step": 6538 + }, + { + "epoch": 0.55, + "grad_norm": 0.269789237696033, + "learning_rate": 4.419309533706519e-06, + "loss": 0.0973, + "step": 6539 + }, + { + "epoch": 0.55, + "grad_norm": 0.3667640734370712, + "learning_rate": 4.417954184949933e-06, + "loss": 0.0931, + "step": 6540 + }, + { + "epoch": 0.55, + "grad_norm": 0.4574104866691601, + "learning_rate": 4.41659887954751e-06, + "loss": 0.0904, + "step": 6541 + }, + { + "epoch": 0.55, + "grad_norm": 0.40677574553479706, + "learning_rate": 4.4152436176002014e-06, + "loss": 0.1117, + "step": 6542 + }, + { + "epoch": 0.55, + "grad_norm": 0.4571608181984217, + "learning_rate": 4.413888399208954e-06, + "loss": 0.1309, + "step": 6543 + }, + { + "epoch": 0.55, + "grad_norm": 0.32249475260900573, + "learning_rate": 4.412533224474714e-06, + "loss": 0.0716, + "step": 6544 + }, + { + "epoch": 0.55, + "grad_norm": 0.38824721471470003, + "learning_rate": 4.411178093498421e-06, + "loss": 0.1047, + "step": 6545 + }, + { + "epoch": 0.55, + "grad_norm": 0.5325411771488757, + "learning_rate": 4.4098230063810136e-06, + "loss": 0.1098, + "step": 6546 + }, + { + "epoch": 0.55, + "grad_norm": 0.4318340872256885, + "learning_rate": 4.4084679632234254e-06, + "loss": 0.0751, + "step": 6547 + }, + { + "epoch": 0.55, + "grad_norm": 0.3202167235232858, + "learning_rate": 4.407112964126591e-06, + "loss": 0.0816, + "step": 6548 + }, + { + "epoch": 0.55, + "grad_norm": 0.20027733975146453, + "learning_rate": 4.405758009191438e-06, + "loss": 0.0332, + "step": 6549 + }, + { + "epoch": 0.55, + "grad_norm": 0.42160213388819356, + "learning_rate": 4.404403098518889e-06, + "loss": 0.079, + "step": 6550 + }, + { + "epoch": 0.55, + "grad_norm": 0.16381146577137853, + "learning_rate": 4.403048232209865e-06, + "loss": 0.0299, + "step": 6551 + }, + { + "epoch": 0.55, + "grad_norm": 0.27391853676011063, + "learning_rate": 4.401693410365288e-06, + "loss": 0.0571, + "step": 6552 + }, + { + "epoch": 0.55, + "grad_norm": 0.29360694785679214, + "learning_rate": 4.400338633086071e-06, + "loss": 0.0875, + "step": 6553 + }, + { + "epoch": 0.55, + "grad_norm": 0.30235228605413994, + "learning_rate": 4.398983900473124e-06, + "loss": 0.0709, + "step": 6554 + }, + { + "epoch": 0.55, + "grad_norm": 0.34727645535127044, + "learning_rate": 4.39762921262736e-06, + "loss": 0.081, + "step": 6555 + }, + { + "epoch": 0.55, + "grad_norm": 0.2536306594792515, + "learning_rate": 4.396274569649681e-06, + "loss": 0.0913, + "step": 6556 + }, + { + "epoch": 0.55, + "grad_norm": 0.2844560563309367, + "learning_rate": 4.394919971640987e-06, + "loss": 0.0682, + "step": 6557 + }, + { + "epoch": 0.55, + "grad_norm": 0.30456877248181224, + "learning_rate": 4.3935654187021804e-06, + "loss": 0.0829, + "step": 6558 + }, + { + "epoch": 0.55, + "grad_norm": 0.4832987049403699, + "learning_rate": 4.392210910934154e-06, + "loss": 0.1396, + "step": 6559 + }, + { + "epoch": 0.55, + "grad_norm": 0.28805171728095846, + "learning_rate": 4.390856448437799e-06, + "loss": 0.0752, + "step": 6560 + }, + { + "epoch": 0.55, + "grad_norm": 0.29295491920806943, + "learning_rate": 4.389502031314003e-06, + "loss": 0.0724, + "step": 6561 + }, + { + "epoch": 0.55, + "grad_norm": 0.2030398017972443, + "learning_rate": 4.388147659663654e-06, + "loss": 0.0501, + "step": 6562 + }, + { + "epoch": 0.55, + "grad_norm": 0.29820618135694626, + "learning_rate": 4.386793333587632e-06, + "loss": 0.0847, + "step": 6563 + }, + { + "epoch": 0.55, + "grad_norm": 0.48852553690562567, + "learning_rate": 4.385439053186812e-06, + "loss": 0.1028, + "step": 6564 + }, + { + "epoch": 0.55, + "grad_norm": 0.4002705697125619, + "learning_rate": 4.384084818562074e-06, + "loss": 0.0647, + "step": 6565 + }, + { + "epoch": 0.55, + "grad_norm": 0.20411258935067553, + "learning_rate": 4.382730629814286e-06, + "loss": 0.057, + "step": 6566 + }, + { + "epoch": 0.55, + "grad_norm": 0.3826902086502373, + "learning_rate": 4.381376487044317e-06, + "loss": 0.0924, + "step": 6567 + }, + { + "epoch": 0.55, + "grad_norm": 0.3576968768263061, + "learning_rate": 4.380022390353029e-06, + "loss": 0.1067, + "step": 6568 + }, + { + "epoch": 0.55, + "grad_norm": 0.20069061578793765, + "learning_rate": 4.378668339841286e-06, + "loss": 0.056, + "step": 6569 + }, + { + "epoch": 0.55, + "grad_norm": 0.2513619123941133, + "learning_rate": 4.377314335609946e-06, + "loss": 0.0814, + "step": 6570 + }, + { + "epoch": 0.55, + "grad_norm": 0.24642943554301217, + "learning_rate": 4.37596037775986e-06, + "loss": 0.0406, + "step": 6571 + }, + { + "epoch": 0.55, + "grad_norm": 0.3003188900237948, + "learning_rate": 4.374606466391881e-06, + "loss": 0.0767, + "step": 6572 + }, + { + "epoch": 0.55, + "grad_norm": 0.2965540754892544, + "learning_rate": 4.373252601606856e-06, + "loss": 0.0563, + "step": 6573 + }, + { + "epoch": 0.55, + "grad_norm": 0.2807779406758953, + "learning_rate": 4.371898783505628e-06, + "loss": 0.0686, + "step": 6574 + }, + { + "epoch": 0.55, + "grad_norm": 0.38687186378380395, + "learning_rate": 4.3705450121890376e-06, + "loss": 0.0715, + "step": 6575 + }, + { + "epoch": 0.55, + "grad_norm": 0.5157531460428615, + "learning_rate": 4.3691912877579216e-06, + "loss": 0.0981, + "step": 6576 + }, + { + "epoch": 0.55, + "grad_norm": 0.3182718582461447, + "learning_rate": 4.367837610313113e-06, + "loss": 0.0767, + "step": 6577 + }, + { + "epoch": 0.55, + "grad_norm": 0.3674405258515762, + "learning_rate": 4.366483979955441e-06, + "loss": 0.0834, + "step": 6578 + }, + { + "epoch": 0.55, + "grad_norm": 0.7990043171834716, + "learning_rate": 4.3651303967857345e-06, + "loss": 0.0654, + "step": 6579 + }, + { + "epoch": 0.55, + "grad_norm": 0.28333553062304423, + "learning_rate": 4.363776860904813e-06, + "loss": 0.0756, + "step": 6580 + }, + { + "epoch": 0.55, + "grad_norm": 0.33339389876216513, + "learning_rate": 4.3624233724134965e-06, + "loss": 0.0954, + "step": 6581 + }, + { + "epoch": 0.55, + "grad_norm": 0.32800952998066174, + "learning_rate": 4.361069931412604e-06, + "loss": 0.0733, + "step": 6582 + }, + { + "epoch": 0.55, + "grad_norm": 0.39939839377022013, + "learning_rate": 4.359716538002944e-06, + "loss": 0.1242, + "step": 6583 + }, + { + "epoch": 0.55, + "grad_norm": 0.20681982256677475, + "learning_rate": 4.3583631922853255e-06, + "loss": 0.065, + "step": 6584 + }, + { + "epoch": 0.55, + "grad_norm": 0.5095063250278454, + "learning_rate": 4.357009894360554e-06, + "loss": 0.0709, + "step": 6585 + }, + { + "epoch": 0.55, + "grad_norm": 0.34737637314789915, + "learning_rate": 4.355656644329431e-06, + "loss": 0.0879, + "step": 6586 + }, + { + "epoch": 0.56, + "grad_norm": 0.26083315376932353, + "learning_rate": 4.354303442292757e-06, + "loss": 0.0743, + "step": 6587 + }, + { + "epoch": 0.56, + "grad_norm": 0.4209411337948804, + "learning_rate": 4.352950288351321e-06, + "loss": 0.1108, + "step": 6588 + }, + { + "epoch": 0.56, + "grad_norm": 0.40871683232666245, + "learning_rate": 4.351597182605918e-06, + "loss": 0.1181, + "step": 6589 + }, + { + "epoch": 0.56, + "grad_norm": 0.31110171970121203, + "learning_rate": 4.350244125157336e-06, + "loss": 0.0847, + "step": 6590 + }, + { + "epoch": 0.56, + "grad_norm": 0.30093018702765834, + "learning_rate": 4.348891116106353e-06, + "loss": 0.0795, + "step": 6591 + }, + { + "epoch": 0.56, + "grad_norm": 0.8227554288219854, + "learning_rate": 4.347538155553755e-06, + "loss": 0.1214, + "step": 6592 + }, + { + "epoch": 0.56, + "grad_norm": 0.18912049663213973, + "learning_rate": 4.346185243600317e-06, + "loss": 0.0517, + "step": 6593 + }, + { + "epoch": 0.56, + "grad_norm": 0.42360315161198403, + "learning_rate": 4.3448323803468105e-06, + "loss": 0.0891, + "step": 6594 + }, + { + "epoch": 0.56, + "grad_norm": 0.30794181128478937, + "learning_rate": 4.343479565894002e-06, + "loss": 0.0791, + "step": 6595 + }, + { + "epoch": 0.56, + "grad_norm": 0.3229937361479185, + "learning_rate": 4.342126800342662e-06, + "loss": 0.0748, + "step": 6596 + }, + { + "epoch": 0.56, + "grad_norm": 0.3481892958922051, + "learning_rate": 4.340774083793551e-06, + "loss": 0.0655, + "step": 6597 + }, + { + "epoch": 0.56, + "grad_norm": 0.32792680359665793, + "learning_rate": 4.339421416347425e-06, + "loss": 0.0857, + "step": 6598 + }, + { + "epoch": 0.56, + "grad_norm": 0.34960538612059905, + "learning_rate": 4.338068798105041e-06, + "loss": 0.0429, + "step": 6599 + }, + { + "epoch": 0.56, + "grad_norm": 0.38350293307697125, + "learning_rate": 4.336716229167149e-06, + "loss": 0.0639, + "step": 6600 + }, + { + "epoch": 0.56, + "grad_norm": 0.370438110846986, + "learning_rate": 4.335363709634497e-06, + "loss": 0.1146, + "step": 6601 + }, + { + "epoch": 0.56, + "grad_norm": 0.21355527642936822, + "learning_rate": 4.334011239607825e-06, + "loss": 0.037, + "step": 6602 + }, + { + "epoch": 0.56, + "grad_norm": 0.41047064644897835, + "learning_rate": 4.3326588191878785e-06, + "loss": 0.0974, + "step": 6603 + }, + { + "epoch": 0.56, + "grad_norm": 0.28104315034643984, + "learning_rate": 4.33130644847539e-06, + "loss": 0.0943, + "step": 6604 + }, + { + "epoch": 0.56, + "grad_norm": 0.4538779493536291, + "learning_rate": 4.329954127571092e-06, + "loss": 0.1005, + "step": 6605 + }, + { + "epoch": 0.56, + "grad_norm": 0.29273096865011583, + "learning_rate": 4.3286018565757125e-06, + "loss": 0.0532, + "step": 6606 + }, + { + "epoch": 0.56, + "grad_norm": 0.3334467847777329, + "learning_rate": 4.327249635589981e-06, + "loss": 0.0683, + "step": 6607 + }, + { + "epoch": 0.56, + "grad_norm": 0.37214306541461123, + "learning_rate": 4.325897464714614e-06, + "loss": 0.0757, + "step": 6608 + }, + { + "epoch": 0.56, + "grad_norm": 0.3084425665272504, + "learning_rate": 4.324545344050331e-06, + "loss": 0.0714, + "step": 6609 + }, + { + "epoch": 0.56, + "grad_norm": 0.35980017020480787, + "learning_rate": 4.323193273697844e-06, + "loss": 0.1024, + "step": 6610 + }, + { + "epoch": 0.56, + "grad_norm": 0.26441614609329506, + "learning_rate": 4.321841253757867e-06, + "loss": 0.094, + "step": 6611 + }, + { + "epoch": 0.56, + "grad_norm": 0.2459246304684299, + "learning_rate": 4.320489284331103e-06, + "loss": 0.0747, + "step": 6612 + }, + { + "epoch": 0.56, + "grad_norm": 0.3570080426497132, + "learning_rate": 4.319137365518255e-06, + "loss": 0.1123, + "step": 6613 + }, + { + "epoch": 0.56, + "grad_norm": 0.3403456851868235, + "learning_rate": 4.317785497420025e-06, + "loss": 0.0632, + "step": 6614 + }, + { + "epoch": 0.56, + "grad_norm": 0.24852310359694293, + "learning_rate": 4.316433680137101e-06, + "loss": 0.0804, + "step": 6615 + }, + { + "epoch": 0.56, + "grad_norm": 0.2807662697401163, + "learning_rate": 4.315081913770182e-06, + "loss": 0.0828, + "step": 6616 + }, + { + "epoch": 0.56, + "grad_norm": 0.3255983647435315, + "learning_rate": 4.313730198419951e-06, + "loss": 0.0919, + "step": 6617 + }, + { + "epoch": 0.56, + "grad_norm": 0.24988162410054074, + "learning_rate": 4.312378534187093e-06, + "loss": 0.06, + "step": 6618 + }, + { + "epoch": 0.56, + "grad_norm": 0.32418165812042693, + "learning_rate": 4.311026921172285e-06, + "loss": 0.1037, + "step": 6619 + }, + { + "epoch": 0.56, + "grad_norm": 0.28493245433731745, + "learning_rate": 4.309675359476208e-06, + "loss": 0.0815, + "step": 6620 + }, + { + "epoch": 0.56, + "grad_norm": 0.27234886762354943, + "learning_rate": 4.3083238491995325e-06, + "loss": 0.0608, + "step": 6621 + }, + { + "epoch": 0.56, + "grad_norm": 0.38223154598747355, + "learning_rate": 4.3069723904429224e-06, + "loss": 0.0953, + "step": 6622 + }, + { + "epoch": 0.56, + "grad_norm": 0.3723687079003514, + "learning_rate": 4.30562098330705e-06, + "loss": 0.0884, + "step": 6623 + }, + { + "epoch": 0.56, + "grad_norm": 0.24222599185044982, + "learning_rate": 4.30426962789257e-06, + "loss": 0.0503, + "step": 6624 + }, + { + "epoch": 0.56, + "grad_norm": 0.4196252028510256, + "learning_rate": 4.302918324300142e-06, + "loss": 0.0644, + "step": 6625 + }, + { + "epoch": 0.56, + "grad_norm": 0.24809022254787272, + "learning_rate": 4.301567072630415e-06, + "loss": 0.0422, + "step": 6626 + }, + { + "epoch": 0.56, + "grad_norm": 0.253785640473781, + "learning_rate": 4.300215872984044e-06, + "loss": 0.0769, + "step": 6627 + }, + { + "epoch": 0.56, + "grad_norm": 0.4202377731230367, + "learning_rate": 4.2988647254616705e-06, + "loss": 0.1111, + "step": 6628 + }, + { + "epoch": 0.56, + "grad_norm": 0.3063537708552872, + "learning_rate": 4.297513630163935e-06, + "loss": 0.0906, + "step": 6629 + }, + { + "epoch": 0.56, + "grad_norm": 0.32862826804926837, + "learning_rate": 4.296162587191479e-06, + "loss": 0.0883, + "step": 6630 + }, + { + "epoch": 0.56, + "grad_norm": 0.3336391256637595, + "learning_rate": 4.2948115966449335e-06, + "loss": 0.0998, + "step": 6631 + }, + { + "epoch": 0.56, + "grad_norm": 0.32074450671741295, + "learning_rate": 4.293460658624926e-06, + "loss": 0.071, + "step": 6632 + }, + { + "epoch": 0.56, + "grad_norm": 0.5546690604661323, + "learning_rate": 4.292109773232087e-06, + "loss": 0.1163, + "step": 6633 + }, + { + "epoch": 0.56, + "grad_norm": 0.2813061406648457, + "learning_rate": 4.290758940567035e-06, + "loss": 0.0819, + "step": 6634 + }, + { + "epoch": 0.56, + "grad_norm": 0.15806140486415668, + "learning_rate": 4.28940816073039e-06, + "loss": 0.046, + "step": 6635 + }, + { + "epoch": 0.56, + "grad_norm": 0.45895587518711134, + "learning_rate": 4.288057433822762e-06, + "loss": 0.1075, + "step": 6636 + }, + { + "epoch": 0.56, + "grad_norm": 0.34311815796934825, + "learning_rate": 4.286706759944766e-06, + "loss": 0.0487, + "step": 6637 + }, + { + "epoch": 0.56, + "grad_norm": 0.37008190399311425, + "learning_rate": 4.285356139197005e-06, + "loss": 0.0765, + "step": 6638 + }, + { + "epoch": 0.56, + "grad_norm": 0.45012839078055383, + "learning_rate": 4.284005571680081e-06, + "loss": 0.1008, + "step": 6639 + }, + { + "epoch": 0.56, + "grad_norm": 0.31837896495886214, + "learning_rate": 4.282655057494593e-06, + "loss": 0.0956, + "step": 6640 + }, + { + "epoch": 0.56, + "grad_norm": 0.21381520167328397, + "learning_rate": 4.281304596741137e-06, + "loss": 0.0647, + "step": 6641 + }, + { + "epoch": 0.56, + "grad_norm": 0.46929036407119457, + "learning_rate": 4.279954189520301e-06, + "loss": 0.0969, + "step": 6642 + }, + { + "epoch": 0.56, + "grad_norm": 0.22438810150653105, + "learning_rate": 4.27860383593267e-06, + "loss": 0.0532, + "step": 6643 + }, + { + "epoch": 0.56, + "grad_norm": 0.7692156015720939, + "learning_rate": 4.277253536078829e-06, + "loss": 0.0907, + "step": 6644 + }, + { + "epoch": 0.56, + "grad_norm": 0.3026380882756604, + "learning_rate": 4.275903290059355e-06, + "loss": 0.0568, + "step": 6645 + }, + { + "epoch": 0.56, + "grad_norm": 0.36395642704955194, + "learning_rate": 4.274553097974821e-06, + "loss": 0.0628, + "step": 6646 + }, + { + "epoch": 0.56, + "grad_norm": 0.4038333685472117, + "learning_rate": 4.2732029599258e-06, + "loss": 0.1113, + "step": 6647 + }, + { + "epoch": 0.56, + "grad_norm": 0.2712562708269014, + "learning_rate": 4.271852876012855e-06, + "loss": 0.0695, + "step": 6648 + }, + { + "epoch": 0.56, + "grad_norm": 0.3752884877672261, + "learning_rate": 4.270502846336549e-06, + "loss": 0.084, + "step": 6649 + }, + { + "epoch": 0.56, + "grad_norm": 0.3016811852232453, + "learning_rate": 4.2691528709974415e-06, + "loss": 0.0944, + "step": 6650 + }, + { + "epoch": 0.56, + "grad_norm": 0.20237388424269012, + "learning_rate": 4.267802950096087e-06, + "loss": 0.0491, + "step": 6651 + }, + { + "epoch": 0.56, + "grad_norm": 0.39437098179936275, + "learning_rate": 4.266453083733033e-06, + "loss": 0.1141, + "step": 6652 + }, + { + "epoch": 0.56, + "grad_norm": 0.5884862383200715, + "learning_rate": 4.265103272008825e-06, + "loss": 0.0884, + "step": 6653 + }, + { + "epoch": 0.56, + "grad_norm": 0.1606884357867657, + "learning_rate": 4.263753515024009e-06, + "loss": 0.0494, + "step": 6654 + }, + { + "epoch": 0.56, + "grad_norm": 0.25377134952149355, + "learning_rate": 4.262403812879119e-06, + "loss": 0.0854, + "step": 6655 + }, + { + "epoch": 0.56, + "grad_norm": 0.4773211473069833, + "learning_rate": 4.261054165674688e-06, + "loss": 0.0915, + "step": 6656 + }, + { + "epoch": 0.56, + "grad_norm": 0.3566499101468165, + "learning_rate": 4.259704573511251e-06, + "loss": 0.0943, + "step": 6657 + }, + { + "epoch": 0.56, + "grad_norm": 0.21462589045795086, + "learning_rate": 4.258355036489328e-06, + "loss": 0.051, + "step": 6658 + }, + { + "epoch": 0.56, + "grad_norm": 0.25194025330417336, + "learning_rate": 4.257005554709443e-06, + "loss": 0.0585, + "step": 6659 + }, + { + "epoch": 0.56, + "grad_norm": 0.4047910446001853, + "learning_rate": 4.255656128272111e-06, + "loss": 0.1007, + "step": 6660 + }, + { + "epoch": 0.56, + "grad_norm": 0.5169029362885067, + "learning_rate": 4.254306757277848e-06, + "loss": 0.1011, + "step": 6661 + }, + { + "epoch": 0.56, + "grad_norm": 0.2710250186959179, + "learning_rate": 4.252957441827162e-06, + "loss": 0.073, + "step": 6662 + }, + { + "epoch": 0.56, + "grad_norm": 0.486120137621442, + "learning_rate": 4.251608182020555e-06, + "loss": 0.1205, + "step": 6663 + }, + { + "epoch": 0.56, + "grad_norm": 0.2434319241789466, + "learning_rate": 4.250258977958533e-06, + "loss": 0.0756, + "step": 6664 + }, + { + "epoch": 0.56, + "grad_norm": 0.3115339028155324, + "learning_rate": 4.24890982974159e-06, + "loss": 0.0972, + "step": 6665 + }, + { + "epoch": 0.56, + "grad_norm": 0.3410972245840506, + "learning_rate": 4.247560737470216e-06, + "loss": 0.0904, + "step": 6666 + }, + { + "epoch": 0.56, + "grad_norm": 0.433896903377954, + "learning_rate": 4.2462117012449036e-06, + "loss": 0.0734, + "step": 6667 + }, + { + "epoch": 0.56, + "grad_norm": 0.31950064877730494, + "learning_rate": 4.244862721166135e-06, + "loss": 0.0683, + "step": 6668 + }, + { + "epoch": 0.56, + "grad_norm": 0.47586824202799577, + "learning_rate": 4.24351379733439e-06, + "loss": 0.0722, + "step": 6669 + }, + { + "epoch": 0.56, + "grad_norm": 0.4467431146350214, + "learning_rate": 4.2421649298501415e-06, + "loss": 0.0975, + "step": 6670 + }, + { + "epoch": 0.56, + "grad_norm": 0.3080481959613241, + "learning_rate": 4.240816118813866e-06, + "loss": 0.0958, + "step": 6671 + }, + { + "epoch": 0.56, + "grad_norm": 0.18986314121026768, + "learning_rate": 4.239467364326029e-06, + "loss": 0.0331, + "step": 6672 + }, + { + "epoch": 0.56, + "grad_norm": 0.3868883046940507, + "learning_rate": 4.238118666487092e-06, + "loss": 0.0788, + "step": 6673 + }, + { + "epoch": 0.56, + "grad_norm": 0.33729334016385415, + "learning_rate": 4.236770025397515e-06, + "loss": 0.1042, + "step": 6674 + }, + { + "epoch": 0.56, + "grad_norm": 0.264508847821021, + "learning_rate": 4.235421441157753e-06, + "loss": 0.0636, + "step": 6675 + }, + { + "epoch": 0.56, + "grad_norm": 0.307877421132644, + "learning_rate": 4.234072913868257e-06, + "loss": 0.0763, + "step": 6676 + }, + { + "epoch": 0.56, + "grad_norm": 0.3351767497350244, + "learning_rate": 4.232724443629471e-06, + "loss": 0.0844, + "step": 6677 + }, + { + "epoch": 0.56, + "grad_norm": 0.22647589931448567, + "learning_rate": 4.231376030541838e-06, + "loss": 0.0556, + "step": 6678 + }, + { + "epoch": 0.56, + "grad_norm": 0.5359067427416808, + "learning_rate": 4.2300276747057965e-06, + "loss": 0.1352, + "step": 6679 + }, + { + "epoch": 0.56, + "grad_norm": 0.2847611911006017, + "learning_rate": 4.228679376221779e-06, + "loss": 0.086, + "step": 6680 + }, + { + "epoch": 0.56, + "grad_norm": 0.4064726869747234, + "learning_rate": 4.227331135190215e-06, + "loss": 0.094, + "step": 6681 + }, + { + "epoch": 0.56, + "grad_norm": 0.41168425302614675, + "learning_rate": 4.22598295171153e-06, + "loss": 0.112, + "step": 6682 + }, + { + "epoch": 0.56, + "grad_norm": 0.4657560318216727, + "learning_rate": 4.224634825886141e-06, + "loss": 0.1181, + "step": 6683 + }, + { + "epoch": 0.56, + "grad_norm": 0.37438699301306105, + "learning_rate": 4.22328675781447e-06, + "loss": 0.0763, + "step": 6684 + }, + { + "epoch": 0.56, + "grad_norm": 0.40047767201660317, + "learning_rate": 4.221938747596924e-06, + "loss": 0.0785, + "step": 6685 + }, + { + "epoch": 0.56, + "grad_norm": 0.3412003601161075, + "learning_rate": 4.220590795333914e-06, + "loss": 0.1131, + "step": 6686 + }, + { + "epoch": 0.56, + "grad_norm": 0.46762750466251835, + "learning_rate": 4.21924290112584e-06, + "loss": 0.1169, + "step": 6687 + }, + { + "epoch": 0.56, + "grad_norm": 0.28760476594750617, + "learning_rate": 4.217895065073105e-06, + "loss": 0.0737, + "step": 6688 + }, + { + "epoch": 0.56, + "grad_norm": 0.31362829091012934, + "learning_rate": 4.216547287276102e-06, + "loss": 0.0852, + "step": 6689 + }, + { + "epoch": 0.56, + "grad_norm": 0.25126486213224736, + "learning_rate": 4.215199567835217e-06, + "loss": 0.0623, + "step": 6690 + }, + { + "epoch": 0.56, + "grad_norm": 0.2922482543097791, + "learning_rate": 4.213851906850844e-06, + "loss": 0.0935, + "step": 6691 + }, + { + "epoch": 0.56, + "grad_norm": 0.30804714623059265, + "learning_rate": 4.21250430442336e-06, + "loss": 0.067, + "step": 6692 + }, + { + "epoch": 0.56, + "grad_norm": 0.340713815975127, + "learning_rate": 4.211156760653143e-06, + "loss": 0.089, + "step": 6693 + }, + { + "epoch": 0.56, + "grad_norm": 0.16815655547344227, + "learning_rate": 4.209809275640564e-06, + "loss": 0.0517, + "step": 6694 + }, + { + "epoch": 0.56, + "grad_norm": 0.36198770625380794, + "learning_rate": 4.208461849485995e-06, + "loss": 0.0936, + "step": 6695 + }, + { + "epoch": 0.56, + "grad_norm": 0.1606019699919142, + "learning_rate": 4.2071144822897985e-06, + "loss": 0.0334, + "step": 6696 + }, + { + "epoch": 0.56, + "grad_norm": 0.3762244402287604, + "learning_rate": 4.205767174152332e-06, + "loss": 0.0902, + "step": 6697 + }, + { + "epoch": 0.56, + "grad_norm": 0.28364316221310926, + "learning_rate": 4.204419925173956e-06, + "loss": 0.069, + "step": 6698 + }, + { + "epoch": 0.56, + "grad_norm": 0.3767236506449875, + "learning_rate": 4.203072735455018e-06, + "loss": 0.0715, + "step": 6699 + }, + { + "epoch": 0.56, + "grad_norm": 0.39123230195369724, + "learning_rate": 4.201725605095863e-06, + "loss": 0.1014, + "step": 6700 + }, + { + "epoch": 0.56, + "grad_norm": 0.3160990980692873, + "learning_rate": 4.200378534196838e-06, + "loss": 0.0739, + "step": 6701 + }, + { + "epoch": 0.56, + "grad_norm": 0.2827541470515303, + "learning_rate": 4.199031522858277e-06, + "loss": 0.0702, + "step": 6702 + }, + { + "epoch": 0.56, + "grad_norm": 0.3362880977363988, + "learning_rate": 4.197684571180515e-06, + "loss": 0.079, + "step": 6703 + }, + { + "epoch": 0.56, + "grad_norm": 0.3799807391828085, + "learning_rate": 4.196337679263877e-06, + "loss": 0.1208, + "step": 6704 + }, + { + "epoch": 0.56, + "grad_norm": 0.38750061256211654, + "learning_rate": 4.194990847208693e-06, + "loss": 0.0849, + "step": 6705 + }, + { + "epoch": 0.57, + "grad_norm": 0.23492537434783853, + "learning_rate": 4.193644075115279e-06, + "loss": 0.0664, + "step": 6706 + }, + { + "epoch": 0.57, + "grad_norm": 0.3812605257978098, + "learning_rate": 4.192297363083952e-06, + "loss": 0.0792, + "step": 6707 + }, + { + "epoch": 0.57, + "grad_norm": 0.2494947143137089, + "learning_rate": 4.1909507112150225e-06, + "loss": 0.1029, + "step": 6708 + }, + { + "epoch": 0.57, + "grad_norm": 0.3435038879352796, + "learning_rate": 4.189604119608798e-06, + "loss": 0.095, + "step": 6709 + }, + { + "epoch": 0.57, + "grad_norm": 0.40538101682228045, + "learning_rate": 4.1882575883655795e-06, + "loss": 0.1293, + "step": 6710 + }, + { + "epoch": 0.57, + "grad_norm": 0.2693400560496811, + "learning_rate": 4.186911117585663e-06, + "loss": 0.0777, + "step": 6711 + }, + { + "epoch": 0.57, + "grad_norm": 0.3433320318061915, + "learning_rate": 4.185564707369344e-06, + "loss": 0.0806, + "step": 6712 + }, + { + "epoch": 0.57, + "grad_norm": 0.41652511664925856, + "learning_rate": 4.184218357816911e-06, + "loss": 0.076, + "step": 6713 + }, + { + "epoch": 0.57, + "grad_norm": 0.4008158984150123, + "learning_rate": 4.182872069028645e-06, + "loss": 0.0587, + "step": 6714 + }, + { + "epoch": 0.57, + "grad_norm": 0.24111092261750192, + "learning_rate": 4.181525841104829e-06, + "loss": 0.0856, + "step": 6715 + }, + { + "epoch": 0.57, + "grad_norm": 0.2913536126368673, + "learning_rate": 4.180179674145737e-06, + "loss": 0.0717, + "step": 6716 + }, + { + "epoch": 0.57, + "grad_norm": 0.3898928730357047, + "learning_rate": 4.178833568251636e-06, + "loss": 0.0939, + "step": 6717 + }, + { + "epoch": 0.57, + "grad_norm": 0.23194975690205097, + "learning_rate": 4.177487523522796e-06, + "loss": 0.05, + "step": 6718 + }, + { + "epoch": 0.57, + "grad_norm": 0.26129087501936504, + "learning_rate": 4.176141540059477e-06, + "loss": 0.0775, + "step": 6719 + }, + { + "epoch": 0.57, + "grad_norm": 0.3249522802961273, + "learning_rate": 4.174795617961936e-06, + "loss": 0.0799, + "step": 6720 + }, + { + "epoch": 0.57, + "grad_norm": 0.3765665308350886, + "learning_rate": 4.173449757330422e-06, + "loss": 0.0889, + "step": 6721 + }, + { + "epoch": 0.57, + "grad_norm": 0.3848623002172383, + "learning_rate": 4.172103958265187e-06, + "loss": 0.0824, + "step": 6722 + }, + { + "epoch": 0.57, + "grad_norm": 0.2947809595564576, + "learning_rate": 4.170758220866473e-06, + "loss": 0.0825, + "step": 6723 + }, + { + "epoch": 0.57, + "grad_norm": 0.32852547407410143, + "learning_rate": 4.169412545234515e-06, + "loss": 0.0997, + "step": 6724 + }, + { + "epoch": 0.57, + "grad_norm": 0.2917985091144354, + "learning_rate": 4.1680669314695506e-06, + "loss": 0.1093, + "step": 6725 + }, + { + "epoch": 0.57, + "grad_norm": 0.3676117906077204, + "learning_rate": 4.166721379671807e-06, + "loss": 0.0759, + "step": 6726 + }, + { + "epoch": 0.57, + "grad_norm": 0.4158686518769835, + "learning_rate": 4.165375889941511e-06, + "loss": 0.1175, + "step": 6727 + }, + { + "epoch": 0.57, + "grad_norm": 0.33874976097916226, + "learning_rate": 4.164030462378877e-06, + "loss": 0.0879, + "step": 6728 + }, + { + "epoch": 0.57, + "grad_norm": 0.6191105705592647, + "learning_rate": 4.162685097084127e-06, + "loss": 0.1136, + "step": 6729 + }, + { + "epoch": 0.57, + "grad_norm": 0.3394354024915789, + "learning_rate": 4.161339794157469e-06, + "loss": 0.0702, + "step": 6730 + }, + { + "epoch": 0.57, + "grad_norm": 0.18324020252193918, + "learning_rate": 4.159994553699106e-06, + "loss": 0.0681, + "step": 6731 + }, + { + "epoch": 0.57, + "grad_norm": 0.3528293076985753, + "learning_rate": 4.158649375809245e-06, + "loss": 0.1096, + "step": 6732 + }, + { + "epoch": 0.57, + "grad_norm": 0.301369646015412, + "learning_rate": 4.157304260588078e-06, + "loss": 0.0429, + "step": 6733 + }, + { + "epoch": 0.57, + "grad_norm": 0.21058740119242356, + "learning_rate": 4.1559592081358005e-06, + "loss": 0.0472, + "step": 6734 + }, + { + "epoch": 0.57, + "grad_norm": 0.34059933226435074, + "learning_rate": 4.154614218552595e-06, + "loss": 0.1106, + "step": 6735 + }, + { + "epoch": 0.57, + "grad_norm": 0.26349059294240984, + "learning_rate": 4.153269291938649e-06, + "loss": 0.0595, + "step": 6736 + }, + { + "epoch": 0.57, + "grad_norm": 0.14022659714520191, + "learning_rate": 4.15192442839414e-06, + "loss": 0.0416, + "step": 6737 + }, + { + "epoch": 0.57, + "grad_norm": 0.3047515663740909, + "learning_rate": 4.150579628019237e-06, + "loss": 0.0757, + "step": 6738 + }, + { + "epoch": 0.57, + "grad_norm": 0.4675222058245978, + "learning_rate": 4.149234890914113e-06, + "loss": 0.1265, + "step": 6739 + }, + { + "epoch": 0.57, + "grad_norm": 0.23816280354749028, + "learning_rate": 4.147890217178932e-06, + "loss": 0.0598, + "step": 6740 + }, + { + "epoch": 0.57, + "grad_norm": 0.42222867059924685, + "learning_rate": 4.146545606913849e-06, + "loss": 0.1091, + "step": 6741 + }, + { + "epoch": 0.57, + "grad_norm": 0.9252420334686703, + "learning_rate": 4.145201060219022e-06, + "loss": 0.1097, + "step": 6742 + }, + { + "epoch": 0.57, + "grad_norm": 0.2079929274586764, + "learning_rate": 4.1438565771946005e-06, + "loss": 0.0536, + "step": 6743 + }, + { + "epoch": 0.57, + "grad_norm": 0.30994152102768036, + "learning_rate": 4.142512157940728e-06, + "loss": 0.0889, + "step": 6744 + }, + { + "epoch": 0.57, + "grad_norm": 0.44675145354536105, + "learning_rate": 4.141167802557544e-06, + "loss": 0.1126, + "step": 6745 + }, + { + "epoch": 0.57, + "grad_norm": 0.5829347882920988, + "learning_rate": 4.1398235111451865e-06, + "loss": 0.1497, + "step": 6746 + }, + { + "epoch": 0.57, + "grad_norm": 0.244406470226285, + "learning_rate": 4.138479283803785e-06, + "loss": 0.0625, + "step": 6747 + }, + { + "epoch": 0.57, + "grad_norm": 0.31077116515650294, + "learning_rate": 4.137135120633464e-06, + "loss": 0.1108, + "step": 6748 + }, + { + "epoch": 0.57, + "grad_norm": 0.3622631766485714, + "learning_rate": 4.135791021734347e-06, + "loss": 0.0871, + "step": 6749 + }, + { + "epoch": 0.57, + "grad_norm": 0.3539181512870654, + "learning_rate": 4.134446987206548e-06, + "loss": 0.1246, + "step": 6750 + }, + { + "epoch": 0.57, + "grad_norm": 0.19785518944316133, + "learning_rate": 4.13310301715018e-06, + "loss": 0.0451, + "step": 6751 + }, + { + "epoch": 0.57, + "grad_norm": 0.3095474536570035, + "learning_rate": 4.131759111665349e-06, + "loss": 0.0822, + "step": 6752 + }, + { + "epoch": 0.57, + "grad_norm": 0.23929064271995967, + "learning_rate": 4.130415270852158e-06, + "loss": 0.0703, + "step": 6753 + }, + { + "epoch": 0.57, + "grad_norm": 0.33406849249711207, + "learning_rate": 4.129071494810704e-06, + "loss": 0.1347, + "step": 6754 + }, + { + "epoch": 0.57, + "grad_norm": 0.3155874898517172, + "learning_rate": 4.127727783641076e-06, + "loss": 0.0812, + "step": 6755 + }, + { + "epoch": 0.57, + "grad_norm": 0.34216318485910996, + "learning_rate": 4.126384137443365e-06, + "loss": 0.0747, + "step": 6756 + }, + { + "epoch": 0.57, + "grad_norm": 0.3134103989651475, + "learning_rate": 4.125040556317654e-06, + "loss": 0.0736, + "step": 6757 + }, + { + "epoch": 0.57, + "grad_norm": 0.3198481571532385, + "learning_rate": 4.123697040364018e-06, + "loss": 0.0645, + "step": 6758 + }, + { + "epoch": 0.57, + "grad_norm": 0.2766614292535357, + "learning_rate": 4.122353589682531e-06, + "loss": 0.0874, + "step": 6759 + }, + { + "epoch": 0.57, + "grad_norm": 0.4309210900063584, + "learning_rate": 4.121010204373264e-06, + "loss": 0.0607, + "step": 6760 + }, + { + "epoch": 0.57, + "grad_norm": 0.3778376034182019, + "learning_rate": 4.1196668845362755e-06, + "loss": 0.1185, + "step": 6761 + }, + { + "epoch": 0.57, + "grad_norm": 0.33557607650423654, + "learning_rate": 4.118323630271624e-06, + "loss": 0.0928, + "step": 6762 + }, + { + "epoch": 0.57, + "grad_norm": 0.3017778456538718, + "learning_rate": 4.116980441679368e-06, + "loss": 0.0588, + "step": 6763 + }, + { + "epoch": 0.57, + "grad_norm": 0.38555669907917045, + "learning_rate": 4.115637318859551e-06, + "loss": 0.0925, + "step": 6764 + }, + { + "epoch": 0.57, + "grad_norm": 0.3107414615073564, + "learning_rate": 4.114294261912217e-06, + "loss": 0.1089, + "step": 6765 + }, + { + "epoch": 0.57, + "grad_norm": 0.27989614983308553, + "learning_rate": 4.112951270937409e-06, + "loss": 0.0667, + "step": 6766 + }, + { + "epoch": 0.57, + "grad_norm": 0.30633754155861675, + "learning_rate": 4.111608346035156e-06, + "loss": 0.0769, + "step": 6767 + }, + { + "epoch": 0.57, + "grad_norm": 0.21228421401237366, + "learning_rate": 4.11026548730549e-06, + "loss": 0.0471, + "step": 6768 + }, + { + "epoch": 0.57, + "grad_norm": 0.29747965032265683, + "learning_rate": 4.108922694848431e-06, + "loss": 0.0767, + "step": 6769 + }, + { + "epoch": 0.57, + "grad_norm": 0.4148808617750333, + "learning_rate": 4.107579968764003e-06, + "loss": 0.1203, + "step": 6770 + }, + { + "epoch": 0.57, + "grad_norm": 0.31096328151370295, + "learning_rate": 4.106237309152216e-06, + "loss": 0.1061, + "step": 6771 + }, + { + "epoch": 0.57, + "grad_norm": 0.1869184244570876, + "learning_rate": 4.104894716113081e-06, + "loss": 0.0676, + "step": 6772 + }, + { + "epoch": 0.57, + "grad_norm": 0.2126062417106381, + "learning_rate": 4.103552189746603e-06, + "loss": 0.0507, + "step": 6773 + }, + { + "epoch": 0.57, + "grad_norm": 0.20945189582317772, + "learning_rate": 4.10220973015278e-06, + "loss": 0.0674, + "step": 6774 + }, + { + "epoch": 0.57, + "grad_norm": 0.314407288815462, + "learning_rate": 4.100867337431605e-06, + "loss": 0.0843, + "step": 6775 + }, + { + "epoch": 0.57, + "grad_norm": 0.3708977701115243, + "learning_rate": 4.099525011683069e-06, + "loss": 0.0916, + "step": 6776 + }, + { + "epoch": 0.57, + "grad_norm": 0.3258221600453027, + "learning_rate": 4.098182753007157e-06, + "loss": 0.0873, + "step": 6777 + }, + { + "epoch": 0.57, + "grad_norm": 0.29359793059488043, + "learning_rate": 4.096840561503847e-06, + "loss": 0.0851, + "step": 6778 + }, + { + "epoch": 0.57, + "grad_norm": 0.32472828410485166, + "learning_rate": 4.095498437273112e-06, + "loss": 0.0778, + "step": 6779 + }, + { + "epoch": 0.57, + "grad_norm": 0.3430649384796402, + "learning_rate": 4.094156380414922e-06, + "loss": 0.0923, + "step": 6780 + }, + { + "epoch": 0.57, + "grad_norm": 0.2590180899862874, + "learning_rate": 4.092814391029244e-06, + "loss": 0.059, + "step": 6781 + }, + { + "epoch": 0.57, + "grad_norm": 0.3245146174371091, + "learning_rate": 4.091472469216033e-06, + "loss": 0.0889, + "step": 6782 + }, + { + "epoch": 0.57, + "grad_norm": 0.2985357030479436, + "learning_rate": 4.090130615075246e-06, + "loss": 0.0837, + "step": 6783 + }, + { + "epoch": 0.57, + "grad_norm": 0.3032639462786435, + "learning_rate": 4.0887888287068315e-06, + "loss": 0.0835, + "step": 6784 + }, + { + "epoch": 0.57, + "grad_norm": 0.37380290910973657, + "learning_rate": 4.0874471102107325e-06, + "loss": 0.0718, + "step": 6785 + }, + { + "epoch": 0.57, + "grad_norm": 0.47892256937214184, + "learning_rate": 4.086105459686888e-06, + "loss": 0.0839, + "step": 6786 + }, + { + "epoch": 0.57, + "grad_norm": 0.2533923036998749, + "learning_rate": 4.084763877235234e-06, + "loss": 0.0693, + "step": 6787 + }, + { + "epoch": 0.57, + "grad_norm": 0.30186878019287416, + "learning_rate": 4.083422362955698e-06, + "loss": 0.0862, + "step": 6788 + }, + { + "epoch": 0.57, + "grad_norm": 0.21807618499576664, + "learning_rate": 4.082080916948203e-06, + "loss": 0.0372, + "step": 6789 + }, + { + "epoch": 0.57, + "grad_norm": 0.32440083919954177, + "learning_rate": 4.080739539312669e-06, + "loss": 0.0979, + "step": 6790 + }, + { + "epoch": 0.57, + "grad_norm": 0.2716222596817052, + "learning_rate": 4.07939823014901e-06, + "loss": 0.0565, + "step": 6791 + }, + { + "epoch": 0.57, + "grad_norm": 0.18935570222734865, + "learning_rate": 4.0780569895571305e-06, + "loss": 0.0476, + "step": 6792 + }, + { + "epoch": 0.57, + "grad_norm": 0.23267733586860823, + "learning_rate": 4.07671581763694e-06, + "loss": 0.0468, + "step": 6793 + }, + { + "epoch": 0.57, + "grad_norm": 0.40777128513992217, + "learning_rate": 4.075374714488335e-06, + "loss": 0.0718, + "step": 6794 + }, + { + "epoch": 0.57, + "grad_norm": 0.3291297067225434, + "learning_rate": 4.0740336802112065e-06, + "loss": 0.105, + "step": 6795 + }, + { + "epoch": 0.57, + "grad_norm": 0.40748911200926774, + "learning_rate": 4.072692714905441e-06, + "loss": 0.1269, + "step": 6796 + }, + { + "epoch": 0.57, + "grad_norm": 0.34523880170730464, + "learning_rate": 4.071351818670928e-06, + "loss": 0.0732, + "step": 6797 + }, + { + "epoch": 0.57, + "grad_norm": 0.36771245433661803, + "learning_rate": 4.07001099160754e-06, + "loss": 0.1244, + "step": 6798 + }, + { + "epoch": 0.57, + "grad_norm": 0.36303065929658823, + "learning_rate": 4.0686702338151485e-06, + "loss": 0.0919, + "step": 6799 + }, + { + "epoch": 0.57, + "grad_norm": 0.27990487172064493, + "learning_rate": 4.067329545393626e-06, + "loss": 0.0766, + "step": 6800 + }, + { + "epoch": 0.57, + "grad_norm": 0.41379882302685794, + "learning_rate": 4.065988926442833e-06, + "loss": 0.1033, + "step": 6801 + }, + { + "epoch": 0.57, + "grad_norm": 0.41654549464294943, + "learning_rate": 4.064648377062624e-06, + "loss": 0.0857, + "step": 6802 + }, + { + "epoch": 0.57, + "grad_norm": 0.27639101990603476, + "learning_rate": 4.063307897352853e-06, + "loss": 0.0875, + "step": 6803 + }, + { + "epoch": 0.57, + "grad_norm": 0.4579069408372044, + "learning_rate": 4.061967487413366e-06, + "loss": 0.0954, + "step": 6804 + }, + { + "epoch": 0.57, + "grad_norm": 0.5178593222827905, + "learning_rate": 4.060627147344006e-06, + "loss": 0.1168, + "step": 6805 + }, + { + "epoch": 0.57, + "grad_norm": 0.5750232642417462, + "learning_rate": 4.059286877244606e-06, + "loss": 0.1279, + "step": 6806 + }, + { + "epoch": 0.57, + "grad_norm": 0.35758193837051583, + "learning_rate": 4.057946677215002e-06, + "loss": 0.0891, + "step": 6807 + }, + { + "epoch": 0.57, + "grad_norm": 0.3680931555999769, + "learning_rate": 4.056606547355017e-06, + "loss": 0.085, + "step": 6808 + }, + { + "epoch": 0.57, + "grad_norm": 0.3928735116157584, + "learning_rate": 4.055266487764472e-06, + "loss": 0.106, + "step": 6809 + }, + { + "epoch": 0.57, + "grad_norm": 0.33825824393667336, + "learning_rate": 4.0539264985431815e-06, + "loss": 0.081, + "step": 6810 + }, + { + "epoch": 0.57, + "grad_norm": 0.2077629439405697, + "learning_rate": 4.052586579790958e-06, + "loss": 0.0764, + "step": 6811 + }, + { + "epoch": 0.57, + "grad_norm": 0.2307810704491016, + "learning_rate": 4.051246731607605e-06, + "loss": 0.0627, + "step": 6812 + }, + { + "epoch": 0.57, + "grad_norm": 0.5049284676722581, + "learning_rate": 4.049906954092922e-06, + "loss": 0.1031, + "step": 6813 + }, + { + "epoch": 0.57, + "grad_norm": 0.41505499428228176, + "learning_rate": 4.048567247346702e-06, + "loss": 0.0903, + "step": 6814 + }, + { + "epoch": 0.57, + "grad_norm": 0.20259650983511962, + "learning_rate": 4.047227611468738e-06, + "loss": 0.0571, + "step": 6815 + }, + { + "epoch": 0.57, + "grad_norm": 0.40376554744034776, + "learning_rate": 4.045888046558811e-06, + "loss": 0.0813, + "step": 6816 + }, + { + "epoch": 0.57, + "grad_norm": 0.3720988875762975, + "learning_rate": 4.044548552716702e-06, + "loss": 0.1188, + "step": 6817 + }, + { + "epoch": 0.57, + "grad_norm": 0.3338429090223802, + "learning_rate": 4.043209130042182e-06, + "loss": 0.0827, + "step": 6818 + }, + { + "epoch": 0.57, + "grad_norm": 0.18049145025515426, + "learning_rate": 4.041869778635018e-06, + "loss": 0.0396, + "step": 6819 + }, + { + "epoch": 0.57, + "grad_norm": 0.31032778295999447, + "learning_rate": 4.040530498594975e-06, + "loss": 0.0602, + "step": 6820 + }, + { + "epoch": 0.57, + "grad_norm": 0.20363874233113247, + "learning_rate": 4.039191290021811e-06, + "loss": 0.0399, + "step": 6821 + }, + { + "epoch": 0.57, + "grad_norm": 0.18534918619017265, + "learning_rate": 4.0378521530152765e-06, + "loss": 0.0664, + "step": 6822 + }, + { + "epoch": 0.57, + "grad_norm": 0.343690806614877, + "learning_rate": 4.0365130876751166e-06, + "loss": 0.0931, + "step": 6823 + }, + { + "epoch": 0.57, + "grad_norm": 0.27651327933324993, + "learning_rate": 4.035174094101077e-06, + "loss": 0.0732, + "step": 6824 + }, + { + "epoch": 0.58, + "grad_norm": 0.7756232879371485, + "learning_rate": 4.033835172392891e-06, + "loss": 0.1068, + "step": 6825 + }, + { + "epoch": 0.58, + "grad_norm": 0.3801855831507543, + "learning_rate": 4.0324963226502875e-06, + "loss": 0.0621, + "step": 6826 + }, + { + "epoch": 0.58, + "grad_norm": 0.3525152499037248, + "learning_rate": 4.031157544972997e-06, + "loss": 0.083, + "step": 6827 + }, + { + "epoch": 0.58, + "grad_norm": 0.4165430351194655, + "learning_rate": 4.029818839460735e-06, + "loss": 0.1311, + "step": 6828 + }, + { + "epoch": 0.58, + "grad_norm": 0.20623182200542317, + "learning_rate": 4.0284802062132186e-06, + "loss": 0.0482, + "step": 6829 + }, + { + "epoch": 0.58, + "grad_norm": 0.16966207710350995, + "learning_rate": 4.027141645330154e-06, + "loss": 0.0448, + "step": 6830 + }, + { + "epoch": 0.58, + "grad_norm": 0.31182183818868703, + "learning_rate": 4.025803156911249e-06, + "loss": 0.1121, + "step": 6831 + }, + { + "epoch": 0.58, + "grad_norm": 0.38283168639700876, + "learning_rate": 4.0244647410562e-06, + "loss": 0.081, + "step": 6832 + }, + { + "epoch": 0.58, + "grad_norm": 0.4177354344355331, + "learning_rate": 4.023126397864697e-06, + "loss": 0.1224, + "step": 6833 + }, + { + "epoch": 0.58, + "grad_norm": 0.31735514000818316, + "learning_rate": 4.021788127436434e-06, + "loss": 0.1078, + "step": 6834 + }, + { + "epoch": 0.58, + "grad_norm": 0.3094289346702054, + "learning_rate": 4.020449929871089e-06, + "loss": 0.1018, + "step": 6835 + }, + { + "epoch": 0.58, + "grad_norm": 0.20002493537778415, + "learning_rate": 4.01911180526834e-06, + "loss": 0.045, + "step": 6836 + }, + { + "epoch": 0.58, + "grad_norm": 0.2180440884792028, + "learning_rate": 4.017773753727855e-06, + "loss": 0.0441, + "step": 6837 + }, + { + "epoch": 0.58, + "grad_norm": 0.30165629935327953, + "learning_rate": 4.0164357753493054e-06, + "loss": 0.09, + "step": 6838 + }, + { + "epoch": 0.58, + "grad_norm": 0.29417436071160563, + "learning_rate": 4.015097870232349e-06, + "loss": 0.052, + "step": 6839 + }, + { + "epoch": 0.58, + "grad_norm": 0.38880938549244015, + "learning_rate": 4.013760038476638e-06, + "loss": 0.1034, + "step": 6840 + }, + { + "epoch": 0.58, + "grad_norm": 0.32806614950758095, + "learning_rate": 4.012422280181827e-06, + "loss": 0.1055, + "step": 6841 + }, + { + "epoch": 0.58, + "grad_norm": 0.19801827242258774, + "learning_rate": 4.011084595447558e-06, + "loss": 0.0354, + "step": 6842 + }, + { + "epoch": 0.58, + "grad_norm": 0.2273540229962862, + "learning_rate": 4.009746984373469e-06, + "loss": 0.0661, + "step": 6843 + }, + { + "epoch": 0.58, + "grad_norm": 0.3107326713018466, + "learning_rate": 4.00840944705919e-06, + "loss": 0.0822, + "step": 6844 + }, + { + "epoch": 0.58, + "grad_norm": 0.33049864915313754, + "learning_rate": 4.007071983604356e-06, + "loss": 0.0761, + "step": 6845 + }, + { + "epoch": 0.58, + "grad_norm": 0.26568217506266945, + "learning_rate": 4.005734594108583e-06, + "loss": 0.0515, + "step": 6846 + }, + { + "epoch": 0.58, + "grad_norm": 0.13978706738923913, + "learning_rate": 4.00439727867149e-06, + "loss": 0.0345, + "step": 6847 + }, + { + "epoch": 0.58, + "grad_norm": 0.2982093507094647, + "learning_rate": 4.003060037392687e-06, + "loss": 0.0934, + "step": 6848 + }, + { + "epoch": 0.58, + "grad_norm": 0.3503974564286525, + "learning_rate": 4.00172287037178e-06, + "loss": 0.1076, + "step": 6849 + }, + { + "epoch": 0.58, + "grad_norm": 0.7170834209883835, + "learning_rate": 4.00038577770837e-06, + "loss": 0.0755, + "step": 6850 + }, + { + "epoch": 0.58, + "grad_norm": 0.39267547573702943, + "learning_rate": 3.99904875950205e-06, + "loss": 0.0721, + "step": 6851 + }, + { + "epoch": 0.58, + "grad_norm": 0.3600723060561483, + "learning_rate": 3.99771181585241e-06, + "loss": 0.107, + "step": 6852 + }, + { + "epoch": 0.58, + "grad_norm": 0.4613315424069212, + "learning_rate": 3.996374946859031e-06, + "loss": 0.0875, + "step": 6853 + }, + { + "epoch": 0.58, + "grad_norm": 0.7428663279329595, + "learning_rate": 3.995038152621493e-06, + "loss": 0.0806, + "step": 6854 + }, + { + "epoch": 0.58, + "grad_norm": 0.32532917000953604, + "learning_rate": 3.993701433239368e-06, + "loss": 0.0993, + "step": 6855 + }, + { + "epoch": 0.58, + "grad_norm": 0.2067053801008601, + "learning_rate": 3.992364788812223e-06, + "loss": 0.0558, + "step": 6856 + }, + { + "epoch": 0.58, + "grad_norm": 0.2842408093994554, + "learning_rate": 3.991028219439616e-06, + "loss": 0.1108, + "step": 6857 + }, + { + "epoch": 0.58, + "grad_norm": 0.26767623446334693, + "learning_rate": 3.989691725221107e-06, + "loss": 0.0594, + "step": 6858 + }, + { + "epoch": 0.58, + "grad_norm": 0.3248118158462626, + "learning_rate": 3.988355306256245e-06, + "loss": 0.0371, + "step": 6859 + }, + { + "epoch": 0.58, + "grad_norm": 0.3112313666799693, + "learning_rate": 3.987018962644572e-06, + "loss": 0.0791, + "step": 6860 + }, + { + "epoch": 0.58, + "grad_norm": 0.5887704456283226, + "learning_rate": 3.985682694485627e-06, + "loss": 0.1529, + "step": 6861 + }, + { + "epoch": 0.58, + "grad_norm": 0.38791183324123424, + "learning_rate": 3.984346501878944e-06, + "loss": 0.0866, + "step": 6862 + }, + { + "epoch": 0.58, + "grad_norm": 0.302491237976391, + "learning_rate": 3.9830103849240505e-06, + "loss": 0.0855, + "step": 6863 + }, + { + "epoch": 0.58, + "grad_norm": 0.3255217508390222, + "learning_rate": 3.981674343720466e-06, + "loss": 0.0729, + "step": 6864 + }, + { + "epoch": 0.58, + "grad_norm": 0.26594685480403674, + "learning_rate": 3.9803383783677105e-06, + "loss": 0.0743, + "step": 6865 + }, + { + "epoch": 0.58, + "grad_norm": 0.3306186169374885, + "learning_rate": 3.979002488965293e-06, + "loss": 0.0865, + "step": 6866 + }, + { + "epoch": 0.58, + "grad_norm": 0.30203550558194897, + "learning_rate": 3.977666675612716e-06, + "loss": 0.0913, + "step": 6867 + }, + { + "epoch": 0.58, + "grad_norm": 0.2507093305463405, + "learning_rate": 3.9763309384094826e-06, + "loss": 0.0916, + "step": 6868 + }, + { + "epoch": 0.58, + "grad_norm": 0.30186650052077385, + "learning_rate": 3.9749952774550835e-06, + "loss": 0.0917, + "step": 6869 + }, + { + "epoch": 0.58, + "grad_norm": 0.3288616087363759, + "learning_rate": 3.973659692849008e-06, + "loss": 0.0878, + "step": 6870 + }, + { + "epoch": 0.58, + "grad_norm": 1.0474023553411038, + "learning_rate": 3.972324184690735e-06, + "loss": 0.12, + "step": 6871 + }, + { + "epoch": 0.58, + "grad_norm": 0.2773767454393442, + "learning_rate": 3.9709887530797455e-06, + "loss": 0.079, + "step": 6872 + }, + { + "epoch": 0.58, + "grad_norm": 0.4087339597638231, + "learning_rate": 3.969653398115507e-06, + "loss": 0.0907, + "step": 6873 + }, + { + "epoch": 0.58, + "grad_norm": 0.7185180661234648, + "learning_rate": 3.9683181198974856e-06, + "loss": 0.1537, + "step": 6874 + }, + { + "epoch": 0.58, + "grad_norm": 0.3052978535512997, + "learning_rate": 3.966982918525141e-06, + "loss": 0.0871, + "step": 6875 + }, + { + "epoch": 0.58, + "grad_norm": 0.23011818774772066, + "learning_rate": 3.965647794097927e-06, + "loss": 0.0656, + "step": 6876 + }, + { + "epoch": 0.58, + "grad_norm": 0.2906644857966385, + "learning_rate": 3.964312746715291e-06, + "loss": 0.09, + "step": 6877 + }, + { + "epoch": 0.58, + "grad_norm": 0.4580226570243548, + "learning_rate": 3.962977776476673e-06, + "loss": 0.126, + "step": 6878 + }, + { + "epoch": 0.58, + "grad_norm": 0.2179894609241442, + "learning_rate": 3.961642883481514e-06, + "loss": 0.0793, + "step": 6879 + }, + { + "epoch": 0.58, + "grad_norm": 0.3149798752114038, + "learning_rate": 3.960308067829241e-06, + "loss": 0.1031, + "step": 6880 + }, + { + "epoch": 0.58, + "grad_norm": 0.36145864546505274, + "learning_rate": 3.958973329619278e-06, + "loss": 0.1012, + "step": 6881 + }, + { + "epoch": 0.58, + "grad_norm": 0.4653254560014289, + "learning_rate": 3.957638668951047e-06, + "loss": 0.1233, + "step": 6882 + }, + { + "epoch": 0.58, + "grad_norm": 0.22409894563567676, + "learning_rate": 3.956304085923961e-06, + "loss": 0.0763, + "step": 6883 + }, + { + "epoch": 0.58, + "grad_norm": 0.3170448879218194, + "learning_rate": 3.954969580637426e-06, + "loss": 0.0684, + "step": 6884 + }, + { + "epoch": 0.58, + "grad_norm": 0.36381311831357716, + "learning_rate": 3.953635153190845e-06, + "loss": 0.0919, + "step": 6885 + }, + { + "epoch": 0.58, + "grad_norm": 0.35663766732705743, + "learning_rate": 3.952300803683612e-06, + "loss": 0.108, + "step": 6886 + }, + { + "epoch": 0.58, + "grad_norm": 0.31699664296900637, + "learning_rate": 3.950966532215119e-06, + "loss": 0.0667, + "step": 6887 + }, + { + "epoch": 0.58, + "grad_norm": 0.37314035144374846, + "learning_rate": 3.949632338884749e-06, + "loss": 0.1008, + "step": 6888 + }, + { + "epoch": 0.58, + "grad_norm": 0.2807584043156559, + "learning_rate": 3.9482982237918834e-06, + "loss": 0.0795, + "step": 6889 + }, + { + "epoch": 0.58, + "grad_norm": 0.27005308049762156, + "learning_rate": 3.946964187035892e-06, + "loss": 0.0772, + "step": 6890 + }, + { + "epoch": 0.58, + "grad_norm": 0.26411207342231685, + "learning_rate": 3.94563022871614e-06, + "loss": 0.0614, + "step": 6891 + }, + { + "epoch": 0.58, + "grad_norm": 0.3958383665053706, + "learning_rate": 3.944296348931993e-06, + "loss": 0.0768, + "step": 6892 + }, + { + "epoch": 0.58, + "grad_norm": 0.21767363431582126, + "learning_rate": 3.942962547782804e-06, + "loss": 0.0579, + "step": 6893 + }, + { + "epoch": 0.58, + "grad_norm": 0.4735568060561059, + "learning_rate": 3.941628825367922e-06, + "loss": 0.1178, + "step": 6894 + }, + { + "epoch": 0.58, + "grad_norm": 0.28398262507960714, + "learning_rate": 3.940295181786688e-06, + "loss": 0.0731, + "step": 6895 + }, + { + "epoch": 0.58, + "grad_norm": 0.2456623769835113, + "learning_rate": 3.938961617138445e-06, + "loss": 0.0947, + "step": 6896 + }, + { + "epoch": 0.58, + "grad_norm": 0.34473921628690324, + "learning_rate": 3.937628131522521e-06, + "loss": 0.1109, + "step": 6897 + }, + { + "epoch": 0.58, + "grad_norm": 0.4360184885539232, + "learning_rate": 3.936294725038241e-06, + "loss": 0.1177, + "step": 6898 + }, + { + "epoch": 0.58, + "grad_norm": 0.2872847294398273, + "learning_rate": 3.9349613977849286e-06, + "loss": 0.0875, + "step": 6899 + }, + { + "epoch": 0.58, + "grad_norm": 0.27738662823783844, + "learning_rate": 3.933628149861895e-06, + "loss": 0.0754, + "step": 6900 + }, + { + "epoch": 0.58, + "grad_norm": 0.7029633715831742, + "learning_rate": 3.932294981368447e-06, + "loss": 0.1495, + "step": 6901 + }, + { + "epoch": 0.58, + "grad_norm": 0.19966650316120085, + "learning_rate": 3.930961892403892e-06, + "loss": 0.0475, + "step": 6902 + }, + { + "epoch": 0.58, + "grad_norm": 0.2816966978197229, + "learning_rate": 3.929628883067521e-06, + "loss": 0.0873, + "step": 6903 + }, + { + "epoch": 0.58, + "grad_norm": 0.22519645617762957, + "learning_rate": 3.928295953458627e-06, + "loss": 0.06, + "step": 6904 + }, + { + "epoch": 0.58, + "grad_norm": 0.45407714829663914, + "learning_rate": 3.926963103676492e-06, + "loss": 0.0948, + "step": 6905 + }, + { + "epoch": 0.58, + "grad_norm": 0.3753609797589767, + "learning_rate": 3.925630333820396e-06, + "loss": 0.0876, + "step": 6906 + }, + { + "epoch": 0.58, + "grad_norm": 0.42273827838680217, + "learning_rate": 3.924297643989614e-06, + "loss": 0.1221, + "step": 6907 + }, + { + "epoch": 0.58, + "grad_norm": 0.17436072127293975, + "learning_rate": 3.922965034283406e-06, + "loss": 0.0574, + "step": 6908 + }, + { + "epoch": 0.58, + "grad_norm": 0.20532330459128384, + "learning_rate": 3.92163250480104e-06, + "loss": 0.044, + "step": 6909 + }, + { + "epoch": 0.58, + "grad_norm": 0.26158196627028846, + "learning_rate": 3.920300055641765e-06, + "loss": 0.0828, + "step": 6910 + }, + { + "epoch": 0.58, + "grad_norm": 0.33341292629031166, + "learning_rate": 3.918967686904834e-06, + "loss": 0.091, + "step": 6911 + }, + { + "epoch": 0.58, + "grad_norm": 0.21122779945521636, + "learning_rate": 3.9176353986894826e-06, + "loss": 0.0619, + "step": 6912 + }, + { + "epoch": 0.58, + "grad_norm": 0.3879874587047194, + "learning_rate": 3.916303191094955e-06, + "loss": 0.1329, + "step": 6913 + }, + { + "epoch": 0.58, + "grad_norm": 0.28900429982951475, + "learning_rate": 3.91497106422048e-06, + "loss": 0.0623, + "step": 6914 + }, + { + "epoch": 0.58, + "grad_norm": 0.25485906322825685, + "learning_rate": 3.913639018165278e-06, + "loss": 0.0477, + "step": 6915 + }, + { + "epoch": 0.58, + "grad_norm": 0.3336056533499356, + "learning_rate": 3.912307053028571e-06, + "loss": 0.0957, + "step": 6916 + }, + { + "epoch": 0.58, + "grad_norm": 0.2855754068711921, + "learning_rate": 3.910975168909573e-06, + "loss": 0.0374, + "step": 6917 + }, + { + "epoch": 0.58, + "grad_norm": 0.13585458714134174, + "learning_rate": 3.909643365907486e-06, + "loss": 0.0353, + "step": 6918 + }, + { + "epoch": 0.58, + "grad_norm": 0.4751721332480166, + "learning_rate": 3.908311644121515e-06, + "loss": 0.1246, + "step": 6919 + }, + { + "epoch": 0.58, + "grad_norm": 0.26515906135054784, + "learning_rate": 3.906980003650852e-06, + "loss": 0.0777, + "step": 6920 + }, + { + "epoch": 0.58, + "grad_norm": 0.7268104034896427, + "learning_rate": 3.905648444594684e-06, + "loss": 0.0925, + "step": 6921 + }, + { + "epoch": 0.58, + "grad_norm": 0.22550233843603237, + "learning_rate": 3.904316967052196e-06, + "loss": 0.0643, + "step": 6922 + }, + { + "epoch": 0.58, + "grad_norm": 0.357974656274648, + "learning_rate": 3.902985571122563e-06, + "loss": 0.062, + "step": 6923 + }, + { + "epoch": 0.58, + "grad_norm": 0.2488885939456447, + "learning_rate": 3.901654256904957e-06, + "loss": 0.0562, + "step": 6924 + }, + { + "epoch": 0.58, + "grad_norm": 0.24560733442498642, + "learning_rate": 3.900323024498536e-06, + "loss": 0.0711, + "step": 6925 + }, + { + "epoch": 0.58, + "grad_norm": 0.316387671919573, + "learning_rate": 3.898991874002466e-06, + "loss": 0.0686, + "step": 6926 + }, + { + "epoch": 0.58, + "grad_norm": 0.3963199896160202, + "learning_rate": 3.897660805515894e-06, + "loss": 0.0977, + "step": 6927 + }, + { + "epoch": 0.58, + "grad_norm": 0.6910810913344372, + "learning_rate": 3.896329819137967e-06, + "loss": 0.1385, + "step": 6928 + }, + { + "epoch": 0.58, + "grad_norm": 0.22774201678755376, + "learning_rate": 3.8949989149678224e-06, + "loss": 0.0448, + "step": 6929 + }, + { + "epoch": 0.58, + "grad_norm": 0.40492149371090796, + "learning_rate": 3.893668093104598e-06, + "loss": 0.1083, + "step": 6930 + }, + { + "epoch": 0.58, + "grad_norm": 0.3799007974898237, + "learning_rate": 3.892337353647419e-06, + "loss": 0.0832, + "step": 6931 + }, + { + "epoch": 0.58, + "grad_norm": 0.25474780141915054, + "learning_rate": 3.891006696695404e-06, + "loss": 0.0833, + "step": 6932 + }, + { + "epoch": 0.58, + "grad_norm": 0.23560666806712227, + "learning_rate": 3.889676122347673e-06, + "loss": 0.0539, + "step": 6933 + }, + { + "epoch": 0.58, + "grad_norm": 0.36829838265912307, + "learning_rate": 3.888345630703333e-06, + "loss": 0.0933, + "step": 6934 + }, + { + "epoch": 0.58, + "grad_norm": 0.35923305151957297, + "learning_rate": 3.887015221861483e-06, + "loss": 0.0951, + "step": 6935 + }, + { + "epoch": 0.58, + "grad_norm": 0.3583063995201625, + "learning_rate": 3.885684895921226e-06, + "loss": 0.1175, + "step": 6936 + }, + { + "epoch": 0.58, + "grad_norm": 0.2571510882881295, + "learning_rate": 3.884354652981649e-06, + "loss": 0.0491, + "step": 6937 + }, + { + "epoch": 0.58, + "grad_norm": 0.5138359927628624, + "learning_rate": 3.883024493141836e-06, + "loss": 0.1114, + "step": 6938 + }, + { + "epoch": 0.58, + "grad_norm": 0.35977749156900724, + "learning_rate": 3.881694416500865e-06, + "loss": 0.1144, + "step": 6939 + }, + { + "epoch": 0.58, + "grad_norm": 0.3227372429212325, + "learning_rate": 3.880364423157809e-06, + "loss": 0.0494, + "step": 6940 + }, + { + "epoch": 0.58, + "grad_norm": 0.3252328144018323, + "learning_rate": 3.879034513211734e-06, + "loss": 0.0733, + "step": 6941 + }, + { + "epoch": 0.58, + "grad_norm": 0.5130741662260205, + "learning_rate": 3.8777046867616965e-06, + "loss": 0.1128, + "step": 6942 + }, + { + "epoch": 0.59, + "grad_norm": 0.3472291509855456, + "learning_rate": 3.876374943906753e-06, + "loss": 0.0897, + "step": 6943 + }, + { + "epoch": 0.59, + "grad_norm": 0.3482848141859018, + "learning_rate": 3.87504528474595e-06, + "loss": 0.0718, + "step": 6944 + }, + { + "epoch": 0.59, + "grad_norm": 0.21451089036494903, + "learning_rate": 3.873715709378327e-06, + "loss": 0.0721, + "step": 6945 + }, + { + "epoch": 0.59, + "grad_norm": 0.2860063289050506, + "learning_rate": 3.872386217902918e-06, + "loss": 0.0601, + "step": 6946 + }, + { + "epoch": 0.59, + "grad_norm": 0.2197182276614109, + "learning_rate": 3.871056810418754e-06, + "loss": 0.0573, + "step": 6947 + }, + { + "epoch": 0.59, + "grad_norm": 0.21099458045111344, + "learning_rate": 3.869727487024854e-06, + "loss": 0.0528, + "step": 6948 + }, + { + "epoch": 0.59, + "grad_norm": 0.27294417207788796, + "learning_rate": 3.8683982478202356e-06, + "loss": 0.0785, + "step": 6949 + }, + { + "epoch": 0.59, + "grad_norm": 0.2626738163970261, + "learning_rate": 3.867069092903908e-06, + "loss": 0.0627, + "step": 6950 + }, + { + "epoch": 0.59, + "grad_norm": 0.2153282411907624, + "learning_rate": 3.8657400223748746e-06, + "loss": 0.0641, + "step": 6951 + }, + { + "epoch": 0.59, + "grad_norm": 0.2517636050579485, + "learning_rate": 3.8644110363321305e-06, + "loss": 0.0681, + "step": 6952 + }, + { + "epoch": 0.59, + "grad_norm": 0.26457393409906205, + "learning_rate": 3.863082134874669e-06, + "loss": 0.0715, + "step": 6953 + }, + { + "epoch": 0.59, + "grad_norm": 0.3022005503802715, + "learning_rate": 3.861753318101473e-06, + "loss": 0.1287, + "step": 6954 + }, + { + "epoch": 0.59, + "grad_norm": 0.2863978156496446, + "learning_rate": 3.86042458611152e-06, + "loss": 0.0975, + "step": 6955 + }, + { + "epoch": 0.59, + "grad_norm": 0.325104885195721, + "learning_rate": 3.859095939003782e-06, + "loss": 0.085, + "step": 6956 + }, + { + "epoch": 0.59, + "grad_norm": 0.31063211095875765, + "learning_rate": 3.857767376877226e-06, + "loss": 0.0492, + "step": 6957 + }, + { + "epoch": 0.59, + "grad_norm": 0.3137533087387582, + "learning_rate": 3.85643889983081e-06, + "loss": 0.0799, + "step": 6958 + }, + { + "epoch": 0.59, + "grad_norm": 0.25382124918736076, + "learning_rate": 3.855110507963484e-06, + "loss": 0.0537, + "step": 6959 + }, + { + "epoch": 0.59, + "grad_norm": 0.3075078301860931, + "learning_rate": 3.853782201374198e-06, + "loss": 0.0706, + "step": 6960 + }, + { + "epoch": 0.59, + "grad_norm": 0.19160549305484859, + "learning_rate": 3.852453980161893e-06, + "loss": 0.0641, + "step": 6961 + }, + { + "epoch": 0.59, + "grad_norm": 0.4451107614333401, + "learning_rate": 3.851125844425499e-06, + "loss": 0.0676, + "step": 6962 + }, + { + "epoch": 0.59, + "grad_norm": 0.2155923247640567, + "learning_rate": 3.849797794263943e-06, + "loss": 0.0568, + "step": 6963 + }, + { + "epoch": 0.59, + "grad_norm": 0.3975958788434452, + "learning_rate": 3.84846982977615e-06, + "loss": 0.1189, + "step": 6964 + }, + { + "epoch": 0.59, + "grad_norm": 0.5710769646276664, + "learning_rate": 3.847141951061032e-06, + "loss": 0.1152, + "step": 6965 + }, + { + "epoch": 0.59, + "grad_norm": 0.4692006955601725, + "learning_rate": 3.845814158217495e-06, + "loss": 0.0904, + "step": 6966 + }, + { + "epoch": 0.59, + "grad_norm": 0.43762383087876533, + "learning_rate": 3.844486451344445e-06, + "loss": 0.0853, + "step": 6967 + }, + { + "epoch": 0.59, + "grad_norm": 0.3706605262386526, + "learning_rate": 3.843158830540777e-06, + "loss": 0.0963, + "step": 6968 + }, + { + "epoch": 0.59, + "grad_norm": 0.4985092999289467, + "learning_rate": 3.841831295905377e-06, + "loss": 0.1026, + "step": 6969 + }, + { + "epoch": 0.59, + "grad_norm": 0.17626772584460504, + "learning_rate": 3.840503847537127e-06, + "loss": 0.0348, + "step": 6970 + }, + { + "epoch": 0.59, + "grad_norm": 0.33472016390239234, + "learning_rate": 3.839176485534908e-06, + "loss": 0.0805, + "step": 6971 + }, + { + "epoch": 0.59, + "grad_norm": 0.4074588277535488, + "learning_rate": 3.837849209997586e-06, + "loss": 0.1013, + "step": 6972 + }, + { + "epoch": 0.59, + "grad_norm": 0.3219608630924946, + "learning_rate": 3.8365220210240225e-06, + "loss": 0.0837, + "step": 6973 + }, + { + "epoch": 0.59, + "grad_norm": 0.24337186896112342, + "learning_rate": 3.8351949187130795e-06, + "loss": 0.0821, + "step": 6974 + }, + { + "epoch": 0.59, + "grad_norm": 0.6445673547171886, + "learning_rate": 3.8338679031636035e-06, + "loss": 0.0894, + "step": 6975 + }, + { + "epoch": 0.59, + "grad_norm": 0.39257176043469416, + "learning_rate": 3.832540974474438e-06, + "loss": 0.1037, + "step": 6976 + }, + { + "epoch": 0.59, + "grad_norm": 0.33674127779434965, + "learning_rate": 3.831214132744423e-06, + "loss": 0.0524, + "step": 6977 + }, + { + "epoch": 0.59, + "grad_norm": 0.2261301318414963, + "learning_rate": 3.829887378072389e-06, + "loss": 0.0588, + "step": 6978 + }, + { + "epoch": 0.59, + "grad_norm": 0.2155514377233801, + "learning_rate": 3.828560710557159e-06, + "loss": 0.0874, + "step": 6979 + }, + { + "epoch": 0.59, + "grad_norm": 0.3549295515673941, + "learning_rate": 3.8272341302975494e-06, + "loss": 0.0947, + "step": 6980 + }, + { + "epoch": 0.59, + "grad_norm": 0.4557513749274151, + "learning_rate": 3.825907637392375e-06, + "loss": 0.1036, + "step": 6981 + }, + { + "epoch": 0.59, + "grad_norm": 0.2870464472742229, + "learning_rate": 3.82458123194044e-06, + "loss": 0.043, + "step": 6982 + }, + { + "epoch": 0.59, + "grad_norm": 0.4886983894195904, + "learning_rate": 3.8232549140405416e-06, + "loss": 0.1041, + "step": 6983 + }, + { + "epoch": 0.59, + "grad_norm": 0.6183753344049833, + "learning_rate": 3.8219286837914716e-06, + "loss": 0.119, + "step": 6984 + }, + { + "epoch": 0.59, + "grad_norm": 0.31722070462520496, + "learning_rate": 3.820602541292017e-06, + "loss": 0.0972, + "step": 6985 + }, + { + "epoch": 0.59, + "grad_norm": 0.2994385694468029, + "learning_rate": 3.819276486640956e-06, + "loss": 0.0953, + "step": 6986 + }, + { + "epoch": 0.59, + "grad_norm": 0.5208278610938926, + "learning_rate": 3.817950519937059e-06, + "loss": 0.0938, + "step": 6987 + }, + { + "epoch": 0.59, + "grad_norm": 0.39204515448357136, + "learning_rate": 3.816624641279096e-06, + "loss": 0.0947, + "step": 6988 + }, + { + "epoch": 0.59, + "grad_norm": 0.2846704213515172, + "learning_rate": 3.81529885076582e-06, + "loss": 0.0706, + "step": 6989 + }, + { + "epoch": 0.59, + "grad_norm": 0.32309530962331634, + "learning_rate": 3.813973148495988e-06, + "loss": 0.0953, + "step": 6990 + }, + { + "epoch": 0.59, + "grad_norm": 0.4249505702345002, + "learning_rate": 3.8126475345683455e-06, + "loss": 0.0768, + "step": 6991 + }, + { + "epoch": 0.59, + "grad_norm": 0.3563129260604548, + "learning_rate": 3.8113220090816323e-06, + "loss": 0.0864, + "step": 6992 + }, + { + "epoch": 0.59, + "grad_norm": 0.28453006800038677, + "learning_rate": 3.8099965721345777e-06, + "loss": 0.0733, + "step": 6993 + }, + { + "epoch": 0.59, + "grad_norm": 0.21862895412323283, + "learning_rate": 3.8086712238259134e-06, + "loss": 0.0456, + "step": 6994 + }, + { + "epoch": 0.59, + "grad_norm": 0.3060886413087305, + "learning_rate": 3.8073459642543558e-06, + "loss": 0.07, + "step": 6995 + }, + { + "epoch": 0.59, + "grad_norm": 0.28349790840860656, + "learning_rate": 3.806020793518619e-06, + "loss": 0.0835, + "step": 6996 + }, + { + "epoch": 0.59, + "grad_norm": 0.4748548267099864, + "learning_rate": 3.8046957117174064e-06, + "loss": 0.1155, + "step": 6997 + }, + { + "epoch": 0.59, + "grad_norm": 0.30080223676320494, + "learning_rate": 3.803370718949423e-06, + "loss": 0.0721, + "step": 6998 + }, + { + "epoch": 0.59, + "grad_norm": 0.20757776942535627, + "learning_rate": 3.8020458153133586e-06, + "loss": 0.0395, + "step": 6999 + }, + { + "epoch": 0.59, + "grad_norm": 0.29913997673728926, + "learning_rate": 3.800721000907899e-06, + "loss": 0.0624, + "step": 7000 + }, + { + "epoch": 0.59, + "grad_norm": 0.25747983312633915, + "learning_rate": 3.799396275831727e-06, + "loss": 0.1024, + "step": 7001 + }, + { + "epoch": 0.59, + "grad_norm": 0.24140307001140046, + "learning_rate": 3.798071640183515e-06, + "loss": 0.0554, + "step": 7002 + }, + { + "epoch": 0.59, + "grad_norm": 0.26058474617581845, + "learning_rate": 3.796747094061929e-06, + "loss": 0.0755, + "step": 7003 + }, + { + "epoch": 0.59, + "grad_norm": 0.2793340283786342, + "learning_rate": 3.795422637565626e-06, + "loss": 0.0667, + "step": 7004 + }, + { + "epoch": 0.59, + "grad_norm": 0.25598930970923833, + "learning_rate": 3.794098270793265e-06, + "loss": 0.0477, + "step": 7005 + }, + { + "epoch": 0.59, + "grad_norm": 0.2547974546742003, + "learning_rate": 3.7927739938434893e-06, + "loss": 0.0699, + "step": 7006 + }, + { + "epoch": 0.59, + "grad_norm": 0.3968049901002055, + "learning_rate": 3.7914498068149375e-06, + "loss": 0.1156, + "step": 7007 + }, + { + "epoch": 0.59, + "grad_norm": 0.3243338367429409, + "learning_rate": 3.790125709806246e-06, + "loss": 0.1036, + "step": 7008 + }, + { + "epoch": 0.59, + "grad_norm": 0.4550230170378503, + "learning_rate": 3.78880170291604e-06, + "loss": 0.1179, + "step": 7009 + }, + { + "epoch": 0.59, + "grad_norm": 0.2832282661186303, + "learning_rate": 3.787477786242937e-06, + "loss": 0.0788, + "step": 7010 + }, + { + "epoch": 0.59, + "grad_norm": 0.8166012434149398, + "learning_rate": 3.7861539598855533e-06, + "loss": 0.0881, + "step": 7011 + }, + { + "epoch": 0.59, + "grad_norm": 0.4151586992920546, + "learning_rate": 3.784830223942495e-06, + "loss": 0.0984, + "step": 7012 + }, + { + "epoch": 0.59, + "grad_norm": 0.49025791287037995, + "learning_rate": 3.78350657851236e-06, + "loss": 0.0746, + "step": 7013 + }, + { + "epoch": 0.59, + "grad_norm": 0.3023400108625917, + "learning_rate": 3.782183023693739e-06, + "loss": 0.0964, + "step": 7014 + }, + { + "epoch": 0.59, + "grad_norm": 0.37215096417704824, + "learning_rate": 3.780859559585223e-06, + "loss": 0.0783, + "step": 7015 + }, + { + "epoch": 0.59, + "grad_norm": 0.35959119478526436, + "learning_rate": 3.7795361862853897e-06, + "loss": 0.12, + "step": 7016 + }, + { + "epoch": 0.59, + "grad_norm": 0.17353135646058698, + "learning_rate": 3.7782129038928096e-06, + "loss": 0.039, + "step": 7017 + }, + { + "epoch": 0.59, + "grad_norm": 0.3013421968143348, + "learning_rate": 3.7768897125060494e-06, + "loss": 0.1069, + "step": 7018 + }, + { + "epoch": 0.59, + "grad_norm": 0.3226532495918191, + "learning_rate": 3.77556661222367e-06, + "loss": 0.0762, + "step": 7019 + }, + { + "epoch": 0.59, + "grad_norm": 0.3215069317216213, + "learning_rate": 3.774243603144222e-06, + "loss": 0.1008, + "step": 7020 + }, + { + "epoch": 0.59, + "grad_norm": 0.4862797368380538, + "learning_rate": 3.7729206853662497e-06, + "loss": 0.0659, + "step": 7021 + }, + { + "epoch": 0.59, + "grad_norm": 0.3655197820433912, + "learning_rate": 3.7715978589882944e-06, + "loss": 0.086, + "step": 7022 + }, + { + "epoch": 0.59, + "grad_norm": 0.5036309981146027, + "learning_rate": 3.7702751241088858e-06, + "loss": 0.0991, + "step": 7023 + }, + { + "epoch": 0.59, + "grad_norm": 0.3497530324309186, + "learning_rate": 3.7689524808265494e-06, + "loss": 0.1037, + "step": 7024 + }, + { + "epoch": 0.59, + "grad_norm": 0.2883804973343093, + "learning_rate": 3.7676299292398045e-06, + "loss": 0.0821, + "step": 7025 + }, + { + "epoch": 0.59, + "grad_norm": 0.5536771259550994, + "learning_rate": 3.766307469447161e-06, + "loss": 0.1333, + "step": 7026 + }, + { + "epoch": 0.59, + "grad_norm": 0.30288012043242024, + "learning_rate": 3.7649851015471227e-06, + "loss": 0.0683, + "step": 7027 + }, + { + "epoch": 0.59, + "grad_norm": 0.320755536701085, + "learning_rate": 3.763662825638191e-06, + "loss": 0.0768, + "step": 7028 + }, + { + "epoch": 0.59, + "grad_norm": 0.250016506061455, + "learning_rate": 3.7623406418188533e-06, + "loss": 0.0544, + "step": 7029 + }, + { + "epoch": 0.59, + "grad_norm": 0.23024196427022733, + "learning_rate": 3.761018550187595e-06, + "loss": 0.0897, + "step": 7030 + }, + { + "epoch": 0.59, + "grad_norm": 0.37991801233326405, + "learning_rate": 3.7596965508428907e-06, + "loss": 0.0825, + "step": 7031 + }, + { + "epoch": 0.59, + "grad_norm": 0.5622701616598457, + "learning_rate": 3.7583746438832143e-06, + "loss": 0.0687, + "step": 7032 + }, + { + "epoch": 0.59, + "grad_norm": 0.4121675383586414, + "learning_rate": 3.7570528294070276e-06, + "loss": 0.1009, + "step": 7033 + }, + { + "epoch": 0.59, + "grad_norm": 0.19817967193002944, + "learning_rate": 3.7557311075127846e-06, + "loss": 0.0498, + "step": 7034 + }, + { + "epoch": 0.59, + "grad_norm": 0.33962798292450097, + "learning_rate": 3.7544094782989394e-06, + "loss": 0.0842, + "step": 7035 + }, + { + "epoch": 0.59, + "grad_norm": 0.1967848655131811, + "learning_rate": 3.753087941863932e-06, + "loss": 0.0488, + "step": 7036 + }, + { + "epoch": 0.59, + "grad_norm": 0.32763062678757426, + "learning_rate": 3.751766498306199e-06, + "loss": 0.1026, + "step": 7037 + }, + { + "epoch": 0.59, + "grad_norm": 0.24105474449013065, + "learning_rate": 3.7504451477241667e-06, + "loss": 0.0736, + "step": 7038 + }, + { + "epoch": 0.59, + "grad_norm": 0.2016926848858177, + "learning_rate": 3.749123890216261e-06, + "loss": 0.0543, + "step": 7039 + }, + { + "epoch": 0.59, + "grad_norm": 0.36091709707264324, + "learning_rate": 3.747802725880894e-06, + "loss": 0.1136, + "step": 7040 + }, + { + "epoch": 0.59, + "grad_norm": 0.14312194905759096, + "learning_rate": 3.746481654816473e-06, + "loss": 0.0273, + "step": 7041 + }, + { + "epoch": 0.59, + "grad_norm": 0.4841918383180562, + "learning_rate": 3.7451606771214032e-06, + "loss": 0.0562, + "step": 7042 + }, + { + "epoch": 0.59, + "grad_norm": 0.6499918442721546, + "learning_rate": 3.7438397928940754e-06, + "loss": 0.1426, + "step": 7043 + }, + { + "epoch": 0.59, + "grad_norm": 0.3049548740958857, + "learning_rate": 3.7425190022328763e-06, + "loss": 0.0972, + "step": 7044 + }, + { + "epoch": 0.59, + "grad_norm": 0.20533678197582406, + "learning_rate": 3.7411983052361887e-06, + "loss": 0.0606, + "step": 7045 + }, + { + "epoch": 0.59, + "grad_norm": 0.27686810093525666, + "learning_rate": 3.7398777020023846e-06, + "loss": 0.0443, + "step": 7046 + }, + { + "epoch": 0.59, + "grad_norm": 0.19279331839169905, + "learning_rate": 3.73855719262983e-06, + "loss": 0.0302, + "step": 7047 + }, + { + "epoch": 0.59, + "grad_norm": 0.49075573565532965, + "learning_rate": 3.737236777216882e-06, + "loss": 0.1017, + "step": 7048 + }, + { + "epoch": 0.59, + "grad_norm": 0.5493096636993589, + "learning_rate": 3.7359164558618964e-06, + "loss": 0.1101, + "step": 7049 + }, + { + "epoch": 0.59, + "grad_norm": 0.3773128878578621, + "learning_rate": 3.734596228663218e-06, + "loss": 0.0894, + "step": 7050 + }, + { + "epoch": 0.59, + "grad_norm": 0.5139926133037153, + "learning_rate": 3.7332760957191828e-06, + "loss": 0.0371, + "step": 7051 + }, + { + "epoch": 0.59, + "grad_norm": 0.4056280877174042, + "learning_rate": 3.7319560571281232e-06, + "loss": 0.0779, + "step": 7052 + }, + { + "epoch": 0.59, + "grad_norm": 0.2956182499980726, + "learning_rate": 3.730636112988364e-06, + "loss": 0.0695, + "step": 7053 + }, + { + "epoch": 0.59, + "grad_norm": 0.29453639301224877, + "learning_rate": 3.7293162633982218e-06, + "loss": 0.081, + "step": 7054 + }, + { + "epoch": 0.59, + "grad_norm": 0.4277574571612674, + "learning_rate": 3.7279965084560054e-06, + "loss": 0.0763, + "step": 7055 + }, + { + "epoch": 0.59, + "grad_norm": 0.5507344968431241, + "learning_rate": 3.7266768482600206e-06, + "loss": 0.1561, + "step": 7056 + }, + { + "epoch": 0.59, + "grad_norm": 0.29962664683835755, + "learning_rate": 3.7253572829085604e-06, + "loss": 0.0989, + "step": 7057 + }, + { + "epoch": 0.59, + "grad_norm": 0.40749641174583323, + "learning_rate": 3.7240378124999157e-06, + "loss": 0.0905, + "step": 7058 + }, + { + "epoch": 0.59, + "grad_norm": 0.35887863029084166, + "learning_rate": 3.7227184371323678e-06, + "loss": 0.0859, + "step": 7059 + }, + { + "epoch": 0.59, + "grad_norm": 0.22793340982692317, + "learning_rate": 3.721399156904192e-06, + "loss": 0.0865, + "step": 7060 + }, + { + "epoch": 0.59, + "grad_norm": 0.44455990637492027, + "learning_rate": 3.720079971913654e-06, + "loss": 0.123, + "step": 7061 + }, + { + "epoch": 0.6, + "grad_norm": 0.19123378058978224, + "learning_rate": 3.7187608822590164e-06, + "loss": 0.0594, + "step": 7062 + }, + { + "epoch": 0.6, + "grad_norm": 0.4022764974218865, + "learning_rate": 3.7174418880385333e-06, + "loss": 0.0918, + "step": 7063 + }, + { + "epoch": 0.6, + "grad_norm": 0.22296929274764535, + "learning_rate": 3.716122989350449e-06, + "loss": 0.0364, + "step": 7064 + }, + { + "epoch": 0.6, + "grad_norm": 0.4068852381796377, + "learning_rate": 3.714804186293002e-06, + "loss": 0.0867, + "step": 7065 + }, + { + "epoch": 0.6, + "grad_norm": 0.20005847141779912, + "learning_rate": 3.7134854789644276e-06, + "loss": 0.0592, + "step": 7066 + }, + { + "epoch": 0.6, + "grad_norm": 0.2120790917461066, + "learning_rate": 3.7121668674629497e-06, + "loss": 0.03, + "step": 7067 + }, + { + "epoch": 0.6, + "grad_norm": 0.5852540904333556, + "learning_rate": 3.710848351886783e-06, + "loss": 0.1213, + "step": 7068 + }, + { + "epoch": 0.6, + "grad_norm": 0.38404801998372473, + "learning_rate": 3.709529932334144e-06, + "loss": 0.1191, + "step": 7069 + }, + { + "epoch": 0.6, + "grad_norm": 0.32121427795862706, + "learning_rate": 3.708211608903233e-06, + "loss": 0.0801, + "step": 7070 + }, + { + "epoch": 0.6, + "grad_norm": 0.31588263187885507, + "learning_rate": 3.7068933816922458e-06, + "loss": 0.061, + "step": 7071 + }, + { + "epoch": 0.6, + "grad_norm": 0.21635739226761505, + "learning_rate": 3.7055752507993703e-06, + "loss": 0.0645, + "step": 7072 + }, + { + "epoch": 0.6, + "grad_norm": 0.3935744420770631, + "learning_rate": 3.7042572163227937e-06, + "loss": 0.1009, + "step": 7073 + }, + { + "epoch": 0.6, + "grad_norm": 0.3416488109587413, + "learning_rate": 3.702939278360687e-06, + "loss": 0.0668, + "step": 7074 + }, + { + "epoch": 0.6, + "grad_norm": 0.28693921237537107, + "learning_rate": 3.7016214370112173e-06, + "loss": 0.0516, + "step": 7075 + }, + { + "epoch": 0.6, + "grad_norm": 0.23247866998050992, + "learning_rate": 3.700303692372549e-06, + "loss": 0.0757, + "step": 7076 + }, + { + "epoch": 0.6, + "grad_norm": 0.3260319601780935, + "learning_rate": 3.6989860445428323e-06, + "loss": 0.0991, + "step": 7077 + }, + { + "epoch": 0.6, + "grad_norm": 0.23245566701996806, + "learning_rate": 3.6976684936202116e-06, + "loss": 0.046, + "step": 7078 + }, + { + "epoch": 0.6, + "grad_norm": 0.33818073060730136, + "learning_rate": 3.6963510397028314e-06, + "loss": 0.0852, + "step": 7079 + }, + { + "epoch": 0.6, + "grad_norm": 0.23990985349862584, + "learning_rate": 3.695033682888819e-06, + "loss": 0.0488, + "step": 7080 + }, + { + "epoch": 0.6, + "grad_norm": 0.19898496092289647, + "learning_rate": 3.6937164232763e-06, + "loss": 0.0655, + "step": 7081 + }, + { + "epoch": 0.6, + "grad_norm": 0.334187534008223, + "learning_rate": 3.6923992609633897e-06, + "loss": 0.0913, + "step": 7082 + }, + { + "epoch": 0.6, + "grad_norm": 0.34328319424172987, + "learning_rate": 3.691082196048202e-06, + "loss": 0.0676, + "step": 7083 + }, + { + "epoch": 0.6, + "grad_norm": 0.47413862018557135, + "learning_rate": 3.689765228628837e-06, + "loss": 0.1294, + "step": 7084 + }, + { + "epoch": 0.6, + "grad_norm": 0.3251873131766871, + "learning_rate": 3.68844835880339e-06, + "loss": 0.0795, + "step": 7085 + }, + { + "epoch": 0.6, + "grad_norm": 0.3130937656927123, + "learning_rate": 3.6871315866699487e-06, + "loss": 0.0886, + "step": 7086 + }, + { + "epoch": 0.6, + "grad_norm": 0.22048519587811288, + "learning_rate": 3.6858149123265962e-06, + "loss": 0.0592, + "step": 7087 + }, + { + "epoch": 0.6, + "grad_norm": 0.2401010322221783, + "learning_rate": 3.6844983358714047e-06, + "loss": 0.0629, + "step": 7088 + }, + { + "epoch": 0.6, + "grad_norm": 0.29585843772610343, + "learning_rate": 3.6831818574024405e-06, + "loss": 0.0791, + "step": 7089 + }, + { + "epoch": 0.6, + "grad_norm": 0.44718787829652257, + "learning_rate": 3.6818654770177626e-06, + "loss": 0.0872, + "step": 7090 + }, + { + "epoch": 0.6, + "grad_norm": 0.24341815348838447, + "learning_rate": 3.6805491948154214e-06, + "loss": 0.0748, + "step": 7091 + }, + { + "epoch": 0.6, + "grad_norm": 0.3054959231265354, + "learning_rate": 3.679233010893463e-06, + "loss": 0.0886, + "step": 7092 + }, + { + "epoch": 0.6, + "grad_norm": 0.277906863215993, + "learning_rate": 3.677916925349925e-06, + "loss": 0.0851, + "step": 7093 + }, + { + "epoch": 0.6, + "grad_norm": 0.2865706191090071, + "learning_rate": 3.676600938282836e-06, + "loss": 0.0603, + "step": 7094 + }, + { + "epoch": 0.6, + "grad_norm": 0.34437711121338116, + "learning_rate": 3.6752850497902174e-06, + "loss": 0.0672, + "step": 7095 + }, + { + "epoch": 0.6, + "grad_norm": 0.4460454863222348, + "learning_rate": 3.673969259970086e-06, + "loss": 0.0942, + "step": 7096 + }, + { + "epoch": 0.6, + "grad_norm": 0.5874193689607091, + "learning_rate": 3.672653568920449e-06, + "loss": 0.1072, + "step": 7097 + }, + { + "epoch": 0.6, + "grad_norm": 0.3286084095064521, + "learning_rate": 3.671337976739307e-06, + "loss": 0.0953, + "step": 7098 + }, + { + "epoch": 0.6, + "grad_norm": 0.602485039120254, + "learning_rate": 3.67002248352465e-06, + "loss": 0.1131, + "step": 7099 + }, + { + "epoch": 0.6, + "grad_norm": 0.3137692093687027, + "learning_rate": 3.6687070893744685e-06, + "loss": 0.1204, + "step": 7100 + }, + { + "epoch": 0.6, + "grad_norm": 0.39656881011082407, + "learning_rate": 3.6673917943867386e-06, + "loss": 0.0951, + "step": 7101 + }, + { + "epoch": 0.6, + "grad_norm": 0.26946064492873245, + "learning_rate": 3.6660765986594282e-06, + "loss": 0.0761, + "step": 7102 + }, + { + "epoch": 0.6, + "grad_norm": 0.27844623347034936, + "learning_rate": 3.664761502290506e-06, + "loss": 0.094, + "step": 7103 + }, + { + "epoch": 0.6, + "grad_norm": 0.47889147465179843, + "learning_rate": 3.6634465053779257e-06, + "loss": 0.0938, + "step": 7104 + }, + { + "epoch": 0.6, + "grad_norm": 0.27852307460161446, + "learning_rate": 3.662131608019635e-06, + "loss": 0.0901, + "step": 7105 + }, + { + "epoch": 0.6, + "grad_norm": 0.42453675918345757, + "learning_rate": 3.6608168103135737e-06, + "loss": 0.1019, + "step": 7106 + }, + { + "epoch": 0.6, + "grad_norm": 0.21032317166867096, + "learning_rate": 3.6595021123576803e-06, + "loss": 0.0685, + "step": 7107 + }, + { + "epoch": 0.6, + "grad_norm": 0.32767050823324617, + "learning_rate": 3.658187514249878e-06, + "loss": 0.0906, + "step": 7108 + }, + { + "epoch": 0.6, + "grad_norm": 0.296831715977161, + "learning_rate": 3.656873016088085e-06, + "loss": 0.0788, + "step": 7109 + }, + { + "epoch": 0.6, + "grad_norm": 0.32933418030133305, + "learning_rate": 3.655558617970215e-06, + "loss": 0.0853, + "step": 7110 + }, + { + "epoch": 0.6, + "grad_norm": 0.271136525418101, + "learning_rate": 3.654244319994172e-06, + "loss": 0.0702, + "step": 7111 + }, + { + "epoch": 0.6, + "grad_norm": 0.2462001482687036, + "learning_rate": 3.6529301222578517e-06, + "loss": 0.0437, + "step": 7112 + }, + { + "epoch": 0.6, + "grad_norm": 0.3594241603997126, + "learning_rate": 3.6516160248591404e-06, + "loss": 0.0631, + "step": 7113 + }, + { + "epoch": 0.6, + "grad_norm": 0.46842087785984116, + "learning_rate": 3.6503020278959244e-06, + "loss": 0.0755, + "step": 7114 + }, + { + "epoch": 0.6, + "grad_norm": 0.45674193330505825, + "learning_rate": 3.648988131466077e-06, + "loss": 0.1104, + "step": 7115 + }, + { + "epoch": 0.6, + "grad_norm": 0.344206651257694, + "learning_rate": 3.647674335667461e-06, + "loss": 0.1088, + "step": 7116 + }, + { + "epoch": 0.6, + "grad_norm": 0.3357409102647396, + "learning_rate": 3.6463606405979403e-06, + "loss": 0.1002, + "step": 7117 + }, + { + "epoch": 0.6, + "grad_norm": 0.2690026049259225, + "learning_rate": 3.645047046355365e-06, + "loss": 0.069, + "step": 7118 + }, + { + "epoch": 0.6, + "grad_norm": 0.30496112825872285, + "learning_rate": 3.6437335530375778e-06, + "loss": 0.1042, + "step": 7119 + }, + { + "epoch": 0.6, + "grad_norm": 0.20285205233021208, + "learning_rate": 3.6424201607424163e-06, + "loss": 0.0448, + "step": 7120 + }, + { + "epoch": 0.6, + "grad_norm": 0.23580418383731042, + "learning_rate": 3.641106869567711e-06, + "loss": 0.0625, + "step": 7121 + }, + { + "epoch": 0.6, + "grad_norm": 0.2947611114259498, + "learning_rate": 3.639793679611282e-06, + "loss": 0.0822, + "step": 7122 + }, + { + "epoch": 0.6, + "grad_norm": 0.29643890819337687, + "learning_rate": 3.638480590970942e-06, + "loss": 0.0825, + "step": 7123 + }, + { + "epoch": 0.6, + "grad_norm": 0.3693671498326471, + "learning_rate": 3.637167603744501e-06, + "loss": 0.0886, + "step": 7124 + }, + { + "epoch": 0.6, + "grad_norm": 0.44237181526538577, + "learning_rate": 3.635854718029754e-06, + "loss": 0.0906, + "step": 7125 + }, + { + "epoch": 0.6, + "grad_norm": 0.3420100536857426, + "learning_rate": 3.6345419339244946e-06, + "loss": 0.0714, + "step": 7126 + }, + { + "epoch": 0.6, + "grad_norm": 0.3103941523748551, + "learning_rate": 3.6332292515265078e-06, + "loss": 0.1064, + "step": 7127 + }, + { + "epoch": 0.6, + "grad_norm": 0.4541477247644959, + "learning_rate": 3.6319166709335684e-06, + "loss": 0.0994, + "step": 7128 + }, + { + "epoch": 0.6, + "grad_norm": 0.3192004727331564, + "learning_rate": 3.6306041922434436e-06, + "loss": 0.0851, + "step": 7129 + }, + { + "epoch": 0.6, + "grad_norm": 0.851652154574873, + "learning_rate": 3.629291815553896e-06, + "loss": 0.126, + "step": 7130 + }, + { + "epoch": 0.6, + "grad_norm": 0.3196016860985171, + "learning_rate": 3.62797954096268e-06, + "loss": 0.083, + "step": 7131 + }, + { + "epoch": 0.6, + "grad_norm": 0.3360810262656064, + "learning_rate": 3.62666736856754e-06, + "loss": 0.0769, + "step": 7132 + }, + { + "epoch": 0.6, + "grad_norm": 0.32071694812099405, + "learning_rate": 3.625355298466213e-06, + "loss": 0.0977, + "step": 7133 + }, + { + "epoch": 0.6, + "grad_norm": 0.2502376544929433, + "learning_rate": 3.6240433307564337e-06, + "loss": 0.0789, + "step": 7134 + }, + { + "epoch": 0.6, + "grad_norm": 0.4073455920283119, + "learning_rate": 3.6227314655359226e-06, + "loss": 0.0812, + "step": 7135 + }, + { + "epoch": 0.6, + "grad_norm": 0.2160006907522294, + "learning_rate": 3.6214197029023923e-06, + "loss": 0.0559, + "step": 7136 + }, + { + "epoch": 0.6, + "grad_norm": 0.27085113965736396, + "learning_rate": 3.6201080429535563e-06, + "loss": 0.105, + "step": 7137 + }, + { + "epoch": 0.6, + "grad_norm": 0.41180008480737884, + "learning_rate": 3.6187964857871107e-06, + "loss": 0.1044, + "step": 7138 + }, + { + "epoch": 0.6, + "grad_norm": 0.36068851672146485, + "learning_rate": 3.61748503150075e-06, + "loss": 0.0792, + "step": 7139 + }, + { + "epoch": 0.6, + "grad_norm": 0.20841773566430538, + "learning_rate": 3.6161736801921554e-06, + "loss": 0.0702, + "step": 7140 + }, + { + "epoch": 0.6, + "grad_norm": 0.27459567741339497, + "learning_rate": 3.6148624319590083e-06, + "loss": 0.0618, + "step": 7141 + }, + { + "epoch": 0.6, + "grad_norm": 0.3977736107589453, + "learning_rate": 3.6135512868989765e-06, + "loss": 0.0973, + "step": 7142 + }, + { + "epoch": 0.6, + "grad_norm": 0.420698169307046, + "learning_rate": 3.6122402451097192e-06, + "loss": 0.0862, + "step": 7143 + }, + { + "epoch": 0.6, + "grad_norm": 0.3406006623975837, + "learning_rate": 3.610929306688895e-06, + "loss": 0.0885, + "step": 7144 + }, + { + "epoch": 0.6, + "grad_norm": 0.28569623738410815, + "learning_rate": 3.6096184717341483e-06, + "loss": 0.0759, + "step": 7145 + }, + { + "epoch": 0.6, + "grad_norm": 0.2618131131678179, + "learning_rate": 3.6083077403431165e-06, + "loss": 0.0681, + "step": 7146 + }, + { + "epoch": 0.6, + "grad_norm": 0.3710443493857251, + "learning_rate": 3.6069971126134306e-06, + "loss": 0.0626, + "step": 7147 + }, + { + "epoch": 0.6, + "grad_norm": 0.41453698327334637, + "learning_rate": 3.605686588642716e-06, + "loss": 0.124, + "step": 7148 + }, + { + "epoch": 0.6, + "grad_norm": 0.16193254323420794, + "learning_rate": 3.604376168528587e-06, + "loss": 0.0345, + "step": 7149 + }, + { + "epoch": 0.6, + "grad_norm": 0.2696876898211043, + "learning_rate": 3.603065852368648e-06, + "loss": 0.0773, + "step": 7150 + }, + { + "epoch": 0.6, + "grad_norm": 0.24805561066795712, + "learning_rate": 3.6017556402605054e-06, + "loss": 0.0617, + "step": 7151 + }, + { + "epoch": 0.6, + "grad_norm": 0.25096618503627316, + "learning_rate": 3.6004455323017473e-06, + "loss": 0.048, + "step": 7152 + }, + { + "epoch": 0.6, + "grad_norm": 0.2558511966307502, + "learning_rate": 3.599135528589958e-06, + "loss": 0.0824, + "step": 7153 + }, + { + "epoch": 0.6, + "grad_norm": 0.2836724332371867, + "learning_rate": 3.5978256292227152e-06, + "loss": 0.06, + "step": 7154 + }, + { + "epoch": 0.6, + "grad_norm": 0.4096538406473501, + "learning_rate": 3.596515834297589e-06, + "loss": 0.0924, + "step": 7155 + }, + { + "epoch": 0.6, + "grad_norm": 0.5103851424960107, + "learning_rate": 3.59520614391214e-06, + "loss": 0.0988, + "step": 7156 + }, + { + "epoch": 0.6, + "grad_norm": 0.3657133228520561, + "learning_rate": 3.5938965581639195e-06, + "loss": 0.0661, + "step": 7157 + }, + { + "epoch": 0.6, + "grad_norm": 0.3912204006862977, + "learning_rate": 3.592587077150475e-06, + "loss": 0.1171, + "step": 7158 + }, + { + "epoch": 0.6, + "grad_norm": 0.2884624306791856, + "learning_rate": 3.5912777009693435e-06, + "loss": 0.0773, + "step": 7159 + }, + { + "epoch": 0.6, + "grad_norm": 0.5088823110812557, + "learning_rate": 3.589968429718055e-06, + "loss": 0.1177, + "step": 7160 + }, + { + "epoch": 0.6, + "grad_norm": 0.3899696117750577, + "learning_rate": 3.588659263494132e-06, + "loss": 0.0824, + "step": 7161 + }, + { + "epoch": 0.6, + "grad_norm": 0.21575436392551625, + "learning_rate": 3.5873502023950898e-06, + "loss": 0.0577, + "step": 7162 + }, + { + "epoch": 0.6, + "grad_norm": 0.38425591725953806, + "learning_rate": 3.5860412465184324e-06, + "loss": 0.0944, + "step": 7163 + }, + { + "epoch": 0.6, + "grad_norm": 0.32378828785144875, + "learning_rate": 3.58473239596166e-06, + "loss": 0.0919, + "step": 7164 + }, + { + "epoch": 0.6, + "grad_norm": 0.26621206431989625, + "learning_rate": 3.5834236508222643e-06, + "loss": 0.0888, + "step": 7165 + }, + { + "epoch": 0.6, + "grad_norm": 0.4117143748668806, + "learning_rate": 3.582115011197727e-06, + "loss": 0.1014, + "step": 7166 + }, + { + "epoch": 0.6, + "grad_norm": 0.17836120445202577, + "learning_rate": 3.5808064771855216e-06, + "loss": 0.051, + "step": 7167 + }, + { + "epoch": 0.6, + "grad_norm": 0.18995216725015457, + "learning_rate": 3.5794980488831194e-06, + "loss": 0.0615, + "step": 7168 + }, + { + "epoch": 0.6, + "grad_norm": 0.7707197645463754, + "learning_rate": 3.578189726387977e-06, + "loss": 0.0827, + "step": 7169 + }, + { + "epoch": 0.6, + "grad_norm": 0.2828497960713177, + "learning_rate": 3.5768815097975452e-06, + "loss": 0.0933, + "step": 7170 + }, + { + "epoch": 0.6, + "grad_norm": 0.35325781465784833, + "learning_rate": 3.57557339920927e-06, + "loss": 0.0535, + "step": 7171 + }, + { + "epoch": 0.6, + "grad_norm": 0.25975126129292264, + "learning_rate": 3.5742653947205865e-06, + "loss": 0.063, + "step": 7172 + }, + { + "epoch": 0.6, + "grad_norm": 0.2939556017225196, + "learning_rate": 3.572957496428922e-06, + "loss": 0.073, + "step": 7173 + }, + { + "epoch": 0.6, + "grad_norm": 0.3376509869795295, + "learning_rate": 3.5716497044316943e-06, + "loss": 0.0924, + "step": 7174 + }, + { + "epoch": 0.6, + "grad_norm": 0.2495971077391261, + "learning_rate": 3.5703420188263193e-06, + "loss": 0.0531, + "step": 7175 + }, + { + "epoch": 0.6, + "grad_norm": 0.7372077295233995, + "learning_rate": 3.5690344397101983e-06, + "loss": 0.1145, + "step": 7176 + }, + { + "epoch": 0.6, + "grad_norm": 0.306580763620131, + "learning_rate": 3.567726967180727e-06, + "loss": 0.0717, + "step": 7177 + }, + { + "epoch": 0.6, + "grad_norm": 0.34639346707495006, + "learning_rate": 3.566419601335297e-06, + "loss": 0.069, + "step": 7178 + }, + { + "epoch": 0.6, + "grad_norm": 0.2794510385541492, + "learning_rate": 3.5651123422712865e-06, + "loss": 0.0677, + "step": 7179 + }, + { + "epoch": 0.6, + "grad_norm": 0.27498673063775847, + "learning_rate": 3.563805190086067e-06, + "loss": 0.086, + "step": 7180 + }, + { + "epoch": 0.61, + "grad_norm": 0.24367884563086192, + "learning_rate": 3.5624981448770012e-06, + "loss": 0.072, + "step": 7181 + }, + { + "epoch": 0.61, + "grad_norm": 0.26630958905056645, + "learning_rate": 3.56119120674145e-06, + "loss": 0.0846, + "step": 7182 + }, + { + "epoch": 0.61, + "grad_norm": 0.310386756000949, + "learning_rate": 3.5598843757767595e-06, + "loss": 0.0578, + "step": 7183 + }, + { + "epoch": 0.61, + "grad_norm": 0.41210458038505293, + "learning_rate": 3.5585776520802673e-06, + "loss": 0.1029, + "step": 7184 + }, + { + "epoch": 0.61, + "grad_norm": 0.3503584387298873, + "learning_rate": 3.557271035749311e-06, + "loss": 0.0519, + "step": 7185 + }, + { + "epoch": 0.61, + "grad_norm": 0.24825980923834484, + "learning_rate": 3.5559645268812126e-06, + "loss": 0.0833, + "step": 7186 + }, + { + "epoch": 0.61, + "grad_norm": 0.38934650203515087, + "learning_rate": 3.5546581255732872e-06, + "loss": 0.0953, + "step": 7187 + }, + { + "epoch": 0.61, + "grad_norm": 0.2910646303609189, + "learning_rate": 3.5533518319228445e-06, + "loss": 0.0938, + "step": 7188 + }, + { + "epoch": 0.61, + "grad_norm": 0.2859802459032237, + "learning_rate": 3.5520456460271858e-06, + "loss": 0.0708, + "step": 7189 + }, + { + "epoch": 0.61, + "grad_norm": 0.24187029811429853, + "learning_rate": 3.550739567983602e-06, + "loss": 0.0553, + "step": 7190 + }, + { + "epoch": 0.61, + "grad_norm": 0.27932892510458834, + "learning_rate": 3.5494335978893766e-06, + "loss": 0.058, + "step": 7191 + }, + { + "epoch": 0.61, + "grad_norm": 0.3030468633946572, + "learning_rate": 3.548127735841789e-06, + "loss": 0.0542, + "step": 7192 + }, + { + "epoch": 0.61, + "grad_norm": 0.510280397826606, + "learning_rate": 3.5468219819381035e-06, + "loss": 0.0868, + "step": 7193 + }, + { + "epoch": 0.61, + "grad_norm": 0.2569317369229568, + "learning_rate": 3.545516336275582e-06, + "loss": 0.0614, + "step": 7194 + }, + { + "epoch": 0.61, + "grad_norm": 1.1593744124521028, + "learning_rate": 3.5442107989514783e-06, + "loss": 0.183, + "step": 7195 + }, + { + "epoch": 0.61, + "grad_norm": 0.6709256031053239, + "learning_rate": 3.5429053700630343e-06, + "loss": 0.0887, + "step": 7196 + }, + { + "epoch": 0.61, + "grad_norm": 0.2781811730022812, + "learning_rate": 3.5416000497074866e-06, + "loss": 0.0879, + "step": 7197 + }, + { + "epoch": 0.61, + "grad_norm": 0.616379470887631, + "learning_rate": 3.5402948379820623e-06, + "loss": 0.0972, + "step": 7198 + }, + { + "epoch": 0.61, + "grad_norm": 0.3660574288429818, + "learning_rate": 3.538989734983983e-06, + "loss": 0.0858, + "step": 7199 + }, + { + "epoch": 0.61, + "grad_norm": 0.26300874629795823, + "learning_rate": 3.537684740810459e-06, + "loss": 0.0662, + "step": 7200 + }, + { + "epoch": 0.61, + "grad_norm": 0.23966330082773055, + "learning_rate": 3.5363798555586924e-06, + "loss": 0.0569, + "step": 7201 + }, + { + "epoch": 0.61, + "grad_norm": 0.3735382078055617, + "learning_rate": 3.5350750793258825e-06, + "loss": 0.0989, + "step": 7202 + }, + { + "epoch": 0.61, + "grad_norm": 0.36859260403936434, + "learning_rate": 3.5337704122092144e-06, + "loss": 0.1041, + "step": 7203 + }, + { + "epoch": 0.61, + "grad_norm": 0.24370026362096808, + "learning_rate": 3.532465854305867e-06, + "loss": 0.0866, + "step": 7204 + }, + { + "epoch": 0.61, + "grad_norm": 0.38199543199599034, + "learning_rate": 3.53116140571301e-06, + "loss": 0.1313, + "step": 7205 + }, + { + "epoch": 0.61, + "grad_norm": 0.38612073418466125, + "learning_rate": 3.5298570665278106e-06, + "loss": 0.0952, + "step": 7206 + }, + { + "epoch": 0.61, + "grad_norm": 0.4394731385307924, + "learning_rate": 3.528552836847421e-06, + "loss": 0.106, + "step": 7207 + }, + { + "epoch": 0.61, + "grad_norm": 0.3529553057936811, + "learning_rate": 3.527248716768985e-06, + "loss": 0.0782, + "step": 7208 + }, + { + "epoch": 0.61, + "grad_norm": 0.4342915743036655, + "learning_rate": 3.5259447063896468e-06, + "loss": 0.1164, + "step": 7209 + }, + { + "epoch": 0.61, + "grad_norm": 0.3296869251887252, + "learning_rate": 3.524640805806534e-06, + "loss": 0.05, + "step": 7210 + }, + { + "epoch": 0.61, + "grad_norm": 0.19725458332355078, + "learning_rate": 3.5233370151167664e-06, + "loss": 0.0493, + "step": 7211 + }, + { + "epoch": 0.61, + "grad_norm": 0.3578378313893545, + "learning_rate": 3.522033334417462e-06, + "loss": 0.0802, + "step": 7212 + }, + { + "epoch": 0.61, + "grad_norm": 0.3629770845891983, + "learning_rate": 3.520729763805725e-06, + "loss": 0.1133, + "step": 7213 + }, + { + "epoch": 0.61, + "grad_norm": 0.26450723990271996, + "learning_rate": 3.519426303378652e-06, + "loss": 0.0684, + "step": 7214 + }, + { + "epoch": 0.61, + "grad_norm": 0.36609275708141564, + "learning_rate": 3.5181229532333304e-06, + "loss": 0.0967, + "step": 7215 + }, + { + "epoch": 0.61, + "grad_norm": 0.21398705665598083, + "learning_rate": 3.516819713466847e-06, + "loss": 0.0496, + "step": 7216 + }, + { + "epoch": 0.61, + "grad_norm": 0.6343095212699651, + "learning_rate": 3.515516584176271e-06, + "loss": 0.1404, + "step": 7217 + }, + { + "epoch": 0.61, + "grad_norm": 0.3184382668078663, + "learning_rate": 3.5142135654586655e-06, + "loss": 0.0748, + "step": 7218 + }, + { + "epoch": 0.61, + "grad_norm": 0.41082450263607484, + "learning_rate": 3.5129106574110916e-06, + "loss": 0.0866, + "step": 7219 + }, + { + "epoch": 0.61, + "grad_norm": 0.20336614906203018, + "learning_rate": 3.5116078601305944e-06, + "loss": 0.07, + "step": 7220 + }, + { + "epoch": 0.61, + "grad_norm": 0.5156052422853074, + "learning_rate": 3.510305173714214e-06, + "loss": 0.1082, + "step": 7221 + }, + { + "epoch": 0.61, + "grad_norm": 0.3787695549469417, + "learning_rate": 3.5090025982589804e-06, + "loss": 0.0957, + "step": 7222 + }, + { + "epoch": 0.61, + "grad_norm": 0.3821850536403129, + "learning_rate": 3.507700133861922e-06, + "loss": 0.1093, + "step": 7223 + }, + { + "epoch": 0.61, + "grad_norm": 0.3570362528167716, + "learning_rate": 3.50639778062005e-06, + "loss": 0.1035, + "step": 7224 + }, + { + "epoch": 0.61, + "grad_norm": 1.0692125173788571, + "learning_rate": 3.505095538630371e-06, + "loss": 0.1579, + "step": 7225 + }, + { + "epoch": 0.61, + "grad_norm": 0.33028632913736616, + "learning_rate": 3.503793407989886e-06, + "loss": 0.0543, + "step": 7226 + }, + { + "epoch": 0.61, + "grad_norm": 0.2511899919211044, + "learning_rate": 3.5024913887955838e-06, + "loss": 0.0647, + "step": 7227 + }, + { + "epoch": 0.61, + "grad_norm": 0.498909533634594, + "learning_rate": 3.5011894811444457e-06, + "loss": 0.094, + "step": 7228 + }, + { + "epoch": 0.61, + "grad_norm": 0.30543285911720114, + "learning_rate": 3.4998876851334473e-06, + "loss": 0.0398, + "step": 7229 + }, + { + "epoch": 0.61, + "grad_norm": 0.27309554345108344, + "learning_rate": 3.498586000859553e-06, + "loss": 0.0597, + "step": 7230 + }, + { + "epoch": 0.61, + "grad_norm": 0.4788962021688593, + "learning_rate": 3.4972844284197194e-06, + "loss": 0.1755, + "step": 7231 + }, + { + "epoch": 0.61, + "grad_norm": 0.33577069238052243, + "learning_rate": 3.495982967910895e-06, + "loss": 0.0862, + "step": 7232 + }, + { + "epoch": 0.61, + "grad_norm": 0.45041548434189355, + "learning_rate": 3.4946816194300214e-06, + "loss": 0.0927, + "step": 7233 + }, + { + "epoch": 0.61, + "grad_norm": 0.23446000809126838, + "learning_rate": 3.49338038307403e-06, + "loss": 0.043, + "step": 7234 + }, + { + "epoch": 0.61, + "grad_norm": 0.5058923573069056, + "learning_rate": 3.4920792589398423e-06, + "loss": 0.0563, + "step": 7235 + }, + { + "epoch": 0.61, + "grad_norm": 0.34508190081391704, + "learning_rate": 3.4907782471243784e-06, + "loss": 0.0756, + "step": 7236 + }, + { + "epoch": 0.61, + "grad_norm": 0.2168039682729426, + "learning_rate": 3.489477347724542e-06, + "loss": 0.0693, + "step": 7237 + }, + { + "epoch": 0.61, + "grad_norm": 0.8416626268546971, + "learning_rate": 3.488176560837232e-06, + "loss": 0.1531, + "step": 7238 + }, + { + "epoch": 0.61, + "grad_norm": 0.35826822003421627, + "learning_rate": 3.4868758865593374e-06, + "loss": 0.07, + "step": 7239 + }, + { + "epoch": 0.61, + "grad_norm": 0.27368161822874465, + "learning_rate": 3.485575324987743e-06, + "loss": 0.0795, + "step": 7240 + }, + { + "epoch": 0.61, + "grad_norm": 0.2799987437948479, + "learning_rate": 3.4842748762193203e-06, + "loss": 0.0799, + "step": 7241 + }, + { + "epoch": 0.61, + "grad_norm": 0.42396320867636134, + "learning_rate": 3.482974540350933e-06, + "loss": 0.0908, + "step": 7242 + }, + { + "epoch": 0.61, + "grad_norm": 0.3893674761808773, + "learning_rate": 3.481674317479441e-06, + "loss": 0.104, + "step": 7243 + }, + { + "epoch": 0.61, + "grad_norm": 0.30654440707673486, + "learning_rate": 3.4803742077016913e-06, + "loss": 0.0814, + "step": 7244 + }, + { + "epoch": 0.61, + "grad_norm": 0.32677813121857013, + "learning_rate": 3.4790742111145206e-06, + "loss": 0.0684, + "step": 7245 + }, + { + "epoch": 0.61, + "grad_norm": 0.25611821595628015, + "learning_rate": 3.4777743278147645e-06, + "loss": 0.0664, + "step": 7246 + }, + { + "epoch": 0.61, + "grad_norm": 0.3176130736017971, + "learning_rate": 3.476474557899244e-06, + "loss": 0.0963, + "step": 7247 + }, + { + "epoch": 0.61, + "grad_norm": 0.2528567963636341, + "learning_rate": 3.475174901464774e-06, + "loss": 0.0832, + "step": 7248 + }, + { + "epoch": 0.61, + "grad_norm": 0.5049130924403095, + "learning_rate": 3.473875358608158e-06, + "loss": 0.0704, + "step": 7249 + }, + { + "epoch": 0.61, + "grad_norm": 0.2666556149932916, + "learning_rate": 3.472575929426197e-06, + "loss": 0.0729, + "step": 7250 + }, + { + "epoch": 0.61, + "grad_norm": 0.6513495967114249, + "learning_rate": 3.47127661401568e-06, + "loss": 0.1104, + "step": 7251 + }, + { + "epoch": 0.61, + "grad_norm": 0.5802113438065355, + "learning_rate": 3.469977412473383e-06, + "loss": 0.102, + "step": 7252 + }, + { + "epoch": 0.61, + "grad_norm": 0.24800736729906492, + "learning_rate": 3.4686783248960843e-06, + "loss": 0.0675, + "step": 7253 + }, + { + "epoch": 0.61, + "grad_norm": 0.2271687024284112, + "learning_rate": 3.467379351380544e-06, + "loss": 0.0666, + "step": 7254 + }, + { + "epoch": 0.61, + "grad_norm": 0.21486998138012595, + "learning_rate": 3.4660804920235184e-06, + "loss": 0.0903, + "step": 7255 + }, + { + "epoch": 0.61, + "grad_norm": 0.2933283723673711, + "learning_rate": 3.464781746921751e-06, + "loss": 0.0674, + "step": 7256 + }, + { + "epoch": 0.61, + "grad_norm": 0.5886895327579774, + "learning_rate": 3.4634831161719847e-06, + "loss": 0.0994, + "step": 7257 + }, + { + "epoch": 0.61, + "grad_norm": 0.339142436612873, + "learning_rate": 3.462184599870947e-06, + "loss": 0.0444, + "step": 7258 + }, + { + "epoch": 0.61, + "grad_norm": 0.2788659062621638, + "learning_rate": 3.460886198115357e-06, + "loss": 0.0746, + "step": 7259 + }, + { + "epoch": 0.61, + "grad_norm": 0.3195635724142746, + "learning_rate": 3.4595879110019306e-06, + "loss": 0.0966, + "step": 7260 + }, + { + "epoch": 0.61, + "grad_norm": 0.33568624617670695, + "learning_rate": 3.4582897386273683e-06, + "loss": 0.0605, + "step": 7261 + }, + { + "epoch": 0.61, + "grad_norm": 0.2729839246012114, + "learning_rate": 3.456991681088368e-06, + "loss": 0.0808, + "step": 7262 + }, + { + "epoch": 0.61, + "grad_norm": 0.32401307201830587, + "learning_rate": 3.4556937384816167e-06, + "loss": 0.0607, + "step": 7263 + }, + { + "epoch": 0.61, + "grad_norm": 0.19311700523388095, + "learning_rate": 3.4543959109037927e-06, + "loss": 0.0573, + "step": 7264 + }, + { + "epoch": 0.61, + "grad_norm": 1.1199505208179195, + "learning_rate": 3.453098198451563e-06, + "loss": 0.0815, + "step": 7265 + }, + { + "epoch": 0.61, + "grad_norm": 0.34774625334082765, + "learning_rate": 3.4518006012215915e-06, + "loss": 0.0553, + "step": 7266 + }, + { + "epoch": 0.61, + "grad_norm": 0.44682756864854534, + "learning_rate": 3.450503119310531e-06, + "loss": 0.0796, + "step": 7267 + }, + { + "epoch": 0.61, + "grad_norm": 0.28661969624421535, + "learning_rate": 3.4492057528150245e-06, + "loss": 0.1129, + "step": 7268 + }, + { + "epoch": 0.61, + "grad_norm": 0.27184027153051976, + "learning_rate": 3.447908501831706e-06, + "loss": 0.0645, + "step": 7269 + }, + { + "epoch": 0.61, + "grad_norm": 0.3619101616059619, + "learning_rate": 3.446611366457206e-06, + "loss": 0.0748, + "step": 7270 + }, + { + "epoch": 0.61, + "grad_norm": 0.2830447566215545, + "learning_rate": 3.44531434678814e-06, + "loss": 0.1141, + "step": 7271 + }, + { + "epoch": 0.61, + "grad_norm": 0.33487652330292617, + "learning_rate": 3.4440174429211186e-06, + "loss": 0.0944, + "step": 7272 + }, + { + "epoch": 0.61, + "grad_norm": 0.3159947315615128, + "learning_rate": 3.4427206549527404e-06, + "loss": 0.0724, + "step": 7273 + }, + { + "epoch": 0.61, + "grad_norm": 0.3927528037145045, + "learning_rate": 3.4414239829796014e-06, + "loss": 0.1437, + "step": 7274 + }, + { + "epoch": 0.61, + "grad_norm": 0.5889045046118133, + "learning_rate": 3.4401274270982833e-06, + "loss": 0.1242, + "step": 7275 + }, + { + "epoch": 0.61, + "grad_norm": 0.23341212144436196, + "learning_rate": 3.4388309874053593e-06, + "loss": 0.0468, + "step": 7276 + }, + { + "epoch": 0.61, + "grad_norm": 0.27156731859335653, + "learning_rate": 3.4375346639974005e-06, + "loss": 0.1025, + "step": 7277 + }, + { + "epoch": 0.61, + "grad_norm": 0.20075248458132203, + "learning_rate": 3.436238456970962e-06, + "loss": 0.0471, + "step": 7278 + }, + { + "epoch": 0.61, + "grad_norm": 0.3265523324186541, + "learning_rate": 3.43494236642259e-06, + "loss": 0.1012, + "step": 7279 + }, + { + "epoch": 0.61, + "grad_norm": 0.1911025447309183, + "learning_rate": 3.43364639244883e-06, + "loss": 0.0369, + "step": 7280 + }, + { + "epoch": 0.61, + "grad_norm": 0.34143365340398263, + "learning_rate": 3.4323505351462116e-06, + "loss": 0.0783, + "step": 7281 + }, + { + "epoch": 0.61, + "grad_norm": 0.35100147408668453, + "learning_rate": 3.4310547946112573e-06, + "loss": 0.0902, + "step": 7282 + }, + { + "epoch": 0.61, + "grad_norm": 0.2238889101175209, + "learning_rate": 3.42975917094048e-06, + "loss": 0.0701, + "step": 7283 + }, + { + "epoch": 0.61, + "grad_norm": 0.37107890748805655, + "learning_rate": 3.4284636642303885e-06, + "loss": 0.0644, + "step": 7284 + }, + { + "epoch": 0.61, + "grad_norm": 0.7028317820048504, + "learning_rate": 3.4271682745774786e-06, + "loss": 0.143, + "step": 7285 + }, + { + "epoch": 0.61, + "grad_norm": 0.4752189097069653, + "learning_rate": 3.425873002078236e-06, + "loss": 0.1018, + "step": 7286 + }, + { + "epoch": 0.61, + "grad_norm": 0.3003558705325369, + "learning_rate": 3.424577846829144e-06, + "loss": 0.0879, + "step": 7287 + }, + { + "epoch": 0.61, + "grad_norm": 0.31777370239077735, + "learning_rate": 3.423282808926671e-06, + "loss": 0.0887, + "step": 7288 + }, + { + "epoch": 0.61, + "grad_norm": 0.17369588604691094, + "learning_rate": 3.4219878884672807e-06, + "loss": 0.0247, + "step": 7289 + }, + { + "epoch": 0.61, + "grad_norm": 0.25601900319042425, + "learning_rate": 3.4206930855474217e-06, + "loss": 0.0648, + "step": 7290 + }, + { + "epoch": 0.61, + "grad_norm": 0.5064267884878119, + "learning_rate": 3.419398400263545e-06, + "loss": 0.0789, + "step": 7291 + }, + { + "epoch": 0.61, + "grad_norm": 0.2507583373150979, + "learning_rate": 3.4181038327120822e-06, + "loss": 0.0812, + "step": 7292 + }, + { + "epoch": 0.61, + "grad_norm": 0.3658612007364356, + "learning_rate": 3.416809382989461e-06, + "loss": 0.0954, + "step": 7293 + }, + { + "epoch": 0.61, + "grad_norm": 0.2817766497967344, + "learning_rate": 3.4155150511921005e-06, + "loss": 0.0841, + "step": 7294 + }, + { + "epoch": 0.61, + "grad_norm": 0.28812257852455986, + "learning_rate": 3.4142208374164077e-06, + "loss": 0.0714, + "step": 7295 + }, + { + "epoch": 0.61, + "grad_norm": 0.4013677716460676, + "learning_rate": 3.412926741758785e-06, + "loss": 0.1409, + "step": 7296 + }, + { + "epoch": 0.61, + "grad_norm": 0.31206205628267586, + "learning_rate": 3.4116327643156256e-06, + "loss": 0.0467, + "step": 7297 + }, + { + "epoch": 0.61, + "grad_norm": 0.2916657268965481, + "learning_rate": 3.4103389051833104e-06, + "loss": 0.0486, + "step": 7298 + }, + { + "epoch": 0.62, + "grad_norm": 0.1984090740492212, + "learning_rate": 3.409045164458213e-06, + "loss": 0.0674, + "step": 7299 + }, + { + "epoch": 0.62, + "grad_norm": 0.472677919117807, + "learning_rate": 3.407751542236699e-06, + "loss": 0.123, + "step": 7300 + }, + { + "epoch": 0.62, + "grad_norm": 0.24128546558172273, + "learning_rate": 3.406458038615128e-06, + "loss": 0.0506, + "step": 7301 + }, + { + "epoch": 0.62, + "grad_norm": 0.2964864172269203, + "learning_rate": 3.405164653689845e-06, + "loss": 0.0929, + "step": 7302 + }, + { + "epoch": 0.62, + "grad_norm": 0.27177509258566596, + "learning_rate": 3.4038713875571865e-06, + "loss": 0.0682, + "step": 7303 + }, + { + "epoch": 0.62, + "grad_norm": 0.22988649633746, + "learning_rate": 3.402578240313488e-06, + "loss": 0.0477, + "step": 7304 + }, + { + "epoch": 0.62, + "grad_norm": 0.39519627710965466, + "learning_rate": 3.401285212055067e-06, + "loss": 0.0949, + "step": 7305 + }, + { + "epoch": 0.62, + "grad_norm": 0.4447553832897663, + "learning_rate": 3.399992302878237e-06, + "loss": 0.13, + "step": 7306 + }, + { + "epoch": 0.62, + "grad_norm": 0.35348263136679137, + "learning_rate": 3.3986995128792995e-06, + "loss": 0.0641, + "step": 7307 + }, + { + "epoch": 0.62, + "grad_norm": 0.26923819017135264, + "learning_rate": 3.397406842154552e-06, + "loss": 0.0962, + "step": 7308 + }, + { + "epoch": 0.62, + "grad_norm": 0.2661925759353993, + "learning_rate": 3.3961142908002788e-06, + "loss": 0.0667, + "step": 7309 + }, + { + "epoch": 0.62, + "grad_norm": 0.2794693465144013, + "learning_rate": 3.3948218589127546e-06, + "loss": 0.1075, + "step": 7310 + }, + { + "epoch": 0.62, + "grad_norm": 0.3559780866132902, + "learning_rate": 3.3935295465882513e-06, + "loss": 0.0776, + "step": 7311 + }, + { + "epoch": 0.62, + "grad_norm": 0.3357754746711033, + "learning_rate": 3.392237353923026e-06, + "loss": 0.1276, + "step": 7312 + }, + { + "epoch": 0.62, + "grad_norm": 0.3534045442914703, + "learning_rate": 3.3909452810133258e-06, + "loss": 0.0973, + "step": 7313 + }, + { + "epoch": 0.62, + "grad_norm": 0.3439437243947871, + "learning_rate": 3.3896533279553965e-06, + "loss": 0.1033, + "step": 7314 + }, + { + "epoch": 0.62, + "grad_norm": 0.18324978723755, + "learning_rate": 3.3883614948454692e-06, + "loss": 0.0426, + "step": 7315 + }, + { + "epoch": 0.62, + "grad_norm": 0.2048284909644386, + "learning_rate": 3.3870697817797647e-06, + "loss": 0.0576, + "step": 7316 + }, + { + "epoch": 0.62, + "grad_norm": 0.35030533290719207, + "learning_rate": 3.385778188854498e-06, + "loss": 0.0855, + "step": 7317 + }, + { + "epoch": 0.62, + "grad_norm": 0.3072156923647518, + "learning_rate": 3.3844867161658775e-06, + "loss": 0.0696, + "step": 7318 + }, + { + "epoch": 0.62, + "grad_norm": 0.36013910883288075, + "learning_rate": 3.3831953638100967e-06, + "loss": 0.0571, + "step": 7319 + }, + { + "epoch": 0.62, + "grad_norm": 0.22589359986472995, + "learning_rate": 3.3819041318833423e-06, + "loss": 0.0537, + "step": 7320 + }, + { + "epoch": 0.62, + "grad_norm": 0.3297858010687959, + "learning_rate": 3.3806130204817967e-06, + "loss": 0.1055, + "step": 7321 + }, + { + "epoch": 0.62, + "grad_norm": 0.497880815879118, + "learning_rate": 3.379322029701626e-06, + "loss": 0.1246, + "step": 7322 + }, + { + "epoch": 0.62, + "grad_norm": 0.19213067700810027, + "learning_rate": 3.3780311596389925e-06, + "loss": 0.0465, + "step": 7323 + }, + { + "epoch": 0.62, + "grad_norm": 0.20977806692211526, + "learning_rate": 3.3767404103900443e-06, + "loss": 0.043, + "step": 7324 + }, + { + "epoch": 0.62, + "grad_norm": 0.25975951229543537, + "learning_rate": 3.3754497820509286e-06, + "loss": 0.0627, + "step": 7325 + }, + { + "epoch": 0.62, + "grad_norm": 0.3013359840330051, + "learning_rate": 3.374159274717777e-06, + "loss": 0.0679, + "step": 7326 + }, + { + "epoch": 0.62, + "grad_norm": 0.3724425507276075, + "learning_rate": 3.372868888486713e-06, + "loss": 0.1111, + "step": 7327 + }, + { + "epoch": 0.62, + "grad_norm": 0.26164632783404873, + "learning_rate": 3.3715786234538535e-06, + "loss": 0.0787, + "step": 7328 + }, + { + "epoch": 0.62, + "grad_norm": 0.2705317428346432, + "learning_rate": 3.3702884797153036e-06, + "loss": 0.0824, + "step": 7329 + }, + { + "epoch": 0.62, + "grad_norm": 0.30002090191429237, + "learning_rate": 3.3689984573671626e-06, + "loss": 0.0675, + "step": 7330 + }, + { + "epoch": 0.62, + "grad_norm": 0.3401056574839956, + "learning_rate": 3.367708556505517e-06, + "loss": 0.1174, + "step": 7331 + }, + { + "epoch": 0.62, + "grad_norm": 0.5492510053705704, + "learning_rate": 3.3664187772264465e-06, + "loss": 0.1041, + "step": 7332 + }, + { + "epoch": 0.62, + "grad_norm": 0.24289093831202652, + "learning_rate": 3.3651291196260226e-06, + "loss": 0.0709, + "step": 7333 + }, + { + "epoch": 0.62, + "grad_norm": 0.31834569301477755, + "learning_rate": 3.3638395838003047e-06, + "loss": 0.0926, + "step": 7334 + }, + { + "epoch": 0.62, + "grad_norm": 0.30027725163024976, + "learning_rate": 3.362550169845348e-06, + "loss": 0.0688, + "step": 7335 + }, + { + "epoch": 0.62, + "grad_norm": 0.35920159354771275, + "learning_rate": 3.361260877857192e-06, + "loss": 0.0993, + "step": 7336 + }, + { + "epoch": 0.62, + "grad_norm": 0.36193896607786297, + "learning_rate": 3.3599717079318705e-06, + "loss": 0.0927, + "step": 7337 + }, + { + "epoch": 0.62, + "grad_norm": 0.4561312958747651, + "learning_rate": 3.358682660165413e-06, + "loss": 0.0454, + "step": 7338 + }, + { + "epoch": 0.62, + "grad_norm": 0.4716281307587733, + "learning_rate": 3.357393734653832e-06, + "loss": 0.0793, + "step": 7339 + }, + { + "epoch": 0.62, + "grad_norm": 0.24014773629796418, + "learning_rate": 3.356104931493134e-06, + "loss": 0.0688, + "step": 7340 + }, + { + "epoch": 0.62, + "grad_norm": 0.17549648648519187, + "learning_rate": 3.354816250779316e-06, + "loss": 0.0182, + "step": 7341 + }, + { + "epoch": 0.62, + "grad_norm": 0.42810442730459763, + "learning_rate": 3.353527692608368e-06, + "loss": 0.0938, + "step": 7342 + }, + { + "epoch": 0.62, + "grad_norm": 0.5372456079457855, + "learning_rate": 3.352239257076271e-06, + "loss": 0.1099, + "step": 7343 + }, + { + "epoch": 0.62, + "grad_norm": 0.4387094212202471, + "learning_rate": 3.350950944278989e-06, + "loss": 0.124, + "step": 7344 + }, + { + "epoch": 0.62, + "grad_norm": 0.27863917736818483, + "learning_rate": 3.3496627543124904e-06, + "loss": 0.0732, + "step": 7345 + }, + { + "epoch": 0.62, + "grad_norm": 0.42412365415940234, + "learning_rate": 3.3483746872727228e-06, + "loss": 0.1084, + "step": 7346 + }, + { + "epoch": 0.62, + "grad_norm": 0.41643913594161863, + "learning_rate": 3.3470867432556307e-06, + "loss": 0.0874, + "step": 7347 + }, + { + "epoch": 0.62, + "grad_norm": 0.28728368779798186, + "learning_rate": 3.345798922357145e-06, + "loss": 0.0773, + "step": 7348 + }, + { + "epoch": 0.62, + "grad_norm": 0.3309504340312473, + "learning_rate": 3.344511224673194e-06, + "loss": 0.0894, + "step": 7349 + }, + { + "epoch": 0.62, + "grad_norm": 0.5964580472130235, + "learning_rate": 3.34322365029969e-06, + "loss": 0.1727, + "step": 7350 + }, + { + "epoch": 0.62, + "grad_norm": 0.2896800268124212, + "learning_rate": 3.341936199332538e-06, + "loss": 0.0741, + "step": 7351 + }, + { + "epoch": 0.62, + "grad_norm": 0.2854547072253174, + "learning_rate": 3.34064887186764e-06, + "loss": 0.0778, + "step": 7352 + }, + { + "epoch": 0.62, + "grad_norm": 0.34577572636504195, + "learning_rate": 3.33936166800088e-06, + "loss": 0.1097, + "step": 7353 + }, + { + "epoch": 0.62, + "grad_norm": 0.4139979838833184, + "learning_rate": 3.3380745878281345e-06, + "loss": 0.108, + "step": 7354 + }, + { + "epoch": 0.62, + "grad_norm": 0.29551235409294335, + "learning_rate": 3.3367876314452773e-06, + "loss": 0.1135, + "step": 7355 + }, + { + "epoch": 0.62, + "grad_norm": 0.25174756766885115, + "learning_rate": 3.3355007989481668e-06, + "loss": 0.0565, + "step": 7356 + }, + { + "epoch": 0.62, + "grad_norm": 0.31572549084114787, + "learning_rate": 3.3342140904326524e-06, + "loss": 0.1054, + "step": 7357 + }, + { + "epoch": 0.62, + "grad_norm": 0.21828230464923454, + "learning_rate": 3.3329275059945747e-06, + "loss": 0.0706, + "step": 7358 + }, + { + "epoch": 0.62, + "grad_norm": 0.3904951675509458, + "learning_rate": 3.33164104572977e-06, + "loss": 0.0924, + "step": 7359 + }, + { + "epoch": 0.62, + "grad_norm": 0.2975274293301175, + "learning_rate": 3.3303547097340593e-06, + "loss": 0.0671, + "step": 7360 + }, + { + "epoch": 0.62, + "grad_norm": 0.3702635298505712, + "learning_rate": 3.3290684981032545e-06, + "loss": 0.0706, + "step": 7361 + }, + { + "epoch": 0.62, + "grad_norm": 0.3884181094518574, + "learning_rate": 3.3277824109331636e-06, + "loss": 0.1417, + "step": 7362 + }, + { + "epoch": 0.62, + "grad_norm": 0.30903463681615256, + "learning_rate": 3.3264964483195784e-06, + "loss": 0.1039, + "step": 7363 + }, + { + "epoch": 0.62, + "grad_norm": 0.32391025757051095, + "learning_rate": 3.3252106103582883e-06, + "loss": 0.0811, + "step": 7364 + }, + { + "epoch": 0.62, + "grad_norm": 0.3960958468995765, + "learning_rate": 3.3239248971450667e-06, + "loss": 0.1204, + "step": 7365 + }, + { + "epoch": 0.62, + "grad_norm": 0.3405914240571599, + "learning_rate": 3.3226393087756836e-06, + "loss": 0.0785, + "step": 7366 + }, + { + "epoch": 0.62, + "grad_norm": 0.33048864176932524, + "learning_rate": 3.321353845345896e-06, + "loss": 0.0892, + "step": 7367 + }, + { + "epoch": 0.62, + "grad_norm": 0.4854807937129951, + "learning_rate": 3.3200685069514523e-06, + "loss": 0.0883, + "step": 7368 + }, + { + "epoch": 0.62, + "grad_norm": 0.216819190456203, + "learning_rate": 3.3187832936880937e-06, + "loss": 0.08, + "step": 7369 + }, + { + "epoch": 0.62, + "grad_norm": 0.30879965060969233, + "learning_rate": 3.3174982056515485e-06, + "loss": 0.0863, + "step": 7370 + }, + { + "epoch": 0.62, + "grad_norm": 0.2531302495911648, + "learning_rate": 3.3162132429375372e-06, + "loss": 0.0799, + "step": 7371 + }, + { + "epoch": 0.62, + "grad_norm": 0.31671883728501554, + "learning_rate": 3.3149284056417734e-06, + "loss": 0.1009, + "step": 7372 + }, + { + "epoch": 0.62, + "grad_norm": 0.4015282545285348, + "learning_rate": 3.313643693859959e-06, + "loss": 0.091, + "step": 7373 + }, + { + "epoch": 0.62, + "grad_norm": 0.34851302314913324, + "learning_rate": 3.312359107687786e-06, + "loss": 0.1097, + "step": 7374 + }, + { + "epoch": 0.62, + "grad_norm": 0.2692607578424626, + "learning_rate": 3.3110746472209354e-06, + "loss": 0.067, + "step": 7375 + }, + { + "epoch": 0.62, + "grad_norm": 0.16629148272299638, + "learning_rate": 3.3097903125550857e-06, + "loss": 0.0449, + "step": 7376 + }, + { + "epoch": 0.62, + "grad_norm": 0.24846075841033108, + "learning_rate": 3.3085061037859e-06, + "loss": 0.0761, + "step": 7377 + }, + { + "epoch": 0.62, + "grad_norm": 0.2516934027597921, + "learning_rate": 3.3072220210090315e-06, + "loss": 0.0901, + "step": 7378 + }, + { + "epoch": 0.62, + "grad_norm": 0.38975211102828633, + "learning_rate": 3.30593806432013e-06, + "loss": 0.0575, + "step": 7379 + }, + { + "epoch": 0.62, + "grad_norm": 0.2711966316855158, + "learning_rate": 3.3046542338148302e-06, + "loss": 0.0772, + "step": 7380 + }, + { + "epoch": 0.62, + "grad_norm": 0.23389505861708823, + "learning_rate": 3.303370529588759e-06, + "loss": 0.0425, + "step": 7381 + }, + { + "epoch": 0.62, + "grad_norm": 0.19994936475620134, + "learning_rate": 3.3020869517375327e-06, + "loss": 0.072, + "step": 7382 + }, + { + "epoch": 0.62, + "grad_norm": 0.2328239407172048, + "learning_rate": 3.300803500356763e-06, + "loss": 0.0571, + "step": 7383 + }, + { + "epoch": 0.62, + "grad_norm": 0.3860696656748534, + "learning_rate": 3.299520175542047e-06, + "loss": 0.0909, + "step": 7384 + }, + { + "epoch": 0.62, + "grad_norm": 0.3961233647663579, + "learning_rate": 3.298236977388972e-06, + "loss": 0.0461, + "step": 7385 + }, + { + "epoch": 0.62, + "grad_norm": 0.3327187898042797, + "learning_rate": 3.296953905993123e-06, + "loss": 0.0931, + "step": 7386 + }, + { + "epoch": 0.62, + "grad_norm": 0.2623153457039759, + "learning_rate": 3.295670961450068e-06, + "loss": 0.062, + "step": 7387 + }, + { + "epoch": 0.62, + "grad_norm": 0.2500242220306109, + "learning_rate": 3.2943881438553665e-06, + "loss": 0.0553, + "step": 7388 + }, + { + "epoch": 0.62, + "grad_norm": 0.27799651995084546, + "learning_rate": 3.2931054533045736e-06, + "loss": 0.0861, + "step": 7389 + }, + { + "epoch": 0.62, + "grad_norm": 0.24129785802628034, + "learning_rate": 3.2918228898932304e-06, + "loss": 0.0563, + "step": 7390 + }, + { + "epoch": 0.62, + "grad_norm": 0.3216812873857199, + "learning_rate": 3.2905404537168687e-06, + "loss": 0.0781, + "step": 7391 + }, + { + "epoch": 0.62, + "grad_norm": 0.3016006457518386, + "learning_rate": 3.2892581448710105e-06, + "loss": 0.1051, + "step": 7392 + }, + { + "epoch": 0.62, + "grad_norm": 0.36514021903641075, + "learning_rate": 3.2879759634511736e-06, + "loss": 0.0881, + "step": 7393 + }, + { + "epoch": 0.62, + "grad_norm": 0.579465200900013, + "learning_rate": 3.2866939095528606e-06, + "loss": 0.1168, + "step": 7394 + }, + { + "epoch": 0.62, + "grad_norm": 0.25839963791599485, + "learning_rate": 3.2854119832715637e-06, + "loss": 0.0975, + "step": 7395 + }, + { + "epoch": 0.62, + "grad_norm": 0.27439272810427784, + "learning_rate": 3.284130184702773e-06, + "loss": 0.0777, + "step": 7396 + }, + { + "epoch": 0.62, + "grad_norm": 0.4287215952810872, + "learning_rate": 3.2828485139419598e-06, + "loss": 0.0906, + "step": 7397 + }, + { + "epoch": 0.62, + "grad_norm": 0.44182111832395715, + "learning_rate": 3.281566971084594e-06, + "loss": 0.1299, + "step": 7398 + }, + { + "epoch": 0.62, + "grad_norm": 0.3311099603997046, + "learning_rate": 3.2802855562261293e-06, + "loss": 0.0607, + "step": 7399 + }, + { + "epoch": 0.62, + "grad_norm": 0.2744813227370427, + "learning_rate": 3.2790042694620154e-06, + "loss": 0.077, + "step": 7400 + }, + { + "epoch": 0.62, + "grad_norm": 0.21197500147752987, + "learning_rate": 3.277723110887688e-06, + "loss": 0.0547, + "step": 7401 + }, + { + "epoch": 0.62, + "grad_norm": 0.27168772065251484, + "learning_rate": 3.2764420805985777e-06, + "loss": 0.078, + "step": 7402 + }, + { + "epoch": 0.62, + "grad_norm": 0.2149601811319796, + "learning_rate": 3.275161178690101e-06, + "loss": 0.0747, + "step": 7403 + }, + { + "epoch": 0.62, + "grad_norm": 0.2741569599668653, + "learning_rate": 3.2738804052576683e-06, + "loss": 0.0828, + "step": 7404 + }, + { + "epoch": 0.62, + "grad_norm": 0.30684780417902063, + "learning_rate": 3.2725997603966763e-06, + "loss": 0.082, + "step": 7405 + }, + { + "epoch": 0.62, + "grad_norm": 0.5059134743669398, + "learning_rate": 3.2713192442025195e-06, + "loss": 0.1437, + "step": 7406 + }, + { + "epoch": 0.62, + "grad_norm": 0.4577032491610798, + "learning_rate": 3.2700388567705755e-06, + "loss": 0.1013, + "step": 7407 + }, + { + "epoch": 0.62, + "grad_norm": 0.27872862959896944, + "learning_rate": 3.2687585981962142e-06, + "loss": 0.0756, + "step": 7408 + }, + { + "epoch": 0.62, + "grad_norm": 0.35960748796772535, + "learning_rate": 3.267478468574797e-06, + "loss": 0.1251, + "step": 7409 + }, + { + "epoch": 0.62, + "grad_norm": 0.2883856213497421, + "learning_rate": 3.2661984680016777e-06, + "loss": 0.0582, + "step": 7410 + }, + { + "epoch": 0.62, + "grad_norm": 0.28983736253553816, + "learning_rate": 3.264918596572196e-06, + "loss": 0.0764, + "step": 7411 + }, + { + "epoch": 0.62, + "grad_norm": 0.424724935430722, + "learning_rate": 3.2636388543816846e-06, + "loss": 0.1145, + "step": 7412 + }, + { + "epoch": 0.62, + "grad_norm": 0.17338907510544171, + "learning_rate": 3.2623592415254668e-06, + "loss": 0.0516, + "step": 7413 + }, + { + "epoch": 0.62, + "grad_norm": 0.5885301216754125, + "learning_rate": 3.261079758098855e-06, + "loss": 0.1491, + "step": 7414 + }, + { + "epoch": 0.62, + "grad_norm": 0.4242738373687518, + "learning_rate": 3.2598004041971544e-06, + "loss": 0.0921, + "step": 7415 + }, + { + "epoch": 0.62, + "grad_norm": 0.24088113766842717, + "learning_rate": 3.2585211799156537e-06, + "loss": 0.0702, + "step": 7416 + }, + { + "epoch": 0.62, + "grad_norm": 0.3045936350204951, + "learning_rate": 3.257242085349643e-06, + "loss": 0.0748, + "step": 7417 + }, + { + "epoch": 0.63, + "grad_norm": 0.42868643259688044, + "learning_rate": 3.255963120594394e-06, + "loss": 0.1094, + "step": 7418 + }, + { + "epoch": 0.63, + "grad_norm": 0.28358696502776637, + "learning_rate": 3.2546842857451688e-06, + "loss": 0.088, + "step": 7419 + }, + { + "epoch": 0.63, + "grad_norm": 0.39101465549547637, + "learning_rate": 3.253405580897228e-06, + "loss": 0.1042, + "step": 7420 + }, + { + "epoch": 0.63, + "grad_norm": 0.37056056531663695, + "learning_rate": 3.2521270061458143e-06, + "loss": 0.107, + "step": 7421 + }, + { + "epoch": 0.63, + "grad_norm": 0.45593406653836094, + "learning_rate": 3.2508485615861605e-06, + "loss": 0.0832, + "step": 7422 + }, + { + "epoch": 0.63, + "grad_norm": 0.4008710362891544, + "learning_rate": 3.2495702473134983e-06, + "loss": 0.0933, + "step": 7423 + }, + { + "epoch": 0.63, + "grad_norm": 0.584780999806043, + "learning_rate": 3.248292063423041e-06, + "loss": 0.079, + "step": 7424 + }, + { + "epoch": 0.63, + "grad_norm": 0.2017669908599626, + "learning_rate": 3.2470140100099944e-06, + "loss": 0.0575, + "step": 7425 + }, + { + "epoch": 0.63, + "grad_norm": 0.6526943454967085, + "learning_rate": 3.2457360871695555e-06, + "loss": 0.111, + "step": 7426 + }, + { + "epoch": 0.63, + "grad_norm": 0.5545066672299871, + "learning_rate": 3.2444582949969126e-06, + "loss": 0.1066, + "step": 7427 + }, + { + "epoch": 0.63, + "grad_norm": 0.5836604015333878, + "learning_rate": 3.2431806335872438e-06, + "loss": 0.1675, + "step": 7428 + }, + { + "epoch": 0.63, + "grad_norm": 0.2635620055119552, + "learning_rate": 3.241903103035715e-06, + "loss": 0.0563, + "step": 7429 + }, + { + "epoch": 0.63, + "grad_norm": 0.5406997658665079, + "learning_rate": 3.240625703437485e-06, + "loss": 0.1324, + "step": 7430 + }, + { + "epoch": 0.63, + "grad_norm": 0.25023229935125674, + "learning_rate": 3.239348434887701e-06, + "loss": 0.0337, + "step": 7431 + }, + { + "epoch": 0.63, + "grad_norm": 0.23068573734113001, + "learning_rate": 3.238071297481503e-06, + "loss": 0.0552, + "step": 7432 + }, + { + "epoch": 0.63, + "grad_norm": 0.16806121367917953, + "learning_rate": 3.2367942913140178e-06, + "loss": 0.0508, + "step": 7433 + }, + { + "epoch": 0.63, + "grad_norm": 0.29090898578568836, + "learning_rate": 3.235517416480366e-06, + "loss": 0.1198, + "step": 7434 + }, + { + "epoch": 0.63, + "grad_norm": 0.7812114589650989, + "learning_rate": 3.234240673075655e-06, + "loss": 0.0778, + "step": 7435 + }, + { + "epoch": 0.63, + "grad_norm": 0.32679717562924704, + "learning_rate": 3.2329640611949855e-06, + "loss": 0.0852, + "step": 7436 + }, + { + "epoch": 0.63, + "grad_norm": 0.29535154160652316, + "learning_rate": 3.231687580933447e-06, + "loss": 0.0843, + "step": 7437 + }, + { + "epoch": 0.63, + "grad_norm": 0.24215785554315591, + "learning_rate": 3.230411232386118e-06, + "loss": 0.05, + "step": 7438 + }, + { + "epoch": 0.63, + "grad_norm": 0.2537081196884538, + "learning_rate": 3.2291350156480685e-06, + "loss": 0.0578, + "step": 7439 + }, + { + "epoch": 0.63, + "grad_norm": 0.527436623416943, + "learning_rate": 3.22785893081436e-06, + "loss": 0.1109, + "step": 7440 + }, + { + "epoch": 0.63, + "grad_norm": 0.2469326007158126, + "learning_rate": 3.226582977980042e-06, + "loss": 0.0754, + "step": 7441 + }, + { + "epoch": 0.63, + "grad_norm": 0.24640223940945197, + "learning_rate": 3.225307157240155e-06, + "loss": 0.0602, + "step": 7442 + }, + { + "epoch": 0.63, + "grad_norm": 0.25192200679490573, + "learning_rate": 3.224031468689727e-06, + "loss": 0.0652, + "step": 7443 + }, + { + "epoch": 0.63, + "grad_norm": 0.2965783969795636, + "learning_rate": 3.222755912423783e-06, + "loss": 0.0724, + "step": 7444 + }, + { + "epoch": 0.63, + "grad_norm": 0.3104032394575369, + "learning_rate": 3.221480488537332e-06, + "loss": 0.0968, + "step": 7445 + }, + { + "epoch": 0.63, + "grad_norm": 0.4107924142341617, + "learning_rate": 3.220205197125373e-06, + "loss": 0.1247, + "step": 7446 + }, + { + "epoch": 0.63, + "grad_norm": 0.3294485010096923, + "learning_rate": 3.2189300382829002e-06, + "loss": 0.0796, + "step": 7447 + }, + { + "epoch": 0.63, + "grad_norm": 0.1961269796591617, + "learning_rate": 3.2176550121048944e-06, + "loss": 0.049, + "step": 7448 + }, + { + "epoch": 0.63, + "grad_norm": 0.3046730998588272, + "learning_rate": 3.2163801186863266e-06, + "loss": 0.0939, + "step": 7449 + }, + { + "epoch": 0.63, + "grad_norm": 0.23586704429609778, + "learning_rate": 3.2151053581221545e-06, + "loss": 0.0597, + "step": 7450 + }, + { + "epoch": 0.63, + "grad_norm": 0.26475644537801235, + "learning_rate": 3.2138307305073367e-06, + "loss": 0.0535, + "step": 7451 + }, + { + "epoch": 0.63, + "grad_norm": 0.3310516696459729, + "learning_rate": 3.2125562359368105e-06, + "loss": 0.1155, + "step": 7452 + }, + { + "epoch": 0.63, + "grad_norm": 0.3811301332478338, + "learning_rate": 3.2112818745055065e-06, + "loss": 0.1128, + "step": 7453 + }, + { + "epoch": 0.63, + "grad_norm": 0.4508032513921881, + "learning_rate": 3.2100076463083507e-06, + "loss": 0.1148, + "step": 7454 + }, + { + "epoch": 0.63, + "grad_norm": 0.3237442424519417, + "learning_rate": 3.2087335514402527e-06, + "loss": 0.1137, + "step": 7455 + }, + { + "epoch": 0.63, + "grad_norm": 0.3145742399360536, + "learning_rate": 3.2074595899961146e-06, + "loss": 0.0796, + "step": 7456 + }, + { + "epoch": 0.63, + "grad_norm": 0.32677911581995633, + "learning_rate": 3.206185762070827e-06, + "loss": 0.0794, + "step": 7457 + }, + { + "epoch": 0.63, + "grad_norm": 0.4609880382334431, + "learning_rate": 3.2049120677592755e-06, + "loss": 0.1271, + "step": 7458 + }, + { + "epoch": 0.63, + "grad_norm": 0.32738224998198867, + "learning_rate": 3.2036385071563302e-06, + "loss": 0.0999, + "step": 7459 + }, + { + "epoch": 0.63, + "grad_norm": 0.3748991117333819, + "learning_rate": 3.202365080356851e-06, + "loss": 0.116, + "step": 7460 + }, + { + "epoch": 0.63, + "grad_norm": 0.49097756483682037, + "learning_rate": 3.2010917874556954e-06, + "loss": 0.1191, + "step": 7461 + }, + { + "epoch": 0.63, + "grad_norm": 0.5516323334164326, + "learning_rate": 3.199818628547702e-06, + "loss": 0.1301, + "step": 7462 + }, + { + "epoch": 0.63, + "grad_norm": 0.1945615103618961, + "learning_rate": 3.198545603727704e-06, + "loss": 0.0617, + "step": 7463 + }, + { + "epoch": 0.63, + "grad_norm": 0.3783222525498935, + "learning_rate": 3.197272713090524e-06, + "loss": 0.116, + "step": 7464 + }, + { + "epoch": 0.63, + "grad_norm": 0.24916970996288107, + "learning_rate": 3.195999956730973e-06, + "loss": 0.0861, + "step": 7465 + }, + { + "epoch": 0.63, + "grad_norm": 0.20255172540595803, + "learning_rate": 3.194727334743856e-06, + "loss": 0.0415, + "step": 7466 + }, + { + "epoch": 0.63, + "grad_norm": 0.2838204883826325, + "learning_rate": 3.1934548472239617e-06, + "loss": 0.0674, + "step": 7467 + }, + { + "epoch": 0.63, + "grad_norm": 0.24761518232478513, + "learning_rate": 3.1921824942660763e-06, + "loss": 0.0619, + "step": 7468 + }, + { + "epoch": 0.63, + "grad_norm": 0.40029139447753037, + "learning_rate": 3.190910275964968e-06, + "loss": 0.1098, + "step": 7469 + }, + { + "epoch": 0.63, + "grad_norm": 0.31293603460587804, + "learning_rate": 3.189638192415402e-06, + "loss": 0.0838, + "step": 7470 + }, + { + "epoch": 0.63, + "grad_norm": 0.41762864022561214, + "learning_rate": 3.1883662437121297e-06, + "loss": 0.1005, + "step": 7471 + }, + { + "epoch": 0.63, + "grad_norm": 0.5067572040653646, + "learning_rate": 3.1870944299498928e-06, + "loss": 0.0625, + "step": 7472 + }, + { + "epoch": 0.63, + "grad_norm": 0.3461648499545234, + "learning_rate": 3.1858227512234254e-06, + "loss": 0.1192, + "step": 7473 + }, + { + "epoch": 0.63, + "grad_norm": 0.29577072173602353, + "learning_rate": 3.1845512076274444e-06, + "loss": 0.0819, + "step": 7474 + }, + { + "epoch": 0.63, + "grad_norm": 0.6096168953491207, + "learning_rate": 3.1832797992566677e-06, + "loss": 0.1136, + "step": 7475 + }, + { + "epoch": 0.63, + "grad_norm": 0.32072682282314624, + "learning_rate": 3.1820085262057943e-06, + "loss": 0.0771, + "step": 7476 + }, + { + "epoch": 0.63, + "grad_norm": 0.26252219685391276, + "learning_rate": 3.180737388569515e-06, + "loss": 0.08, + "step": 7477 + }, + { + "epoch": 0.63, + "grad_norm": 0.287183794688386, + "learning_rate": 3.1794663864425146e-06, + "loss": 0.0977, + "step": 7478 + }, + { + "epoch": 0.63, + "grad_norm": 0.2594711886747748, + "learning_rate": 3.1781955199194634e-06, + "loss": 0.0855, + "step": 7479 + }, + { + "epoch": 0.63, + "grad_norm": 0.35828038605174267, + "learning_rate": 3.1769247890950206e-06, + "loss": 0.1002, + "step": 7480 + }, + { + "epoch": 0.63, + "grad_norm": 0.3046537943329612, + "learning_rate": 3.1756541940638417e-06, + "loss": 0.0888, + "step": 7481 + }, + { + "epoch": 0.63, + "grad_norm": 0.33707491248391264, + "learning_rate": 3.1743837349205665e-06, + "loss": 0.0878, + "step": 7482 + }, + { + "epoch": 0.63, + "grad_norm": 0.33828426149122465, + "learning_rate": 3.173113411759825e-06, + "loss": 0.068, + "step": 7483 + }, + { + "epoch": 0.63, + "grad_norm": 0.32238878051012504, + "learning_rate": 3.1718432246762376e-06, + "loss": 0.0775, + "step": 7484 + }, + { + "epoch": 0.63, + "grad_norm": 0.23557545569126495, + "learning_rate": 3.1705731737644186e-06, + "loss": 0.0579, + "step": 7485 + }, + { + "epoch": 0.63, + "grad_norm": 0.3840864540453718, + "learning_rate": 3.169303259118966e-06, + "loss": 0.0944, + "step": 7486 + }, + { + "epoch": 0.63, + "grad_norm": 0.2021329499404736, + "learning_rate": 3.1680334808344696e-06, + "loss": 0.0676, + "step": 7487 + }, + { + "epoch": 0.63, + "grad_norm": 0.3248235212173923, + "learning_rate": 3.166763839005514e-06, + "loss": 0.1202, + "step": 7488 + }, + { + "epoch": 0.63, + "grad_norm": 0.35598711735489263, + "learning_rate": 3.1654943337266663e-06, + "loss": 0.0569, + "step": 7489 + }, + { + "epoch": 0.63, + "grad_norm": 0.1922245355040462, + "learning_rate": 3.1642249650924883e-06, + "loss": 0.0768, + "step": 7490 + }, + { + "epoch": 0.63, + "grad_norm": 0.2651243935893133, + "learning_rate": 3.162955733197527e-06, + "loss": 0.0969, + "step": 7491 + }, + { + "epoch": 0.63, + "grad_norm": 0.35811733768048903, + "learning_rate": 3.1616866381363255e-06, + "loss": 0.0717, + "step": 7492 + }, + { + "epoch": 0.63, + "grad_norm": 0.47824437231040295, + "learning_rate": 3.1604176800034127e-06, + "loss": 0.0795, + "step": 7493 + }, + { + "epoch": 0.63, + "grad_norm": 0.26514132231403953, + "learning_rate": 3.159148858893305e-06, + "loss": 0.0693, + "step": 7494 + }, + { + "epoch": 0.63, + "grad_norm": 0.41285330699260103, + "learning_rate": 3.157880174900517e-06, + "loss": 0.12, + "step": 7495 + }, + { + "epoch": 0.63, + "grad_norm": 0.26052959659735436, + "learning_rate": 3.1566116281195437e-06, + "loss": 0.0854, + "step": 7496 + }, + { + "epoch": 0.63, + "grad_norm": 0.2112470210598013, + "learning_rate": 3.1553432186448745e-06, + "loss": 0.0451, + "step": 7497 + }, + { + "epoch": 0.63, + "grad_norm": 0.2880792287848207, + "learning_rate": 3.154074946570989e-06, + "loss": 0.0807, + "step": 7498 + }, + { + "epoch": 0.63, + "grad_norm": 0.41890540761238787, + "learning_rate": 3.1528068119923537e-06, + "loss": 0.0918, + "step": 7499 + }, + { + "epoch": 0.63, + "grad_norm": 0.5898363425109961, + "learning_rate": 3.151538815003429e-06, + "loss": 0.1364, + "step": 7500 + }, + { + "epoch": 0.63, + "grad_norm": 0.3266462844920886, + "learning_rate": 3.1502709556986606e-06, + "loss": 0.1139, + "step": 7501 + }, + { + "epoch": 0.63, + "grad_norm": 0.40465090162948564, + "learning_rate": 3.1490032341724876e-06, + "loss": 0.1251, + "step": 7502 + }, + { + "epoch": 0.63, + "grad_norm": 0.31940297971941356, + "learning_rate": 3.1477356505193357e-06, + "loss": 0.0801, + "step": 7503 + }, + { + "epoch": 0.63, + "grad_norm": 0.3053068160535285, + "learning_rate": 3.1464682048336224e-06, + "loss": 0.068, + "step": 7504 + }, + { + "epoch": 0.63, + "grad_norm": 0.22115661040473558, + "learning_rate": 3.145200897209756e-06, + "loss": 0.0667, + "step": 7505 + }, + { + "epoch": 0.63, + "grad_norm": 0.3058206710117369, + "learning_rate": 3.1439337277421323e-06, + "loss": 0.092, + "step": 7506 + }, + { + "epoch": 0.63, + "grad_norm": 0.2740688334106272, + "learning_rate": 3.142666696525136e-06, + "loss": 0.0929, + "step": 7507 + }, + { + "epoch": 0.63, + "grad_norm": 0.3548648423804408, + "learning_rate": 3.1413998036531424e-06, + "loss": 0.1054, + "step": 7508 + }, + { + "epoch": 0.63, + "grad_norm": 0.643187805530458, + "learning_rate": 3.1401330492205205e-06, + "loss": 0.1205, + "step": 7509 + }, + { + "epoch": 0.63, + "grad_norm": 0.24681355199470112, + "learning_rate": 3.1388664333216246e-06, + "loss": 0.0876, + "step": 7510 + }, + { + "epoch": 0.63, + "grad_norm": 0.44139657157666573, + "learning_rate": 3.1375999560507954e-06, + "loss": 0.1115, + "step": 7511 + }, + { + "epoch": 0.63, + "grad_norm": 0.2134277138639315, + "learning_rate": 3.136333617502373e-06, + "loss": 0.0639, + "step": 7512 + }, + { + "epoch": 0.63, + "grad_norm": 0.5950430369478585, + "learning_rate": 3.13506741777068e-06, + "loss": 0.1327, + "step": 7513 + }, + { + "epoch": 0.63, + "grad_norm": 0.275102249907796, + "learning_rate": 3.133801356950027e-06, + "loss": 0.0705, + "step": 7514 + }, + { + "epoch": 0.63, + "grad_norm": 0.4307600565521753, + "learning_rate": 3.1325354351347227e-06, + "loss": 0.1, + "step": 7515 + }, + { + "epoch": 0.63, + "grad_norm": 0.34454543120684683, + "learning_rate": 3.1312696524190577e-06, + "loss": 0.1178, + "step": 7516 + }, + { + "epoch": 0.63, + "grad_norm": 0.40124859522257766, + "learning_rate": 3.130004008897316e-06, + "loss": 0.1009, + "step": 7517 + }, + { + "epoch": 0.63, + "grad_norm": 0.2732894227225401, + "learning_rate": 3.1287385046637665e-06, + "loss": 0.0436, + "step": 7518 + }, + { + "epoch": 0.63, + "grad_norm": 0.3304042071898356, + "learning_rate": 3.127473139812676e-06, + "loss": 0.0814, + "step": 7519 + }, + { + "epoch": 0.63, + "grad_norm": 0.6031544168331392, + "learning_rate": 3.1262079144382946e-06, + "loss": 0.0954, + "step": 7520 + }, + { + "epoch": 0.63, + "grad_norm": 0.2863460074938233, + "learning_rate": 3.1249428286348616e-06, + "loss": 0.0873, + "step": 7521 + }, + { + "epoch": 0.63, + "grad_norm": 0.5194621824346859, + "learning_rate": 3.1236778824966106e-06, + "loss": 0.1495, + "step": 7522 + }, + { + "epoch": 0.63, + "grad_norm": 0.23213992496197666, + "learning_rate": 3.122413076117763e-06, + "loss": 0.0628, + "step": 7523 + }, + { + "epoch": 0.63, + "grad_norm": 0.4066952204403874, + "learning_rate": 3.121148409592526e-06, + "loss": 0.0956, + "step": 7524 + }, + { + "epoch": 0.63, + "grad_norm": 0.2863313488331882, + "learning_rate": 3.119883883015099e-06, + "loss": 0.104, + "step": 7525 + }, + { + "epoch": 0.63, + "grad_norm": 0.3574330701026415, + "learning_rate": 3.1186194964796744e-06, + "loss": 0.1006, + "step": 7526 + }, + { + "epoch": 0.63, + "grad_norm": 0.1910168147289262, + "learning_rate": 3.1173552500804305e-06, + "loss": 0.0482, + "step": 7527 + }, + { + "epoch": 0.63, + "grad_norm": 0.351227076390276, + "learning_rate": 3.1160911439115327e-06, + "loss": 0.1313, + "step": 7528 + }, + { + "epoch": 0.63, + "grad_norm": 0.3142577786051454, + "learning_rate": 3.1148271780671425e-06, + "loss": 0.0852, + "step": 7529 + }, + { + "epoch": 0.63, + "grad_norm": 0.2908722023233454, + "learning_rate": 3.113563352641406e-06, + "loss": 0.1013, + "step": 7530 + }, + { + "epoch": 0.63, + "grad_norm": 0.24316715242066264, + "learning_rate": 3.1122996677284593e-06, + "loss": 0.0644, + "step": 7531 + }, + { + "epoch": 0.63, + "grad_norm": 0.339780043367246, + "learning_rate": 3.1110361234224316e-06, + "loss": 0.0865, + "step": 7532 + }, + { + "epoch": 0.63, + "grad_norm": 0.3948221383374114, + "learning_rate": 3.1097727198174356e-06, + "loss": 0.0683, + "step": 7533 + }, + { + "epoch": 0.63, + "grad_norm": 0.542824965001126, + "learning_rate": 3.1085094570075804e-06, + "loss": 0.1002, + "step": 7534 + }, + { + "epoch": 0.63, + "grad_norm": 0.3465377031481567, + "learning_rate": 3.1072463350869587e-06, + "loss": 0.1254, + "step": 7535 + }, + { + "epoch": 0.63, + "grad_norm": 0.2684367183488828, + "learning_rate": 3.1059833541496563e-06, + "loss": 0.0606, + "step": 7536 + }, + { + "epoch": 0.64, + "grad_norm": 0.2927199227029024, + "learning_rate": 3.1047205142897473e-06, + "loss": 0.0822, + "step": 7537 + }, + { + "epoch": 0.64, + "grad_norm": 0.26780500429948084, + "learning_rate": 3.1034578156012947e-06, + "loss": 0.0862, + "step": 7538 + }, + { + "epoch": 0.64, + "grad_norm": 0.3409494701252806, + "learning_rate": 3.102195258178353e-06, + "loss": 0.0832, + "step": 7539 + }, + { + "epoch": 0.64, + "grad_norm": 0.19368975188938772, + "learning_rate": 3.100932842114964e-06, + "loss": 0.0467, + "step": 7540 + }, + { + "epoch": 0.64, + "grad_norm": 0.2769635623097641, + "learning_rate": 3.0996705675051597e-06, + "loss": 0.0777, + "step": 7541 + }, + { + "epoch": 0.64, + "grad_norm": 0.19825581210940144, + "learning_rate": 3.09840843444296e-06, + "loss": 0.0339, + "step": 7542 + }, + { + "epoch": 0.64, + "grad_norm": 0.3835708473794745, + "learning_rate": 3.09714644302238e-06, + "loss": 0.0824, + "step": 7543 + }, + { + "epoch": 0.64, + "grad_norm": 0.41280720711689006, + "learning_rate": 3.095884593337417e-06, + "loss": 0.0857, + "step": 7544 + }, + { + "epoch": 0.64, + "grad_norm": 0.6029844371439689, + "learning_rate": 3.0946228854820603e-06, + "loss": 0.0462, + "step": 7545 + }, + { + "epoch": 0.64, + "grad_norm": 0.43417514878235447, + "learning_rate": 3.093361319550293e-06, + "loss": 0.1178, + "step": 7546 + }, + { + "epoch": 0.64, + "grad_norm": 0.22678908796391573, + "learning_rate": 3.0920998956360817e-06, + "loss": 0.0494, + "step": 7547 + }, + { + "epoch": 0.64, + "grad_norm": 0.28458731029134526, + "learning_rate": 3.0908386138333812e-06, + "loss": 0.0858, + "step": 7548 + }, + { + "epoch": 0.64, + "grad_norm": 0.5136574362698686, + "learning_rate": 3.0895774742361463e-06, + "loss": 0.1197, + "step": 7549 + }, + { + "epoch": 0.64, + "grad_norm": 0.16634165774189022, + "learning_rate": 3.088316476938309e-06, + "loss": 0.0539, + "step": 7550 + }, + { + "epoch": 0.64, + "grad_norm": 0.4630251740916094, + "learning_rate": 3.0870556220337964e-06, + "loss": 0.1193, + "step": 7551 + }, + { + "epoch": 0.64, + "grad_norm": 0.30637543965006275, + "learning_rate": 3.085794909616524e-06, + "loss": 0.083, + "step": 7552 + }, + { + "epoch": 0.64, + "grad_norm": 0.2890721748017524, + "learning_rate": 3.0845343397803994e-06, + "loss": 0.0497, + "step": 7553 + }, + { + "epoch": 0.64, + "grad_norm": 0.3632288029514629, + "learning_rate": 3.0832739126193158e-06, + "loss": 0.09, + "step": 7554 + }, + { + "epoch": 0.64, + "grad_norm": 0.3533243182288359, + "learning_rate": 3.082013628227154e-06, + "loss": 0.119, + "step": 7555 + }, + { + "epoch": 0.64, + "grad_norm": 0.3205222088986445, + "learning_rate": 3.0807534866977928e-06, + "loss": 0.1015, + "step": 7556 + }, + { + "epoch": 0.64, + "grad_norm": 0.3894941259944348, + "learning_rate": 3.079493488125093e-06, + "loss": 0.078, + "step": 7557 + }, + { + "epoch": 0.64, + "grad_norm": 0.4771072371476969, + "learning_rate": 3.078233632602905e-06, + "loss": 0.095, + "step": 7558 + }, + { + "epoch": 0.64, + "grad_norm": 0.27833711163922675, + "learning_rate": 3.07697392022507e-06, + "loss": 0.09, + "step": 7559 + }, + { + "epoch": 0.64, + "grad_norm": 0.2276439154504608, + "learning_rate": 3.0757143510854217e-06, + "loss": 0.0546, + "step": 7560 + }, + { + "epoch": 0.64, + "grad_norm": 0.3594107325058686, + "learning_rate": 3.074454925277778e-06, + "loss": 0.0815, + "step": 7561 + }, + { + "epoch": 0.64, + "grad_norm": 0.1897825091893244, + "learning_rate": 3.0731956428959463e-06, + "loss": 0.0413, + "step": 7562 + }, + { + "epoch": 0.64, + "grad_norm": 0.5288974970070162, + "learning_rate": 3.07193650403373e-06, + "loss": 0.1671, + "step": 7563 + }, + { + "epoch": 0.64, + "grad_norm": 0.2618825749210449, + "learning_rate": 3.0706775087849143e-06, + "loss": 0.0688, + "step": 7564 + }, + { + "epoch": 0.64, + "grad_norm": 0.28055677976640175, + "learning_rate": 3.0694186572432768e-06, + "loss": 0.0883, + "step": 7565 + }, + { + "epoch": 0.64, + "grad_norm": 0.3503467993303243, + "learning_rate": 3.0681599495025845e-06, + "loss": 0.1308, + "step": 7566 + }, + { + "epoch": 0.64, + "grad_norm": 0.3505450712437244, + "learning_rate": 3.066901385656592e-06, + "loss": 0.102, + "step": 7567 + }, + { + "epoch": 0.64, + "grad_norm": 0.22559138454775035, + "learning_rate": 3.0656429657990462e-06, + "loss": 0.0502, + "step": 7568 + }, + { + "epoch": 0.64, + "grad_norm": 0.30361790805215616, + "learning_rate": 3.06438469002368e-06, + "loss": 0.0387, + "step": 7569 + }, + { + "epoch": 0.64, + "grad_norm": 0.3317921809128949, + "learning_rate": 3.0631265584242194e-06, + "loss": 0.0889, + "step": 7570 + }, + { + "epoch": 0.64, + "grad_norm": 0.3118201982288268, + "learning_rate": 3.061868571094374e-06, + "loss": 0.091, + "step": 7571 + }, + { + "epoch": 0.64, + "grad_norm": 0.22449881726134763, + "learning_rate": 3.0606107281278485e-06, + "loss": 0.0547, + "step": 7572 + }, + { + "epoch": 0.64, + "grad_norm": 0.35839939438308904, + "learning_rate": 3.0593530296183345e-06, + "loss": 0.072, + "step": 7573 + }, + { + "epoch": 0.64, + "grad_norm": 0.27299680614028915, + "learning_rate": 3.0580954756595115e-06, + "loss": 0.0577, + "step": 7574 + }, + { + "epoch": 0.64, + "grad_norm": 0.28344973987279354, + "learning_rate": 3.056838066345051e-06, + "loss": 0.0587, + "step": 7575 + }, + { + "epoch": 0.64, + "grad_norm": 0.3718812552683147, + "learning_rate": 3.0555808017686084e-06, + "loss": 0.0664, + "step": 7576 + }, + { + "epoch": 0.64, + "grad_norm": 0.26011765558266553, + "learning_rate": 3.0543236820238366e-06, + "loss": 0.0708, + "step": 7577 + }, + { + "epoch": 0.64, + "grad_norm": 0.20279512283766082, + "learning_rate": 3.0530667072043718e-06, + "loss": 0.0486, + "step": 7578 + }, + { + "epoch": 0.64, + "grad_norm": 0.2158328986102784, + "learning_rate": 3.051809877403838e-06, + "loss": 0.0471, + "step": 7579 + }, + { + "epoch": 0.64, + "grad_norm": 0.35759888191261263, + "learning_rate": 3.0505531927158556e-06, + "loss": 0.0552, + "step": 7580 + }, + { + "epoch": 0.64, + "grad_norm": 0.27633923251662657, + "learning_rate": 3.049296653234028e-06, + "loss": 0.091, + "step": 7581 + }, + { + "epoch": 0.64, + "grad_norm": 0.32840874081319354, + "learning_rate": 3.0480402590519493e-06, + "loss": 0.0788, + "step": 7582 + }, + { + "epoch": 0.64, + "grad_norm": 0.35431497364871034, + "learning_rate": 3.0467840102632007e-06, + "loss": 0.0727, + "step": 7583 + }, + { + "epoch": 0.64, + "grad_norm": 0.46701377723717535, + "learning_rate": 3.0455279069613596e-06, + "loss": 0.0829, + "step": 7584 + }, + { + "epoch": 0.64, + "grad_norm": 0.4477093426231771, + "learning_rate": 3.0442719492399865e-06, + "loss": 0.1326, + "step": 7585 + }, + { + "epoch": 0.64, + "grad_norm": 0.355387574543283, + "learning_rate": 3.0430161371926283e-06, + "loss": 0.0794, + "step": 7586 + }, + { + "epoch": 0.64, + "grad_norm": 0.27048287788175784, + "learning_rate": 3.041760470912831e-06, + "loss": 0.0605, + "step": 7587 + }, + { + "epoch": 0.64, + "grad_norm": 0.40011901009465345, + "learning_rate": 3.0405049504941212e-06, + "loss": 0.0951, + "step": 7588 + }, + { + "epoch": 0.64, + "grad_norm": 0.3158251845240956, + "learning_rate": 3.039249576030016e-06, + "loss": 0.0822, + "step": 7589 + }, + { + "epoch": 0.64, + "grad_norm": 0.2837336618924544, + "learning_rate": 3.037994347614026e-06, + "loss": 0.0855, + "step": 7590 + }, + { + "epoch": 0.64, + "grad_norm": 0.2123223136818581, + "learning_rate": 3.0367392653396476e-06, + "loss": 0.0815, + "step": 7591 + }, + { + "epoch": 0.64, + "grad_norm": 0.22147569414361187, + "learning_rate": 3.0354843293003644e-06, + "loss": 0.0576, + "step": 7592 + }, + { + "epoch": 0.64, + "grad_norm": 0.3002331992333214, + "learning_rate": 3.0342295395896516e-06, + "loss": 0.1006, + "step": 7593 + }, + { + "epoch": 0.64, + "grad_norm": 0.3940278974763917, + "learning_rate": 3.0329748963009754e-06, + "loss": 0.0908, + "step": 7594 + }, + { + "epoch": 0.64, + "grad_norm": 0.40997655731125643, + "learning_rate": 3.0317203995277876e-06, + "loss": 0.1172, + "step": 7595 + }, + { + "epoch": 0.64, + "grad_norm": 0.2685186800301309, + "learning_rate": 3.0304660493635283e-06, + "loss": 0.0735, + "step": 7596 + }, + { + "epoch": 0.64, + "grad_norm": 0.3287420030588943, + "learning_rate": 3.0292118459016338e-06, + "loss": 0.0766, + "step": 7597 + }, + { + "epoch": 0.64, + "grad_norm": 0.6935983027051918, + "learning_rate": 3.0279577892355215e-06, + "loss": 0.1155, + "step": 7598 + }, + { + "epoch": 0.64, + "grad_norm": 0.33427866364929465, + "learning_rate": 3.0267038794586005e-06, + "loss": 0.0955, + "step": 7599 + }, + { + "epoch": 0.64, + "grad_norm": 0.2982176294491736, + "learning_rate": 3.02545011666427e-06, + "loss": 0.0687, + "step": 7600 + }, + { + "epoch": 0.64, + "grad_norm": 0.2778211255949358, + "learning_rate": 3.024196500945916e-06, + "loss": 0.0696, + "step": 7601 + }, + { + "epoch": 0.64, + "grad_norm": 0.24645067918227956, + "learning_rate": 3.0229430323969185e-06, + "loss": 0.071, + "step": 7602 + }, + { + "epoch": 0.64, + "grad_norm": 0.5074886491320254, + "learning_rate": 3.0216897111106397e-06, + "loss": 0.1213, + "step": 7603 + }, + { + "epoch": 0.64, + "grad_norm": 0.3699543368319706, + "learning_rate": 3.020436537180437e-06, + "loss": 0.0928, + "step": 7604 + }, + { + "epoch": 0.64, + "grad_norm": 0.3630029844294792, + "learning_rate": 3.0191835106996513e-06, + "loss": 0.097, + "step": 7605 + }, + { + "epoch": 0.64, + "grad_norm": 0.2644561583063568, + "learning_rate": 3.0179306317616175e-06, + "loss": 0.0756, + "step": 7606 + }, + { + "epoch": 0.64, + "grad_norm": 0.35252190859469246, + "learning_rate": 3.0166779004596574e-06, + "loss": 0.0807, + "step": 7607 + }, + { + "epoch": 0.64, + "grad_norm": 0.15544356822506278, + "learning_rate": 3.015425316887081e-06, + "loss": 0.0346, + "step": 7608 + }, + { + "epoch": 0.64, + "grad_norm": 0.30891978930094066, + "learning_rate": 3.0141728811371885e-06, + "loss": 0.0985, + "step": 7609 + }, + { + "epoch": 0.64, + "grad_norm": 0.3512663288688792, + "learning_rate": 3.012920593303265e-06, + "loss": 0.1315, + "step": 7610 + }, + { + "epoch": 0.64, + "grad_norm": 0.28492316852100275, + "learning_rate": 3.011668453478594e-06, + "loss": 0.0598, + "step": 7611 + }, + { + "epoch": 0.64, + "grad_norm": 0.43692417010839485, + "learning_rate": 3.0104164617564393e-06, + "loss": 0.1343, + "step": 7612 + }, + { + "epoch": 0.64, + "grad_norm": 0.3972450705812554, + "learning_rate": 3.009164618230055e-06, + "loss": 0.0851, + "step": 7613 + }, + { + "epoch": 0.64, + "grad_norm": 0.31786627481822394, + "learning_rate": 3.007912922992689e-06, + "loss": 0.1005, + "step": 7614 + }, + { + "epoch": 0.64, + "grad_norm": 0.36484990001751216, + "learning_rate": 3.0066613761375734e-06, + "loss": 0.0757, + "step": 7615 + }, + { + "epoch": 0.64, + "grad_norm": 0.29865652232610834, + "learning_rate": 3.0054099777579303e-06, + "loss": 0.1089, + "step": 7616 + }, + { + "epoch": 0.64, + "grad_norm": 0.2014425658868871, + "learning_rate": 3.0041587279469696e-06, + "loss": 0.0594, + "step": 7617 + }, + { + "epoch": 0.64, + "grad_norm": 0.2176238146550361, + "learning_rate": 3.002907626797896e-06, + "loss": 0.0568, + "step": 7618 + }, + { + "epoch": 0.64, + "grad_norm": 0.30891447627532476, + "learning_rate": 3.0016566744038955e-06, + "loss": 0.0745, + "step": 7619 + }, + { + "epoch": 0.64, + "grad_norm": 0.2923663072973704, + "learning_rate": 3.0004058708581456e-06, + "loss": 0.0553, + "step": 7620 + }, + { + "epoch": 0.64, + "grad_norm": 0.347270635943339, + "learning_rate": 2.9991552162538174e-06, + "loss": 0.0829, + "step": 7621 + }, + { + "epoch": 0.64, + "grad_norm": 0.19497550949400194, + "learning_rate": 2.9979047106840634e-06, + "loss": 0.0539, + "step": 7622 + }, + { + "epoch": 0.64, + "grad_norm": 0.3107590316173691, + "learning_rate": 2.996654354242028e-06, + "loss": 0.1101, + "step": 7623 + }, + { + "epoch": 0.64, + "grad_norm": 0.18428650511238057, + "learning_rate": 2.995404147020849e-06, + "loss": 0.0657, + "step": 7624 + }, + { + "epoch": 0.64, + "grad_norm": 0.5619812205864332, + "learning_rate": 2.9941540891136467e-06, + "loss": 0.1093, + "step": 7625 + }, + { + "epoch": 0.64, + "grad_norm": 0.3511298786457933, + "learning_rate": 2.9929041806135323e-06, + "loss": 0.0561, + "step": 7626 + }, + { + "epoch": 0.64, + "grad_norm": 0.33180366620939994, + "learning_rate": 2.991654421613605e-06, + "loss": 0.0757, + "step": 7627 + }, + { + "epoch": 0.64, + "grad_norm": 0.3045304684837932, + "learning_rate": 2.990404812206958e-06, + "loss": 0.0954, + "step": 7628 + }, + { + "epoch": 0.64, + "grad_norm": 0.20954155704102642, + "learning_rate": 2.989155352486667e-06, + "loss": 0.0792, + "step": 7629 + }, + { + "epoch": 0.64, + "grad_norm": 0.2537791676624729, + "learning_rate": 2.9879060425457974e-06, + "loss": 0.0234, + "step": 7630 + }, + { + "epoch": 0.64, + "grad_norm": 0.36437972679109165, + "learning_rate": 2.9866568824774096e-06, + "loss": 0.0976, + "step": 7631 + }, + { + "epoch": 0.64, + "grad_norm": 0.37627610736542844, + "learning_rate": 2.985407872374545e-06, + "loss": 0.102, + "step": 7632 + }, + { + "epoch": 0.64, + "grad_norm": 0.36931605273029683, + "learning_rate": 2.984159012330239e-06, + "loss": 0.0912, + "step": 7633 + }, + { + "epoch": 0.64, + "grad_norm": 0.34110256966372815, + "learning_rate": 2.9829103024375105e-06, + "loss": 0.0951, + "step": 7634 + }, + { + "epoch": 0.64, + "grad_norm": 0.3265722812173713, + "learning_rate": 2.981661742789374e-06, + "loss": 0.0993, + "step": 7635 + }, + { + "epoch": 0.64, + "grad_norm": 0.3624395643476571, + "learning_rate": 2.9804133334788297e-06, + "loss": 0.0852, + "step": 7636 + }, + { + "epoch": 0.64, + "grad_norm": 0.3264678749388332, + "learning_rate": 2.979165074598864e-06, + "loss": 0.0822, + "step": 7637 + }, + { + "epoch": 0.64, + "grad_norm": 0.4241695418593112, + "learning_rate": 2.9779169662424566e-06, + "loss": 0.0755, + "step": 7638 + }, + { + "epoch": 0.64, + "grad_norm": 0.2479543260289617, + "learning_rate": 2.9766690085025722e-06, + "loss": 0.0892, + "step": 7639 + }, + { + "epoch": 0.64, + "grad_norm": 0.25915875201505956, + "learning_rate": 2.975421201472167e-06, + "loss": 0.0552, + "step": 7640 + }, + { + "epoch": 0.64, + "grad_norm": 0.34036949312700226, + "learning_rate": 2.9741735452441855e-06, + "loss": 0.0846, + "step": 7641 + }, + { + "epoch": 0.64, + "grad_norm": 0.21488368851748546, + "learning_rate": 2.97292603991156e-06, + "loss": 0.0394, + "step": 7642 + }, + { + "epoch": 0.64, + "grad_norm": 0.20697780050330308, + "learning_rate": 2.9716786855672113e-06, + "loss": 0.0462, + "step": 7643 + }, + { + "epoch": 0.64, + "grad_norm": 0.3678616622894788, + "learning_rate": 2.970431482304048e-06, + "loss": 0.0878, + "step": 7644 + }, + { + "epoch": 0.64, + "grad_norm": 0.41651679679682846, + "learning_rate": 2.969184430214973e-06, + "loss": 0.0953, + "step": 7645 + }, + { + "epoch": 0.64, + "grad_norm": 0.36251147165396075, + "learning_rate": 2.9679375293928726e-06, + "loss": 0.0816, + "step": 7646 + }, + { + "epoch": 0.64, + "grad_norm": 0.2741761732758418, + "learning_rate": 2.96669077993062e-06, + "loss": 0.0678, + "step": 7647 + }, + { + "epoch": 0.64, + "grad_norm": 0.240763451112145, + "learning_rate": 2.965444181921085e-06, + "loss": 0.0789, + "step": 7648 + }, + { + "epoch": 0.64, + "grad_norm": 0.2565469244560333, + "learning_rate": 2.9641977354571194e-06, + "loss": 0.0746, + "step": 7649 + }, + { + "epoch": 0.64, + "grad_norm": 0.43655989860299493, + "learning_rate": 2.9629514406315663e-06, + "loss": 0.0883, + "step": 7650 + }, + { + "epoch": 0.64, + "grad_norm": 0.3119912018197633, + "learning_rate": 2.961705297537254e-06, + "loss": 0.0911, + "step": 7651 + }, + { + "epoch": 0.64, + "grad_norm": 0.22531002021468668, + "learning_rate": 2.960459306267008e-06, + "loss": 0.0788, + "step": 7652 + }, + { + "epoch": 0.64, + "grad_norm": 0.49296910053286613, + "learning_rate": 2.9592134669136334e-06, + "loss": 0.0983, + "step": 7653 + }, + { + "epoch": 0.64, + "grad_norm": 0.21650063558728297, + "learning_rate": 2.957967779569926e-06, + "loss": 0.0568, + "step": 7654 + }, + { + "epoch": 0.65, + "grad_norm": 0.3485043769503001, + "learning_rate": 2.956722244328677e-06, + "loss": 0.094, + "step": 7655 + }, + { + "epoch": 0.65, + "grad_norm": 0.19558482986653544, + "learning_rate": 2.955476861282658e-06, + "loss": 0.0473, + "step": 7656 + }, + { + "epoch": 0.65, + "grad_norm": 0.30923844446709947, + "learning_rate": 2.95423163052463e-06, + "loss": 0.0614, + "step": 7657 + }, + { + "epoch": 0.65, + "grad_norm": 0.6232460291241838, + "learning_rate": 2.95298655214735e-06, + "loss": 0.0951, + "step": 7658 + }, + { + "epoch": 0.65, + "grad_norm": 0.27995577607582195, + "learning_rate": 2.951741626243556e-06, + "loss": 0.0453, + "step": 7659 + }, + { + "epoch": 0.65, + "grad_norm": 0.29337837941816786, + "learning_rate": 2.9504968529059784e-06, + "loss": 0.1032, + "step": 7660 + }, + { + "epoch": 0.65, + "grad_norm": 0.2296433823809354, + "learning_rate": 2.949252232227332e-06, + "loss": 0.0523, + "step": 7661 + }, + { + "epoch": 0.65, + "grad_norm": 0.2768467010035087, + "learning_rate": 2.948007764300328e-06, + "loss": 0.0605, + "step": 7662 + }, + { + "epoch": 0.65, + "grad_norm": 0.4289900829373465, + "learning_rate": 2.946763449217659e-06, + "loss": 0.1005, + "step": 7663 + }, + { + "epoch": 0.65, + "grad_norm": 0.25394514404082935, + "learning_rate": 2.9455192870720073e-06, + "loss": 0.0567, + "step": 7664 + }, + { + "epoch": 0.65, + "grad_norm": 0.4441613527634566, + "learning_rate": 2.9442752779560502e-06, + "loss": 0.117, + "step": 7665 + }, + { + "epoch": 0.65, + "grad_norm": 0.20181939604021895, + "learning_rate": 2.9430314219624446e-06, + "loss": 0.0666, + "step": 7666 + }, + { + "epoch": 0.65, + "grad_norm": 0.6510775557177738, + "learning_rate": 2.941787719183842e-06, + "loss": 0.1088, + "step": 7667 + }, + { + "epoch": 0.65, + "grad_norm": 0.25728350642052045, + "learning_rate": 2.9405441697128795e-06, + "loss": 0.0434, + "step": 7668 + }, + { + "epoch": 0.65, + "grad_norm": 0.31808476105024885, + "learning_rate": 2.939300773642184e-06, + "loss": 0.1049, + "step": 7669 + }, + { + "epoch": 0.65, + "grad_norm": 0.2932412813441007, + "learning_rate": 2.938057531064372e-06, + "loss": 0.0632, + "step": 7670 + }, + { + "epoch": 0.65, + "grad_norm": 0.2579541918096553, + "learning_rate": 2.936814442072047e-06, + "loss": 0.0512, + "step": 7671 + }, + { + "epoch": 0.65, + "grad_norm": 0.3456252510687903, + "learning_rate": 2.9355715067578015e-06, + "loss": 0.1066, + "step": 7672 + }, + { + "epoch": 0.65, + "grad_norm": 0.27739207304873187, + "learning_rate": 2.934328725214215e-06, + "loss": 0.0719, + "step": 7673 + }, + { + "epoch": 0.65, + "grad_norm": 0.505938677170983, + "learning_rate": 2.9330860975338592e-06, + "loss": 0.1642, + "step": 7674 + }, + { + "epoch": 0.65, + "grad_norm": 0.22631945575667362, + "learning_rate": 2.9318436238092917e-06, + "loss": 0.074, + "step": 7675 + }, + { + "epoch": 0.65, + "grad_norm": 0.3580393207810524, + "learning_rate": 2.93060130413306e-06, + "loss": 0.0832, + "step": 7676 + }, + { + "epoch": 0.65, + "grad_norm": 0.2521019471724546, + "learning_rate": 2.929359138597698e-06, + "loss": 0.0735, + "step": 7677 + }, + { + "epoch": 0.65, + "grad_norm": 0.28287885008670893, + "learning_rate": 2.9281171272957275e-06, + "loss": 0.0799, + "step": 7678 + }, + { + "epoch": 0.65, + "grad_norm": 0.5348398919167312, + "learning_rate": 2.9268752703196645e-06, + "loss": 0.1071, + "step": 7679 + }, + { + "epoch": 0.65, + "grad_norm": 0.22248502795144798, + "learning_rate": 2.925633567762009e-06, + "loss": 0.0659, + "step": 7680 + }, + { + "epoch": 0.65, + "grad_norm": 0.4584132035139908, + "learning_rate": 2.924392019715247e-06, + "loss": 0.1124, + "step": 7681 + }, + { + "epoch": 0.65, + "grad_norm": 0.4183298450255073, + "learning_rate": 2.9231506262718612e-06, + "loss": 0.0842, + "step": 7682 + }, + { + "epoch": 0.65, + "grad_norm": 0.26046234945738206, + "learning_rate": 2.9219093875243144e-06, + "loss": 0.0571, + "step": 7683 + }, + { + "epoch": 0.65, + "grad_norm": 0.4564274945246979, + "learning_rate": 2.920668303565063e-06, + "loss": 0.1033, + "step": 7684 + }, + { + "epoch": 0.65, + "grad_norm": 0.4381342566983733, + "learning_rate": 2.919427374486547e-06, + "loss": 0.1129, + "step": 7685 + }, + { + "epoch": 0.65, + "grad_norm": 0.3263740523874261, + "learning_rate": 2.9181866003812025e-06, + "loss": 0.0732, + "step": 7686 + }, + { + "epoch": 0.65, + "grad_norm": 0.17927566082124533, + "learning_rate": 2.9169459813414477e-06, + "loss": 0.0603, + "step": 7687 + }, + { + "epoch": 0.65, + "grad_norm": 0.24534618688691814, + "learning_rate": 2.915705517459688e-06, + "loss": 0.0631, + "step": 7688 + }, + { + "epoch": 0.65, + "grad_norm": 0.4702917318921455, + "learning_rate": 2.9144652088283264e-06, + "loss": 0.08, + "step": 7689 + }, + { + "epoch": 0.65, + "grad_norm": 0.6530921816652279, + "learning_rate": 2.9132250555397445e-06, + "loss": 0.1248, + "step": 7690 + }, + { + "epoch": 0.65, + "grad_norm": 0.36946832985721206, + "learning_rate": 2.9119850576863175e-06, + "loss": 0.1047, + "step": 7691 + }, + { + "epoch": 0.65, + "grad_norm": 0.237938050649642, + "learning_rate": 2.910745215360404e-06, + "loss": 0.0743, + "step": 7692 + }, + { + "epoch": 0.65, + "grad_norm": 0.5283002663052525, + "learning_rate": 2.9095055286543605e-06, + "loss": 0.1435, + "step": 7693 + }, + { + "epoch": 0.65, + "grad_norm": 0.3016769280838413, + "learning_rate": 2.9082659976605226e-06, + "loss": 0.0627, + "step": 7694 + }, + { + "epoch": 0.65, + "grad_norm": 0.1457954373829542, + "learning_rate": 2.907026622471217e-06, + "loss": 0.0418, + "step": 7695 + }, + { + "epoch": 0.65, + "grad_norm": 0.3808649438057077, + "learning_rate": 2.9057874031787624e-06, + "loss": 0.0584, + "step": 7696 + }, + { + "epoch": 0.65, + "grad_norm": 0.37938589666766376, + "learning_rate": 2.9045483398754628e-06, + "loss": 0.0953, + "step": 7697 + }, + { + "epoch": 0.65, + "grad_norm": 0.5125293923537707, + "learning_rate": 2.9033094326536063e-06, + "loss": 0.128, + "step": 7698 + }, + { + "epoch": 0.65, + "grad_norm": 0.23634379723225254, + "learning_rate": 2.90207068160548e-06, + "loss": 0.0616, + "step": 7699 + }, + { + "epoch": 0.65, + "grad_norm": 0.4405512483281603, + "learning_rate": 2.9008320868233507e-06, + "loss": 0.067, + "step": 7700 + }, + { + "epoch": 0.65, + "grad_norm": 0.3237926244621433, + "learning_rate": 2.899593648399477e-06, + "loss": 0.0784, + "step": 7701 + }, + { + "epoch": 0.65, + "grad_norm": 0.30430600796467955, + "learning_rate": 2.898355366426101e-06, + "loss": 0.0644, + "step": 7702 + }, + { + "epoch": 0.65, + "grad_norm": 0.4212062858735066, + "learning_rate": 2.897117240995463e-06, + "loss": 0.1017, + "step": 7703 + }, + { + "epoch": 0.65, + "grad_norm": 0.3278006754646287, + "learning_rate": 2.8958792721997823e-06, + "loss": 0.0791, + "step": 7704 + }, + { + "epoch": 0.65, + "grad_norm": 0.26884786771859026, + "learning_rate": 2.8946414601312696e-06, + "loss": 0.0878, + "step": 7705 + }, + { + "epoch": 0.65, + "grad_norm": 0.6112089498822476, + "learning_rate": 2.893403804882127e-06, + "loss": 0.1099, + "step": 7706 + }, + { + "epoch": 0.65, + "grad_norm": 0.35748296203113894, + "learning_rate": 2.8921663065445414e-06, + "loss": 0.0929, + "step": 7707 + }, + { + "epoch": 0.65, + "grad_norm": 0.2454367672306242, + "learning_rate": 2.8909289652106877e-06, + "loss": 0.049, + "step": 7708 + }, + { + "epoch": 0.65, + "grad_norm": 0.29977887854425456, + "learning_rate": 2.889691780972729e-06, + "loss": 0.0704, + "step": 7709 + }, + { + "epoch": 0.65, + "grad_norm": 0.20627984595493482, + "learning_rate": 2.8884547539228225e-06, + "loss": 0.0832, + "step": 7710 + }, + { + "epoch": 0.65, + "grad_norm": 0.26327292206158076, + "learning_rate": 2.887217884153106e-06, + "loss": 0.0655, + "step": 7711 + }, + { + "epoch": 0.65, + "grad_norm": 0.29190156002195655, + "learning_rate": 2.885981171755708e-06, + "loss": 0.0618, + "step": 7712 + }, + { + "epoch": 0.65, + "grad_norm": 0.6801916682029193, + "learning_rate": 2.8847446168227487e-06, + "loss": 0.1327, + "step": 7713 + }, + { + "epoch": 0.65, + "grad_norm": 0.3070634528299016, + "learning_rate": 2.8835082194463324e-06, + "loss": 0.0821, + "step": 7714 + }, + { + "epoch": 0.65, + "grad_norm": 0.24887581119208113, + "learning_rate": 2.882271979718553e-06, + "loss": 0.0839, + "step": 7715 + }, + { + "epoch": 0.65, + "grad_norm": 0.2973307206098664, + "learning_rate": 2.8810358977314935e-06, + "loss": 0.0979, + "step": 7716 + }, + { + "epoch": 0.65, + "grad_norm": 0.25893381919528624, + "learning_rate": 2.879799973577225e-06, + "loss": 0.0707, + "step": 7717 + }, + { + "epoch": 0.65, + "grad_norm": 0.38457707097676397, + "learning_rate": 2.8785642073478054e-06, + "loss": 0.0962, + "step": 7718 + }, + { + "epoch": 0.65, + "grad_norm": 0.3778350828875611, + "learning_rate": 2.877328599135282e-06, + "loss": 0.1255, + "step": 7719 + }, + { + "epoch": 0.65, + "grad_norm": 0.22353347419610237, + "learning_rate": 2.876093149031688e-06, + "loss": 0.0746, + "step": 7720 + }, + { + "epoch": 0.65, + "grad_norm": 0.23773070229008716, + "learning_rate": 2.8748578571290507e-06, + "loss": 0.077, + "step": 7721 + }, + { + "epoch": 0.65, + "grad_norm": 0.18234505482289376, + "learning_rate": 2.87362272351938e-06, + "loss": 0.0498, + "step": 7722 + }, + { + "epoch": 0.65, + "grad_norm": 0.3580999734792023, + "learning_rate": 2.8723877482946762e-06, + "loss": 0.1087, + "step": 7723 + }, + { + "epoch": 0.65, + "grad_norm": 0.3466034664032849, + "learning_rate": 2.871152931546925e-06, + "loss": 0.0784, + "step": 7724 + }, + { + "epoch": 0.65, + "grad_norm": 0.27517761546731734, + "learning_rate": 2.869918273368107e-06, + "loss": 0.0503, + "step": 7725 + }, + { + "epoch": 0.65, + "grad_norm": 0.33251455436398075, + "learning_rate": 2.868683773850185e-06, + "loss": 0.0788, + "step": 7726 + }, + { + "epoch": 0.65, + "grad_norm": 0.2634912591045007, + "learning_rate": 2.867449433085111e-06, + "loss": 0.0631, + "step": 7727 + }, + { + "epoch": 0.65, + "grad_norm": 0.3223917887124083, + "learning_rate": 2.866215251164824e-06, + "loss": 0.0736, + "step": 7728 + }, + { + "epoch": 0.65, + "grad_norm": 0.390254519818775, + "learning_rate": 2.864981228181257e-06, + "loss": 0.0781, + "step": 7729 + }, + { + "epoch": 0.65, + "grad_norm": 0.17689229295112066, + "learning_rate": 2.863747364226327e-06, + "loss": 0.0496, + "step": 7730 + }, + { + "epoch": 0.65, + "grad_norm": 0.3741799012458619, + "learning_rate": 2.8625136593919368e-06, + "loss": 0.1193, + "step": 7731 + }, + { + "epoch": 0.65, + "grad_norm": 0.38117468634003, + "learning_rate": 2.8612801137699786e-06, + "loss": 0.0845, + "step": 7732 + }, + { + "epoch": 0.65, + "grad_norm": 0.22853354874061838, + "learning_rate": 2.8600467274523392e-06, + "loss": 0.0614, + "step": 7733 + }, + { + "epoch": 0.65, + "grad_norm": 0.17525877578123059, + "learning_rate": 2.858813500530886e-06, + "loss": 0.0533, + "step": 7734 + }, + { + "epoch": 0.65, + "grad_norm": 0.327697450578931, + "learning_rate": 2.857580433097476e-06, + "loss": 0.0637, + "step": 7735 + }, + { + "epoch": 0.65, + "grad_norm": 0.25955956707958583, + "learning_rate": 2.8563475252439544e-06, + "loss": 0.079, + "step": 7736 + }, + { + "epoch": 0.65, + "grad_norm": 0.36288134821774254, + "learning_rate": 2.855114777062158e-06, + "loss": 0.0748, + "step": 7737 + }, + { + "epoch": 0.65, + "grad_norm": 0.30444640532718537, + "learning_rate": 2.853882188643908e-06, + "loss": 0.0766, + "step": 7738 + }, + { + "epoch": 0.65, + "grad_norm": 0.3056367521384169, + "learning_rate": 2.8526497600810133e-06, + "loss": 0.0747, + "step": 7739 + }, + { + "epoch": 0.65, + "grad_norm": 0.2728911026014942, + "learning_rate": 2.851417491465276e-06, + "loss": 0.0404, + "step": 7740 + }, + { + "epoch": 0.65, + "grad_norm": 0.2262891081268102, + "learning_rate": 2.8501853828884795e-06, + "loss": 0.0705, + "step": 7741 + }, + { + "epoch": 0.65, + "grad_norm": 0.2633499713807462, + "learning_rate": 2.8489534344424e-06, + "loss": 0.0813, + "step": 7742 + }, + { + "epoch": 0.65, + "grad_norm": 0.3990478145803632, + "learning_rate": 2.847721646218796e-06, + "loss": 0.0926, + "step": 7743 + }, + { + "epoch": 0.65, + "grad_norm": 0.28539402583828705, + "learning_rate": 2.8464900183094256e-06, + "loss": 0.0748, + "step": 7744 + }, + { + "epoch": 0.65, + "grad_norm": 0.3222072327034806, + "learning_rate": 2.8452585508060225e-06, + "loss": 0.1009, + "step": 7745 + }, + { + "epoch": 0.65, + "grad_norm": 0.29571964393787387, + "learning_rate": 2.8440272438003136e-06, + "loss": 0.0684, + "step": 7746 + }, + { + "epoch": 0.65, + "grad_norm": 0.29543316741360764, + "learning_rate": 2.842796097384016e-06, + "loss": 0.0945, + "step": 7747 + }, + { + "epoch": 0.65, + "grad_norm": 1.189283231814599, + "learning_rate": 2.841565111648833e-06, + "loss": 0.1147, + "step": 7748 + }, + { + "epoch": 0.65, + "grad_norm": 0.3870682024027058, + "learning_rate": 2.8403342866864514e-06, + "loss": 0.1044, + "step": 7749 + }, + { + "epoch": 0.65, + "grad_norm": 0.2427672205820367, + "learning_rate": 2.839103622588556e-06, + "loss": 0.0792, + "step": 7750 + }, + { + "epoch": 0.65, + "grad_norm": 0.37481290294364916, + "learning_rate": 2.8378731194468103e-06, + "loss": 0.0812, + "step": 7751 + }, + { + "epoch": 0.65, + "grad_norm": 0.3615292004364502, + "learning_rate": 2.83664277735287e-06, + "loss": 0.1025, + "step": 7752 + }, + { + "epoch": 0.65, + "grad_norm": 0.3066090494529635, + "learning_rate": 2.835412596398376e-06, + "loss": 0.095, + "step": 7753 + }, + { + "epoch": 0.65, + "grad_norm": 0.40884492792958027, + "learning_rate": 2.8341825766749635e-06, + "loss": 0.0731, + "step": 7754 + }, + { + "epoch": 0.65, + "grad_norm": 0.3268826758852914, + "learning_rate": 2.83295271827425e-06, + "loss": 0.0558, + "step": 7755 + }, + { + "epoch": 0.65, + "grad_norm": 0.3354642759847708, + "learning_rate": 2.83172302128784e-06, + "loss": 0.0969, + "step": 7756 + }, + { + "epoch": 0.65, + "grad_norm": 0.40071439121645547, + "learning_rate": 2.830493485807332e-06, + "loss": 0.0729, + "step": 7757 + }, + { + "epoch": 0.65, + "grad_norm": 0.29087465458835, + "learning_rate": 2.8292641119243074e-06, + "loss": 0.0641, + "step": 7758 + }, + { + "epoch": 0.65, + "grad_norm": 0.22827017699942953, + "learning_rate": 2.8280348997303376e-06, + "loss": 0.0623, + "step": 7759 + }, + { + "epoch": 0.65, + "grad_norm": 0.508477524648532, + "learning_rate": 2.8268058493169793e-06, + "loss": 0.0763, + "step": 7760 + }, + { + "epoch": 0.65, + "grad_norm": 0.465987828406582, + "learning_rate": 2.8255769607757826e-06, + "loss": 0.1108, + "step": 7761 + }, + { + "epoch": 0.65, + "grad_norm": 0.21670620629936793, + "learning_rate": 2.8243482341982807e-06, + "loss": 0.0503, + "step": 7762 + }, + { + "epoch": 0.65, + "grad_norm": 0.49318093587915685, + "learning_rate": 2.8231196696759956e-06, + "loss": 0.1072, + "step": 7763 + }, + { + "epoch": 0.65, + "grad_norm": 0.267304769734604, + "learning_rate": 2.8218912673004394e-06, + "loss": 0.0723, + "step": 7764 + }, + { + "epoch": 0.65, + "grad_norm": 0.3441234836373086, + "learning_rate": 2.8206630271631108e-06, + "loss": 0.0616, + "step": 7765 + }, + { + "epoch": 0.65, + "grad_norm": 0.2819235176435247, + "learning_rate": 2.8194349493554935e-06, + "loss": 0.0548, + "step": 7766 + }, + { + "epoch": 0.65, + "grad_norm": 0.26834172068321543, + "learning_rate": 2.818207033969066e-06, + "loss": 0.0559, + "step": 7767 + }, + { + "epoch": 0.65, + "grad_norm": 0.327223231205228, + "learning_rate": 2.816979281095288e-06, + "loss": 0.0577, + "step": 7768 + }, + { + "epoch": 0.65, + "grad_norm": 0.4358538600455522, + "learning_rate": 2.81575169082561e-06, + "loss": 0.0863, + "step": 7769 + }, + { + "epoch": 0.65, + "grad_norm": 0.22055821894129377, + "learning_rate": 2.814524263251469e-06, + "loss": 0.0477, + "step": 7770 + }, + { + "epoch": 0.65, + "grad_norm": 0.2574180088296187, + "learning_rate": 2.813296998464293e-06, + "loss": 0.0862, + "step": 7771 + }, + { + "epoch": 0.65, + "grad_norm": 0.3828656067902435, + "learning_rate": 2.8120698965554967e-06, + "loss": 0.1086, + "step": 7772 + }, + { + "epoch": 0.65, + "grad_norm": 0.2144904971797863, + "learning_rate": 2.8108429576164775e-06, + "loss": 0.0493, + "step": 7773 + }, + { + "epoch": 0.66, + "grad_norm": 0.5926798224449158, + "learning_rate": 2.809616181738629e-06, + "loss": 0.0884, + "step": 7774 + }, + { + "epoch": 0.66, + "grad_norm": 0.39327353425927253, + "learning_rate": 2.808389569013328e-06, + "loss": 0.121, + "step": 7775 + }, + { + "epoch": 0.66, + "grad_norm": 0.33730792100709706, + "learning_rate": 2.807163119531938e-06, + "loss": 0.1116, + "step": 7776 + }, + { + "epoch": 0.66, + "grad_norm": 0.2543890407827106, + "learning_rate": 2.805936833385812e-06, + "loss": 0.0796, + "step": 7777 + }, + { + "epoch": 0.66, + "grad_norm": 0.17976443099663517, + "learning_rate": 2.804710710666294e-06, + "loss": 0.0337, + "step": 7778 + }, + { + "epoch": 0.66, + "grad_norm": 0.23361126209148086, + "learning_rate": 2.80348475146471e-06, + "loss": 0.0402, + "step": 7779 + }, + { + "epoch": 0.66, + "grad_norm": 0.29027813613608794, + "learning_rate": 2.8022589558723763e-06, + "loss": 0.0675, + "step": 7780 + }, + { + "epoch": 0.66, + "grad_norm": 0.3977364594731456, + "learning_rate": 2.8010333239805997e-06, + "loss": 0.0785, + "step": 7781 + }, + { + "epoch": 0.66, + "grad_norm": 0.32304287409554566, + "learning_rate": 2.7998078558806712e-06, + "loss": 0.0812, + "step": 7782 + }, + { + "epoch": 0.66, + "grad_norm": 0.4061670729655189, + "learning_rate": 2.7985825516638685e-06, + "loss": 0.1093, + "step": 7783 + }, + { + "epoch": 0.66, + "grad_norm": 0.23842240679660404, + "learning_rate": 2.797357411421464e-06, + "loss": 0.0225, + "step": 7784 + }, + { + "epoch": 0.66, + "grad_norm": 0.2305189449582252, + "learning_rate": 2.7961324352447106e-06, + "loss": 0.0631, + "step": 7785 + }, + { + "epoch": 0.66, + "grad_norm": 0.45914660814205205, + "learning_rate": 2.7949076232248516e-06, + "loss": 0.0885, + "step": 7786 + }, + { + "epoch": 0.66, + "grad_norm": 0.17680099406509622, + "learning_rate": 2.7936829754531192e-06, + "loss": 0.0495, + "step": 7787 + }, + { + "epoch": 0.66, + "grad_norm": 0.2053245544012522, + "learning_rate": 2.7924584920207287e-06, + "loss": 0.0355, + "step": 7788 + }, + { + "epoch": 0.66, + "grad_norm": 0.3549145613488633, + "learning_rate": 2.791234173018892e-06, + "loss": 0.0618, + "step": 7789 + }, + { + "epoch": 0.66, + "grad_norm": 0.31950699897421087, + "learning_rate": 2.790010018538801e-06, + "loss": 0.0824, + "step": 7790 + }, + { + "epoch": 0.66, + "grad_norm": 0.3568178312543278, + "learning_rate": 2.7887860286716383e-06, + "loss": 0.0752, + "step": 7791 + }, + { + "epoch": 0.66, + "grad_norm": 0.42522926563767893, + "learning_rate": 2.7875622035085713e-06, + "loss": 0.0707, + "step": 7792 + }, + { + "epoch": 0.66, + "grad_norm": 0.2639977703192523, + "learning_rate": 2.786338543140762e-06, + "loss": 0.0784, + "step": 7793 + }, + { + "epoch": 0.66, + "grad_norm": 0.3697239805784935, + "learning_rate": 2.7851150476593523e-06, + "loss": 0.082, + "step": 7794 + }, + { + "epoch": 0.66, + "grad_norm": 0.3751595180440534, + "learning_rate": 2.783891717155478e-06, + "loss": 0.0393, + "step": 7795 + }, + { + "epoch": 0.66, + "grad_norm": 0.3198995296957473, + "learning_rate": 2.782668551720256e-06, + "loss": 0.1074, + "step": 7796 + }, + { + "epoch": 0.66, + "grad_norm": 0.5890423476203442, + "learning_rate": 2.781445551444799e-06, + "loss": 0.0895, + "step": 7797 + }, + { + "epoch": 0.66, + "grad_norm": 0.26591142817913277, + "learning_rate": 2.780222716420201e-06, + "loss": 0.0606, + "step": 7798 + }, + { + "epoch": 0.66, + "grad_norm": 0.4395782770188453, + "learning_rate": 2.7790000467375467e-06, + "loss": 0.104, + "step": 7799 + }, + { + "epoch": 0.66, + "grad_norm": 0.47830707903245, + "learning_rate": 2.777777542487905e-06, + "loss": 0.1229, + "step": 7800 + }, + { + "epoch": 0.66, + "grad_norm": 0.3234717114586146, + "learning_rate": 2.77655520376234e-06, + "loss": 0.0521, + "step": 7801 + }, + { + "epoch": 0.66, + "grad_norm": 0.1814886558484413, + "learning_rate": 2.775333030651895e-06, + "loss": 0.0559, + "step": 7802 + }, + { + "epoch": 0.66, + "grad_norm": 0.3798997246875799, + "learning_rate": 2.774111023247606e-06, + "loss": 0.1199, + "step": 7803 + }, + { + "epoch": 0.66, + "grad_norm": 0.5250078719229171, + "learning_rate": 2.7728891816404922e-06, + "loss": 0.1141, + "step": 7804 + }, + { + "epoch": 0.66, + "grad_norm": 0.3577445476544192, + "learning_rate": 2.771667505921569e-06, + "loss": 0.0804, + "step": 7805 + }, + { + "epoch": 0.66, + "grad_norm": 0.35737388718942253, + "learning_rate": 2.77044599618183e-06, + "loss": 0.0795, + "step": 7806 + }, + { + "epoch": 0.66, + "grad_norm": 0.4059075186476327, + "learning_rate": 2.7692246525122603e-06, + "loss": 0.1059, + "step": 7807 + }, + { + "epoch": 0.66, + "grad_norm": 0.31507643981614714, + "learning_rate": 2.7680034750038354e-06, + "loss": 0.0838, + "step": 7808 + }, + { + "epoch": 0.66, + "grad_norm": 0.27236420278011086, + "learning_rate": 2.7667824637475137e-06, + "loss": 0.045, + "step": 7809 + }, + { + "epoch": 0.66, + "grad_norm": 0.25166108048761, + "learning_rate": 2.7655616188342437e-06, + "loss": 0.0696, + "step": 7810 + }, + { + "epoch": 0.66, + "grad_norm": 0.31330747265961656, + "learning_rate": 2.7643409403549587e-06, + "loss": 0.1096, + "step": 7811 + }, + { + "epoch": 0.66, + "grad_norm": 0.12588044884692123, + "learning_rate": 2.7631204284005863e-06, + "loss": 0.025, + "step": 7812 + }, + { + "epoch": 0.66, + "grad_norm": 0.3687055559235135, + "learning_rate": 2.7619000830620347e-06, + "loss": 0.0777, + "step": 7813 + }, + { + "epoch": 0.66, + "grad_norm": 0.2656017790739315, + "learning_rate": 2.7606799044302013e-06, + "loss": 0.0646, + "step": 7814 + }, + { + "epoch": 0.66, + "grad_norm": 0.5328935801261846, + "learning_rate": 2.759459892595976e-06, + "loss": 0.1435, + "step": 7815 + }, + { + "epoch": 0.66, + "grad_norm": 0.351131237976295, + "learning_rate": 2.7582400476502293e-06, + "loss": 0.0767, + "step": 7816 + }, + { + "epoch": 0.66, + "grad_norm": 0.2699508551993985, + "learning_rate": 2.757020369683824e-06, + "loss": 0.0701, + "step": 7817 + }, + { + "epoch": 0.66, + "grad_norm": 0.2582939422333032, + "learning_rate": 2.755800858787605e-06, + "loss": 0.092, + "step": 7818 + }, + { + "epoch": 0.66, + "grad_norm": 0.2121190075548633, + "learning_rate": 2.7545815150524134e-06, + "loss": 0.0625, + "step": 7819 + }, + { + "epoch": 0.66, + "grad_norm": 0.3533572052613979, + "learning_rate": 2.7533623385690716e-06, + "loss": 0.096, + "step": 7820 + }, + { + "epoch": 0.66, + "grad_norm": 0.4350425122952263, + "learning_rate": 2.7521433294283884e-06, + "loss": 0.1098, + "step": 7821 + }, + { + "epoch": 0.66, + "grad_norm": 0.40485789113325527, + "learning_rate": 2.7509244877211666e-06, + "loss": 0.1183, + "step": 7822 + }, + { + "epoch": 0.66, + "grad_norm": 0.41907797938320884, + "learning_rate": 2.7497058135381903e-06, + "loss": 0.0969, + "step": 7823 + }, + { + "epoch": 0.66, + "grad_norm": 0.41703583241675096, + "learning_rate": 2.748487306970233e-06, + "loss": 0.1044, + "step": 7824 + }, + { + "epoch": 0.66, + "grad_norm": 0.3705024668974954, + "learning_rate": 2.747268968108059e-06, + "loss": 0.0806, + "step": 7825 + }, + { + "epoch": 0.66, + "grad_norm": 0.32594385451834257, + "learning_rate": 2.7460507970424143e-06, + "loss": 0.1168, + "step": 7826 + }, + { + "epoch": 0.66, + "grad_norm": 0.1816370898887897, + "learning_rate": 2.7448327938640374e-06, + "loss": 0.0627, + "step": 7827 + }, + { + "epoch": 0.66, + "grad_norm": 0.5533248363859766, + "learning_rate": 2.743614958663649e-06, + "loss": 0.0649, + "step": 7828 + }, + { + "epoch": 0.66, + "grad_norm": 0.29956056363042194, + "learning_rate": 2.7423972915319653e-06, + "loss": 0.0574, + "step": 7829 + }, + { + "epoch": 0.66, + "grad_norm": 0.4491376535673036, + "learning_rate": 2.7411797925596818e-06, + "loss": 0.1129, + "step": 7830 + }, + { + "epoch": 0.66, + "grad_norm": 0.2017419965726022, + "learning_rate": 2.739962461837485e-06, + "loss": 0.0509, + "step": 7831 + }, + { + "epoch": 0.66, + "grad_norm": 0.48957780482574104, + "learning_rate": 2.7387452994560517e-06, + "loss": 0.1025, + "step": 7832 + }, + { + "epoch": 0.66, + "grad_norm": 0.322230990206635, + "learning_rate": 2.7375283055060405e-06, + "loss": 0.1073, + "step": 7833 + }, + { + "epoch": 0.66, + "grad_norm": 0.2993773628773903, + "learning_rate": 2.736311480078101e-06, + "loss": 0.0782, + "step": 7834 + }, + { + "epoch": 0.66, + "grad_norm": 0.24678599431252085, + "learning_rate": 2.735094823262868e-06, + "loss": 0.0477, + "step": 7835 + }, + { + "epoch": 0.66, + "grad_norm": 0.6238278960056596, + "learning_rate": 2.733878335150969e-06, + "loss": 0.0616, + "step": 7836 + }, + { + "epoch": 0.66, + "grad_norm": 0.25164155024049567, + "learning_rate": 2.732662015833012e-06, + "loss": 0.0567, + "step": 7837 + }, + { + "epoch": 0.66, + "grad_norm": 0.3205467252916991, + "learning_rate": 2.7314458653995946e-06, + "loss": 0.0891, + "step": 7838 + }, + { + "epoch": 0.66, + "grad_norm": 0.3074138568856897, + "learning_rate": 2.7302298839413062e-06, + "loss": 0.0908, + "step": 7839 + }, + { + "epoch": 0.66, + "grad_norm": 0.277509210014652, + "learning_rate": 2.7290140715487185e-06, + "loss": 0.0896, + "step": 7840 + }, + { + "epoch": 0.66, + "grad_norm": 0.23960097747581732, + "learning_rate": 2.7277984283123906e-06, + "loss": 0.0755, + "step": 7841 + }, + { + "epoch": 0.66, + "grad_norm": 0.36162889905909923, + "learning_rate": 2.726582954322874e-06, + "loss": 0.0816, + "step": 7842 + }, + { + "epoch": 0.66, + "grad_norm": 0.26884981010338127, + "learning_rate": 2.725367649670702e-06, + "loss": 0.0698, + "step": 7843 + }, + { + "epoch": 0.66, + "grad_norm": 0.285403732899656, + "learning_rate": 2.7241525144463987e-06, + "loss": 0.0706, + "step": 7844 + }, + { + "epoch": 0.66, + "grad_norm": 0.4037689316534924, + "learning_rate": 2.722937548740472e-06, + "loss": 0.1042, + "step": 7845 + }, + { + "epoch": 0.66, + "grad_norm": 0.29525894508852857, + "learning_rate": 2.7217227526434237e-06, + "loss": 0.0759, + "step": 7846 + }, + { + "epoch": 0.66, + "grad_norm": 0.1708952397844576, + "learning_rate": 2.7205081262457366e-06, + "loss": 0.0417, + "step": 7847 + }, + { + "epoch": 0.66, + "grad_norm": 0.32597083290489226, + "learning_rate": 2.719293669637881e-06, + "loss": 0.0773, + "step": 7848 + }, + { + "epoch": 0.66, + "grad_norm": 0.24411555223661527, + "learning_rate": 2.7180793829103214e-06, + "loss": 0.0655, + "step": 7849 + }, + { + "epoch": 0.66, + "grad_norm": 0.3067400517368821, + "learning_rate": 2.716865266153502e-06, + "loss": 0.0734, + "step": 7850 + }, + { + "epoch": 0.66, + "grad_norm": 0.3299897782231123, + "learning_rate": 2.7156513194578584e-06, + "loss": 0.057, + "step": 7851 + }, + { + "epoch": 0.66, + "grad_norm": 0.28077228179291125, + "learning_rate": 2.714437542913809e-06, + "loss": 0.0672, + "step": 7852 + }, + { + "epoch": 0.66, + "grad_norm": 0.17620731993091054, + "learning_rate": 2.7132239366117684e-06, + "loss": 0.0521, + "step": 7853 + }, + { + "epoch": 0.66, + "grad_norm": 0.19853743567584312, + "learning_rate": 2.712010500642131e-06, + "loss": 0.0466, + "step": 7854 + }, + { + "epoch": 0.66, + "grad_norm": 0.3335095358961763, + "learning_rate": 2.7107972350952795e-06, + "loss": 0.0981, + "step": 7855 + }, + { + "epoch": 0.66, + "grad_norm": 0.48050156576743913, + "learning_rate": 2.709584140061583e-06, + "loss": 0.1453, + "step": 7856 + }, + { + "epoch": 0.66, + "grad_norm": 0.40536760072149425, + "learning_rate": 2.708371215631405e-06, + "loss": 0.0975, + "step": 7857 + }, + { + "epoch": 0.66, + "grad_norm": 0.35686882452031177, + "learning_rate": 2.7071584618950886e-06, + "loss": 0.0902, + "step": 7858 + }, + { + "epoch": 0.66, + "grad_norm": 0.19574659721337634, + "learning_rate": 2.705945878942967e-06, + "loss": 0.0587, + "step": 7859 + }, + { + "epoch": 0.66, + "grad_norm": 0.20187919848150349, + "learning_rate": 2.7047334668653574e-06, + "loss": 0.0525, + "step": 7860 + }, + { + "epoch": 0.66, + "grad_norm": 0.3547963410962722, + "learning_rate": 2.703521225752572e-06, + "loss": 0.0825, + "step": 7861 + }, + { + "epoch": 0.66, + "grad_norm": 0.4694460516804668, + "learning_rate": 2.7023091556949048e-06, + "loss": 0.0938, + "step": 7862 + }, + { + "epoch": 0.66, + "grad_norm": 0.3161041146096953, + "learning_rate": 2.7010972567826365e-06, + "loss": 0.0854, + "step": 7863 + }, + { + "epoch": 0.66, + "grad_norm": 0.31826629587531613, + "learning_rate": 2.699885529106034e-06, + "loss": 0.1001, + "step": 7864 + }, + { + "epoch": 0.66, + "grad_norm": 0.2752697481315665, + "learning_rate": 2.698673972755359e-06, + "loss": 0.0806, + "step": 7865 + }, + { + "epoch": 0.66, + "grad_norm": 0.4269416297414388, + "learning_rate": 2.697462587820852e-06, + "loss": 0.0866, + "step": 7866 + }, + { + "epoch": 0.66, + "grad_norm": 0.34594113775183505, + "learning_rate": 2.6962513743927454e-06, + "loss": 0.0821, + "step": 7867 + }, + { + "epoch": 0.66, + "grad_norm": 0.3188123008229541, + "learning_rate": 2.695040332561256e-06, + "loss": 0.0768, + "step": 7868 + }, + { + "epoch": 0.66, + "grad_norm": 0.29597609788007545, + "learning_rate": 2.693829462416589e-06, + "loss": 0.0697, + "step": 7869 + }, + { + "epoch": 0.66, + "grad_norm": 0.23628018380265178, + "learning_rate": 2.692618764048939e-06, + "loss": 0.0619, + "step": 7870 + }, + { + "epoch": 0.66, + "grad_norm": 0.4242817761984848, + "learning_rate": 2.6914082375484863e-06, + "loss": 0.112, + "step": 7871 + }, + { + "epoch": 0.66, + "grad_norm": 0.35032744474092903, + "learning_rate": 2.690197883005393e-06, + "loss": 0.0639, + "step": 7872 + }, + { + "epoch": 0.66, + "grad_norm": 0.2634822542471033, + "learning_rate": 2.68898770050982e-06, + "loss": 0.0438, + "step": 7873 + }, + { + "epoch": 0.66, + "grad_norm": 0.4410345109038879, + "learning_rate": 2.687777690151906e-06, + "loss": 0.1062, + "step": 7874 + }, + { + "epoch": 0.66, + "grad_norm": 0.29247298539355265, + "learning_rate": 2.6865678520217765e-06, + "loss": 0.0693, + "step": 7875 + }, + { + "epoch": 0.66, + "grad_norm": 0.5402063674634102, + "learning_rate": 2.6853581862095524e-06, + "loss": 0.099, + "step": 7876 + }, + { + "epoch": 0.66, + "grad_norm": 0.1725202969911037, + "learning_rate": 2.684148692805335e-06, + "loss": 0.0723, + "step": 7877 + }, + { + "epoch": 0.66, + "grad_norm": 0.2177217934113603, + "learning_rate": 2.682939371899213e-06, + "loss": 0.0622, + "step": 7878 + }, + { + "epoch": 0.66, + "grad_norm": 0.23646372464505894, + "learning_rate": 2.6817302235812625e-06, + "loss": 0.0636, + "step": 7879 + }, + { + "epoch": 0.66, + "grad_norm": 0.3756749996670768, + "learning_rate": 2.680521247941552e-06, + "loss": 0.0995, + "step": 7880 + }, + { + "epoch": 0.66, + "grad_norm": 0.47637620095847244, + "learning_rate": 2.6793124450701314e-06, + "loss": 0.1053, + "step": 7881 + }, + { + "epoch": 0.66, + "grad_norm": 0.2656782541565853, + "learning_rate": 2.678103815057036e-06, + "loss": 0.0471, + "step": 7882 + }, + { + "epoch": 0.66, + "grad_norm": 0.3342829250824342, + "learning_rate": 2.6768953579922976e-06, + "loss": 0.073, + "step": 7883 + }, + { + "epoch": 0.66, + "grad_norm": 0.27171851169235217, + "learning_rate": 2.6756870739659247e-06, + "loss": 0.0741, + "step": 7884 + }, + { + "epoch": 0.66, + "grad_norm": 0.18468439560027985, + "learning_rate": 2.6744789630679198e-06, + "loss": 0.0491, + "step": 7885 + }, + { + "epoch": 0.66, + "grad_norm": 0.18093937977521382, + "learning_rate": 2.6732710253882665e-06, + "loss": 0.051, + "step": 7886 + }, + { + "epoch": 0.66, + "grad_norm": 0.44470270813498286, + "learning_rate": 2.6720632610169426e-06, + "loss": 0.0852, + "step": 7887 + }, + { + "epoch": 0.66, + "grad_norm": 0.1773735011019568, + "learning_rate": 2.6708556700439094e-06, + "loss": 0.0438, + "step": 7888 + }, + { + "epoch": 0.66, + "grad_norm": 0.2064305264420676, + "learning_rate": 2.6696482525591116e-06, + "loss": 0.0488, + "step": 7889 + }, + { + "epoch": 0.66, + "grad_norm": 0.5828310543377022, + "learning_rate": 2.6684410086524896e-06, + "loss": 0.0909, + "step": 7890 + }, + { + "epoch": 0.66, + "grad_norm": 0.2751868848328496, + "learning_rate": 2.6672339384139634e-06, + "loss": 0.0484, + "step": 7891 + }, + { + "epoch": 0.66, + "grad_norm": 0.49140815995877346, + "learning_rate": 2.666027041933441e-06, + "loss": 0.1199, + "step": 7892 + }, + { + "epoch": 0.67, + "grad_norm": 0.3817822901672961, + "learning_rate": 2.6648203193008225e-06, + "loss": 0.0905, + "step": 7893 + }, + { + "epoch": 0.67, + "grad_norm": 0.3125312302618564, + "learning_rate": 2.6636137706059906e-06, + "loss": 0.0552, + "step": 7894 + }, + { + "epoch": 0.67, + "grad_norm": 0.2995485084664994, + "learning_rate": 2.6624073959388153e-06, + "loss": 0.0932, + "step": 7895 + }, + { + "epoch": 0.67, + "grad_norm": 0.28106045259664436, + "learning_rate": 2.6612011953891526e-06, + "loss": 0.0807, + "step": 7896 + }, + { + "epoch": 0.67, + "grad_norm": 0.22649314143287447, + "learning_rate": 2.6599951690468516e-06, + "loss": 0.0649, + "step": 7897 + }, + { + "epoch": 0.67, + "grad_norm": 0.2602532925856686, + "learning_rate": 2.658789317001742e-06, + "loss": 0.0284, + "step": 7898 + }, + { + "epoch": 0.67, + "grad_norm": 0.25984306927463574, + "learning_rate": 2.6575836393436407e-06, + "loss": 0.067, + "step": 7899 + }, + { + "epoch": 0.67, + "grad_norm": 0.24044049077688343, + "learning_rate": 2.6563781361623563e-06, + "loss": 0.0779, + "step": 7900 + }, + { + "epoch": 0.67, + "grad_norm": 0.2551521840030982, + "learning_rate": 2.6551728075476823e-06, + "loss": 0.0522, + "step": 7901 + }, + { + "epoch": 0.67, + "grad_norm": 0.3096010478802843, + "learning_rate": 2.6539676535893967e-06, + "loss": 0.0829, + "step": 7902 + }, + { + "epoch": 0.67, + "grad_norm": 0.20417451457778238, + "learning_rate": 2.652762674377265e-06, + "loss": 0.0576, + "step": 7903 + }, + { + "epoch": 0.67, + "grad_norm": 0.29062435408955417, + "learning_rate": 2.651557870001045e-06, + "loss": 0.0418, + "step": 7904 + }, + { + "epoch": 0.67, + "grad_norm": 0.2628041823285726, + "learning_rate": 2.650353240550475e-06, + "loss": 0.0581, + "step": 7905 + }, + { + "epoch": 0.67, + "grad_norm": 0.37303414114705163, + "learning_rate": 2.649148786115282e-06, + "loss": 0.1083, + "step": 7906 + }, + { + "epoch": 0.67, + "grad_norm": 0.30263834980057, + "learning_rate": 2.647944506785184e-06, + "loss": 0.1086, + "step": 7907 + }, + { + "epoch": 0.67, + "grad_norm": 0.263391040234311, + "learning_rate": 2.6467404026498803e-06, + "loss": 0.0674, + "step": 7908 + }, + { + "epoch": 0.67, + "grad_norm": 0.19306731159863216, + "learning_rate": 2.645536473799059e-06, + "loss": 0.0579, + "step": 7909 + }, + { + "epoch": 0.67, + "grad_norm": 0.4793996899831052, + "learning_rate": 2.644332720322399e-06, + "loss": 0.1167, + "step": 7910 + }, + { + "epoch": 0.67, + "grad_norm": 0.503266095195825, + "learning_rate": 2.6431291423095594e-06, + "loss": 0.1261, + "step": 7911 + }, + { + "epoch": 0.67, + "grad_norm": 0.35496053492939644, + "learning_rate": 2.6419257398501923e-06, + "loss": 0.1046, + "step": 7912 + }, + { + "epoch": 0.67, + "grad_norm": 0.49066334216129187, + "learning_rate": 2.640722513033931e-06, + "loss": 0.0974, + "step": 7913 + }, + { + "epoch": 0.67, + "grad_norm": 0.43497272531607867, + "learning_rate": 2.6395194619504024e-06, + "loss": 0.0759, + "step": 7914 + }, + { + "epoch": 0.67, + "grad_norm": 0.29366808496240576, + "learning_rate": 2.6383165866892146e-06, + "loss": 0.0611, + "step": 7915 + }, + { + "epoch": 0.67, + "grad_norm": 0.2525665534002413, + "learning_rate": 2.6371138873399637e-06, + "loss": 0.0414, + "step": 7916 + }, + { + "epoch": 0.67, + "grad_norm": 0.2448144855770156, + "learning_rate": 2.6359113639922375e-06, + "loss": 0.067, + "step": 7917 + }, + { + "epoch": 0.67, + "grad_norm": 0.3867596272315621, + "learning_rate": 2.634709016735604e-06, + "loss": 0.1151, + "step": 7918 + }, + { + "epoch": 0.67, + "grad_norm": 0.3210786728905828, + "learning_rate": 2.6335068456596223e-06, + "loss": 0.0727, + "step": 7919 + }, + { + "epoch": 0.67, + "grad_norm": 0.2020164693887553, + "learning_rate": 2.6323048508538353e-06, + "loss": 0.0433, + "step": 7920 + }, + { + "epoch": 0.67, + "grad_norm": 0.4044527713005456, + "learning_rate": 2.631103032407777e-06, + "loss": 0.1029, + "step": 7921 + }, + { + "epoch": 0.67, + "grad_norm": 0.5106797358987005, + "learning_rate": 2.6299013904109645e-06, + "loss": 0.0602, + "step": 7922 + }, + { + "epoch": 0.67, + "grad_norm": 0.36700643800471494, + "learning_rate": 2.6286999249529032e-06, + "loss": 0.1165, + "step": 7923 + }, + { + "epoch": 0.67, + "grad_norm": 0.3385525640357685, + "learning_rate": 2.6274986361230836e-06, + "loss": 0.1096, + "step": 7924 + }, + { + "epoch": 0.67, + "grad_norm": 0.28467652376479996, + "learning_rate": 2.626297524010989e-06, + "loss": 0.0641, + "step": 7925 + }, + { + "epoch": 0.67, + "grad_norm": 0.23735562169568133, + "learning_rate": 2.625096588706082e-06, + "loss": 0.0313, + "step": 7926 + }, + { + "epoch": 0.67, + "grad_norm": 0.22092309069410704, + "learning_rate": 2.6238958302978163e-06, + "loss": 0.077, + "step": 7927 + }, + { + "epoch": 0.67, + "grad_norm": 0.7322307021203499, + "learning_rate": 2.6226952488756284e-06, + "loss": 0.1426, + "step": 7928 + }, + { + "epoch": 0.67, + "grad_norm": 0.16958910564242013, + "learning_rate": 2.62149484452895e-06, + "loss": 0.0436, + "step": 7929 + }, + { + "epoch": 0.67, + "grad_norm": 0.5693882229237957, + "learning_rate": 2.620294617347191e-06, + "loss": 0.1399, + "step": 7930 + }, + { + "epoch": 0.67, + "grad_norm": 0.32913931125206497, + "learning_rate": 2.6190945674197513e-06, + "loss": 0.107, + "step": 7931 + }, + { + "epoch": 0.67, + "grad_norm": 0.3925179355450706, + "learning_rate": 2.617894694836017e-06, + "loss": 0.113, + "step": 7932 + }, + { + "epoch": 0.67, + "grad_norm": 0.3578974717149245, + "learning_rate": 2.616694999685364e-06, + "loss": 0.1156, + "step": 7933 + }, + { + "epoch": 0.67, + "grad_norm": 0.38182302238121263, + "learning_rate": 2.615495482057151e-06, + "loss": 0.1016, + "step": 7934 + }, + { + "epoch": 0.67, + "grad_norm": 0.7766674072775798, + "learning_rate": 2.6142961420407252e-06, + "loss": 0.1739, + "step": 7935 + }, + { + "epoch": 0.67, + "grad_norm": 0.39578034441327137, + "learning_rate": 2.6130969797254212e-06, + "loss": 0.1194, + "step": 7936 + }, + { + "epoch": 0.67, + "grad_norm": 0.3100158836957796, + "learning_rate": 2.611897995200556e-06, + "loss": 0.0942, + "step": 7937 + }, + { + "epoch": 0.67, + "grad_norm": 0.28223776458272143, + "learning_rate": 2.610699188555442e-06, + "loss": 0.0775, + "step": 7938 + }, + { + "epoch": 0.67, + "grad_norm": 0.351301666317223, + "learning_rate": 2.609500559879372e-06, + "loss": 0.0881, + "step": 7939 + }, + { + "epoch": 0.67, + "grad_norm": 0.28547956647131745, + "learning_rate": 2.6083021092616235e-06, + "loss": 0.091, + "step": 7940 + }, + { + "epoch": 0.67, + "grad_norm": 0.1851543090787496, + "learning_rate": 2.607103836791468e-06, + "loss": 0.0452, + "step": 7941 + }, + { + "epoch": 0.67, + "grad_norm": 0.22685978835876186, + "learning_rate": 2.605905742558159e-06, + "loss": 0.0562, + "step": 7942 + }, + { + "epoch": 0.67, + "grad_norm": 0.36176116326103597, + "learning_rate": 2.6047078266509372e-06, + "loss": 0.096, + "step": 7943 + }, + { + "epoch": 0.67, + "grad_norm": 0.2691616197366879, + "learning_rate": 2.6035100891590277e-06, + "loss": 0.0503, + "step": 7944 + }, + { + "epoch": 0.67, + "grad_norm": 0.3631583931559883, + "learning_rate": 2.602312530171649e-06, + "loss": 0.0941, + "step": 7945 + }, + { + "epoch": 0.67, + "grad_norm": 0.4191126518876571, + "learning_rate": 2.601115149778001e-06, + "loss": 0.1266, + "step": 7946 + }, + { + "epoch": 0.67, + "grad_norm": 0.2590201740525622, + "learning_rate": 2.5999179480672694e-06, + "loss": 0.0896, + "step": 7947 + }, + { + "epoch": 0.67, + "grad_norm": 0.2377390387611689, + "learning_rate": 2.5987209251286323e-06, + "loss": 0.0619, + "step": 7948 + }, + { + "epoch": 0.67, + "grad_norm": 0.46279872092274565, + "learning_rate": 2.597524081051249e-06, + "loss": 0.0912, + "step": 7949 + }, + { + "epoch": 0.67, + "grad_norm": 0.2934251277913643, + "learning_rate": 2.596327415924266e-06, + "loss": 0.0647, + "step": 7950 + }, + { + "epoch": 0.67, + "grad_norm": 0.5327006757661311, + "learning_rate": 2.595130929836821e-06, + "loss": 0.147, + "step": 7951 + }, + { + "epoch": 0.67, + "grad_norm": 0.4739423815248122, + "learning_rate": 2.593934622878034e-06, + "loss": 0.0582, + "step": 7952 + }, + { + "epoch": 0.67, + "grad_norm": 0.44691232993665647, + "learning_rate": 2.592738495137013e-06, + "loss": 0.0814, + "step": 7953 + }, + { + "epoch": 0.67, + "grad_norm": 0.4520183714963539, + "learning_rate": 2.5915425467028498e-06, + "loss": 0.1168, + "step": 7954 + }, + { + "epoch": 0.67, + "grad_norm": 0.28150655015612336, + "learning_rate": 2.5903467776646297e-06, + "loss": 0.0945, + "step": 7955 + }, + { + "epoch": 0.67, + "grad_norm": 0.42923237418568994, + "learning_rate": 2.5891511881114197e-06, + "loss": 0.1267, + "step": 7956 + }, + { + "epoch": 0.67, + "grad_norm": 0.22550780075808677, + "learning_rate": 2.5879557781322705e-06, + "loss": 0.0406, + "step": 7957 + }, + { + "epoch": 0.67, + "grad_norm": 0.21929245027387836, + "learning_rate": 2.586760547816229e-06, + "loss": 0.0618, + "step": 7958 + }, + { + "epoch": 0.67, + "grad_norm": 0.21636772461612827, + "learning_rate": 2.5855654972523196e-06, + "loss": 0.064, + "step": 7959 + }, + { + "epoch": 0.67, + "grad_norm": 0.3661217157205062, + "learning_rate": 2.5843706265295575e-06, + "loss": 0.0972, + "step": 7960 + }, + { + "epoch": 0.67, + "grad_norm": 0.2097512488024827, + "learning_rate": 2.583175935736941e-06, + "loss": 0.0418, + "step": 7961 + }, + { + "epoch": 0.67, + "grad_norm": 0.36329489770450357, + "learning_rate": 2.581981424963461e-06, + "loss": 0.0927, + "step": 7962 + }, + { + "epoch": 0.67, + "grad_norm": 0.41026926072776804, + "learning_rate": 2.5807870942980916e-06, + "loss": 0.1117, + "step": 7963 + }, + { + "epoch": 0.67, + "grad_norm": 0.35886973747634987, + "learning_rate": 2.57959294382979e-06, + "loss": 0.0965, + "step": 7964 + }, + { + "epoch": 0.67, + "grad_norm": 0.29296103527102124, + "learning_rate": 2.5783989736475077e-06, + "loss": 0.0602, + "step": 7965 + }, + { + "epoch": 0.67, + "grad_norm": 0.3690539566551583, + "learning_rate": 2.577205183840177e-06, + "loss": 0.1113, + "step": 7966 + }, + { + "epoch": 0.67, + "grad_norm": 0.325298871932655, + "learning_rate": 2.576011574496716e-06, + "loss": 0.0886, + "step": 7967 + }, + { + "epoch": 0.67, + "grad_norm": 0.29421768994824826, + "learning_rate": 2.574818145706036e-06, + "loss": 0.0815, + "step": 7968 + }, + { + "epoch": 0.67, + "grad_norm": 0.25158215701713493, + "learning_rate": 2.5736248975570277e-06, + "loss": 0.0941, + "step": 7969 + }, + { + "epoch": 0.67, + "grad_norm": 0.33778250328556053, + "learning_rate": 2.5724318301385714e-06, + "loss": 0.1375, + "step": 7970 + }, + { + "epoch": 0.67, + "grad_norm": 0.433937841332717, + "learning_rate": 2.5712389435395324e-06, + "loss": 0.0812, + "step": 7971 + }, + { + "epoch": 0.67, + "grad_norm": 0.3119580847235038, + "learning_rate": 2.570046237848768e-06, + "loss": 0.0917, + "step": 7972 + }, + { + "epoch": 0.67, + "grad_norm": 0.2953838538271283, + "learning_rate": 2.5688537131551144e-06, + "loss": 0.0653, + "step": 7973 + }, + { + "epoch": 0.67, + "grad_norm": 0.23440058943056358, + "learning_rate": 2.567661369547397e-06, + "loss": 0.0741, + "step": 7974 + }, + { + "epoch": 0.67, + "grad_norm": 0.4332995510552151, + "learning_rate": 2.5664692071144326e-06, + "loss": 0.1031, + "step": 7975 + }, + { + "epoch": 0.67, + "grad_norm": 0.4160646948135601, + "learning_rate": 2.5652772259450174e-06, + "loss": 0.1167, + "step": 7976 + }, + { + "epoch": 0.67, + "grad_norm": 0.19382790359075167, + "learning_rate": 2.5640854261279384e-06, + "loss": 0.0606, + "step": 7977 + }, + { + "epoch": 0.67, + "grad_norm": 0.27681001395100363, + "learning_rate": 2.562893807751965e-06, + "loss": 0.0837, + "step": 7978 + }, + { + "epoch": 0.67, + "grad_norm": 0.2998379481154383, + "learning_rate": 2.5617023709058597e-06, + "loss": 0.0537, + "step": 7979 + }, + { + "epoch": 0.67, + "grad_norm": 0.4277746405934601, + "learning_rate": 2.5605111156783663e-06, + "loss": 0.1055, + "step": 7980 + }, + { + "epoch": 0.67, + "grad_norm": 0.36874101247388635, + "learning_rate": 2.559320042158214e-06, + "loss": 0.1018, + "step": 7981 + }, + { + "epoch": 0.67, + "grad_norm": 0.3061749805245895, + "learning_rate": 2.558129150434125e-06, + "loss": 0.0684, + "step": 7982 + }, + { + "epoch": 0.67, + "grad_norm": 0.4120718525427058, + "learning_rate": 2.5569384405948015e-06, + "loss": 0.0877, + "step": 7983 + }, + { + "epoch": 0.67, + "grad_norm": 0.28961813172953005, + "learning_rate": 2.555747912728933e-06, + "loss": 0.0528, + "step": 7984 + }, + { + "epoch": 0.67, + "grad_norm": 0.255214812442905, + "learning_rate": 2.554557566925201e-06, + "loss": 0.0538, + "step": 7985 + }, + { + "epoch": 0.67, + "grad_norm": 0.23039480923391234, + "learning_rate": 2.5533674032722665e-06, + "loss": 0.0865, + "step": 7986 + }, + { + "epoch": 0.67, + "grad_norm": 0.22432749672141716, + "learning_rate": 2.5521774218587813e-06, + "loss": 0.0472, + "step": 7987 + }, + { + "epoch": 0.67, + "grad_norm": 0.39131026170360134, + "learning_rate": 2.5509876227733788e-06, + "loss": 0.0822, + "step": 7988 + }, + { + "epoch": 0.67, + "grad_norm": 0.22949298276261443, + "learning_rate": 2.549798006104687e-06, + "loss": 0.0746, + "step": 7989 + }, + { + "epoch": 0.67, + "grad_norm": 0.2730336172824576, + "learning_rate": 2.5486085719413127e-06, + "loss": 0.078, + "step": 7990 + }, + { + "epoch": 0.67, + "grad_norm": 0.5531843830311298, + "learning_rate": 2.5474193203718536e-06, + "loss": 0.1044, + "step": 7991 + }, + { + "epoch": 0.67, + "grad_norm": 0.32602872359575064, + "learning_rate": 2.5462302514848874e-06, + "loss": 0.0774, + "step": 7992 + }, + { + "epoch": 0.67, + "grad_norm": 0.3356118578801749, + "learning_rate": 2.5450413653689885e-06, + "loss": 0.0889, + "step": 7993 + }, + { + "epoch": 0.67, + "grad_norm": 0.15671592888727345, + "learning_rate": 2.5438526621127108e-06, + "loss": 0.0318, + "step": 7994 + }, + { + "epoch": 0.67, + "grad_norm": 0.24736936035823426, + "learning_rate": 2.5426641418045946e-06, + "loss": 0.0605, + "step": 7995 + }, + { + "epoch": 0.67, + "grad_norm": 0.20120185186239548, + "learning_rate": 2.5414758045331654e-06, + "loss": 0.0697, + "step": 7996 + }, + { + "epoch": 0.67, + "grad_norm": 0.21887979379456166, + "learning_rate": 2.5402876503869432e-06, + "loss": 0.0485, + "step": 7997 + }, + { + "epoch": 0.67, + "grad_norm": 0.24814881072050332, + "learning_rate": 2.5390996794544253e-06, + "loss": 0.0498, + "step": 7998 + }, + { + "epoch": 0.67, + "grad_norm": 0.31650711019459493, + "learning_rate": 2.5379118918241e-06, + "loss": 0.0966, + "step": 7999 + }, + { + "epoch": 0.67, + "grad_norm": 0.34400013619979486, + "learning_rate": 2.5367242875844365e-06, + "loss": 0.075, + "step": 8000 + }, + { + "epoch": 0.67, + "grad_norm": 0.48691739006600604, + "learning_rate": 2.5355368668239e-06, + "loss": 0.107, + "step": 8001 + }, + { + "epoch": 0.67, + "grad_norm": 0.2548134432087641, + "learning_rate": 2.5343496296309345e-06, + "loss": 0.0479, + "step": 8002 + }, + { + "epoch": 0.67, + "grad_norm": 0.37552813887767184, + "learning_rate": 2.5331625760939715e-06, + "loss": 0.0976, + "step": 8003 + }, + { + "epoch": 0.67, + "grad_norm": 0.2837505775068292, + "learning_rate": 2.5319757063014305e-06, + "loss": 0.0825, + "step": 8004 + }, + { + "epoch": 0.67, + "grad_norm": 0.2809265562096386, + "learning_rate": 2.5307890203417142e-06, + "loss": 0.1217, + "step": 8005 + }, + { + "epoch": 0.67, + "grad_norm": 0.18501569087690484, + "learning_rate": 2.529602518303218e-06, + "loss": 0.0518, + "step": 8006 + }, + { + "epoch": 0.67, + "grad_norm": 0.3161016690811233, + "learning_rate": 2.5284162002743172e-06, + "loss": 0.0937, + "step": 8007 + }, + { + "epoch": 0.67, + "grad_norm": 0.247218317365687, + "learning_rate": 2.527230066343374e-06, + "loss": 0.0768, + "step": 8008 + }, + { + "epoch": 0.67, + "grad_norm": 0.3240391471159267, + "learning_rate": 2.5260441165987426e-06, + "loss": 0.0919, + "step": 8009 + }, + { + "epoch": 0.67, + "grad_norm": 0.42817202855167735, + "learning_rate": 2.524858351128757e-06, + "loss": 0.1394, + "step": 8010 + }, + { + "epoch": 0.68, + "grad_norm": 0.2516167682932726, + "learning_rate": 2.52367277002174e-06, + "loss": 0.0602, + "step": 8011 + }, + { + "epoch": 0.68, + "grad_norm": 0.4512517759269975, + "learning_rate": 2.5224873733659994e-06, + "loss": 0.0806, + "step": 8012 + }, + { + "epoch": 0.68, + "grad_norm": 0.3919637499405157, + "learning_rate": 2.5213021612498332e-06, + "loss": 0.0766, + "step": 8013 + }, + { + "epoch": 0.68, + "grad_norm": 0.18140553394668607, + "learning_rate": 2.5201171337615216e-06, + "loss": 0.0589, + "step": 8014 + }, + { + "epoch": 0.68, + "grad_norm": 0.12805926326406425, + "learning_rate": 2.51893229098933e-06, + "loss": 0.032, + "step": 8015 + }, + { + "epoch": 0.68, + "grad_norm": 0.20761169057330878, + "learning_rate": 2.517747633021517e-06, + "loss": 0.0453, + "step": 8016 + }, + { + "epoch": 0.68, + "grad_norm": 0.2894182093207002, + "learning_rate": 2.516563159946321e-06, + "loss": 0.0943, + "step": 8017 + }, + { + "epoch": 0.68, + "grad_norm": 0.20635762756910017, + "learning_rate": 2.5153788718519654e-06, + "loss": 0.0468, + "step": 8018 + }, + { + "epoch": 0.68, + "grad_norm": 0.2833224135851099, + "learning_rate": 2.514194768826667e-06, + "loss": 0.0584, + "step": 8019 + }, + { + "epoch": 0.68, + "grad_norm": 0.25421539164092294, + "learning_rate": 2.5130108509586237e-06, + "loss": 0.0701, + "step": 8020 + }, + { + "epoch": 0.68, + "grad_norm": 0.1964257456031457, + "learning_rate": 2.51182711833602e-06, + "loss": 0.0507, + "step": 8021 + }, + { + "epoch": 0.68, + "grad_norm": 0.26858305410470207, + "learning_rate": 2.510643571047025e-06, + "loss": 0.0605, + "step": 8022 + }, + { + "epoch": 0.68, + "grad_norm": 0.5159884721712533, + "learning_rate": 2.5094602091798004e-06, + "loss": 0.1366, + "step": 8023 + }, + { + "epoch": 0.68, + "grad_norm": 0.4199788142649705, + "learning_rate": 2.508277032822488e-06, + "loss": 0.1063, + "step": 8024 + }, + { + "epoch": 0.68, + "grad_norm": 0.19251075852326502, + "learning_rate": 2.5070940420632152e-06, + "loss": 0.0311, + "step": 8025 + }, + { + "epoch": 0.68, + "grad_norm": 0.26765056855751096, + "learning_rate": 2.505911236990103e-06, + "loss": 0.073, + "step": 8026 + }, + { + "epoch": 0.68, + "grad_norm": 0.3075561279589553, + "learning_rate": 2.5047286176912503e-06, + "loss": 0.0947, + "step": 8027 + }, + { + "epoch": 0.68, + "grad_norm": 0.26649119599091714, + "learning_rate": 2.503546184254747e-06, + "loss": 0.0701, + "step": 8028 + }, + { + "epoch": 0.68, + "grad_norm": 0.29336334366498784, + "learning_rate": 2.502363936768664e-06, + "loss": 0.0799, + "step": 8029 + }, + { + "epoch": 0.68, + "grad_norm": 0.22747500099257587, + "learning_rate": 2.501181875321067e-06, + "loss": 0.087, + "step": 8030 + }, + { + "epoch": 0.68, + "grad_norm": 0.20828914005863092, + "learning_rate": 2.5000000000000015e-06, + "loss": 0.0627, + "step": 8031 + }, + { + "epoch": 0.68, + "grad_norm": 0.754524687709029, + "learning_rate": 2.4988183108934968e-06, + "loss": 0.1885, + "step": 8032 + }, + { + "epoch": 0.68, + "grad_norm": 0.3497416774320822, + "learning_rate": 2.497636808089577e-06, + "loss": 0.0929, + "step": 8033 + }, + { + "epoch": 0.68, + "grad_norm": 0.5020016499156276, + "learning_rate": 2.4964554916762446e-06, + "loss": 0.1353, + "step": 8034 + }, + { + "epoch": 0.68, + "grad_norm": 0.333233957396391, + "learning_rate": 2.49527436174149e-06, + "loss": 0.0955, + "step": 8035 + }, + { + "epoch": 0.68, + "grad_norm": 0.4411361957689018, + "learning_rate": 2.4940934183732934e-06, + "loss": 0.0812, + "step": 8036 + }, + { + "epoch": 0.68, + "grad_norm": 0.19724742402346213, + "learning_rate": 2.492912661659617e-06, + "loss": 0.0377, + "step": 8037 + }, + { + "epoch": 0.68, + "grad_norm": 0.5159963894659594, + "learning_rate": 2.4917320916884115e-06, + "loss": 0.0791, + "step": 8038 + }, + { + "epoch": 0.68, + "grad_norm": 0.2533603551358582, + "learning_rate": 2.490551708547609e-06, + "loss": 0.0805, + "step": 8039 + }, + { + "epoch": 0.68, + "grad_norm": 0.2832465978797984, + "learning_rate": 2.4893715123251355e-06, + "loss": 0.0644, + "step": 8040 + }, + { + "epoch": 0.68, + "grad_norm": 0.3846127429227038, + "learning_rate": 2.488191503108897e-06, + "loss": 0.0798, + "step": 8041 + }, + { + "epoch": 0.68, + "grad_norm": 0.32691562379721095, + "learning_rate": 2.4870116809867867e-06, + "loss": 0.0868, + "step": 8042 + }, + { + "epoch": 0.68, + "grad_norm": 0.44984869424054613, + "learning_rate": 2.4858320460466867e-06, + "loss": 0.0904, + "step": 8043 + }, + { + "epoch": 0.68, + "grad_norm": 0.266892990111272, + "learning_rate": 2.4846525983764624e-06, + "loss": 0.0493, + "step": 8044 + }, + { + "epoch": 0.68, + "grad_norm": 0.23035150180528954, + "learning_rate": 2.483473338063966e-06, + "loss": 0.0765, + "step": 8045 + }, + { + "epoch": 0.68, + "grad_norm": 0.2485047735744902, + "learning_rate": 2.4822942651970328e-06, + "loss": 0.0852, + "step": 8046 + }, + { + "epoch": 0.68, + "grad_norm": 0.27373478047833266, + "learning_rate": 2.4811153798634915e-06, + "loss": 0.0798, + "step": 8047 + }, + { + "epoch": 0.68, + "grad_norm": 0.3131649586130334, + "learning_rate": 2.47993668215115e-06, + "loss": 0.0993, + "step": 8048 + }, + { + "epoch": 0.68, + "grad_norm": 0.6641047689593959, + "learning_rate": 2.478758172147803e-06, + "loss": 0.1024, + "step": 8049 + }, + { + "epoch": 0.68, + "grad_norm": 0.31715245648287627, + "learning_rate": 2.477579849941237e-06, + "loss": 0.0771, + "step": 8050 + }, + { + "epoch": 0.68, + "grad_norm": 0.41974305016893426, + "learning_rate": 2.4764017156192176e-06, + "loss": 0.1167, + "step": 8051 + }, + { + "epoch": 0.68, + "grad_norm": 0.2658635147860032, + "learning_rate": 2.4752237692695e-06, + "loss": 0.067, + "step": 8052 + }, + { + "epoch": 0.68, + "grad_norm": 0.3075842019827212, + "learning_rate": 2.4740460109798216e-06, + "loss": 0.0796, + "step": 8053 + }, + { + "epoch": 0.68, + "grad_norm": 0.18232864950809874, + "learning_rate": 2.472868440837913e-06, + "loss": 0.0332, + "step": 8054 + }, + { + "epoch": 0.68, + "grad_norm": 0.2668748606444816, + "learning_rate": 2.471691058931484e-06, + "loss": 0.0637, + "step": 8055 + }, + { + "epoch": 0.68, + "grad_norm": 0.40018004293384646, + "learning_rate": 2.4705138653482323e-06, + "loss": 0.131, + "step": 8056 + }, + { + "epoch": 0.68, + "grad_norm": 0.2976738692723799, + "learning_rate": 2.469336860175845e-06, + "loss": 0.1032, + "step": 8057 + }, + { + "epoch": 0.68, + "grad_norm": 0.21767646028944168, + "learning_rate": 2.46816004350199e-06, + "loss": 0.0476, + "step": 8058 + }, + { + "epoch": 0.68, + "grad_norm": 0.2313481084279434, + "learning_rate": 2.4669834154143252e-06, + "loss": 0.0605, + "step": 8059 + }, + { + "epoch": 0.68, + "grad_norm": 0.22083637562350358, + "learning_rate": 2.465806976000489e-06, + "loss": 0.0842, + "step": 8060 + }, + { + "epoch": 0.68, + "grad_norm": 0.19149052807894268, + "learning_rate": 2.464630725348114e-06, + "loss": 0.0409, + "step": 8061 + }, + { + "epoch": 0.68, + "grad_norm": 0.2809956347253728, + "learning_rate": 2.463454663544812e-06, + "loss": 0.0588, + "step": 8062 + }, + { + "epoch": 0.68, + "grad_norm": 0.3466644469789324, + "learning_rate": 2.462278790678183e-06, + "loss": 0.0929, + "step": 8063 + }, + { + "epoch": 0.68, + "grad_norm": 0.27576839404603953, + "learning_rate": 2.461103106835811e-06, + "loss": 0.0584, + "step": 8064 + }, + { + "epoch": 0.68, + "grad_norm": 0.21884424109249867, + "learning_rate": 2.4599276121052724e-06, + "loss": 0.0496, + "step": 8065 + }, + { + "epoch": 0.68, + "grad_norm": 0.21223302770917157, + "learning_rate": 2.4587523065741215e-06, + "loss": 0.0422, + "step": 8066 + }, + { + "epoch": 0.68, + "grad_norm": 0.3558587317364715, + "learning_rate": 2.457577190329903e-06, + "loss": 0.0872, + "step": 8067 + }, + { + "epoch": 0.68, + "grad_norm": 0.35948690772773934, + "learning_rate": 2.456402263460144e-06, + "loss": 0.1109, + "step": 8068 + }, + { + "epoch": 0.68, + "grad_norm": 0.5142850160991096, + "learning_rate": 2.4552275260523643e-06, + "loss": 0.0849, + "step": 8069 + }, + { + "epoch": 0.68, + "grad_norm": 0.44822317913201726, + "learning_rate": 2.4540529781940627e-06, + "loss": 0.105, + "step": 8070 + }, + { + "epoch": 0.68, + "grad_norm": 0.3587540614966444, + "learning_rate": 2.452878619972726e-06, + "loss": 0.0767, + "step": 8071 + }, + { + "epoch": 0.68, + "grad_norm": 0.31999772035977314, + "learning_rate": 2.4517044514758283e-06, + "loss": 0.0697, + "step": 8072 + }, + { + "epoch": 0.68, + "grad_norm": 0.4832497834881513, + "learning_rate": 2.4505304727908256e-06, + "loss": 0.0695, + "step": 8073 + }, + { + "epoch": 0.68, + "grad_norm": 0.3605789406964529, + "learning_rate": 2.4493566840051675e-06, + "loss": 0.0961, + "step": 8074 + }, + { + "epoch": 0.68, + "grad_norm": 0.35935605366543644, + "learning_rate": 2.4481830852062822e-06, + "loss": 0.0855, + "step": 8075 + }, + { + "epoch": 0.68, + "grad_norm": 0.20932573801051588, + "learning_rate": 2.4470096764815835e-06, + "loss": 0.0447, + "step": 8076 + }, + { + "epoch": 0.68, + "grad_norm": 0.3451073924776638, + "learning_rate": 2.445836457918479e-06, + "loss": 0.0442, + "step": 8077 + }, + { + "epoch": 0.68, + "grad_norm": 0.21688131464164123, + "learning_rate": 2.444663429604354e-06, + "loss": 0.0346, + "step": 8078 + }, + { + "epoch": 0.68, + "grad_norm": 0.4958023049880385, + "learning_rate": 2.4434905916265827e-06, + "loss": 0.12, + "step": 8079 + }, + { + "epoch": 0.68, + "grad_norm": 0.32214520519332135, + "learning_rate": 2.442317944072523e-06, + "loss": 0.0668, + "step": 8080 + }, + { + "epoch": 0.68, + "grad_norm": 0.348837401589321, + "learning_rate": 2.4411454870295243e-06, + "loss": 0.0801, + "step": 8081 + }, + { + "epoch": 0.68, + "grad_norm": 0.31166341805781594, + "learning_rate": 2.4399732205849163e-06, + "loss": 0.0828, + "step": 8082 + }, + { + "epoch": 0.68, + "grad_norm": 0.4607716011522742, + "learning_rate": 2.438801144826014e-06, + "loss": 0.1056, + "step": 8083 + }, + { + "epoch": 0.68, + "grad_norm": 0.23156766433261153, + "learning_rate": 2.4376292598401247e-06, + "loss": 0.0567, + "step": 8084 + }, + { + "epoch": 0.68, + "grad_norm": 0.2745516870946822, + "learning_rate": 2.4364575657145346e-06, + "loss": 0.0481, + "step": 8085 + }, + { + "epoch": 0.68, + "grad_norm": 0.6158662316441955, + "learning_rate": 2.435286062536519e-06, + "loss": 0.0701, + "step": 8086 + }, + { + "epoch": 0.68, + "grad_norm": 0.4416976260609938, + "learning_rate": 2.434114750393336e-06, + "loss": 0.1559, + "step": 8087 + }, + { + "epoch": 0.68, + "grad_norm": 0.2643541175123459, + "learning_rate": 2.4329436293722356e-06, + "loss": 0.0506, + "step": 8088 + }, + { + "epoch": 0.68, + "grad_norm": 0.38772184468999604, + "learning_rate": 2.4317726995604486e-06, + "loss": 0.0938, + "step": 8089 + }, + { + "epoch": 0.68, + "grad_norm": 0.5929529771801245, + "learning_rate": 2.430601961045189e-06, + "loss": 0.088, + "step": 8090 + }, + { + "epoch": 0.68, + "grad_norm": 0.44999434707681407, + "learning_rate": 2.429431413913666e-06, + "loss": 0.0892, + "step": 8091 + }, + { + "epoch": 0.68, + "grad_norm": 0.6146399970921894, + "learning_rate": 2.428261058253065e-06, + "loss": 0.1535, + "step": 8092 + }, + { + "epoch": 0.68, + "grad_norm": 0.27788218486761906, + "learning_rate": 2.427090894150561e-06, + "loss": 0.0565, + "step": 8093 + }, + { + "epoch": 0.68, + "grad_norm": 0.32346642998620456, + "learning_rate": 2.4259209216933167e-06, + "loss": 0.0692, + "step": 8094 + }, + { + "epoch": 0.68, + "grad_norm": 0.3398073967232939, + "learning_rate": 2.424751140968478e-06, + "loss": 0.1119, + "step": 8095 + }, + { + "epoch": 0.68, + "grad_norm": 0.3529240686077884, + "learning_rate": 2.4235815520631755e-06, + "loss": 0.0938, + "step": 8096 + }, + { + "epoch": 0.68, + "grad_norm": 0.2844105468958982, + "learning_rate": 2.422412155064526e-06, + "loss": 0.0577, + "step": 8097 + }, + { + "epoch": 0.68, + "grad_norm": 0.2914077634383241, + "learning_rate": 2.421242950059637e-06, + "loss": 0.0836, + "step": 8098 + }, + { + "epoch": 0.68, + "grad_norm": 0.42881957017671174, + "learning_rate": 2.4200739371355953e-06, + "loss": 0.0733, + "step": 8099 + }, + { + "epoch": 0.68, + "grad_norm": 0.293584844174575, + "learning_rate": 2.4189051163794735e-06, + "loss": 0.0625, + "step": 8100 + }, + { + "epoch": 0.68, + "grad_norm": 0.37657029514672713, + "learning_rate": 2.4177364878783372e-06, + "loss": 0.0879, + "step": 8101 + }, + { + "epoch": 0.68, + "grad_norm": 0.3552949377056055, + "learning_rate": 2.4165680517192298e-06, + "loss": 0.0853, + "step": 8102 + }, + { + "epoch": 0.68, + "grad_norm": 0.2790554288012035, + "learning_rate": 2.4153998079891837e-06, + "loss": 0.0842, + "step": 8103 + }, + { + "epoch": 0.68, + "grad_norm": 0.21347609155940397, + "learning_rate": 2.4142317567752144e-06, + "loss": 0.0528, + "step": 8104 + }, + { + "epoch": 0.68, + "grad_norm": 0.3241123585988747, + "learning_rate": 2.4130638981643286e-06, + "loss": 0.108, + "step": 8105 + }, + { + "epoch": 0.68, + "grad_norm": 0.4993634105141833, + "learning_rate": 2.411896232243514e-06, + "loss": 0.115, + "step": 8106 + }, + { + "epoch": 0.68, + "grad_norm": 0.2783575675623533, + "learning_rate": 2.410728759099743e-06, + "loss": 0.0698, + "step": 8107 + }, + { + "epoch": 0.68, + "grad_norm": 0.3468728923999155, + "learning_rate": 2.4095614788199796e-06, + "loss": 0.0484, + "step": 8108 + }, + { + "epoch": 0.68, + "grad_norm": 0.40507023077314613, + "learning_rate": 2.408394391491167e-06, + "loss": 0.1349, + "step": 8109 + }, + { + "epoch": 0.68, + "grad_norm": 0.43726005770570237, + "learning_rate": 2.407227497200236e-06, + "loss": 0.0954, + "step": 8110 + }, + { + "epoch": 0.68, + "grad_norm": 0.5280341072157042, + "learning_rate": 2.406060796034107e-06, + "loss": 0.1425, + "step": 8111 + }, + { + "epoch": 0.68, + "grad_norm": 0.28028143348750906, + "learning_rate": 2.4048942880796805e-06, + "loss": 0.0514, + "step": 8112 + }, + { + "epoch": 0.68, + "grad_norm": 0.2928159603820459, + "learning_rate": 2.403727973423845e-06, + "loss": 0.0732, + "step": 8113 + }, + { + "epoch": 0.68, + "grad_norm": 0.4804918985117611, + "learning_rate": 2.402561852153473e-06, + "loss": 0.0995, + "step": 8114 + }, + { + "epoch": 0.68, + "grad_norm": 0.3191786734271622, + "learning_rate": 2.401395924355427e-06, + "loss": 0.0401, + "step": 8115 + }, + { + "epoch": 0.68, + "grad_norm": 0.28711237609491935, + "learning_rate": 2.4002301901165504e-06, + "loss": 0.0733, + "step": 8116 + }, + { + "epoch": 0.68, + "grad_norm": 0.23034313817422605, + "learning_rate": 2.3990646495236726e-06, + "loss": 0.0502, + "step": 8117 + }, + { + "epoch": 0.68, + "grad_norm": 0.3516996422256878, + "learning_rate": 2.3978993026636128e-06, + "loss": 0.0765, + "step": 8118 + }, + { + "epoch": 0.68, + "grad_norm": 0.18382498547990275, + "learning_rate": 2.396734149623171e-06, + "loss": 0.0125, + "step": 8119 + }, + { + "epoch": 0.68, + "grad_norm": 0.41475944546263616, + "learning_rate": 2.3955691904891355e-06, + "loss": 0.0908, + "step": 8120 + }, + { + "epoch": 0.68, + "grad_norm": 0.28537379126679735, + "learning_rate": 2.3944044253482764e-06, + "loss": 0.0647, + "step": 8121 + }, + { + "epoch": 0.68, + "grad_norm": 0.43865571177300045, + "learning_rate": 2.3932398542873565e-06, + "loss": 0.0733, + "step": 8122 + }, + { + "epoch": 0.68, + "grad_norm": 0.2786524143354487, + "learning_rate": 2.3920754773931175e-06, + "loss": 0.0646, + "step": 8123 + }, + { + "epoch": 0.68, + "grad_norm": 0.7500231216255687, + "learning_rate": 2.390911294752287e-06, + "loss": 0.1548, + "step": 8124 + }, + { + "epoch": 0.68, + "grad_norm": 0.29221400681552423, + "learning_rate": 2.3897473064515846e-06, + "loss": 0.0664, + "step": 8125 + }, + { + "epoch": 0.68, + "grad_norm": 0.4032963964696591, + "learning_rate": 2.388583512577709e-06, + "loss": 0.0895, + "step": 8126 + }, + { + "epoch": 0.68, + "grad_norm": 0.2234733384629255, + "learning_rate": 2.387419913217346e-06, + "loss": 0.0545, + "step": 8127 + }, + { + "epoch": 0.68, + "grad_norm": 0.39569949841370666, + "learning_rate": 2.386256508457165e-06, + "loss": 0.0795, + "step": 8128 + }, + { + "epoch": 0.68, + "grad_norm": 0.26995017957746464, + "learning_rate": 2.3850932983838278e-06, + "loss": 0.0714, + "step": 8129 + }, + { + "epoch": 0.69, + "grad_norm": 0.209080128593074, + "learning_rate": 2.383930283083974e-06, + "loss": 0.034, + "step": 8130 + }, + { + "epoch": 0.69, + "grad_norm": 0.3080747064795015, + "learning_rate": 2.3827674626442327e-06, + "loss": 0.0812, + "step": 8131 + }, + { + "epoch": 0.69, + "grad_norm": 0.2733800464591378, + "learning_rate": 2.3816048371512156e-06, + "loss": 0.0715, + "step": 8132 + }, + { + "epoch": 0.69, + "grad_norm": 0.1711715181922208, + "learning_rate": 2.380442406691525e-06, + "loss": 0.0434, + "step": 8133 + }, + { + "epoch": 0.69, + "grad_norm": 0.4132237388466646, + "learning_rate": 2.3792801713517444e-06, + "loss": 0.1072, + "step": 8134 + }, + { + "epoch": 0.69, + "grad_norm": 0.3811966028564031, + "learning_rate": 2.378118131218443e-06, + "loss": 0.077, + "step": 8135 + }, + { + "epoch": 0.69, + "grad_norm": 0.2529576829270131, + "learning_rate": 2.376956286378176e-06, + "loss": 0.0868, + "step": 8136 + }, + { + "epoch": 0.69, + "grad_norm": 0.3326983378707, + "learning_rate": 2.375794636917486e-06, + "loss": 0.0582, + "step": 8137 + }, + { + "epoch": 0.69, + "grad_norm": 0.38652833085651345, + "learning_rate": 2.3746331829228987e-06, + "loss": 0.0826, + "step": 8138 + }, + { + "epoch": 0.69, + "grad_norm": 0.3444491147776681, + "learning_rate": 2.3734719244809263e-06, + "loss": 0.0907, + "step": 8139 + }, + { + "epoch": 0.69, + "grad_norm": 0.3901812724388572, + "learning_rate": 2.3723108616780647e-06, + "loss": 0.1042, + "step": 8140 + }, + { + "epoch": 0.69, + "grad_norm": 0.42120335959737143, + "learning_rate": 2.3711499946007966e-06, + "loss": 0.0822, + "step": 8141 + }, + { + "epoch": 0.69, + "grad_norm": 0.23353821483913406, + "learning_rate": 2.3699893233355924e-06, + "loss": 0.0623, + "step": 8142 + }, + { + "epoch": 0.69, + "grad_norm": 0.1363099328467121, + "learning_rate": 2.3688288479689047e-06, + "loss": 0.0381, + "step": 8143 + }, + { + "epoch": 0.69, + "grad_norm": 0.3145043414305087, + "learning_rate": 2.36766856858717e-06, + "loss": 0.0657, + "step": 8144 + }, + { + "epoch": 0.69, + "grad_norm": 0.3764845375555749, + "learning_rate": 2.3665084852768166e-06, + "loss": 0.0674, + "step": 8145 + }, + { + "epoch": 0.69, + "grad_norm": 0.22491304031803827, + "learning_rate": 2.365348598124253e-06, + "loss": 0.041, + "step": 8146 + }, + { + "epoch": 0.69, + "grad_norm": 0.2754475587643003, + "learning_rate": 2.364188907215873e-06, + "loss": 0.081, + "step": 8147 + }, + { + "epoch": 0.69, + "grad_norm": 0.3851073160924233, + "learning_rate": 2.3630294126380564e-06, + "loss": 0.0688, + "step": 8148 + }, + { + "epoch": 0.69, + "grad_norm": 0.2812221114577284, + "learning_rate": 2.3618701144771726e-06, + "loss": 0.0455, + "step": 8149 + }, + { + "epoch": 0.69, + "grad_norm": 0.29891103456227003, + "learning_rate": 2.360711012819571e-06, + "loss": 0.0628, + "step": 8150 + }, + { + "epoch": 0.69, + "grad_norm": 0.2664787937025612, + "learning_rate": 2.359552107751586e-06, + "loss": 0.0832, + "step": 8151 + }, + { + "epoch": 0.69, + "grad_norm": 0.3137430470310707, + "learning_rate": 2.3583933993595435e-06, + "loss": 0.107, + "step": 8152 + }, + { + "epoch": 0.69, + "grad_norm": 0.38031404549337133, + "learning_rate": 2.3572348877297495e-06, + "loss": 0.089, + "step": 8153 + }, + { + "epoch": 0.69, + "grad_norm": 0.25154042903179935, + "learning_rate": 2.3560765729484964e-06, + "loss": 0.0808, + "step": 8154 + }, + { + "epoch": 0.69, + "grad_norm": 0.36703601728832985, + "learning_rate": 2.3549184551020597e-06, + "loss": 0.1247, + "step": 8155 + }, + { + "epoch": 0.69, + "grad_norm": 0.28684099655841416, + "learning_rate": 2.353760534276708e-06, + "loss": 0.0537, + "step": 8156 + }, + { + "epoch": 0.69, + "grad_norm": 0.25517849069716125, + "learning_rate": 2.3526028105586864e-06, + "loss": 0.074, + "step": 8157 + }, + { + "epoch": 0.69, + "grad_norm": 0.2974549841593156, + "learning_rate": 2.351445284034228e-06, + "loss": 0.1157, + "step": 8158 + }, + { + "epoch": 0.69, + "grad_norm": 0.37723826996287274, + "learning_rate": 2.3502879547895557e-06, + "loss": 0.0649, + "step": 8159 + }, + { + "epoch": 0.69, + "grad_norm": 0.15869081585834996, + "learning_rate": 2.3491308229108724e-06, + "loss": 0.0463, + "step": 8160 + }, + { + "epoch": 0.69, + "grad_norm": 0.21339357734202838, + "learning_rate": 2.347973888484366e-06, + "loss": 0.0624, + "step": 8161 + }, + { + "epoch": 0.69, + "grad_norm": 0.3891342095865076, + "learning_rate": 2.3468171515962153e-06, + "loss": 0.0971, + "step": 8162 + }, + { + "epoch": 0.69, + "grad_norm": 0.4522276696293356, + "learning_rate": 2.345660612332579e-06, + "loss": 0.0881, + "step": 8163 + }, + { + "epoch": 0.69, + "grad_norm": 0.19729122357824827, + "learning_rate": 2.344504270779603e-06, + "loss": 0.0533, + "step": 8164 + }, + { + "epoch": 0.69, + "grad_norm": 0.34484931590870704, + "learning_rate": 2.3433481270234165e-06, + "loss": 0.1026, + "step": 8165 + }, + { + "epoch": 0.69, + "grad_norm": 0.5298712150064175, + "learning_rate": 2.3421921811501397e-06, + "loss": 0.0987, + "step": 8166 + }, + { + "epoch": 0.69, + "grad_norm": 0.24860946341035167, + "learning_rate": 2.3410364332458717e-06, + "loss": 0.0674, + "step": 8167 + }, + { + "epoch": 0.69, + "grad_norm": 0.3452771313306856, + "learning_rate": 2.3398808833966986e-06, + "loss": 0.0551, + "step": 8168 + }, + { + "epoch": 0.69, + "grad_norm": 0.2213633156578429, + "learning_rate": 2.3387255316886947e-06, + "loss": 0.0482, + "step": 8169 + }, + { + "epoch": 0.69, + "grad_norm": 0.41450239364038427, + "learning_rate": 2.3375703782079167e-06, + "loss": 0.1026, + "step": 8170 + }, + { + "epoch": 0.69, + "grad_norm": 0.31070746665663973, + "learning_rate": 2.3364154230404068e-06, + "loss": 0.0967, + "step": 8171 + }, + { + "epoch": 0.69, + "grad_norm": 0.3367805468857731, + "learning_rate": 2.3352606662721904e-06, + "loss": 0.0829, + "step": 8172 + }, + { + "epoch": 0.69, + "grad_norm": 0.32419958235428303, + "learning_rate": 2.334106107989285e-06, + "loss": 0.0686, + "step": 8173 + }, + { + "epoch": 0.69, + "grad_norm": 0.25254632981763864, + "learning_rate": 2.332951748277687e-06, + "loss": 0.0574, + "step": 8174 + }, + { + "epoch": 0.69, + "grad_norm": 0.31943237342192005, + "learning_rate": 2.331797587223377e-06, + "loss": 0.0737, + "step": 8175 + }, + { + "epoch": 0.69, + "grad_norm": 0.2887930862160014, + "learning_rate": 2.330643624912328e-06, + "loss": 0.0761, + "step": 8176 + }, + { + "epoch": 0.69, + "grad_norm": 0.2952333108657727, + "learning_rate": 2.329489861430493e-06, + "loss": 0.0601, + "step": 8177 + }, + { + "epoch": 0.69, + "grad_norm": 0.32231977767869785, + "learning_rate": 2.328336296863809e-06, + "loss": 0.0951, + "step": 8178 + }, + { + "epoch": 0.69, + "grad_norm": 0.3837727011817731, + "learning_rate": 2.327182931298199e-06, + "loss": 0.0664, + "step": 8179 + }, + { + "epoch": 0.69, + "grad_norm": 0.3774429133756885, + "learning_rate": 2.3260297648195775e-06, + "loss": 0.107, + "step": 8180 + }, + { + "epoch": 0.69, + "grad_norm": 0.3480880680120187, + "learning_rate": 2.324876797513836e-06, + "loss": 0.0643, + "step": 8181 + }, + { + "epoch": 0.69, + "grad_norm": 0.3802512600919708, + "learning_rate": 2.3237240294668516e-06, + "loss": 0.1233, + "step": 8182 + }, + { + "epoch": 0.69, + "grad_norm": 0.4125467771660437, + "learning_rate": 2.3225714607644945e-06, + "loss": 0.1076, + "step": 8183 + }, + { + "epoch": 0.69, + "grad_norm": 0.3595899997810304, + "learning_rate": 2.321419091492612e-06, + "loss": 0.0536, + "step": 8184 + }, + { + "epoch": 0.69, + "grad_norm": 0.28431198178735206, + "learning_rate": 2.320266921737038e-06, + "loss": 0.0922, + "step": 8185 + }, + { + "epoch": 0.69, + "grad_norm": 0.25850827855427566, + "learning_rate": 2.319114951583595e-06, + "loss": 0.0355, + "step": 8186 + }, + { + "epoch": 0.69, + "grad_norm": 0.24705841508809565, + "learning_rate": 2.3179631811180893e-06, + "loss": 0.0625, + "step": 8187 + }, + { + "epoch": 0.69, + "grad_norm": 0.35221132676035877, + "learning_rate": 2.3168116104263087e-06, + "loss": 0.0935, + "step": 8188 + }, + { + "epoch": 0.69, + "grad_norm": 0.16895014936836256, + "learning_rate": 2.3156602395940284e-06, + "loss": 0.0437, + "step": 8189 + }, + { + "epoch": 0.69, + "grad_norm": 0.46315374354200667, + "learning_rate": 2.314509068707013e-06, + "loss": 0.0884, + "step": 8190 + }, + { + "epoch": 0.69, + "grad_norm": 0.4275299754266828, + "learning_rate": 2.313358097851007e-06, + "loss": 0.1236, + "step": 8191 + }, + { + "epoch": 0.69, + "grad_norm": 0.2254195089672145, + "learning_rate": 2.3122073271117377e-06, + "loss": 0.0725, + "step": 8192 + }, + { + "epoch": 0.69, + "grad_norm": 0.4673925712183842, + "learning_rate": 2.3110567565749266e-06, + "loss": 0.0939, + "step": 8193 + }, + { + "epoch": 0.69, + "grad_norm": 0.22734738873313137, + "learning_rate": 2.309906386326272e-06, + "loss": 0.076, + "step": 8194 + }, + { + "epoch": 0.69, + "grad_norm": 0.31884050634619693, + "learning_rate": 2.3087562164514616e-06, + "loss": 0.0451, + "step": 8195 + }, + { + "epoch": 0.69, + "grad_norm": 0.5955037401392582, + "learning_rate": 2.3076062470361633e-06, + "loss": 0.1104, + "step": 8196 + }, + { + "epoch": 0.69, + "grad_norm": 0.333876183468035, + "learning_rate": 2.306456478166038e-06, + "loss": 0.0599, + "step": 8197 + }, + { + "epoch": 0.69, + "grad_norm": 0.29679477332703175, + "learning_rate": 2.3053069099267258e-06, + "loss": 0.0714, + "step": 8198 + }, + { + "epoch": 0.69, + "grad_norm": 0.24847932574114995, + "learning_rate": 2.304157542403852e-06, + "loss": 0.0659, + "step": 8199 + }, + { + "epoch": 0.69, + "grad_norm": 0.3730900376316934, + "learning_rate": 2.3030083756830273e-06, + "loss": 0.1012, + "step": 8200 + }, + { + "epoch": 0.69, + "grad_norm": 0.5050976470812316, + "learning_rate": 2.301859409849852e-06, + "loss": 0.1144, + "step": 8201 + }, + { + "epoch": 0.69, + "grad_norm": 0.36682215454047484, + "learning_rate": 2.3007106449899057e-06, + "loss": 0.0865, + "step": 8202 + }, + { + "epoch": 0.69, + "grad_norm": 0.2465918884401312, + "learning_rate": 2.299562081188755e-06, + "loss": 0.0711, + "step": 8203 + }, + { + "epoch": 0.69, + "grad_norm": 0.49510917590764936, + "learning_rate": 2.2984137185319493e-06, + "loss": 0.1097, + "step": 8204 + }, + { + "epoch": 0.69, + "grad_norm": 0.21291243920002315, + "learning_rate": 2.29726555710503e-06, + "loss": 0.0422, + "step": 8205 + }, + { + "epoch": 0.69, + "grad_norm": 0.22447561988243364, + "learning_rate": 2.296117596993517e-06, + "loss": 0.0639, + "step": 8206 + }, + { + "epoch": 0.69, + "grad_norm": 0.6643570533675038, + "learning_rate": 2.294969838282916e-06, + "loss": 0.0904, + "step": 8207 + }, + { + "epoch": 0.69, + "grad_norm": 0.2676314841623326, + "learning_rate": 2.2938222810587203e-06, + "loss": 0.0847, + "step": 8208 + }, + { + "epoch": 0.69, + "grad_norm": 0.19901550644355864, + "learning_rate": 2.292674925406403e-06, + "loss": 0.0711, + "step": 8209 + }, + { + "epoch": 0.69, + "grad_norm": 0.42956110768676475, + "learning_rate": 2.291527771411431e-06, + "loss": 0.136, + "step": 8210 + }, + { + "epoch": 0.69, + "grad_norm": 0.3666575379904941, + "learning_rate": 2.2903808191592482e-06, + "loss": 0.067, + "step": 8211 + }, + { + "epoch": 0.69, + "grad_norm": 0.8460809432169117, + "learning_rate": 2.2892340687352864e-06, + "loss": 0.1402, + "step": 8212 + }, + { + "epoch": 0.69, + "grad_norm": 0.4397310089339902, + "learning_rate": 2.2880875202249616e-06, + "loss": 0.101, + "step": 8213 + }, + { + "epoch": 0.69, + "grad_norm": 0.4724799158550494, + "learning_rate": 2.2869411737136776e-06, + "loss": 0.1226, + "step": 8214 + }, + { + "epoch": 0.69, + "grad_norm": 0.48398515748068516, + "learning_rate": 2.2857950292868196e-06, + "loss": 0.1012, + "step": 8215 + }, + { + "epoch": 0.69, + "grad_norm": 0.31176729534184416, + "learning_rate": 2.284649087029758e-06, + "loss": 0.1031, + "step": 8216 + }, + { + "epoch": 0.69, + "grad_norm": 0.2725401129390788, + "learning_rate": 2.2835033470278515e-06, + "loss": 0.0685, + "step": 8217 + }, + { + "epoch": 0.69, + "grad_norm": 0.4255972552172791, + "learning_rate": 2.282357809366441e-06, + "loss": 0.0896, + "step": 8218 + }, + { + "epoch": 0.69, + "grad_norm": 0.305404304316731, + "learning_rate": 2.2812124741308505e-06, + "loss": 0.0691, + "step": 8219 + }, + { + "epoch": 0.69, + "grad_norm": 0.1918144405130339, + "learning_rate": 2.280067341406395e-06, + "loss": 0.0355, + "step": 8220 + }, + { + "epoch": 0.69, + "grad_norm": 0.22301497074716992, + "learning_rate": 2.278922411278368e-06, + "loss": 0.0489, + "step": 8221 + }, + { + "epoch": 0.69, + "grad_norm": 0.2761686383822466, + "learning_rate": 2.277777683832052e-06, + "loss": 0.0936, + "step": 8222 + }, + { + "epoch": 0.69, + "grad_norm": 0.23012506201191857, + "learning_rate": 2.2766331591527103e-06, + "loss": 0.0692, + "step": 8223 + }, + { + "epoch": 0.69, + "grad_norm": 0.37612805610696265, + "learning_rate": 2.2754888373255975e-06, + "loss": 0.0805, + "step": 8224 + }, + { + "epoch": 0.69, + "grad_norm": 0.21667128895859342, + "learning_rate": 2.2743447184359473e-06, + "loss": 0.0637, + "step": 8225 + }, + { + "epoch": 0.69, + "grad_norm": 0.2583340176953693, + "learning_rate": 2.273200802568979e-06, + "loss": 0.0795, + "step": 8226 + }, + { + "epoch": 0.69, + "grad_norm": 0.2348846207661785, + "learning_rate": 2.272057089809901e-06, + "loss": 0.0469, + "step": 8227 + }, + { + "epoch": 0.69, + "grad_norm": 0.24174578992106766, + "learning_rate": 2.270913580243903e-06, + "loss": 0.0517, + "step": 8228 + }, + { + "epoch": 0.69, + "grad_norm": 0.3271962405798819, + "learning_rate": 2.2697702739561595e-06, + "loss": 0.0711, + "step": 8229 + }, + { + "epoch": 0.69, + "grad_norm": 0.18671840928034406, + "learning_rate": 2.2686271710318287e-06, + "loss": 0.0576, + "step": 8230 + }, + { + "epoch": 0.69, + "grad_norm": 0.18693910168661979, + "learning_rate": 2.2674842715560595e-06, + "loss": 0.0635, + "step": 8231 + }, + { + "epoch": 0.69, + "grad_norm": 0.4429133842218223, + "learning_rate": 2.2663415756139802e-06, + "loss": 0.0971, + "step": 8232 + }, + { + "epoch": 0.69, + "grad_norm": 0.35442171318348153, + "learning_rate": 2.2651990832907027e-06, + "loss": 0.0911, + "step": 8233 + }, + { + "epoch": 0.69, + "grad_norm": 0.3425757651304039, + "learning_rate": 2.264056794671331e-06, + "loss": 0.1229, + "step": 8234 + }, + { + "epoch": 0.69, + "grad_norm": 0.2668660403609579, + "learning_rate": 2.262914709840947e-06, + "loss": 0.0625, + "step": 8235 + }, + { + "epoch": 0.69, + "grad_norm": 0.3564272034094718, + "learning_rate": 2.2617728288846185e-06, + "loss": 0.1075, + "step": 8236 + }, + { + "epoch": 0.69, + "grad_norm": 0.3213710803396447, + "learning_rate": 2.260631151887403e-06, + "loss": 0.0561, + "step": 8237 + }, + { + "epoch": 0.69, + "grad_norm": 0.27268749549507937, + "learning_rate": 2.2594896789343372e-06, + "loss": 0.0972, + "step": 8238 + }, + { + "epoch": 0.69, + "grad_norm": 0.42044965318155925, + "learning_rate": 2.258348410110445e-06, + "loss": 0.1118, + "step": 8239 + }, + { + "epoch": 0.69, + "grad_norm": 0.30079030768330145, + "learning_rate": 2.2572073455007325e-06, + "loss": 0.0802, + "step": 8240 + }, + { + "epoch": 0.69, + "grad_norm": 0.34496689551959236, + "learning_rate": 2.256066485190197e-06, + "loss": 0.0917, + "step": 8241 + }, + { + "epoch": 0.69, + "grad_norm": 0.3383911450788624, + "learning_rate": 2.2549258292638137e-06, + "loss": 0.0726, + "step": 8242 + }, + { + "epoch": 0.69, + "grad_norm": 0.23090556327194298, + "learning_rate": 2.2537853778065445e-06, + "loss": 0.0648, + "step": 8243 + }, + { + "epoch": 0.69, + "grad_norm": 0.3558244217609913, + "learning_rate": 2.2526451309033397e-06, + "loss": 0.1193, + "step": 8244 + }, + { + "epoch": 0.69, + "grad_norm": 0.2826079996752124, + "learning_rate": 2.2515050886391306e-06, + "loss": 0.086, + "step": 8245 + }, + { + "epoch": 0.69, + "grad_norm": 0.3849818201934799, + "learning_rate": 2.2503652510988326e-06, + "loss": 0.0595, + "step": 8246 + }, + { + "epoch": 0.69, + "grad_norm": 0.26609201227851337, + "learning_rate": 2.2492256183673476e-06, + "loss": 0.0876, + "step": 8247 + }, + { + "epoch": 0.69, + "grad_norm": 0.3141802976901484, + "learning_rate": 2.248086190529565e-06, + "loss": 0.0884, + "step": 8248 + }, + { + "epoch": 0.7, + "grad_norm": 0.4229164929567971, + "learning_rate": 2.2469469676703536e-06, + "loss": 0.1248, + "step": 8249 + }, + { + "epoch": 0.7, + "grad_norm": 0.4051876021904241, + "learning_rate": 2.2458079498745675e-06, + "loss": 0.125, + "step": 8250 + }, + { + "epoch": 0.7, + "grad_norm": 0.25001926485657355, + "learning_rate": 2.2446691372270523e-06, + "loss": 0.0612, + "step": 8251 + }, + { + "epoch": 0.7, + "grad_norm": 0.50843339555988, + "learning_rate": 2.24353052981263e-06, + "loss": 0.0772, + "step": 8252 + }, + { + "epoch": 0.7, + "grad_norm": 0.6155887087397431, + "learning_rate": 2.2423921277161103e-06, + "loss": 0.042, + "step": 8253 + }, + { + "epoch": 0.7, + "grad_norm": 0.24807387258629288, + "learning_rate": 2.241253931022291e-06, + "loss": 0.0678, + "step": 8254 + }, + { + "epoch": 0.7, + "grad_norm": 0.19562751459739416, + "learning_rate": 2.2401159398159492e-06, + "loss": 0.041, + "step": 8255 + }, + { + "epoch": 0.7, + "grad_norm": 0.22540173162042015, + "learning_rate": 2.23897815418185e-06, + "loss": 0.0435, + "step": 8256 + }, + { + "epoch": 0.7, + "grad_norm": 0.31509657629465876, + "learning_rate": 2.2378405742047406e-06, + "loss": 0.0717, + "step": 8257 + }, + { + "epoch": 0.7, + "grad_norm": 0.2968119278186583, + "learning_rate": 2.236703199969357e-06, + "loss": 0.0834, + "step": 8258 + }, + { + "epoch": 0.7, + "grad_norm": 0.6292634174285717, + "learning_rate": 2.2355660315604173e-06, + "loss": 0.1524, + "step": 8259 + }, + { + "epoch": 0.7, + "grad_norm": 0.3853349931526165, + "learning_rate": 2.234429069062621e-06, + "loss": 0.0756, + "step": 8260 + }, + { + "epoch": 0.7, + "grad_norm": 0.3257037756920801, + "learning_rate": 2.2332923125606608e-06, + "loss": 0.076, + "step": 8261 + }, + { + "epoch": 0.7, + "grad_norm": 0.5049199829859229, + "learning_rate": 2.232155762139206e-06, + "loss": 0.1096, + "step": 8262 + }, + { + "epoch": 0.7, + "grad_norm": 0.5884114196935599, + "learning_rate": 2.231019417882914e-06, + "loss": 0.1413, + "step": 8263 + }, + { + "epoch": 0.7, + "grad_norm": 0.4249956825727513, + "learning_rate": 2.2298832798764246e-06, + "loss": 0.0999, + "step": 8264 + }, + { + "epoch": 0.7, + "grad_norm": 0.4292746325383951, + "learning_rate": 2.2287473482043676e-06, + "loss": 0.1262, + "step": 8265 + }, + { + "epoch": 0.7, + "grad_norm": 0.3952749160588101, + "learning_rate": 2.2276116229513515e-06, + "loss": 0.1009, + "step": 8266 + }, + { + "epoch": 0.7, + "grad_norm": 0.2184450788959631, + "learning_rate": 2.2264761042019724e-06, + "loss": 0.0618, + "step": 8267 + }, + { + "epoch": 0.7, + "grad_norm": 0.48053085464086254, + "learning_rate": 2.2253407920408078e-06, + "loss": 0.0836, + "step": 8268 + }, + { + "epoch": 0.7, + "grad_norm": 0.28941080855400425, + "learning_rate": 2.2242056865524263e-06, + "loss": 0.1008, + "step": 8269 + }, + { + "epoch": 0.7, + "grad_norm": 0.22125260822283646, + "learning_rate": 2.223070787821376e-06, + "loss": 0.0708, + "step": 8270 + }, + { + "epoch": 0.7, + "grad_norm": 0.3188860276328193, + "learning_rate": 2.22193609593219e-06, + "loss": 0.0738, + "step": 8271 + }, + { + "epoch": 0.7, + "grad_norm": 0.4441931392712277, + "learning_rate": 2.2208016109693853e-06, + "loss": 0.0469, + "step": 8272 + }, + { + "epoch": 0.7, + "grad_norm": 0.2589215671631317, + "learning_rate": 2.219667333017468e-06, + "loss": 0.0823, + "step": 8273 + }, + { + "epoch": 0.7, + "grad_norm": 0.35658879913443536, + "learning_rate": 2.2185332621609246e-06, + "loss": 0.0828, + "step": 8274 + }, + { + "epoch": 0.7, + "grad_norm": 0.2515164986207159, + "learning_rate": 2.2173993984842274e-06, + "loss": 0.0793, + "step": 8275 + }, + { + "epoch": 0.7, + "grad_norm": 0.42418142142400783, + "learning_rate": 2.2162657420718325e-06, + "loss": 0.1063, + "step": 8276 + }, + { + "epoch": 0.7, + "grad_norm": 0.26381239013335395, + "learning_rate": 2.2151322930081797e-06, + "loss": 0.094, + "step": 8277 + }, + { + "epoch": 0.7, + "grad_norm": 0.40830864053053045, + "learning_rate": 2.2139990513776987e-06, + "loss": 0.0539, + "step": 8278 + }, + { + "epoch": 0.7, + "grad_norm": 0.31287292212604134, + "learning_rate": 2.2128660172647985e-06, + "loss": 0.1005, + "step": 8279 + }, + { + "epoch": 0.7, + "grad_norm": 0.28493551969664616, + "learning_rate": 2.211733190753873e-06, + "loss": 0.0838, + "step": 8280 + }, + { + "epoch": 0.7, + "grad_norm": 0.22407053972757265, + "learning_rate": 2.2106005719293007e-06, + "loss": 0.0594, + "step": 8281 + }, + { + "epoch": 0.7, + "grad_norm": 0.21207820772189656, + "learning_rate": 2.209468160875449e-06, + "loss": 0.0487, + "step": 8282 + }, + { + "epoch": 0.7, + "grad_norm": 0.3221982924551854, + "learning_rate": 2.208335957676665e-06, + "loss": 0.1005, + "step": 8283 + }, + { + "epoch": 0.7, + "grad_norm": 0.30321714071028494, + "learning_rate": 2.2072039624172793e-06, + "loss": 0.077, + "step": 8284 + }, + { + "epoch": 0.7, + "grad_norm": 0.3428388330212152, + "learning_rate": 2.206072175181614e-06, + "loss": 0.0887, + "step": 8285 + }, + { + "epoch": 0.7, + "grad_norm": 0.2991361152659501, + "learning_rate": 2.2049405960539684e-06, + "loss": 0.0776, + "step": 8286 + }, + { + "epoch": 0.7, + "grad_norm": 0.2687866617421833, + "learning_rate": 2.2038092251186277e-06, + "loss": 0.0711, + "step": 8287 + }, + { + "epoch": 0.7, + "grad_norm": 0.32898224696366285, + "learning_rate": 2.2026780624598672e-06, + "loss": 0.0729, + "step": 8288 + }, + { + "epoch": 0.7, + "grad_norm": 0.20414464298871673, + "learning_rate": 2.20154710816194e-06, + "loss": 0.0491, + "step": 8289 + }, + { + "epoch": 0.7, + "grad_norm": 0.21836862187615885, + "learning_rate": 2.200416362309087e-06, + "loss": 0.0362, + "step": 8290 + }, + { + "epoch": 0.7, + "grad_norm": 0.19247931252578104, + "learning_rate": 2.1992858249855293e-06, + "loss": 0.0488, + "step": 8291 + }, + { + "epoch": 0.7, + "grad_norm": 0.2245907699571742, + "learning_rate": 2.1981554962754813e-06, + "loss": 0.0412, + "step": 8292 + }, + { + "epoch": 0.7, + "grad_norm": 0.33202523241763193, + "learning_rate": 2.1970253762631334e-06, + "loss": 0.0945, + "step": 8293 + }, + { + "epoch": 0.7, + "grad_norm": 0.29533226013352354, + "learning_rate": 2.1958954650326624e-06, + "loss": 0.0866, + "step": 8294 + }, + { + "epoch": 0.7, + "grad_norm": 0.31270948841039575, + "learning_rate": 2.194765762668234e-06, + "loss": 0.0851, + "step": 8295 + }, + { + "epoch": 0.7, + "grad_norm": 0.38024842750123045, + "learning_rate": 2.1936362692539936e-06, + "loss": 0.052, + "step": 8296 + }, + { + "epoch": 0.7, + "grad_norm": 0.2539527132980576, + "learning_rate": 2.192506984874072e-06, + "loss": 0.0506, + "step": 8297 + }, + { + "epoch": 0.7, + "grad_norm": 0.3790691511096439, + "learning_rate": 2.1913779096125832e-06, + "loss": 0.0841, + "step": 8298 + }, + { + "epoch": 0.7, + "grad_norm": 0.38380322277077483, + "learning_rate": 2.1902490435536304e-06, + "loss": 0.1123, + "step": 8299 + }, + { + "epoch": 0.7, + "grad_norm": 0.3939305018904942, + "learning_rate": 2.1891203867812977e-06, + "loss": 0.1244, + "step": 8300 + }, + { + "epoch": 0.7, + "grad_norm": 0.3917614048056084, + "learning_rate": 2.1879919393796505e-06, + "loss": 0.0886, + "step": 8301 + }, + { + "epoch": 0.7, + "grad_norm": 0.3835769224131753, + "learning_rate": 2.1868637014327476e-06, + "loss": 0.0996, + "step": 8302 + }, + { + "epoch": 0.7, + "grad_norm": 0.22923062086987134, + "learning_rate": 2.185735673024623e-06, + "loss": 0.068, + "step": 8303 + }, + { + "epoch": 0.7, + "grad_norm": 0.2313880447960041, + "learning_rate": 2.1846078542393005e-06, + "loss": 0.0875, + "step": 8304 + }, + { + "epoch": 0.7, + "grad_norm": 0.28166880269457767, + "learning_rate": 2.183480245160784e-06, + "loss": 0.0646, + "step": 8305 + }, + { + "epoch": 0.7, + "grad_norm": 0.23286866918393045, + "learning_rate": 2.1823528458730674e-06, + "loss": 0.0764, + "step": 8306 + }, + { + "epoch": 0.7, + "grad_norm": 0.5873112169650783, + "learning_rate": 2.181225656460126e-06, + "loss": 0.1427, + "step": 8307 + }, + { + "epoch": 0.7, + "grad_norm": 0.15940495085580306, + "learning_rate": 2.1800986770059155e-06, + "loss": 0.0498, + "step": 8308 + }, + { + "epoch": 0.7, + "grad_norm": 0.19668343513785205, + "learning_rate": 2.178971907594385e-06, + "loss": 0.0604, + "step": 8309 + }, + { + "epoch": 0.7, + "grad_norm": 0.22044081269960228, + "learning_rate": 2.1778453483094604e-06, + "loss": 0.0719, + "step": 8310 + }, + { + "epoch": 0.7, + "grad_norm": 0.276561912619221, + "learning_rate": 2.1767189992350523e-06, + "loss": 0.0659, + "step": 8311 + }, + { + "epoch": 0.7, + "grad_norm": 0.40868232965030293, + "learning_rate": 2.175592860455062e-06, + "loss": 0.079, + "step": 8312 + }, + { + "epoch": 0.7, + "grad_norm": 0.21590705529561863, + "learning_rate": 2.174466932053369e-06, + "loss": 0.0688, + "step": 8313 + }, + { + "epoch": 0.7, + "grad_norm": 0.37969881438766684, + "learning_rate": 2.173341214113839e-06, + "loss": 0.0864, + "step": 8314 + }, + { + "epoch": 0.7, + "grad_norm": 0.2611057076256042, + "learning_rate": 2.17221570672032e-06, + "loss": 0.0596, + "step": 8315 + }, + { + "epoch": 0.7, + "grad_norm": 0.3345410625063943, + "learning_rate": 2.1710904099566497e-06, + "loss": 0.0755, + "step": 8316 + }, + { + "epoch": 0.7, + "grad_norm": 0.19900779356904622, + "learning_rate": 2.1699653239066455e-06, + "loss": 0.072, + "step": 8317 + }, + { + "epoch": 0.7, + "grad_norm": 0.2846356919423404, + "learning_rate": 2.168840448654108e-06, + "loss": 0.0754, + "step": 8318 + }, + { + "epoch": 0.7, + "grad_norm": 0.3521408907321636, + "learning_rate": 2.167715784282829e-06, + "loss": 0.0705, + "step": 8319 + }, + { + "epoch": 0.7, + "grad_norm": 0.2877522500827389, + "learning_rate": 2.1665913308765773e-06, + "loss": 0.0762, + "step": 8320 + }, + { + "epoch": 0.7, + "grad_norm": 0.2722918757864807, + "learning_rate": 2.165467088519109e-06, + "loss": 0.0518, + "step": 8321 + }, + { + "epoch": 0.7, + "grad_norm": 0.3190264944240664, + "learning_rate": 2.164343057294163e-06, + "loss": 0.1192, + "step": 8322 + }, + { + "epoch": 0.7, + "grad_norm": 0.4833905171420414, + "learning_rate": 2.1632192372854667e-06, + "loss": 0.1148, + "step": 8323 + }, + { + "epoch": 0.7, + "grad_norm": 0.2637189410013057, + "learning_rate": 2.1620956285767268e-06, + "loss": 0.094, + "step": 8324 + }, + { + "epoch": 0.7, + "grad_norm": 0.2545600755787577, + "learning_rate": 2.160972231251635e-06, + "loss": 0.0362, + "step": 8325 + }, + { + "epoch": 0.7, + "grad_norm": 0.37467484036892373, + "learning_rate": 2.1598490453938716e-06, + "loss": 0.1164, + "step": 8326 + }, + { + "epoch": 0.7, + "grad_norm": 0.23520199292378555, + "learning_rate": 2.158726071087096e-06, + "loss": 0.0546, + "step": 8327 + }, + { + "epoch": 0.7, + "grad_norm": 0.7551659696366472, + "learning_rate": 2.1576033084149534e-06, + "loss": 0.1224, + "step": 8328 + }, + { + "epoch": 0.7, + "grad_norm": 0.4191249899379254, + "learning_rate": 2.1564807574610756e-06, + "loss": 0.1131, + "step": 8329 + }, + { + "epoch": 0.7, + "grad_norm": 0.34532968058970004, + "learning_rate": 2.155358418309076e-06, + "loss": 0.0825, + "step": 8330 + }, + { + "epoch": 0.7, + "grad_norm": 0.4049522010092695, + "learning_rate": 2.1542362910425525e-06, + "loss": 0.1173, + "step": 8331 + }, + { + "epoch": 0.7, + "grad_norm": 0.3738414309737468, + "learning_rate": 2.153114375745086e-06, + "loss": 0.0771, + "step": 8332 + }, + { + "epoch": 0.7, + "grad_norm": 0.3046374813897974, + "learning_rate": 2.151992672500247e-06, + "loss": 0.0973, + "step": 8333 + }, + { + "epoch": 0.7, + "grad_norm": 0.2520584343762753, + "learning_rate": 2.150871181391585e-06, + "loss": 0.0963, + "step": 8334 + }, + { + "epoch": 0.7, + "grad_norm": 0.33102346124299453, + "learning_rate": 2.1497499025026348e-06, + "loss": 0.0789, + "step": 8335 + }, + { + "epoch": 0.7, + "grad_norm": 0.1648212026405659, + "learning_rate": 2.1486288359169133e-06, + "loss": 0.0624, + "step": 8336 + }, + { + "epoch": 0.7, + "grad_norm": 0.36213414600885024, + "learning_rate": 2.147507981717929e-06, + "loss": 0.0911, + "step": 8337 + }, + { + "epoch": 0.7, + "grad_norm": 0.42187564263971966, + "learning_rate": 2.146387339989167e-06, + "loss": 0.1094, + "step": 8338 + }, + { + "epoch": 0.7, + "grad_norm": 0.19234902656797517, + "learning_rate": 2.1452669108140995e-06, + "loss": 0.0628, + "step": 8339 + }, + { + "epoch": 0.7, + "grad_norm": 0.2969239339687833, + "learning_rate": 2.1441466942761807e-06, + "loss": 0.1153, + "step": 8340 + }, + { + "epoch": 0.7, + "grad_norm": 0.25015669339859553, + "learning_rate": 2.1430266904588544e-06, + "loss": 0.057, + "step": 8341 + }, + { + "epoch": 0.7, + "grad_norm": 0.34756851260200783, + "learning_rate": 2.1419068994455435e-06, + "loss": 0.0811, + "step": 8342 + }, + { + "epoch": 0.7, + "grad_norm": 0.2181171457248317, + "learning_rate": 2.1407873213196562e-06, + "loss": 0.0705, + "step": 8343 + }, + { + "epoch": 0.7, + "grad_norm": 0.3405487549680562, + "learning_rate": 2.139667956164585e-06, + "loss": 0.0852, + "step": 8344 + }, + { + "epoch": 0.7, + "grad_norm": 0.36686095237676986, + "learning_rate": 2.1385488040637057e-06, + "loss": 0.1198, + "step": 8345 + }, + { + "epoch": 0.7, + "grad_norm": 0.2813718707575593, + "learning_rate": 2.1374298651003826e-06, + "loss": 0.0704, + "step": 8346 + }, + { + "epoch": 0.7, + "grad_norm": 0.2584350016154863, + "learning_rate": 2.1363111393579587e-06, + "loss": 0.0715, + "step": 8347 + }, + { + "epoch": 0.7, + "grad_norm": 0.23036667172341257, + "learning_rate": 2.135192626919763e-06, + "loss": 0.0751, + "step": 8348 + }, + { + "epoch": 0.7, + "grad_norm": 0.22521560760151998, + "learning_rate": 2.1340743278691077e-06, + "loss": 0.0583, + "step": 8349 + }, + { + "epoch": 0.7, + "grad_norm": 0.5349610050745811, + "learning_rate": 2.1329562422892936e-06, + "loss": 0.1085, + "step": 8350 + }, + { + "epoch": 0.7, + "grad_norm": 0.3183396778376335, + "learning_rate": 2.1318383702636e-06, + "loss": 0.0818, + "step": 8351 + }, + { + "epoch": 0.7, + "grad_norm": 0.3943010127877746, + "learning_rate": 2.1307207118752917e-06, + "loss": 0.0909, + "step": 8352 + }, + { + "epoch": 0.7, + "grad_norm": 0.48039189209374855, + "learning_rate": 2.129603267207621e-06, + "loss": 0.0758, + "step": 8353 + }, + { + "epoch": 0.7, + "grad_norm": 0.3885992812700629, + "learning_rate": 2.12848603634382e-06, + "loss": 0.0684, + "step": 8354 + }, + { + "epoch": 0.7, + "grad_norm": 0.20430083207197544, + "learning_rate": 2.127369019367107e-06, + "loss": 0.053, + "step": 8355 + }, + { + "epoch": 0.7, + "grad_norm": 0.32395637513706593, + "learning_rate": 2.1262522163606825e-06, + "loss": 0.0886, + "step": 8356 + }, + { + "epoch": 0.7, + "grad_norm": 0.2699451447574738, + "learning_rate": 2.125135627407735e-06, + "loss": 0.0744, + "step": 8357 + }, + { + "epoch": 0.7, + "grad_norm": 0.18174907045690855, + "learning_rate": 2.1240192525914334e-06, + "loss": 0.0629, + "step": 8358 + }, + { + "epoch": 0.7, + "grad_norm": 0.29084295515623754, + "learning_rate": 2.1229030919949295e-06, + "loss": 0.0618, + "step": 8359 + }, + { + "epoch": 0.7, + "grad_norm": 0.3164916077501644, + "learning_rate": 2.121787145701366e-06, + "loss": 0.0808, + "step": 8360 + }, + { + "epoch": 0.7, + "grad_norm": 0.22405570020793797, + "learning_rate": 2.1206714137938624e-06, + "loss": 0.0677, + "step": 8361 + }, + { + "epoch": 0.7, + "grad_norm": 0.35295209069707945, + "learning_rate": 2.1195558963555235e-06, + "loss": 0.088, + "step": 8362 + }, + { + "epoch": 0.7, + "grad_norm": 0.3310045314634588, + "learning_rate": 2.118440593469443e-06, + "loss": 0.0708, + "step": 8363 + }, + { + "epoch": 0.7, + "grad_norm": 0.36240565132332553, + "learning_rate": 2.1173255052186934e-06, + "loss": 0.0861, + "step": 8364 + }, + { + "epoch": 0.7, + "grad_norm": 0.3638281357565688, + "learning_rate": 2.1162106316863333e-06, + "loss": 0.108, + "step": 8365 + }, + { + "epoch": 0.7, + "grad_norm": 0.25470380681193217, + "learning_rate": 2.115095972955403e-06, + "loss": 0.0767, + "step": 8366 + }, + { + "epoch": 0.71, + "grad_norm": 0.4858707117540823, + "learning_rate": 2.1139815291089316e-06, + "loss": 0.093, + "step": 8367 + }, + { + "epoch": 0.71, + "grad_norm": 0.19496605427084243, + "learning_rate": 2.112867300229929e-06, + "loss": 0.0385, + "step": 8368 + }, + { + "epoch": 0.71, + "grad_norm": 0.16330483683321675, + "learning_rate": 2.111753286401386e-06, + "loss": 0.0602, + "step": 8369 + }, + { + "epoch": 0.71, + "grad_norm": 0.4254200884016728, + "learning_rate": 2.1106394877062857e-06, + "loss": 0.1127, + "step": 8370 + }, + { + "epoch": 0.71, + "grad_norm": 0.2467093187559839, + "learning_rate": 2.109525904227588e-06, + "loss": 0.0657, + "step": 8371 + }, + { + "epoch": 0.71, + "grad_norm": 0.18741905357433997, + "learning_rate": 2.1084125360482395e-06, + "loss": 0.0438, + "step": 8372 + }, + { + "epoch": 0.71, + "grad_norm": 0.3463653946317927, + "learning_rate": 2.107299383251168e-06, + "loss": 0.081, + "step": 8373 + }, + { + "epoch": 0.71, + "grad_norm": 0.2678088691666406, + "learning_rate": 2.1061864459192918e-06, + "loss": 0.0781, + "step": 8374 + }, + { + "epoch": 0.71, + "grad_norm": 0.3802184092433274, + "learning_rate": 2.105073724135506e-06, + "loss": 0.086, + "step": 8375 + }, + { + "epoch": 0.71, + "grad_norm": 0.47757995524382674, + "learning_rate": 2.1039612179826924e-06, + "loss": 0.1276, + "step": 8376 + }, + { + "epoch": 0.71, + "grad_norm": 0.34347790441977466, + "learning_rate": 2.10284892754372e-06, + "loss": 0.0996, + "step": 8377 + }, + { + "epoch": 0.71, + "grad_norm": 0.7379331150655836, + "learning_rate": 2.1017368529014357e-06, + "loss": 0.121, + "step": 8378 + }, + { + "epoch": 0.71, + "grad_norm": 0.3288952063018087, + "learning_rate": 2.100624994138673e-06, + "loss": 0.0724, + "step": 8379 + }, + { + "epoch": 0.71, + "grad_norm": 0.3842620570062215, + "learning_rate": 2.0995133513382527e-06, + "loss": 0.1049, + "step": 8380 + }, + { + "epoch": 0.71, + "grad_norm": 0.19194204042663732, + "learning_rate": 2.0984019245829745e-06, + "loss": 0.0678, + "step": 8381 + }, + { + "epoch": 0.71, + "grad_norm": 0.7171173131565487, + "learning_rate": 2.0972907139556237e-06, + "loss": 0.122, + "step": 8382 + }, + { + "epoch": 0.71, + "grad_norm": 0.2599427122423824, + "learning_rate": 2.096179719538969e-06, + "loss": 0.0643, + "step": 8383 + }, + { + "epoch": 0.71, + "grad_norm": 0.44751356690909866, + "learning_rate": 2.095068941415766e-06, + "loss": 0.0952, + "step": 8384 + }, + { + "epoch": 0.71, + "grad_norm": 0.3400890017046582, + "learning_rate": 2.093958379668751e-06, + "loss": 0.0526, + "step": 8385 + }, + { + "epoch": 0.71, + "grad_norm": 0.20846312971393863, + "learning_rate": 2.0928480343806423e-06, + "loss": 0.0458, + "step": 8386 + }, + { + "epoch": 0.71, + "grad_norm": 0.5410756770441011, + "learning_rate": 2.09173790563415e-06, + "loss": 0.1037, + "step": 8387 + }, + { + "epoch": 0.71, + "grad_norm": 0.248353674784034, + "learning_rate": 2.09062799351196e-06, + "loss": 0.0568, + "step": 8388 + }, + { + "epoch": 0.71, + "grad_norm": 0.46795089099270226, + "learning_rate": 2.0895182980967453e-06, + "loss": 0.1073, + "step": 8389 + }, + { + "epoch": 0.71, + "grad_norm": 0.19309832299130636, + "learning_rate": 2.088408819471161e-06, + "loss": 0.0405, + "step": 8390 + }, + { + "epoch": 0.71, + "grad_norm": 0.270987729281112, + "learning_rate": 2.087299557717851e-06, + "loss": 0.068, + "step": 8391 + }, + { + "epoch": 0.71, + "grad_norm": 0.24750933783724813, + "learning_rate": 2.0861905129194377e-06, + "loss": 0.0639, + "step": 8392 + }, + { + "epoch": 0.71, + "grad_norm": 0.3596252297552899, + "learning_rate": 2.085081685158527e-06, + "loss": 0.0881, + "step": 8393 + }, + { + "epoch": 0.71, + "grad_norm": 0.22647214983458508, + "learning_rate": 2.083973074517715e-06, + "loss": 0.0638, + "step": 8394 + }, + { + "epoch": 0.71, + "grad_norm": 0.4910123388763237, + "learning_rate": 2.082864681079576e-06, + "loss": 0.1275, + "step": 8395 + }, + { + "epoch": 0.71, + "grad_norm": 0.23628405386874665, + "learning_rate": 2.081756504926667e-06, + "loss": 0.0664, + "step": 8396 + }, + { + "epoch": 0.71, + "grad_norm": 0.2909959222530801, + "learning_rate": 2.0806485461415347e-06, + "loss": 0.0775, + "step": 8397 + }, + { + "epoch": 0.71, + "grad_norm": 0.26908507252932734, + "learning_rate": 2.079540804806706e-06, + "loss": 0.0626, + "step": 8398 + }, + { + "epoch": 0.71, + "grad_norm": 0.5802150831002991, + "learning_rate": 2.078433281004691e-06, + "loss": 0.121, + "step": 8399 + }, + { + "epoch": 0.71, + "grad_norm": 0.1978838491681722, + "learning_rate": 2.0773259748179827e-06, + "loss": 0.0429, + "step": 8400 + }, + { + "epoch": 0.71, + "grad_norm": 0.24581154589215215, + "learning_rate": 2.0762188863290632e-06, + "loss": 0.0825, + "step": 8401 + }, + { + "epoch": 0.71, + "grad_norm": 0.21439576309310326, + "learning_rate": 2.0751120156203935e-06, + "loss": 0.0499, + "step": 8402 + }, + { + "epoch": 0.71, + "grad_norm": 0.32905432384479627, + "learning_rate": 2.0740053627744195e-06, + "loss": 0.0612, + "step": 8403 + }, + { + "epoch": 0.71, + "grad_norm": 0.24921363499672192, + "learning_rate": 2.0728989278735693e-06, + "loss": 0.0667, + "step": 8404 + }, + { + "epoch": 0.71, + "grad_norm": 0.3682516118629222, + "learning_rate": 2.071792711000261e-06, + "loss": 0.0814, + "step": 8405 + }, + { + "epoch": 0.71, + "grad_norm": 0.2550398916280299, + "learning_rate": 2.0706867122368897e-06, + "loss": 0.0821, + "step": 8406 + }, + { + "epoch": 0.71, + "grad_norm": 0.4547458096486892, + "learning_rate": 2.069580931665836e-06, + "loss": 0.1531, + "step": 8407 + }, + { + "epoch": 0.71, + "grad_norm": 0.43522368625652363, + "learning_rate": 2.0684753693694636e-06, + "loss": 0.0713, + "step": 8408 + }, + { + "epoch": 0.71, + "grad_norm": 0.44752555486562134, + "learning_rate": 2.0673700254301253e-06, + "loss": 0.1126, + "step": 8409 + }, + { + "epoch": 0.71, + "grad_norm": 0.2698307917708234, + "learning_rate": 2.066264899930151e-06, + "loss": 0.0706, + "step": 8410 + }, + { + "epoch": 0.71, + "grad_norm": 0.24850844085048537, + "learning_rate": 2.0651599929518574e-06, + "loss": 0.0683, + "step": 8411 + }, + { + "epoch": 0.71, + "grad_norm": 0.29797010470380547, + "learning_rate": 2.0640553045775443e-06, + "loss": 0.088, + "step": 8412 + }, + { + "epoch": 0.71, + "grad_norm": 0.2008024905706189, + "learning_rate": 2.062950834889493e-06, + "loss": 0.0544, + "step": 8413 + }, + { + "epoch": 0.71, + "grad_norm": 0.5548691988399708, + "learning_rate": 2.061846583969976e-06, + "loss": 0.1214, + "step": 8414 + }, + { + "epoch": 0.71, + "grad_norm": 0.2157245605101346, + "learning_rate": 2.0607425519012404e-06, + "loss": 0.047, + "step": 8415 + }, + { + "epoch": 0.71, + "grad_norm": 0.43524764830999374, + "learning_rate": 2.0596387387655224e-06, + "loss": 0.0948, + "step": 8416 + }, + { + "epoch": 0.71, + "grad_norm": 0.24060152594546, + "learning_rate": 2.058535144645038e-06, + "loss": 0.0979, + "step": 8417 + }, + { + "epoch": 0.71, + "grad_norm": 0.2636199024725251, + "learning_rate": 2.057431769621993e-06, + "loss": 0.0883, + "step": 8418 + }, + { + "epoch": 0.71, + "grad_norm": 0.39978839537230043, + "learning_rate": 2.056328613778572e-06, + "loss": 0.1136, + "step": 8419 + }, + { + "epoch": 0.71, + "grad_norm": 0.3781309478680478, + "learning_rate": 2.055225677196942e-06, + "loss": 0.1238, + "step": 8420 + }, + { + "epoch": 0.71, + "grad_norm": 0.3411043132445427, + "learning_rate": 2.05412295995926e-06, + "loss": 0.1024, + "step": 8421 + }, + { + "epoch": 0.71, + "grad_norm": 0.32952514850732917, + "learning_rate": 2.0530204621476607e-06, + "loss": 0.0772, + "step": 8422 + }, + { + "epoch": 0.71, + "grad_norm": 0.3694622923693062, + "learning_rate": 2.0519181838442646e-06, + "loss": 0.0753, + "step": 8423 + }, + { + "epoch": 0.71, + "grad_norm": 0.4680299330826819, + "learning_rate": 2.0508161251311748e-06, + "loss": 0.1044, + "step": 8424 + }, + { + "epoch": 0.71, + "grad_norm": 0.3711038438246592, + "learning_rate": 2.0497142860904817e-06, + "loss": 0.0707, + "step": 8425 + }, + { + "epoch": 0.71, + "grad_norm": 0.4619408187278086, + "learning_rate": 2.0486126668042554e-06, + "loss": 0.1163, + "step": 8426 + }, + { + "epoch": 0.71, + "grad_norm": 0.33569376255040817, + "learning_rate": 2.0475112673545487e-06, + "loss": 0.0673, + "step": 8427 + }, + { + "epoch": 0.71, + "grad_norm": 0.2103920103948312, + "learning_rate": 2.046410087823405e-06, + "loss": 0.0808, + "step": 8428 + }, + { + "epoch": 0.71, + "grad_norm": 0.4061037085671439, + "learning_rate": 2.045309128292843e-06, + "loss": 0.0813, + "step": 8429 + }, + { + "epoch": 0.71, + "grad_norm": 0.2829112191875717, + "learning_rate": 2.04420838884487e-06, + "loss": 0.0635, + "step": 8430 + }, + { + "epoch": 0.71, + "grad_norm": 0.18993723841079488, + "learning_rate": 2.0431078695614728e-06, + "loss": 0.0617, + "step": 8431 + }, + { + "epoch": 0.71, + "grad_norm": 0.4829519471029326, + "learning_rate": 2.0420075705246283e-06, + "loss": 0.1189, + "step": 8432 + }, + { + "epoch": 0.71, + "grad_norm": 0.5709067961081419, + "learning_rate": 2.040907491816292e-06, + "loss": 0.0759, + "step": 8433 + }, + { + "epoch": 0.71, + "grad_norm": 0.28283449259845955, + "learning_rate": 2.0398076335184015e-06, + "loss": 0.0879, + "step": 8434 + }, + { + "epoch": 0.71, + "grad_norm": 0.4228203937879955, + "learning_rate": 2.038707995712885e-06, + "loss": 0.0972, + "step": 8435 + }, + { + "epoch": 0.71, + "grad_norm": 0.764629553359395, + "learning_rate": 2.0376085784816473e-06, + "loss": 0.0694, + "step": 8436 + }, + { + "epoch": 0.71, + "grad_norm": 0.5632052735436225, + "learning_rate": 2.0365093819065785e-06, + "loss": 0.0648, + "step": 8437 + }, + { + "epoch": 0.71, + "grad_norm": 0.35578078265866986, + "learning_rate": 2.035410406069556e-06, + "loss": 0.1156, + "step": 8438 + }, + { + "epoch": 0.71, + "grad_norm": 0.4111706395502615, + "learning_rate": 2.034311651052437e-06, + "loss": 0.0676, + "step": 8439 + }, + { + "epoch": 0.71, + "grad_norm": 0.2434040680824665, + "learning_rate": 2.0332131169370624e-06, + "loss": 0.088, + "step": 8440 + }, + { + "epoch": 0.71, + "grad_norm": 0.21028405553623808, + "learning_rate": 2.0321148038052556e-06, + "loss": 0.0718, + "step": 8441 + }, + { + "epoch": 0.71, + "grad_norm": 0.351969601383178, + "learning_rate": 2.0310167117388294e-06, + "loss": 0.0732, + "step": 8442 + }, + { + "epoch": 0.71, + "grad_norm": 0.17555885939721344, + "learning_rate": 2.0299188408195742e-06, + "loss": 0.0485, + "step": 8443 + }, + { + "epoch": 0.71, + "grad_norm": 0.3147388028165598, + "learning_rate": 2.028821191129264e-06, + "loss": 0.0376, + "step": 8444 + }, + { + "epoch": 0.71, + "grad_norm": 0.2642972871459576, + "learning_rate": 2.0277237627496615e-06, + "loss": 0.0476, + "step": 8445 + }, + { + "epoch": 0.71, + "grad_norm": 0.5425764197585431, + "learning_rate": 2.026626555762508e-06, + "loss": 0.1123, + "step": 8446 + }, + { + "epoch": 0.71, + "grad_norm": 0.27872088794663147, + "learning_rate": 2.02552957024953e-06, + "loss": 0.0673, + "step": 8447 + }, + { + "epoch": 0.71, + "grad_norm": 0.2565679160096511, + "learning_rate": 2.024432806292435e-06, + "loss": 0.073, + "step": 8448 + }, + { + "epoch": 0.71, + "grad_norm": 0.3731645511312853, + "learning_rate": 2.0233362639729204e-06, + "loss": 0.1083, + "step": 8449 + }, + { + "epoch": 0.71, + "grad_norm": 0.21366848819269063, + "learning_rate": 2.0222399433726615e-06, + "loss": 0.0711, + "step": 8450 + }, + { + "epoch": 0.71, + "grad_norm": 0.3733688019053652, + "learning_rate": 2.021143844573316e-06, + "loss": 0.0985, + "step": 8451 + }, + { + "epoch": 0.71, + "grad_norm": 0.43457777439224915, + "learning_rate": 2.0200479676565325e-06, + "loss": 0.1098, + "step": 8452 + }, + { + "epoch": 0.71, + "grad_norm": 0.32355716354670944, + "learning_rate": 2.0189523127039355e-06, + "loss": 0.0882, + "step": 8453 + }, + { + "epoch": 0.71, + "grad_norm": 0.2907924190503596, + "learning_rate": 2.017856879797135e-06, + "loss": 0.075, + "step": 8454 + }, + { + "epoch": 0.71, + "grad_norm": 0.23519417576165608, + "learning_rate": 2.0167616690177272e-06, + "loss": 0.0602, + "step": 8455 + }, + { + "epoch": 0.71, + "grad_norm": 0.218485572369199, + "learning_rate": 2.0156666804472896e-06, + "loss": 0.0583, + "step": 8456 + }, + { + "epoch": 0.71, + "grad_norm": 0.3463550082841253, + "learning_rate": 2.0145719141673823e-06, + "loss": 0.0648, + "step": 8457 + }, + { + "epoch": 0.71, + "grad_norm": 0.39630140393255936, + "learning_rate": 2.0134773702595484e-06, + "loss": 0.0757, + "step": 8458 + }, + { + "epoch": 0.71, + "grad_norm": 0.3866339619249023, + "learning_rate": 2.01238304880532e-06, + "loss": 0.0777, + "step": 8459 + }, + { + "epoch": 0.71, + "grad_norm": 0.3492133643110339, + "learning_rate": 2.011288949886206e-06, + "loss": 0.0409, + "step": 8460 + }, + { + "epoch": 0.71, + "grad_norm": 0.5039637790517895, + "learning_rate": 2.0101950735836994e-06, + "loss": 0.1156, + "step": 8461 + }, + { + "epoch": 0.71, + "grad_norm": 0.32164155767504793, + "learning_rate": 2.009101419979283e-06, + "loss": 0.0349, + "step": 8462 + }, + { + "epoch": 0.71, + "grad_norm": 0.3122411688901697, + "learning_rate": 2.0080079891544155e-06, + "loss": 0.0469, + "step": 8463 + }, + { + "epoch": 0.71, + "grad_norm": 0.37755826340016413, + "learning_rate": 2.006914781190543e-06, + "loss": 0.0964, + "step": 8464 + }, + { + "epoch": 0.71, + "grad_norm": 0.2753612623000977, + "learning_rate": 2.005821796169092e-06, + "loss": 0.0704, + "step": 8465 + }, + { + "epoch": 0.71, + "grad_norm": 0.24769971796894905, + "learning_rate": 2.0047290341714777e-06, + "loss": 0.0757, + "step": 8466 + }, + { + "epoch": 0.71, + "grad_norm": 0.23023768074387638, + "learning_rate": 2.0036364952790937e-06, + "loss": 0.0662, + "step": 8467 + }, + { + "epoch": 0.71, + "grad_norm": 0.36002300891044653, + "learning_rate": 2.002544179573317e-06, + "loss": 0.0756, + "step": 8468 + }, + { + "epoch": 0.71, + "grad_norm": 0.5287592097186977, + "learning_rate": 2.0014520871355127e-06, + "loss": 0.0806, + "step": 8469 + }, + { + "epoch": 0.71, + "grad_norm": 0.33091963321383, + "learning_rate": 2.0003602180470246e-06, + "loss": 0.0513, + "step": 8470 + }, + { + "epoch": 0.71, + "grad_norm": 0.32636544346329205, + "learning_rate": 1.999268572389182e-06, + "loss": 0.0761, + "step": 8471 + }, + { + "epoch": 0.71, + "grad_norm": 0.19175048623923208, + "learning_rate": 1.9981771502432947e-06, + "loss": 0.0379, + "step": 8472 + }, + { + "epoch": 0.71, + "grad_norm": 0.3044771248200612, + "learning_rate": 1.997085951690662e-06, + "loss": 0.0837, + "step": 8473 + }, + { + "epoch": 0.71, + "grad_norm": 0.3165734753174872, + "learning_rate": 1.995994976812561e-06, + "loss": 0.0787, + "step": 8474 + }, + { + "epoch": 0.71, + "grad_norm": 0.2409673826011493, + "learning_rate": 1.994904225690254e-06, + "loss": 0.0645, + "step": 8475 + }, + { + "epoch": 0.71, + "grad_norm": 0.2629626129129082, + "learning_rate": 1.993813698404984e-06, + "loss": 0.0809, + "step": 8476 + }, + { + "epoch": 0.71, + "grad_norm": 0.3930171347661737, + "learning_rate": 1.9927233950379838e-06, + "loss": 0.0757, + "step": 8477 + }, + { + "epoch": 0.71, + "grad_norm": 0.31669100530274635, + "learning_rate": 1.9916333156704642e-06, + "loss": 0.0703, + "step": 8478 + }, + { + "epoch": 0.71, + "grad_norm": 0.4598854097267955, + "learning_rate": 1.99054346038362e-06, + "loss": 0.1017, + "step": 8479 + }, + { + "epoch": 0.71, + "grad_norm": 0.25319977275121913, + "learning_rate": 1.9894538292586304e-06, + "loss": 0.0714, + "step": 8480 + }, + { + "epoch": 0.71, + "grad_norm": 0.2545310671076968, + "learning_rate": 1.988364422376656e-06, + "loss": 0.0718, + "step": 8481 + }, + { + "epoch": 0.71, + "grad_norm": 0.34467019749411537, + "learning_rate": 1.987275239818844e-06, + "loss": 0.0626, + "step": 8482 + }, + { + "epoch": 0.71, + "grad_norm": 0.34897606521787566, + "learning_rate": 1.9861862816663234e-06, + "loss": 0.0811, + "step": 8483 + }, + { + "epoch": 0.71, + "grad_norm": 0.3451391432867077, + "learning_rate": 1.9850975480002057e-06, + "loss": 0.0946, + "step": 8484 + }, + { + "epoch": 0.71, + "grad_norm": 0.18572092932507714, + "learning_rate": 1.984009038901583e-06, + "loss": 0.0553, + "step": 8485 + }, + { + "epoch": 0.72, + "grad_norm": 0.3626592306908417, + "learning_rate": 1.9829207544515387e-06, + "loss": 0.108, + "step": 8486 + }, + { + "epoch": 0.72, + "grad_norm": 0.3983146219398756, + "learning_rate": 1.9818326947311326e-06, + "loss": 0.1087, + "step": 8487 + }, + { + "epoch": 0.72, + "grad_norm": 0.31522883499755, + "learning_rate": 1.980744859821408e-06, + "loss": 0.0887, + "step": 8488 + }, + { + "epoch": 0.72, + "grad_norm": 0.2741636112516907, + "learning_rate": 1.979657249803396e-06, + "loss": 0.0694, + "step": 8489 + }, + { + "epoch": 0.72, + "grad_norm": 0.39055946050310086, + "learning_rate": 1.978569864758107e-06, + "loss": 0.0842, + "step": 8490 + }, + { + "epoch": 0.72, + "grad_norm": 0.34908140403243165, + "learning_rate": 1.9774827047665357e-06, + "loss": 0.0913, + "step": 8491 + }, + { + "epoch": 0.72, + "grad_norm": 0.38331206874640184, + "learning_rate": 1.976395769909658e-06, + "loss": 0.0829, + "step": 8492 + }, + { + "epoch": 0.72, + "grad_norm": 0.2439903850610896, + "learning_rate": 1.975309060268439e-06, + "loss": 0.0469, + "step": 8493 + }, + { + "epoch": 0.72, + "grad_norm": 0.4912761390955267, + "learning_rate": 1.9742225759238215e-06, + "loss": 0.1165, + "step": 8494 + }, + { + "epoch": 0.72, + "grad_norm": 0.23214921079174586, + "learning_rate": 1.9731363169567307e-06, + "loss": 0.0441, + "step": 8495 + }, + { + "epoch": 0.72, + "grad_norm": 0.25573491688857525, + "learning_rate": 1.972050283448082e-06, + "loss": 0.0617, + "step": 8496 + }, + { + "epoch": 0.72, + "grad_norm": 0.29143229827524975, + "learning_rate": 1.970964475478768e-06, + "loss": 0.0564, + "step": 8497 + }, + { + "epoch": 0.72, + "grad_norm": 0.40077345182736057, + "learning_rate": 1.969878893129664e-06, + "loss": 0.0602, + "step": 8498 + }, + { + "epoch": 0.72, + "grad_norm": 0.334789715853125, + "learning_rate": 1.968793536481631e-06, + "loss": 0.0636, + "step": 8499 + }, + { + "epoch": 0.72, + "grad_norm": 0.21077862709145756, + "learning_rate": 1.9677084056155144e-06, + "loss": 0.0547, + "step": 8500 + }, + { + "epoch": 0.72, + "grad_norm": 0.5042201979732444, + "learning_rate": 1.96662350061214e-06, + "loss": 0.1341, + "step": 8501 + }, + { + "epoch": 0.72, + "grad_norm": 0.4295493063017846, + "learning_rate": 1.9655388215523173e-06, + "loss": 0.1111, + "step": 8502 + }, + { + "epoch": 0.72, + "grad_norm": 0.4620603316204197, + "learning_rate": 1.9644543685168418e-06, + "loss": 0.117, + "step": 8503 + }, + { + "epoch": 0.72, + "grad_norm": 0.6368737974836071, + "learning_rate": 1.9633701415864883e-06, + "loss": 0.155, + "step": 8504 + }, + { + "epoch": 0.72, + "grad_norm": 0.6179040138009031, + "learning_rate": 1.9622861408420142e-06, + "loss": 0.1045, + "step": 8505 + }, + { + "epoch": 0.72, + "grad_norm": 0.19967687952981883, + "learning_rate": 1.9612023663641656e-06, + "loss": 0.0538, + "step": 8506 + }, + { + "epoch": 0.72, + "grad_norm": 0.20942268097242273, + "learning_rate": 1.960118818233668e-06, + "loss": 0.0615, + "step": 8507 + }, + { + "epoch": 0.72, + "grad_norm": 0.22525760976935139, + "learning_rate": 1.959035496531229e-06, + "loss": 0.0623, + "step": 8508 + }, + { + "epoch": 0.72, + "grad_norm": 0.3570799984179504, + "learning_rate": 1.9579524013375394e-06, + "loss": 0.0985, + "step": 8509 + }, + { + "epoch": 0.72, + "grad_norm": 0.3836991323651682, + "learning_rate": 1.9568695327332776e-06, + "loss": 0.0712, + "step": 8510 + }, + { + "epoch": 0.72, + "grad_norm": 0.25641064259957336, + "learning_rate": 1.955786890799101e-06, + "loss": 0.0692, + "step": 8511 + }, + { + "epoch": 0.72, + "grad_norm": 0.3780333985950308, + "learning_rate": 1.9547044756156493e-06, + "loss": 0.1008, + "step": 8512 + }, + { + "epoch": 0.72, + "grad_norm": 0.2892969338192432, + "learning_rate": 1.953622287263549e-06, + "loss": 0.08, + "step": 8513 + }, + { + "epoch": 0.72, + "grad_norm": 0.38516021365613, + "learning_rate": 1.9525403258234084e-06, + "loss": 0.0829, + "step": 8514 + }, + { + "epoch": 0.72, + "grad_norm": 0.42864268732933164, + "learning_rate": 1.951458591375816e-06, + "loss": 0.1032, + "step": 8515 + }, + { + "epoch": 0.72, + "grad_norm": 0.16980827386873734, + "learning_rate": 1.950377084001346e-06, + "loss": 0.0438, + "step": 8516 + }, + { + "epoch": 0.72, + "grad_norm": 0.32610088354772443, + "learning_rate": 1.9492958037805576e-06, + "loss": 0.0943, + "step": 8517 + }, + { + "epoch": 0.72, + "grad_norm": 0.35624047022311983, + "learning_rate": 1.948214750793989e-06, + "loss": 0.0873, + "step": 8518 + }, + { + "epoch": 0.72, + "grad_norm": 0.23688965826678443, + "learning_rate": 1.947133925122162e-06, + "loss": 0.0688, + "step": 8519 + }, + { + "epoch": 0.72, + "grad_norm": 0.32540265060456125, + "learning_rate": 1.946053326845587e-06, + "loss": 0.0847, + "step": 8520 + }, + { + "epoch": 0.72, + "grad_norm": 0.34916619318427433, + "learning_rate": 1.94497295604475e-06, + "loss": 0.0965, + "step": 8521 + }, + { + "epoch": 0.72, + "grad_norm": 0.32179700827909685, + "learning_rate": 1.943892812800123e-06, + "loss": 0.0788, + "step": 8522 + }, + { + "epoch": 0.72, + "grad_norm": 0.2426318719346473, + "learning_rate": 1.9428128971921633e-06, + "loss": 0.0608, + "step": 8523 + }, + { + "epoch": 0.72, + "grad_norm": 0.2432053021198872, + "learning_rate": 1.941733209301309e-06, + "loss": 0.0427, + "step": 8524 + }, + { + "epoch": 0.72, + "grad_norm": 0.34656210228519874, + "learning_rate": 1.9406537492079815e-06, + "loss": 0.0916, + "step": 8525 + }, + { + "epoch": 0.72, + "grad_norm": 0.46448863782728345, + "learning_rate": 1.9395745169925823e-06, + "loss": 0.0996, + "step": 8526 + }, + { + "epoch": 0.72, + "grad_norm": 0.2374105775600577, + "learning_rate": 1.9384955127355032e-06, + "loss": 0.0595, + "step": 8527 + }, + { + "epoch": 0.72, + "grad_norm": 0.368170653000627, + "learning_rate": 1.937416736517113e-06, + "loss": 0.1103, + "step": 8528 + }, + { + "epoch": 0.72, + "grad_norm": 0.5461100414797615, + "learning_rate": 1.9363381884177635e-06, + "loss": 0.1054, + "step": 8529 + }, + { + "epoch": 0.72, + "grad_norm": 0.25884766353785843, + "learning_rate": 1.9352598685177942e-06, + "loss": 0.0772, + "step": 8530 + }, + { + "epoch": 0.72, + "grad_norm": 0.31664226038978116, + "learning_rate": 1.934181776897524e-06, + "loss": 0.1167, + "step": 8531 + }, + { + "epoch": 0.72, + "grad_norm": 0.2720614345815531, + "learning_rate": 1.933103913637254e-06, + "loss": 0.0712, + "step": 8532 + }, + { + "epoch": 0.72, + "grad_norm": 0.37683024052490904, + "learning_rate": 1.9320262788172696e-06, + "loss": 0.1149, + "step": 8533 + }, + { + "epoch": 0.72, + "grad_norm": 0.45275804524832103, + "learning_rate": 1.9309488725178415e-06, + "loss": 0.0867, + "step": 8534 + }, + { + "epoch": 0.72, + "grad_norm": 0.2576651796996571, + "learning_rate": 1.9298716948192197e-06, + "loss": 0.0732, + "step": 8535 + }, + { + "epoch": 0.72, + "grad_norm": 0.4300314333953212, + "learning_rate": 1.9287947458016374e-06, + "loss": 0.1012, + "step": 8536 + }, + { + "epoch": 0.72, + "grad_norm": 0.2754475262068441, + "learning_rate": 1.9277180255453154e-06, + "loss": 0.0675, + "step": 8537 + }, + { + "epoch": 0.72, + "grad_norm": 0.6744490503627898, + "learning_rate": 1.9266415341304524e-06, + "loss": 0.072, + "step": 8538 + }, + { + "epoch": 0.72, + "grad_norm": 0.24537260077628936, + "learning_rate": 1.9255652716372307e-06, + "loss": 0.0799, + "step": 8539 + }, + { + "epoch": 0.72, + "grad_norm": 0.2332707268434878, + "learning_rate": 1.9244892381458164e-06, + "loss": 0.0525, + "step": 8540 + }, + { + "epoch": 0.72, + "grad_norm": 0.3244815322951511, + "learning_rate": 1.9234134337363613e-06, + "loss": 0.0882, + "step": 8541 + }, + { + "epoch": 0.72, + "grad_norm": 0.3890515872322682, + "learning_rate": 1.922337858488997e-06, + "loss": 0.094, + "step": 8542 + }, + { + "epoch": 0.72, + "grad_norm": 0.29159352975622804, + "learning_rate": 1.9212625124838363e-06, + "loss": 0.0866, + "step": 8543 + }, + { + "epoch": 0.72, + "grad_norm": 0.4266420484417009, + "learning_rate": 1.9201873958009776e-06, + "loss": 0.0899, + "step": 8544 + }, + { + "epoch": 0.72, + "grad_norm": 0.5099198120632253, + "learning_rate": 1.9191125085205047e-06, + "loss": 0.1197, + "step": 8545 + }, + { + "epoch": 0.72, + "grad_norm": 0.4252373406429376, + "learning_rate": 1.9180378507224797e-06, + "loss": 0.1089, + "step": 8546 + }, + { + "epoch": 0.72, + "grad_norm": 0.24882572374304812, + "learning_rate": 1.916963422486949e-06, + "loss": 0.0672, + "step": 8547 + }, + { + "epoch": 0.72, + "grad_norm": 0.29076623430755955, + "learning_rate": 1.915889223893943e-06, + "loss": 0.0692, + "step": 8548 + }, + { + "epoch": 0.72, + "grad_norm": 0.2526994460435599, + "learning_rate": 1.9148152550234715e-06, + "loss": 0.073, + "step": 8549 + }, + { + "epoch": 0.72, + "grad_norm": 0.49834985927370773, + "learning_rate": 1.913741515955535e-06, + "loss": 0.0782, + "step": 8550 + }, + { + "epoch": 0.72, + "grad_norm": 0.3531671118218845, + "learning_rate": 1.9126680067701085e-06, + "loss": 0.0798, + "step": 8551 + }, + { + "epoch": 0.72, + "grad_norm": 0.304935035676818, + "learning_rate": 1.911594727547154e-06, + "loss": 0.0939, + "step": 8552 + }, + { + "epoch": 0.72, + "grad_norm": 0.2885637580433055, + "learning_rate": 1.9105216783666143e-06, + "loss": 0.0687, + "step": 8553 + }, + { + "epoch": 0.72, + "grad_norm": 0.3490227025641991, + "learning_rate": 1.909448859308418e-06, + "loss": 0.0825, + "step": 8554 + }, + { + "epoch": 0.72, + "grad_norm": 0.3019769104046824, + "learning_rate": 1.9083762704524756e-06, + "loss": 0.0686, + "step": 8555 + }, + { + "epoch": 0.72, + "grad_norm": 0.3140347874557103, + "learning_rate": 1.9073039118786786e-06, + "loss": 0.0641, + "step": 8556 + }, + { + "epoch": 0.72, + "grad_norm": 0.2921808536725846, + "learning_rate": 1.9062317836669002e-06, + "loss": 0.0409, + "step": 8557 + }, + { + "epoch": 0.72, + "grad_norm": 0.25815241897086005, + "learning_rate": 1.9051598858970033e-06, + "loss": 0.1007, + "step": 8558 + }, + { + "epoch": 0.72, + "grad_norm": 0.41246333032623084, + "learning_rate": 1.9040882186488263e-06, + "loss": 0.1049, + "step": 8559 + }, + { + "epoch": 0.72, + "grad_norm": 0.1966014976410521, + "learning_rate": 1.9030167820021927e-06, + "loss": 0.042, + "step": 8560 + }, + { + "epoch": 0.72, + "grad_norm": 0.18197116647764305, + "learning_rate": 1.9019455760369115e-06, + "loss": 0.0442, + "step": 8561 + }, + { + "epoch": 0.72, + "grad_norm": 0.25935200573564243, + "learning_rate": 1.9008746008327722e-06, + "loss": 0.0707, + "step": 8562 + }, + { + "epoch": 0.72, + "grad_norm": 0.39449134136279485, + "learning_rate": 1.8998038564695436e-06, + "loss": 0.0925, + "step": 8563 + }, + { + "epoch": 0.72, + "grad_norm": 0.13576304778052012, + "learning_rate": 1.8987333430269861e-06, + "loss": 0.043, + "step": 8564 + }, + { + "epoch": 0.72, + "grad_norm": 0.25640560295247133, + "learning_rate": 1.8976630605848357e-06, + "loss": 0.0821, + "step": 8565 + }, + { + "epoch": 0.72, + "grad_norm": 0.34270795130679027, + "learning_rate": 1.8965930092228124e-06, + "loss": 0.098, + "step": 8566 + }, + { + "epoch": 0.72, + "grad_norm": 0.31316471176577015, + "learning_rate": 1.8955231890206187e-06, + "loss": 0.0779, + "step": 8567 + }, + { + "epoch": 0.72, + "grad_norm": 0.18601313957211163, + "learning_rate": 1.894453600057945e-06, + "loss": 0.0611, + "step": 8568 + }, + { + "epoch": 0.72, + "grad_norm": 0.25663875456757806, + "learning_rate": 1.8933842424144583e-06, + "loss": 0.0623, + "step": 8569 + }, + { + "epoch": 0.72, + "grad_norm": 0.5170634353782716, + "learning_rate": 1.8923151161698084e-06, + "loss": 0.1217, + "step": 8570 + }, + { + "epoch": 0.72, + "grad_norm": 0.2782315748594582, + "learning_rate": 1.8912462214036343e-06, + "loss": 0.079, + "step": 8571 + }, + { + "epoch": 0.72, + "grad_norm": 0.285729215681705, + "learning_rate": 1.8901775581955518e-06, + "loss": 0.1064, + "step": 8572 + }, + { + "epoch": 0.72, + "grad_norm": 0.43080402499791876, + "learning_rate": 1.88910912662516e-06, + "loss": 0.0879, + "step": 8573 + }, + { + "epoch": 0.72, + "grad_norm": 0.3603952642305913, + "learning_rate": 1.8880409267720417e-06, + "loss": 0.0869, + "step": 8574 + }, + { + "epoch": 0.72, + "grad_norm": 0.21097104183887597, + "learning_rate": 1.8869729587157647e-06, + "loss": 0.0508, + "step": 8575 + }, + { + "epoch": 0.72, + "grad_norm": 0.3247021848064528, + "learning_rate": 1.885905222535877e-06, + "loss": 0.129, + "step": 8576 + }, + { + "epoch": 0.72, + "grad_norm": 0.19651041413473003, + "learning_rate": 1.8848377183119076e-06, + "loss": 0.0489, + "step": 8577 + }, + { + "epoch": 0.72, + "grad_norm": 0.3265273261981132, + "learning_rate": 1.8837704461233735e-06, + "loss": 0.0674, + "step": 8578 + }, + { + "epoch": 0.72, + "grad_norm": 0.2217765585260521, + "learning_rate": 1.88270340604977e-06, + "loss": 0.0571, + "step": 8579 + }, + { + "epoch": 0.72, + "grad_norm": 0.6220350260843028, + "learning_rate": 1.8816365981705748e-06, + "loss": 0.1044, + "step": 8580 + }, + { + "epoch": 0.72, + "grad_norm": 0.29388577121007736, + "learning_rate": 1.8805700225652534e-06, + "loss": 0.0833, + "step": 8581 + }, + { + "epoch": 0.72, + "grad_norm": 0.3096248031484029, + "learning_rate": 1.8795036793132487e-06, + "loss": 0.0708, + "step": 8582 + }, + { + "epoch": 0.72, + "grad_norm": 0.3596016465172378, + "learning_rate": 1.8784375684939881e-06, + "loss": 0.1037, + "step": 8583 + }, + { + "epoch": 0.72, + "grad_norm": 0.2806453467013293, + "learning_rate": 1.8773716901868805e-06, + "loss": 0.0803, + "step": 8584 + }, + { + "epoch": 0.72, + "grad_norm": 0.2722194069570301, + "learning_rate": 1.876306044471321e-06, + "loss": 0.0619, + "step": 8585 + }, + { + "epoch": 0.72, + "grad_norm": 0.500368177699109, + "learning_rate": 1.8752406314266848e-06, + "loss": 0.1074, + "step": 8586 + }, + { + "epoch": 0.72, + "grad_norm": 0.19191877413248948, + "learning_rate": 1.8741754511323273e-06, + "loss": 0.0604, + "step": 8587 + }, + { + "epoch": 0.72, + "grad_norm": 0.22010401451192818, + "learning_rate": 1.8731105036675939e-06, + "loss": 0.0579, + "step": 8588 + }, + { + "epoch": 0.72, + "grad_norm": 0.17140897248582623, + "learning_rate": 1.872045789111805e-06, + "loss": 0.0509, + "step": 8589 + }, + { + "epoch": 0.72, + "grad_norm": 0.16405010773083645, + "learning_rate": 1.8709813075442678e-06, + "loss": 0.0293, + "step": 8590 + }, + { + "epoch": 0.72, + "grad_norm": 0.528591413495145, + "learning_rate": 1.8699170590442683e-06, + "loss": 0.0988, + "step": 8591 + }, + { + "epoch": 0.72, + "grad_norm": 0.38761348223789505, + "learning_rate": 1.868853043691083e-06, + "loss": 0.0784, + "step": 8592 + }, + { + "epoch": 0.72, + "grad_norm": 0.22520792456417432, + "learning_rate": 1.8677892615639626e-06, + "loss": 0.0525, + "step": 8593 + }, + { + "epoch": 0.72, + "grad_norm": 0.17809455928089962, + "learning_rate": 1.866725712742143e-06, + "loss": 0.046, + "step": 8594 + }, + { + "epoch": 0.72, + "grad_norm": 0.41192993824968416, + "learning_rate": 1.8656623973048466e-06, + "loss": 0.1019, + "step": 8595 + }, + { + "epoch": 0.72, + "grad_norm": 0.3836326776477548, + "learning_rate": 1.8645993153312736e-06, + "loss": 0.0816, + "step": 8596 + }, + { + "epoch": 0.72, + "grad_norm": 0.2956545262143723, + "learning_rate": 1.8635364669006073e-06, + "loss": 0.0467, + "step": 8597 + }, + { + "epoch": 0.72, + "grad_norm": 0.46303551666203485, + "learning_rate": 1.8624738520920176e-06, + "loss": 0.1137, + "step": 8598 + }, + { + "epoch": 0.72, + "grad_norm": 0.25992743855380407, + "learning_rate": 1.8614114709846531e-06, + "loss": 0.0693, + "step": 8599 + }, + { + "epoch": 0.72, + "grad_norm": 0.2823570899252034, + "learning_rate": 1.8603493236576458e-06, + "loss": 0.087, + "step": 8600 + }, + { + "epoch": 0.72, + "grad_norm": 0.3077436998283426, + "learning_rate": 1.8592874101901087e-06, + "loss": 0.0693, + "step": 8601 + }, + { + "epoch": 0.72, + "grad_norm": 0.5004601074837179, + "learning_rate": 1.8582257306611435e-06, + "loss": 0.1339, + "step": 8602 + }, + { + "epoch": 0.72, + "grad_norm": 0.374641622616434, + "learning_rate": 1.8571642851498278e-06, + "loss": 0.0952, + "step": 8603 + }, + { + "epoch": 0.72, + "grad_norm": 0.24605204811060066, + "learning_rate": 1.8561030737352226e-06, + "loss": 0.0795, + "step": 8604 + }, + { + "epoch": 0.73, + "grad_norm": 0.17920773535033102, + "learning_rate": 1.8550420964963772e-06, + "loss": 0.0293, + "step": 8605 + }, + { + "epoch": 0.73, + "grad_norm": 0.19454660686575073, + "learning_rate": 1.8539813535123168e-06, + "loss": 0.0468, + "step": 8606 + }, + { + "epoch": 0.73, + "grad_norm": 0.27688784662274873, + "learning_rate": 1.8529208448620522e-06, + "loss": 0.0554, + "step": 8607 + }, + { + "epoch": 0.73, + "grad_norm": 0.2860967402599092, + "learning_rate": 1.8518605706245745e-06, + "loss": 0.0617, + "step": 8608 + }, + { + "epoch": 0.73, + "grad_norm": 0.34831612536026135, + "learning_rate": 1.8508005308788623e-06, + "loss": 0.0905, + "step": 8609 + }, + { + "epoch": 0.73, + "grad_norm": 0.21994747701689207, + "learning_rate": 1.8497407257038724e-06, + "loss": 0.0537, + "step": 8610 + }, + { + "epoch": 0.73, + "grad_norm": 0.6481709251188935, + "learning_rate": 1.8486811551785445e-06, + "loss": 0.1338, + "step": 8611 + }, + { + "epoch": 0.73, + "grad_norm": 0.16854281677463817, + "learning_rate": 1.8476218193818007e-06, + "loss": 0.0389, + "step": 8612 + }, + { + "epoch": 0.73, + "grad_norm": 0.22451520637281053, + "learning_rate": 1.8465627183925488e-06, + "loss": 0.0728, + "step": 8613 + }, + { + "epoch": 0.73, + "grad_norm": 0.3524275447506338, + "learning_rate": 1.8455038522896761e-06, + "loss": 0.0812, + "step": 8614 + }, + { + "epoch": 0.73, + "grad_norm": 0.41652203151895745, + "learning_rate": 1.8444452211520531e-06, + "loss": 0.0986, + "step": 8615 + }, + { + "epoch": 0.73, + "grad_norm": 0.35620612459749806, + "learning_rate": 1.8433868250585318e-06, + "loss": 0.0788, + "step": 8616 + }, + { + "epoch": 0.73, + "grad_norm": 0.31026363685158903, + "learning_rate": 1.8423286640879474e-06, + "loss": 0.0682, + "step": 8617 + }, + { + "epoch": 0.73, + "grad_norm": 0.41123393727912216, + "learning_rate": 1.8412707383191203e-06, + "loss": 0.0775, + "step": 8618 + }, + { + "epoch": 0.73, + "grad_norm": 0.2898005388635064, + "learning_rate": 1.8402130478308495e-06, + "loss": 0.0823, + "step": 8619 + }, + { + "epoch": 0.73, + "grad_norm": 0.36717247211938964, + "learning_rate": 1.8391555927019177e-06, + "loss": 0.0836, + "step": 8620 + }, + { + "epoch": 0.73, + "grad_norm": 0.4549120776121371, + "learning_rate": 1.8380983730110895e-06, + "loss": 0.1216, + "step": 8621 + }, + { + "epoch": 0.73, + "grad_norm": 0.21057728343910734, + "learning_rate": 1.8370413888371152e-06, + "loss": 0.0511, + "step": 8622 + }, + { + "epoch": 0.73, + "grad_norm": 0.3138541833302261, + "learning_rate": 1.8359846402587246e-06, + "loss": 0.0622, + "step": 8623 + }, + { + "epoch": 0.73, + "grad_norm": 0.23827953589534256, + "learning_rate": 1.8349281273546288e-06, + "loss": 0.0747, + "step": 8624 + }, + { + "epoch": 0.73, + "grad_norm": 0.4342755359960006, + "learning_rate": 1.833871850203523e-06, + "loss": 0.0949, + "step": 8625 + }, + { + "epoch": 0.73, + "grad_norm": 0.14727810370518662, + "learning_rate": 1.832815808884087e-06, + "loss": 0.0428, + "step": 8626 + }, + { + "epoch": 0.73, + "grad_norm": 0.22125256958175882, + "learning_rate": 1.83176000347498e-06, + "loss": 0.0544, + "step": 8627 + }, + { + "epoch": 0.73, + "grad_norm": 0.3765734030478906, + "learning_rate": 1.8307044340548425e-06, + "loss": 0.1002, + "step": 8628 + }, + { + "epoch": 0.73, + "grad_norm": 0.4268078333894236, + "learning_rate": 1.8296491007023032e-06, + "loss": 0.0767, + "step": 8629 + }, + { + "epoch": 0.73, + "grad_norm": 0.47729681115747163, + "learning_rate": 1.8285940034959675e-06, + "loss": 0.1061, + "step": 8630 + }, + { + "epoch": 0.73, + "grad_norm": 0.34937699841822933, + "learning_rate": 1.8275391425144234e-06, + "loss": 0.0783, + "step": 8631 + }, + { + "epoch": 0.73, + "grad_norm": 0.2177383747992872, + "learning_rate": 1.8264845178362467e-06, + "loss": 0.0549, + "step": 8632 + }, + { + "epoch": 0.73, + "grad_norm": 0.2527299874051213, + "learning_rate": 1.8254301295399901e-06, + "loss": 0.0816, + "step": 8633 + }, + { + "epoch": 0.73, + "grad_norm": 0.2699610776027627, + "learning_rate": 1.8243759777041908e-06, + "loss": 0.0787, + "step": 8634 + }, + { + "epoch": 0.73, + "grad_norm": 0.25467081786197815, + "learning_rate": 1.823322062407366e-06, + "loss": 0.0493, + "step": 8635 + }, + { + "epoch": 0.73, + "grad_norm": 0.29735273146432556, + "learning_rate": 1.822268383728022e-06, + "loss": 0.0821, + "step": 8636 + }, + { + "epoch": 0.73, + "grad_norm": 0.3053433320474836, + "learning_rate": 1.8212149417446395e-06, + "loss": 0.0884, + "step": 8637 + }, + { + "epoch": 0.73, + "grad_norm": 0.3045493640132282, + "learning_rate": 1.8201617365356849e-06, + "loss": 0.0677, + "step": 8638 + }, + { + "epoch": 0.73, + "grad_norm": 0.21505617293362075, + "learning_rate": 1.8191087681796094e-06, + "loss": 0.0416, + "step": 8639 + }, + { + "epoch": 0.73, + "grad_norm": 0.3081263273269807, + "learning_rate": 1.8180560367548433e-06, + "loss": 0.0886, + "step": 8640 + }, + { + "epoch": 0.73, + "grad_norm": 0.3350906921722642, + "learning_rate": 1.8170035423397996e-06, + "loss": 0.0951, + "step": 8641 + }, + { + "epoch": 0.73, + "grad_norm": 0.2904368106063874, + "learning_rate": 1.8159512850128724e-06, + "loss": 0.0933, + "step": 8642 + }, + { + "epoch": 0.73, + "grad_norm": 0.36128543941868674, + "learning_rate": 1.8148992648524439e-06, + "loss": 0.0918, + "step": 8643 + }, + { + "epoch": 0.73, + "grad_norm": 0.26085713603674, + "learning_rate": 1.813847481936873e-06, + "loss": 0.0551, + "step": 8644 + }, + { + "epoch": 0.73, + "grad_norm": 0.3249519606778011, + "learning_rate": 1.8127959363445003e-06, + "loss": 0.0611, + "step": 8645 + }, + { + "epoch": 0.73, + "grad_norm": 0.2057574035486612, + "learning_rate": 1.811744628153655e-06, + "loss": 0.0727, + "step": 8646 + }, + { + "epoch": 0.73, + "grad_norm": 0.23028273606829058, + "learning_rate": 1.8106935574426426e-06, + "loss": 0.0618, + "step": 8647 + }, + { + "epoch": 0.73, + "grad_norm": 0.2585961472052786, + "learning_rate": 1.8096427242897513e-06, + "loss": 0.0587, + "step": 8648 + }, + { + "epoch": 0.73, + "grad_norm": 0.6153779768180376, + "learning_rate": 1.8085921287732572e-06, + "loss": 0.1088, + "step": 8649 + }, + { + "epoch": 0.73, + "grad_norm": 0.4221687483075273, + "learning_rate": 1.807541770971412e-06, + "loss": 0.0647, + "step": 8650 + }, + { + "epoch": 0.73, + "grad_norm": 0.28487048551357025, + "learning_rate": 1.8064916509624537e-06, + "loss": 0.0896, + "step": 8651 + }, + { + "epoch": 0.73, + "grad_norm": 0.3081947861834471, + "learning_rate": 1.805441768824599e-06, + "loss": 0.0573, + "step": 8652 + }, + { + "epoch": 0.73, + "grad_norm": 0.40059624030232976, + "learning_rate": 1.804392124636053e-06, + "loss": 0.1021, + "step": 8653 + }, + { + "epoch": 0.73, + "grad_norm": 0.2751407282957148, + "learning_rate": 1.803342718474997e-06, + "loss": 0.0556, + "step": 8654 + }, + { + "epoch": 0.73, + "grad_norm": 0.514292756242048, + "learning_rate": 1.8022935504195953e-06, + "loss": 0.1442, + "step": 8655 + }, + { + "epoch": 0.73, + "grad_norm": 0.3485153096511675, + "learning_rate": 1.801244620548e-06, + "loss": 0.0689, + "step": 8656 + }, + { + "epoch": 0.73, + "grad_norm": 0.36193338219730853, + "learning_rate": 1.8001959289383397e-06, + "loss": 0.0938, + "step": 8657 + }, + { + "epoch": 0.73, + "grad_norm": 0.30244050086938523, + "learning_rate": 1.7991474756687266e-06, + "loss": 0.1069, + "step": 8658 + }, + { + "epoch": 0.73, + "grad_norm": 0.22308022945755457, + "learning_rate": 1.7980992608172543e-06, + "loss": 0.0751, + "step": 8659 + }, + { + "epoch": 0.73, + "grad_norm": 0.2871274660145017, + "learning_rate": 1.7970512844620031e-06, + "loss": 0.0801, + "step": 8660 + }, + { + "epoch": 0.73, + "grad_norm": 0.34817401355808386, + "learning_rate": 1.7960035466810306e-06, + "loss": 0.1108, + "step": 8661 + }, + { + "epoch": 0.73, + "grad_norm": 0.3875303957814468, + "learning_rate": 1.794956047552377e-06, + "loss": 0.0896, + "step": 8662 + }, + { + "epoch": 0.73, + "grad_norm": 0.1725560357197752, + "learning_rate": 1.7939087871540695e-06, + "loss": 0.0378, + "step": 8663 + }, + { + "epoch": 0.73, + "grad_norm": 0.3513066545607775, + "learning_rate": 1.7928617655641122e-06, + "loss": 0.066, + "step": 8664 + }, + { + "epoch": 0.73, + "grad_norm": 0.3779562358907929, + "learning_rate": 1.791814982860494e-06, + "loss": 0.066, + "step": 8665 + }, + { + "epoch": 0.73, + "grad_norm": 0.3379868295796861, + "learning_rate": 1.7907684391211834e-06, + "loss": 0.1065, + "step": 8666 + }, + { + "epoch": 0.73, + "grad_norm": 0.284081879878797, + "learning_rate": 1.7897221344241355e-06, + "loss": 0.0737, + "step": 8667 + }, + { + "epoch": 0.73, + "grad_norm": 0.2921253828260448, + "learning_rate": 1.7886760688472849e-06, + "loss": 0.0789, + "step": 8668 + }, + { + "epoch": 0.73, + "grad_norm": 0.26074279692059793, + "learning_rate": 1.787630242468546e-06, + "loss": 0.0532, + "step": 8669 + }, + { + "epoch": 0.73, + "grad_norm": 0.24148583940991145, + "learning_rate": 1.7865846553658222e-06, + "loss": 0.0501, + "step": 8670 + }, + { + "epoch": 0.73, + "grad_norm": 0.4316766538791286, + "learning_rate": 1.7855393076169924e-06, + "loss": 0.0717, + "step": 8671 + }, + { + "epoch": 0.73, + "grad_norm": 0.10586896885396005, + "learning_rate": 1.7844941992999187e-06, + "loss": 0.0142, + "step": 8672 + }, + { + "epoch": 0.73, + "grad_norm": 0.42154239011994626, + "learning_rate": 1.7834493304924511e-06, + "loss": 0.1098, + "step": 8673 + }, + { + "epoch": 0.73, + "grad_norm": 0.2027515746861528, + "learning_rate": 1.7824047012724144e-06, + "loss": 0.0517, + "step": 8674 + }, + { + "epoch": 0.73, + "grad_norm": 0.48287025620246105, + "learning_rate": 1.7813603117176204e-06, + "loss": 0.1156, + "step": 8675 + }, + { + "epoch": 0.73, + "grad_norm": 0.2524719521194105, + "learning_rate": 1.780316161905858e-06, + "loss": 0.0654, + "step": 8676 + }, + { + "epoch": 0.73, + "grad_norm": 0.390376371558901, + "learning_rate": 1.7792722519149054e-06, + "loss": 0.1255, + "step": 8677 + }, + { + "epoch": 0.73, + "grad_norm": 0.35560524744515615, + "learning_rate": 1.7782285818225181e-06, + "loss": 0.0979, + "step": 8678 + }, + { + "epoch": 0.73, + "grad_norm": 0.2861444678733224, + "learning_rate": 1.7771851517064337e-06, + "loss": 0.0663, + "step": 8679 + }, + { + "epoch": 0.73, + "grad_norm": 0.5039509023997834, + "learning_rate": 1.776141961644372e-06, + "loss": 0.1548, + "step": 8680 + }, + { + "epoch": 0.73, + "grad_norm": 0.29094961281556475, + "learning_rate": 1.7750990117140393e-06, + "loss": 0.0761, + "step": 8681 + }, + { + "epoch": 0.73, + "grad_norm": 0.1977818371205157, + "learning_rate": 1.7740563019931185e-06, + "loss": 0.0482, + "step": 8682 + }, + { + "epoch": 0.73, + "grad_norm": 0.1685375988936261, + "learning_rate": 1.7730138325592772e-06, + "loss": 0.0513, + "step": 8683 + }, + { + "epoch": 0.73, + "grad_norm": 0.31627008190400774, + "learning_rate": 1.7719716034901635e-06, + "loss": 0.0709, + "step": 8684 + }, + { + "epoch": 0.73, + "grad_norm": 0.21724756290984287, + "learning_rate": 1.7709296148634081e-06, + "loss": 0.0521, + "step": 8685 + }, + { + "epoch": 0.73, + "grad_norm": 0.19879558195739336, + "learning_rate": 1.7698878667566277e-06, + "loss": 0.062, + "step": 8686 + }, + { + "epoch": 0.73, + "grad_norm": 0.2932456994401182, + "learning_rate": 1.7688463592474154e-06, + "loss": 0.0728, + "step": 8687 + }, + { + "epoch": 0.73, + "grad_norm": 0.2491461840708174, + "learning_rate": 1.7678050924133488e-06, + "loss": 0.0768, + "step": 8688 + }, + { + "epoch": 0.73, + "grad_norm": 0.3128929919532564, + "learning_rate": 1.7667640663319868e-06, + "loss": 0.069, + "step": 8689 + }, + { + "epoch": 0.73, + "grad_norm": 0.6108429328082423, + "learning_rate": 1.7657232810808738e-06, + "loss": 0.16, + "step": 8690 + }, + { + "epoch": 0.73, + "grad_norm": 0.28882902424628315, + "learning_rate": 1.7646827367375314e-06, + "loss": 0.0727, + "step": 8691 + }, + { + "epoch": 0.73, + "grad_norm": 0.3390837493679277, + "learning_rate": 1.7636424333794661e-06, + "loss": 0.1063, + "step": 8692 + }, + { + "epoch": 0.73, + "grad_norm": 0.2842551280277906, + "learning_rate": 1.7626023710841638e-06, + "loss": 0.0846, + "step": 8693 + }, + { + "epoch": 0.73, + "grad_norm": 0.31923131279803996, + "learning_rate": 1.7615625499290984e-06, + "loss": 0.0769, + "step": 8694 + }, + { + "epoch": 0.73, + "grad_norm": 0.3640303854454691, + "learning_rate": 1.7605229699917186e-06, + "loss": 0.0959, + "step": 8695 + }, + { + "epoch": 0.73, + "grad_norm": 0.46707786235956106, + "learning_rate": 1.7594836313494583e-06, + "loss": 0.1248, + "step": 8696 + }, + { + "epoch": 0.73, + "grad_norm": 0.3106428675461031, + "learning_rate": 1.7584445340797362e-06, + "loss": 0.1002, + "step": 8697 + }, + { + "epoch": 0.73, + "grad_norm": 0.22159452429851254, + "learning_rate": 1.7574056782599486e-06, + "loss": 0.047, + "step": 8698 + }, + { + "epoch": 0.73, + "grad_norm": 0.3232279840228523, + "learning_rate": 1.7563670639674756e-06, + "loss": 0.0842, + "step": 8699 + }, + { + "epoch": 0.73, + "grad_norm": 0.30923461729718876, + "learning_rate": 1.7553286912796775e-06, + "loss": 0.0617, + "step": 8700 + }, + { + "epoch": 0.73, + "grad_norm": 0.3398334293264469, + "learning_rate": 1.7542905602739018e-06, + "loss": 0.0844, + "step": 8701 + }, + { + "epoch": 0.73, + "grad_norm": 0.3085803356767103, + "learning_rate": 1.7532526710274728e-06, + "loss": 0.1051, + "step": 8702 + }, + { + "epoch": 0.73, + "grad_norm": 0.31046214472357836, + "learning_rate": 1.7522150236176966e-06, + "loss": 0.0962, + "step": 8703 + }, + { + "epoch": 0.73, + "grad_norm": 0.2768882979956396, + "learning_rate": 1.751177618121867e-06, + "loss": 0.0782, + "step": 8704 + }, + { + "epoch": 0.73, + "grad_norm": 0.3326129294764262, + "learning_rate": 1.7501404546172545e-06, + "loss": 0.0812, + "step": 8705 + }, + { + "epoch": 0.73, + "grad_norm": 0.27416905221469967, + "learning_rate": 1.7491035331811112e-06, + "loss": 0.0657, + "step": 8706 + }, + { + "epoch": 0.73, + "grad_norm": 0.35078918052918373, + "learning_rate": 1.7480668538906759e-06, + "loss": 0.0914, + "step": 8707 + }, + { + "epoch": 0.73, + "grad_norm": 0.23242424127272066, + "learning_rate": 1.7470304168231656e-06, + "loss": 0.0521, + "step": 8708 + }, + { + "epoch": 0.73, + "grad_norm": 0.25873811728404233, + "learning_rate": 1.7459942220557791e-06, + "loss": 0.0878, + "step": 8709 + }, + { + "epoch": 0.73, + "grad_norm": 0.26841697766911105, + "learning_rate": 1.744958269665698e-06, + "loss": 0.0765, + "step": 8710 + }, + { + "epoch": 0.73, + "grad_norm": 0.24867793403145752, + "learning_rate": 1.743922559730088e-06, + "loss": 0.052, + "step": 8711 + }, + { + "epoch": 0.73, + "grad_norm": 0.4176886883940249, + "learning_rate": 1.7428870923260944e-06, + "loss": 0.0908, + "step": 8712 + }, + { + "epoch": 0.73, + "grad_norm": 0.37593505594656645, + "learning_rate": 1.741851867530842e-06, + "loss": 0.0687, + "step": 8713 + }, + { + "epoch": 0.73, + "grad_norm": 0.5328837487002965, + "learning_rate": 1.7408168854214451e-06, + "loss": 0.1433, + "step": 8714 + }, + { + "epoch": 0.73, + "grad_norm": 0.22322119893185083, + "learning_rate": 1.739782146074992e-06, + "loss": 0.0717, + "step": 8715 + }, + { + "epoch": 0.73, + "grad_norm": 0.19109203210180206, + "learning_rate": 1.7387476495685574e-06, + "loss": 0.0384, + "step": 8716 + }, + { + "epoch": 0.73, + "grad_norm": 0.6718769446273014, + "learning_rate": 1.7377133959791941e-06, + "loss": 0.101, + "step": 8717 + }, + { + "epoch": 0.73, + "grad_norm": 0.22730740276550823, + "learning_rate": 1.7366793853839426e-06, + "loss": 0.0629, + "step": 8718 + }, + { + "epoch": 0.73, + "grad_norm": 0.23334270942185734, + "learning_rate": 1.7356456178598214e-06, + "loss": 0.0667, + "step": 8719 + }, + { + "epoch": 0.73, + "grad_norm": 0.2985597337975208, + "learning_rate": 1.734612093483829e-06, + "loss": 0.0715, + "step": 8720 + }, + { + "epoch": 0.73, + "grad_norm": 0.4439876551516199, + "learning_rate": 1.7335788123329517e-06, + "loss": 0.1155, + "step": 8721 + }, + { + "epoch": 0.73, + "grad_norm": 0.3012377339851268, + "learning_rate": 1.7325457744841523e-06, + "loss": 0.079, + "step": 8722 + }, + { + "epoch": 0.74, + "grad_norm": 0.20970008898597461, + "learning_rate": 1.731512980014377e-06, + "loss": 0.0637, + "step": 8723 + }, + { + "epoch": 0.74, + "grad_norm": 0.25080807724361265, + "learning_rate": 1.7304804290005568e-06, + "loss": 0.045, + "step": 8724 + }, + { + "epoch": 0.74, + "grad_norm": 0.4761017383300039, + "learning_rate": 1.7294481215196002e-06, + "loss": 0.1229, + "step": 8725 + }, + { + "epoch": 0.74, + "grad_norm": 0.21185000446656518, + "learning_rate": 1.7284160576484005e-06, + "loss": 0.0657, + "step": 8726 + }, + { + "epoch": 0.74, + "grad_norm": 0.3552183574719446, + "learning_rate": 1.727384237463829e-06, + "loss": 0.1029, + "step": 8727 + }, + { + "epoch": 0.74, + "grad_norm": 0.380019654986308, + "learning_rate": 1.7263526610427455e-06, + "loss": 0.0766, + "step": 8728 + }, + { + "epoch": 0.74, + "grad_norm": 0.3148135389349021, + "learning_rate": 1.725321328461987e-06, + "loss": 0.0792, + "step": 8729 + }, + { + "epoch": 0.74, + "grad_norm": 0.22413238574563393, + "learning_rate": 1.7242902397983703e-06, + "loss": 0.0844, + "step": 8730 + }, + { + "epoch": 0.74, + "grad_norm": 0.364385506699739, + "learning_rate": 1.7232593951287002e-06, + "loss": 0.0759, + "step": 8731 + }, + { + "epoch": 0.74, + "grad_norm": 0.32918589855198443, + "learning_rate": 1.7222287945297594e-06, + "loss": 0.071, + "step": 8732 + }, + { + "epoch": 0.74, + "grad_norm": 0.3533364453809993, + "learning_rate": 1.721198438078312e-06, + "loss": 0.0779, + "step": 8733 + }, + { + "epoch": 0.74, + "grad_norm": 0.3475817541011987, + "learning_rate": 1.7201683258511036e-06, + "loss": 0.0844, + "step": 8734 + }, + { + "epoch": 0.74, + "grad_norm": 0.30900219663427225, + "learning_rate": 1.7191384579248666e-06, + "loss": 0.0779, + "step": 8735 + }, + { + "epoch": 0.74, + "grad_norm": 0.3754553406086458, + "learning_rate": 1.7181088343763096e-06, + "loss": 0.0994, + "step": 8736 + }, + { + "epoch": 0.74, + "grad_norm": 0.33187186455592665, + "learning_rate": 1.7170794552821235e-06, + "loss": 0.098, + "step": 8737 + }, + { + "epoch": 0.74, + "grad_norm": 0.4973758225921344, + "learning_rate": 1.716050320718986e-06, + "loss": 0.1078, + "step": 8738 + }, + { + "epoch": 0.74, + "grad_norm": 0.302222666763661, + "learning_rate": 1.7150214307635503e-06, + "loss": 0.0645, + "step": 8739 + }, + { + "epoch": 0.74, + "grad_norm": 0.26073419589042646, + "learning_rate": 1.7139927854924537e-06, + "loss": 0.0625, + "step": 8740 + }, + { + "epoch": 0.74, + "grad_norm": 0.3014173484670803, + "learning_rate": 1.7129643849823186e-06, + "loss": 0.062, + "step": 8741 + }, + { + "epoch": 0.74, + "grad_norm": 0.3334597613426135, + "learning_rate": 1.7119362293097442e-06, + "loss": 0.0651, + "step": 8742 + }, + { + "epoch": 0.74, + "grad_norm": 0.4658117011075586, + "learning_rate": 1.7109083185513137e-06, + "loss": 0.1199, + "step": 8743 + }, + { + "epoch": 0.74, + "grad_norm": 0.24402236638300986, + "learning_rate": 1.7098806527835904e-06, + "loss": 0.0603, + "step": 8744 + }, + { + "epoch": 0.74, + "grad_norm": 0.31070114321252706, + "learning_rate": 1.7088532320831247e-06, + "loss": 0.0907, + "step": 8745 + }, + { + "epoch": 0.74, + "grad_norm": 0.3755196523547807, + "learning_rate": 1.707826056526442e-06, + "loss": 0.0969, + "step": 8746 + }, + { + "epoch": 0.74, + "grad_norm": 0.21531389187109795, + "learning_rate": 1.706799126190053e-06, + "loss": 0.0644, + "step": 8747 + }, + { + "epoch": 0.74, + "grad_norm": 0.46027226049894004, + "learning_rate": 1.7057724411504478e-06, + "loss": 0.1275, + "step": 8748 + }, + { + "epoch": 0.74, + "grad_norm": 0.3551730442805734, + "learning_rate": 1.7047460014841033e-06, + "loss": 0.0778, + "step": 8749 + }, + { + "epoch": 0.74, + "grad_norm": 0.3638717755514339, + "learning_rate": 1.7037198072674727e-06, + "loss": 0.0648, + "step": 8750 + }, + { + "epoch": 0.74, + "grad_norm": 0.2549676718095175, + "learning_rate": 1.702693858576993e-06, + "loss": 0.0486, + "step": 8751 + }, + { + "epoch": 0.74, + "grad_norm": 0.4794863435420585, + "learning_rate": 1.7016681554890835e-06, + "loss": 0.1008, + "step": 8752 + }, + { + "epoch": 0.74, + "grad_norm": 0.2674691611776805, + "learning_rate": 1.7006426980801416e-06, + "loss": 0.0412, + "step": 8753 + }, + { + "epoch": 0.74, + "grad_norm": 0.24301743057900513, + "learning_rate": 1.699617486426554e-06, + "loss": 0.0737, + "step": 8754 + }, + { + "epoch": 0.74, + "grad_norm": 0.5365760752924508, + "learning_rate": 1.6985925206046817e-06, + "loss": 0.1111, + "step": 8755 + }, + { + "epoch": 0.74, + "grad_norm": 0.25325189925172464, + "learning_rate": 1.6975678006908707e-06, + "loss": 0.0708, + "step": 8756 + }, + { + "epoch": 0.74, + "grad_norm": 0.30387032501115085, + "learning_rate": 1.6965433267614466e-06, + "loss": 0.0751, + "step": 8757 + }, + { + "epoch": 0.74, + "grad_norm": 0.48228192104702716, + "learning_rate": 1.6955190988927212e-06, + "loss": 0.1358, + "step": 8758 + }, + { + "epoch": 0.74, + "grad_norm": 0.3297876811440097, + "learning_rate": 1.694495117160983e-06, + "loss": 0.0742, + "step": 8759 + }, + { + "epoch": 0.74, + "grad_norm": 0.2444151792730252, + "learning_rate": 1.6934713816425048e-06, + "loss": 0.0638, + "step": 8760 + }, + { + "epoch": 0.74, + "grad_norm": 0.26093111820011816, + "learning_rate": 1.692447892413538e-06, + "loss": 0.0783, + "step": 8761 + }, + { + "epoch": 0.74, + "grad_norm": 0.2046438594919639, + "learning_rate": 1.6914246495503224e-06, + "loss": 0.0559, + "step": 8762 + }, + { + "epoch": 0.74, + "grad_norm": 0.2454665511297569, + "learning_rate": 1.6904016531290718e-06, + "loss": 0.0638, + "step": 8763 + }, + { + "epoch": 0.74, + "grad_norm": 0.3658237098586329, + "learning_rate": 1.6893789032259849e-06, + "loss": 0.1023, + "step": 8764 + }, + { + "epoch": 0.74, + "grad_norm": 0.3951852791666201, + "learning_rate": 1.6883563999172447e-06, + "loss": 0.0849, + "step": 8765 + }, + { + "epoch": 0.74, + "grad_norm": 0.4265537722389651, + "learning_rate": 1.6873341432790114e-06, + "loss": 0.0953, + "step": 8766 + }, + { + "epoch": 0.74, + "grad_norm": 0.3279769070756954, + "learning_rate": 1.6863121333874294e-06, + "loss": 0.0899, + "step": 8767 + }, + { + "epoch": 0.74, + "grad_norm": 0.3823382305434872, + "learning_rate": 1.6852903703186208e-06, + "loss": 0.0872, + "step": 8768 + }, + { + "epoch": 0.74, + "grad_norm": 0.35788709796349316, + "learning_rate": 1.6842688541486973e-06, + "loss": 0.0423, + "step": 8769 + }, + { + "epoch": 0.74, + "grad_norm": 0.29422093668250787, + "learning_rate": 1.683247584953745e-06, + "loss": 0.0622, + "step": 8770 + }, + { + "epoch": 0.74, + "grad_norm": 0.2708870773010524, + "learning_rate": 1.6822265628098323e-06, + "loss": 0.0584, + "step": 8771 + }, + { + "epoch": 0.74, + "grad_norm": 0.3566076802147964, + "learning_rate": 1.6812057877930143e-06, + "loss": 0.0875, + "step": 8772 + }, + { + "epoch": 0.74, + "grad_norm": 0.33692850483435854, + "learning_rate": 1.6801852599793228e-06, + "loss": 0.0736, + "step": 8773 + }, + { + "epoch": 0.74, + "grad_norm": 0.24828768418671482, + "learning_rate": 1.6791649794447707e-06, + "loss": 0.0674, + "step": 8774 + }, + { + "epoch": 0.74, + "grad_norm": 0.2914861070819317, + "learning_rate": 1.6781449462653581e-06, + "loss": 0.0685, + "step": 8775 + }, + { + "epoch": 0.74, + "grad_norm": 0.4039476693451992, + "learning_rate": 1.6771251605170607e-06, + "loss": 0.1261, + "step": 8776 + }, + { + "epoch": 0.74, + "grad_norm": 0.2599833385056515, + "learning_rate": 1.6761056222758392e-06, + "loss": 0.0799, + "step": 8777 + }, + { + "epoch": 0.74, + "grad_norm": 0.17469350611586879, + "learning_rate": 1.675086331617632e-06, + "loss": 0.037, + "step": 8778 + }, + { + "epoch": 0.74, + "grad_norm": 0.48777290571931775, + "learning_rate": 1.6740672886183657e-06, + "loss": 0.1124, + "step": 8779 + }, + { + "epoch": 0.74, + "grad_norm": 0.3876369724582235, + "learning_rate": 1.673048493353942e-06, + "loss": 0.0799, + "step": 8780 + }, + { + "epoch": 0.74, + "grad_norm": 0.279589861323937, + "learning_rate": 1.6720299459002465e-06, + "loss": 0.078, + "step": 8781 + }, + { + "epoch": 0.74, + "grad_norm": 0.43792598401609445, + "learning_rate": 1.6710116463331494e-06, + "loss": 0.0721, + "step": 8782 + }, + { + "epoch": 0.74, + "grad_norm": 0.34440930256098673, + "learning_rate": 1.6699935947284968e-06, + "loss": 0.1047, + "step": 8783 + }, + { + "epoch": 0.74, + "grad_norm": 0.22879595501186198, + "learning_rate": 1.6689757911621208e-06, + "loss": 0.0484, + "step": 8784 + }, + { + "epoch": 0.74, + "grad_norm": 0.2653312986755267, + "learning_rate": 1.6679582357098305e-06, + "loss": 0.0772, + "step": 8785 + }, + { + "epoch": 0.74, + "grad_norm": 0.23865071920660855, + "learning_rate": 1.666940928447423e-06, + "loss": 0.0607, + "step": 8786 + }, + { + "epoch": 0.74, + "grad_norm": 0.25499019530608713, + "learning_rate": 1.6659238694506718e-06, + "loss": 0.1018, + "step": 8787 + }, + { + "epoch": 0.74, + "grad_norm": 0.442939863632077, + "learning_rate": 1.6649070587953315e-06, + "loss": 0.0846, + "step": 8788 + }, + { + "epoch": 0.74, + "grad_norm": 0.23052421375233845, + "learning_rate": 1.6638904965571434e-06, + "loss": 0.0408, + "step": 8789 + }, + { + "epoch": 0.74, + "grad_norm": 0.5262021594960135, + "learning_rate": 1.6628741828118255e-06, + "loss": 0.1312, + "step": 8790 + }, + { + "epoch": 0.74, + "grad_norm": 0.28748731992576276, + "learning_rate": 1.6618581176350783e-06, + "loss": 0.0891, + "step": 8791 + }, + { + "epoch": 0.74, + "grad_norm": 0.36642481646238256, + "learning_rate": 1.6608423011025838e-06, + "loss": 0.0844, + "step": 8792 + }, + { + "epoch": 0.74, + "grad_norm": 0.24318038904527697, + "learning_rate": 1.6598267332900076e-06, + "loss": 0.0679, + "step": 8793 + }, + { + "epoch": 0.74, + "grad_norm": 0.2906128558843935, + "learning_rate": 1.6588114142729945e-06, + "loss": 0.0711, + "step": 8794 + }, + { + "epoch": 0.74, + "grad_norm": 0.25204536125244803, + "learning_rate": 1.6577963441271694e-06, + "loss": 0.0751, + "step": 8795 + }, + { + "epoch": 0.74, + "grad_norm": 0.2940313094974745, + "learning_rate": 1.6567815229281442e-06, + "loss": 0.0827, + "step": 8796 + }, + { + "epoch": 0.74, + "grad_norm": 0.3862467664450845, + "learning_rate": 1.6557669507515068e-06, + "loss": 0.0838, + "step": 8797 + }, + { + "epoch": 0.74, + "grad_norm": 0.3543092068951903, + "learning_rate": 1.654752627672827e-06, + "loss": 0.1055, + "step": 8798 + }, + { + "epoch": 0.74, + "grad_norm": 0.3525550025002627, + "learning_rate": 1.6537385537676604e-06, + "loss": 0.1013, + "step": 8799 + }, + { + "epoch": 0.74, + "grad_norm": 0.22784324146758156, + "learning_rate": 1.65272472911154e-06, + "loss": 0.0465, + "step": 8800 + }, + { + "epoch": 0.74, + "grad_norm": 0.24767915408011168, + "learning_rate": 1.6517111537799807e-06, + "loss": 0.0902, + "step": 8801 + }, + { + "epoch": 0.74, + "grad_norm": 0.40542789502037624, + "learning_rate": 1.6506978278484786e-06, + "loss": 0.0858, + "step": 8802 + }, + { + "epoch": 0.74, + "grad_norm": 0.4057641943239592, + "learning_rate": 1.6496847513925146e-06, + "loss": 0.1013, + "step": 8803 + }, + { + "epoch": 0.74, + "grad_norm": 0.27192793059634984, + "learning_rate": 1.6486719244875477e-06, + "loss": 0.0946, + "step": 8804 + }, + { + "epoch": 0.74, + "grad_norm": 0.2540788002373435, + "learning_rate": 1.6476593472090174e-06, + "loss": 0.0618, + "step": 8805 + }, + { + "epoch": 0.74, + "grad_norm": 0.31236944112448795, + "learning_rate": 1.6466470196323487e-06, + "loss": 0.0732, + "step": 8806 + }, + { + "epoch": 0.74, + "grad_norm": 0.30085462504372423, + "learning_rate": 1.6456349418329453e-06, + "loss": 0.069, + "step": 8807 + }, + { + "epoch": 0.74, + "grad_norm": 0.3095148700451659, + "learning_rate": 1.6446231138861917e-06, + "loss": 0.0653, + "step": 8808 + }, + { + "epoch": 0.74, + "grad_norm": 0.3365481333236518, + "learning_rate": 1.6436115358674537e-06, + "loss": 0.0836, + "step": 8809 + }, + { + "epoch": 0.74, + "grad_norm": 0.24706560873289923, + "learning_rate": 1.6426002078520825e-06, + "loss": 0.0641, + "step": 8810 + }, + { + "epoch": 0.74, + "grad_norm": 0.2857532907945194, + "learning_rate": 1.6415891299154062e-06, + "loss": 0.0839, + "step": 8811 + }, + { + "epoch": 0.74, + "grad_norm": 0.39810543980973123, + "learning_rate": 1.6405783021327337e-06, + "loss": 0.102, + "step": 8812 + }, + { + "epoch": 0.74, + "grad_norm": 0.5595514057121227, + "learning_rate": 1.6395677245793612e-06, + "loss": 0.1063, + "step": 8813 + }, + { + "epoch": 0.74, + "grad_norm": 0.28743196867553805, + "learning_rate": 1.6385573973305608e-06, + "loss": 0.0652, + "step": 8814 + }, + { + "epoch": 0.74, + "grad_norm": 0.3249305774448179, + "learning_rate": 1.6375473204615871e-06, + "loss": 0.063, + "step": 8815 + }, + { + "epoch": 0.74, + "grad_norm": 0.1794434629666851, + "learning_rate": 1.6365374940476748e-06, + "loss": 0.0558, + "step": 8816 + }, + { + "epoch": 0.74, + "grad_norm": 0.24525414621512315, + "learning_rate": 1.6355279181640454e-06, + "loss": 0.0911, + "step": 8817 + }, + { + "epoch": 0.74, + "grad_norm": 0.39460153155299793, + "learning_rate": 1.6345185928858964e-06, + "loss": 0.0539, + "step": 8818 + }, + { + "epoch": 0.74, + "grad_norm": 0.5604901117794143, + "learning_rate": 1.6335095182884074e-06, + "loss": 0.0836, + "step": 8819 + }, + { + "epoch": 0.74, + "grad_norm": 0.25226085137623727, + "learning_rate": 1.6325006944467415e-06, + "loss": 0.0809, + "step": 8820 + }, + { + "epoch": 0.74, + "grad_norm": 0.27183695482667647, + "learning_rate": 1.6314921214360385e-06, + "loss": 0.0806, + "step": 8821 + }, + { + "epoch": 0.74, + "grad_norm": 0.2262437959587454, + "learning_rate": 1.6304837993314276e-06, + "loss": 0.0564, + "step": 8822 + }, + { + "epoch": 0.74, + "grad_norm": 0.29919211156628095, + "learning_rate": 1.629475728208012e-06, + "loss": 0.0599, + "step": 8823 + }, + { + "epoch": 0.74, + "grad_norm": 0.19684777812558601, + "learning_rate": 1.628467908140879e-06, + "loss": 0.065, + "step": 8824 + }, + { + "epoch": 0.74, + "grad_norm": 0.28108651643597476, + "learning_rate": 1.6274603392050954e-06, + "loss": 0.0902, + "step": 8825 + }, + { + "epoch": 0.74, + "grad_norm": 0.33640273344399413, + "learning_rate": 1.6264530214757136e-06, + "loss": 0.0626, + "step": 8826 + }, + { + "epoch": 0.74, + "grad_norm": 0.5746747114343644, + "learning_rate": 1.6254459550277629e-06, + "loss": 0.1159, + "step": 8827 + }, + { + "epoch": 0.74, + "grad_norm": 0.29607922246677104, + "learning_rate": 1.6244391399362559e-06, + "loss": 0.0701, + "step": 8828 + }, + { + "epoch": 0.74, + "grad_norm": 0.28716385949240214, + "learning_rate": 1.6234325762761844e-06, + "loss": 0.0716, + "step": 8829 + }, + { + "epoch": 0.74, + "grad_norm": 0.2831341975382209, + "learning_rate": 1.6224262641225258e-06, + "loss": 0.0761, + "step": 8830 + }, + { + "epoch": 0.74, + "grad_norm": 0.2524898251032297, + "learning_rate": 1.6214202035502353e-06, + "loss": 0.0799, + "step": 8831 + }, + { + "epoch": 0.74, + "grad_norm": 0.2459502549435526, + "learning_rate": 1.6204143946342476e-06, + "loss": 0.0502, + "step": 8832 + }, + { + "epoch": 0.74, + "grad_norm": 0.3257782433241818, + "learning_rate": 1.6194088374494853e-06, + "loss": 0.0778, + "step": 8833 + }, + { + "epoch": 0.74, + "grad_norm": 0.43897238548525785, + "learning_rate": 1.6184035320708464e-06, + "loss": 0.1223, + "step": 8834 + }, + { + "epoch": 0.74, + "grad_norm": 0.19824750425729723, + "learning_rate": 1.617398478573211e-06, + "loss": 0.0569, + "step": 8835 + }, + { + "epoch": 0.74, + "grad_norm": 0.3836055503811429, + "learning_rate": 1.616393677031441e-06, + "loss": 0.0952, + "step": 8836 + }, + { + "epoch": 0.74, + "grad_norm": 0.19337413493663652, + "learning_rate": 1.6153891275203826e-06, + "loss": 0.0409, + "step": 8837 + }, + { + "epoch": 0.74, + "grad_norm": 0.2828506589726233, + "learning_rate": 1.6143848301148585e-06, + "loss": 0.0672, + "step": 8838 + }, + { + "epoch": 0.74, + "grad_norm": 0.218640198905327, + "learning_rate": 1.6133807848896733e-06, + "loss": 0.0768, + "step": 8839 + }, + { + "epoch": 0.74, + "grad_norm": 0.4332412141416325, + "learning_rate": 1.6123769919196174e-06, + "loss": 0.1091, + "step": 8840 + }, + { + "epoch": 0.74, + "grad_norm": 0.34063336551874746, + "learning_rate": 1.6113734512794576e-06, + "loss": 0.0955, + "step": 8841 + }, + { + "epoch": 0.75, + "grad_norm": 0.21132869066053817, + "learning_rate": 1.6103701630439434e-06, + "loss": 0.0569, + "step": 8842 + }, + { + "epoch": 0.75, + "grad_norm": 0.2833890841688146, + "learning_rate": 1.6093671272878037e-06, + "loss": 0.062, + "step": 8843 + }, + { + "epoch": 0.75, + "grad_norm": 0.2447811069010782, + "learning_rate": 1.6083643440857538e-06, + "loss": 0.0756, + "step": 8844 + }, + { + "epoch": 0.75, + "grad_norm": 0.3565920962407, + "learning_rate": 1.6073618135124858e-06, + "loss": 0.0756, + "step": 8845 + }, + { + "epoch": 0.75, + "grad_norm": 0.2698247726912429, + "learning_rate": 1.6063595356426715e-06, + "loss": 0.0676, + "step": 8846 + }, + { + "epoch": 0.75, + "grad_norm": 0.3307486771866237, + "learning_rate": 1.60535751055097e-06, + "loss": 0.1003, + "step": 8847 + }, + { + "epoch": 0.75, + "grad_norm": 0.1972372387957145, + "learning_rate": 1.6043557383120162e-06, + "loss": 0.0607, + "step": 8848 + }, + { + "epoch": 0.75, + "grad_norm": 0.303169237828924, + "learning_rate": 1.603354219000427e-06, + "loss": 0.0916, + "step": 8849 + }, + { + "epoch": 0.75, + "grad_norm": 0.26847628526120426, + "learning_rate": 1.6023529526908038e-06, + "loss": 0.0759, + "step": 8850 + }, + { + "epoch": 0.75, + "grad_norm": 0.24337545445596412, + "learning_rate": 1.6013519394577255e-06, + "loss": 0.0705, + "step": 8851 + }, + { + "epoch": 0.75, + "grad_norm": 0.21101594700922224, + "learning_rate": 1.6003511793757531e-06, + "loss": 0.0454, + "step": 8852 + }, + { + "epoch": 0.75, + "grad_norm": 0.38200480660497776, + "learning_rate": 1.599350672519428e-06, + "loss": 0.0769, + "step": 8853 + }, + { + "epoch": 0.75, + "grad_norm": 0.24251602895997362, + "learning_rate": 1.5983504189632765e-06, + "loss": 0.0639, + "step": 8854 + }, + { + "epoch": 0.75, + "grad_norm": 0.3762614575062755, + "learning_rate": 1.5973504187818017e-06, + "loss": 0.1168, + "step": 8855 + }, + { + "epoch": 0.75, + "grad_norm": 0.37452503390720765, + "learning_rate": 1.5963506720494882e-06, + "loss": 0.1018, + "step": 8856 + }, + { + "epoch": 0.75, + "grad_norm": 0.3489453989048087, + "learning_rate": 1.5953511788408061e-06, + "loss": 0.0945, + "step": 8857 + }, + { + "epoch": 0.75, + "grad_norm": 0.4270716464059508, + "learning_rate": 1.5943519392302015e-06, + "loss": 0.0894, + "step": 8858 + }, + { + "epoch": 0.75, + "grad_norm": 0.28228543702750797, + "learning_rate": 1.5933529532921037e-06, + "loss": 0.081, + "step": 8859 + }, + { + "epoch": 0.75, + "grad_norm": 0.546144731510037, + "learning_rate": 1.5923542211009223e-06, + "loss": 0.1082, + "step": 8860 + }, + { + "epoch": 0.75, + "grad_norm": 0.28662677108352147, + "learning_rate": 1.5913557427310505e-06, + "loss": 0.0993, + "step": 8861 + }, + { + "epoch": 0.75, + "grad_norm": 0.41688201531795727, + "learning_rate": 1.5903575182568599e-06, + "loss": 0.092, + "step": 8862 + }, + { + "epoch": 0.75, + "grad_norm": 0.4051923595311536, + "learning_rate": 1.5893595477527024e-06, + "loss": 0.099, + "step": 8863 + }, + { + "epoch": 0.75, + "grad_norm": 0.500516793418049, + "learning_rate": 1.5883618312929155e-06, + "loss": 0.0774, + "step": 8864 + }, + { + "epoch": 0.75, + "grad_norm": 0.49813121010719147, + "learning_rate": 1.5873643689518143e-06, + "loss": 0.0781, + "step": 8865 + }, + { + "epoch": 0.75, + "grad_norm": 0.40327672688718613, + "learning_rate": 1.5863671608036929e-06, + "loss": 0.0891, + "step": 8866 + }, + { + "epoch": 0.75, + "grad_norm": 0.23448886084922121, + "learning_rate": 1.5853702069228333e-06, + "loss": 0.0806, + "step": 8867 + }, + { + "epoch": 0.75, + "grad_norm": 0.43689397347979536, + "learning_rate": 1.584373507383492e-06, + "loss": 0.112, + "step": 8868 + }, + { + "epoch": 0.75, + "grad_norm": 0.2610799586128663, + "learning_rate": 1.5833770622599093e-06, + "loss": 0.0544, + "step": 8869 + }, + { + "epoch": 0.75, + "grad_norm": 0.33747247474257175, + "learning_rate": 1.582380871626305e-06, + "loss": 0.0794, + "step": 8870 + }, + { + "epoch": 0.75, + "grad_norm": 0.2989105804839214, + "learning_rate": 1.5813849355568838e-06, + "loss": 0.069, + "step": 8871 + }, + { + "epoch": 0.75, + "grad_norm": 0.2656174964752859, + "learning_rate": 1.5803892541258275e-06, + "loss": 0.084, + "step": 8872 + }, + { + "epoch": 0.75, + "grad_norm": 0.3253857189320341, + "learning_rate": 1.579393827407299e-06, + "loss": 0.1227, + "step": 8873 + }, + { + "epoch": 0.75, + "grad_norm": 0.7008567579998054, + "learning_rate": 1.578398655475446e-06, + "loss": 0.1095, + "step": 8874 + }, + { + "epoch": 0.75, + "grad_norm": 0.3727563355250439, + "learning_rate": 1.5774037384043938e-06, + "loss": 0.0696, + "step": 8875 + }, + { + "epoch": 0.75, + "grad_norm": 0.6964088532215039, + "learning_rate": 1.5764090762682487e-06, + "loss": 0.1012, + "step": 8876 + }, + { + "epoch": 0.75, + "grad_norm": 0.248231884366039, + "learning_rate": 1.5754146691410983e-06, + "loss": 0.0807, + "step": 8877 + }, + { + "epoch": 0.75, + "grad_norm": 0.33429576489806867, + "learning_rate": 1.5744205170970144e-06, + "loss": 0.0855, + "step": 8878 + }, + { + "epoch": 0.75, + "grad_norm": 0.34994428128221283, + "learning_rate": 1.5734266202100462e-06, + "loss": 0.0833, + "step": 8879 + }, + { + "epoch": 0.75, + "grad_norm": 0.3930967493132058, + "learning_rate": 1.5724329785542231e-06, + "loss": 0.1044, + "step": 8880 + }, + { + "epoch": 0.75, + "grad_norm": 0.23409488069020024, + "learning_rate": 1.5714395922035597e-06, + "loss": 0.0782, + "step": 8881 + }, + { + "epoch": 0.75, + "grad_norm": 0.33718353637663323, + "learning_rate": 1.5704464612320492e-06, + "loss": 0.069, + "step": 8882 + }, + { + "epoch": 0.75, + "grad_norm": 0.39258527549023753, + "learning_rate": 1.5694535857136638e-06, + "loss": 0.0951, + "step": 8883 + }, + { + "epoch": 0.75, + "grad_norm": 0.3074733950824599, + "learning_rate": 1.5684609657223593e-06, + "loss": 0.0741, + "step": 8884 + }, + { + "epoch": 0.75, + "grad_norm": 0.6471691362871344, + "learning_rate": 1.5674686013320733e-06, + "loss": 0.1396, + "step": 8885 + }, + { + "epoch": 0.75, + "grad_norm": 0.41914622854438677, + "learning_rate": 1.566476492616722e-06, + "loss": 0.1023, + "step": 8886 + }, + { + "epoch": 0.75, + "grad_norm": 0.31994196846466866, + "learning_rate": 1.5654846396502028e-06, + "loss": 0.081, + "step": 8887 + }, + { + "epoch": 0.75, + "grad_norm": 0.39626803824151124, + "learning_rate": 1.5644930425063959e-06, + "loss": 0.0826, + "step": 8888 + }, + { + "epoch": 0.75, + "grad_norm": 0.23452047918847987, + "learning_rate": 1.5635017012591585e-06, + "loss": 0.0553, + "step": 8889 + }, + { + "epoch": 0.75, + "grad_norm": 0.4192790173107763, + "learning_rate": 1.562510615982335e-06, + "loss": 0.1181, + "step": 8890 + }, + { + "epoch": 0.75, + "grad_norm": 0.19699957994707024, + "learning_rate": 1.5615197867497455e-06, + "loss": 0.0372, + "step": 8891 + }, + { + "epoch": 0.75, + "grad_norm": 0.25547081768775104, + "learning_rate": 1.5605292136351935e-06, + "loss": 0.0779, + "step": 8892 + }, + { + "epoch": 0.75, + "grad_norm": 0.336014373188817, + "learning_rate": 1.5595388967124603e-06, + "loss": 0.06, + "step": 8893 + }, + { + "epoch": 0.75, + "grad_norm": 0.7111542374022048, + "learning_rate": 1.5585488360553137e-06, + "loss": 0.1225, + "step": 8894 + }, + { + "epoch": 0.75, + "grad_norm": 0.315145443973822, + "learning_rate": 1.5575590317374979e-06, + "loss": 0.0829, + "step": 8895 + }, + { + "epoch": 0.75, + "grad_norm": 0.30793845527860425, + "learning_rate": 1.5565694838327388e-06, + "loss": 0.0936, + "step": 8896 + }, + { + "epoch": 0.75, + "grad_norm": 0.31420607087977354, + "learning_rate": 1.5555801924147424e-06, + "loss": 0.0867, + "step": 8897 + }, + { + "epoch": 0.75, + "grad_norm": 0.3301332980904552, + "learning_rate": 1.5545911575572004e-06, + "loss": 0.1192, + "step": 8898 + }, + { + "epoch": 0.75, + "grad_norm": 0.41096934109322686, + "learning_rate": 1.5536023793337796e-06, + "loss": 0.0771, + "step": 8899 + }, + { + "epoch": 0.75, + "grad_norm": 0.4980888735141741, + "learning_rate": 1.5526138578181304e-06, + "loss": 0.1049, + "step": 8900 + }, + { + "epoch": 0.75, + "grad_norm": 0.3430972377184634, + "learning_rate": 1.5516255930838824e-06, + "loss": 0.086, + "step": 8901 + }, + { + "epoch": 0.75, + "grad_norm": 0.24458587448416488, + "learning_rate": 1.5506375852046496e-06, + "loss": 0.0724, + "step": 8902 + }, + { + "epoch": 0.75, + "grad_norm": 0.24939762100495907, + "learning_rate": 1.549649834254024e-06, + "loss": 0.0572, + "step": 8903 + }, + { + "epoch": 0.75, + "grad_norm": 0.3798712461011829, + "learning_rate": 1.5486623403055767e-06, + "loss": 0.073, + "step": 8904 + }, + { + "epoch": 0.75, + "grad_norm": 0.2501315416702115, + "learning_rate": 1.5476751034328652e-06, + "loss": 0.068, + "step": 8905 + }, + { + "epoch": 0.75, + "grad_norm": 0.24825002836392268, + "learning_rate": 1.5466881237094233e-06, + "loss": 0.0749, + "step": 8906 + }, + { + "epoch": 0.75, + "grad_norm": 0.2590045369833802, + "learning_rate": 1.5457014012087656e-06, + "loss": 0.0584, + "step": 8907 + }, + { + "epoch": 0.75, + "grad_norm": 0.29426838902260505, + "learning_rate": 1.5447149360043916e-06, + "loss": 0.0493, + "step": 8908 + }, + { + "epoch": 0.75, + "grad_norm": 0.43912112515181195, + "learning_rate": 1.5437287281697782e-06, + "loss": 0.1079, + "step": 8909 + }, + { + "epoch": 0.75, + "grad_norm": 0.4188582599245124, + "learning_rate": 1.5427427777783832e-06, + "loss": 0.1125, + "step": 8910 + }, + { + "epoch": 0.75, + "grad_norm": 0.36847501602279475, + "learning_rate": 1.5417570849036446e-06, + "loss": 0.0842, + "step": 8911 + }, + { + "epoch": 0.75, + "grad_norm": 0.26760948973492155, + "learning_rate": 1.540771649618985e-06, + "loss": 0.0692, + "step": 8912 + }, + { + "epoch": 0.75, + "grad_norm": 0.36870775380330195, + "learning_rate": 1.539786471997805e-06, + "loss": 0.0992, + "step": 8913 + }, + { + "epoch": 0.75, + "grad_norm": 0.4425000435844594, + "learning_rate": 1.5388015521134847e-06, + "loss": 0.0845, + "step": 8914 + }, + { + "epoch": 0.75, + "grad_norm": 0.23521823695791527, + "learning_rate": 1.5378168900393887e-06, + "loss": 0.0705, + "step": 8915 + }, + { + "epoch": 0.75, + "grad_norm": 0.3792702305667949, + "learning_rate": 1.5368324858488592e-06, + "loss": 0.0898, + "step": 8916 + }, + { + "epoch": 0.75, + "grad_norm": 0.18818566710247914, + "learning_rate": 1.5358483396152213e-06, + "loss": 0.0509, + "step": 8917 + }, + { + "epoch": 0.75, + "grad_norm": 0.23797397866573497, + "learning_rate": 1.5348644514117773e-06, + "loss": 0.0684, + "step": 8918 + }, + { + "epoch": 0.75, + "grad_norm": 0.33105986330207154, + "learning_rate": 1.5338808213118162e-06, + "loss": 0.078, + "step": 8919 + }, + { + "epoch": 0.75, + "grad_norm": 0.3053456167927793, + "learning_rate": 1.5328974493886034e-06, + "loss": 0.0818, + "step": 8920 + }, + { + "epoch": 0.75, + "grad_norm": 0.3360753563199587, + "learning_rate": 1.5319143357153842e-06, + "loss": 0.0979, + "step": 8921 + }, + { + "epoch": 0.75, + "grad_norm": 0.22995699377649478, + "learning_rate": 1.5309314803653902e-06, + "loss": 0.0709, + "step": 8922 + }, + { + "epoch": 0.75, + "grad_norm": 0.302926561231055, + "learning_rate": 1.5299488834118286e-06, + "loss": 0.0796, + "step": 8923 + }, + { + "epoch": 0.75, + "grad_norm": 0.32656444825675474, + "learning_rate": 1.5289665449278868e-06, + "loss": 0.0717, + "step": 8924 + }, + { + "epoch": 0.75, + "grad_norm": 0.3240604299529103, + "learning_rate": 1.5279844649867381e-06, + "loss": 0.094, + "step": 8925 + }, + { + "epoch": 0.75, + "grad_norm": 0.31300044832674584, + "learning_rate": 1.5270026436615333e-06, + "loss": 0.0465, + "step": 8926 + }, + { + "epoch": 0.75, + "grad_norm": 0.27855691232989227, + "learning_rate": 1.5260210810254027e-06, + "loss": 0.0674, + "step": 8927 + }, + { + "epoch": 0.75, + "grad_norm": 0.33436809153151714, + "learning_rate": 1.525039777151458e-06, + "loss": 0.083, + "step": 8928 + }, + { + "epoch": 0.75, + "grad_norm": 0.36502621963203413, + "learning_rate": 1.524058732112796e-06, + "loss": 0.1081, + "step": 8929 + }, + { + "epoch": 0.75, + "grad_norm": 0.25531291528520716, + "learning_rate": 1.5230779459824873e-06, + "loss": 0.0639, + "step": 8930 + }, + { + "epoch": 0.75, + "grad_norm": 0.30426986069774775, + "learning_rate": 1.5220974188335869e-06, + "loss": 0.0613, + "step": 8931 + }, + { + "epoch": 0.75, + "grad_norm": 0.37713613168457427, + "learning_rate": 1.5211171507391325e-06, + "loss": 0.0714, + "step": 8932 + }, + { + "epoch": 0.75, + "grad_norm": 0.3177241694915858, + "learning_rate": 1.520137141772139e-06, + "loss": 0.0925, + "step": 8933 + }, + { + "epoch": 0.75, + "grad_norm": 0.30137627234537917, + "learning_rate": 1.5191573920056025e-06, + "loss": 0.0544, + "step": 8934 + }, + { + "epoch": 0.75, + "grad_norm": 0.3253107905617309, + "learning_rate": 1.5181779015124993e-06, + "loss": 0.1209, + "step": 8935 + }, + { + "epoch": 0.75, + "grad_norm": 0.2399610077682187, + "learning_rate": 1.5171986703657909e-06, + "loss": 0.076, + "step": 8936 + }, + { + "epoch": 0.75, + "grad_norm": 0.29479676246693537, + "learning_rate": 1.5162196986384143e-06, + "loss": 0.0469, + "step": 8937 + }, + { + "epoch": 0.75, + "grad_norm": 0.4098417012827499, + "learning_rate": 1.5152409864032874e-06, + "loss": 0.1013, + "step": 8938 + }, + { + "epoch": 0.75, + "grad_norm": 0.2281331913886879, + "learning_rate": 1.5142625337333139e-06, + "loss": 0.0481, + "step": 8939 + }, + { + "epoch": 0.75, + "grad_norm": 0.37941146186776964, + "learning_rate": 1.5132843407013726e-06, + "loss": 0.0812, + "step": 8940 + }, + { + "epoch": 0.75, + "grad_norm": 0.2364200786859717, + "learning_rate": 1.5123064073803235e-06, + "loss": 0.0661, + "step": 8941 + }, + { + "epoch": 0.75, + "grad_norm": 0.22248878846635126, + "learning_rate": 1.511328733843012e-06, + "loss": 0.0555, + "step": 8942 + }, + { + "epoch": 0.75, + "grad_norm": 0.35169934283765586, + "learning_rate": 1.5103513201622593e-06, + "loss": 0.0836, + "step": 8943 + }, + { + "epoch": 0.75, + "grad_norm": 0.35351434787551783, + "learning_rate": 1.5093741664108696e-06, + "loss": 0.0812, + "step": 8944 + }, + { + "epoch": 0.75, + "grad_norm": 0.2599288693323992, + "learning_rate": 1.508397272661624e-06, + "loss": 0.0705, + "step": 8945 + }, + { + "epoch": 0.75, + "grad_norm": 0.4714786781672482, + "learning_rate": 1.507420638987292e-06, + "loss": 0.0788, + "step": 8946 + }, + { + "epoch": 0.75, + "grad_norm": 0.20741640414773405, + "learning_rate": 1.5064442654606166e-06, + "loss": 0.0671, + "step": 8947 + }, + { + "epoch": 0.75, + "grad_norm": 0.3253731635206642, + "learning_rate": 1.505468152154322e-06, + "loss": 0.1075, + "step": 8948 + }, + { + "epoch": 0.75, + "grad_norm": 0.423574081166215, + "learning_rate": 1.5044922991411177e-06, + "loss": 0.1274, + "step": 8949 + }, + { + "epoch": 0.75, + "grad_norm": 0.32180403940704294, + "learning_rate": 1.50351670649369e-06, + "loss": 0.0839, + "step": 8950 + }, + { + "epoch": 0.75, + "grad_norm": 0.3011156844593209, + "learning_rate": 1.5025413742847068e-06, + "loss": 0.0726, + "step": 8951 + }, + { + "epoch": 0.75, + "grad_norm": 0.20731735088658268, + "learning_rate": 1.5015663025868144e-06, + "loss": 0.07, + "step": 8952 + }, + { + "epoch": 0.75, + "grad_norm": 0.2707908493862437, + "learning_rate": 1.5005914914726455e-06, + "loss": 0.0444, + "step": 8953 + }, + { + "epoch": 0.75, + "grad_norm": 0.37517716786688143, + "learning_rate": 1.4996169410148076e-06, + "loss": 0.0811, + "step": 8954 + }, + { + "epoch": 0.75, + "grad_norm": 0.4888746616625594, + "learning_rate": 1.4986426512858916e-06, + "loss": 0.0728, + "step": 8955 + }, + { + "epoch": 0.75, + "grad_norm": 0.3775610464651205, + "learning_rate": 1.4976686223584675e-06, + "loss": 0.0593, + "step": 8956 + }, + { + "epoch": 0.75, + "grad_norm": 0.35699133759534624, + "learning_rate": 1.4966948543050851e-06, + "loss": 0.0868, + "step": 8957 + }, + { + "epoch": 0.75, + "grad_norm": 0.3382146069450258, + "learning_rate": 1.49572134719828e-06, + "loss": 0.0969, + "step": 8958 + }, + { + "epoch": 0.75, + "grad_norm": 0.37089649232226474, + "learning_rate": 1.4947481011105636e-06, + "loss": 0.0949, + "step": 8959 + }, + { + "epoch": 0.75, + "grad_norm": 0.29673773801225556, + "learning_rate": 1.4937751161144275e-06, + "loss": 0.0717, + "step": 8960 + }, + { + "epoch": 0.76, + "grad_norm": 0.22574776702747865, + "learning_rate": 1.4928023922823442e-06, + "loss": 0.0627, + "step": 8961 + }, + { + "epoch": 0.76, + "grad_norm": 0.2897026898360463, + "learning_rate": 1.4918299296867723e-06, + "loss": 0.0842, + "step": 8962 + }, + { + "epoch": 0.76, + "grad_norm": 0.29366634619067306, + "learning_rate": 1.490857728400143e-06, + "loss": 0.0813, + "step": 8963 + }, + { + "epoch": 0.76, + "grad_norm": 0.3087150040995132, + "learning_rate": 1.4898857884948725e-06, + "loss": 0.0841, + "step": 8964 + }, + { + "epoch": 0.76, + "grad_norm": 0.4590410761728625, + "learning_rate": 1.4889141100433545e-06, + "loss": 0.0977, + "step": 8965 + }, + { + "epoch": 0.76, + "grad_norm": 0.21076252658550823, + "learning_rate": 1.4879426931179696e-06, + "loss": 0.0431, + "step": 8966 + }, + { + "epoch": 0.76, + "grad_norm": 0.3564162063175528, + "learning_rate": 1.4869715377910715e-06, + "loss": 0.0781, + "step": 8967 + }, + { + "epoch": 0.76, + "grad_norm": 0.1922290865265554, + "learning_rate": 1.4860006441349983e-06, + "loss": 0.0213, + "step": 8968 + }, + { + "epoch": 0.76, + "grad_norm": 0.2736661817969576, + "learning_rate": 1.4850300122220662e-06, + "loss": 0.0737, + "step": 8969 + }, + { + "epoch": 0.76, + "grad_norm": 0.312284192143055, + "learning_rate": 1.4840596421245768e-06, + "loss": 0.0782, + "step": 8970 + }, + { + "epoch": 0.76, + "grad_norm": 0.6388991512648625, + "learning_rate": 1.4830895339148067e-06, + "loss": 0.1367, + "step": 8971 + }, + { + "epoch": 0.76, + "grad_norm": 0.3002977774249973, + "learning_rate": 1.482119687665014e-06, + "loss": 0.0746, + "step": 8972 + }, + { + "epoch": 0.76, + "grad_norm": 0.2312226188086535, + "learning_rate": 1.4811501034474418e-06, + "loss": 0.0841, + "step": 8973 + }, + { + "epoch": 0.76, + "grad_norm": 0.276817738945182, + "learning_rate": 1.480180781334309e-06, + "loss": 0.0958, + "step": 8974 + }, + { + "epoch": 0.76, + "grad_norm": 0.24069314005872958, + "learning_rate": 1.4792117213978148e-06, + "loss": 0.0671, + "step": 8975 + }, + { + "epoch": 0.76, + "grad_norm": 0.4794788338636877, + "learning_rate": 1.4782429237101425e-06, + "loss": 0.0642, + "step": 8976 + }, + { + "epoch": 0.76, + "grad_norm": 0.23906200921909176, + "learning_rate": 1.477274388343453e-06, + "loss": 0.0707, + "step": 8977 + }, + { + "epoch": 0.76, + "grad_norm": 0.369460659514181, + "learning_rate": 1.4763061153698887e-06, + "loss": 0.1115, + "step": 8978 + }, + { + "epoch": 0.76, + "grad_norm": 0.24985494002295278, + "learning_rate": 1.4753381048615706e-06, + "loss": 0.057, + "step": 8979 + }, + { + "epoch": 0.76, + "grad_norm": 0.267243101255548, + "learning_rate": 1.4743703568906048e-06, + "loss": 0.0697, + "step": 8980 + }, + { + "epoch": 0.76, + "grad_norm": 0.7098557760666024, + "learning_rate": 1.4734028715290728e-06, + "loss": 0.1321, + "step": 8981 + }, + { + "epoch": 0.76, + "grad_norm": 0.22441670507516367, + "learning_rate": 1.472435648849037e-06, + "loss": 0.046, + "step": 8982 + }, + { + "epoch": 0.76, + "grad_norm": 0.4174382399838925, + "learning_rate": 1.471468688922546e-06, + "loss": 0.1098, + "step": 8983 + }, + { + "epoch": 0.76, + "grad_norm": 0.19723608682021473, + "learning_rate": 1.4705019918216218e-06, + "loss": 0.0593, + "step": 8984 + }, + { + "epoch": 0.76, + "grad_norm": 0.25685606449167836, + "learning_rate": 1.46953555761827e-06, + "loss": 0.0816, + "step": 8985 + }, + { + "epoch": 0.76, + "grad_norm": 0.4907315977765806, + "learning_rate": 1.4685693863844752e-06, + "loss": 0.1247, + "step": 8986 + }, + { + "epoch": 0.76, + "grad_norm": 0.4285027523258741, + "learning_rate": 1.4676034781922056e-06, + "loss": 0.0966, + "step": 8987 + }, + { + "epoch": 0.76, + "grad_norm": 0.20539011021613662, + "learning_rate": 1.4666378331134073e-06, + "loss": 0.0516, + "step": 8988 + }, + { + "epoch": 0.76, + "grad_norm": 0.2625599215059745, + "learning_rate": 1.4656724512200044e-06, + "loss": 0.0754, + "step": 8989 + }, + { + "epoch": 0.76, + "grad_norm": 0.24539203258290104, + "learning_rate": 1.4647073325839084e-06, + "loss": 0.05, + "step": 8990 + }, + { + "epoch": 0.76, + "grad_norm": 0.28239754359414915, + "learning_rate": 1.4637424772770042e-06, + "loss": 0.0867, + "step": 8991 + }, + { + "epoch": 0.76, + "grad_norm": 0.3037797468278806, + "learning_rate": 1.4627778853711594e-06, + "loss": 0.053, + "step": 8992 + }, + { + "epoch": 0.76, + "grad_norm": 0.40775203700654555, + "learning_rate": 1.4618135569382253e-06, + "loss": 0.0754, + "step": 8993 + }, + { + "epoch": 0.76, + "grad_norm": 0.27011957574466455, + "learning_rate": 1.4608494920500287e-06, + "loss": 0.0676, + "step": 8994 + }, + { + "epoch": 0.76, + "grad_norm": 0.5092248284917518, + "learning_rate": 1.4598856907783792e-06, + "loss": 0.0935, + "step": 8995 + }, + { + "epoch": 0.76, + "grad_norm": 0.3305601578414505, + "learning_rate": 1.4589221531950642e-06, + "loss": 0.0914, + "step": 8996 + }, + { + "epoch": 0.76, + "grad_norm": 0.22118126080798028, + "learning_rate": 1.4579588793718575e-06, + "loss": 0.0637, + "step": 8997 + }, + { + "epoch": 0.76, + "grad_norm": 0.32247275133620557, + "learning_rate": 1.456995869380507e-06, + "loss": 0.074, + "step": 8998 + }, + { + "epoch": 0.76, + "grad_norm": 0.5017056623997849, + "learning_rate": 1.456033123292742e-06, + "loss": 0.1036, + "step": 8999 + }, + { + "epoch": 0.76, + "grad_norm": 0.33382550794309285, + "learning_rate": 1.4550706411802774e-06, + "loss": 0.0924, + "step": 9000 + }, + { + "epoch": 0.76, + "grad_norm": 0.6789040251250273, + "learning_rate": 1.4541084231148017e-06, + "loss": 0.1309, + "step": 9001 + }, + { + "epoch": 0.76, + "grad_norm": 0.2665076247915018, + "learning_rate": 1.453146469167987e-06, + "loss": 0.0922, + "step": 9002 + }, + { + "epoch": 0.76, + "grad_norm": 0.23213898008908482, + "learning_rate": 1.4521847794114835e-06, + "loss": 0.0576, + "step": 9003 + }, + { + "epoch": 0.76, + "grad_norm": 0.2248932638434717, + "learning_rate": 1.4512233539169268e-06, + "loss": 0.0438, + "step": 9004 + }, + { + "epoch": 0.76, + "grad_norm": 0.2835690276813325, + "learning_rate": 1.450262192755928e-06, + "loss": 0.0771, + "step": 9005 + }, + { + "epoch": 0.76, + "grad_norm": 0.29322717058529885, + "learning_rate": 1.4493012960000784e-06, + "loss": 0.0441, + "step": 9006 + }, + { + "epoch": 0.76, + "grad_norm": 0.4052261028393808, + "learning_rate": 1.448340663720954e-06, + "loss": 0.1048, + "step": 9007 + }, + { + "epoch": 0.76, + "grad_norm": 0.2770794207009056, + "learning_rate": 1.4473802959901069e-06, + "loss": 0.1124, + "step": 9008 + }, + { + "epoch": 0.76, + "grad_norm": 0.4429745291276455, + "learning_rate": 1.4464201928790694e-06, + "loss": 0.0862, + "step": 9009 + }, + { + "epoch": 0.76, + "grad_norm": 0.22072899415649624, + "learning_rate": 1.4454603544593588e-06, + "loss": 0.0634, + "step": 9010 + }, + { + "epoch": 0.76, + "grad_norm": 0.2267164504348625, + "learning_rate": 1.4445007808024675e-06, + "loss": 0.0928, + "step": 9011 + }, + { + "epoch": 0.76, + "grad_norm": 0.2110049235768872, + "learning_rate": 1.4435414719798702e-06, + "loss": 0.0523, + "step": 9012 + }, + { + "epoch": 0.76, + "grad_norm": 0.27275130621687793, + "learning_rate": 1.4425824280630207e-06, + "loss": 0.0884, + "step": 9013 + }, + { + "epoch": 0.76, + "grad_norm": 0.30593084470245585, + "learning_rate": 1.441623649123357e-06, + "loss": 0.0692, + "step": 9014 + }, + { + "epoch": 0.76, + "grad_norm": 0.27342559207893546, + "learning_rate": 1.4406651352322932e-06, + "loss": 0.1003, + "step": 9015 + }, + { + "epoch": 0.76, + "grad_norm": 0.18338588799638847, + "learning_rate": 1.4397068864612223e-06, + "loss": 0.0336, + "step": 9016 + }, + { + "epoch": 0.76, + "grad_norm": 0.5076698254301681, + "learning_rate": 1.4387489028815254e-06, + "loss": 0.1263, + "step": 9017 + }, + { + "epoch": 0.76, + "grad_norm": 0.3031893217642408, + "learning_rate": 1.4377911845645553e-06, + "loss": 0.0581, + "step": 9018 + }, + { + "epoch": 0.76, + "grad_norm": 0.44022376570033506, + "learning_rate": 1.4368337315816495e-06, + "loss": 0.1223, + "step": 9019 + }, + { + "epoch": 0.76, + "grad_norm": 0.41652171225266876, + "learning_rate": 1.4358765440041229e-06, + "loss": 0.1048, + "step": 9020 + }, + { + "epoch": 0.76, + "grad_norm": 0.2664166462358083, + "learning_rate": 1.4349196219032752e-06, + "loss": 0.0628, + "step": 9021 + }, + { + "epoch": 0.76, + "grad_norm": 0.375706908063255, + "learning_rate": 1.4339629653503823e-06, + "loss": 0.1013, + "step": 9022 + }, + { + "epoch": 0.76, + "grad_norm": 0.3649409874473625, + "learning_rate": 1.433006574416701e-06, + "loss": 0.0666, + "step": 9023 + }, + { + "epoch": 0.76, + "grad_norm": 0.303482617666949, + "learning_rate": 1.43205044917347e-06, + "loss": 0.0727, + "step": 9024 + }, + { + "epoch": 0.76, + "grad_norm": 0.3931415657806582, + "learning_rate": 1.4310945896919043e-06, + "loss": 0.0993, + "step": 9025 + }, + { + "epoch": 0.76, + "grad_norm": 0.37629458331307347, + "learning_rate": 1.430138996043206e-06, + "loss": 0.1054, + "step": 9026 + }, + { + "epoch": 0.76, + "grad_norm": 0.27913000940133625, + "learning_rate": 1.429183668298551e-06, + "loss": 0.1055, + "step": 9027 + }, + { + "epoch": 0.76, + "grad_norm": 0.3211784614554781, + "learning_rate": 1.4282286065290978e-06, + "loss": 0.1261, + "step": 9028 + }, + { + "epoch": 0.76, + "grad_norm": 0.49791522267033933, + "learning_rate": 1.4272738108059836e-06, + "loss": 0.0914, + "step": 9029 + }, + { + "epoch": 0.76, + "grad_norm": 0.35242131679930966, + "learning_rate": 1.4263192812003302e-06, + "loss": 0.0816, + "step": 9030 + }, + { + "epoch": 0.76, + "grad_norm": 0.2960467544019307, + "learning_rate": 1.425365017783235e-06, + "loss": 0.0619, + "step": 9031 + }, + { + "epoch": 0.76, + "grad_norm": 0.4752485813627897, + "learning_rate": 1.4244110206257772e-06, + "loss": 0.0896, + "step": 9032 + }, + { + "epoch": 0.76, + "grad_norm": 0.23606901469735783, + "learning_rate": 1.4234572897990146e-06, + "loss": 0.0649, + "step": 9033 + }, + { + "epoch": 0.76, + "grad_norm": 0.44986533527721007, + "learning_rate": 1.4225038253739887e-06, + "loss": 0.1064, + "step": 9034 + }, + { + "epoch": 0.76, + "grad_norm": 0.33491455372354395, + "learning_rate": 1.4215506274217189e-06, + "loss": 0.1008, + "step": 9035 + }, + { + "epoch": 0.76, + "grad_norm": 0.3167114358511054, + "learning_rate": 1.4205976960132045e-06, + "loss": 0.0877, + "step": 9036 + }, + { + "epoch": 0.76, + "grad_norm": 0.28092484899191167, + "learning_rate": 1.4196450312194232e-06, + "loss": 0.0765, + "step": 9037 + }, + { + "epoch": 0.76, + "grad_norm": 0.4751160830065029, + "learning_rate": 1.4186926331113392e-06, + "loss": 0.1232, + "step": 9038 + }, + { + "epoch": 0.76, + "grad_norm": 0.2703189887778611, + "learning_rate": 1.41774050175989e-06, + "loss": 0.069, + "step": 9039 + }, + { + "epoch": 0.76, + "grad_norm": 0.506201789950593, + "learning_rate": 1.4167886372359958e-06, + "loss": 0.1004, + "step": 9040 + }, + { + "epoch": 0.76, + "grad_norm": 0.5566671157665424, + "learning_rate": 1.4158370396105581e-06, + "loss": 0.0776, + "step": 9041 + }, + { + "epoch": 0.76, + "grad_norm": 0.22019667106651766, + "learning_rate": 1.4148857089544577e-06, + "loss": 0.0658, + "step": 9042 + }, + { + "epoch": 0.76, + "grad_norm": 0.36813301464474457, + "learning_rate": 1.413934645338554e-06, + "loss": 0.0966, + "step": 9043 + }, + { + "epoch": 0.76, + "grad_norm": 0.27196913329447064, + "learning_rate": 1.4129838488336872e-06, + "loss": 0.08, + "step": 9044 + }, + { + "epoch": 0.76, + "grad_norm": 0.3308766565428638, + "learning_rate": 1.4120333195106807e-06, + "loss": 0.0701, + "step": 9045 + }, + { + "epoch": 0.76, + "grad_norm": 0.34774797564339244, + "learning_rate": 1.4110830574403339e-06, + "loss": 0.1055, + "step": 9046 + }, + { + "epoch": 0.76, + "grad_norm": 0.3179537533855218, + "learning_rate": 1.4101330626934262e-06, + "loss": 0.0775, + "step": 9047 + }, + { + "epoch": 0.76, + "grad_norm": 0.26022121076761734, + "learning_rate": 1.4091833353407224e-06, + "loss": 0.0829, + "step": 9048 + }, + { + "epoch": 0.76, + "grad_norm": 0.2856584692743634, + "learning_rate": 1.4082338754529618e-06, + "loss": 0.0825, + "step": 9049 + }, + { + "epoch": 0.76, + "grad_norm": 0.3280682834203328, + "learning_rate": 1.4072846831008641e-06, + "loss": 0.0642, + "step": 9050 + }, + { + "epoch": 0.76, + "grad_norm": 0.242506963688167, + "learning_rate": 1.4063357583551341e-06, + "loss": 0.0506, + "step": 9051 + }, + { + "epoch": 0.76, + "grad_norm": 0.32885076680305075, + "learning_rate": 1.4053871012864511e-06, + "loss": 0.091, + "step": 9052 + }, + { + "epoch": 0.76, + "grad_norm": 0.34717192053713514, + "learning_rate": 1.4044387119654774e-06, + "loss": 0.1118, + "step": 9053 + }, + { + "epoch": 0.76, + "grad_norm": 0.3344377698357655, + "learning_rate": 1.403490590462852e-06, + "loss": 0.111, + "step": 9054 + }, + { + "epoch": 0.76, + "grad_norm": 0.28658875547217344, + "learning_rate": 1.4025427368492006e-06, + "loss": 0.0758, + "step": 9055 + }, + { + "epoch": 0.76, + "grad_norm": 0.25315245928228775, + "learning_rate": 1.4015951511951227e-06, + "loss": 0.0668, + "step": 9056 + }, + { + "epoch": 0.76, + "grad_norm": 0.2055357957307144, + "learning_rate": 1.4006478335711987e-06, + "loss": 0.0669, + "step": 9057 + }, + { + "epoch": 0.76, + "grad_norm": 0.32148181635280576, + "learning_rate": 1.3997007840479937e-06, + "loss": 0.0895, + "step": 9058 + }, + { + "epoch": 0.76, + "grad_norm": 0.40570641281715086, + "learning_rate": 1.3987540026960473e-06, + "loss": 0.0946, + "step": 9059 + }, + { + "epoch": 0.76, + "grad_norm": 0.18528781036277567, + "learning_rate": 1.3978074895858818e-06, + "loss": 0.0563, + "step": 9060 + }, + { + "epoch": 0.76, + "grad_norm": 0.2979438698244309, + "learning_rate": 1.396861244787997e-06, + "loss": 0.0877, + "step": 9061 + }, + { + "epoch": 0.76, + "grad_norm": 0.3233573299270331, + "learning_rate": 1.3959152683728776e-06, + "loss": 0.1171, + "step": 9062 + }, + { + "epoch": 0.76, + "grad_norm": 0.24922987460857152, + "learning_rate": 1.3949695604109854e-06, + "loss": 0.0646, + "step": 9063 + }, + { + "epoch": 0.76, + "grad_norm": 0.2914190294519474, + "learning_rate": 1.394024120972759e-06, + "loss": 0.0786, + "step": 9064 + }, + { + "epoch": 0.76, + "grad_norm": 0.2988407277398744, + "learning_rate": 1.393078950128624e-06, + "loss": 0.1035, + "step": 9065 + }, + { + "epoch": 0.76, + "grad_norm": 0.1876797214361556, + "learning_rate": 1.392134047948981e-06, + "loss": 0.0581, + "step": 9066 + }, + { + "epoch": 0.76, + "grad_norm": 0.34075821483212215, + "learning_rate": 1.3911894145042093e-06, + "loss": 0.0977, + "step": 9067 + }, + { + "epoch": 0.76, + "grad_norm": 0.24360904545632936, + "learning_rate": 1.3902450498646753e-06, + "loss": 0.0586, + "step": 9068 + }, + { + "epoch": 0.76, + "grad_norm": 0.35727690519778, + "learning_rate": 1.389300954100718e-06, + "loss": 0.0693, + "step": 9069 + }, + { + "epoch": 0.76, + "grad_norm": 0.23223491557066642, + "learning_rate": 1.388357127282659e-06, + "loss": 0.0559, + "step": 9070 + }, + { + "epoch": 0.76, + "grad_norm": 0.31778657430077106, + "learning_rate": 1.3874135694807994e-06, + "loss": 0.0852, + "step": 9071 + }, + { + "epoch": 0.76, + "grad_norm": 0.2911285208593612, + "learning_rate": 1.386470280765424e-06, + "loss": 0.0588, + "step": 9072 + }, + { + "epoch": 0.76, + "grad_norm": 0.4002003470793741, + "learning_rate": 1.3855272612067917e-06, + "loss": 0.1063, + "step": 9073 + }, + { + "epoch": 0.76, + "grad_norm": 0.4373750741467451, + "learning_rate": 1.3845845108751438e-06, + "loss": 0.1128, + "step": 9074 + }, + { + "epoch": 0.76, + "grad_norm": 0.3090594402600677, + "learning_rate": 1.3836420298407044e-06, + "loss": 0.0491, + "step": 9075 + }, + { + "epoch": 0.76, + "grad_norm": 0.3259196951884869, + "learning_rate": 1.3826998181736728e-06, + "loss": 0.0587, + "step": 9076 + }, + { + "epoch": 0.76, + "grad_norm": 0.5512487342771755, + "learning_rate": 1.381757875944232e-06, + "loss": 0.122, + "step": 9077 + }, + { + "epoch": 0.76, + "grad_norm": 0.2575720598354923, + "learning_rate": 1.3808162032225403e-06, + "loss": 0.0876, + "step": 9078 + }, + { + "epoch": 0.76, + "grad_norm": 0.21443003769851024, + "learning_rate": 1.3798748000787427e-06, + "loss": 0.0794, + "step": 9079 + }, + { + "epoch": 0.77, + "grad_norm": 0.2561444915863631, + "learning_rate": 1.378933666582959e-06, + "loss": 0.058, + "step": 9080 + }, + { + "epoch": 0.77, + "grad_norm": 0.28728586866455486, + "learning_rate": 1.3779928028052885e-06, + "loss": 0.0733, + "step": 9081 + }, + { + "epoch": 0.77, + "grad_norm": 0.3745453796970236, + "learning_rate": 1.377052208815815e-06, + "loss": 0.1394, + "step": 9082 + }, + { + "epoch": 0.77, + "grad_norm": 0.6595915030224648, + "learning_rate": 1.3761118846845989e-06, + "loss": 0.1048, + "step": 9083 + }, + { + "epoch": 0.77, + "grad_norm": 0.3234201462504862, + "learning_rate": 1.375171830481678e-06, + "loss": 0.0848, + "step": 9084 + }, + { + "epoch": 0.77, + "grad_norm": 0.1976262481838455, + "learning_rate": 1.3742320462770775e-06, + "loss": 0.0634, + "step": 9085 + }, + { + "epoch": 0.77, + "grad_norm": 0.208103518678512, + "learning_rate": 1.3732925321407958e-06, + "loss": 0.0363, + "step": 9086 + }, + { + "epoch": 0.77, + "grad_norm": 0.2975968535939927, + "learning_rate": 1.372353288142813e-06, + "loss": 0.0994, + "step": 9087 + }, + { + "epoch": 0.77, + "grad_norm": 0.5200256443960833, + "learning_rate": 1.371414314353089e-06, + "loss": 0.1527, + "step": 9088 + }, + { + "epoch": 0.77, + "grad_norm": 0.21033096410764265, + "learning_rate": 1.3704756108415657e-06, + "loss": 0.0391, + "step": 9089 + }, + { + "epoch": 0.77, + "grad_norm": 0.2809555751680728, + "learning_rate": 1.369537177678163e-06, + "loss": 0.0802, + "step": 9090 + }, + { + "epoch": 0.77, + "grad_norm": 0.8239302190455301, + "learning_rate": 1.36859901493278e-06, + "loss": 0.1815, + "step": 9091 + }, + { + "epoch": 0.77, + "grad_norm": 0.31557592179690763, + "learning_rate": 1.3676611226752973e-06, + "loss": 0.0675, + "step": 9092 + }, + { + "epoch": 0.77, + "grad_norm": 0.2549647237244421, + "learning_rate": 1.366723500975572e-06, + "loss": 0.0728, + "step": 9093 + }, + { + "epoch": 0.77, + "grad_norm": 0.3716165060551907, + "learning_rate": 1.3657861499034474e-06, + "loss": 0.0965, + "step": 9094 + }, + { + "epoch": 0.77, + "grad_norm": 0.3536941715085643, + "learning_rate": 1.364849069528742e-06, + "loss": 0.0634, + "step": 9095 + }, + { + "epoch": 0.77, + "grad_norm": 0.40249659657989595, + "learning_rate": 1.3639122599212534e-06, + "loss": 0.1064, + "step": 9096 + }, + { + "epoch": 0.77, + "grad_norm": 0.26664910689930643, + "learning_rate": 1.3629757211507598e-06, + "loss": 0.0606, + "step": 9097 + }, + { + "epoch": 0.77, + "grad_norm": 0.28032972595210925, + "learning_rate": 1.362039453287024e-06, + "loss": 0.076, + "step": 9098 + }, + { + "epoch": 0.77, + "grad_norm": 0.24761656936911136, + "learning_rate": 1.3611034563997816e-06, + "loss": 0.0857, + "step": 9099 + }, + { + "epoch": 0.77, + "grad_norm": 0.28718012479688754, + "learning_rate": 1.3601677305587524e-06, + "loss": 0.0794, + "step": 9100 + }, + { + "epoch": 0.77, + "grad_norm": 0.48448441228178585, + "learning_rate": 1.359232275833633e-06, + "loss": 0.0779, + "step": 9101 + }, + { + "epoch": 0.77, + "grad_norm": 0.1887399522682956, + "learning_rate": 1.3582970922941036e-06, + "loss": 0.0413, + "step": 9102 + }, + { + "epoch": 0.77, + "grad_norm": 0.26410487586101533, + "learning_rate": 1.3573621800098213e-06, + "loss": 0.0644, + "step": 9103 + }, + { + "epoch": 0.77, + "grad_norm": 0.30879792360296227, + "learning_rate": 1.3564275390504244e-06, + "loss": 0.1012, + "step": 9104 + }, + { + "epoch": 0.77, + "grad_norm": 0.3307359260223385, + "learning_rate": 1.3554931694855278e-06, + "loss": 0.054, + "step": 9105 + }, + { + "epoch": 0.77, + "grad_norm": 0.24141346817593878, + "learning_rate": 1.3545590713847323e-06, + "loss": 0.06, + "step": 9106 + }, + { + "epoch": 0.77, + "grad_norm": 0.22715120002187913, + "learning_rate": 1.3536252448176135e-06, + "loss": 0.07, + "step": 9107 + }, + { + "epoch": 0.77, + "grad_norm": 0.2146055360311639, + "learning_rate": 1.3526916898537262e-06, + "loss": 0.0561, + "step": 9108 + }, + { + "epoch": 0.77, + "grad_norm": 0.18162295880046728, + "learning_rate": 1.351758406562611e-06, + "loss": 0.0491, + "step": 9109 + }, + { + "epoch": 0.77, + "grad_norm": 0.2947989667758957, + "learning_rate": 1.3508253950137822e-06, + "loss": 0.0871, + "step": 9110 + }, + { + "epoch": 0.77, + "grad_norm": 0.28819258608365494, + "learning_rate": 1.3498926552767362e-06, + "loss": 0.0311, + "step": 9111 + }, + { + "epoch": 0.77, + "grad_norm": 0.1701447588430931, + "learning_rate": 1.3489601874209468e-06, + "loss": 0.0332, + "step": 9112 + }, + { + "epoch": 0.77, + "grad_norm": 0.18461017934954646, + "learning_rate": 1.3480279915158728e-06, + "loss": 0.0512, + "step": 9113 + }, + { + "epoch": 0.77, + "grad_norm": 0.2948992444065201, + "learning_rate": 1.3470960676309491e-06, + "loss": 0.07, + "step": 9114 + }, + { + "epoch": 0.77, + "grad_norm": 0.23553140670887057, + "learning_rate": 1.346164415835588e-06, + "loss": 0.0367, + "step": 9115 + }, + { + "epoch": 0.77, + "grad_norm": 0.2808188165900945, + "learning_rate": 1.3452330361991877e-06, + "loss": 0.0512, + "step": 9116 + }, + { + "epoch": 0.77, + "grad_norm": 0.3347905601974998, + "learning_rate": 1.3443019287911218e-06, + "loss": 0.0891, + "step": 9117 + }, + { + "epoch": 0.77, + "grad_norm": 0.4357652473059663, + "learning_rate": 1.3433710936807426e-06, + "loss": 0.1338, + "step": 9118 + }, + { + "epoch": 0.77, + "grad_norm": 0.24090227958287116, + "learning_rate": 1.3424405309373873e-06, + "loss": 0.0771, + "step": 9119 + }, + { + "epoch": 0.77, + "grad_norm": 0.3876223704845664, + "learning_rate": 1.3415102406303681e-06, + "loss": 0.1145, + "step": 9120 + }, + { + "epoch": 0.77, + "grad_norm": 0.31469508362196214, + "learning_rate": 1.3405802228289788e-06, + "loss": 0.0673, + "step": 9121 + }, + { + "epoch": 0.77, + "grad_norm": 0.27908118014739175, + "learning_rate": 1.33965047760249e-06, + "loss": 0.0745, + "step": 9122 + }, + { + "epoch": 0.77, + "grad_norm": 0.2367200269629889, + "learning_rate": 1.3387210050201588e-06, + "loss": 0.0529, + "step": 9123 + }, + { + "epoch": 0.77, + "grad_norm": 0.3037619408955906, + "learning_rate": 1.3377918051512157e-06, + "loss": 0.0689, + "step": 9124 + }, + { + "epoch": 0.77, + "grad_norm": 0.2136408276494688, + "learning_rate": 1.3368628780648712e-06, + "loss": 0.0557, + "step": 9125 + }, + { + "epoch": 0.77, + "grad_norm": 0.39559527285243673, + "learning_rate": 1.3359342238303203e-06, + "loss": 0.1235, + "step": 9126 + }, + { + "epoch": 0.77, + "grad_norm": 0.2982668715143579, + "learning_rate": 1.3350058425167334e-06, + "loss": 0.0854, + "step": 9127 + }, + { + "epoch": 0.77, + "grad_norm": 0.3249028934435638, + "learning_rate": 1.3340777341932616e-06, + "loss": 0.0907, + "step": 9128 + }, + { + "epoch": 0.77, + "grad_norm": 0.39201012760442144, + "learning_rate": 1.3331498989290337e-06, + "loss": 0.0988, + "step": 9129 + }, + { + "epoch": 0.77, + "grad_norm": 0.33211321089772666, + "learning_rate": 1.3322223367931643e-06, + "loss": 0.0883, + "step": 9130 + }, + { + "epoch": 0.77, + "grad_norm": 0.31617862146563375, + "learning_rate": 1.3312950478547416e-06, + "loss": 0.0764, + "step": 9131 + }, + { + "epoch": 0.77, + "grad_norm": 0.36963105389555884, + "learning_rate": 1.3303680321828332e-06, + "loss": 0.0905, + "step": 9132 + }, + { + "epoch": 0.77, + "grad_norm": 0.29613206935387587, + "learning_rate": 1.3294412898464937e-06, + "loss": 0.0451, + "step": 9133 + }, + { + "epoch": 0.77, + "grad_norm": 0.313363258168235, + "learning_rate": 1.3285148209147486e-06, + "loss": 0.1025, + "step": 9134 + }, + { + "epoch": 0.77, + "grad_norm": 0.18470489519287475, + "learning_rate": 1.3275886254566068e-06, + "loss": 0.0358, + "step": 9135 + }, + { + "epoch": 0.77, + "grad_norm": 0.45664323715895416, + "learning_rate": 1.3266627035410584e-06, + "loss": 0.0869, + "step": 9136 + }, + { + "epoch": 0.77, + "grad_norm": 0.21957774998844323, + "learning_rate": 1.3257370552370713e-06, + "loss": 0.051, + "step": 9137 + }, + { + "epoch": 0.77, + "grad_norm": 0.21910337263325588, + "learning_rate": 1.324811680613592e-06, + "loss": 0.0512, + "step": 9138 + }, + { + "epoch": 0.77, + "grad_norm": 0.42484304280494495, + "learning_rate": 1.3238865797395466e-06, + "loss": 0.1014, + "step": 9139 + }, + { + "epoch": 0.77, + "grad_norm": 0.315009704950275, + "learning_rate": 1.3229617526838457e-06, + "loss": 0.0835, + "step": 9140 + }, + { + "epoch": 0.77, + "grad_norm": 0.3325039849436051, + "learning_rate": 1.3220371995153735e-06, + "loss": 0.079, + "step": 9141 + }, + { + "epoch": 0.77, + "grad_norm": 0.3588680149133279, + "learning_rate": 1.3211129203029953e-06, + "loss": 0.0825, + "step": 9142 + }, + { + "epoch": 0.77, + "grad_norm": 0.4209612026699083, + "learning_rate": 1.320188915115559e-06, + "loss": 0.1119, + "step": 9143 + }, + { + "epoch": 0.77, + "grad_norm": 0.23459897043118016, + "learning_rate": 1.319265184021889e-06, + "loss": 0.0746, + "step": 9144 + }, + { + "epoch": 0.77, + "grad_norm": 0.4809473992240087, + "learning_rate": 1.3183417270907894e-06, + "loss": 0.129, + "step": 9145 + }, + { + "epoch": 0.77, + "grad_norm": 0.30819298183566163, + "learning_rate": 1.3174185443910442e-06, + "loss": 0.1015, + "step": 9146 + }, + { + "epoch": 0.77, + "grad_norm": 0.4470678667944507, + "learning_rate": 1.3164956359914198e-06, + "loss": 0.102, + "step": 9147 + }, + { + "epoch": 0.77, + "grad_norm": 0.24118226356446548, + "learning_rate": 1.3155730019606588e-06, + "loss": 0.0757, + "step": 9148 + }, + { + "epoch": 0.77, + "grad_norm": 0.35950195381662287, + "learning_rate": 1.314650642367482e-06, + "loss": 0.0841, + "step": 9149 + }, + { + "epoch": 0.77, + "grad_norm": 0.49782855569517626, + "learning_rate": 1.3137285572805957e-06, + "loss": 0.1508, + "step": 9150 + }, + { + "epoch": 0.77, + "grad_norm": 0.4145138392072528, + "learning_rate": 1.3128067467686812e-06, + "loss": 0.0867, + "step": 9151 + }, + { + "epoch": 0.77, + "grad_norm": 0.16381227587322522, + "learning_rate": 1.3118852109003988e-06, + "loss": 0.0573, + "step": 9152 + }, + { + "epoch": 0.77, + "grad_norm": 0.3255996905238742, + "learning_rate": 1.31096394974439e-06, + "loss": 0.0913, + "step": 9153 + }, + { + "epoch": 0.77, + "grad_norm": 0.34773943274912794, + "learning_rate": 1.3100429633692774e-06, + "loss": 0.0996, + "step": 9154 + }, + { + "epoch": 0.77, + "grad_norm": 0.4416283287016242, + "learning_rate": 1.3091222518436607e-06, + "loss": 0.0966, + "step": 9155 + }, + { + "epoch": 0.77, + "grad_norm": 0.5321065695552621, + "learning_rate": 1.3082018152361182e-06, + "loss": 0.1166, + "step": 9156 + }, + { + "epoch": 0.77, + "grad_norm": 0.20566322082248967, + "learning_rate": 1.307281653615212e-06, + "loss": 0.0526, + "step": 9157 + }, + { + "epoch": 0.77, + "grad_norm": 0.2884705810384233, + "learning_rate": 1.3063617670494799e-06, + "loss": 0.0688, + "step": 9158 + }, + { + "epoch": 0.77, + "grad_norm": 0.2814858366153959, + "learning_rate": 1.305442155607441e-06, + "loss": 0.0588, + "step": 9159 + }, + { + "epoch": 0.77, + "grad_norm": 0.21303761314197944, + "learning_rate": 1.3045228193575922e-06, + "loss": 0.0538, + "step": 9160 + }, + { + "epoch": 0.77, + "grad_norm": 0.36587992532023206, + "learning_rate": 1.3036037583684102e-06, + "loss": 0.0966, + "step": 9161 + }, + { + "epoch": 0.77, + "grad_norm": 0.3936324947936362, + "learning_rate": 1.3026849727083552e-06, + "loss": 0.0784, + "step": 9162 + }, + { + "epoch": 0.77, + "grad_norm": 0.33937712973154177, + "learning_rate": 1.3017664624458615e-06, + "loss": 0.0716, + "step": 9163 + }, + { + "epoch": 0.77, + "grad_norm": 0.3208734756274665, + "learning_rate": 1.3008482276493457e-06, + "loss": 0.0861, + "step": 9164 + }, + { + "epoch": 0.77, + "grad_norm": 0.31395310269000604, + "learning_rate": 1.2999302683872011e-06, + "loss": 0.0975, + "step": 9165 + }, + { + "epoch": 0.77, + "grad_norm": 0.41125509611979827, + "learning_rate": 1.2990125847278068e-06, + "loss": 0.1026, + "step": 9166 + }, + { + "epoch": 0.77, + "grad_norm": 0.34437017223147487, + "learning_rate": 1.2980951767395144e-06, + "loss": 0.0754, + "step": 9167 + }, + { + "epoch": 0.77, + "grad_norm": 0.25752273104505297, + "learning_rate": 1.2971780444906585e-06, + "loss": 0.0587, + "step": 9168 + }, + { + "epoch": 0.77, + "grad_norm": 0.35324211165188757, + "learning_rate": 1.296261188049553e-06, + "loss": 0.0838, + "step": 9169 + }, + { + "epoch": 0.77, + "grad_norm": 0.35892680823859235, + "learning_rate": 1.2953446074844879e-06, + "loss": 0.0987, + "step": 9170 + }, + { + "epoch": 0.77, + "grad_norm": 0.22923565241839478, + "learning_rate": 1.2944283028637394e-06, + "loss": 0.0646, + "step": 9171 + }, + { + "epoch": 0.77, + "grad_norm": 0.3184439509659246, + "learning_rate": 1.2935122742555577e-06, + "loss": 0.0971, + "step": 9172 + }, + { + "epoch": 0.77, + "grad_norm": 0.34755485185078694, + "learning_rate": 1.2925965217281717e-06, + "loss": 0.0544, + "step": 9173 + }, + { + "epoch": 0.77, + "grad_norm": 0.268085972182783, + "learning_rate": 1.2916810453497958e-06, + "loss": 0.0731, + "step": 9174 + }, + { + "epoch": 0.77, + "grad_norm": 0.2683416658219828, + "learning_rate": 1.2907658451886184e-06, + "loss": 0.0638, + "step": 9175 + }, + { + "epoch": 0.77, + "grad_norm": 0.2530225295580043, + "learning_rate": 1.2898509213128068e-06, + "loss": 0.0648, + "step": 9176 + }, + { + "epoch": 0.77, + "grad_norm": 0.21962678440856956, + "learning_rate": 1.2889362737905136e-06, + "loss": 0.0821, + "step": 9177 + }, + { + "epoch": 0.77, + "grad_norm": 0.27066394516084225, + "learning_rate": 1.288021902689865e-06, + "loss": 0.0637, + "step": 9178 + }, + { + "epoch": 0.77, + "grad_norm": 0.3184093914824342, + "learning_rate": 1.287107808078969e-06, + "loss": 0.0648, + "step": 9179 + }, + { + "epoch": 0.77, + "grad_norm": 0.37685532950081446, + "learning_rate": 1.2861939900259112e-06, + "loss": 0.0861, + "step": 9180 + }, + { + "epoch": 0.77, + "grad_norm": 0.2838812393048947, + "learning_rate": 1.2852804485987613e-06, + "loss": 0.0412, + "step": 9181 + }, + { + "epoch": 0.77, + "grad_norm": 0.24668458375793811, + "learning_rate": 1.2843671838655635e-06, + "loss": 0.0644, + "step": 9182 + }, + { + "epoch": 0.77, + "grad_norm": 0.223491805500163, + "learning_rate": 1.2834541958943408e-06, + "loss": 0.046, + "step": 9183 + }, + { + "epoch": 0.77, + "grad_norm": 0.4674453226110058, + "learning_rate": 1.2825414847531026e-06, + "loss": 0.1145, + "step": 9184 + }, + { + "epoch": 0.77, + "grad_norm": 0.3542464639609329, + "learning_rate": 1.2816290505098294e-06, + "loss": 0.0781, + "step": 9185 + }, + { + "epoch": 0.77, + "grad_norm": 0.4728523806385116, + "learning_rate": 1.280716893232486e-06, + "loss": 0.094, + "step": 9186 + }, + { + "epoch": 0.77, + "grad_norm": 0.3154872018629003, + "learning_rate": 1.2798050129890139e-06, + "loss": 0.0666, + "step": 9187 + }, + { + "epoch": 0.77, + "grad_norm": 0.3053727599756188, + "learning_rate": 1.2788934098473365e-06, + "loss": 0.0938, + "step": 9188 + }, + { + "epoch": 0.77, + "grad_norm": 0.41722895534651366, + "learning_rate": 1.277982083875356e-06, + "loss": 0.0877, + "step": 9189 + }, + { + "epoch": 0.77, + "grad_norm": 0.3560267057030059, + "learning_rate": 1.2770710351409504e-06, + "loss": 0.0835, + "step": 9190 + }, + { + "epoch": 0.77, + "grad_norm": 0.5332085312168945, + "learning_rate": 1.276160263711983e-06, + "loss": 0.1016, + "step": 9191 + }, + { + "epoch": 0.77, + "grad_norm": 0.6967215794854511, + "learning_rate": 1.2752497696562927e-06, + "loss": 0.144, + "step": 9192 + }, + { + "epoch": 0.77, + "grad_norm": 0.5367054611624333, + "learning_rate": 1.2743395530416959e-06, + "loss": 0.1107, + "step": 9193 + }, + { + "epoch": 0.77, + "grad_norm": 0.3672236201648932, + "learning_rate": 1.2734296139359942e-06, + "loss": 0.0808, + "step": 9194 + }, + { + "epoch": 0.77, + "grad_norm": 0.4027181556175779, + "learning_rate": 1.2725199524069642e-06, + "loss": 0.0809, + "step": 9195 + }, + { + "epoch": 0.77, + "grad_norm": 0.3246616688708011, + "learning_rate": 1.2716105685223617e-06, + "loss": 0.0715, + "step": 9196 + }, + { + "epoch": 0.77, + "grad_norm": 0.3479525139436896, + "learning_rate": 1.2707014623499226e-06, + "loss": 0.0861, + "step": 9197 + }, + { + "epoch": 0.78, + "grad_norm": 0.4270912548981726, + "learning_rate": 1.2697926339573647e-06, + "loss": 0.099, + "step": 9198 + }, + { + "epoch": 0.78, + "grad_norm": 0.32769536113991343, + "learning_rate": 1.2688840834123812e-06, + "loss": 0.0908, + "step": 9199 + }, + { + "epoch": 0.78, + "grad_norm": 0.3344201919727719, + "learning_rate": 1.267975810782645e-06, + "loss": 0.0936, + "step": 9200 + }, + { + "epoch": 0.78, + "grad_norm": 0.4994865853553957, + "learning_rate": 1.2670678161358129e-06, + "loss": 0.115, + "step": 9201 + }, + { + "epoch": 0.78, + "grad_norm": 0.5327470747544643, + "learning_rate": 1.2661600995395158e-06, + "loss": 0.1174, + "step": 9202 + }, + { + "epoch": 0.78, + "grad_norm": 0.283788195718664, + "learning_rate": 1.2652526610613657e-06, + "loss": 0.0755, + "step": 9203 + }, + { + "epoch": 0.78, + "grad_norm": 0.34880036782477436, + "learning_rate": 1.2643455007689526e-06, + "loss": 0.0793, + "step": 9204 + }, + { + "epoch": 0.78, + "grad_norm": 0.3276318482962535, + "learning_rate": 1.2634386187298493e-06, + "loss": 0.0734, + "step": 9205 + }, + { + "epoch": 0.78, + "grad_norm": 0.2283586728070119, + "learning_rate": 1.262532015011605e-06, + "loss": 0.0301, + "step": 9206 + }, + { + "epoch": 0.78, + "grad_norm": 0.3618973209143697, + "learning_rate": 1.2616256896817474e-06, + "loss": 0.08, + "step": 9207 + }, + { + "epoch": 0.78, + "grad_norm": 0.4024403584861021, + "learning_rate": 1.2607196428077873e-06, + "loss": 0.1147, + "step": 9208 + }, + { + "epoch": 0.78, + "grad_norm": 0.4073580010558178, + "learning_rate": 1.2598138744572107e-06, + "loss": 0.1262, + "step": 9209 + }, + { + "epoch": 0.78, + "grad_norm": 0.3276037349107275, + "learning_rate": 1.2589083846974843e-06, + "loss": 0.0776, + "step": 9210 + }, + { + "epoch": 0.78, + "grad_norm": 0.3585427443266532, + "learning_rate": 1.2580031735960556e-06, + "loss": 0.0879, + "step": 9211 + }, + { + "epoch": 0.78, + "grad_norm": 0.3405047620428224, + "learning_rate": 1.2570982412203498e-06, + "loss": 0.0964, + "step": 9212 + }, + { + "epoch": 0.78, + "grad_norm": 0.2501209030875898, + "learning_rate": 1.2561935876377706e-06, + "loss": 0.0512, + "step": 9213 + }, + { + "epoch": 0.78, + "grad_norm": 0.25599665798845916, + "learning_rate": 1.2552892129157012e-06, + "loss": 0.0581, + "step": 9214 + }, + { + "epoch": 0.78, + "grad_norm": 0.3103634267271324, + "learning_rate": 1.2543851171215072e-06, + "loss": 0.0854, + "step": 9215 + }, + { + "epoch": 0.78, + "grad_norm": 0.2320527994653225, + "learning_rate": 1.2534813003225293e-06, + "loss": 0.0664, + "step": 9216 + }, + { + "epoch": 0.78, + "grad_norm": 0.3151179113093802, + "learning_rate": 1.252577762586088e-06, + "loss": 0.0988, + "step": 9217 + }, + { + "epoch": 0.78, + "grad_norm": 0.549686231503436, + "learning_rate": 1.2516745039794864e-06, + "loss": 0.116, + "step": 9218 + }, + { + "epoch": 0.78, + "grad_norm": 0.23671336435756016, + "learning_rate": 1.2507715245700036e-06, + "loss": 0.0606, + "step": 9219 + }, + { + "epoch": 0.78, + "grad_norm": 0.3860078526211724, + "learning_rate": 1.2498688244248985e-06, + "loss": 0.0888, + "step": 9220 + }, + { + "epoch": 0.78, + "grad_norm": 0.2924911321483093, + "learning_rate": 1.2489664036114079e-06, + "loss": 0.0998, + "step": 9221 + }, + { + "epoch": 0.78, + "grad_norm": 0.2864676631900646, + "learning_rate": 1.2480642621967525e-06, + "loss": 0.0735, + "step": 9222 + }, + { + "epoch": 0.78, + "grad_norm": 0.6617321428723307, + "learning_rate": 1.2471624002481276e-06, + "loss": 0.1301, + "step": 9223 + }, + { + "epoch": 0.78, + "grad_norm": 0.2187410370201043, + "learning_rate": 1.2462608178327074e-06, + "loss": 0.0528, + "step": 9224 + }, + { + "epoch": 0.78, + "grad_norm": 0.3264560369202226, + "learning_rate": 1.24535951501765e-06, + "loss": 0.0771, + "step": 9225 + }, + { + "epoch": 0.78, + "grad_norm": 0.2871133741514348, + "learning_rate": 1.2444584918700885e-06, + "loss": 0.0725, + "step": 9226 + }, + { + "epoch": 0.78, + "grad_norm": 0.6245343908210043, + "learning_rate": 1.2435577484571366e-06, + "loss": 0.1023, + "step": 9227 + }, + { + "epoch": 0.78, + "grad_norm": 0.276183308165265, + "learning_rate": 1.2426572848458857e-06, + "loss": 0.1067, + "step": 9228 + }, + { + "epoch": 0.78, + "grad_norm": 0.3394828686624172, + "learning_rate": 1.2417571011034075e-06, + "loss": 0.0745, + "step": 9229 + }, + { + "epoch": 0.78, + "grad_norm": 0.46067134800713977, + "learning_rate": 1.2408571972967547e-06, + "loss": 0.1118, + "step": 9230 + }, + { + "epoch": 0.78, + "grad_norm": 0.17730063589145664, + "learning_rate": 1.239957573492957e-06, + "loss": 0.0257, + "step": 9231 + }, + { + "epoch": 0.78, + "grad_norm": 0.2116527428913274, + "learning_rate": 1.2390582297590225e-06, + "loss": 0.0589, + "step": 9232 + }, + { + "epoch": 0.78, + "grad_norm": 0.3254278097208631, + "learning_rate": 1.238159166161939e-06, + "loss": 0.0542, + "step": 9233 + }, + { + "epoch": 0.78, + "grad_norm": 0.26352865096776623, + "learning_rate": 1.237260382768677e-06, + "loss": 0.0488, + "step": 9234 + }, + { + "epoch": 0.78, + "grad_norm": 0.17625151302969838, + "learning_rate": 1.2363618796461807e-06, + "loss": 0.0566, + "step": 9235 + }, + { + "epoch": 0.78, + "grad_norm": 0.28049986475212957, + "learning_rate": 1.2354636568613766e-06, + "loss": 0.0959, + "step": 9236 + }, + { + "epoch": 0.78, + "grad_norm": 0.2145859191594404, + "learning_rate": 1.2345657144811695e-06, + "loss": 0.0606, + "step": 9237 + }, + { + "epoch": 0.78, + "grad_norm": 0.46653354540853426, + "learning_rate": 1.2336680525724415e-06, + "loss": 0.0602, + "step": 9238 + }, + { + "epoch": 0.78, + "grad_norm": 0.230994084036041, + "learning_rate": 1.2327706712020594e-06, + "loss": 0.0648, + "step": 9239 + }, + { + "epoch": 0.78, + "grad_norm": 0.4335279778063143, + "learning_rate": 1.2318735704368634e-06, + "loss": 0.1039, + "step": 9240 + }, + { + "epoch": 0.78, + "grad_norm": 0.3564292176371483, + "learning_rate": 1.2309767503436731e-06, + "loss": 0.0649, + "step": 9241 + }, + { + "epoch": 0.78, + "grad_norm": 0.3133147494815925, + "learning_rate": 1.2300802109892918e-06, + "loss": 0.0685, + "step": 9242 + }, + { + "epoch": 0.78, + "grad_norm": 0.4414189658587401, + "learning_rate": 1.229183952440498e-06, + "loss": 0.0981, + "step": 9243 + }, + { + "epoch": 0.78, + "grad_norm": 0.22359238565160444, + "learning_rate": 1.228287974764049e-06, + "loss": 0.0335, + "step": 9244 + }, + { + "epoch": 0.78, + "grad_norm": 0.3510933683235041, + "learning_rate": 1.2273922780266845e-06, + "loss": 0.0646, + "step": 9245 + }, + { + "epoch": 0.78, + "grad_norm": 0.21996907851273628, + "learning_rate": 1.2264968622951208e-06, + "loss": 0.036, + "step": 9246 + }, + { + "epoch": 0.78, + "grad_norm": 0.4398368183051803, + "learning_rate": 1.225601727636052e-06, + "loss": 0.0989, + "step": 9247 + }, + { + "epoch": 0.78, + "grad_norm": 0.25834230864058244, + "learning_rate": 1.2247068741161533e-06, + "loss": 0.0855, + "step": 9248 + }, + { + "epoch": 0.78, + "grad_norm": 0.3094501529566432, + "learning_rate": 1.2238123018020808e-06, + "loss": 0.0679, + "step": 9249 + }, + { + "epoch": 0.78, + "grad_norm": 0.2053615124491915, + "learning_rate": 1.222918010760466e-06, + "loss": 0.0627, + "step": 9250 + }, + { + "epoch": 0.78, + "grad_norm": 0.3007431007395312, + "learning_rate": 1.2220240010579193e-06, + "loss": 0.083, + "step": 9251 + }, + { + "epoch": 0.78, + "grad_norm": 0.3087149869198408, + "learning_rate": 1.2211302727610347e-06, + "loss": 0.0436, + "step": 9252 + }, + { + "epoch": 0.78, + "grad_norm": 0.2215414533531904, + "learning_rate": 1.2202368259363812e-06, + "loss": 0.0604, + "step": 9253 + }, + { + "epoch": 0.78, + "grad_norm": 0.43182902303850856, + "learning_rate": 1.2193436606505072e-06, + "loss": 0.1244, + "step": 9254 + }, + { + "epoch": 0.78, + "grad_norm": 0.19047223059665688, + "learning_rate": 1.21845077696994e-06, + "loss": 0.0568, + "step": 9255 + }, + { + "epoch": 0.78, + "grad_norm": 0.30423573990868696, + "learning_rate": 1.2175581749611898e-06, + "loss": 0.0728, + "step": 9256 + }, + { + "epoch": 0.78, + "grad_norm": 0.5251447613032193, + "learning_rate": 1.2166658546907411e-06, + "loss": 0.0776, + "step": 9257 + }, + { + "epoch": 0.78, + "grad_norm": 0.465648460561916, + "learning_rate": 1.2157738162250577e-06, + "loss": 0.0872, + "step": 9258 + }, + { + "epoch": 0.78, + "grad_norm": 0.3260908016750977, + "learning_rate": 1.214882059630586e-06, + "loss": 0.1006, + "step": 9259 + }, + { + "epoch": 0.78, + "grad_norm": 0.5012418158960525, + "learning_rate": 1.2139905849737487e-06, + "loss": 0.0946, + "step": 9260 + }, + { + "epoch": 0.78, + "grad_norm": 0.364099262400626, + "learning_rate": 1.2130993923209467e-06, + "loss": 0.0668, + "step": 9261 + }, + { + "epoch": 0.78, + "grad_norm": 0.39582601520545835, + "learning_rate": 1.2122084817385638e-06, + "loss": 0.0755, + "step": 9262 + }, + { + "epoch": 0.78, + "grad_norm": 0.32905043591367644, + "learning_rate": 1.2113178532929582e-06, + "loss": 0.0686, + "step": 9263 + }, + { + "epoch": 0.78, + "grad_norm": 0.2627850880012583, + "learning_rate": 1.21042750705047e-06, + "loss": 0.0835, + "step": 9264 + }, + { + "epoch": 0.78, + "grad_norm": 0.4961173326127303, + "learning_rate": 1.2095374430774154e-06, + "loss": 0.0997, + "step": 9265 + }, + { + "epoch": 0.78, + "grad_norm": 0.5857777439196745, + "learning_rate": 1.2086476614400943e-06, + "loss": 0.1383, + "step": 9266 + }, + { + "epoch": 0.78, + "grad_norm": 0.49850260175246414, + "learning_rate": 1.2077581622047818e-06, + "loss": 0.104, + "step": 9267 + }, + { + "epoch": 0.78, + "grad_norm": 0.28579229557836094, + "learning_rate": 1.2068689454377313e-06, + "loss": 0.0399, + "step": 9268 + }, + { + "epoch": 0.78, + "grad_norm": 0.2604370825992014, + "learning_rate": 1.20598001120518e-06, + "loss": 0.0637, + "step": 9269 + }, + { + "epoch": 0.78, + "grad_norm": 0.30371635232132094, + "learning_rate": 1.2050913595733392e-06, + "loss": 0.0857, + "step": 9270 + }, + { + "epoch": 0.78, + "grad_norm": 0.2928447481850043, + "learning_rate": 1.2042029906084012e-06, + "loss": 0.0631, + "step": 9271 + }, + { + "epoch": 0.78, + "grad_norm": 0.49681443954098764, + "learning_rate": 1.2033149043765346e-06, + "loss": 0.1116, + "step": 9272 + }, + { + "epoch": 0.78, + "grad_norm": 0.544656570669009, + "learning_rate": 1.2024271009438932e-06, + "loss": 0.0924, + "step": 9273 + }, + { + "epoch": 0.78, + "grad_norm": 0.3663706898653954, + "learning_rate": 1.2015395803766034e-06, + "loss": 0.0748, + "step": 9274 + }, + { + "epoch": 0.78, + "grad_norm": 0.3213708679614775, + "learning_rate": 1.2006523427407724e-06, + "loss": 0.0367, + "step": 9275 + }, + { + "epoch": 0.78, + "grad_norm": 0.19040594980044329, + "learning_rate": 1.1997653881024884e-06, + "loss": 0.0582, + "step": 9276 + }, + { + "epoch": 0.78, + "grad_norm": 0.2837855617236932, + "learning_rate": 1.198878716527817e-06, + "loss": 0.0672, + "step": 9277 + }, + { + "epoch": 0.78, + "grad_norm": 0.4637737504449664, + "learning_rate": 1.1979923280828016e-06, + "loss": 0.1137, + "step": 9278 + }, + { + "epoch": 0.78, + "grad_norm": 0.2865487749627345, + "learning_rate": 1.1971062228334646e-06, + "loss": 0.0667, + "step": 9279 + }, + { + "epoch": 0.78, + "grad_norm": 0.22232684739949388, + "learning_rate": 1.196220400845811e-06, + "loss": 0.066, + "step": 9280 + }, + { + "epoch": 0.78, + "grad_norm": 0.2594550711372454, + "learning_rate": 1.1953348621858207e-06, + "loss": 0.067, + "step": 9281 + }, + { + "epoch": 0.78, + "grad_norm": 0.32639688506198294, + "learning_rate": 1.1944496069194523e-06, + "loss": 0.0922, + "step": 9282 + }, + { + "epoch": 0.78, + "grad_norm": 0.4181197692851612, + "learning_rate": 1.1935646351126478e-06, + "loss": 0.1064, + "step": 9283 + }, + { + "epoch": 0.78, + "grad_norm": 0.33395983062068574, + "learning_rate": 1.1926799468313227e-06, + "loss": 0.064, + "step": 9284 + }, + { + "epoch": 0.78, + "grad_norm": 0.3839744243208467, + "learning_rate": 1.1917955421413735e-06, + "loss": 0.0679, + "step": 9285 + }, + { + "epoch": 0.78, + "grad_norm": 0.3463911864700188, + "learning_rate": 1.190911421108678e-06, + "loss": 0.1059, + "step": 9286 + }, + { + "epoch": 0.78, + "grad_norm": 0.40826901514633357, + "learning_rate": 1.1900275837990899e-06, + "loss": 0.0953, + "step": 9287 + }, + { + "epoch": 0.78, + "grad_norm": 0.44036851412061884, + "learning_rate": 1.1891440302784418e-06, + "loss": 0.1145, + "step": 9288 + }, + { + "epoch": 0.78, + "grad_norm": 0.3649675073626109, + "learning_rate": 1.1882607606125447e-06, + "loss": 0.0628, + "step": 9289 + }, + { + "epoch": 0.78, + "grad_norm": 0.3263009285481507, + "learning_rate": 1.1873777748671922e-06, + "loss": 0.0714, + "step": 9290 + }, + { + "epoch": 0.78, + "grad_norm": 0.5055607782042301, + "learning_rate": 1.1864950731081536e-06, + "loss": 0.106, + "step": 9291 + }, + { + "epoch": 0.78, + "grad_norm": 0.5152625006457674, + "learning_rate": 1.185612655401176e-06, + "loss": 0.0843, + "step": 9292 + }, + { + "epoch": 0.78, + "grad_norm": 0.5653781212651706, + "learning_rate": 1.1847305218119898e-06, + "loss": 0.1353, + "step": 9293 + }, + { + "epoch": 0.78, + "grad_norm": 0.37880194789335686, + "learning_rate": 1.1838486724062992e-06, + "loss": 0.0972, + "step": 9294 + }, + { + "epoch": 0.78, + "grad_norm": 0.4308745733146846, + "learning_rate": 1.1829671072497906e-06, + "loss": 0.0874, + "step": 9295 + }, + { + "epoch": 0.78, + "grad_norm": 0.39757064981961926, + "learning_rate": 1.1820858264081281e-06, + "loss": 0.0918, + "step": 9296 + }, + { + "epoch": 0.78, + "grad_norm": 0.29906010198248384, + "learning_rate": 1.1812048299469526e-06, + "loss": 0.0659, + "step": 9297 + }, + { + "epoch": 0.78, + "grad_norm": 0.3784480265726095, + "learning_rate": 1.1803241179318886e-06, + "loss": 0.0988, + "step": 9298 + }, + { + "epoch": 0.78, + "grad_norm": 0.6050789612783471, + "learning_rate": 1.1794436904285355e-06, + "loss": 0.093, + "step": 9299 + }, + { + "epoch": 0.78, + "grad_norm": 0.30023379946434486, + "learning_rate": 1.1785635475024731e-06, + "loss": 0.0902, + "step": 9300 + }, + { + "epoch": 0.78, + "grad_norm": 0.30685317374439186, + "learning_rate": 1.1776836892192578e-06, + "loss": 0.0837, + "step": 9301 + }, + { + "epoch": 0.78, + "grad_norm": 0.20348060964221507, + "learning_rate": 1.1768041156444293e-06, + "loss": 0.0794, + "step": 9302 + }, + { + "epoch": 0.78, + "grad_norm": 0.2671707214757036, + "learning_rate": 1.175924826843502e-06, + "loss": 0.0526, + "step": 9303 + }, + { + "epoch": 0.78, + "grad_norm": 0.22407515347863294, + "learning_rate": 1.17504582288197e-06, + "loss": 0.082, + "step": 9304 + }, + { + "epoch": 0.78, + "grad_norm": 0.46308565975101823, + "learning_rate": 1.1741671038253077e-06, + "loss": 0.1088, + "step": 9305 + }, + { + "epoch": 0.78, + "grad_norm": 0.29804296510829625, + "learning_rate": 1.173288669738965e-06, + "loss": 0.0588, + "step": 9306 + }, + { + "epoch": 0.78, + "grad_norm": 0.3496570055014727, + "learning_rate": 1.1724105206883763e-06, + "loss": 0.0816, + "step": 9307 + }, + { + "epoch": 0.78, + "grad_norm": 0.43357340188482607, + "learning_rate": 1.171532656738949e-06, + "loss": 0.0857, + "step": 9308 + }, + { + "epoch": 0.78, + "grad_norm": 0.37015159066891573, + "learning_rate": 1.1706550779560705e-06, + "loss": 0.1109, + "step": 9309 + }, + { + "epoch": 0.78, + "grad_norm": 0.40703606575616763, + "learning_rate": 1.1697777844051105e-06, + "loss": 0.0656, + "step": 9310 + }, + { + "epoch": 0.78, + "grad_norm": 0.39581708565725676, + "learning_rate": 1.1689007761514143e-06, + "loss": 0.1043, + "step": 9311 + }, + { + "epoch": 0.78, + "grad_norm": 0.41312565780093147, + "learning_rate": 1.1680240532603055e-06, + "loss": 0.1044, + "step": 9312 + }, + { + "epoch": 0.78, + "grad_norm": 0.3341608806589965, + "learning_rate": 1.1671476157970873e-06, + "loss": 0.0681, + "step": 9313 + }, + { + "epoch": 0.78, + "grad_norm": 0.19637090260168866, + "learning_rate": 1.166271463827044e-06, + "loss": 0.0394, + "step": 9314 + }, + { + "epoch": 0.78, + "grad_norm": 0.43301115687759745, + "learning_rate": 1.1653955974154353e-06, + "loss": 0.07, + "step": 9315 + }, + { + "epoch": 0.78, + "grad_norm": 0.32825181940964554, + "learning_rate": 1.164520016627499e-06, + "loss": 0.0679, + "step": 9316 + }, + { + "epoch": 0.79, + "grad_norm": 0.316220326367159, + "learning_rate": 1.1636447215284563e-06, + "loss": 0.0787, + "step": 9317 + }, + { + "epoch": 0.79, + "grad_norm": 0.46761205507602793, + "learning_rate": 1.1627697121835035e-06, + "loss": 0.1006, + "step": 9318 + }, + { + "epoch": 0.79, + "grad_norm": 0.19645546475225717, + "learning_rate": 1.1618949886578145e-06, + "loss": 0.0591, + "step": 9319 + }, + { + "epoch": 0.79, + "grad_norm": 0.34900732390252254, + "learning_rate": 1.1610205510165473e-06, + "loss": 0.1121, + "step": 9320 + }, + { + "epoch": 0.79, + "grad_norm": 0.5355660971225606, + "learning_rate": 1.1601463993248329e-06, + "loss": 0.1237, + "step": 9321 + }, + { + "epoch": 0.79, + "grad_norm": 0.2886544909843859, + "learning_rate": 1.1592725336477833e-06, + "loss": 0.0707, + "step": 9322 + }, + { + "epoch": 0.79, + "grad_norm": 0.2902073720988028, + "learning_rate": 1.158398954050488e-06, + "loss": 0.0926, + "step": 9323 + }, + { + "epoch": 0.79, + "grad_norm": 0.36037309962481706, + "learning_rate": 1.1575256605980189e-06, + "loss": 0.109, + "step": 9324 + }, + { + "epoch": 0.79, + "grad_norm": 0.2708472773171315, + "learning_rate": 1.1566526533554228e-06, + "loss": 0.0882, + "step": 9325 + }, + { + "epoch": 0.79, + "grad_norm": 0.44194293310601235, + "learning_rate": 1.155779932387725e-06, + "loss": 0.11, + "step": 9326 + }, + { + "epoch": 0.79, + "grad_norm": 0.26089661809459846, + "learning_rate": 1.1549074977599333e-06, + "loss": 0.0586, + "step": 9327 + }, + { + "epoch": 0.79, + "grad_norm": 0.47339368871831516, + "learning_rate": 1.1540353495370304e-06, + "loss": 0.096, + "step": 9328 + }, + { + "epoch": 0.79, + "grad_norm": 0.21961815733042944, + "learning_rate": 1.1531634877839792e-06, + "loss": 0.0358, + "step": 9329 + }, + { + "epoch": 0.79, + "grad_norm": 0.45987807831637395, + "learning_rate": 1.152291912565719e-06, + "loss": 0.0832, + "step": 9330 + }, + { + "epoch": 0.79, + "grad_norm": 0.2347152976061844, + "learning_rate": 1.1514206239471737e-06, + "loss": 0.0445, + "step": 9331 + }, + { + "epoch": 0.79, + "grad_norm": 0.2968466021735074, + "learning_rate": 1.1505496219932399e-06, + "loss": 0.0911, + "step": 9332 + }, + { + "epoch": 0.79, + "grad_norm": 0.2340864911808723, + "learning_rate": 1.1496789067687935e-06, + "loss": 0.0709, + "step": 9333 + }, + { + "epoch": 0.79, + "grad_norm": 0.28987640847842494, + "learning_rate": 1.1488084783386933e-06, + "loss": 0.0681, + "step": 9334 + }, + { + "epoch": 0.79, + "grad_norm": 0.5341363955376943, + "learning_rate": 1.147938336767772e-06, + "loss": 0.1337, + "step": 9335 + }, + { + "epoch": 0.79, + "grad_norm": 0.29881233165587984, + "learning_rate": 1.1470684821208428e-06, + "loss": 0.0965, + "step": 9336 + }, + { + "epoch": 0.79, + "grad_norm": 0.30295929237120667, + "learning_rate": 1.146198914462699e-06, + "loss": 0.0934, + "step": 9337 + }, + { + "epoch": 0.79, + "grad_norm": 0.3824552462172366, + "learning_rate": 1.14532963385811e-06, + "loss": 0.0656, + "step": 9338 + }, + { + "epoch": 0.79, + "grad_norm": 0.2830981094687693, + "learning_rate": 1.144460640371825e-06, + "loss": 0.0703, + "step": 9339 + }, + { + "epoch": 0.79, + "grad_norm": 0.3180052576142439, + "learning_rate": 1.1435919340685702e-06, + "loss": 0.0696, + "step": 9340 + }, + { + "epoch": 0.79, + "grad_norm": 0.29043449646022373, + "learning_rate": 1.142723515013055e-06, + "loss": 0.0599, + "step": 9341 + }, + { + "epoch": 0.79, + "grad_norm": 0.22981331829063498, + "learning_rate": 1.1418553832699624e-06, + "loss": 0.0535, + "step": 9342 + }, + { + "epoch": 0.79, + "grad_norm": 0.29175778970561117, + "learning_rate": 1.140987538903955e-06, + "loss": 0.0707, + "step": 9343 + }, + { + "epoch": 0.79, + "grad_norm": 0.3211786055903773, + "learning_rate": 1.1401199819796777e-06, + "loss": 0.0903, + "step": 9344 + }, + { + "epoch": 0.79, + "grad_norm": 0.516524402508203, + "learning_rate": 1.139252712561749e-06, + "loss": 0.1199, + "step": 9345 + }, + { + "epoch": 0.79, + "grad_norm": 0.2929054470081414, + "learning_rate": 1.1383857307147695e-06, + "loss": 0.0884, + "step": 9346 + }, + { + "epoch": 0.79, + "grad_norm": 0.3427345335274688, + "learning_rate": 1.1375190365033145e-06, + "loss": 0.1023, + "step": 9347 + }, + { + "epoch": 0.79, + "grad_norm": 0.32549223324032595, + "learning_rate": 1.1366526299919438e-06, + "loss": 0.0626, + "step": 9348 + }, + { + "epoch": 0.79, + "grad_norm": 0.32181859855254363, + "learning_rate": 1.135786511245191e-06, + "loss": 0.0783, + "step": 9349 + }, + { + "epoch": 0.79, + "grad_norm": 0.23688833559680336, + "learning_rate": 1.1349206803275682e-06, + "loss": 0.0619, + "step": 9350 + }, + { + "epoch": 0.79, + "grad_norm": 0.4372714170111681, + "learning_rate": 1.1340551373035701e-06, + "loss": 0.0986, + "step": 9351 + }, + { + "epoch": 0.79, + "grad_norm": 0.20914236746793263, + "learning_rate": 1.1331898822376664e-06, + "loss": 0.0488, + "step": 9352 + }, + { + "epoch": 0.79, + "grad_norm": 0.5617233165118068, + "learning_rate": 1.1323249151943045e-06, + "loss": 0.1539, + "step": 9353 + }, + { + "epoch": 0.79, + "grad_norm": 0.4582047601348013, + "learning_rate": 1.131460236237915e-06, + "loss": 0.132, + "step": 9354 + }, + { + "epoch": 0.79, + "grad_norm": 0.29709266278517865, + "learning_rate": 1.1305958454329036e-06, + "loss": 0.0928, + "step": 9355 + }, + { + "epoch": 0.79, + "grad_norm": 0.27799174256557124, + "learning_rate": 1.1297317428436545e-06, + "loss": 0.0757, + "step": 9356 + }, + { + "epoch": 0.79, + "grad_norm": 0.6172203818227928, + "learning_rate": 1.1288679285345288e-06, + "loss": 0.1081, + "step": 9357 + }, + { + "epoch": 0.79, + "grad_norm": 0.31712321927323595, + "learning_rate": 1.1280044025698733e-06, + "loss": 0.0719, + "step": 9358 + }, + { + "epoch": 0.79, + "grad_norm": 0.36042875625158505, + "learning_rate": 1.1271411650140052e-06, + "loss": 0.0791, + "step": 9359 + }, + { + "epoch": 0.79, + "grad_norm": 0.2150978497167032, + "learning_rate": 1.1262782159312236e-06, + "loss": 0.043, + "step": 9360 + }, + { + "epoch": 0.79, + "grad_norm": 0.25335583963677155, + "learning_rate": 1.125415555385807e-06, + "loss": 0.0723, + "step": 9361 + }, + { + "epoch": 0.79, + "grad_norm": 0.22182092215421234, + "learning_rate": 1.1245531834420115e-06, + "loss": 0.0281, + "step": 9362 + }, + { + "epoch": 0.79, + "grad_norm": 0.43031058521395876, + "learning_rate": 1.123691100164071e-06, + "loss": 0.093, + "step": 9363 + }, + { + "epoch": 0.79, + "grad_norm": 0.2539893310142868, + "learning_rate": 1.1228293056161987e-06, + "loss": 0.0653, + "step": 9364 + }, + { + "epoch": 0.79, + "grad_norm": 0.3199911626174709, + "learning_rate": 1.1219677998625845e-06, + "loss": 0.0815, + "step": 9365 + }, + { + "epoch": 0.79, + "grad_norm": 0.4732875326076397, + "learning_rate": 1.1211065829674007e-06, + "loss": 0.1319, + "step": 9366 + }, + { + "epoch": 0.79, + "grad_norm": 0.2207601332851445, + "learning_rate": 1.1202456549947954e-06, + "loss": 0.0752, + "step": 9367 + }, + { + "epoch": 0.79, + "grad_norm": 0.38324574821991314, + "learning_rate": 1.1193850160088948e-06, + "loss": 0.0999, + "step": 9368 + }, + { + "epoch": 0.79, + "grad_norm": 0.36410890361579085, + "learning_rate": 1.1185246660738025e-06, + "loss": 0.0888, + "step": 9369 + }, + { + "epoch": 0.79, + "grad_norm": 0.4077372457861351, + "learning_rate": 1.1176646052536067e-06, + "loss": 0.0973, + "step": 9370 + }, + { + "epoch": 0.79, + "grad_norm": 0.2874286157242859, + "learning_rate": 1.1168048336123666e-06, + "loss": 0.0816, + "step": 9371 + }, + { + "epoch": 0.79, + "grad_norm": 0.27483479656349746, + "learning_rate": 1.1159453512141238e-06, + "loss": 0.0678, + "step": 9372 + }, + { + "epoch": 0.79, + "grad_norm": 0.4328440612785833, + "learning_rate": 1.1150861581228978e-06, + "loss": 0.129, + "step": 9373 + }, + { + "epoch": 0.79, + "grad_norm": 0.16645049465152567, + "learning_rate": 1.114227254402685e-06, + "loss": 0.0431, + "step": 9374 + }, + { + "epoch": 0.79, + "grad_norm": 0.36472653749834893, + "learning_rate": 1.113368640117463e-06, + "loss": 0.0938, + "step": 9375 + }, + { + "epoch": 0.79, + "grad_norm": 0.24997845642505542, + "learning_rate": 1.1125103153311873e-06, + "loss": 0.0609, + "step": 9376 + }, + { + "epoch": 0.79, + "grad_norm": 0.38933672911441664, + "learning_rate": 1.1116522801077873e-06, + "loss": 0.0717, + "step": 9377 + }, + { + "epoch": 0.79, + "grad_norm": 0.4833171667719454, + "learning_rate": 1.1107945345111781e-06, + "loss": 0.1001, + "step": 9378 + }, + { + "epoch": 0.79, + "grad_norm": 0.32797178174366665, + "learning_rate": 1.1099370786052489e-06, + "loss": 0.0967, + "step": 9379 + }, + { + "epoch": 0.79, + "grad_norm": 0.3053390590459215, + "learning_rate": 1.109079912453867e-06, + "loss": 0.0913, + "step": 9380 + }, + { + "epoch": 0.79, + "grad_norm": 0.35632643637962647, + "learning_rate": 1.108223036120878e-06, + "loss": 0.095, + "step": 9381 + }, + { + "epoch": 0.79, + "grad_norm": 0.29643629164448054, + "learning_rate": 1.1073664496701103e-06, + "loss": 0.0865, + "step": 9382 + }, + { + "epoch": 0.79, + "grad_norm": 0.24622018429467024, + "learning_rate": 1.1065101531653654e-06, + "loss": 0.0506, + "step": 9383 + }, + { + "epoch": 0.79, + "grad_norm": 0.2145037311033517, + "learning_rate": 1.105654146670424e-06, + "loss": 0.0571, + "step": 9384 + }, + { + "epoch": 0.79, + "grad_norm": 0.3297512004467472, + "learning_rate": 1.1047984302490495e-06, + "loss": 0.0641, + "step": 9385 + }, + { + "epoch": 0.79, + "grad_norm": 0.48553134972174045, + "learning_rate": 1.1039430039649795e-06, + "loss": 0.1001, + "step": 9386 + }, + { + "epoch": 0.79, + "grad_norm": 0.22196605932519084, + "learning_rate": 1.1030878678819307e-06, + "loss": 0.0671, + "step": 9387 + }, + { + "epoch": 0.79, + "grad_norm": 0.4530690811035253, + "learning_rate": 1.1022330220635969e-06, + "loss": 0.0837, + "step": 9388 + }, + { + "epoch": 0.79, + "grad_norm": 0.25768154944476174, + "learning_rate": 1.1013784665736554e-06, + "loss": 0.049, + "step": 9389 + }, + { + "epoch": 0.79, + "grad_norm": 0.38111299082343153, + "learning_rate": 1.1005242014757567e-06, + "loss": 0.0901, + "step": 9390 + }, + { + "epoch": 0.79, + "grad_norm": 0.2726035560376019, + "learning_rate": 1.0996702268335308e-06, + "loss": 0.0579, + "step": 9391 + }, + { + "epoch": 0.79, + "grad_norm": 0.2721059596813946, + "learning_rate": 1.0988165427105885e-06, + "loss": 0.0635, + "step": 9392 + }, + { + "epoch": 0.79, + "grad_norm": 0.3614775055241465, + "learning_rate": 1.0979631491705162e-06, + "loss": 0.0854, + "step": 9393 + }, + { + "epoch": 0.79, + "grad_norm": 0.23025476417282853, + "learning_rate": 1.097110046276878e-06, + "loss": 0.0787, + "step": 9394 + }, + { + "epoch": 0.79, + "grad_norm": 0.25897108429739835, + "learning_rate": 1.096257234093221e-06, + "loss": 0.0837, + "step": 9395 + }, + { + "epoch": 0.79, + "grad_norm": 0.412858913285288, + "learning_rate": 1.0954047126830663e-06, + "loss": 0.1279, + "step": 9396 + }, + { + "epoch": 0.79, + "grad_norm": 0.26515248655763857, + "learning_rate": 1.0945524821099147e-06, + "loss": 0.0595, + "step": 9397 + }, + { + "epoch": 0.79, + "grad_norm": 0.35337904666777814, + "learning_rate": 1.0937005424372431e-06, + "loss": 0.0966, + "step": 9398 + }, + { + "epoch": 0.79, + "grad_norm": 0.3444547127862197, + "learning_rate": 1.092848893728513e-06, + "loss": 0.0751, + "step": 9399 + }, + { + "epoch": 0.79, + "grad_norm": 0.45933753487575285, + "learning_rate": 1.091997536047158e-06, + "loss": 0.1194, + "step": 9400 + }, + { + "epoch": 0.79, + "grad_norm": 0.25231411626635764, + "learning_rate": 1.091146469456591e-06, + "loss": 0.0591, + "step": 9401 + }, + { + "epoch": 0.79, + "grad_norm": 0.20793834255996602, + "learning_rate": 1.090295694020207e-06, + "loss": 0.043, + "step": 9402 + }, + { + "epoch": 0.79, + "grad_norm": 0.374460615764852, + "learning_rate": 1.0894452098013758e-06, + "loss": 0.0895, + "step": 9403 + }, + { + "epoch": 0.79, + "grad_norm": 0.4684536880803592, + "learning_rate": 1.0885950168634457e-06, + "loss": 0.1639, + "step": 9404 + }, + { + "epoch": 0.79, + "grad_norm": 0.26704351179738445, + "learning_rate": 1.0877451152697428e-06, + "loss": 0.0655, + "step": 9405 + }, + { + "epoch": 0.79, + "grad_norm": 0.2640189603666819, + "learning_rate": 1.0868955050835755e-06, + "loss": 0.0636, + "step": 9406 + }, + { + "epoch": 0.79, + "grad_norm": 0.2912415975300197, + "learning_rate": 1.086046186368227e-06, + "loss": 0.0785, + "step": 9407 + }, + { + "epoch": 0.79, + "grad_norm": 0.4176305951783421, + "learning_rate": 1.0851971591869575e-06, + "loss": 0.0815, + "step": 9408 + }, + { + "epoch": 0.79, + "grad_norm": 0.44509235245635737, + "learning_rate": 1.0843484236030105e-06, + "loss": 0.0883, + "step": 9409 + }, + { + "epoch": 0.79, + "grad_norm": 0.3256122554546421, + "learning_rate": 1.083499979679603e-06, + "loss": 0.0714, + "step": 9410 + }, + { + "epoch": 0.79, + "grad_norm": 0.28567074769986633, + "learning_rate": 1.0826518274799313e-06, + "loss": 0.0631, + "step": 9411 + }, + { + "epoch": 0.79, + "grad_norm": 0.28451706882348354, + "learning_rate": 1.0818039670671725e-06, + "loss": 0.0414, + "step": 9412 + }, + { + "epoch": 0.79, + "grad_norm": 0.44350728196451367, + "learning_rate": 1.0809563985044796e-06, + "loss": 0.1002, + "step": 9413 + }, + { + "epoch": 0.79, + "grad_norm": 0.34426239445685, + "learning_rate": 1.0801091218549847e-06, + "loss": 0.0714, + "step": 9414 + }, + { + "epoch": 0.79, + "grad_norm": 0.36844961118504277, + "learning_rate": 1.0792621371817957e-06, + "loss": 0.0865, + "step": 9415 + }, + { + "epoch": 0.79, + "grad_norm": 0.49899936843600834, + "learning_rate": 1.0784154445480043e-06, + "loss": 0.0888, + "step": 9416 + }, + { + "epoch": 0.79, + "grad_norm": 0.23109702330938567, + "learning_rate": 1.0775690440166753e-06, + "loss": 0.0423, + "step": 9417 + }, + { + "epoch": 0.79, + "grad_norm": 0.34440355863584776, + "learning_rate": 1.0767229356508524e-06, + "loss": 0.0808, + "step": 9418 + }, + { + "epoch": 0.79, + "grad_norm": 0.35611727418842365, + "learning_rate": 1.0758771195135614e-06, + "loss": 0.1075, + "step": 9419 + }, + { + "epoch": 0.79, + "grad_norm": 0.4896959184577979, + "learning_rate": 1.0750315956678025e-06, + "loss": 0.1127, + "step": 9420 + }, + { + "epoch": 0.79, + "grad_norm": 0.3277182256566695, + "learning_rate": 1.0741863641765548e-06, + "loss": 0.0632, + "step": 9421 + }, + { + "epoch": 0.79, + "grad_norm": 0.38255715330385964, + "learning_rate": 1.0733414251027751e-06, + "loss": 0.1369, + "step": 9422 + }, + { + "epoch": 0.79, + "grad_norm": 0.29490165596071827, + "learning_rate": 1.0724967785094014e-06, + "loss": 0.0388, + "step": 9423 + }, + { + "epoch": 0.79, + "grad_norm": 0.2831306258948654, + "learning_rate": 1.0716524244593473e-06, + "loss": 0.1043, + "step": 9424 + }, + { + "epoch": 0.79, + "grad_norm": 0.36518045132182364, + "learning_rate": 1.0708083630155036e-06, + "loss": 0.0751, + "step": 9425 + }, + { + "epoch": 0.79, + "grad_norm": 0.5797122499433458, + "learning_rate": 1.069964594240744e-06, + "loss": 0.1318, + "step": 9426 + }, + { + "epoch": 0.79, + "grad_norm": 0.27735092485057206, + "learning_rate": 1.069121118197915e-06, + "loss": 0.0579, + "step": 9427 + }, + { + "epoch": 0.79, + "grad_norm": 0.22515569090773913, + "learning_rate": 1.0682779349498428e-06, + "loss": 0.0728, + "step": 9428 + }, + { + "epoch": 0.79, + "grad_norm": 0.248802714047661, + "learning_rate": 1.0674350445593357e-06, + "loss": 0.0567, + "step": 9429 + }, + { + "epoch": 0.79, + "grad_norm": 0.2321232648998306, + "learning_rate": 1.0665924470891753e-06, + "loss": 0.085, + "step": 9430 + }, + { + "epoch": 0.79, + "grad_norm": 0.38019342045157584, + "learning_rate": 1.0657501426021233e-06, + "loss": 0.0992, + "step": 9431 + }, + { + "epoch": 0.79, + "grad_norm": 0.3023057906984438, + "learning_rate": 1.0649081311609188e-06, + "loss": 0.0664, + "step": 9432 + }, + { + "epoch": 0.79, + "grad_norm": 0.2961175596293181, + "learning_rate": 1.0640664128282797e-06, + "loss": 0.084, + "step": 9433 + }, + { + "epoch": 0.79, + "grad_norm": 0.39992940364904905, + "learning_rate": 1.0632249876669037e-06, + "loss": 0.0906, + "step": 9434 + }, + { + "epoch": 0.79, + "grad_norm": 0.2050863137714559, + "learning_rate": 1.062383855739464e-06, + "loss": 0.0747, + "step": 9435 + }, + { + "epoch": 0.8, + "grad_norm": 0.5692649664640083, + "learning_rate": 1.0615430171086127e-06, + "loss": 0.1, + "step": 9436 + }, + { + "epoch": 0.8, + "grad_norm": 0.3718390881509671, + "learning_rate": 1.0607024718369802e-06, + "loss": 0.1043, + "step": 9437 + }, + { + "epoch": 0.8, + "grad_norm": 0.2930905590612547, + "learning_rate": 1.059862219987176e-06, + "loss": 0.0807, + "step": 9438 + }, + { + "epoch": 0.8, + "grad_norm": 0.38051474016125814, + "learning_rate": 1.0590222616217877e-06, + "loss": 0.0935, + "step": 9439 + }, + { + "epoch": 0.8, + "grad_norm": 0.3255129636430382, + "learning_rate": 1.0581825968033788e-06, + "loss": 0.0671, + "step": 9440 + }, + { + "epoch": 0.8, + "grad_norm": 0.7213886319785722, + "learning_rate": 1.0573432255944926e-06, + "loss": 0.14, + "step": 9441 + }, + { + "epoch": 0.8, + "grad_norm": 0.4026173774861173, + "learning_rate": 1.0565041480576493e-06, + "loss": 0.0765, + "step": 9442 + }, + { + "epoch": 0.8, + "grad_norm": 0.29046274420787477, + "learning_rate": 1.0556653642553517e-06, + "loss": 0.0629, + "step": 9443 + }, + { + "epoch": 0.8, + "grad_norm": 0.4015107018020187, + "learning_rate": 1.0548268742500745e-06, + "loss": 0.1216, + "step": 9444 + }, + { + "epoch": 0.8, + "grad_norm": 0.2945006087799275, + "learning_rate": 1.053988678104273e-06, + "loss": 0.0869, + "step": 9445 + }, + { + "epoch": 0.8, + "grad_norm": 0.2515773430322903, + "learning_rate": 1.0531507758803827e-06, + "loss": 0.0829, + "step": 9446 + }, + { + "epoch": 0.8, + "grad_norm": 0.4900612012575581, + "learning_rate": 1.0523131676408154e-06, + "loss": 0.1599, + "step": 9447 + }, + { + "epoch": 0.8, + "grad_norm": 0.3091266708599519, + "learning_rate": 1.0514758534479601e-06, + "loss": 0.0811, + "step": 9448 + }, + { + "epoch": 0.8, + "grad_norm": 0.31748186443582654, + "learning_rate": 1.0506388333641836e-06, + "loss": 0.117, + "step": 9449 + }, + { + "epoch": 0.8, + "grad_norm": 0.22563766672562718, + "learning_rate": 1.0498021074518355e-06, + "loss": 0.0611, + "step": 9450 + }, + { + "epoch": 0.8, + "grad_norm": 0.5330579468842649, + "learning_rate": 1.0489656757732375e-06, + "loss": 0.1079, + "step": 9451 + }, + { + "epoch": 0.8, + "grad_norm": 0.37375961770527794, + "learning_rate": 1.0481295383906914e-06, + "loss": 0.0789, + "step": 9452 + }, + { + "epoch": 0.8, + "grad_norm": 0.39041849706724446, + "learning_rate": 1.0472936953664798e-06, + "loss": 0.0983, + "step": 9453 + }, + { + "epoch": 0.8, + "grad_norm": 0.30707174061388903, + "learning_rate": 1.0464581467628597e-06, + "loss": 0.096, + "step": 9454 + }, + { + "epoch": 0.8, + "grad_norm": 0.4458638824848113, + "learning_rate": 1.0456228926420682e-06, + "loss": 0.1091, + "step": 9455 + }, + { + "epoch": 0.8, + "grad_norm": 0.4239362180324649, + "learning_rate": 1.0447879330663185e-06, + "loss": 0.0947, + "step": 9456 + }, + { + "epoch": 0.8, + "grad_norm": 0.21481715140697885, + "learning_rate": 1.0439532680978054e-06, + "loss": 0.0691, + "step": 9457 + }, + { + "epoch": 0.8, + "grad_norm": 0.09852494353072135, + "learning_rate": 1.0431188977986983e-06, + "loss": 0.017, + "step": 9458 + }, + { + "epoch": 0.8, + "grad_norm": 0.34761684295440354, + "learning_rate": 1.0422848222311455e-06, + "loss": 0.0793, + "step": 9459 + }, + { + "epoch": 0.8, + "grad_norm": 0.4151762958884645, + "learning_rate": 1.0414510414572755e-06, + "loss": 0.1006, + "step": 9460 + }, + { + "epoch": 0.8, + "grad_norm": 0.3349420872015461, + "learning_rate": 1.0406175555391923e-06, + "loss": 0.0842, + "step": 9461 + }, + { + "epoch": 0.8, + "grad_norm": 0.38084513280878435, + "learning_rate": 1.039784364538977e-06, + "loss": 0.0831, + "step": 9462 + }, + { + "epoch": 0.8, + "grad_norm": 0.2971313501240971, + "learning_rate": 1.038951468518694e-06, + "loss": 0.0745, + "step": 9463 + }, + { + "epoch": 0.8, + "grad_norm": 0.4084980181400941, + "learning_rate": 1.0381188675403803e-06, + "loss": 0.1094, + "step": 9464 + }, + { + "epoch": 0.8, + "grad_norm": 0.4114010564745562, + "learning_rate": 1.0372865616660532e-06, + "loss": 0.1103, + "step": 9465 + }, + { + "epoch": 0.8, + "grad_norm": 0.22641981586402618, + "learning_rate": 1.0364545509577057e-06, + "loss": 0.0937, + "step": 9466 + }, + { + "epoch": 0.8, + "grad_norm": 0.35285123482097275, + "learning_rate": 1.035622835477314e-06, + "loss": 0.0719, + "step": 9467 + }, + { + "epoch": 0.8, + "grad_norm": 0.281639337429761, + "learning_rate": 1.0347914152868283e-06, + "loss": 0.0553, + "step": 9468 + }, + { + "epoch": 0.8, + "grad_norm": 0.1922242880042777, + "learning_rate": 1.033960290448175e-06, + "loss": 0.0475, + "step": 9469 + }, + { + "epoch": 0.8, + "grad_norm": 0.32858527518905223, + "learning_rate": 1.033129461023265e-06, + "loss": 0.0998, + "step": 9470 + }, + { + "epoch": 0.8, + "grad_norm": 0.4202812320405979, + "learning_rate": 1.0322989270739814e-06, + "loss": 0.0787, + "step": 9471 + }, + { + "epoch": 0.8, + "grad_norm": 0.24740082052772397, + "learning_rate": 1.0314686886621872e-06, + "loss": 0.0568, + "step": 9472 + }, + { + "epoch": 0.8, + "grad_norm": 0.49258359512008126, + "learning_rate": 1.0306387458497218e-06, + "loss": 0.0719, + "step": 9473 + }, + { + "epoch": 0.8, + "grad_norm": 0.43993148823945727, + "learning_rate": 1.0298090986984077e-06, + "loss": 0.0891, + "step": 9474 + }, + { + "epoch": 0.8, + "grad_norm": 0.4448497054580627, + "learning_rate": 1.0289797472700397e-06, + "loss": 0.0787, + "step": 9475 + }, + { + "epoch": 0.8, + "grad_norm": 0.2674762105525837, + "learning_rate": 1.0281506916263918e-06, + "loss": 0.0629, + "step": 9476 + }, + { + "epoch": 0.8, + "grad_norm": 0.31276316195512316, + "learning_rate": 1.0273219318292193e-06, + "loss": 0.0791, + "step": 9477 + }, + { + "epoch": 0.8, + "grad_norm": 0.3003362768657269, + "learning_rate": 1.0264934679402523e-06, + "loss": 0.0874, + "step": 9478 + }, + { + "epoch": 0.8, + "grad_norm": 0.24181030501496012, + "learning_rate": 1.0256653000211969e-06, + "loss": 0.0527, + "step": 9479 + }, + { + "epoch": 0.8, + "grad_norm": 0.344260843065536, + "learning_rate": 1.0248374281337441e-06, + "loss": 0.0731, + "step": 9480 + }, + { + "epoch": 0.8, + "grad_norm": 0.3786440009143864, + "learning_rate": 1.0240098523395569e-06, + "loss": 0.1014, + "step": 9481 + }, + { + "epoch": 0.8, + "grad_norm": 0.3003477594172609, + "learning_rate": 1.0231825727002776e-06, + "loss": 0.0839, + "step": 9482 + }, + { + "epoch": 0.8, + "grad_norm": 0.36990139911029807, + "learning_rate": 1.0223555892775256e-06, + "loss": 0.0868, + "step": 9483 + }, + { + "epoch": 0.8, + "grad_norm": 0.29915541545008373, + "learning_rate": 1.0215289021329023e-06, + "loss": 0.0727, + "step": 9484 + }, + { + "epoch": 0.8, + "grad_norm": 0.4263460358796248, + "learning_rate": 1.0207025113279827e-06, + "loss": 0.0922, + "step": 9485 + }, + { + "epoch": 0.8, + "grad_norm": 0.22455781156748722, + "learning_rate": 1.0198764169243198e-06, + "loss": 0.048, + "step": 9486 + }, + { + "epoch": 0.8, + "grad_norm": 0.24250498892971267, + "learning_rate": 1.0190506189834482e-06, + "loss": 0.0599, + "step": 9487 + }, + { + "epoch": 0.8, + "grad_norm": 0.35149428487455425, + "learning_rate": 1.0182251175668779e-06, + "loss": 0.0814, + "step": 9488 + }, + { + "epoch": 0.8, + "grad_norm": 0.3375633843484217, + "learning_rate": 1.0173999127360957e-06, + "loss": 0.1013, + "step": 9489 + }, + { + "epoch": 0.8, + "grad_norm": 0.3857001936935201, + "learning_rate": 1.0165750045525673e-06, + "loss": 0.0844, + "step": 9490 + }, + { + "epoch": 0.8, + "grad_norm": 0.24759213148661807, + "learning_rate": 1.015750393077739e-06, + "loss": 0.0445, + "step": 9491 + }, + { + "epoch": 0.8, + "grad_norm": 0.2892582334898657, + "learning_rate": 1.014926078373032e-06, + "loss": 0.0558, + "step": 9492 + }, + { + "epoch": 0.8, + "grad_norm": 0.4304169615966122, + "learning_rate": 1.0141020604998437e-06, + "loss": 0.0988, + "step": 9493 + }, + { + "epoch": 0.8, + "grad_norm": 0.2634242748307677, + "learning_rate": 1.0132783395195545e-06, + "loss": 0.0939, + "step": 9494 + }, + { + "epoch": 0.8, + "grad_norm": 0.572803060882255, + "learning_rate": 1.0124549154935192e-06, + "loss": 0.0801, + "step": 9495 + }, + { + "epoch": 0.8, + "grad_norm": 0.3375868878036133, + "learning_rate": 1.0116317884830695e-06, + "loss": 0.0914, + "step": 9496 + }, + { + "epoch": 0.8, + "grad_norm": 0.35469303096046023, + "learning_rate": 1.01080895854952e-06, + "loss": 0.0691, + "step": 9497 + }, + { + "epoch": 0.8, + "grad_norm": 0.37002450046140567, + "learning_rate": 1.0099864257541575e-06, + "loss": 0.1044, + "step": 9498 + }, + { + "epoch": 0.8, + "grad_norm": 0.33804472847245937, + "learning_rate": 1.00916419015825e-06, + "loss": 0.0706, + "step": 9499 + }, + { + "epoch": 0.8, + "grad_norm": 0.3608794011457335, + "learning_rate": 1.0083422518230412e-06, + "loss": 0.0586, + "step": 9500 + }, + { + "epoch": 0.8, + "grad_norm": 0.2131082898312194, + "learning_rate": 1.007520610809754e-06, + "loss": 0.0473, + "step": 9501 + }, + { + "epoch": 0.8, + "grad_norm": 0.22556929941496373, + "learning_rate": 1.0066992671795905e-06, + "loss": 0.0572, + "step": 9502 + }, + { + "epoch": 0.8, + "grad_norm": 0.33109044504824375, + "learning_rate": 1.0058782209937284e-06, + "loss": 0.1041, + "step": 9503 + }, + { + "epoch": 0.8, + "grad_norm": 0.2912627081265232, + "learning_rate": 1.0050574723133245e-06, + "loss": 0.0641, + "step": 9504 + }, + { + "epoch": 0.8, + "grad_norm": 0.41218942592048113, + "learning_rate": 1.0042370211995101e-06, + "loss": 0.102, + "step": 9505 + }, + { + "epoch": 0.8, + "grad_norm": 0.4096836015680222, + "learning_rate": 1.0034168677134015e-06, + "loss": 0.118, + "step": 9506 + }, + { + "epoch": 0.8, + "grad_norm": 0.43869659020070123, + "learning_rate": 1.0025970119160854e-06, + "loss": 0.1113, + "step": 9507 + }, + { + "epoch": 0.8, + "grad_norm": 0.7797877869711737, + "learning_rate": 1.0017774538686314e-06, + "loss": 0.1413, + "step": 9508 + }, + { + "epoch": 0.8, + "grad_norm": 0.25897759129555037, + "learning_rate": 1.0009581936320838e-06, + "loss": 0.0626, + "step": 9509 + }, + { + "epoch": 0.8, + "grad_norm": 0.35017725681712436, + "learning_rate": 1.0001392312674645e-06, + "loss": 0.062, + "step": 9510 + }, + { + "epoch": 0.8, + "grad_norm": 0.16520783092672198, + "learning_rate": 9.993205668357775e-07, + "loss": 0.0397, + "step": 9511 + }, + { + "epoch": 0.8, + "grad_norm": 0.31679494400956504, + "learning_rate": 9.985022003980004e-07, + "loss": 0.0793, + "step": 9512 + }, + { + "epoch": 0.8, + "grad_norm": 0.24074401494415626, + "learning_rate": 9.9768413201509e-07, + "loss": 0.0784, + "step": 9513 + }, + { + "epoch": 0.8, + "grad_norm": 0.27580190533815174, + "learning_rate": 9.968663617479796e-07, + "loss": 0.0609, + "step": 9514 + }, + { + "epoch": 0.8, + "grad_norm": 0.19535047074178918, + "learning_rate": 9.96048889657583e-07, + "loss": 0.0719, + "step": 9515 + }, + { + "epoch": 0.8, + "grad_norm": 0.2872448038897539, + "learning_rate": 9.952317158047908e-07, + "loss": 0.0733, + "step": 9516 + }, + { + "epoch": 0.8, + "grad_norm": 0.47575822851132854, + "learning_rate": 9.944148402504676e-07, + "loss": 0.0871, + "step": 9517 + }, + { + "epoch": 0.8, + "grad_norm": 0.1884864409574822, + "learning_rate": 9.935982630554636e-07, + "loss": 0.0528, + "step": 9518 + }, + { + "epoch": 0.8, + "grad_norm": 0.3458148469916603, + "learning_rate": 9.927819842805997e-07, + "loss": 0.0994, + "step": 9519 + }, + { + "epoch": 0.8, + "grad_norm": 0.1853699595890618, + "learning_rate": 9.919660039866757e-07, + "loss": 0.0487, + "step": 9520 + }, + { + "epoch": 0.8, + "grad_norm": 0.3398138573946735, + "learning_rate": 9.911503222344743e-07, + "loss": 0.1055, + "step": 9521 + }, + { + "epoch": 0.8, + "grad_norm": 0.2986193298725853, + "learning_rate": 9.903349390847493e-07, + "loss": 0.0567, + "step": 9522 + }, + { + "epoch": 0.8, + "grad_norm": 0.4304820281903488, + "learning_rate": 9.895198545982365e-07, + "loss": 0.1113, + "step": 9523 + }, + { + "epoch": 0.8, + "grad_norm": 0.32418751394090867, + "learning_rate": 9.887050688356464e-07, + "loss": 0.0761, + "step": 9524 + }, + { + "epoch": 0.8, + "grad_norm": 0.40507284821579265, + "learning_rate": 9.878905818576718e-07, + "loss": 0.086, + "step": 9525 + }, + { + "epoch": 0.8, + "grad_norm": 0.2520521625248654, + "learning_rate": 9.87076393724979e-07, + "loss": 0.0778, + "step": 9526 + }, + { + "epoch": 0.8, + "grad_norm": 0.267784860256976, + "learning_rate": 9.862625044982116e-07, + "loss": 0.0898, + "step": 9527 + }, + { + "epoch": 0.8, + "grad_norm": 0.24375663959375155, + "learning_rate": 9.854489142379958e-07, + "loss": 0.0603, + "step": 9528 + }, + { + "epoch": 0.8, + "grad_norm": 0.26066480031165973, + "learning_rate": 9.84635623004932e-07, + "loss": 0.0724, + "step": 9529 + }, + { + "epoch": 0.8, + "grad_norm": 0.2799189474938784, + "learning_rate": 9.838226308595978e-07, + "loss": 0.0391, + "step": 9530 + }, + { + "epoch": 0.8, + "grad_norm": 0.26333539435902226, + "learning_rate": 9.83009937862549e-07, + "loss": 0.0538, + "step": 9531 + }, + { + "epoch": 0.8, + "grad_norm": 0.18994937182771404, + "learning_rate": 9.82197544074322e-07, + "loss": 0.0615, + "step": 9532 + }, + { + "epoch": 0.8, + "grad_norm": 0.2911355980643953, + "learning_rate": 9.813854495554271e-07, + "loss": 0.095, + "step": 9533 + }, + { + "epoch": 0.8, + "grad_norm": 0.2823999234235467, + "learning_rate": 9.805736543663529e-07, + "loss": 0.1044, + "step": 9534 + }, + { + "epoch": 0.8, + "grad_norm": 0.37222280819664677, + "learning_rate": 9.797621585675687e-07, + "loss": 0.0971, + "step": 9535 + }, + { + "epoch": 0.8, + "grad_norm": 0.22530633357904414, + "learning_rate": 9.789509622195192e-07, + "loss": 0.0469, + "step": 9536 + }, + { + "epoch": 0.8, + "grad_norm": 0.18876721135174265, + "learning_rate": 9.781400653826244e-07, + "loss": 0.0672, + "step": 9537 + }, + { + "epoch": 0.8, + "grad_norm": 0.22955872181301532, + "learning_rate": 9.773294681172886e-07, + "loss": 0.0725, + "step": 9538 + }, + { + "epoch": 0.8, + "grad_norm": 0.22469469241823972, + "learning_rate": 9.765191704838879e-07, + "loss": 0.0543, + "step": 9539 + }, + { + "epoch": 0.8, + "grad_norm": 0.3934945196532428, + "learning_rate": 9.757091725427769e-07, + "loss": 0.0876, + "step": 9540 + }, + { + "epoch": 0.8, + "grad_norm": 0.30483661984977134, + "learning_rate": 9.748994743542895e-07, + "loss": 0.0717, + "step": 9541 + }, + { + "epoch": 0.8, + "grad_norm": 0.29198721554322654, + "learning_rate": 9.740900759787381e-07, + "loss": 0.0869, + "step": 9542 + }, + { + "epoch": 0.8, + "grad_norm": 0.4217519720494303, + "learning_rate": 9.732809774764107e-07, + "loss": 0.0812, + "step": 9543 + }, + { + "epoch": 0.8, + "grad_norm": 0.2615473542211019, + "learning_rate": 9.724721789075726e-07, + "loss": 0.0656, + "step": 9544 + }, + { + "epoch": 0.8, + "grad_norm": 0.3428016120404283, + "learning_rate": 9.716636803324692e-07, + "loss": 0.0714, + "step": 9545 + }, + { + "epoch": 0.8, + "grad_norm": 0.3221118511929357, + "learning_rate": 9.70855481811322e-07, + "loss": 0.0747, + "step": 9546 + }, + { + "epoch": 0.8, + "grad_norm": 0.33802306437093976, + "learning_rate": 9.700475834043299e-07, + "loss": 0.0471, + "step": 9547 + }, + { + "epoch": 0.8, + "grad_norm": 0.3275720590014501, + "learning_rate": 9.692399851716689e-07, + "loss": 0.0749, + "step": 9548 + }, + { + "epoch": 0.8, + "grad_norm": 0.28056324360057766, + "learning_rate": 9.68432687173496e-07, + "loss": 0.0719, + "step": 9549 + }, + { + "epoch": 0.8, + "grad_norm": 0.2963831161567858, + "learning_rate": 9.676256894699421e-07, + "loss": 0.0702, + "step": 9550 + }, + { + "epoch": 0.8, + "grad_norm": 0.2954622161252651, + "learning_rate": 9.668189921211158e-07, + "loss": 0.0719, + "step": 9551 + }, + { + "epoch": 0.8, + "grad_norm": 0.21810156831998798, + "learning_rate": 9.660125951871075e-07, + "loss": 0.071, + "step": 9552 + }, + { + "epoch": 0.8, + "grad_norm": 0.5269466745788222, + "learning_rate": 9.652064987279807e-07, + "loss": 0.1023, + "step": 9553 + }, + { + "epoch": 0.81, + "grad_norm": 0.3050697099109935, + "learning_rate": 9.644007028037771e-07, + "loss": 0.1161, + "step": 9554 + }, + { + "epoch": 0.81, + "grad_norm": 0.27508259709916427, + "learning_rate": 9.635952074745191e-07, + "loss": 0.0493, + "step": 9555 + }, + { + "epoch": 0.81, + "grad_norm": 0.6101469256763596, + "learning_rate": 9.627900128002044e-07, + "loss": 0.1157, + "step": 9556 + }, + { + "epoch": 0.81, + "grad_norm": 0.2420013315025714, + "learning_rate": 9.61985118840808e-07, + "loss": 0.0878, + "step": 9557 + }, + { + "epoch": 0.81, + "grad_norm": 0.4374001692686658, + "learning_rate": 9.611805256562811e-07, + "loss": 0.1099, + "step": 9558 + }, + { + "epoch": 0.81, + "grad_norm": 0.2688766849098861, + "learning_rate": 9.603762333065586e-07, + "loss": 0.0872, + "step": 9559 + }, + { + "epoch": 0.81, + "grad_norm": 0.36511916332830674, + "learning_rate": 9.595722418515469e-07, + "loss": 0.0732, + "step": 9560 + }, + { + "epoch": 0.81, + "grad_norm": 0.31871974683446297, + "learning_rate": 9.587685513511297e-07, + "loss": 0.0611, + "step": 9561 + }, + { + "epoch": 0.81, + "grad_norm": 0.3491156997031344, + "learning_rate": 9.579651618651748e-07, + "loss": 0.0512, + "step": 9562 + }, + { + "epoch": 0.81, + "grad_norm": 0.4384981931791563, + "learning_rate": 9.571620734535208e-07, + "loss": 0.1168, + "step": 9563 + }, + { + "epoch": 0.81, + "grad_norm": 0.22598879952824824, + "learning_rate": 9.563592861759867e-07, + "loss": 0.0728, + "step": 9564 + }, + { + "epoch": 0.81, + "grad_norm": 0.42204102673121263, + "learning_rate": 9.55556800092368e-07, + "loss": 0.0865, + "step": 9565 + }, + { + "epoch": 0.81, + "grad_norm": 0.21609389454811964, + "learning_rate": 9.54754615262441e-07, + "loss": 0.0637, + "step": 9566 + }, + { + "epoch": 0.81, + "grad_norm": 0.2211024456573294, + "learning_rate": 9.53952731745955e-07, + "loss": 0.0298, + "step": 9567 + }, + { + "epoch": 0.81, + "grad_norm": 0.3588321453206782, + "learning_rate": 9.531511496026397e-07, + "loss": 0.1004, + "step": 9568 + }, + { + "epoch": 0.81, + "grad_norm": 0.4431574812526109, + "learning_rate": 9.523498688922e-07, + "loss": 0.1096, + "step": 9569 + }, + { + "epoch": 0.81, + "grad_norm": 0.4132679353807532, + "learning_rate": 9.51548889674323e-07, + "loss": 0.0921, + "step": 9570 + }, + { + "epoch": 0.81, + "grad_norm": 0.33711368885112064, + "learning_rate": 9.507482120086686e-07, + "loss": 0.0876, + "step": 9571 + }, + { + "epoch": 0.81, + "grad_norm": 0.6457169768546224, + "learning_rate": 9.499478359548758e-07, + "loss": 0.0804, + "step": 9572 + }, + { + "epoch": 0.81, + "grad_norm": 0.4370573508670499, + "learning_rate": 9.491477615725603e-07, + "loss": 0.1121, + "step": 9573 + }, + { + "epoch": 0.81, + "grad_norm": 0.44020553545066887, + "learning_rate": 9.483479889213187e-07, + "loss": 0.1049, + "step": 9574 + }, + { + "epoch": 0.81, + "grad_norm": 0.282153746580046, + "learning_rate": 9.475485180607213e-07, + "loss": 0.0744, + "step": 9575 + }, + { + "epoch": 0.81, + "grad_norm": 0.290839404306481, + "learning_rate": 9.467493490503177e-07, + "loss": 0.0637, + "step": 9576 + }, + { + "epoch": 0.81, + "grad_norm": 0.13818578751209026, + "learning_rate": 9.459504819496345e-07, + "loss": 0.0175, + "step": 9577 + }, + { + "epoch": 0.81, + "grad_norm": 0.40812002917388324, + "learning_rate": 9.45151916818175e-07, + "loss": 0.0955, + "step": 9578 + }, + { + "epoch": 0.81, + "grad_norm": 0.3484525975033104, + "learning_rate": 9.44353653715423e-07, + "loss": 0.0504, + "step": 9579 + }, + { + "epoch": 0.81, + "grad_norm": 0.2567767141875635, + "learning_rate": 9.435556927008371e-07, + "loss": 0.0497, + "step": 9580 + }, + { + "epoch": 0.81, + "grad_norm": 0.2824550615456103, + "learning_rate": 9.427580338338532e-07, + "loss": 0.0768, + "step": 9581 + }, + { + "epoch": 0.81, + "grad_norm": 0.3380780173080835, + "learning_rate": 9.419606771738854e-07, + "loss": 0.0899, + "step": 9582 + }, + { + "epoch": 0.81, + "grad_norm": 0.3792074620002089, + "learning_rate": 9.411636227803267e-07, + "loss": 0.1188, + "step": 9583 + }, + { + "epoch": 0.81, + "grad_norm": 0.3060343305488842, + "learning_rate": 9.403668707125463e-07, + "loss": 0.083, + "step": 9584 + }, + { + "epoch": 0.81, + "grad_norm": 0.2841543855123277, + "learning_rate": 9.395704210298895e-07, + "loss": 0.0879, + "step": 9585 + }, + { + "epoch": 0.81, + "grad_norm": 0.3502374473609649, + "learning_rate": 9.387742737916822e-07, + "loss": 0.0962, + "step": 9586 + }, + { + "epoch": 0.81, + "grad_norm": 0.25842695784572767, + "learning_rate": 9.379784290572258e-07, + "loss": 0.0656, + "step": 9587 + }, + { + "epoch": 0.81, + "grad_norm": 0.4540296494458854, + "learning_rate": 9.371828868857974e-07, + "loss": 0.0712, + "step": 9588 + }, + { + "epoch": 0.81, + "grad_norm": 0.3432389174598196, + "learning_rate": 9.363876473366568e-07, + "loss": 0.0949, + "step": 9589 + }, + { + "epoch": 0.81, + "grad_norm": 0.22945750950744656, + "learning_rate": 9.355927104690366e-07, + "loss": 0.0465, + "step": 9590 + }, + { + "epoch": 0.81, + "grad_norm": 0.353773220103368, + "learning_rate": 9.347980763421482e-07, + "loss": 0.0738, + "step": 9591 + }, + { + "epoch": 0.81, + "grad_norm": 0.339449248143001, + "learning_rate": 9.340037450151789e-07, + "loss": 0.0845, + "step": 9592 + }, + { + "epoch": 0.81, + "grad_norm": 0.48394329738078856, + "learning_rate": 9.332097165472986e-07, + "loss": 0.0953, + "step": 9593 + }, + { + "epoch": 0.81, + "grad_norm": 0.3023242834286774, + "learning_rate": 9.324159909976493e-07, + "loss": 0.0978, + "step": 9594 + }, + { + "epoch": 0.81, + "grad_norm": 0.5094623349513073, + "learning_rate": 9.316225684253511e-07, + "loss": 0.1073, + "step": 9595 + }, + { + "epoch": 0.81, + "grad_norm": 0.2784479387712475, + "learning_rate": 9.308294488895053e-07, + "loss": 0.0776, + "step": 9596 + }, + { + "epoch": 0.81, + "grad_norm": 0.6926111362898754, + "learning_rate": 9.300366324491872e-07, + "loss": 0.1343, + "step": 9597 + }, + { + "epoch": 0.81, + "grad_norm": 0.4406050064398591, + "learning_rate": 9.292441191634494e-07, + "loss": 0.106, + "step": 9598 + }, + { + "epoch": 0.81, + "grad_norm": 0.4480923296057095, + "learning_rate": 9.284519090913225e-07, + "loss": 0.128, + "step": 9599 + }, + { + "epoch": 0.81, + "grad_norm": 0.5023795495712815, + "learning_rate": 9.276600022918176e-07, + "loss": 0.0817, + "step": 9600 + }, + { + "epoch": 0.81, + "grad_norm": 0.5075426578494602, + "learning_rate": 9.268683988239186e-07, + "loss": 0.0985, + "step": 9601 + }, + { + "epoch": 0.81, + "grad_norm": 0.3027121547749337, + "learning_rate": 9.260770987465873e-07, + "loss": 0.0854, + "step": 9602 + }, + { + "epoch": 0.81, + "grad_norm": 0.22173060992373933, + "learning_rate": 9.252861021187676e-07, + "loss": 0.0442, + "step": 9603 + }, + { + "epoch": 0.81, + "grad_norm": 0.31549545033445914, + "learning_rate": 9.244954089993762e-07, + "loss": 0.095, + "step": 9604 + }, + { + "epoch": 0.81, + "grad_norm": 0.5168590521469552, + "learning_rate": 9.237050194473068e-07, + "loss": 0.0288, + "step": 9605 + }, + { + "epoch": 0.81, + "grad_norm": 0.3615744731190038, + "learning_rate": 9.229149335214349e-07, + "loss": 0.0597, + "step": 9606 + }, + { + "epoch": 0.81, + "grad_norm": 0.3767879314837329, + "learning_rate": 9.221251512806095e-07, + "loss": 0.1055, + "step": 9607 + }, + { + "epoch": 0.81, + "grad_norm": 0.4639846995779717, + "learning_rate": 9.213356727836587e-07, + "loss": 0.107, + "step": 9608 + }, + { + "epoch": 0.81, + "grad_norm": 0.4010130271982931, + "learning_rate": 9.205464980893852e-07, + "loss": 0.0542, + "step": 9609 + }, + { + "epoch": 0.81, + "grad_norm": 0.4685865165614232, + "learning_rate": 9.197576272565744e-07, + "loss": 0.1397, + "step": 9610 + }, + { + "epoch": 0.81, + "grad_norm": 0.6573985325570642, + "learning_rate": 9.18969060343985e-07, + "loss": 0.1402, + "step": 9611 + }, + { + "epoch": 0.81, + "grad_norm": 0.310333291169207, + "learning_rate": 9.181807974103524e-07, + "loss": 0.0808, + "step": 9612 + }, + { + "epoch": 0.81, + "grad_norm": 0.2746032677666339, + "learning_rate": 9.173928385143932e-07, + "loss": 0.0734, + "step": 9613 + }, + { + "epoch": 0.81, + "grad_norm": 0.4903263931213346, + "learning_rate": 9.166051837147988e-07, + "loss": 0.0958, + "step": 9614 + }, + { + "epoch": 0.81, + "grad_norm": 0.1866404093254978, + "learning_rate": 9.158178330702378e-07, + "loss": 0.0565, + "step": 9615 + }, + { + "epoch": 0.81, + "grad_norm": 0.46527643199952196, + "learning_rate": 9.150307866393554e-07, + "loss": 0.084, + "step": 9616 + }, + { + "epoch": 0.81, + "grad_norm": 0.3095889462978008, + "learning_rate": 9.142440444807782e-07, + "loss": 0.06, + "step": 9617 + }, + { + "epoch": 0.81, + "grad_norm": 0.3267215213323229, + "learning_rate": 9.134576066531054e-07, + "loss": 0.0721, + "step": 9618 + }, + { + "epoch": 0.81, + "grad_norm": 0.30233497404182386, + "learning_rate": 9.12671473214915e-07, + "loss": 0.0731, + "step": 9619 + }, + { + "epoch": 0.81, + "grad_norm": 0.31526999086031327, + "learning_rate": 9.118856442247648e-07, + "loss": 0.0793, + "step": 9620 + }, + { + "epoch": 0.81, + "grad_norm": 0.2683763604434798, + "learning_rate": 9.111001197411867e-07, + "loss": 0.0754, + "step": 9621 + }, + { + "epoch": 0.81, + "grad_norm": 0.36007606888687704, + "learning_rate": 9.103148998226902e-07, + "loss": 0.0874, + "step": 9622 + }, + { + "epoch": 0.81, + "grad_norm": 0.3556495163568835, + "learning_rate": 9.095299845277655e-07, + "loss": 0.1077, + "step": 9623 + }, + { + "epoch": 0.81, + "grad_norm": 0.401886522684654, + "learning_rate": 9.087453739148761e-07, + "loss": 0.1367, + "step": 9624 + }, + { + "epoch": 0.81, + "grad_norm": 0.21200826045047594, + "learning_rate": 9.07961068042465e-07, + "loss": 0.0569, + "step": 9625 + }, + { + "epoch": 0.81, + "grad_norm": 0.4233207165730966, + "learning_rate": 9.071770669689495e-07, + "loss": 0.1008, + "step": 9626 + }, + { + "epoch": 0.81, + "grad_norm": 0.2700683691074575, + "learning_rate": 9.063933707527305e-07, + "loss": 0.0752, + "step": 9627 + }, + { + "epoch": 0.81, + "grad_norm": 0.4493900028978525, + "learning_rate": 9.0560997945218e-07, + "loss": 0.1133, + "step": 9628 + }, + { + "epoch": 0.81, + "grad_norm": 0.1854490994459956, + "learning_rate": 9.048268931256482e-07, + "loss": 0.0394, + "step": 9629 + }, + { + "epoch": 0.81, + "grad_norm": 0.23903462925591118, + "learning_rate": 9.040441118314669e-07, + "loss": 0.0865, + "step": 9630 + }, + { + "epoch": 0.81, + "grad_norm": 0.14879659256639505, + "learning_rate": 9.032616356279412e-07, + "loss": 0.0462, + "step": 9631 + }, + { + "epoch": 0.81, + "grad_norm": 0.20321658335036927, + "learning_rate": 9.024794645733543e-07, + "loss": 0.0421, + "step": 9632 + }, + { + "epoch": 0.81, + "grad_norm": 0.33960720971075176, + "learning_rate": 9.016975987259647e-07, + "loss": 0.0838, + "step": 9633 + }, + { + "epoch": 0.81, + "grad_norm": 0.21702327514100297, + "learning_rate": 9.009160381440141e-07, + "loss": 0.0601, + "step": 9634 + }, + { + "epoch": 0.81, + "grad_norm": 0.34677334025746076, + "learning_rate": 9.00134782885716e-07, + "loss": 0.0777, + "step": 9635 + }, + { + "epoch": 0.81, + "grad_norm": 0.3129532603409501, + "learning_rate": 8.993538330092627e-07, + "loss": 0.1071, + "step": 9636 + }, + { + "epoch": 0.81, + "grad_norm": 0.3728818018838984, + "learning_rate": 8.985731885728221e-07, + "loss": 0.0772, + "step": 9637 + }, + { + "epoch": 0.81, + "grad_norm": 0.6312124071541648, + "learning_rate": 8.97792849634545e-07, + "loss": 0.1224, + "step": 9638 + }, + { + "epoch": 0.81, + "grad_norm": 0.3024944979026759, + "learning_rate": 8.970128162525532e-07, + "loss": 0.0592, + "step": 9639 + }, + { + "epoch": 0.81, + "grad_norm": 0.2905629021377918, + "learning_rate": 8.962330884849485e-07, + "loss": 0.0723, + "step": 9640 + }, + { + "epoch": 0.81, + "grad_norm": 0.3177480164517761, + "learning_rate": 8.954536663898084e-07, + "loss": 0.0724, + "step": 9641 + }, + { + "epoch": 0.81, + "grad_norm": 0.27600637740816836, + "learning_rate": 8.946745500251913e-07, + "loss": 0.0629, + "step": 9642 + }, + { + "epoch": 0.81, + "grad_norm": 0.25054918317991387, + "learning_rate": 8.938957394491293e-07, + "loss": 0.0713, + "step": 9643 + }, + { + "epoch": 0.81, + "grad_norm": 0.28250026721585486, + "learning_rate": 8.931172347196321e-07, + "loss": 0.0901, + "step": 9644 + }, + { + "epoch": 0.81, + "grad_norm": 0.26270272271383177, + "learning_rate": 8.923390358946876e-07, + "loss": 0.0458, + "step": 9645 + }, + { + "epoch": 0.81, + "grad_norm": 0.5098695170702431, + "learning_rate": 8.915611430322591e-07, + "loss": 0.0979, + "step": 9646 + }, + { + "epoch": 0.81, + "grad_norm": 0.31923681846017427, + "learning_rate": 8.907835561902916e-07, + "loss": 0.0709, + "step": 9647 + }, + { + "epoch": 0.81, + "grad_norm": 0.34218251105136327, + "learning_rate": 8.900062754267031e-07, + "loss": 0.0542, + "step": 9648 + }, + { + "epoch": 0.81, + "grad_norm": 0.41589290847448607, + "learning_rate": 8.892293007993891e-07, + "loss": 0.096, + "step": 9649 + }, + { + "epoch": 0.81, + "grad_norm": 0.4092742837999429, + "learning_rate": 8.884526323662229e-07, + "loss": 0.0756, + "step": 9650 + }, + { + "epoch": 0.81, + "grad_norm": 0.2707721423005706, + "learning_rate": 8.876762701850572e-07, + "loss": 0.0564, + "step": 9651 + }, + { + "epoch": 0.81, + "grad_norm": 0.3601308893178154, + "learning_rate": 8.869002143137196e-07, + "loss": 0.0636, + "step": 9652 + }, + { + "epoch": 0.81, + "grad_norm": 0.3615333984680277, + "learning_rate": 8.861244648100126e-07, + "loss": 0.0842, + "step": 9653 + }, + { + "epoch": 0.81, + "grad_norm": 0.2408710141193774, + "learning_rate": 8.853490217317223e-07, + "loss": 0.0721, + "step": 9654 + }, + { + "epoch": 0.81, + "grad_norm": 0.48325883789266844, + "learning_rate": 8.845738851366059e-07, + "loss": 0.1152, + "step": 9655 + }, + { + "epoch": 0.81, + "grad_norm": 0.2105906020620536, + "learning_rate": 8.83799055082401e-07, + "loss": 0.0506, + "step": 9656 + }, + { + "epoch": 0.81, + "grad_norm": 1.0781274809704673, + "learning_rate": 8.830245316268198e-07, + "loss": 0.1456, + "step": 9657 + }, + { + "epoch": 0.81, + "grad_norm": 0.2580514923924286, + "learning_rate": 8.822503148275562e-07, + "loss": 0.0539, + "step": 9658 + }, + { + "epoch": 0.81, + "grad_norm": 0.39631906910048026, + "learning_rate": 8.814764047422769e-07, + "loss": 0.0977, + "step": 9659 + }, + { + "epoch": 0.81, + "grad_norm": 0.4855048831608224, + "learning_rate": 8.807028014286257e-07, + "loss": 0.1217, + "step": 9660 + }, + { + "epoch": 0.81, + "grad_norm": 0.2794252456682048, + "learning_rate": 8.799295049442275e-07, + "loss": 0.064, + "step": 9661 + }, + { + "epoch": 0.81, + "grad_norm": 0.38782355425491977, + "learning_rate": 8.791565153466807e-07, + "loss": 0.0965, + "step": 9662 + }, + { + "epoch": 0.81, + "grad_norm": 0.2740909304583038, + "learning_rate": 8.783838326935617e-07, + "loss": 0.0786, + "step": 9663 + }, + { + "epoch": 0.81, + "grad_norm": 0.3865933825942271, + "learning_rate": 8.77611457042426e-07, + "loss": 0.0912, + "step": 9664 + }, + { + "epoch": 0.81, + "grad_norm": 0.2947881885094054, + "learning_rate": 8.768393884508042e-07, + "loss": 0.0796, + "step": 9665 + }, + { + "epoch": 0.81, + "grad_norm": 0.4047619633106657, + "learning_rate": 8.760676269762031e-07, + "loss": 0.1295, + "step": 9666 + }, + { + "epoch": 0.81, + "grad_norm": 0.5082922812068742, + "learning_rate": 8.752961726761084e-07, + "loss": 0.1151, + "step": 9667 + }, + { + "epoch": 0.81, + "grad_norm": 0.3597204793766933, + "learning_rate": 8.745250256079835e-07, + "loss": 0.0664, + "step": 9668 + }, + { + "epoch": 0.81, + "grad_norm": 0.32292411416653366, + "learning_rate": 8.737541858292675e-07, + "loss": 0.1, + "step": 9669 + }, + { + "epoch": 0.81, + "grad_norm": 0.3590337515021964, + "learning_rate": 8.729836533973757e-07, + "loss": 0.094, + "step": 9670 + }, + { + "epoch": 0.81, + "grad_norm": 0.3800466392767771, + "learning_rate": 8.722134283697048e-07, + "loss": 0.0994, + "step": 9671 + }, + { + "epoch": 0.81, + "grad_norm": 0.25841107649520534, + "learning_rate": 8.714435108036235e-07, + "loss": 0.0568, + "step": 9672 + }, + { + "epoch": 0.82, + "grad_norm": 0.4184436492346018, + "learning_rate": 8.706739007564796e-07, + "loss": 0.1136, + "step": 9673 + }, + { + "epoch": 0.82, + "grad_norm": 0.45465173069109066, + "learning_rate": 8.69904598285598e-07, + "loss": 0.0447, + "step": 9674 + }, + { + "epoch": 0.82, + "grad_norm": 0.20034083049690538, + "learning_rate": 8.691356034482828e-07, + "loss": 0.059, + "step": 9675 + }, + { + "epoch": 0.82, + "grad_norm": 0.35598775528681215, + "learning_rate": 8.683669163018116e-07, + "loss": 0.0667, + "step": 9676 + }, + { + "epoch": 0.82, + "grad_norm": 0.2120594666942776, + "learning_rate": 8.675985369034401e-07, + "loss": 0.0767, + "step": 9677 + }, + { + "epoch": 0.82, + "grad_norm": 0.26480759831642103, + "learning_rate": 8.668304653104037e-07, + "loss": 0.0903, + "step": 9678 + }, + { + "epoch": 0.82, + "grad_norm": 0.3187643372457018, + "learning_rate": 8.660627015799116e-07, + "loss": 0.0904, + "step": 9679 + }, + { + "epoch": 0.82, + "grad_norm": 0.29471329751798747, + "learning_rate": 8.652952457691505e-07, + "loss": 0.0729, + "step": 9680 + }, + { + "epoch": 0.82, + "grad_norm": 0.44756574075472416, + "learning_rate": 8.645280979352871e-07, + "loss": 0.0971, + "step": 9681 + }, + { + "epoch": 0.82, + "grad_norm": 0.2583459264812097, + "learning_rate": 8.637612581354615e-07, + "loss": 0.0568, + "step": 9682 + }, + { + "epoch": 0.82, + "grad_norm": 0.3131310293250757, + "learning_rate": 8.629947264267935e-07, + "loss": 0.0754, + "step": 9683 + }, + { + "epoch": 0.82, + "grad_norm": 0.24431402058383406, + "learning_rate": 8.622285028663762e-07, + "loss": 0.058, + "step": 9684 + }, + { + "epoch": 0.82, + "grad_norm": 0.2477901129582285, + "learning_rate": 8.614625875112859e-07, + "loss": 0.0617, + "step": 9685 + }, + { + "epoch": 0.82, + "grad_norm": 0.2780079542352381, + "learning_rate": 8.606969804185706e-07, + "loss": 0.0882, + "step": 9686 + }, + { + "epoch": 0.82, + "grad_norm": 0.24351735002130515, + "learning_rate": 8.599316816452563e-07, + "loss": 0.0745, + "step": 9687 + }, + { + "epoch": 0.82, + "grad_norm": 0.3022830131179911, + "learning_rate": 8.591666912483493e-07, + "loss": 0.0844, + "step": 9688 + }, + { + "epoch": 0.82, + "grad_norm": 0.2783109160581392, + "learning_rate": 8.584020092848289e-07, + "loss": 0.0593, + "step": 9689 + }, + { + "epoch": 0.82, + "grad_norm": 0.27278334289045836, + "learning_rate": 8.57637635811654e-07, + "loss": 0.0727, + "step": 9690 + }, + { + "epoch": 0.82, + "grad_norm": 0.6362226072160814, + "learning_rate": 8.568735708857573e-07, + "loss": 0.1543, + "step": 9691 + }, + { + "epoch": 0.82, + "grad_norm": 0.22384616884927755, + "learning_rate": 8.561098145640546e-07, + "loss": 0.0672, + "step": 9692 + }, + { + "epoch": 0.82, + "grad_norm": 0.5007086434591281, + "learning_rate": 8.553463669034317e-07, + "loss": 0.11, + "step": 9693 + }, + { + "epoch": 0.82, + "grad_norm": 0.2319049436059855, + "learning_rate": 8.545832279607552e-07, + "loss": 0.0724, + "step": 9694 + }, + { + "epoch": 0.82, + "grad_norm": 0.48532287309033173, + "learning_rate": 8.538203977928699e-07, + "loss": 0.0996, + "step": 9695 + }, + { + "epoch": 0.82, + "grad_norm": 0.2548642543218809, + "learning_rate": 8.53057876456595e-07, + "loss": 0.0686, + "step": 9696 + }, + { + "epoch": 0.82, + "grad_norm": 0.5413586283716167, + "learning_rate": 8.522956640087254e-07, + "loss": 0.1078, + "step": 9697 + }, + { + "epoch": 0.82, + "grad_norm": 0.2726265262432998, + "learning_rate": 8.515337605060386e-07, + "loss": 0.0845, + "step": 9698 + }, + { + "epoch": 0.82, + "grad_norm": 0.27484376516497994, + "learning_rate": 8.507721660052837e-07, + "loss": 0.0883, + "step": 9699 + }, + { + "epoch": 0.82, + "grad_norm": 0.5342369722961724, + "learning_rate": 8.500108805631885e-07, + "loss": 0.1306, + "step": 9700 + }, + { + "epoch": 0.82, + "grad_norm": 0.22049548489486223, + "learning_rate": 8.492499042364578e-07, + "loss": 0.0579, + "step": 9701 + }, + { + "epoch": 0.82, + "grad_norm": 0.5504009026710616, + "learning_rate": 8.484892370817749e-07, + "loss": 0.1113, + "step": 9702 + }, + { + "epoch": 0.82, + "grad_norm": 0.21938906435708538, + "learning_rate": 8.477288791557986e-07, + "loss": 0.0654, + "step": 9703 + }, + { + "epoch": 0.82, + "grad_norm": 0.4846058905822086, + "learning_rate": 8.469688305151635e-07, + "loss": 0.1148, + "step": 9704 + }, + { + "epoch": 0.82, + "grad_norm": 0.28037140948560935, + "learning_rate": 8.462090912164822e-07, + "loss": 0.0813, + "step": 9705 + }, + { + "epoch": 0.82, + "grad_norm": 0.5062864225993273, + "learning_rate": 8.454496613163465e-07, + "loss": 0.1096, + "step": 9706 + }, + { + "epoch": 0.82, + "grad_norm": 0.4932077781360014, + "learning_rate": 8.446905408713219e-07, + "loss": 0.1038, + "step": 9707 + }, + { + "epoch": 0.82, + "grad_norm": 0.39720690186103114, + "learning_rate": 8.439317299379524e-07, + "loss": 0.0952, + "step": 9708 + }, + { + "epoch": 0.82, + "grad_norm": 0.6527065422606126, + "learning_rate": 8.431732285727579e-07, + "loss": 0.1233, + "step": 9709 + }, + { + "epoch": 0.82, + "grad_norm": 0.3527215027975017, + "learning_rate": 8.424150368322371e-07, + "loss": 0.0934, + "step": 9710 + }, + { + "epoch": 0.82, + "grad_norm": 0.2517436879589864, + "learning_rate": 8.416571547728641e-07, + "loss": 0.0596, + "step": 9711 + }, + { + "epoch": 0.82, + "grad_norm": 0.28916597944144945, + "learning_rate": 8.408995824510907e-07, + "loss": 0.0753, + "step": 9712 + }, + { + "epoch": 0.82, + "grad_norm": 0.23550095085639877, + "learning_rate": 8.401423199233449e-07, + "loss": 0.0788, + "step": 9713 + }, + { + "epoch": 0.82, + "grad_norm": 0.3846497286855851, + "learning_rate": 8.393853672460306e-07, + "loss": 0.1013, + "step": 9714 + }, + { + "epoch": 0.82, + "grad_norm": 0.3449298251335863, + "learning_rate": 8.386287244755331e-07, + "loss": 0.0828, + "step": 9715 + }, + { + "epoch": 0.82, + "grad_norm": 0.2957821328828303, + "learning_rate": 8.378723916682096e-07, + "loss": 0.0518, + "step": 9716 + }, + { + "epoch": 0.82, + "grad_norm": 0.4045435212096391, + "learning_rate": 8.371163688803968e-07, + "loss": 0.1151, + "step": 9717 + }, + { + "epoch": 0.82, + "grad_norm": 0.21285318599741368, + "learning_rate": 8.363606561684062e-07, + "loss": 0.0651, + "step": 9718 + }, + { + "epoch": 0.82, + "grad_norm": 0.4037082978472771, + "learning_rate": 8.356052535885307e-07, + "loss": 0.0841, + "step": 9719 + }, + { + "epoch": 0.82, + "grad_norm": 0.402477557504167, + "learning_rate": 8.348501611970345e-07, + "loss": 0.0704, + "step": 9720 + }, + { + "epoch": 0.82, + "grad_norm": 0.29674799902788374, + "learning_rate": 8.34095379050161e-07, + "loss": 0.0922, + "step": 9721 + }, + { + "epoch": 0.82, + "grad_norm": 0.2524671372615894, + "learning_rate": 8.333409072041332e-07, + "loss": 0.0532, + "step": 9722 + }, + { + "epoch": 0.82, + "grad_norm": 0.35078220262974263, + "learning_rate": 8.325867457151471e-07, + "loss": 0.0736, + "step": 9723 + }, + { + "epoch": 0.82, + "grad_norm": 0.2892492010107794, + "learning_rate": 8.318328946393772e-07, + "loss": 0.0667, + "step": 9724 + }, + { + "epoch": 0.82, + "grad_norm": 0.3746811910499197, + "learning_rate": 8.310793540329737e-07, + "loss": 0.0821, + "step": 9725 + }, + { + "epoch": 0.82, + "grad_norm": 0.4362615617310372, + "learning_rate": 8.303261239520665e-07, + "loss": 0.0819, + "step": 9726 + }, + { + "epoch": 0.82, + "grad_norm": 0.24611361178384084, + "learning_rate": 8.295732044527599e-07, + "loss": 0.064, + "step": 9727 + }, + { + "epoch": 0.82, + "grad_norm": 0.2986013022063484, + "learning_rate": 8.288205955911344e-07, + "loss": 0.0407, + "step": 9728 + }, + { + "epoch": 0.82, + "grad_norm": 0.2784203450073894, + "learning_rate": 8.280682974232512e-07, + "loss": 0.0459, + "step": 9729 + }, + { + "epoch": 0.82, + "grad_norm": 0.35718324062087736, + "learning_rate": 8.273163100051445e-07, + "loss": 0.1046, + "step": 9730 + }, + { + "epoch": 0.82, + "grad_norm": 0.36596675772184795, + "learning_rate": 8.265646333928251e-07, + "loss": 0.0826, + "step": 9731 + }, + { + "epoch": 0.82, + "grad_norm": 0.30634599996325323, + "learning_rate": 8.258132676422853e-07, + "loss": 0.0757, + "step": 9732 + }, + { + "epoch": 0.82, + "grad_norm": 0.2176533675054571, + "learning_rate": 8.250622128094899e-07, + "loss": 0.0395, + "step": 9733 + }, + { + "epoch": 0.82, + "grad_norm": 0.31737432489957107, + "learning_rate": 8.243114689503817e-07, + "loss": 0.086, + "step": 9734 + }, + { + "epoch": 0.82, + "grad_norm": 0.30669122203703475, + "learning_rate": 8.23561036120879e-07, + "loss": 0.0854, + "step": 9735 + }, + { + "epoch": 0.82, + "grad_norm": 0.2679540839682881, + "learning_rate": 8.228109143768815e-07, + "loss": 0.0942, + "step": 9736 + }, + { + "epoch": 0.82, + "grad_norm": 0.29047022727739147, + "learning_rate": 8.220611037742604e-07, + "loss": 0.0619, + "step": 9737 + }, + { + "epoch": 0.82, + "grad_norm": 0.6248323078678272, + "learning_rate": 8.213116043688657e-07, + "loss": 0.094, + "step": 9738 + }, + { + "epoch": 0.82, + "grad_norm": 0.22436947384991715, + "learning_rate": 8.205624162165271e-07, + "loss": 0.0794, + "step": 9739 + }, + { + "epoch": 0.82, + "grad_norm": 0.3052104152800874, + "learning_rate": 8.198135393730461e-07, + "loss": 0.0767, + "step": 9740 + }, + { + "epoch": 0.82, + "grad_norm": 0.21375665884849118, + "learning_rate": 8.190649738942041e-07, + "loss": 0.0801, + "step": 9741 + }, + { + "epoch": 0.82, + "grad_norm": 0.5298953966148979, + "learning_rate": 8.183167198357578e-07, + "loss": 0.0812, + "step": 9742 + }, + { + "epoch": 0.82, + "grad_norm": 0.31951539857083106, + "learning_rate": 8.17568777253443e-07, + "loss": 0.0606, + "step": 9743 + }, + { + "epoch": 0.82, + "grad_norm": 0.35860945363601293, + "learning_rate": 8.168211462029707e-07, + "loss": 0.0822, + "step": 9744 + }, + { + "epoch": 0.82, + "grad_norm": 0.20310627136584156, + "learning_rate": 8.160738267400265e-07, + "loss": 0.0372, + "step": 9745 + }, + { + "epoch": 0.82, + "grad_norm": 0.34219474493738816, + "learning_rate": 8.153268189202785e-07, + "loss": 0.1077, + "step": 9746 + }, + { + "epoch": 0.82, + "grad_norm": 0.24892287379396272, + "learning_rate": 8.145801227993661e-07, + "loss": 0.0434, + "step": 9747 + }, + { + "epoch": 0.82, + "grad_norm": 0.2989440114665037, + "learning_rate": 8.138337384329087e-07, + "loss": 0.0569, + "step": 9748 + }, + { + "epoch": 0.82, + "grad_norm": 0.3841773142884086, + "learning_rate": 8.130876658764986e-07, + "loss": 0.1023, + "step": 9749 + }, + { + "epoch": 0.82, + "grad_norm": 0.4970036268749609, + "learning_rate": 8.123419051857118e-07, + "loss": 0.0982, + "step": 9750 + }, + { + "epoch": 0.82, + "grad_norm": 0.3378261401661527, + "learning_rate": 8.115964564160944e-07, + "loss": 0.1019, + "step": 9751 + }, + { + "epoch": 0.82, + "grad_norm": 0.4060210129193685, + "learning_rate": 8.10851319623171e-07, + "loss": 0.0815, + "step": 9752 + }, + { + "epoch": 0.82, + "grad_norm": 0.22267634863716657, + "learning_rate": 8.10106494862446e-07, + "loss": 0.0632, + "step": 9753 + }, + { + "epoch": 0.82, + "grad_norm": 0.3521131553401419, + "learning_rate": 8.093619821893972e-07, + "loss": 0.0934, + "step": 9754 + }, + { + "epoch": 0.82, + "grad_norm": 0.5570641999382414, + "learning_rate": 8.086177816594792e-07, + "loss": 0.14, + "step": 9755 + }, + { + "epoch": 0.82, + "grad_norm": 0.30863703812741766, + "learning_rate": 8.078738933281266e-07, + "loss": 0.0919, + "step": 9756 + }, + { + "epoch": 0.82, + "grad_norm": 0.2969089564892958, + "learning_rate": 8.071303172507472e-07, + "loss": 0.0754, + "step": 9757 + }, + { + "epoch": 0.82, + "grad_norm": 0.5727093831307662, + "learning_rate": 8.063870534827273e-07, + "loss": 0.1493, + "step": 9758 + }, + { + "epoch": 0.82, + "grad_norm": 0.5449620745945319, + "learning_rate": 8.056441020794275e-07, + "loss": 0.1182, + "step": 9759 + }, + { + "epoch": 0.82, + "grad_norm": 0.2351460603334859, + "learning_rate": 8.049014630961905e-07, + "loss": 0.0371, + "step": 9760 + }, + { + "epoch": 0.82, + "grad_norm": 0.32908295144057753, + "learning_rate": 8.041591365883312e-07, + "loss": 0.119, + "step": 9761 + }, + { + "epoch": 0.82, + "grad_norm": 0.32139705546893066, + "learning_rate": 8.034171226111404e-07, + "loss": 0.0875, + "step": 9762 + }, + { + "epoch": 0.82, + "grad_norm": 0.39166941152477774, + "learning_rate": 8.0267542121989e-07, + "loss": 0.1213, + "step": 9763 + }, + { + "epoch": 0.82, + "grad_norm": 0.5944846542499271, + "learning_rate": 8.019340324698261e-07, + "loss": 0.085, + "step": 9764 + }, + { + "epoch": 0.82, + "grad_norm": 0.2622798464119927, + "learning_rate": 8.011929564161708e-07, + "loss": 0.0816, + "step": 9765 + }, + { + "epoch": 0.82, + "grad_norm": 0.22659022794020647, + "learning_rate": 8.004521931141223e-07, + "loss": 0.0566, + "step": 9766 + }, + { + "epoch": 0.82, + "grad_norm": 0.23989737438444092, + "learning_rate": 7.997117426188606e-07, + "loss": 0.051, + "step": 9767 + }, + { + "epoch": 0.82, + "grad_norm": 0.3828085526897713, + "learning_rate": 7.989716049855362e-07, + "loss": 0.1027, + "step": 9768 + }, + { + "epoch": 0.82, + "grad_norm": 0.22102970608373645, + "learning_rate": 7.982317802692785e-07, + "loss": 0.0315, + "step": 9769 + }, + { + "epoch": 0.82, + "grad_norm": 0.45992663464078987, + "learning_rate": 7.974922685251962e-07, + "loss": 0.0965, + "step": 9770 + }, + { + "epoch": 0.82, + "grad_norm": 0.3044792386492315, + "learning_rate": 7.967530698083715e-07, + "loss": 0.0566, + "step": 9771 + }, + { + "epoch": 0.82, + "grad_norm": 0.21138835304350942, + "learning_rate": 7.960141841738633e-07, + "loss": 0.0492, + "step": 9772 + }, + { + "epoch": 0.82, + "grad_norm": 0.6716361210464431, + "learning_rate": 7.952756116767074e-07, + "loss": 0.1559, + "step": 9773 + }, + { + "epoch": 0.82, + "grad_norm": 0.21035813725060956, + "learning_rate": 7.945373523719196e-07, + "loss": 0.0524, + "step": 9774 + }, + { + "epoch": 0.82, + "grad_norm": 0.21383125815789836, + "learning_rate": 7.937994063144888e-07, + "loss": 0.0411, + "step": 9775 + }, + { + "epoch": 0.82, + "grad_norm": 0.3068827279937841, + "learning_rate": 7.93061773559381e-07, + "loss": 0.0944, + "step": 9776 + }, + { + "epoch": 0.82, + "grad_norm": 0.2448350614156476, + "learning_rate": 7.923244541615383e-07, + "loss": 0.0784, + "step": 9777 + }, + { + "epoch": 0.82, + "grad_norm": 0.41737729715777283, + "learning_rate": 7.915874481758829e-07, + "loss": 0.1093, + "step": 9778 + }, + { + "epoch": 0.82, + "grad_norm": 0.3533633694578978, + "learning_rate": 7.908507556573103e-07, + "loss": 0.0727, + "step": 9779 + }, + { + "epoch": 0.82, + "grad_norm": 0.30464384816873197, + "learning_rate": 7.901143766606933e-07, + "loss": 0.1024, + "step": 9780 + }, + { + "epoch": 0.82, + "grad_norm": 0.3386405752903696, + "learning_rate": 7.893783112408821e-07, + "loss": 0.0966, + "step": 9781 + }, + { + "epoch": 0.82, + "grad_norm": 0.28067305965953726, + "learning_rate": 7.886425594527014e-07, + "loss": 0.0716, + "step": 9782 + }, + { + "epoch": 0.82, + "grad_norm": 0.31493557318688786, + "learning_rate": 7.879071213509576e-07, + "loss": 0.0696, + "step": 9783 + }, + { + "epoch": 0.82, + "grad_norm": 0.31860440494469616, + "learning_rate": 7.871719969904284e-07, + "loss": 0.061, + "step": 9784 + }, + { + "epoch": 0.82, + "grad_norm": 0.36253219616062227, + "learning_rate": 7.864371864258702e-07, + "loss": 0.0458, + "step": 9785 + }, + { + "epoch": 0.82, + "grad_norm": 0.2659507727252553, + "learning_rate": 7.857026897120151e-07, + "loss": 0.0386, + "step": 9786 + }, + { + "epoch": 0.82, + "grad_norm": 0.25039686848856685, + "learning_rate": 7.84968506903575e-07, + "loss": 0.0631, + "step": 9787 + }, + { + "epoch": 0.82, + "grad_norm": 0.2507733380653957, + "learning_rate": 7.842346380552351e-07, + "loss": 0.0519, + "step": 9788 + }, + { + "epoch": 0.82, + "grad_norm": 0.5127571888203863, + "learning_rate": 7.835010832216567e-07, + "loss": 0.1039, + "step": 9789 + }, + { + "epoch": 0.82, + "grad_norm": 0.4018434322536532, + "learning_rate": 7.827678424574819e-07, + "loss": 0.0787, + "step": 9790 + }, + { + "epoch": 0.82, + "grad_norm": 0.5212942016039955, + "learning_rate": 7.820349158173252e-07, + "loss": 0.0997, + "step": 9791 + }, + { + "epoch": 0.83, + "grad_norm": 0.18456911570363677, + "learning_rate": 7.813023033557793e-07, + "loss": 0.0777, + "step": 9792 + }, + { + "epoch": 0.83, + "grad_norm": 0.23803338174440045, + "learning_rate": 7.805700051274123e-07, + "loss": 0.054, + "step": 9793 + }, + { + "epoch": 0.83, + "grad_norm": 0.23918196853071208, + "learning_rate": 7.798380211867729e-07, + "loss": 0.0566, + "step": 9794 + }, + { + "epoch": 0.83, + "grad_norm": 0.2736766113941927, + "learning_rate": 7.791063515883817e-07, + "loss": 0.0999, + "step": 9795 + }, + { + "epoch": 0.83, + "grad_norm": 0.2720499951863573, + "learning_rate": 7.783749963867366e-07, + "loss": 0.0712, + "step": 9796 + }, + { + "epoch": 0.83, + "grad_norm": 0.313492814100594, + "learning_rate": 7.776439556363158e-07, + "loss": 0.0783, + "step": 9797 + }, + { + "epoch": 0.83, + "grad_norm": 0.4635126441496074, + "learning_rate": 7.769132293915705e-07, + "loss": 0.0977, + "step": 9798 + }, + { + "epoch": 0.83, + "grad_norm": 0.5340604764904753, + "learning_rate": 7.761828177069292e-07, + "loss": 0.1237, + "step": 9799 + }, + { + "epoch": 0.83, + "grad_norm": 0.30593321770160103, + "learning_rate": 7.754527206367957e-07, + "loss": 0.0683, + "step": 9800 + }, + { + "epoch": 0.83, + "grad_norm": 0.40228543747457834, + "learning_rate": 7.747229382355547e-07, + "loss": 0.0699, + "step": 9801 + }, + { + "epoch": 0.83, + "grad_norm": 0.41585589051746025, + "learning_rate": 7.739934705575636e-07, + "loss": 0.0817, + "step": 9802 + }, + { + "epoch": 0.83, + "grad_norm": 0.2988102249065971, + "learning_rate": 7.732643176571553e-07, + "loss": 0.0688, + "step": 9803 + }, + { + "epoch": 0.83, + "grad_norm": 0.3286184791322606, + "learning_rate": 7.725354795886448e-07, + "loss": 0.0757, + "step": 9804 + }, + { + "epoch": 0.83, + "grad_norm": 0.301890543929499, + "learning_rate": 7.718069564063185e-07, + "loss": 0.086, + "step": 9805 + }, + { + "epoch": 0.83, + "grad_norm": 0.1906780526197197, + "learning_rate": 7.710787481644399e-07, + "loss": 0.035, + "step": 9806 + }, + { + "epoch": 0.83, + "grad_norm": 0.39161367682091747, + "learning_rate": 7.703508549172528e-07, + "loss": 0.0807, + "step": 9807 + }, + { + "epoch": 0.83, + "grad_norm": 0.25080202890267445, + "learning_rate": 7.696232767189732e-07, + "loss": 0.0416, + "step": 9808 + }, + { + "epoch": 0.83, + "grad_norm": 0.27709767937567886, + "learning_rate": 7.68896013623796e-07, + "loss": 0.0632, + "step": 9809 + }, + { + "epoch": 0.83, + "grad_norm": 0.41341307646340014, + "learning_rate": 7.681690656858904e-07, + "loss": 0.1034, + "step": 9810 + }, + { + "epoch": 0.83, + "grad_norm": 0.5104878095236919, + "learning_rate": 7.674424329594066e-07, + "loss": 0.1363, + "step": 9811 + }, + { + "epoch": 0.83, + "grad_norm": 0.26515473658048555, + "learning_rate": 7.667161154984665e-07, + "loss": 0.0652, + "step": 9812 + }, + { + "epoch": 0.83, + "grad_norm": 0.3805648670636288, + "learning_rate": 7.659901133571695e-07, + "loss": 0.0869, + "step": 9813 + }, + { + "epoch": 0.83, + "grad_norm": 0.2675557247717521, + "learning_rate": 7.652644265895953e-07, + "loss": 0.0663, + "step": 9814 + }, + { + "epoch": 0.83, + "grad_norm": 0.5210774658131873, + "learning_rate": 7.645390552497956e-07, + "loss": 0.1031, + "step": 9815 + }, + { + "epoch": 0.83, + "grad_norm": 0.30290395059655223, + "learning_rate": 7.638139993918003e-07, + "loss": 0.083, + "step": 9816 + }, + { + "epoch": 0.83, + "grad_norm": 0.3237611713567306, + "learning_rate": 7.630892590696148e-07, + "loss": 0.1044, + "step": 9817 + }, + { + "epoch": 0.83, + "grad_norm": 0.21539807141489517, + "learning_rate": 7.623648343372242e-07, + "loss": 0.0444, + "step": 9818 + }, + { + "epoch": 0.83, + "grad_norm": 0.40962766860635247, + "learning_rate": 7.616407252485874e-07, + "loss": 0.1135, + "step": 9819 + }, + { + "epoch": 0.83, + "grad_norm": 0.23448944523820134, + "learning_rate": 7.609169318576376e-07, + "loss": 0.0623, + "step": 9820 + }, + { + "epoch": 0.83, + "grad_norm": 0.43859832638155555, + "learning_rate": 7.601934542182909e-07, + "loss": 0.0786, + "step": 9821 + }, + { + "epoch": 0.83, + "grad_norm": 0.3681232908285159, + "learning_rate": 7.594702923844344e-07, + "loss": 0.0893, + "step": 9822 + }, + { + "epoch": 0.83, + "grad_norm": 0.3566704486693897, + "learning_rate": 7.587474464099326e-07, + "loss": 0.119, + "step": 9823 + }, + { + "epoch": 0.83, + "grad_norm": 0.34094763129153904, + "learning_rate": 7.580249163486286e-07, + "loss": 0.1016, + "step": 9824 + }, + { + "epoch": 0.83, + "grad_norm": 0.32306606375173197, + "learning_rate": 7.573027022543406e-07, + "loss": 0.0919, + "step": 9825 + }, + { + "epoch": 0.83, + "grad_norm": 0.3790513005152477, + "learning_rate": 7.565808041808625e-07, + "loss": 0.0916, + "step": 9826 + }, + { + "epoch": 0.83, + "grad_norm": 0.4311829312743003, + "learning_rate": 7.558592221819649e-07, + "loss": 0.1083, + "step": 9827 + }, + { + "epoch": 0.83, + "grad_norm": 0.5437097447163345, + "learning_rate": 7.551379563113981e-07, + "loss": 0.1096, + "step": 9828 + }, + { + "epoch": 0.83, + "grad_norm": 0.3571798483590183, + "learning_rate": 7.544170066228835e-07, + "loss": 0.1276, + "step": 9829 + }, + { + "epoch": 0.83, + "grad_norm": 0.2575823702220296, + "learning_rate": 7.536963731701219e-07, + "loss": 0.0645, + "step": 9830 + }, + { + "epoch": 0.83, + "grad_norm": 0.41718105386456633, + "learning_rate": 7.529760560067922e-07, + "loss": 0.086, + "step": 9831 + }, + { + "epoch": 0.83, + "grad_norm": 0.29223269215470365, + "learning_rate": 7.522560551865465e-07, + "loss": 0.0719, + "step": 9832 + }, + { + "epoch": 0.83, + "grad_norm": 0.49086716805133845, + "learning_rate": 7.515363707630146e-07, + "loss": 0.1481, + "step": 9833 + }, + { + "epoch": 0.83, + "grad_norm": 0.2943607707451245, + "learning_rate": 7.50817002789802e-07, + "loss": 0.0788, + "step": 9834 + }, + { + "epoch": 0.83, + "grad_norm": 0.2859534975837805, + "learning_rate": 7.500979513204937e-07, + "loss": 0.0709, + "step": 9835 + }, + { + "epoch": 0.83, + "grad_norm": 0.43894889342180987, + "learning_rate": 7.493792164086472e-07, + "loss": 0.0913, + "step": 9836 + }, + { + "epoch": 0.83, + "grad_norm": 0.4162640333569657, + "learning_rate": 7.48660798107797e-07, + "loss": 0.0671, + "step": 9837 + }, + { + "epoch": 0.83, + "grad_norm": 0.21052968988779, + "learning_rate": 7.479426964714582e-07, + "loss": 0.0619, + "step": 9838 + }, + { + "epoch": 0.83, + "grad_norm": 0.27478217175932546, + "learning_rate": 7.472249115531166e-07, + "loss": 0.1014, + "step": 9839 + }, + { + "epoch": 0.83, + "grad_norm": 0.3802474396411029, + "learning_rate": 7.465074434062386e-07, + "loss": 0.0632, + "step": 9840 + }, + { + "epoch": 0.83, + "grad_norm": 0.26968163989390265, + "learning_rate": 7.45790292084263e-07, + "loss": 0.0846, + "step": 9841 + }, + { + "epoch": 0.83, + "grad_norm": 0.22018793290799799, + "learning_rate": 7.450734576406104e-07, + "loss": 0.0438, + "step": 9842 + }, + { + "epoch": 0.83, + "grad_norm": 0.38354668013316706, + "learning_rate": 7.443569401286737e-07, + "loss": 0.069, + "step": 9843 + }, + { + "epoch": 0.83, + "grad_norm": 0.1972513442228845, + "learning_rate": 7.436407396018225e-07, + "loss": 0.0641, + "step": 9844 + }, + { + "epoch": 0.83, + "grad_norm": 0.22380100172955078, + "learning_rate": 7.429248561134034e-07, + "loss": 0.0615, + "step": 9845 + }, + { + "epoch": 0.83, + "grad_norm": 0.2590908847573438, + "learning_rate": 7.422092897167416e-07, + "loss": 0.0633, + "step": 9846 + }, + { + "epoch": 0.83, + "grad_norm": 0.18487375380366977, + "learning_rate": 7.41494040465135e-07, + "loss": 0.0294, + "step": 9847 + }, + { + "epoch": 0.83, + "grad_norm": 0.17920872061455342, + "learning_rate": 7.407791084118598e-07, + "loss": 0.0482, + "step": 9848 + }, + { + "epoch": 0.83, + "grad_norm": 0.3486444139550612, + "learning_rate": 7.400644936101676e-07, + "loss": 0.0726, + "step": 9849 + }, + { + "epoch": 0.83, + "grad_norm": 0.410032500694567, + "learning_rate": 7.393501961132887e-07, + "loss": 0.095, + "step": 9850 + }, + { + "epoch": 0.83, + "grad_norm": 0.45223181720855704, + "learning_rate": 7.386362159744275e-07, + "loss": 0.1232, + "step": 9851 + }, + { + "epoch": 0.83, + "grad_norm": 0.38715021314705456, + "learning_rate": 7.379225532467654e-07, + "loss": 0.0902, + "step": 9852 + }, + { + "epoch": 0.83, + "grad_norm": 0.19027227517917547, + "learning_rate": 7.372092079834597e-07, + "loss": 0.0409, + "step": 9853 + }, + { + "epoch": 0.83, + "grad_norm": 0.2041203105785427, + "learning_rate": 7.364961802376435e-07, + "loss": 0.0623, + "step": 9854 + }, + { + "epoch": 0.83, + "grad_norm": 0.364313219372423, + "learning_rate": 7.3578347006243e-07, + "loss": 0.073, + "step": 9855 + }, + { + "epoch": 0.83, + "grad_norm": 0.3593433607435564, + "learning_rate": 7.350710775109043e-07, + "loss": 0.0696, + "step": 9856 + }, + { + "epoch": 0.83, + "grad_norm": 0.3380728338896378, + "learning_rate": 7.343590026361291e-07, + "loss": 0.0732, + "step": 9857 + }, + { + "epoch": 0.83, + "grad_norm": 0.46046863139736105, + "learning_rate": 7.336472454911453e-07, + "loss": 0.1318, + "step": 9858 + }, + { + "epoch": 0.83, + "grad_norm": 0.4047601325960266, + "learning_rate": 7.329358061289682e-07, + "loss": 0.1155, + "step": 9859 + }, + { + "epoch": 0.83, + "grad_norm": 0.3591799970801823, + "learning_rate": 7.322246846025899e-07, + "loss": 0.1032, + "step": 9860 + }, + { + "epoch": 0.83, + "grad_norm": 0.34244510451419896, + "learning_rate": 7.315138809649768e-07, + "loss": 0.1013, + "step": 9861 + }, + { + "epoch": 0.83, + "grad_norm": 0.2432773049895317, + "learning_rate": 7.308033952690774e-07, + "loss": 0.0763, + "step": 9862 + }, + { + "epoch": 0.83, + "grad_norm": 0.3290926710902916, + "learning_rate": 7.300932275678113e-07, + "loss": 0.0724, + "step": 9863 + }, + { + "epoch": 0.83, + "grad_norm": 0.19390099621854553, + "learning_rate": 7.293833779140741e-07, + "loss": 0.0379, + "step": 9864 + }, + { + "epoch": 0.83, + "grad_norm": 0.4248593085786264, + "learning_rate": 7.286738463607423e-07, + "loss": 0.0943, + "step": 9865 + }, + { + "epoch": 0.83, + "grad_norm": 0.2679217027790947, + "learning_rate": 7.279646329606648e-07, + "loss": 0.0847, + "step": 9866 + }, + { + "epoch": 0.83, + "grad_norm": 0.22919174370667947, + "learning_rate": 7.272557377666678e-07, + "loss": 0.0651, + "step": 9867 + }, + { + "epoch": 0.83, + "grad_norm": 0.6873300176874477, + "learning_rate": 7.265471608315527e-07, + "loss": 0.1084, + "step": 9868 + }, + { + "epoch": 0.83, + "grad_norm": 0.4720270836099034, + "learning_rate": 7.258389022081014e-07, + "loss": 0.0951, + "step": 9869 + }, + { + "epoch": 0.83, + "grad_norm": 0.29445532994096063, + "learning_rate": 7.251309619490671e-07, + "loss": 0.1028, + "step": 9870 + }, + { + "epoch": 0.83, + "grad_norm": 0.31028921464432496, + "learning_rate": 7.244233401071804e-07, + "loss": 0.0809, + "step": 9871 + }, + { + "epoch": 0.83, + "grad_norm": 0.30929135798891627, + "learning_rate": 7.23716036735152e-07, + "loss": 0.0802, + "step": 9872 + }, + { + "epoch": 0.83, + "grad_norm": 0.5115663003663974, + "learning_rate": 7.230090518856641e-07, + "loss": 0.1042, + "step": 9873 + }, + { + "epoch": 0.83, + "grad_norm": 0.3640175786699371, + "learning_rate": 7.223023856113776e-07, + "loss": 0.0899, + "step": 9874 + }, + { + "epoch": 0.83, + "grad_norm": 0.2922998888202173, + "learning_rate": 7.215960379649273e-07, + "loss": 0.0742, + "step": 9875 + }, + { + "epoch": 0.83, + "grad_norm": 0.25922278889898753, + "learning_rate": 7.208900089989291e-07, + "loss": 0.0932, + "step": 9876 + }, + { + "epoch": 0.83, + "grad_norm": 0.3158416145902828, + "learning_rate": 7.201842987659707e-07, + "loss": 0.0909, + "step": 9877 + }, + { + "epoch": 0.83, + "grad_norm": 0.3420751749927588, + "learning_rate": 7.194789073186159e-07, + "loss": 0.0873, + "step": 9878 + }, + { + "epoch": 0.83, + "grad_norm": 0.22508741871126006, + "learning_rate": 7.187738347094097e-07, + "loss": 0.0721, + "step": 9879 + }, + { + "epoch": 0.83, + "grad_norm": 0.40556355748776063, + "learning_rate": 7.180690809908681e-07, + "loss": 0.0757, + "step": 9880 + }, + { + "epoch": 0.83, + "grad_norm": 0.32428246207605577, + "learning_rate": 7.173646462154838e-07, + "loss": 0.0884, + "step": 9881 + }, + { + "epoch": 0.83, + "grad_norm": 0.39158887210753257, + "learning_rate": 7.166605304357305e-07, + "loss": 0.0615, + "step": 9882 + }, + { + "epoch": 0.83, + "grad_norm": 0.4153514475728676, + "learning_rate": 7.159567337040535e-07, + "loss": 0.084, + "step": 9883 + }, + { + "epoch": 0.83, + "grad_norm": 0.39279057423817826, + "learning_rate": 7.152532560728748e-07, + "loss": 0.0655, + "step": 9884 + }, + { + "epoch": 0.83, + "grad_norm": 0.34062529008343856, + "learning_rate": 7.145500975945929e-07, + "loss": 0.1003, + "step": 9885 + }, + { + "epoch": 0.83, + "grad_norm": 0.40964575679890336, + "learning_rate": 7.138472583215855e-07, + "loss": 0.1153, + "step": 9886 + }, + { + "epoch": 0.83, + "grad_norm": 0.4048773126769146, + "learning_rate": 7.131447383062034e-07, + "loss": 0.0956, + "step": 9887 + }, + { + "epoch": 0.83, + "grad_norm": 0.35514762466309213, + "learning_rate": 7.124425376007726e-07, + "loss": 0.0867, + "step": 9888 + }, + { + "epoch": 0.83, + "grad_norm": 0.29531222056047096, + "learning_rate": 7.117406562575995e-07, + "loss": 0.0648, + "step": 9889 + }, + { + "epoch": 0.83, + "grad_norm": 0.3588619924644892, + "learning_rate": 7.110390943289636e-07, + "loss": 0.0937, + "step": 9890 + }, + { + "epoch": 0.83, + "grad_norm": 0.38244855700186453, + "learning_rate": 7.103378518671205e-07, + "loss": 0.1186, + "step": 9891 + }, + { + "epoch": 0.83, + "grad_norm": 0.22513882271751143, + "learning_rate": 7.096369289243026e-07, + "loss": 0.0489, + "step": 9892 + }, + { + "epoch": 0.83, + "grad_norm": 0.2971890069441585, + "learning_rate": 7.089363255527204e-07, + "loss": 0.054, + "step": 9893 + }, + { + "epoch": 0.83, + "grad_norm": 0.14545636220958127, + "learning_rate": 7.082360418045581e-07, + "loss": 0.0369, + "step": 9894 + }, + { + "epoch": 0.83, + "grad_norm": 0.5198452838122398, + "learning_rate": 7.075360777319756e-07, + "loss": 0.1027, + "step": 9895 + }, + { + "epoch": 0.83, + "grad_norm": 0.18100923321806167, + "learning_rate": 7.068364333871125e-07, + "loss": 0.0344, + "step": 9896 + }, + { + "epoch": 0.83, + "grad_norm": 0.30341355434640155, + "learning_rate": 7.061371088220814e-07, + "loss": 0.0595, + "step": 9897 + }, + { + "epoch": 0.83, + "grad_norm": 0.40472857429239506, + "learning_rate": 7.054381040889713e-07, + "loss": 0.0802, + "step": 9898 + }, + { + "epoch": 0.83, + "grad_norm": 0.28743750799965895, + "learning_rate": 7.047394192398493e-07, + "loss": 0.0617, + "step": 9899 + }, + { + "epoch": 0.83, + "grad_norm": 0.5333154439294154, + "learning_rate": 7.040410543267573e-07, + "loss": 0.1103, + "step": 9900 + }, + { + "epoch": 0.83, + "grad_norm": 0.3613073134694714, + "learning_rate": 7.033430094017135e-07, + "loss": 0.0841, + "step": 9901 + }, + { + "epoch": 0.83, + "grad_norm": 0.3274965841964515, + "learning_rate": 7.026452845167115e-07, + "loss": 0.0656, + "step": 9902 + }, + { + "epoch": 0.83, + "grad_norm": 0.3965812791183081, + "learning_rate": 7.019478797237233e-07, + "loss": 0.0857, + "step": 9903 + }, + { + "epoch": 0.83, + "grad_norm": 0.2068028937966365, + "learning_rate": 7.012507950746949e-07, + "loss": 0.0481, + "step": 9904 + }, + { + "epoch": 0.83, + "grad_norm": 0.25877586207386494, + "learning_rate": 7.005540306215486e-07, + "loss": 0.0683, + "step": 9905 + }, + { + "epoch": 0.83, + "grad_norm": 0.3144415533135142, + "learning_rate": 6.998575864161855e-07, + "loss": 0.0703, + "step": 9906 + }, + { + "epoch": 0.83, + "grad_norm": 0.33881020182870164, + "learning_rate": 6.991614625104792e-07, + "loss": 0.1008, + "step": 9907 + }, + { + "epoch": 0.83, + "grad_norm": 0.38532765669259733, + "learning_rate": 6.984656589562816e-07, + "loss": 0.1182, + "step": 9908 + }, + { + "epoch": 0.83, + "grad_norm": 0.6137803620133151, + "learning_rate": 6.977701758054184e-07, + "loss": 0.0685, + "step": 9909 + }, + { + "epoch": 0.84, + "grad_norm": 0.3028415874968051, + "learning_rate": 6.970750131096965e-07, + "loss": 0.1129, + "step": 9910 + }, + { + "epoch": 0.84, + "grad_norm": 0.35994530508844846, + "learning_rate": 6.96380170920894e-07, + "loss": 0.1149, + "step": 9911 + }, + { + "epoch": 0.84, + "grad_norm": 0.4367432821588394, + "learning_rate": 6.956856492907665e-07, + "loss": 0.0937, + "step": 9912 + }, + { + "epoch": 0.84, + "grad_norm": 0.3450504373792145, + "learning_rate": 6.949914482710452e-07, + "loss": 0.0828, + "step": 9913 + }, + { + "epoch": 0.84, + "grad_norm": 1.0803155264973436, + "learning_rate": 6.942975679134406e-07, + "loss": 0.1037, + "step": 9914 + }, + { + "epoch": 0.84, + "grad_norm": 0.30500559154768886, + "learning_rate": 6.93604008269636e-07, + "loss": 0.0691, + "step": 9915 + }, + { + "epoch": 0.84, + "grad_norm": 0.36903167958889754, + "learning_rate": 6.929107693912912e-07, + "loss": 0.0739, + "step": 9916 + }, + { + "epoch": 0.84, + "grad_norm": 0.24279262336888582, + "learning_rate": 6.92217851330042e-07, + "loss": 0.0592, + "step": 9917 + }, + { + "epoch": 0.84, + "grad_norm": 0.31138986809214103, + "learning_rate": 6.915252541375029e-07, + "loss": 0.0856, + "step": 9918 + }, + { + "epoch": 0.84, + "grad_norm": 0.3204645184146256, + "learning_rate": 6.908329778652617e-07, + "loss": 0.1103, + "step": 9919 + }, + { + "epoch": 0.84, + "grad_norm": 0.5590846089069286, + "learning_rate": 6.901410225648825e-07, + "loss": 0.119, + "step": 9920 + }, + { + "epoch": 0.84, + "grad_norm": 0.3962538260118628, + "learning_rate": 6.894493882879072e-07, + "loss": 0.111, + "step": 9921 + }, + { + "epoch": 0.84, + "grad_norm": 0.8932587199170292, + "learning_rate": 6.887580750858514e-07, + "loss": 0.1055, + "step": 9922 + }, + { + "epoch": 0.84, + "grad_norm": 0.6734518554147418, + "learning_rate": 6.880670830102099e-07, + "loss": 0.1393, + "step": 9923 + }, + { + "epoch": 0.84, + "grad_norm": 0.38651345904532425, + "learning_rate": 6.87376412112451e-07, + "loss": 0.0869, + "step": 9924 + }, + { + "epoch": 0.84, + "grad_norm": 0.23371898323045504, + "learning_rate": 6.866860624440197e-07, + "loss": 0.0672, + "step": 9925 + }, + { + "epoch": 0.84, + "grad_norm": 0.3608194670820361, + "learning_rate": 6.859960340563359e-07, + "loss": 0.075, + "step": 9926 + }, + { + "epoch": 0.84, + "grad_norm": 0.21348011690748372, + "learning_rate": 6.853063270007998e-07, + "loss": 0.0654, + "step": 9927 + }, + { + "epoch": 0.84, + "grad_norm": 0.5428174568424596, + "learning_rate": 6.84616941328784e-07, + "loss": 0.127, + "step": 9928 + }, + { + "epoch": 0.84, + "grad_norm": 0.4149427480287126, + "learning_rate": 6.839278770916358e-07, + "loss": 0.0945, + "step": 9929 + }, + { + "epoch": 0.84, + "grad_norm": 0.4890960349298005, + "learning_rate": 6.83239134340683e-07, + "loss": 0.0864, + "step": 9930 + }, + { + "epoch": 0.84, + "grad_norm": 0.3484796547155267, + "learning_rate": 6.825507131272269e-07, + "loss": 0.1159, + "step": 9931 + }, + { + "epoch": 0.84, + "grad_norm": 0.2468165205071676, + "learning_rate": 6.818626135025436e-07, + "loss": 0.0614, + "step": 9932 + }, + { + "epoch": 0.84, + "grad_norm": 0.2932159993254829, + "learning_rate": 6.811748355178888e-07, + "loss": 0.0593, + "step": 9933 + }, + { + "epoch": 0.84, + "grad_norm": 0.17937781591431232, + "learning_rate": 6.804873792244915e-07, + "loss": 0.0551, + "step": 9934 + }, + { + "epoch": 0.84, + "grad_norm": 0.3473795470819226, + "learning_rate": 6.798002446735569e-07, + "loss": 0.1106, + "step": 9935 + }, + { + "epoch": 0.84, + "grad_norm": 0.2613062861364457, + "learning_rate": 6.791134319162662e-07, + "loss": 0.0489, + "step": 9936 + }, + { + "epoch": 0.84, + "grad_norm": 0.32659553785753825, + "learning_rate": 6.784269410037792e-07, + "loss": 0.1134, + "step": 9937 + }, + { + "epoch": 0.84, + "grad_norm": 0.5194624281863731, + "learning_rate": 6.777407719872286e-07, + "loss": 0.1138, + "step": 9938 + }, + { + "epoch": 0.84, + "grad_norm": 0.35260135765109557, + "learning_rate": 6.770549249177233e-07, + "loss": 0.101, + "step": 9939 + }, + { + "epoch": 0.84, + "grad_norm": 0.21181658763747813, + "learning_rate": 6.76369399846351e-07, + "loss": 0.0565, + "step": 9940 + }, + { + "epoch": 0.84, + "grad_norm": 0.33144567198397046, + "learning_rate": 6.756841968241734e-07, + "loss": 0.085, + "step": 9941 + }, + { + "epoch": 0.84, + "grad_norm": 0.1789964089003555, + "learning_rate": 6.749993159022273e-07, + "loss": 0.0554, + "step": 9942 + }, + { + "epoch": 0.84, + "grad_norm": 0.20497789437728106, + "learning_rate": 6.743147571315261e-07, + "loss": 0.0464, + "step": 9943 + }, + { + "epoch": 0.84, + "grad_norm": 0.3536453341132406, + "learning_rate": 6.736305205630623e-07, + "loss": 0.096, + "step": 9944 + }, + { + "epoch": 0.84, + "grad_norm": 0.2700831444384398, + "learning_rate": 6.729466062478001e-07, + "loss": 0.0761, + "step": 9945 + }, + { + "epoch": 0.84, + "grad_norm": 0.2351775697950334, + "learning_rate": 6.722630142366804e-07, + "loss": 0.0813, + "step": 9946 + }, + { + "epoch": 0.84, + "grad_norm": 0.29596166157228354, + "learning_rate": 6.715797445806233e-07, + "loss": 0.0901, + "step": 9947 + }, + { + "epoch": 0.84, + "grad_norm": 0.28979324368630516, + "learning_rate": 6.708967973305219e-07, + "loss": 0.1037, + "step": 9948 + }, + { + "epoch": 0.84, + "grad_norm": 0.3977331231328276, + "learning_rate": 6.702141725372446e-07, + "loss": 0.097, + "step": 9949 + }, + { + "epoch": 0.84, + "grad_norm": 0.2682830549857594, + "learning_rate": 6.695318702516401e-07, + "loss": 0.0634, + "step": 9950 + }, + { + "epoch": 0.84, + "grad_norm": 0.267912643128008, + "learning_rate": 6.688498905245288e-07, + "loss": 0.0763, + "step": 9951 + }, + { + "epoch": 0.84, + "grad_norm": 0.2878486540449759, + "learning_rate": 6.68168233406708e-07, + "loss": 0.0794, + "step": 9952 + }, + { + "epoch": 0.84, + "grad_norm": 0.39476888375283403, + "learning_rate": 6.674868989489513e-07, + "loss": 0.0945, + "step": 9953 + }, + { + "epoch": 0.84, + "grad_norm": 0.4796810571018395, + "learning_rate": 6.6680588720201e-07, + "loss": 0.0881, + "step": 9954 + }, + { + "epoch": 0.84, + "grad_norm": 0.3891566341049095, + "learning_rate": 6.66125198216609e-07, + "loss": 0.0946, + "step": 9955 + }, + { + "epoch": 0.84, + "grad_norm": 0.3538794755193433, + "learning_rate": 6.654448320434487e-07, + "loss": 0.0962, + "step": 9956 + }, + { + "epoch": 0.84, + "grad_norm": 0.3407869839406074, + "learning_rate": 6.647647887332092e-07, + "loss": 0.0651, + "step": 9957 + }, + { + "epoch": 0.84, + "grad_norm": 0.4724830041498625, + "learning_rate": 6.640850683365424e-07, + "loss": 0.0924, + "step": 9958 + }, + { + "epoch": 0.84, + "grad_norm": 0.2988038615242455, + "learning_rate": 6.634056709040787e-07, + "loss": 0.0664, + "step": 9959 + }, + { + "epoch": 0.84, + "grad_norm": 0.2786867952125467, + "learning_rate": 6.627265964864222e-07, + "loss": 0.0869, + "step": 9960 + }, + { + "epoch": 0.84, + "grad_norm": 0.338381956720589, + "learning_rate": 6.620478451341561e-07, + "loss": 0.0796, + "step": 9961 + }, + { + "epoch": 0.84, + "grad_norm": 0.37141617871926136, + "learning_rate": 6.613694168978374e-07, + "loss": 0.0991, + "step": 9962 + }, + { + "epoch": 0.84, + "grad_norm": 0.3699427130549968, + "learning_rate": 6.606913118279973e-07, + "loss": 0.1045, + "step": 9963 + }, + { + "epoch": 0.84, + "grad_norm": 0.306013040079827, + "learning_rate": 6.600135299751481e-07, + "loss": 0.0613, + "step": 9964 + }, + { + "epoch": 0.84, + "grad_norm": 0.2752507605540219, + "learning_rate": 6.593360713897729e-07, + "loss": 0.084, + "step": 9965 + }, + { + "epoch": 0.84, + "grad_norm": 0.2955830427664039, + "learning_rate": 6.586589361223328e-07, + "loss": 0.0765, + "step": 9966 + }, + { + "epoch": 0.84, + "grad_norm": 0.24715143348720378, + "learning_rate": 6.579821242232665e-07, + "loss": 0.0793, + "step": 9967 + }, + { + "epoch": 0.84, + "grad_norm": 0.3220376085684528, + "learning_rate": 6.573056357429853e-07, + "loss": 0.1095, + "step": 9968 + }, + { + "epoch": 0.84, + "grad_norm": 0.5265670743370436, + "learning_rate": 6.566294707318782e-07, + "loss": 0.1246, + "step": 9969 + }, + { + "epoch": 0.84, + "grad_norm": 0.41214130609218913, + "learning_rate": 6.559536292403096e-07, + "loss": 0.1358, + "step": 9970 + }, + { + "epoch": 0.84, + "grad_norm": 0.3591485915869745, + "learning_rate": 6.552781113186213e-07, + "loss": 0.0485, + "step": 9971 + }, + { + "epoch": 0.84, + "grad_norm": 0.27695386441168834, + "learning_rate": 6.546029170171297e-07, + "loss": 0.0729, + "step": 9972 + }, + { + "epoch": 0.84, + "grad_norm": 0.9920329242533058, + "learning_rate": 6.539280463861252e-07, + "loss": 0.1357, + "step": 9973 + }, + { + "epoch": 0.84, + "grad_norm": 0.3378754690999391, + "learning_rate": 6.532534994758788e-07, + "loss": 0.1101, + "step": 9974 + }, + { + "epoch": 0.84, + "grad_norm": 0.4077153172114649, + "learning_rate": 6.525792763366329e-07, + "loss": 0.0736, + "step": 9975 + }, + { + "epoch": 0.84, + "grad_norm": 0.43715877417675025, + "learning_rate": 6.519053770186084e-07, + "loss": 0.1074, + "step": 9976 + }, + { + "epoch": 0.84, + "grad_norm": 0.2808069240758969, + "learning_rate": 6.512318015720004e-07, + "loss": 0.0811, + "step": 9977 + }, + { + "epoch": 0.84, + "grad_norm": 0.23595589614730264, + "learning_rate": 6.505585500469819e-07, + "loss": 0.0611, + "step": 9978 + }, + { + "epoch": 0.84, + "grad_norm": 0.25544977764727195, + "learning_rate": 6.498856224937e-07, + "loss": 0.0678, + "step": 9979 + }, + { + "epoch": 0.84, + "grad_norm": 0.3036858745935714, + "learning_rate": 6.492130189622781e-07, + "loss": 0.0954, + "step": 9980 + }, + { + "epoch": 0.84, + "grad_norm": 0.3969699432531286, + "learning_rate": 6.485407395028148e-07, + "loss": 0.0674, + "step": 9981 + }, + { + "epoch": 0.84, + "grad_norm": 0.844057351929389, + "learning_rate": 6.478687841653874e-07, + "loss": 0.1174, + "step": 9982 + }, + { + "epoch": 0.84, + "grad_norm": 0.18800533116579318, + "learning_rate": 6.471971530000459e-07, + "loss": 0.0468, + "step": 9983 + }, + { + "epoch": 0.84, + "grad_norm": 0.20522865614953167, + "learning_rate": 6.465258460568174e-07, + "loss": 0.0559, + "step": 9984 + }, + { + "epoch": 0.84, + "grad_norm": 0.25691264030770145, + "learning_rate": 6.458548633857036e-07, + "loss": 0.0664, + "step": 9985 + }, + { + "epoch": 0.84, + "grad_norm": 0.21670358768985237, + "learning_rate": 6.451842050366858e-07, + "loss": 0.0456, + "step": 9986 + }, + { + "epoch": 0.84, + "grad_norm": 0.324863784374698, + "learning_rate": 6.445138710597165e-07, + "loss": 0.0991, + "step": 9987 + }, + { + "epoch": 0.84, + "grad_norm": 0.3516150256372846, + "learning_rate": 6.438438615047271e-07, + "loss": 0.1219, + "step": 9988 + }, + { + "epoch": 0.84, + "grad_norm": 0.16975888050080762, + "learning_rate": 6.431741764216237e-07, + "loss": 0.06, + "step": 9989 + }, + { + "epoch": 0.84, + "grad_norm": 0.4161984267397283, + "learning_rate": 6.425048158602865e-07, + "loss": 0.0945, + "step": 9990 + }, + { + "epoch": 0.84, + "grad_norm": 0.4043672873691549, + "learning_rate": 6.41835779870576e-07, + "loss": 0.0849, + "step": 9991 + }, + { + "epoch": 0.84, + "grad_norm": 0.46843555497686135, + "learning_rate": 6.411670685023247e-07, + "loss": 0.0666, + "step": 9992 + }, + { + "epoch": 0.84, + "grad_norm": 0.26065090527405504, + "learning_rate": 6.404986818053421e-07, + "loss": 0.0712, + "step": 9993 + }, + { + "epoch": 0.84, + "grad_norm": 0.37799962792487357, + "learning_rate": 6.39830619829413e-07, + "loss": 0.0633, + "step": 9994 + }, + { + "epoch": 0.84, + "grad_norm": 0.33301947051005987, + "learning_rate": 6.391628826243001e-07, + "loss": 0.0958, + "step": 9995 + }, + { + "epoch": 0.84, + "grad_norm": 0.3574224160882498, + "learning_rate": 6.384954702397394e-07, + "loss": 0.0831, + "step": 9996 + }, + { + "epoch": 0.84, + "grad_norm": 0.35024048521073536, + "learning_rate": 6.378283827254427e-07, + "loss": 0.0725, + "step": 9997 + }, + { + "epoch": 0.84, + "grad_norm": 0.31861449675038783, + "learning_rate": 6.371616201311009e-07, + "loss": 0.0857, + "step": 9998 + }, + { + "epoch": 0.84, + "grad_norm": 0.30967611099697434, + "learning_rate": 6.364951825063765e-07, + "loss": 0.0723, + "step": 9999 + }, + { + "epoch": 0.84, + "grad_norm": 0.35809143861575465, + "learning_rate": 6.358290699009106e-07, + "loss": 0.1042, + "step": 10000 + }, + { + "epoch": 0.84, + "grad_norm": 0.40626894229806215, + "learning_rate": 6.351632823643178e-07, + "loss": 0.077, + "step": 10001 + }, + { + "epoch": 0.84, + "grad_norm": 0.3686656245731218, + "learning_rate": 6.344978199461915e-07, + "loss": 0.0991, + "step": 10002 + }, + { + "epoch": 0.84, + "grad_norm": 0.4820689579500976, + "learning_rate": 6.338326826960989e-07, + "loss": 0.1197, + "step": 10003 + }, + { + "epoch": 0.84, + "grad_norm": 0.2330912853072014, + "learning_rate": 6.331678706635813e-07, + "loss": 0.0572, + "step": 10004 + }, + { + "epoch": 0.84, + "grad_norm": 0.37731167924736797, + "learning_rate": 6.325033838981609e-07, + "loss": 0.0605, + "step": 10005 + }, + { + "epoch": 0.84, + "grad_norm": 0.2523130116950927, + "learning_rate": 6.318392224493309e-07, + "loss": 0.071, + "step": 10006 + }, + { + "epoch": 0.84, + "grad_norm": 0.4025894834448294, + "learning_rate": 6.311753863665603e-07, + "loss": 0.0987, + "step": 10007 + }, + { + "epoch": 0.84, + "grad_norm": 0.27599719809877776, + "learning_rate": 6.30511875699299e-07, + "loss": 0.0733, + "step": 10008 + }, + { + "epoch": 0.84, + "grad_norm": 0.4919532154506478, + "learning_rate": 6.298486904969675e-07, + "loss": 0.0965, + "step": 10009 + }, + { + "epoch": 0.84, + "grad_norm": 0.20638605968772533, + "learning_rate": 6.291858308089633e-07, + "loss": 0.0523, + "step": 10010 + }, + { + "epoch": 0.84, + "grad_norm": 0.47135193749683096, + "learning_rate": 6.285232966846589e-07, + "loss": 0.1041, + "step": 10011 + }, + { + "epoch": 0.84, + "grad_norm": 0.43289601190114324, + "learning_rate": 6.278610881734065e-07, + "loss": 0.0963, + "step": 10012 + }, + { + "epoch": 0.84, + "grad_norm": 0.435381571819881, + "learning_rate": 6.271992053245296e-07, + "loss": 0.1405, + "step": 10013 + }, + { + "epoch": 0.84, + "grad_norm": 0.5126033372432839, + "learning_rate": 6.265376481873287e-07, + "loss": 0.1195, + "step": 10014 + }, + { + "epoch": 0.84, + "grad_norm": 0.21779021357190934, + "learning_rate": 6.25876416811082e-07, + "loss": 0.0576, + "step": 10015 + }, + { + "epoch": 0.84, + "grad_norm": 0.2989099413506579, + "learning_rate": 6.252155112450409e-07, + "loss": 0.0919, + "step": 10016 + }, + { + "epoch": 0.84, + "grad_norm": 0.2958396235730252, + "learning_rate": 6.24554931538433e-07, + "loss": 0.0753, + "step": 10017 + }, + { + "epoch": 0.84, + "grad_norm": 0.2753224658757622, + "learning_rate": 6.238946777404619e-07, + "loss": 0.0567, + "step": 10018 + }, + { + "epoch": 0.84, + "grad_norm": 0.2925403621564145, + "learning_rate": 6.232347499003094e-07, + "loss": 0.0845, + "step": 10019 + }, + { + "epoch": 0.84, + "grad_norm": 0.33754052775932514, + "learning_rate": 6.225751480671288e-07, + "loss": 0.0894, + "step": 10020 + }, + { + "epoch": 0.84, + "grad_norm": 0.23248067214408336, + "learning_rate": 6.219158722900509e-07, + "loss": 0.0775, + "step": 10021 + }, + { + "epoch": 0.84, + "grad_norm": 0.15912741345557144, + "learning_rate": 6.212569226181836e-07, + "loss": 0.0488, + "step": 10022 + }, + { + "epoch": 0.84, + "grad_norm": 0.2656976245486098, + "learning_rate": 6.205982991006093e-07, + "loss": 0.0743, + "step": 10023 + }, + { + "epoch": 0.84, + "grad_norm": 0.340150623186326, + "learning_rate": 6.199400017863844e-07, + "loss": 0.0998, + "step": 10024 + }, + { + "epoch": 0.84, + "grad_norm": 0.36717487028904194, + "learning_rate": 6.192820307245445e-07, + "loss": 0.0834, + "step": 10025 + }, + { + "epoch": 0.84, + "grad_norm": 0.3698375951686579, + "learning_rate": 6.186243859640989e-07, + "loss": 0.0872, + "step": 10026 + }, + { + "epoch": 0.84, + "grad_norm": 0.3151516640981602, + "learning_rate": 6.179670675540322e-07, + "loss": 0.0901, + "step": 10027 + }, + { + "epoch": 0.84, + "grad_norm": 0.27648733119213587, + "learning_rate": 6.173100755433048e-07, + "loss": 0.063, + "step": 10028 + }, + { + "epoch": 0.85, + "grad_norm": 0.2909117653166107, + "learning_rate": 6.166534099808552e-07, + "loss": 0.0562, + "step": 10029 + }, + { + "epoch": 0.85, + "grad_norm": 0.35965534577714, + "learning_rate": 6.159970709155943e-07, + "loss": 0.0729, + "step": 10030 + }, + { + "epoch": 0.85, + "grad_norm": 0.3959655871915268, + "learning_rate": 6.153410583964092e-07, + "loss": 0.0995, + "step": 10031 + }, + { + "epoch": 0.85, + "grad_norm": 0.34862531830459653, + "learning_rate": 6.146853724721652e-07, + "loss": 0.0914, + "step": 10032 + }, + { + "epoch": 0.85, + "grad_norm": 0.30899925620861934, + "learning_rate": 6.140300131917015e-07, + "loss": 0.0709, + "step": 10033 + }, + { + "epoch": 0.85, + "grad_norm": 0.21019894135443865, + "learning_rate": 6.133749806038325e-07, + "loss": 0.0382, + "step": 10034 + }, + { + "epoch": 0.85, + "grad_norm": 0.32840265695534265, + "learning_rate": 6.12720274757348e-07, + "loss": 0.1019, + "step": 10035 + }, + { + "epoch": 0.85, + "grad_norm": 0.2920827945890285, + "learning_rate": 6.120658957010162e-07, + "loss": 0.0854, + "step": 10036 + }, + { + "epoch": 0.85, + "grad_norm": 0.4476076160764897, + "learning_rate": 6.114118434835781e-07, + "loss": 0.091, + "step": 10037 + }, + { + "epoch": 0.85, + "grad_norm": 0.44075988335182503, + "learning_rate": 6.107581181537503e-07, + "loss": 0.1357, + "step": 10038 + }, + { + "epoch": 0.85, + "grad_norm": 0.2621923594633043, + "learning_rate": 6.101047197602278e-07, + "loss": 0.0844, + "step": 10039 + }, + { + "epoch": 0.85, + "grad_norm": 0.22935029461028514, + "learning_rate": 6.094516483516794e-07, + "loss": 0.055, + "step": 10040 + }, + { + "epoch": 0.85, + "grad_norm": 0.22211277565660706, + "learning_rate": 6.087989039767472e-07, + "loss": 0.0866, + "step": 10041 + }, + { + "epoch": 0.85, + "grad_norm": 0.3697892472230128, + "learning_rate": 6.081464866840552e-07, + "loss": 0.0678, + "step": 10042 + }, + { + "epoch": 0.85, + "grad_norm": 0.4057106054291832, + "learning_rate": 6.074943965221969e-07, + "loss": 0.0858, + "step": 10043 + }, + { + "epoch": 0.85, + "grad_norm": 0.45883441864136787, + "learning_rate": 6.068426335397442e-07, + "loss": 0.0768, + "step": 10044 + }, + { + "epoch": 0.85, + "grad_norm": 0.6617785743403138, + "learning_rate": 6.06191197785243e-07, + "loss": 0.1263, + "step": 10045 + }, + { + "epoch": 0.85, + "grad_norm": 0.22899207271752697, + "learning_rate": 6.055400893072183e-07, + "loss": 0.0715, + "step": 10046 + }, + { + "epoch": 0.85, + "grad_norm": 0.2816184833941627, + "learning_rate": 6.048893081541674e-07, + "loss": 0.0782, + "step": 10047 + }, + { + "epoch": 0.85, + "grad_norm": 0.5064648873609169, + "learning_rate": 6.042388543745642e-07, + "loss": 0.1331, + "step": 10048 + }, + { + "epoch": 0.85, + "grad_norm": 0.18797318740207142, + "learning_rate": 6.035887280168579e-07, + "loss": 0.0626, + "step": 10049 + }, + { + "epoch": 0.85, + "grad_norm": 0.2341344234682925, + "learning_rate": 6.029389291294747e-07, + "loss": 0.0598, + "step": 10050 + }, + { + "epoch": 0.85, + "grad_norm": 0.3512452207778412, + "learning_rate": 6.022894577608146e-07, + "loss": 0.1098, + "step": 10051 + }, + { + "epoch": 0.85, + "grad_norm": 0.34607291364100073, + "learning_rate": 6.016403139592541e-07, + "loss": 0.0858, + "step": 10052 + }, + { + "epoch": 0.85, + "grad_norm": 0.3123602129253076, + "learning_rate": 6.009914977731446e-07, + "loss": 0.0687, + "step": 10053 + }, + { + "epoch": 0.85, + "grad_norm": 0.251331360341203, + "learning_rate": 6.003430092508156e-07, + "loss": 0.0877, + "step": 10054 + }, + { + "epoch": 0.85, + "grad_norm": 0.4344539389887402, + "learning_rate": 5.99694848440569e-07, + "loss": 0.094, + "step": 10055 + }, + { + "epoch": 0.85, + "grad_norm": 0.40992625273747296, + "learning_rate": 5.990470153906841e-07, + "loss": 0.1006, + "step": 10056 + }, + { + "epoch": 0.85, + "grad_norm": 0.35065072571167394, + "learning_rate": 5.98399510149415e-07, + "loss": 0.1073, + "step": 10057 + }, + { + "epoch": 0.85, + "grad_norm": 0.531683677164515, + "learning_rate": 5.977523327649903e-07, + "loss": 0.0987, + "step": 10058 + }, + { + "epoch": 0.85, + "grad_norm": 0.505513606779016, + "learning_rate": 5.971054832856177e-07, + "loss": 0.1252, + "step": 10059 + }, + { + "epoch": 0.85, + "grad_norm": 0.33564076128811127, + "learning_rate": 5.964589617594774e-07, + "loss": 0.1035, + "step": 10060 + }, + { + "epoch": 0.85, + "grad_norm": 0.20162835599461876, + "learning_rate": 5.958127682347264e-07, + "loss": 0.0549, + "step": 10061 + }, + { + "epoch": 0.85, + "grad_norm": 0.3038869763699886, + "learning_rate": 5.951669027594958e-07, + "loss": 0.1041, + "step": 10062 + }, + { + "epoch": 0.85, + "grad_norm": 0.3753884917034024, + "learning_rate": 5.945213653818954e-07, + "loss": 0.0886, + "step": 10063 + }, + { + "epoch": 0.85, + "grad_norm": 0.23551272816867713, + "learning_rate": 5.938761561500073e-07, + "loss": 0.0607, + "step": 10064 + }, + { + "epoch": 0.85, + "grad_norm": 0.3479306454516724, + "learning_rate": 5.932312751118896e-07, + "loss": 0.0943, + "step": 10065 + }, + { + "epoch": 0.85, + "grad_norm": 0.3836962601323056, + "learning_rate": 5.92586722315579e-07, + "loss": 0.0988, + "step": 10066 + }, + { + "epoch": 0.85, + "grad_norm": 0.4125365898913317, + "learning_rate": 5.919424978090843e-07, + "loss": 0.0678, + "step": 10067 + }, + { + "epoch": 0.85, + "grad_norm": 0.3022674371688516, + "learning_rate": 5.91298601640391e-07, + "loss": 0.0835, + "step": 10068 + }, + { + "epoch": 0.85, + "grad_norm": 0.24855599509345083, + "learning_rate": 5.906550338574596e-07, + "loss": 0.0416, + "step": 10069 + }, + { + "epoch": 0.85, + "grad_norm": 0.2366265294150871, + "learning_rate": 5.90011794508229e-07, + "loss": 0.0624, + "step": 10070 + }, + { + "epoch": 0.85, + "grad_norm": 0.3054822460605164, + "learning_rate": 5.893688836406098e-07, + "loss": 0.1003, + "step": 10071 + }, + { + "epoch": 0.85, + "grad_norm": 0.5693754429143377, + "learning_rate": 5.887263013024886e-07, + "loss": 0.0913, + "step": 10072 + }, + { + "epoch": 0.85, + "grad_norm": 0.3590839416763244, + "learning_rate": 5.880840475417315e-07, + "loss": 0.0875, + "step": 10073 + }, + { + "epoch": 0.85, + "grad_norm": 0.26328879378723563, + "learning_rate": 5.874421224061755e-07, + "loss": 0.0596, + "step": 10074 + }, + { + "epoch": 0.85, + "grad_norm": 0.3039715264159054, + "learning_rate": 5.868005259436343e-07, + "loss": 0.0902, + "step": 10075 + }, + { + "epoch": 0.85, + "grad_norm": 0.3998908034187788, + "learning_rate": 5.861592582018998e-07, + "loss": 0.0668, + "step": 10076 + }, + { + "epoch": 0.85, + "grad_norm": 0.7110174508168057, + "learning_rate": 5.855183192287367e-07, + "loss": 0.1365, + "step": 10077 + }, + { + "epoch": 0.85, + "grad_norm": 0.3998245029451963, + "learning_rate": 5.848777090718849e-07, + "loss": 0.0711, + "step": 10078 + }, + { + "epoch": 0.85, + "grad_norm": 0.3227790174012581, + "learning_rate": 5.842374277790608e-07, + "loss": 0.054, + "step": 10079 + }, + { + "epoch": 0.85, + "grad_norm": 0.3390945760519033, + "learning_rate": 5.835974753979573e-07, + "loss": 0.0841, + "step": 10080 + }, + { + "epoch": 0.85, + "grad_norm": 0.18067998683394357, + "learning_rate": 5.829578519762413e-07, + "loss": 0.0439, + "step": 10081 + }, + { + "epoch": 0.85, + "grad_norm": 0.2446286218041023, + "learning_rate": 5.823185575615553e-07, + "loss": 0.0511, + "step": 10082 + }, + { + "epoch": 0.85, + "grad_norm": 0.21025385259522128, + "learning_rate": 5.816795922015184e-07, + "loss": 0.0371, + "step": 10083 + }, + { + "epoch": 0.85, + "grad_norm": 0.21919923061775645, + "learning_rate": 5.810409559437241e-07, + "loss": 0.0793, + "step": 10084 + }, + { + "epoch": 0.85, + "grad_norm": 0.580564164850924, + "learning_rate": 5.804026488357423e-07, + "loss": 0.0916, + "step": 10085 + }, + { + "epoch": 0.85, + "grad_norm": 0.35026425716209636, + "learning_rate": 5.797646709251153e-07, + "loss": 0.1088, + "step": 10086 + }, + { + "epoch": 0.85, + "grad_norm": 0.354811859883803, + "learning_rate": 5.79127022259367e-07, + "loss": 0.1056, + "step": 10087 + }, + { + "epoch": 0.85, + "grad_norm": 0.2678649803196242, + "learning_rate": 5.784897028859915e-07, + "loss": 0.0621, + "step": 10088 + }, + { + "epoch": 0.85, + "grad_norm": 0.32969620933248084, + "learning_rate": 5.778527128524591e-07, + "loss": 0.0879, + "step": 10089 + }, + { + "epoch": 0.85, + "grad_norm": 0.2798721662342978, + "learning_rate": 5.772160522062187e-07, + "loss": 0.0849, + "step": 10090 + }, + { + "epoch": 0.85, + "grad_norm": 0.25182569151114936, + "learning_rate": 5.765797209946916e-07, + "loss": 0.0648, + "step": 10091 + }, + { + "epoch": 0.85, + "grad_norm": 0.21865477899744626, + "learning_rate": 5.759437192652739e-07, + "loss": 0.0634, + "step": 10092 + }, + { + "epoch": 0.85, + "grad_norm": 0.2850319371447458, + "learning_rate": 5.753080470653411e-07, + "loss": 0.0778, + "step": 10093 + }, + { + "epoch": 0.85, + "grad_norm": 0.23589932933515276, + "learning_rate": 5.746727044422401e-07, + "loss": 0.0651, + "step": 10094 + }, + { + "epoch": 0.85, + "grad_norm": 0.3019480784331829, + "learning_rate": 5.74037691443296e-07, + "loss": 0.0857, + "step": 10095 + }, + { + "epoch": 0.85, + "grad_norm": 0.3154725287574297, + "learning_rate": 5.734030081158071e-07, + "loss": 0.0724, + "step": 10096 + }, + { + "epoch": 0.85, + "grad_norm": 0.21602922609034222, + "learning_rate": 5.727686545070493e-07, + "loss": 0.072, + "step": 10097 + }, + { + "epoch": 0.85, + "grad_norm": 0.4329351899219665, + "learning_rate": 5.721346306642733e-07, + "loss": 0.1119, + "step": 10098 + }, + { + "epoch": 0.85, + "grad_norm": 0.18544762084551014, + "learning_rate": 5.715009366347029e-07, + "loss": 0.0458, + "step": 10099 + }, + { + "epoch": 0.85, + "grad_norm": 0.5563154679794391, + "learning_rate": 5.708675724655416e-07, + "loss": 0.1264, + "step": 10100 + }, + { + "epoch": 0.85, + "grad_norm": 0.2485403851592536, + "learning_rate": 5.702345382039653e-07, + "loss": 0.0717, + "step": 10101 + }, + { + "epoch": 0.85, + "grad_norm": 0.32432658645935014, + "learning_rate": 5.696018338971259e-07, + "loss": 0.0731, + "step": 10102 + }, + { + "epoch": 0.85, + "grad_norm": 0.45674140230988364, + "learning_rate": 5.689694595921497e-07, + "loss": 0.098, + "step": 10103 + }, + { + "epoch": 0.85, + "grad_norm": 0.4899105897067363, + "learning_rate": 5.683374153361421e-07, + "loss": 0.1272, + "step": 10104 + }, + { + "epoch": 0.85, + "grad_norm": 0.30639314046242133, + "learning_rate": 5.6770570117618e-07, + "loss": 0.0668, + "step": 10105 + }, + { + "epoch": 0.85, + "grad_norm": 0.4964217596715763, + "learning_rate": 5.670743171593157e-07, + "loss": 0.0821, + "step": 10106 + }, + { + "epoch": 0.85, + "grad_norm": 0.23361283372302385, + "learning_rate": 5.664432633325817e-07, + "loss": 0.0528, + "step": 10107 + }, + { + "epoch": 0.85, + "grad_norm": 0.24142385203668879, + "learning_rate": 5.658125397429809e-07, + "loss": 0.0412, + "step": 10108 + }, + { + "epoch": 0.85, + "grad_norm": 0.21254369781288915, + "learning_rate": 5.651821464374918e-07, + "loss": 0.061, + "step": 10109 + }, + { + "epoch": 0.85, + "grad_norm": 0.24200911900799923, + "learning_rate": 5.645520834630718e-07, + "loss": 0.0643, + "step": 10110 + }, + { + "epoch": 0.85, + "grad_norm": 0.5600097345438724, + "learning_rate": 5.639223508666519e-07, + "loss": 0.0942, + "step": 10111 + }, + { + "epoch": 0.85, + "grad_norm": 0.3491473062639947, + "learning_rate": 5.632929486951372e-07, + "loss": 0.0919, + "step": 10112 + }, + { + "epoch": 0.85, + "grad_norm": 0.24315213421564805, + "learning_rate": 5.626638769954079e-07, + "loss": 0.0585, + "step": 10113 + }, + { + "epoch": 0.85, + "grad_norm": 0.41236815141197186, + "learning_rate": 5.620351358143239e-07, + "loss": 0.0717, + "step": 10114 + }, + { + "epoch": 0.85, + "grad_norm": 0.33504781720963045, + "learning_rate": 5.614067251987154e-07, + "loss": 0.0731, + "step": 10115 + }, + { + "epoch": 0.85, + "grad_norm": 0.3128590469777471, + "learning_rate": 5.607786451953912e-07, + "loss": 0.0691, + "step": 10116 + }, + { + "epoch": 0.85, + "grad_norm": 0.23033080064507722, + "learning_rate": 5.601508958511331e-07, + "loss": 0.075, + "step": 10117 + }, + { + "epoch": 0.85, + "grad_norm": 0.19658909336726813, + "learning_rate": 5.595234772127011e-07, + "loss": 0.0646, + "step": 10118 + }, + { + "epoch": 0.85, + "grad_norm": 0.2510375988849471, + "learning_rate": 5.588963893268279e-07, + "loss": 0.067, + "step": 10119 + }, + { + "epoch": 0.85, + "grad_norm": 0.2573015565067798, + "learning_rate": 5.582696322402237e-07, + "loss": 0.0823, + "step": 10120 + }, + { + "epoch": 0.85, + "grad_norm": 0.5561555205062403, + "learning_rate": 5.576432059995707e-07, + "loss": 0.1126, + "step": 10121 + }, + { + "epoch": 0.85, + "grad_norm": 0.30365844221003885, + "learning_rate": 5.570171106515315e-07, + "loss": 0.0466, + "step": 10122 + }, + { + "epoch": 0.85, + "grad_norm": 0.32601208151312766, + "learning_rate": 5.563913462427401e-07, + "loss": 0.0858, + "step": 10123 + }, + { + "epoch": 0.85, + "grad_norm": 0.20664641937100506, + "learning_rate": 5.557659128198072e-07, + "loss": 0.0408, + "step": 10124 + }, + { + "epoch": 0.85, + "grad_norm": 0.2057668485448398, + "learning_rate": 5.551408104293188e-07, + "loss": 0.0464, + "step": 10125 + }, + { + "epoch": 0.85, + "grad_norm": 0.275302541041108, + "learning_rate": 5.54516039117835e-07, + "loss": 0.0977, + "step": 10126 + }, + { + "epoch": 0.85, + "grad_norm": 0.22596811227362334, + "learning_rate": 5.538915989318943e-07, + "loss": 0.0695, + "step": 10127 + }, + { + "epoch": 0.85, + "grad_norm": 0.23848898256759685, + "learning_rate": 5.532674899180079e-07, + "loss": 0.0806, + "step": 10128 + }, + { + "epoch": 0.85, + "grad_norm": 0.20674249425206906, + "learning_rate": 5.526437121226629e-07, + "loss": 0.034, + "step": 10129 + }, + { + "epoch": 0.85, + "grad_norm": 0.4611595113157456, + "learning_rate": 5.520202655923212e-07, + "loss": 0.0811, + "step": 10130 + }, + { + "epoch": 0.85, + "grad_norm": 0.3339739097495186, + "learning_rate": 5.513971503734223e-07, + "loss": 0.118, + "step": 10131 + }, + { + "epoch": 0.85, + "grad_norm": 0.34023055318300904, + "learning_rate": 5.507743665123788e-07, + "loss": 0.0807, + "step": 10132 + }, + { + "epoch": 0.85, + "grad_norm": 0.4529090994019407, + "learning_rate": 5.501519140555784e-07, + "loss": 0.1131, + "step": 10133 + }, + { + "epoch": 0.85, + "grad_norm": 0.2959287939753219, + "learning_rate": 5.495297930493864e-07, + "loss": 0.0639, + "step": 10134 + }, + { + "epoch": 0.85, + "grad_norm": 0.3083781397633366, + "learning_rate": 5.489080035401412e-07, + "loss": 0.0918, + "step": 10135 + }, + { + "epoch": 0.85, + "grad_norm": 0.2658363308666795, + "learning_rate": 5.482865455741576e-07, + "loss": 0.08, + "step": 10136 + }, + { + "epoch": 0.85, + "grad_norm": 0.5614104857624876, + "learning_rate": 5.476654191977243e-07, + "loss": 0.1071, + "step": 10137 + }, + { + "epoch": 0.85, + "grad_norm": 0.26866241445030264, + "learning_rate": 5.470446244571081e-07, + "loss": 0.0712, + "step": 10138 + }, + { + "epoch": 0.85, + "grad_norm": 0.22916407226265162, + "learning_rate": 5.464241613985494e-07, + "loss": 0.0732, + "step": 10139 + }, + { + "epoch": 0.85, + "grad_norm": 0.23361139735585032, + "learning_rate": 5.458040300682616e-07, + "loss": 0.0576, + "step": 10140 + }, + { + "epoch": 0.85, + "grad_norm": 0.45699667092551394, + "learning_rate": 5.451842305124377e-07, + "loss": 0.0861, + "step": 10141 + }, + { + "epoch": 0.85, + "grad_norm": 0.3779438131504885, + "learning_rate": 5.445647627772443e-07, + "loss": 0.104, + "step": 10142 + }, + { + "epoch": 0.85, + "grad_norm": 0.21099429836653283, + "learning_rate": 5.439456269088222e-07, + "loss": 0.0649, + "step": 10143 + }, + { + "epoch": 0.85, + "grad_norm": 0.28044221210124665, + "learning_rate": 5.433268229532867e-07, + "loss": 0.0828, + "step": 10144 + }, + { + "epoch": 0.85, + "grad_norm": 0.36199221607048093, + "learning_rate": 5.427083509567327e-07, + "loss": 0.0831, + "step": 10145 + }, + { + "epoch": 0.85, + "grad_norm": 0.39216804062376454, + "learning_rate": 5.420902109652265e-07, + "loss": 0.1029, + "step": 10146 + }, + { + "epoch": 0.85, + "grad_norm": 0.3780502826396717, + "learning_rate": 5.414724030248098e-07, + "loss": 0.0718, + "step": 10147 + }, + { + "epoch": 0.86, + "grad_norm": 0.3050551022564213, + "learning_rate": 5.408549271815023e-07, + "loss": 0.0809, + "step": 10148 + }, + { + "epoch": 0.86, + "grad_norm": 0.36436516543996395, + "learning_rate": 5.402377834812961e-07, + "loss": 0.0618, + "step": 10149 + }, + { + "epoch": 0.86, + "grad_norm": 0.3092014068121197, + "learning_rate": 5.396209719701584e-07, + "loss": 0.0929, + "step": 10150 + }, + { + "epoch": 0.86, + "grad_norm": 0.6360406986906038, + "learning_rate": 5.390044926940358e-07, + "loss": 0.1222, + "step": 10151 + }, + { + "epoch": 0.86, + "grad_norm": 0.35744604704974464, + "learning_rate": 5.383883456988459e-07, + "loss": 0.1151, + "step": 10152 + }, + { + "epoch": 0.86, + "grad_norm": 0.2826134725793427, + "learning_rate": 5.377725310304827e-07, + "loss": 0.083, + "step": 10153 + }, + { + "epoch": 0.86, + "grad_norm": 0.5205982078654979, + "learning_rate": 5.371570487348149e-07, + "loss": 0.1365, + "step": 10154 + }, + { + "epoch": 0.86, + "grad_norm": 0.33780526807175987, + "learning_rate": 5.365418988576887e-07, + "loss": 0.0943, + "step": 10155 + }, + { + "epoch": 0.86, + "grad_norm": 0.26999103018681314, + "learning_rate": 5.359270814449241e-07, + "loss": 0.0555, + "step": 10156 + }, + { + "epoch": 0.86, + "grad_norm": 0.2775509996516515, + "learning_rate": 5.353125965423139e-07, + "loss": 0.0621, + "step": 10157 + }, + { + "epoch": 0.86, + "grad_norm": 0.3208658967614774, + "learning_rate": 5.346984441956316e-07, + "loss": 0.0879, + "step": 10158 + }, + { + "epoch": 0.86, + "grad_norm": 0.10915325703474758, + "learning_rate": 5.340846244506214e-07, + "loss": 0.0289, + "step": 10159 + }, + { + "epoch": 0.86, + "grad_norm": 0.35765169206162944, + "learning_rate": 5.334711373530043e-07, + "loss": 0.0959, + "step": 10160 + }, + { + "epoch": 0.86, + "grad_norm": 0.2810410289620671, + "learning_rate": 5.328579829484754e-07, + "loss": 0.0808, + "step": 10161 + }, + { + "epoch": 0.86, + "grad_norm": 0.3166000050568358, + "learning_rate": 5.322451612827078e-07, + "loss": 0.073, + "step": 10162 + }, + { + "epoch": 0.86, + "grad_norm": 0.30449514646004555, + "learning_rate": 5.316326724013477e-07, + "loss": 0.0806, + "step": 10163 + }, + { + "epoch": 0.86, + "grad_norm": 0.29423842962256364, + "learning_rate": 5.310205163500148e-07, + "loss": 0.0756, + "step": 10164 + }, + { + "epoch": 0.86, + "grad_norm": 0.327304131929898, + "learning_rate": 5.304086931743085e-07, + "loss": 0.0869, + "step": 10165 + }, + { + "epoch": 0.86, + "grad_norm": 0.3675276971314589, + "learning_rate": 5.297972029198006e-07, + "loss": 0.0442, + "step": 10166 + }, + { + "epoch": 0.86, + "grad_norm": 0.22519923978457085, + "learning_rate": 5.291860456320363e-07, + "loss": 0.0391, + "step": 10167 + }, + { + "epoch": 0.86, + "grad_norm": 0.35575681309286683, + "learning_rate": 5.28575221356541e-07, + "loss": 0.0773, + "step": 10168 + }, + { + "epoch": 0.86, + "grad_norm": 0.30400740819080263, + "learning_rate": 5.279647301388107e-07, + "loss": 0.0844, + "step": 10169 + }, + { + "epoch": 0.86, + "grad_norm": 0.36732024366900184, + "learning_rate": 5.273545720243195e-07, + "loss": 0.1245, + "step": 10170 + }, + { + "epoch": 0.86, + "grad_norm": 0.1960655751678503, + "learning_rate": 5.267447470585135e-07, + "loss": 0.0555, + "step": 10171 + }, + { + "epoch": 0.86, + "grad_norm": 0.3030606164429105, + "learning_rate": 5.261352552868187e-07, + "loss": 0.095, + "step": 10172 + }, + { + "epoch": 0.86, + "grad_norm": 0.3085420021645029, + "learning_rate": 5.255260967546321e-07, + "loss": 0.0785, + "step": 10173 + }, + { + "epoch": 0.86, + "grad_norm": 0.24766138873802865, + "learning_rate": 5.249172715073264e-07, + "loss": 0.0612, + "step": 10174 + }, + { + "epoch": 0.86, + "grad_norm": 0.19946730276298946, + "learning_rate": 5.24308779590253e-07, + "loss": 0.0317, + "step": 10175 + }, + { + "epoch": 0.86, + "grad_norm": 0.4221255888641955, + "learning_rate": 5.237006210487339e-07, + "loss": 0.1098, + "step": 10176 + }, + { + "epoch": 0.86, + "grad_norm": 0.4302044304689142, + "learning_rate": 5.230927959280696e-07, + "loss": 0.0631, + "step": 10177 + }, + { + "epoch": 0.86, + "grad_norm": 0.2328379140151972, + "learning_rate": 5.224853042735323e-07, + "loss": 0.0679, + "step": 10178 + }, + { + "epoch": 0.86, + "grad_norm": 0.2699513452362352, + "learning_rate": 5.21878146130374e-07, + "loss": 0.0952, + "step": 10179 + }, + { + "epoch": 0.86, + "grad_norm": 0.45347490184686484, + "learning_rate": 5.212713215438187e-07, + "loss": 0.0865, + "step": 10180 + }, + { + "epoch": 0.86, + "grad_norm": 0.2559295798724765, + "learning_rate": 5.206648305590645e-07, + "loss": 0.0547, + "step": 10181 + }, + { + "epoch": 0.86, + "grad_norm": 0.40675486523356985, + "learning_rate": 5.200586732212892e-07, + "loss": 0.0772, + "step": 10182 + }, + { + "epoch": 0.86, + "grad_norm": 0.4073187418841058, + "learning_rate": 5.194528495756413e-07, + "loss": 0.1447, + "step": 10183 + }, + { + "epoch": 0.86, + "grad_norm": 0.40045063472907716, + "learning_rate": 5.18847359667246e-07, + "loss": 0.0858, + "step": 10184 + }, + { + "epoch": 0.86, + "grad_norm": 0.6488649111846287, + "learning_rate": 5.182422035412033e-07, + "loss": 0.0952, + "step": 10185 + }, + { + "epoch": 0.86, + "grad_norm": 0.18427024002170167, + "learning_rate": 5.176373812425906e-07, + "loss": 0.0311, + "step": 10186 + }, + { + "epoch": 0.86, + "grad_norm": 0.5901415861337089, + "learning_rate": 5.170328928164569e-07, + "loss": 0.0727, + "step": 10187 + }, + { + "epoch": 0.86, + "grad_norm": 0.3615338297571059, + "learning_rate": 5.164287383078287e-07, + "loss": 0.1257, + "step": 10188 + }, + { + "epoch": 0.86, + "grad_norm": 0.3277681420696021, + "learning_rate": 5.158249177617064e-07, + "loss": 0.1026, + "step": 10189 + }, + { + "epoch": 0.86, + "grad_norm": 0.21689988665360102, + "learning_rate": 5.152214312230668e-07, + "loss": 0.0569, + "step": 10190 + }, + { + "epoch": 0.86, + "grad_norm": 0.3572332178946842, + "learning_rate": 5.146182787368609e-07, + "loss": 0.1172, + "step": 10191 + }, + { + "epoch": 0.86, + "grad_norm": 0.22588690189929764, + "learning_rate": 5.140154603480152e-07, + "loss": 0.0654, + "step": 10192 + }, + { + "epoch": 0.86, + "grad_norm": 0.2695550597192518, + "learning_rate": 5.134129761014306e-07, + "loss": 0.0643, + "step": 10193 + }, + { + "epoch": 0.86, + "grad_norm": 0.25010907600136456, + "learning_rate": 5.128108260419828e-07, + "loss": 0.0612, + "step": 10194 + }, + { + "epoch": 0.86, + "grad_norm": 0.23556562080626983, + "learning_rate": 5.122090102145255e-07, + "loss": 0.0616, + "step": 10195 + }, + { + "epoch": 0.86, + "grad_norm": 0.3621675350765161, + "learning_rate": 5.116075286638844e-07, + "loss": 0.0704, + "step": 10196 + }, + { + "epoch": 0.86, + "grad_norm": 0.3131261952714749, + "learning_rate": 5.110063814348614e-07, + "loss": 0.071, + "step": 10197 + }, + { + "epoch": 0.86, + "grad_norm": 0.38354680874174724, + "learning_rate": 5.104055685722326e-07, + "loss": 0.1007, + "step": 10198 + }, + { + "epoch": 0.86, + "grad_norm": 0.353198951561074, + "learning_rate": 5.098050901207519e-07, + "loss": 0.0649, + "step": 10199 + }, + { + "epoch": 0.86, + "grad_norm": 0.2408579198957943, + "learning_rate": 5.092049461251453e-07, + "loss": 0.0674, + "step": 10200 + }, + { + "epoch": 0.86, + "grad_norm": 0.2773421566768113, + "learning_rate": 5.086051366301143e-07, + "loss": 0.0524, + "step": 10201 + }, + { + "epoch": 0.86, + "grad_norm": 0.31780623767360566, + "learning_rate": 5.080056616803375e-07, + "loss": 0.0873, + "step": 10202 + }, + { + "epoch": 0.86, + "grad_norm": 0.22842394481054543, + "learning_rate": 5.074065213204677e-07, + "loss": 0.0393, + "step": 10203 + }, + { + "epoch": 0.86, + "grad_norm": 0.48957375136074877, + "learning_rate": 5.068077155951306e-07, + "loss": 0.1083, + "step": 10204 + }, + { + "epoch": 0.86, + "grad_norm": 0.31720943528062123, + "learning_rate": 5.062092445489292e-07, + "loss": 0.0897, + "step": 10205 + }, + { + "epoch": 0.86, + "grad_norm": 0.3934655126248802, + "learning_rate": 5.05611108226442e-07, + "loss": 0.1043, + "step": 10206 + }, + { + "epoch": 0.86, + "grad_norm": 0.27299753882075806, + "learning_rate": 5.050133066722218e-07, + "loss": 0.0599, + "step": 10207 + }, + { + "epoch": 0.86, + "grad_norm": 0.16080844450749188, + "learning_rate": 5.044158399307941e-07, + "loss": 0.0428, + "step": 10208 + }, + { + "epoch": 0.86, + "grad_norm": 0.29585174910428724, + "learning_rate": 5.03818708046665e-07, + "loss": 0.0887, + "step": 10209 + }, + { + "epoch": 0.86, + "grad_norm": 0.4756540658564396, + "learning_rate": 5.032219110643105e-07, + "loss": 0.0765, + "step": 10210 + }, + { + "epoch": 0.86, + "grad_norm": 0.3831573844688135, + "learning_rate": 5.026254490281835e-07, + "loss": 0.0965, + "step": 10211 + }, + { + "epoch": 0.86, + "grad_norm": 0.41443214000563344, + "learning_rate": 5.020293219827116e-07, + "loss": 0.0876, + "step": 10212 + }, + { + "epoch": 0.86, + "grad_norm": 0.3922449204842818, + "learning_rate": 5.014335299722989e-07, + "loss": 0.0875, + "step": 10213 + }, + { + "epoch": 0.86, + "grad_norm": 0.4280807004114287, + "learning_rate": 5.008380730413231e-07, + "loss": 0.1074, + "step": 10214 + }, + { + "epoch": 0.86, + "grad_norm": 0.30198797463437604, + "learning_rate": 5.002429512341367e-07, + "loss": 0.0629, + "step": 10215 + }, + { + "epoch": 0.86, + "grad_norm": 0.25042853511453145, + "learning_rate": 4.996481645950691e-07, + "loss": 0.0905, + "step": 10216 + }, + { + "epoch": 0.86, + "grad_norm": 0.22591671629981414, + "learning_rate": 4.99053713168422e-07, + "loss": 0.078, + "step": 10217 + }, + { + "epoch": 0.86, + "grad_norm": 0.281593486909036, + "learning_rate": 4.98459596998474e-07, + "loss": 0.0731, + "step": 10218 + }, + { + "epoch": 0.86, + "grad_norm": 0.3699086140036204, + "learning_rate": 4.978658161294797e-07, + "loss": 0.1002, + "step": 10219 + }, + { + "epoch": 0.86, + "grad_norm": 0.2635814628460597, + "learning_rate": 4.97272370605666e-07, + "loss": 0.0724, + "step": 10220 + }, + { + "epoch": 0.86, + "grad_norm": 0.7727226390361319, + "learning_rate": 4.966792604712362e-07, + "loss": 0.2018, + "step": 10221 + }, + { + "epoch": 0.86, + "grad_norm": 0.3386638076236892, + "learning_rate": 4.960864857703684e-07, + "loss": 0.07, + "step": 10222 + }, + { + "epoch": 0.86, + "grad_norm": 0.4258233614013364, + "learning_rate": 4.954940465472169e-07, + "loss": 0.0969, + "step": 10223 + }, + { + "epoch": 0.86, + "grad_norm": 0.16571779248145949, + "learning_rate": 4.949019428459101e-07, + "loss": 0.0265, + "step": 10224 + }, + { + "epoch": 0.86, + "grad_norm": 0.28982450174778257, + "learning_rate": 4.943101747105494e-07, + "loss": 0.0864, + "step": 10225 + }, + { + "epoch": 0.86, + "grad_norm": 0.3438381130784451, + "learning_rate": 4.937187421852152e-07, + "loss": 0.0823, + "step": 10226 + }, + { + "epoch": 0.86, + "grad_norm": 0.26235028237812424, + "learning_rate": 4.931276453139605e-07, + "loss": 0.0672, + "step": 10227 + }, + { + "epoch": 0.86, + "grad_norm": 0.2970159247063792, + "learning_rate": 4.925368841408129e-07, + "loss": 0.0498, + "step": 10228 + }, + { + "epoch": 0.86, + "grad_norm": 0.2997101783544131, + "learning_rate": 4.919464587097756e-07, + "loss": 0.0923, + "step": 10229 + }, + { + "epoch": 0.86, + "grad_norm": 0.24796452119856754, + "learning_rate": 4.913563690648282e-07, + "loss": 0.054, + "step": 10230 + }, + { + "epoch": 0.86, + "grad_norm": 0.4638822164242629, + "learning_rate": 4.907666152499229e-07, + "loss": 0.0874, + "step": 10231 + }, + { + "epoch": 0.86, + "grad_norm": 0.26648054083531814, + "learning_rate": 4.901771973089875e-07, + "loss": 0.0651, + "step": 10232 + }, + { + "epoch": 0.86, + "grad_norm": 0.59773389861169, + "learning_rate": 4.895881152859272e-07, + "loss": 0.1319, + "step": 10233 + }, + { + "epoch": 0.86, + "grad_norm": 0.47661722187376643, + "learning_rate": 4.889993692246192e-07, + "loss": 0.0929, + "step": 10234 + }, + { + "epoch": 0.86, + "grad_norm": 0.16138837852530785, + "learning_rate": 4.884109591689168e-07, + "loss": 0.0357, + "step": 10235 + }, + { + "epoch": 0.86, + "grad_norm": 0.2949496298846048, + "learning_rate": 4.878228851626465e-07, + "loss": 0.0758, + "step": 10236 + }, + { + "epoch": 0.86, + "grad_norm": 0.2543849112834512, + "learning_rate": 4.872351472496146e-07, + "loss": 0.0376, + "step": 10237 + }, + { + "epoch": 0.86, + "grad_norm": 0.2145955663391157, + "learning_rate": 4.866477454735979e-07, + "loss": 0.0478, + "step": 10238 + }, + { + "epoch": 0.86, + "grad_norm": 0.24434501939954376, + "learning_rate": 4.860606798783479e-07, + "loss": 0.0644, + "step": 10239 + }, + { + "epoch": 0.86, + "grad_norm": 0.2937302254917918, + "learning_rate": 4.854739505075956e-07, + "loss": 0.0818, + "step": 10240 + }, + { + "epoch": 0.86, + "grad_norm": 0.2553345241588614, + "learning_rate": 4.848875574050421e-07, + "loss": 0.0657, + "step": 10241 + }, + { + "epoch": 0.86, + "grad_norm": 0.5721737199276666, + "learning_rate": 4.843015006143648e-07, + "loss": 0.0912, + "step": 10242 + }, + { + "epoch": 0.86, + "grad_norm": 0.2733376399076147, + "learning_rate": 4.837157801792186e-07, + "loss": 0.0597, + "step": 10243 + }, + { + "epoch": 0.86, + "grad_norm": 0.30212893104126315, + "learning_rate": 4.831303961432304e-07, + "loss": 0.0396, + "step": 10244 + }, + { + "epoch": 0.86, + "grad_norm": 0.3208961585722439, + "learning_rate": 4.825453485500032e-07, + "loss": 0.0804, + "step": 10245 + }, + { + "epoch": 0.86, + "grad_norm": 0.5206776666001215, + "learning_rate": 4.819606374431135e-07, + "loss": 0.0807, + "step": 10246 + }, + { + "epoch": 0.86, + "grad_norm": 0.2508554646560112, + "learning_rate": 4.813762628661162e-07, + "loss": 0.0597, + "step": 10247 + }, + { + "epoch": 0.86, + "grad_norm": 0.308374542677377, + "learning_rate": 4.807922248625374e-07, + "loss": 0.0474, + "step": 10248 + }, + { + "epoch": 0.86, + "grad_norm": 0.235376590980154, + "learning_rate": 4.802085234758796e-07, + "loss": 0.0439, + "step": 10249 + }, + { + "epoch": 0.86, + "grad_norm": 0.8773818973956543, + "learning_rate": 4.796251587496214e-07, + "loss": 0.1012, + "step": 10250 + }, + { + "epoch": 0.86, + "grad_norm": 0.5156017019552253, + "learning_rate": 4.790421307272141e-07, + "loss": 0.0968, + "step": 10251 + }, + { + "epoch": 0.86, + "grad_norm": 0.21238967928582178, + "learning_rate": 4.784594394520858e-07, + "loss": 0.0777, + "step": 10252 + }, + { + "epoch": 0.86, + "grad_norm": 0.3666093768411957, + "learning_rate": 4.778770849676373e-07, + "loss": 0.1054, + "step": 10253 + }, + { + "epoch": 0.86, + "grad_norm": 0.43457962060793637, + "learning_rate": 4.772950673172483e-07, + "loss": 0.0447, + "step": 10254 + }, + { + "epoch": 0.86, + "grad_norm": 0.2579033526879363, + "learning_rate": 4.767133865442686e-07, + "loss": 0.0627, + "step": 10255 + }, + { + "epoch": 0.86, + "grad_norm": 0.36003414050401195, + "learning_rate": 4.761320426920263e-07, + "loss": 0.0745, + "step": 10256 + }, + { + "epoch": 0.86, + "grad_norm": 0.32375235312733147, + "learning_rate": 4.7555103580382234e-07, + "loss": 0.0979, + "step": 10257 + }, + { + "epoch": 0.86, + "grad_norm": 0.23507560639990824, + "learning_rate": 4.749703659229343e-07, + "loss": 0.0743, + "step": 10258 + }, + { + "epoch": 0.86, + "grad_norm": 0.30075282359524214, + "learning_rate": 4.7439003309261433e-07, + "loss": 0.0701, + "step": 10259 + }, + { + "epoch": 0.86, + "grad_norm": 0.2760026121332858, + "learning_rate": 4.738100373560883e-07, + "loss": 0.0609, + "step": 10260 + }, + { + "epoch": 0.86, + "grad_norm": 0.20152794464550844, + "learning_rate": 4.732303787565573e-07, + "loss": 0.0504, + "step": 10261 + }, + { + "epoch": 0.86, + "grad_norm": 0.2959931002283258, + "learning_rate": 4.726510573371973e-07, + "loss": 0.0734, + "step": 10262 + }, + { + "epoch": 0.86, + "grad_norm": 0.3486099193423995, + "learning_rate": 4.7207207314116097e-07, + "loss": 0.0891, + "step": 10263 + }, + { + "epoch": 0.86, + "grad_norm": 0.386772936605961, + "learning_rate": 4.7149342621157447e-07, + "loss": 0.0605, + "step": 10264 + }, + { + "epoch": 0.86, + "grad_norm": 0.7764557068990194, + "learning_rate": 4.709151165915371e-07, + "loss": 0.1019, + "step": 10265 + }, + { + "epoch": 0.87, + "grad_norm": 0.4156304549160289, + "learning_rate": 4.703371443241256e-07, + "loss": 0.0741, + "step": 10266 + }, + { + "epoch": 0.87, + "grad_norm": 0.31056722056849706, + "learning_rate": 4.6975950945239157e-07, + "loss": 0.0858, + "step": 10267 + }, + { + "epoch": 0.87, + "grad_norm": 0.4013091395394022, + "learning_rate": 4.691822120193595e-07, + "loss": 0.0832, + "step": 10268 + }, + { + "epoch": 0.87, + "grad_norm": 0.28671706496106414, + "learning_rate": 4.6860525206803066e-07, + "loss": 0.0793, + "step": 10269 + }, + { + "epoch": 0.87, + "grad_norm": 0.23547596273598484, + "learning_rate": 4.6802862964137885e-07, + "loss": 0.0645, + "step": 10270 + }, + { + "epoch": 0.87, + "grad_norm": 0.25677223175567787, + "learning_rate": 4.674523447823565e-07, + "loss": 0.0576, + "step": 10271 + }, + { + "epoch": 0.87, + "grad_norm": 0.3884967273802368, + "learning_rate": 4.668763975338875e-07, + "loss": 0.027, + "step": 10272 + }, + { + "epoch": 0.87, + "grad_norm": 0.28648261467681785, + "learning_rate": 4.663007879388709e-07, + "loss": 0.097, + "step": 10273 + }, + { + "epoch": 0.87, + "grad_norm": 0.2776977282929443, + "learning_rate": 4.65725516040183e-07, + "loss": 0.0838, + "step": 10274 + }, + { + "epoch": 0.87, + "grad_norm": 0.3389796810483693, + "learning_rate": 4.6515058188067286e-07, + "loss": 0.1075, + "step": 10275 + }, + { + "epoch": 0.87, + "grad_norm": 0.6206385845292867, + "learning_rate": 4.64575985503164e-07, + "loss": 0.0969, + "step": 10276 + }, + { + "epoch": 0.87, + "grad_norm": 0.35733929444124096, + "learning_rate": 4.6400172695045767e-07, + "loss": 0.0899, + "step": 10277 + }, + { + "epoch": 0.87, + "grad_norm": 0.29254147699566185, + "learning_rate": 4.6342780626532693e-07, + "loss": 0.0823, + "step": 10278 + }, + { + "epoch": 0.87, + "grad_norm": 0.35023298831837707, + "learning_rate": 4.6285422349052034e-07, + "loss": 0.0563, + "step": 10279 + }, + { + "epoch": 0.87, + "grad_norm": 0.16485257246911403, + "learning_rate": 4.62280978668761e-07, + "loss": 0.0341, + "step": 10280 + }, + { + "epoch": 0.87, + "grad_norm": 0.428637768013447, + "learning_rate": 4.617080718427497e-07, + "loss": 0.0715, + "step": 10281 + }, + { + "epoch": 0.87, + "grad_norm": 0.19768073472783207, + "learning_rate": 4.611355030551584e-07, + "loss": 0.0571, + "step": 10282 + }, + { + "epoch": 0.87, + "grad_norm": 0.2940470223077482, + "learning_rate": 4.6056327234863527e-07, + "loss": 0.0735, + "step": 10283 + }, + { + "epoch": 0.87, + "grad_norm": 0.3868099699550164, + "learning_rate": 4.5999137976580456e-07, + "loss": 0.1057, + "step": 10284 + }, + { + "epoch": 0.87, + "grad_norm": 0.3421544007731041, + "learning_rate": 4.594198253492632e-07, + "loss": 0.0455, + "step": 10285 + }, + { + "epoch": 0.87, + "grad_norm": 0.2943359566913793, + "learning_rate": 4.5884860914158393e-07, + "loss": 0.0643, + "step": 10286 + }, + { + "epoch": 0.87, + "grad_norm": 0.26324892211029793, + "learning_rate": 4.5827773118531374e-07, + "loss": 0.0635, + "step": 10287 + }, + { + "epoch": 0.87, + "grad_norm": 0.3062664386470111, + "learning_rate": 4.5770719152297637e-07, + "loss": 0.0726, + "step": 10288 + }, + { + "epoch": 0.87, + "grad_norm": 0.23420903247841512, + "learning_rate": 4.571369901970685e-07, + "loss": 0.0716, + "step": 10289 + }, + { + "epoch": 0.87, + "grad_norm": 0.32220686253937486, + "learning_rate": 4.5656712725006046e-07, + "loss": 0.1211, + "step": 10290 + }, + { + "epoch": 0.87, + "grad_norm": 0.5089778524708687, + "learning_rate": 4.559976027244012e-07, + "loss": 0.0834, + "step": 10291 + }, + { + "epoch": 0.87, + "grad_norm": 0.2605011411550036, + "learning_rate": 4.554284166625117e-07, + "loss": 0.0616, + "step": 10292 + }, + { + "epoch": 0.87, + "grad_norm": 0.37348259657523597, + "learning_rate": 4.548595691067864e-07, + "loss": 0.1023, + "step": 10293 + }, + { + "epoch": 0.87, + "grad_norm": 0.30545091060130575, + "learning_rate": 4.542910600995992e-07, + "loss": 0.0888, + "step": 10294 + }, + { + "epoch": 0.87, + "grad_norm": 0.41881365674578086, + "learning_rate": 4.5372288968329403e-07, + "loss": 0.0789, + "step": 10295 + }, + { + "epoch": 0.87, + "grad_norm": 0.30162096523054843, + "learning_rate": 4.5315505790019255e-07, + "loss": 0.0518, + "step": 10296 + }, + { + "epoch": 0.87, + "grad_norm": 0.323327871858337, + "learning_rate": 4.5258756479258817e-07, + "loss": 0.0714, + "step": 10297 + }, + { + "epoch": 0.87, + "grad_norm": 0.29800942606073044, + "learning_rate": 4.5202041040275427e-07, + "loss": 0.0633, + "step": 10298 + }, + { + "epoch": 0.87, + "grad_norm": 0.3234224270431746, + "learning_rate": 4.514535947729337e-07, + "loss": 0.0775, + "step": 10299 + }, + { + "epoch": 0.87, + "grad_norm": 0.22619306297448796, + "learning_rate": 4.508871179453456e-07, + "loss": 0.0551, + "step": 10300 + }, + { + "epoch": 0.87, + "grad_norm": 0.33216494321211715, + "learning_rate": 4.503209799621866e-07, + "loss": 0.0718, + "step": 10301 + }, + { + "epoch": 0.87, + "grad_norm": 0.1866277227643434, + "learning_rate": 4.4975518086562475e-07, + "loss": 0.0551, + "step": 10302 + }, + { + "epoch": 0.87, + "grad_norm": 0.2520963126410752, + "learning_rate": 4.4918972069780406e-07, + "loss": 0.0535, + "step": 10303 + }, + { + "epoch": 0.87, + "grad_norm": 0.37960338059646304, + "learning_rate": 4.486245995008426e-07, + "loss": 0.1, + "step": 10304 + }, + { + "epoch": 0.87, + "grad_norm": 0.27147910553472443, + "learning_rate": 4.48059817316836e-07, + "loss": 0.0888, + "step": 10305 + }, + { + "epoch": 0.87, + "grad_norm": 0.26035636758196845, + "learning_rate": 4.4749537418785074e-07, + "loss": 0.0779, + "step": 10306 + }, + { + "epoch": 0.87, + "grad_norm": 0.3319508060053674, + "learning_rate": 4.469312701559292e-07, + "loss": 0.0672, + "step": 10307 + }, + { + "epoch": 0.87, + "grad_norm": 0.35643497786630396, + "learning_rate": 4.463675052630917e-07, + "loss": 0.1215, + "step": 10308 + }, + { + "epoch": 0.87, + "grad_norm": 0.32137436731792407, + "learning_rate": 4.458040795513291e-07, + "loss": 0.0509, + "step": 10309 + }, + { + "epoch": 0.87, + "grad_norm": 0.30131576243462616, + "learning_rate": 4.4524099306260784e-07, + "loss": 0.0693, + "step": 10310 + }, + { + "epoch": 0.87, + "grad_norm": 0.3834484791028972, + "learning_rate": 4.4467824583887154e-07, + "loss": 0.0879, + "step": 10311 + }, + { + "epoch": 0.87, + "grad_norm": 0.39777282093582, + "learning_rate": 4.4411583792203673e-07, + "loss": 0.1457, + "step": 10312 + }, + { + "epoch": 0.87, + "grad_norm": 0.1225991836815811, + "learning_rate": 4.4355376935399374e-07, + "loss": 0.0326, + "step": 10313 + }, + { + "epoch": 0.87, + "grad_norm": 0.2721349251688276, + "learning_rate": 4.4299204017660914e-07, + "loss": 0.0695, + "step": 10314 + }, + { + "epoch": 0.87, + "grad_norm": 0.29951396540538383, + "learning_rate": 4.4243065043172385e-07, + "loss": 0.0399, + "step": 10315 + }, + { + "epoch": 0.87, + "grad_norm": 0.2832713752554066, + "learning_rate": 4.4186960016115443e-07, + "loss": 0.0845, + "step": 10316 + }, + { + "epoch": 0.87, + "grad_norm": 0.27521846147019713, + "learning_rate": 4.413088894066886e-07, + "loss": 0.0632, + "step": 10317 + }, + { + "epoch": 0.87, + "grad_norm": 0.3860538624801942, + "learning_rate": 4.4074851821009456e-07, + "loss": 0.0764, + "step": 10318 + }, + { + "epoch": 0.87, + "grad_norm": 0.35581482484988686, + "learning_rate": 4.4018848661311007e-07, + "loss": 0.101, + "step": 10319 + }, + { + "epoch": 0.87, + "grad_norm": 0.48670616574989567, + "learning_rate": 4.3962879465745013e-07, + "loss": 0.0854, + "step": 10320 + }, + { + "epoch": 0.87, + "grad_norm": 0.4266729338674842, + "learning_rate": 4.390694423848024e-07, + "loss": 0.1025, + "step": 10321 + }, + { + "epoch": 0.87, + "grad_norm": 0.19311682858007226, + "learning_rate": 4.3851042983683313e-07, + "loss": 0.0388, + "step": 10322 + }, + { + "epoch": 0.87, + "grad_norm": 0.14655881812517357, + "learning_rate": 4.3795175705518e-07, + "loss": 0.0444, + "step": 10323 + }, + { + "epoch": 0.87, + "grad_norm": 0.3521108718058451, + "learning_rate": 4.373934240814559e-07, + "loss": 0.0694, + "step": 10324 + }, + { + "epoch": 0.87, + "grad_norm": 0.27749808980474194, + "learning_rate": 4.368354309572476e-07, + "loss": 0.0795, + "step": 10325 + }, + { + "epoch": 0.87, + "grad_norm": 0.35796360683785816, + "learning_rate": 4.362777777241195e-07, + "loss": 0.1099, + "step": 10326 + }, + { + "epoch": 0.87, + "grad_norm": 0.36608619782556956, + "learning_rate": 4.357204644236085e-07, + "loss": 0.1077, + "step": 10327 + }, + { + "epoch": 0.87, + "grad_norm": 0.20890109456707825, + "learning_rate": 4.351634910972258e-07, + "loss": 0.0581, + "step": 10328 + }, + { + "epoch": 0.87, + "grad_norm": 0.5230488917594488, + "learning_rate": 4.3460685778645874e-07, + "loss": 0.1232, + "step": 10329 + }, + { + "epoch": 0.87, + "grad_norm": 0.3156965371421897, + "learning_rate": 4.340505645327675e-07, + "loss": 0.083, + "step": 10330 + }, + { + "epoch": 0.87, + "grad_norm": 0.43787747246516584, + "learning_rate": 4.3349461137758955e-07, + "loss": 0.0608, + "step": 10331 + }, + { + "epoch": 0.87, + "grad_norm": 0.3731609959315829, + "learning_rate": 4.32938998362335e-07, + "loss": 0.1247, + "step": 10332 + }, + { + "epoch": 0.87, + "grad_norm": 0.2928238435665105, + "learning_rate": 4.323837255283886e-07, + "loss": 0.0608, + "step": 10333 + }, + { + "epoch": 0.87, + "grad_norm": 0.42056105913323677, + "learning_rate": 4.3182879291711e-07, + "loss": 0.1039, + "step": 10334 + }, + { + "epoch": 0.87, + "grad_norm": 0.3312379779744026, + "learning_rate": 4.31274200569835e-07, + "loss": 0.0792, + "step": 10335 + }, + { + "epoch": 0.87, + "grad_norm": 0.6524435471606699, + "learning_rate": 4.307199485278729e-07, + "loss": 0.1117, + "step": 10336 + }, + { + "epoch": 0.87, + "grad_norm": 0.33222670650635105, + "learning_rate": 4.3016603683250666e-07, + "loss": 0.0672, + "step": 10337 + }, + { + "epoch": 0.87, + "grad_norm": 0.31374329740436335, + "learning_rate": 4.2961246552499445e-07, + "loss": 0.0985, + "step": 10338 + }, + { + "epoch": 0.87, + "grad_norm": 0.26720376827635073, + "learning_rate": 4.290592346465711e-07, + "loss": 0.0717, + "step": 10339 + }, + { + "epoch": 0.87, + "grad_norm": 0.30285381406600864, + "learning_rate": 4.2850634423844405e-07, + "loss": 0.0965, + "step": 10340 + }, + { + "epoch": 0.87, + "grad_norm": 0.23224558256617409, + "learning_rate": 4.279537943417944e-07, + "loss": 0.0533, + "step": 10341 + }, + { + "epoch": 0.87, + "grad_norm": 0.23451275860202472, + "learning_rate": 4.2740158499778083e-07, + "loss": 0.0871, + "step": 10342 + }, + { + "epoch": 0.87, + "grad_norm": 0.3001457108440875, + "learning_rate": 4.268497162475349e-07, + "loss": 0.0856, + "step": 10343 + }, + { + "epoch": 0.87, + "grad_norm": 0.3451959377218374, + "learning_rate": 4.262981881321615e-07, + "loss": 0.0702, + "step": 10344 + }, + { + "epoch": 0.87, + "grad_norm": 0.2313840663757002, + "learning_rate": 4.2574700069274386e-07, + "loss": 0.0515, + "step": 10345 + }, + { + "epoch": 0.87, + "grad_norm": 0.4223668888691095, + "learning_rate": 4.251961539703364e-07, + "loss": 0.099, + "step": 10346 + }, + { + "epoch": 0.87, + "grad_norm": 0.2741831456052615, + "learning_rate": 4.2464564800596965e-07, + "loss": 0.0563, + "step": 10347 + }, + { + "epoch": 0.87, + "grad_norm": 0.2444429906158703, + "learning_rate": 4.240954828406479e-07, + "loss": 0.045, + "step": 10348 + }, + { + "epoch": 0.87, + "grad_norm": 0.4088940220193649, + "learning_rate": 4.235456585153519e-07, + "loss": 0.0766, + "step": 10349 + }, + { + "epoch": 0.87, + "grad_norm": 0.27409980545021667, + "learning_rate": 4.2299617507103484e-07, + "loss": 0.0878, + "step": 10350 + }, + { + "epoch": 0.87, + "grad_norm": 0.3629484461627276, + "learning_rate": 4.2244703254862515e-07, + "loss": 0.1002, + "step": 10351 + }, + { + "epoch": 0.87, + "grad_norm": 0.31260928812599686, + "learning_rate": 4.218982309890274e-07, + "loss": 0.0884, + "step": 10352 + }, + { + "epoch": 0.87, + "grad_norm": 0.22388083489538424, + "learning_rate": 4.2134977043311877e-07, + "loss": 0.0607, + "step": 10353 + }, + { + "epoch": 0.87, + "grad_norm": 0.43421735852025695, + "learning_rate": 4.2080165092175217e-07, + "loss": 0.1026, + "step": 10354 + }, + { + "epoch": 0.87, + "grad_norm": 0.3258412201122716, + "learning_rate": 4.2025387249575333e-07, + "loss": 0.0858, + "step": 10355 + }, + { + "epoch": 0.87, + "grad_norm": 0.3054051857494564, + "learning_rate": 4.197064351959257e-07, + "loss": 0.0979, + "step": 10356 + }, + { + "epoch": 0.87, + "grad_norm": 0.2370452226499411, + "learning_rate": 4.1915933906304497e-07, + "loss": 0.0617, + "step": 10357 + }, + { + "epoch": 0.87, + "grad_norm": 0.4369980372747044, + "learning_rate": 4.186125841378613e-07, + "loss": 0.0947, + "step": 10358 + }, + { + "epoch": 0.87, + "grad_norm": 0.22333325226976525, + "learning_rate": 4.180661704611022e-07, + "loss": 0.0758, + "step": 10359 + }, + { + "epoch": 0.87, + "grad_norm": 0.3566997906528589, + "learning_rate": 4.1752009807346617e-07, + "loss": 0.1123, + "step": 10360 + }, + { + "epoch": 0.87, + "grad_norm": 0.34485280690394426, + "learning_rate": 4.169743670156279e-07, + "loss": 0.0689, + "step": 10361 + }, + { + "epoch": 0.87, + "grad_norm": 0.8119810328765202, + "learning_rate": 4.164289773282365e-07, + "loss": 0.1581, + "step": 10362 + }, + { + "epoch": 0.87, + "grad_norm": 0.4982987666757085, + "learning_rate": 4.1588392905191624e-07, + "loss": 0.0827, + "step": 10363 + }, + { + "epoch": 0.87, + "grad_norm": 0.6304857869085873, + "learning_rate": 4.153392222272662e-07, + "loss": 0.0779, + "step": 10364 + }, + { + "epoch": 0.87, + "grad_norm": 0.3363888844140371, + "learning_rate": 4.1479485689485734e-07, + "loss": 0.0907, + "step": 10365 + }, + { + "epoch": 0.87, + "grad_norm": 0.33172137610364805, + "learning_rate": 4.1425083309523885e-07, + "loss": 0.0911, + "step": 10366 + }, + { + "epoch": 0.87, + "grad_norm": 0.9081140970498471, + "learning_rate": 4.137071508689328e-07, + "loss": 0.0861, + "step": 10367 + }, + { + "epoch": 0.87, + "grad_norm": 0.32942256797708763, + "learning_rate": 4.131638102564339e-07, + "loss": 0.051, + "step": 10368 + }, + { + "epoch": 0.87, + "grad_norm": 0.5921407601034936, + "learning_rate": 4.126208112982161e-07, + "loss": 0.1118, + "step": 10369 + }, + { + "epoch": 0.87, + "grad_norm": 0.2270469660376063, + "learning_rate": 4.1207815403472353e-07, + "loss": 0.0504, + "step": 10370 + }, + { + "epoch": 0.87, + "grad_norm": 0.3538272142474261, + "learning_rate": 4.1153583850637666e-07, + "loss": 0.1056, + "step": 10371 + }, + { + "epoch": 0.87, + "grad_norm": 0.2508500543248445, + "learning_rate": 4.109938647535694e-07, + "loss": 0.0643, + "step": 10372 + }, + { + "epoch": 0.87, + "grad_norm": 0.28308932111462787, + "learning_rate": 4.1045223281667324e-07, + "loss": 0.0917, + "step": 10373 + }, + { + "epoch": 0.87, + "grad_norm": 0.5064738236898707, + "learning_rate": 4.0991094273603036e-07, + "loss": 0.0449, + "step": 10374 + }, + { + "epoch": 0.87, + "grad_norm": 0.38821339350090184, + "learning_rate": 4.093699945519591e-07, + "loss": 0.1037, + "step": 10375 + }, + { + "epoch": 0.87, + "grad_norm": 0.35831034477868734, + "learning_rate": 4.088293883047539e-07, + "loss": 0.1113, + "step": 10376 + }, + { + "epoch": 0.87, + "grad_norm": 0.19645772333719635, + "learning_rate": 4.0828912403468146e-07, + "loss": 0.0544, + "step": 10377 + }, + { + "epoch": 0.87, + "grad_norm": 0.4092757305235048, + "learning_rate": 4.077492017819834e-07, + "loss": 0.1019, + "step": 10378 + }, + { + "epoch": 0.87, + "grad_norm": 0.38899190655504556, + "learning_rate": 4.0720962158687594e-07, + "loss": 0.1219, + "step": 10379 + }, + { + "epoch": 0.87, + "grad_norm": 0.7656656374359254, + "learning_rate": 4.066703834895519e-07, + "loss": 0.1325, + "step": 10380 + }, + { + "epoch": 0.87, + "grad_norm": 0.2607572446676187, + "learning_rate": 4.0613148753017586e-07, + "loss": 0.0403, + "step": 10381 + }, + { + "epoch": 0.87, + "grad_norm": 0.22854638385844114, + "learning_rate": 4.0559293374888673e-07, + "loss": 0.0698, + "step": 10382 + }, + { + "epoch": 0.87, + "grad_norm": 0.3032568377489978, + "learning_rate": 4.0505472218580146e-07, + "loss": 0.0676, + "step": 10383 + }, + { + "epoch": 0.87, + "grad_norm": 0.3450014623218361, + "learning_rate": 4.0451685288100785e-07, + "loss": 0.0917, + "step": 10384 + }, + { + "epoch": 0.88, + "grad_norm": 0.4098843339799118, + "learning_rate": 4.039793258745689e-07, + "loss": 0.0725, + "step": 10385 + }, + { + "epoch": 0.88, + "grad_norm": 0.3021890462077491, + "learning_rate": 4.0344214120652426e-07, + "loss": 0.0904, + "step": 10386 + }, + { + "epoch": 0.88, + "grad_norm": 0.315233855832079, + "learning_rate": 4.0290529891688637e-07, + "loss": 0.0861, + "step": 10387 + }, + { + "epoch": 0.88, + "grad_norm": 0.38616058761388317, + "learning_rate": 4.0236879904564153e-07, + "loss": 0.0635, + "step": 10388 + }, + { + "epoch": 0.88, + "grad_norm": 0.3209315872051834, + "learning_rate": 4.018326416327506e-07, + "loss": 0.0823, + "step": 10389 + }, + { + "epoch": 0.88, + "grad_norm": 0.18676843195558632, + "learning_rate": 4.0129682671815164e-07, + "loss": 0.0612, + "step": 10390 + }, + { + "epoch": 0.88, + "grad_norm": 0.22725537879387447, + "learning_rate": 4.007613543417549e-07, + "loss": 0.0687, + "step": 10391 + }, + { + "epoch": 0.88, + "grad_norm": 0.2899317988800566, + "learning_rate": 4.0022622454344516e-07, + "loss": 0.0699, + "step": 10392 + }, + { + "epoch": 0.88, + "grad_norm": 0.24421775988496255, + "learning_rate": 3.996914373630806e-07, + "loss": 0.0818, + "step": 10393 + }, + { + "epoch": 0.88, + "grad_norm": 0.21490426332050236, + "learning_rate": 3.991569928404976e-07, + "loss": 0.06, + "step": 10394 + }, + { + "epoch": 0.88, + "grad_norm": 0.48883729935748477, + "learning_rate": 3.986228910155032e-07, + "loss": 0.069, + "step": 10395 + }, + { + "epoch": 0.88, + "grad_norm": 0.35525939724063726, + "learning_rate": 3.980891319278818e-07, + "loss": 0.0804, + "step": 10396 + }, + { + "epoch": 0.88, + "grad_norm": 0.32388192670503696, + "learning_rate": 3.975557156173893e-07, + "loss": 0.0724, + "step": 10397 + }, + { + "epoch": 0.88, + "grad_norm": 0.3914793411707041, + "learning_rate": 3.9702264212375775e-07, + "loss": 0.0655, + "step": 10398 + }, + { + "epoch": 0.88, + "grad_norm": 0.24412864501660192, + "learning_rate": 3.9648991148669493e-07, + "loss": 0.0705, + "step": 10399 + }, + { + "epoch": 0.88, + "grad_norm": 0.2495225936013205, + "learning_rate": 3.959575237458807e-07, + "loss": 0.0809, + "step": 10400 + }, + { + "epoch": 0.88, + "grad_norm": 0.3864750342929013, + "learning_rate": 3.954254789409706e-07, + "loss": 0.0844, + "step": 10401 + }, + { + "epoch": 0.88, + "grad_norm": 0.25403259807413925, + "learning_rate": 3.9489377711159405e-07, + "loss": 0.0671, + "step": 10402 + }, + { + "epoch": 0.88, + "grad_norm": 0.2659199612816149, + "learning_rate": 3.94362418297356e-07, + "loss": 0.0829, + "step": 10403 + }, + { + "epoch": 0.88, + "grad_norm": 0.8307169870072073, + "learning_rate": 3.9383140253783535e-07, + "loss": 0.178, + "step": 10404 + }, + { + "epoch": 0.88, + "grad_norm": 0.6153387894325313, + "learning_rate": 3.933007298725849e-07, + "loss": 0.1184, + "step": 10405 + }, + { + "epoch": 0.88, + "grad_norm": 0.3062710098602297, + "learning_rate": 3.927704003411309e-07, + "loss": 0.0681, + "step": 10406 + }, + { + "epoch": 0.88, + "grad_norm": 0.2611274756209047, + "learning_rate": 3.9224041398297717e-07, + "loss": 0.0863, + "step": 10407 + }, + { + "epoch": 0.88, + "grad_norm": 0.35101871736208623, + "learning_rate": 3.917107708376e-07, + "loss": 0.0832, + "step": 10408 + }, + { + "epoch": 0.88, + "grad_norm": 0.3788759457930212, + "learning_rate": 3.911814709444489e-07, + "loss": 0.0806, + "step": 10409 + }, + { + "epoch": 0.88, + "grad_norm": 0.4995479574979192, + "learning_rate": 3.906525143429513e-07, + "loss": 0.1479, + "step": 10410 + }, + { + "epoch": 0.88, + "grad_norm": 0.2014908255005349, + "learning_rate": 3.901239010725055e-07, + "loss": 0.0474, + "step": 10411 + }, + { + "epoch": 0.88, + "grad_norm": 0.3435530161666291, + "learning_rate": 3.895956311724863e-07, + "loss": 0.112, + "step": 10412 + }, + { + "epoch": 0.88, + "grad_norm": 0.3134306382494912, + "learning_rate": 3.890677046822411e-07, + "loss": 0.0731, + "step": 10413 + }, + { + "epoch": 0.88, + "grad_norm": 0.28343788372598033, + "learning_rate": 3.8854012164109443e-07, + "loss": 0.084, + "step": 10414 + }, + { + "epoch": 0.88, + "grad_norm": 0.26248043903083584, + "learning_rate": 3.8801288208834386e-07, + "loss": 0.058, + "step": 10415 + }, + { + "epoch": 0.88, + "grad_norm": 0.36292430607520376, + "learning_rate": 3.874859860632596e-07, + "loss": 0.079, + "step": 10416 + }, + { + "epoch": 0.88, + "grad_norm": 0.22181257518377587, + "learning_rate": 3.869594336050897e-07, + "loss": 0.074, + "step": 10417 + }, + { + "epoch": 0.88, + "grad_norm": 0.6261155179138623, + "learning_rate": 3.864332247530539e-07, + "loss": 0.1016, + "step": 10418 + }, + { + "epoch": 0.88, + "grad_norm": 0.2375704756094497, + "learning_rate": 3.8590735954634694e-07, + "loss": 0.066, + "step": 10419 + }, + { + "epoch": 0.88, + "grad_norm": 0.42351872738735774, + "learning_rate": 3.853818380241403e-07, + "loss": 0.0707, + "step": 10420 + }, + { + "epoch": 0.88, + "grad_norm": 0.6119375908436636, + "learning_rate": 3.84856660225576e-07, + "loss": 0.0901, + "step": 10421 + }, + { + "epoch": 0.88, + "grad_norm": 0.20252568521423933, + "learning_rate": 3.843318261897727e-07, + "loss": 0.054, + "step": 10422 + }, + { + "epoch": 0.88, + "grad_norm": 0.3367252189959483, + "learning_rate": 3.8380733595582254e-07, + "loss": 0.0777, + "step": 10423 + }, + { + "epoch": 0.88, + "grad_norm": 0.1790652678058368, + "learning_rate": 3.832831895627942e-07, + "loss": 0.0544, + "step": 10424 + }, + { + "epoch": 0.88, + "grad_norm": 0.23785935808743974, + "learning_rate": 3.8275938704972816e-07, + "loss": 0.0695, + "step": 10425 + }, + { + "epoch": 0.88, + "grad_norm": 0.23772999168644451, + "learning_rate": 3.8223592845563984e-07, + "loss": 0.0688, + "step": 10426 + }, + { + "epoch": 0.88, + "grad_norm": 0.29237650213745126, + "learning_rate": 3.817128138195203e-07, + "loss": 0.0436, + "step": 10427 + }, + { + "epoch": 0.88, + "grad_norm": 0.3648829015912419, + "learning_rate": 3.8119004318033446e-07, + "loss": 0.0587, + "step": 10428 + }, + { + "epoch": 0.88, + "grad_norm": 0.26562577490611833, + "learning_rate": 3.8066761657702067e-07, + "loss": 0.0821, + "step": 10429 + }, + { + "epoch": 0.88, + "grad_norm": 0.2520389770112586, + "learning_rate": 3.8014553404849107e-07, + "loss": 0.0659, + "step": 10430 + }, + { + "epoch": 0.88, + "grad_norm": 0.5312096551999453, + "learning_rate": 3.796237956336363e-07, + "loss": 0.0797, + "step": 10431 + }, + { + "epoch": 0.88, + "grad_norm": 0.3062944973342576, + "learning_rate": 3.7910240137131624e-07, + "loss": 0.0536, + "step": 10432 + }, + { + "epoch": 0.88, + "grad_norm": 0.3131665355180927, + "learning_rate": 3.7858135130036775e-07, + "loss": 0.098, + "step": 10433 + }, + { + "epoch": 0.88, + "grad_norm": 0.4318152023368978, + "learning_rate": 3.780606454596025e-07, + "loss": 0.0587, + "step": 10434 + }, + { + "epoch": 0.88, + "grad_norm": 0.3501077846583156, + "learning_rate": 3.7754028388780504e-07, + "loss": 0.0875, + "step": 10435 + }, + { + "epoch": 0.88, + "grad_norm": 0.45633262914274236, + "learning_rate": 3.7702026662373493e-07, + "loss": 0.085, + "step": 10436 + }, + { + "epoch": 0.88, + "grad_norm": 0.2964466053852528, + "learning_rate": 3.765005937061261e-07, + "loss": 0.0657, + "step": 10437 + }, + { + "epoch": 0.88, + "grad_norm": 0.1380071861219292, + "learning_rate": 3.759812651736877e-07, + "loss": 0.02, + "step": 10438 + }, + { + "epoch": 0.88, + "grad_norm": 0.32894179408353524, + "learning_rate": 3.7546228106510096e-07, + "loss": 0.0752, + "step": 10439 + }, + { + "epoch": 0.88, + "grad_norm": 0.27178268975586095, + "learning_rate": 3.749436414190227e-07, + "loss": 0.0448, + "step": 10440 + }, + { + "epoch": 0.88, + "grad_norm": 0.33157059734022054, + "learning_rate": 3.744253462740866e-07, + "loss": 0.0619, + "step": 10441 + }, + { + "epoch": 0.88, + "grad_norm": 0.3311711518329391, + "learning_rate": 3.73907395668896e-07, + "loss": 0.0767, + "step": 10442 + }, + { + "epoch": 0.88, + "grad_norm": 0.43539908643047226, + "learning_rate": 3.7338978964203076e-07, + "loss": 0.0914, + "step": 10443 + }, + { + "epoch": 0.88, + "grad_norm": 0.39211911672779165, + "learning_rate": 3.728725282320467e-07, + "loss": 0.0861, + "step": 10444 + }, + { + "epoch": 0.88, + "grad_norm": 0.5349911915057601, + "learning_rate": 3.723556114774718e-07, + "loss": 0.0887, + "step": 10445 + }, + { + "epoch": 0.88, + "grad_norm": 0.1502168439262821, + "learning_rate": 3.718390394168092e-07, + "loss": 0.0468, + "step": 10446 + }, + { + "epoch": 0.88, + "grad_norm": 0.42889618283909337, + "learning_rate": 3.713228120885348e-07, + "loss": 0.0961, + "step": 10447 + }, + { + "epoch": 0.88, + "grad_norm": 0.4453911928834456, + "learning_rate": 3.708069295311023e-07, + "loss": 0.0668, + "step": 10448 + }, + { + "epoch": 0.88, + "grad_norm": 0.2679866598277611, + "learning_rate": 3.7029139178293704e-07, + "loss": 0.0554, + "step": 10449 + }, + { + "epoch": 0.88, + "grad_norm": 0.3779248824586545, + "learning_rate": 3.6977619888243833e-07, + "loss": 0.0999, + "step": 10450 + }, + { + "epoch": 0.88, + "grad_norm": 0.20518766866395668, + "learning_rate": 3.6926135086798154e-07, + "loss": 0.0488, + "step": 10451 + }, + { + "epoch": 0.88, + "grad_norm": 0.7027321346839837, + "learning_rate": 3.68746847777916e-07, + "loss": 0.1048, + "step": 10452 + }, + { + "epoch": 0.88, + "grad_norm": 0.25032200553723244, + "learning_rate": 3.682326896505628e-07, + "loss": 0.0725, + "step": 10453 + }, + { + "epoch": 0.88, + "grad_norm": 0.44088445575936414, + "learning_rate": 3.6771887652422235e-07, + "loss": 0.1062, + "step": 10454 + }, + { + "epoch": 0.88, + "grad_norm": 0.2891506316173497, + "learning_rate": 3.672054084371646e-07, + "loss": 0.0882, + "step": 10455 + }, + { + "epoch": 0.88, + "grad_norm": 0.32008525694190726, + "learning_rate": 3.666922854276361e-07, + "loss": 0.0788, + "step": 10456 + }, + { + "epoch": 0.88, + "grad_norm": 0.4521141391172543, + "learning_rate": 3.6617950753385646e-07, + "loss": 0.1224, + "step": 10457 + }, + { + "epoch": 0.88, + "grad_norm": 0.30370150536975143, + "learning_rate": 3.656670747940222e-07, + "loss": 0.0821, + "step": 10458 + }, + { + "epoch": 0.88, + "grad_norm": 0.3951520324096447, + "learning_rate": 3.651549872463006e-07, + "loss": 0.0902, + "step": 10459 + }, + { + "epoch": 0.88, + "grad_norm": 0.43360259034236925, + "learning_rate": 3.646432449288362e-07, + "loss": 0.0927, + "step": 10460 + }, + { + "epoch": 0.88, + "grad_norm": 0.5755670973663171, + "learning_rate": 3.6413184787974456e-07, + "loss": 0.1544, + "step": 10461 + }, + { + "epoch": 0.88, + "grad_norm": 0.32136463616518385, + "learning_rate": 3.6362079613711965e-07, + "loss": 0.0803, + "step": 10462 + }, + { + "epoch": 0.88, + "grad_norm": 0.35873482886870334, + "learning_rate": 3.6311008973902717e-07, + "loss": 0.092, + "step": 10463 + }, + { + "epoch": 0.88, + "grad_norm": 0.2670205864331704, + "learning_rate": 3.625997287235067e-07, + "loss": 0.0741, + "step": 10464 + }, + { + "epoch": 0.88, + "grad_norm": 0.1892564032663384, + "learning_rate": 3.6208971312857335e-07, + "loss": 0.0667, + "step": 10465 + }, + { + "epoch": 0.88, + "grad_norm": 0.24514540197921247, + "learning_rate": 3.6158004299221506e-07, + "loss": 0.0649, + "step": 10466 + }, + { + "epoch": 0.88, + "grad_norm": 0.24853166695287798, + "learning_rate": 3.6107071835239705e-07, + "loss": 0.0709, + "step": 10467 + }, + { + "epoch": 0.88, + "grad_norm": 0.3328433017063389, + "learning_rate": 3.6056173924705564e-07, + "loss": 0.0678, + "step": 10468 + }, + { + "epoch": 0.88, + "grad_norm": 0.30700522231253863, + "learning_rate": 3.600531057141021e-07, + "loss": 0.096, + "step": 10469 + }, + { + "epoch": 0.88, + "grad_norm": 0.4421505798983173, + "learning_rate": 3.595448177914229e-07, + "loss": 0.127, + "step": 10470 + }, + { + "epoch": 0.88, + "grad_norm": 0.6520325413532139, + "learning_rate": 3.5903687551687884e-07, + "loss": 0.0838, + "step": 10471 + }, + { + "epoch": 0.88, + "grad_norm": 0.2030842161751872, + "learning_rate": 3.585292789283035e-07, + "loss": 0.052, + "step": 10472 + }, + { + "epoch": 0.88, + "grad_norm": 0.24212380855308172, + "learning_rate": 3.5802202806350664e-07, + "loss": 0.0678, + "step": 10473 + }, + { + "epoch": 0.88, + "grad_norm": 0.3263595315933203, + "learning_rate": 3.575151229602697e-07, + "loss": 0.0893, + "step": 10474 + }, + { + "epoch": 0.88, + "grad_norm": 0.352126983671568, + "learning_rate": 3.570085636563514e-07, + "loss": 0.1094, + "step": 10475 + }, + { + "epoch": 0.88, + "grad_norm": 0.3658986824898356, + "learning_rate": 3.5650235018948264e-07, + "loss": 0.0741, + "step": 10476 + }, + { + "epoch": 0.88, + "grad_norm": 0.26298437411133213, + "learning_rate": 3.5599648259736883e-07, + "loss": 0.0589, + "step": 10477 + }, + { + "epoch": 0.88, + "grad_norm": 0.2521789956647028, + "learning_rate": 3.5549096091769096e-07, + "loss": 0.0919, + "step": 10478 + }, + { + "epoch": 0.88, + "grad_norm": 0.40582955334202075, + "learning_rate": 3.5498578518810276e-07, + "loss": 0.1223, + "step": 10479 + }, + { + "epoch": 0.88, + "grad_norm": 0.18308185199039048, + "learning_rate": 3.5448095544623194e-07, + "loss": 0.047, + "step": 10480 + }, + { + "epoch": 0.88, + "grad_norm": 0.4666904672104624, + "learning_rate": 3.539764717296812e-07, + "loss": 0.1083, + "step": 10481 + }, + { + "epoch": 0.88, + "grad_norm": 0.3950306175657269, + "learning_rate": 3.534723340760288e-07, + "loss": 0.0806, + "step": 10482 + }, + { + "epoch": 0.88, + "grad_norm": 0.3391062142550214, + "learning_rate": 3.529685425228252e-07, + "loss": 0.0898, + "step": 10483 + }, + { + "epoch": 0.88, + "grad_norm": 0.37638710972000183, + "learning_rate": 3.5246509710759434e-07, + "loss": 0.0547, + "step": 10484 + }, + { + "epoch": 0.88, + "grad_norm": 0.17248565962104662, + "learning_rate": 3.519619978678379e-07, + "loss": 0.0406, + "step": 10485 + }, + { + "epoch": 0.88, + "grad_norm": 0.25024692474359556, + "learning_rate": 3.5145924484102866e-07, + "loss": 0.0745, + "step": 10486 + }, + { + "epoch": 0.88, + "grad_norm": 0.33672913772656693, + "learning_rate": 3.50956838064615e-07, + "loss": 0.072, + "step": 10487 + }, + { + "epoch": 0.88, + "grad_norm": 0.47644391097858924, + "learning_rate": 3.5045477757601754e-07, + "loss": 0.0965, + "step": 10488 + }, + { + "epoch": 0.88, + "grad_norm": 0.26358756306344006, + "learning_rate": 3.4995306341263414e-07, + "loss": 0.0537, + "step": 10489 + }, + { + "epoch": 0.88, + "grad_norm": 0.21333784941205572, + "learning_rate": 3.49451695611836e-07, + "loss": 0.0794, + "step": 10490 + }, + { + "epoch": 0.88, + "grad_norm": 0.3468605773610321, + "learning_rate": 3.489506742109655e-07, + "loss": 0.1292, + "step": 10491 + }, + { + "epoch": 0.88, + "grad_norm": 0.2532132240303655, + "learning_rate": 3.4844999924734444e-07, + "loss": 0.0549, + "step": 10492 + }, + { + "epoch": 0.88, + "grad_norm": 0.33763460805989276, + "learning_rate": 3.479496707582647e-07, + "loss": 0.0646, + "step": 10493 + }, + { + "epoch": 0.88, + "grad_norm": 0.47471329047445865, + "learning_rate": 3.474496887809925e-07, + "loss": 0.0944, + "step": 10494 + }, + { + "epoch": 0.88, + "grad_norm": 0.5184970627391564, + "learning_rate": 3.4695005335277196e-07, + "loss": 0.0927, + "step": 10495 + }, + { + "epoch": 0.88, + "grad_norm": 0.5482564924865132, + "learning_rate": 3.464507645108167e-07, + "loss": 0.1072, + "step": 10496 + }, + { + "epoch": 0.88, + "grad_norm": 0.26146433915154516, + "learning_rate": 3.4595182229231805e-07, + "loss": 0.0765, + "step": 10497 + }, + { + "epoch": 0.88, + "grad_norm": 0.2833952828881233, + "learning_rate": 3.4545322673443846e-07, + "loss": 0.06, + "step": 10498 + }, + { + "epoch": 0.88, + "grad_norm": 0.25964675330088915, + "learning_rate": 3.449549778743183e-07, + "loss": 0.0904, + "step": 10499 + }, + { + "epoch": 0.88, + "grad_norm": 0.24525051661574676, + "learning_rate": 3.4445707574906896e-07, + "loss": 0.0665, + "step": 10500 + }, + { + "epoch": 0.88, + "grad_norm": 0.28054263243408406, + "learning_rate": 3.439595203957763e-07, + "loss": 0.0832, + "step": 10501 + }, + { + "epoch": 0.88, + "grad_norm": 0.2846810692005072, + "learning_rate": 3.4346231185150237e-07, + "loss": 0.0796, + "step": 10502 + }, + { + "epoch": 0.88, + "grad_norm": 0.406924815608045, + "learning_rate": 3.4296545015328254e-07, + "loss": 0.0666, + "step": 10503 + }, + { + "epoch": 0.89, + "grad_norm": 0.3325068894870213, + "learning_rate": 3.42468935338125e-07, + "loss": 0.1097, + "step": 10504 + }, + { + "epoch": 0.89, + "grad_norm": 0.37001391262801203, + "learning_rate": 3.4197276744301233e-07, + "loss": 0.0703, + "step": 10505 + }, + { + "epoch": 0.89, + "grad_norm": 0.4522199526074526, + "learning_rate": 3.4147694650490337e-07, + "loss": 0.0744, + "step": 10506 + }, + { + "epoch": 0.89, + "grad_norm": 0.3460647939827886, + "learning_rate": 3.409814725607302e-07, + "loss": 0.0583, + "step": 10507 + }, + { + "epoch": 0.89, + "grad_norm": 0.18811806538613207, + "learning_rate": 3.4048634564739615e-07, + "loss": 0.0569, + "step": 10508 + }, + { + "epoch": 0.89, + "grad_norm": 0.3294954360897352, + "learning_rate": 3.3999156580178384e-07, + "loss": 0.0894, + "step": 10509 + }, + { + "epoch": 0.89, + "grad_norm": 0.27252655235928647, + "learning_rate": 3.3949713306074663e-07, + "loss": 0.0613, + "step": 10510 + }, + { + "epoch": 0.89, + "grad_norm": 0.33323910518383537, + "learning_rate": 3.390030474611111e-07, + "loss": 0.0759, + "step": 10511 + }, + { + "epoch": 0.89, + "grad_norm": 0.20062127867840693, + "learning_rate": 3.385093090396818e-07, + "loss": 0.0528, + "step": 10512 + }, + { + "epoch": 0.89, + "grad_norm": 0.3390222959342926, + "learning_rate": 3.3801591783323427e-07, + "loss": 0.0677, + "step": 10513 + }, + { + "epoch": 0.89, + "grad_norm": 0.2473870775536169, + "learning_rate": 3.3752287387851966e-07, + "loss": 0.0448, + "step": 10514 + }, + { + "epoch": 0.89, + "grad_norm": 0.34236027484224807, + "learning_rate": 3.370301772122614e-07, + "loss": 0.0825, + "step": 10515 + }, + { + "epoch": 0.89, + "grad_norm": 0.3583592990098601, + "learning_rate": 3.3653782787116007e-07, + "loss": 0.0979, + "step": 10516 + }, + { + "epoch": 0.89, + "grad_norm": 0.31999527877070755, + "learning_rate": 3.3604582589188804e-07, + "loss": 0.0758, + "step": 10517 + }, + { + "epoch": 0.89, + "grad_norm": 0.25868058460947924, + "learning_rate": 3.355541713110916e-07, + "loss": 0.0526, + "step": 10518 + }, + { + "epoch": 0.89, + "grad_norm": 0.30531092742352656, + "learning_rate": 3.3506286416539356e-07, + "loss": 0.0913, + "step": 10519 + }, + { + "epoch": 0.89, + "grad_norm": 0.2514835573253783, + "learning_rate": 3.345719044913887e-07, + "loss": 0.0589, + "step": 10520 + }, + { + "epoch": 0.89, + "grad_norm": 0.2554346321368242, + "learning_rate": 3.3408129232564643e-07, + "loss": 0.0941, + "step": 10521 + }, + { + "epoch": 0.89, + "grad_norm": 0.42332144022705254, + "learning_rate": 3.3359102770470995e-07, + "loss": 0.0787, + "step": 10522 + }, + { + "epoch": 0.89, + "grad_norm": 0.25057039178563384, + "learning_rate": 3.331011106650983e-07, + "loss": 0.0734, + "step": 10523 + }, + { + "epoch": 0.89, + "grad_norm": 0.22888472576557964, + "learning_rate": 3.326115412433023e-07, + "loss": 0.0696, + "step": 10524 + }, + { + "epoch": 0.89, + "grad_norm": 0.2837018169304391, + "learning_rate": 3.3212231947578776e-07, + "loss": 0.0596, + "step": 10525 + }, + { + "epoch": 0.89, + "grad_norm": 0.33671498312772125, + "learning_rate": 3.316334453989961e-07, + "loss": 0.0938, + "step": 10526 + }, + { + "epoch": 0.89, + "grad_norm": 0.19046463705906086, + "learning_rate": 3.31144919049341e-07, + "loss": 0.0367, + "step": 10527 + }, + { + "epoch": 0.89, + "grad_norm": 0.26463219555433937, + "learning_rate": 3.306567404632099e-07, + "loss": 0.0739, + "step": 10528 + }, + { + "epoch": 0.89, + "grad_norm": 0.4454770489519749, + "learning_rate": 3.3016890967696505e-07, + "loss": 0.0548, + "step": 10529 + }, + { + "epoch": 0.89, + "grad_norm": 0.31359564436429543, + "learning_rate": 3.296814267269444e-07, + "loss": 0.1005, + "step": 10530 + }, + { + "epoch": 0.89, + "grad_norm": 0.3273724667288739, + "learning_rate": 3.2919429164945727e-07, + "loss": 0.087, + "step": 10531 + }, + { + "epoch": 0.89, + "grad_norm": 0.4518487269253093, + "learning_rate": 3.287075044807891e-07, + "loss": 0.0822, + "step": 10532 + }, + { + "epoch": 0.89, + "grad_norm": 0.29733008655387794, + "learning_rate": 3.2822106525719876e-07, + "loss": 0.0719, + "step": 10533 + }, + { + "epoch": 0.89, + "grad_norm": 0.28975007308871087, + "learning_rate": 3.277349740149172e-07, + "loss": 0.0764, + "step": 10534 + }, + { + "epoch": 0.89, + "grad_norm": 0.15990816976795472, + "learning_rate": 3.272492307901537e-07, + "loss": 0.0346, + "step": 10535 + }, + { + "epoch": 0.89, + "grad_norm": 0.3947060466676695, + "learning_rate": 3.267638356190883e-07, + "loss": 0.0663, + "step": 10536 + }, + { + "epoch": 0.89, + "grad_norm": 0.584502766580984, + "learning_rate": 3.2627878853787596e-07, + "loss": 0.0748, + "step": 10537 + }, + { + "epoch": 0.89, + "grad_norm": 0.2926138660466728, + "learning_rate": 3.2579408958264555e-07, + "loss": 0.0641, + "step": 10538 + }, + { + "epoch": 0.89, + "grad_norm": 0.3054040774427007, + "learning_rate": 3.25309738789501e-07, + "loss": 0.1105, + "step": 10539 + }, + { + "epoch": 0.89, + "grad_norm": 0.4410149104236629, + "learning_rate": 3.2482573619451897e-07, + "loss": 0.1148, + "step": 10540 + }, + { + "epoch": 0.89, + "grad_norm": 0.5433760761701039, + "learning_rate": 3.2434208183375114e-07, + "loss": 0.1131, + "step": 10541 + }, + { + "epoch": 0.89, + "grad_norm": 0.49555734878324664, + "learning_rate": 3.238587757432221e-07, + "loss": 0.0833, + "step": 10542 + }, + { + "epoch": 0.89, + "grad_norm": 0.2662690571406797, + "learning_rate": 3.2337581795893304e-07, + "loss": 0.0708, + "step": 10543 + }, + { + "epoch": 0.89, + "grad_norm": 0.33803000823296375, + "learning_rate": 3.228932085168562e-07, + "loss": 0.077, + "step": 10544 + }, + { + "epoch": 0.89, + "grad_norm": 0.4042093060129459, + "learning_rate": 3.2241094745293857e-07, + "loss": 0.0862, + "step": 10545 + }, + { + "epoch": 0.89, + "grad_norm": 0.2393566917912546, + "learning_rate": 3.2192903480310287e-07, + "loss": 0.0778, + "step": 10546 + }, + { + "epoch": 0.89, + "grad_norm": 0.3668246107966983, + "learning_rate": 3.21447470603245e-07, + "loss": 0.08, + "step": 10547 + }, + { + "epoch": 0.89, + "grad_norm": 0.34951635990558216, + "learning_rate": 3.2096625488923394e-07, + "loss": 0.0836, + "step": 10548 + }, + { + "epoch": 0.89, + "grad_norm": 0.35209366397886566, + "learning_rate": 3.2048538769691274e-07, + "loss": 0.1049, + "step": 10549 + }, + { + "epoch": 0.89, + "grad_norm": 0.678524513533921, + "learning_rate": 3.2000486906210105e-07, + "loss": 0.1071, + "step": 10550 + }, + { + "epoch": 0.89, + "grad_norm": 0.2741064983489201, + "learning_rate": 3.1952469902058967e-07, + "loss": 0.0708, + "step": 10551 + }, + { + "epoch": 0.89, + "grad_norm": 0.288369291183368, + "learning_rate": 3.190448776081434e-07, + "loss": 0.0771, + "step": 10552 + }, + { + "epoch": 0.89, + "grad_norm": 0.37482059852323857, + "learning_rate": 3.1856540486050414e-07, + "loss": 0.0979, + "step": 10553 + }, + { + "epoch": 0.89, + "grad_norm": 0.30145012252113956, + "learning_rate": 3.18086280813385e-07, + "loss": 0.0708, + "step": 10554 + }, + { + "epoch": 0.89, + "grad_norm": 0.3064269380223725, + "learning_rate": 3.176075055024741e-07, + "loss": 0.0834, + "step": 10555 + }, + { + "epoch": 0.89, + "grad_norm": 0.26874681344461454, + "learning_rate": 3.171290789634318e-07, + "loss": 0.0674, + "step": 10556 + }, + { + "epoch": 0.89, + "grad_norm": 0.4059155035424254, + "learning_rate": 3.166510012318963e-07, + "loss": 0.0843, + "step": 10557 + }, + { + "epoch": 0.89, + "grad_norm": 0.30266201892715733, + "learning_rate": 3.1617327234347685e-07, + "loss": 0.0751, + "step": 10558 + }, + { + "epoch": 0.89, + "grad_norm": 0.27986848896118965, + "learning_rate": 3.156958923337572e-07, + "loss": 0.0642, + "step": 10559 + }, + { + "epoch": 0.89, + "grad_norm": 0.4828619497745637, + "learning_rate": 3.152188612382956e-07, + "loss": 0.1315, + "step": 10560 + }, + { + "epoch": 0.89, + "grad_norm": 0.4761244133583572, + "learning_rate": 3.1474217909262415e-07, + "loss": 0.133, + "step": 10561 + }, + { + "epoch": 0.89, + "grad_norm": 0.3280095053415452, + "learning_rate": 3.142658459322484e-07, + "loss": 0.074, + "step": 10562 + }, + { + "epoch": 0.89, + "grad_norm": 0.3092142711676505, + "learning_rate": 3.137898617926494e-07, + "loss": 0.1016, + "step": 10563 + }, + { + "epoch": 0.89, + "grad_norm": 0.4611464566894081, + "learning_rate": 3.133142267092809e-07, + "loss": 0.142, + "step": 10564 + }, + { + "epoch": 0.89, + "grad_norm": 0.4871804063392949, + "learning_rate": 3.1283894071757027e-07, + "loss": 0.0952, + "step": 10565 + }, + { + "epoch": 0.89, + "grad_norm": 0.26152097126464496, + "learning_rate": 3.1236400385291966e-07, + "loss": 0.0602, + "step": 10566 + }, + { + "epoch": 0.89, + "grad_norm": 0.27470637997284963, + "learning_rate": 3.1188941615070635e-07, + "loss": 0.0848, + "step": 10567 + }, + { + "epoch": 0.89, + "grad_norm": 0.6272577386750017, + "learning_rate": 3.1141517764627983e-07, + "loss": 0.0734, + "step": 10568 + }, + { + "epoch": 0.89, + "grad_norm": 0.30903804672318314, + "learning_rate": 3.10941288374963e-07, + "loss": 0.0534, + "step": 10569 + }, + { + "epoch": 0.89, + "grad_norm": 0.702414945729997, + "learning_rate": 3.104677483720553e-07, + "loss": 0.0815, + "step": 10570 + }, + { + "epoch": 0.89, + "grad_norm": 0.3330389424097082, + "learning_rate": 3.09994557672828e-07, + "loss": 0.0719, + "step": 10571 + }, + { + "epoch": 0.89, + "grad_norm": 0.3240480399560202, + "learning_rate": 3.09521716312528e-07, + "loss": 0.0727, + "step": 10572 + }, + { + "epoch": 0.89, + "grad_norm": 0.3895906169163102, + "learning_rate": 3.0904922432637373e-07, + "loss": 0.0893, + "step": 10573 + }, + { + "epoch": 0.89, + "grad_norm": 0.1592286078402547, + "learning_rate": 3.0857708174956034e-07, + "loss": 0.0506, + "step": 10574 + }, + { + "epoch": 0.89, + "grad_norm": 0.2580433824725031, + "learning_rate": 3.0810528861725586e-07, + "loss": 0.0389, + "step": 10575 + }, + { + "epoch": 0.89, + "grad_norm": 0.6450703521510163, + "learning_rate": 3.0763384496460104e-07, + "loss": 0.1391, + "step": 10576 + }, + { + "epoch": 0.89, + "grad_norm": 0.5987838311500814, + "learning_rate": 3.0716275082671276e-07, + "loss": 0.1338, + "step": 10577 + }, + { + "epoch": 0.89, + "grad_norm": 0.42137137186709683, + "learning_rate": 3.0669200623868136e-07, + "loss": 0.1084, + "step": 10578 + }, + { + "epoch": 0.89, + "grad_norm": 0.26046920342369423, + "learning_rate": 3.062216112355687e-07, + "loss": 0.0508, + "step": 10579 + }, + { + "epoch": 0.89, + "grad_norm": 0.22880540521923015, + "learning_rate": 3.0575156585241463e-07, + "loss": 0.0431, + "step": 10580 + }, + { + "epoch": 0.89, + "grad_norm": 0.45656405730575056, + "learning_rate": 3.052818701242294e-07, + "loss": 0.0759, + "step": 10581 + }, + { + "epoch": 0.89, + "grad_norm": 0.42194495348293215, + "learning_rate": 3.0481252408600003e-07, + "loss": 0.1288, + "step": 10582 + }, + { + "epoch": 0.89, + "grad_norm": 0.32962733916510273, + "learning_rate": 3.043435277726842e-07, + "loss": 0.0695, + "step": 10583 + }, + { + "epoch": 0.89, + "grad_norm": 0.3801706104726801, + "learning_rate": 3.0387488121921716e-07, + "loss": 0.1055, + "step": 10584 + }, + { + "epoch": 0.89, + "grad_norm": 0.19630267906230492, + "learning_rate": 3.0340658446050665e-07, + "loss": 0.0407, + "step": 10585 + }, + { + "epoch": 0.89, + "grad_norm": 0.3308721668865176, + "learning_rate": 3.0293863753143193e-07, + "loss": 0.1011, + "step": 10586 + }, + { + "epoch": 0.89, + "grad_norm": 0.4662723227989085, + "learning_rate": 3.0247104046685073e-07, + "loss": 0.0999, + "step": 10587 + }, + { + "epoch": 0.89, + "grad_norm": 0.31103857353863984, + "learning_rate": 3.0200379330159177e-07, + "loss": 0.09, + "step": 10588 + }, + { + "epoch": 0.89, + "grad_norm": 0.4577575370164702, + "learning_rate": 3.015368960704584e-07, + "loss": 0.0968, + "step": 10589 + }, + { + "epoch": 0.89, + "grad_norm": 0.2570775377556307, + "learning_rate": 3.0107034880822674e-07, + "loss": 0.0781, + "step": 10590 + }, + { + "epoch": 0.89, + "grad_norm": 0.34312737924216574, + "learning_rate": 3.0060415154964937e-07, + "loss": 0.0818, + "step": 10591 + }, + { + "epoch": 0.89, + "grad_norm": 0.14871748548183075, + "learning_rate": 3.0013830432945037e-07, + "loss": 0.045, + "step": 10592 + }, + { + "epoch": 0.89, + "grad_norm": 0.3386365312809842, + "learning_rate": 2.9967280718232903e-07, + "loss": 0.1087, + "step": 10593 + }, + { + "epoch": 0.89, + "grad_norm": 0.33828358721708823, + "learning_rate": 2.9920766014295886e-07, + "loss": 0.0746, + "step": 10594 + }, + { + "epoch": 0.89, + "grad_norm": 0.2738901649331908, + "learning_rate": 2.9874286324598657e-07, + "loss": 0.0482, + "step": 10595 + }, + { + "epoch": 0.89, + "grad_norm": 0.2712945454395805, + "learning_rate": 2.9827841652603273e-07, + "loss": 0.0851, + "step": 10596 + }, + { + "epoch": 0.89, + "grad_norm": 0.31652130956199204, + "learning_rate": 2.9781432001769086e-07, + "loss": 0.1016, + "step": 10597 + }, + { + "epoch": 0.89, + "grad_norm": 0.3625633838994426, + "learning_rate": 2.9735057375553154e-07, + "loss": 0.0592, + "step": 10598 + }, + { + "epoch": 0.89, + "grad_norm": 0.23771259681559842, + "learning_rate": 2.968871777740967e-07, + "loss": 0.0363, + "step": 10599 + }, + { + "epoch": 0.89, + "grad_norm": 0.3101318905081017, + "learning_rate": 2.9642413210790253e-07, + "loss": 0.0695, + "step": 10600 + }, + { + "epoch": 0.89, + "grad_norm": 0.41914793798237976, + "learning_rate": 2.959614367914387e-07, + "loss": 0.1039, + "step": 10601 + }, + { + "epoch": 0.89, + "grad_norm": 0.24704702339604917, + "learning_rate": 2.9549909185916993e-07, + "loss": 0.0674, + "step": 10602 + }, + { + "epoch": 0.89, + "grad_norm": 0.3022853090130656, + "learning_rate": 2.950370973455352e-07, + "loss": 0.0835, + "step": 10603 + }, + { + "epoch": 0.89, + "grad_norm": 0.22099288861972136, + "learning_rate": 2.9457545328494596e-07, + "loss": 0.0634, + "step": 10604 + }, + { + "epoch": 0.89, + "grad_norm": 0.26939161956845403, + "learning_rate": 2.94114159711788e-07, + "loss": 0.0413, + "step": 10605 + }, + { + "epoch": 0.89, + "grad_norm": 0.3837592934915364, + "learning_rate": 2.936532166604206e-07, + "loss": 0.1, + "step": 10606 + }, + { + "epoch": 0.89, + "grad_norm": 0.36370660818135603, + "learning_rate": 2.931926241651789e-07, + "loss": 0.0913, + "step": 10607 + }, + { + "epoch": 0.89, + "grad_norm": 0.33039888658643185, + "learning_rate": 2.9273238226036947e-07, + "loss": 0.0942, + "step": 10608 + }, + { + "epoch": 0.89, + "grad_norm": 0.31153450001509664, + "learning_rate": 2.9227249098027434e-07, + "loss": 0.0571, + "step": 10609 + }, + { + "epoch": 0.89, + "grad_norm": 0.18881085738850661, + "learning_rate": 2.918129503591477e-07, + "loss": 0.0398, + "step": 10610 + }, + { + "epoch": 0.89, + "grad_norm": 0.3336975147296537, + "learning_rate": 2.9135376043122056e-07, + "loss": 0.0896, + "step": 10611 + }, + { + "epoch": 0.89, + "grad_norm": 0.1571289423943912, + "learning_rate": 2.9089492123069497e-07, + "loss": 0.0432, + "step": 10612 + }, + { + "epoch": 0.89, + "grad_norm": 0.2689176169190801, + "learning_rate": 2.904364327917486e-07, + "loss": 0.0602, + "step": 10613 + }, + { + "epoch": 0.89, + "grad_norm": 0.32584866666490875, + "learning_rate": 2.8997829514853137e-07, + "loss": 0.0491, + "step": 10614 + }, + { + "epoch": 0.89, + "grad_norm": 0.3557088237261066, + "learning_rate": 2.8952050833516877e-07, + "loss": 0.0562, + "step": 10615 + }, + { + "epoch": 0.89, + "grad_norm": 0.30025664800341817, + "learning_rate": 2.890630723857596e-07, + "loss": 0.0413, + "step": 10616 + }, + { + "epoch": 0.89, + "grad_norm": 0.2457728564681372, + "learning_rate": 2.886059873343755e-07, + "loss": 0.078, + "step": 10617 + }, + { + "epoch": 0.89, + "grad_norm": 0.4243062270469183, + "learning_rate": 2.881492532150637e-07, + "loss": 0.0539, + "step": 10618 + }, + { + "epoch": 0.89, + "grad_norm": 0.24500580977027087, + "learning_rate": 2.876928700618442e-07, + "loss": 0.0627, + "step": 10619 + }, + { + "epoch": 0.89, + "grad_norm": 0.23459753992949234, + "learning_rate": 2.872368379087104e-07, + "loss": 0.0539, + "step": 10620 + }, + { + "epoch": 0.89, + "grad_norm": 0.26208057601175905, + "learning_rate": 2.8678115678963115e-07, + "loss": 0.0565, + "step": 10621 + }, + { + "epoch": 0.9, + "grad_norm": 0.2877380084663504, + "learning_rate": 2.8632582673854825e-07, + "loss": 0.0624, + "step": 10622 + }, + { + "epoch": 0.9, + "grad_norm": 0.36095301786344913, + "learning_rate": 2.858708477893768e-07, + "loss": 0.0905, + "step": 10623 + }, + { + "epoch": 0.9, + "grad_norm": 0.37438226972767596, + "learning_rate": 2.8541621997600575e-07, + "loss": 0.0897, + "step": 10624 + }, + { + "epoch": 0.9, + "grad_norm": 0.2753598538508495, + "learning_rate": 2.8496194333229975e-07, + "loss": 0.0974, + "step": 10625 + }, + { + "epoch": 0.9, + "grad_norm": 0.21850999250467382, + "learning_rate": 2.845080178920956e-07, + "loss": 0.0416, + "step": 10626 + }, + { + "epoch": 0.9, + "grad_norm": 0.3214391115033729, + "learning_rate": 2.840544436892029e-07, + "loss": 0.0806, + "step": 10627 + }, + { + "epoch": 0.9, + "grad_norm": 0.25545571585626325, + "learning_rate": 2.836012207574085e-07, + "loss": 0.0599, + "step": 10628 + }, + { + "epoch": 0.9, + "grad_norm": 0.36986655432666005, + "learning_rate": 2.831483491304704e-07, + "loss": 0.0555, + "step": 10629 + }, + { + "epoch": 0.9, + "grad_norm": 0.28101986268397056, + "learning_rate": 2.826958288421211e-07, + "loss": 0.0713, + "step": 10630 + }, + { + "epoch": 0.9, + "grad_norm": 0.2659055513523229, + "learning_rate": 2.8224365992606584e-07, + "loss": 0.0758, + "step": 10631 + }, + { + "epoch": 0.9, + "grad_norm": 0.26775905010852646, + "learning_rate": 2.8179184241598603e-07, + "loss": 0.0812, + "step": 10632 + }, + { + "epoch": 0.9, + "grad_norm": 0.3012004691828637, + "learning_rate": 2.8134037634553633e-07, + "loss": 0.0736, + "step": 10633 + }, + { + "epoch": 0.9, + "grad_norm": 0.5405324750854406, + "learning_rate": 2.808892617483422e-07, + "loss": 0.0912, + "step": 10634 + }, + { + "epoch": 0.9, + "grad_norm": 0.3046402271099459, + "learning_rate": 2.8043849865800774e-07, + "loss": 0.0733, + "step": 10635 + }, + { + "epoch": 0.9, + "grad_norm": 0.39863037753583463, + "learning_rate": 2.799880871081073e-07, + "loss": 0.079, + "step": 10636 + }, + { + "epoch": 0.9, + "grad_norm": 0.40206253885735577, + "learning_rate": 2.7953802713218957e-07, + "loss": 0.1257, + "step": 10637 + }, + { + "epoch": 0.9, + "grad_norm": 0.3489190058449629, + "learning_rate": 2.7908831876377885e-07, + "loss": 0.0939, + "step": 10638 + }, + { + "epoch": 0.9, + "grad_norm": 0.20489123775638518, + "learning_rate": 2.7863896203637165e-07, + "loss": 0.0461, + "step": 10639 + }, + { + "epoch": 0.9, + "grad_norm": 0.2824713187504084, + "learning_rate": 2.781899569834384e-07, + "loss": 0.0803, + "step": 10640 + }, + { + "epoch": 0.9, + "grad_norm": 0.20221908958629, + "learning_rate": 2.77741303638423e-07, + "loss": 0.0467, + "step": 10641 + }, + { + "epoch": 0.9, + "grad_norm": 0.21658574694668653, + "learning_rate": 2.772930020347453e-07, + "loss": 0.0407, + "step": 10642 + }, + { + "epoch": 0.9, + "grad_norm": 0.337935501775752, + "learning_rate": 2.768450522057964e-07, + "loss": 0.076, + "step": 10643 + }, + { + "epoch": 0.9, + "grad_norm": 0.21048115227383948, + "learning_rate": 2.7639745418494233e-07, + "loss": 0.0593, + "step": 10644 + }, + { + "epoch": 0.9, + "grad_norm": 0.40982669711008407, + "learning_rate": 2.759502080055232e-07, + "loss": 0.0843, + "step": 10645 + }, + { + "epoch": 0.9, + "grad_norm": 0.38198004624061155, + "learning_rate": 2.755033137008517e-07, + "loss": 0.0457, + "step": 10646 + }, + { + "epoch": 0.9, + "grad_norm": 0.2859408319962393, + "learning_rate": 2.7505677130421624e-07, + "loss": 0.098, + "step": 10647 + }, + { + "epoch": 0.9, + "grad_norm": 0.2509317574694417, + "learning_rate": 2.746105808488764e-07, + "loss": 0.0614, + "step": 10648 + }, + { + "epoch": 0.9, + "grad_norm": 0.51728116132231, + "learning_rate": 2.741647423680688e-07, + "loss": 0.0898, + "step": 10649 + }, + { + "epoch": 0.9, + "grad_norm": 0.2986396436562258, + "learning_rate": 2.7371925589500137e-07, + "loss": 0.0564, + "step": 10650 + }, + { + "epoch": 0.9, + "grad_norm": 0.268150260584324, + "learning_rate": 2.7327412146285536e-07, + "loss": 0.0677, + "step": 10651 + }, + { + "epoch": 0.9, + "grad_norm": 0.20202805153272946, + "learning_rate": 2.728293391047887e-07, + "loss": 0.0428, + "step": 10652 + }, + { + "epoch": 0.9, + "grad_norm": 0.37578281823150106, + "learning_rate": 2.723849088539304e-07, + "loss": 0.138, + "step": 10653 + }, + { + "epoch": 0.9, + "grad_norm": 0.4570903178962846, + "learning_rate": 2.7194083074338405e-07, + "loss": 0.0854, + "step": 10654 + }, + { + "epoch": 0.9, + "grad_norm": 0.41159020157343845, + "learning_rate": 2.714971048062287e-07, + "loss": 0.0745, + "step": 10655 + }, + { + "epoch": 0.9, + "grad_norm": 0.30365278470017293, + "learning_rate": 2.71053731075514e-07, + "loss": 0.0634, + "step": 10656 + }, + { + "epoch": 0.9, + "grad_norm": 0.30157206353722177, + "learning_rate": 2.7061070958426585e-07, + "loss": 0.0834, + "step": 10657 + }, + { + "epoch": 0.9, + "grad_norm": 0.2015676863928254, + "learning_rate": 2.7016804036548214e-07, + "loss": 0.0466, + "step": 10658 + }, + { + "epoch": 0.9, + "grad_norm": 0.33018156256166303, + "learning_rate": 2.697257234521367e-07, + "loss": 0.0816, + "step": 10659 + }, + { + "epoch": 0.9, + "grad_norm": 0.461212713954877, + "learning_rate": 2.6928375887717516e-07, + "loss": 0.092, + "step": 10660 + }, + { + "epoch": 0.9, + "grad_norm": 0.38600741659420473, + "learning_rate": 2.6884214667351694e-07, + "loss": 0.0932, + "step": 10661 + }, + { + "epoch": 0.9, + "grad_norm": 0.2911092057464604, + "learning_rate": 2.684008868740573e-07, + "loss": 0.0587, + "step": 10662 + }, + { + "epoch": 0.9, + "grad_norm": 0.34345628494224295, + "learning_rate": 2.6795997951166333e-07, + "loss": 0.0667, + "step": 10663 + }, + { + "epoch": 0.9, + "grad_norm": 0.15689201314123802, + "learning_rate": 2.6751942461917646e-07, + "loss": 0.0454, + "step": 10664 + }, + { + "epoch": 0.9, + "grad_norm": 0.3155251662219744, + "learning_rate": 2.670792222294105e-07, + "loss": 0.0597, + "step": 10665 + }, + { + "epoch": 0.9, + "grad_norm": 0.27330919797660425, + "learning_rate": 2.6663937237515646e-07, + "loss": 0.0651, + "step": 10666 + }, + { + "epoch": 0.9, + "grad_norm": 0.5820658352893192, + "learning_rate": 2.661998750891759e-07, + "loss": 0.1526, + "step": 10667 + }, + { + "epoch": 0.9, + "grad_norm": 0.4455780710192732, + "learning_rate": 2.657607304042048e-07, + "loss": 0.0891, + "step": 10668 + }, + { + "epoch": 0.9, + "grad_norm": 0.39254660622402754, + "learning_rate": 2.6532193835295374e-07, + "loss": 0.0928, + "step": 10669 + }, + { + "epoch": 0.9, + "grad_norm": 0.5070593765650299, + "learning_rate": 2.648834989681054e-07, + "loss": 0.0949, + "step": 10670 + }, + { + "epoch": 0.9, + "grad_norm": 0.29509843400299524, + "learning_rate": 2.6444541228231867e-07, + "loss": 0.0723, + "step": 10671 + }, + { + "epoch": 0.9, + "grad_norm": 0.3779101798218891, + "learning_rate": 2.6400767832822414e-07, + "loss": 0.107, + "step": 10672 + }, + { + "epoch": 0.9, + "grad_norm": 0.5315043888151257, + "learning_rate": 2.635702971384274e-07, + "loss": 0.1013, + "step": 10673 + }, + { + "epoch": 0.9, + "grad_norm": 0.4889985446644459, + "learning_rate": 2.631332687455057e-07, + "loss": 0.1362, + "step": 10674 + }, + { + "epoch": 0.9, + "grad_norm": 0.23095392097665712, + "learning_rate": 2.62696593182013e-07, + "loss": 0.0551, + "step": 10675 + }, + { + "epoch": 0.9, + "grad_norm": 0.3154703360187384, + "learning_rate": 2.62260270480475e-07, + "loss": 0.0802, + "step": 10676 + }, + { + "epoch": 0.9, + "grad_norm": 0.2169419453313571, + "learning_rate": 2.618243006733917e-07, + "loss": 0.03, + "step": 10677 + }, + { + "epoch": 0.9, + "grad_norm": 0.32075604273952646, + "learning_rate": 2.613886837932356e-07, + "loss": 0.0708, + "step": 10678 + }, + { + "epoch": 0.9, + "grad_norm": 0.35938608771554126, + "learning_rate": 2.6095341987245503e-07, + "loss": 0.11, + "step": 10679 + }, + { + "epoch": 0.9, + "grad_norm": 0.2851445840072855, + "learning_rate": 2.605185089434714e-07, + "loss": 0.0567, + "step": 10680 + }, + { + "epoch": 0.9, + "grad_norm": 0.252652873899567, + "learning_rate": 2.6008395103867825e-07, + "loss": 0.0555, + "step": 10681 + }, + { + "epoch": 0.9, + "grad_norm": 0.34964815149536616, + "learning_rate": 2.5964974619044403e-07, + "loss": 0.0552, + "step": 10682 + }, + { + "epoch": 0.9, + "grad_norm": 0.2577347229942725, + "learning_rate": 2.592158944311118e-07, + "loss": 0.0992, + "step": 10683 + }, + { + "epoch": 0.9, + "grad_norm": 0.23998755551313036, + "learning_rate": 2.587823957929969e-07, + "loss": 0.0367, + "step": 10684 + }, + { + "epoch": 0.9, + "grad_norm": 0.28547270389760815, + "learning_rate": 2.583492503083884e-07, + "loss": 0.0833, + "step": 10685 + }, + { + "epoch": 0.9, + "grad_norm": 0.3640939669729773, + "learning_rate": 2.5791645800955055e-07, + "loss": 0.1208, + "step": 10686 + }, + { + "epoch": 0.9, + "grad_norm": 0.7762202913151485, + "learning_rate": 2.5748401892871923e-07, + "loss": 0.0861, + "step": 10687 + }, + { + "epoch": 0.9, + "grad_norm": 0.33739724603665006, + "learning_rate": 2.5705193309810475e-07, + "loss": 0.0698, + "step": 10688 + }, + { + "epoch": 0.9, + "grad_norm": 0.276461230913421, + "learning_rate": 2.56620200549893e-07, + "loss": 0.0789, + "step": 10689 + }, + { + "epoch": 0.9, + "grad_norm": 0.6546707167915755, + "learning_rate": 2.561888213162406e-07, + "loss": 0.1289, + "step": 10690 + }, + { + "epoch": 0.9, + "grad_norm": 0.2609747241904819, + "learning_rate": 2.5575779542928e-07, + "loss": 0.0476, + "step": 10691 + }, + { + "epoch": 0.9, + "grad_norm": 0.35659619315624297, + "learning_rate": 2.553271229211146e-07, + "loss": 0.0675, + "step": 10692 + }, + { + "epoch": 0.9, + "grad_norm": 0.363098873056821, + "learning_rate": 2.548968038238264e-07, + "loss": 0.076, + "step": 10693 + }, + { + "epoch": 0.9, + "grad_norm": 0.25278872524319185, + "learning_rate": 2.5446683816946593e-07, + "loss": 0.0801, + "step": 10694 + }, + { + "epoch": 0.9, + "grad_norm": 0.3907987234237108, + "learning_rate": 2.5403722599005976e-07, + "loss": 0.0482, + "step": 10695 + }, + { + "epoch": 0.9, + "grad_norm": 0.26626351353033323, + "learning_rate": 2.536079673176084e-07, + "loss": 0.0411, + "step": 10696 + }, + { + "epoch": 0.9, + "grad_norm": 0.2631643809052591, + "learning_rate": 2.531790621840863e-07, + "loss": 0.0688, + "step": 10697 + }, + { + "epoch": 0.9, + "grad_norm": 0.31159268215396135, + "learning_rate": 2.527505106214395e-07, + "loss": 0.0862, + "step": 10698 + }, + { + "epoch": 0.9, + "grad_norm": 0.5037018353706154, + "learning_rate": 2.523223126615887e-07, + "loss": 0.1016, + "step": 10699 + }, + { + "epoch": 0.9, + "grad_norm": 0.32808332314128585, + "learning_rate": 2.5189446833642997e-07, + "loss": 0.0687, + "step": 10700 + }, + { + "epoch": 0.9, + "grad_norm": 0.5471854355134675, + "learning_rate": 2.5146697767783114e-07, + "loss": 0.0987, + "step": 10701 + }, + { + "epoch": 0.9, + "grad_norm": 0.2644053146360624, + "learning_rate": 2.510398407176329e-07, + "loss": 0.0722, + "step": 10702 + }, + { + "epoch": 0.9, + "grad_norm": 0.2259187429182303, + "learning_rate": 2.506130574876531e-07, + "loss": 0.0705, + "step": 10703 + }, + { + "epoch": 0.9, + "grad_norm": 0.2240345005194627, + "learning_rate": 2.5018662801968016e-07, + "loss": 0.0626, + "step": 10704 + }, + { + "epoch": 0.9, + "grad_norm": 0.2299818547519316, + "learning_rate": 2.49760552345476e-07, + "loss": 0.0748, + "step": 10705 + }, + { + "epoch": 0.9, + "grad_norm": 0.282624728294897, + "learning_rate": 2.493348304967791e-07, + "loss": 0.047, + "step": 10706 + }, + { + "epoch": 0.9, + "grad_norm": 0.18010589559773116, + "learning_rate": 2.4890946250529844e-07, + "loss": 0.0523, + "step": 10707 + }, + { + "epoch": 0.9, + "grad_norm": 0.2095888690906835, + "learning_rate": 2.484844484027182e-07, + "loss": 0.0479, + "step": 10708 + }, + { + "epoch": 0.9, + "grad_norm": 0.5544875648615079, + "learning_rate": 2.480597882206948e-07, + "loss": 0.0681, + "step": 10709 + }, + { + "epoch": 0.9, + "grad_norm": 0.394415306368715, + "learning_rate": 2.476354819908616e-07, + "loss": 0.0702, + "step": 10710 + }, + { + "epoch": 0.9, + "grad_norm": 0.3040860655207578, + "learning_rate": 2.472115297448224e-07, + "loss": 0.0856, + "step": 10711 + }, + { + "epoch": 0.9, + "grad_norm": 0.4901303532813168, + "learning_rate": 2.4678793151415415e-07, + "loss": 0.1181, + "step": 10712 + }, + { + "epoch": 0.9, + "grad_norm": 0.4213804271480601, + "learning_rate": 2.463646873304115e-07, + "loss": 0.094, + "step": 10713 + }, + { + "epoch": 0.9, + "grad_norm": 0.41597075375835674, + "learning_rate": 2.459417972251188e-07, + "loss": 0.1252, + "step": 10714 + }, + { + "epoch": 0.9, + "grad_norm": 0.25987022287689737, + "learning_rate": 2.4551926122977575e-07, + "loss": 0.0809, + "step": 10715 + }, + { + "epoch": 0.9, + "grad_norm": 0.3392275105352114, + "learning_rate": 2.450970793758539e-07, + "loss": 0.0739, + "step": 10716 + }, + { + "epoch": 0.9, + "grad_norm": 0.40045914471456195, + "learning_rate": 2.44675251694802e-07, + "loss": 0.084, + "step": 10717 + }, + { + "epoch": 0.9, + "grad_norm": 0.55505291727462, + "learning_rate": 2.4425377821803875e-07, + "loss": 0.1019, + "step": 10718 + }, + { + "epoch": 0.9, + "grad_norm": 0.29148165496352124, + "learning_rate": 2.4383265897695794e-07, + "loss": 0.0486, + "step": 10719 + }, + { + "epoch": 0.9, + "grad_norm": 0.18579742626577006, + "learning_rate": 2.4341189400292784e-07, + "loss": 0.0449, + "step": 10720 + }, + { + "epoch": 0.9, + "grad_norm": 0.4755753008009053, + "learning_rate": 2.429914833272895e-07, + "loss": 0.1071, + "step": 10721 + }, + { + "epoch": 0.9, + "grad_norm": 0.5503858920489393, + "learning_rate": 2.425714269813567e-07, + "loss": 0.131, + "step": 10722 + }, + { + "epoch": 0.9, + "grad_norm": 0.2840245429736912, + "learning_rate": 2.4215172499641784e-07, + "loss": 0.0764, + "step": 10723 + }, + { + "epoch": 0.9, + "grad_norm": 0.7959610624823609, + "learning_rate": 2.41732377403735e-07, + "loss": 0.1311, + "step": 10724 + }, + { + "epoch": 0.9, + "grad_norm": 0.2923292515261836, + "learning_rate": 2.413133842345444e-07, + "loss": 0.0846, + "step": 10725 + }, + { + "epoch": 0.9, + "grad_norm": 0.5061462059865495, + "learning_rate": 2.4089474552005334e-07, + "loss": 0.1034, + "step": 10726 + }, + { + "epoch": 0.9, + "grad_norm": 0.23667007963378325, + "learning_rate": 2.404764612914462e-07, + "loss": 0.0399, + "step": 10727 + }, + { + "epoch": 0.9, + "grad_norm": 0.26619665370728324, + "learning_rate": 2.4005853157987813e-07, + "loss": 0.0789, + "step": 10728 + }, + { + "epoch": 0.9, + "grad_norm": 0.3297421159426025, + "learning_rate": 2.3964095641647924e-07, + "loss": 0.0824, + "step": 10729 + }, + { + "epoch": 0.9, + "grad_norm": 0.24051887425020937, + "learning_rate": 2.392237358323535e-07, + "loss": 0.0532, + "step": 10730 + }, + { + "epoch": 0.9, + "grad_norm": 0.27031493360775616, + "learning_rate": 2.3880686985857714e-07, + "loss": 0.0705, + "step": 10731 + }, + { + "epoch": 0.9, + "grad_norm": 0.22823239887082386, + "learning_rate": 2.3839035852620152e-07, + "loss": 0.0464, + "step": 10732 + }, + { + "epoch": 0.9, + "grad_norm": 0.3302753035625371, + "learning_rate": 2.3797420186624953e-07, + "loss": 0.087, + "step": 10733 + }, + { + "epoch": 0.9, + "grad_norm": 0.4609507075350674, + "learning_rate": 2.3755839990972086e-07, + "loss": 0.126, + "step": 10734 + }, + { + "epoch": 0.9, + "grad_norm": 0.3018838806075609, + "learning_rate": 2.371429526875857e-07, + "loss": 0.105, + "step": 10735 + }, + { + "epoch": 0.9, + "grad_norm": 0.4825051800044049, + "learning_rate": 2.3672786023078876e-07, + "loss": 0.0983, + "step": 10736 + }, + { + "epoch": 0.9, + "grad_norm": 0.6981423643813979, + "learning_rate": 2.3631312257024918e-07, + "loss": 0.1595, + "step": 10737 + }, + { + "epoch": 0.9, + "grad_norm": 0.4292411778929159, + "learning_rate": 2.3589873973685839e-07, + "loss": 0.1199, + "step": 10738 + }, + { + "epoch": 0.9, + "grad_norm": 0.43114024003540075, + "learning_rate": 2.354847117614828e-07, + "loss": 0.155, + "step": 10739 + }, + { + "epoch": 0.9, + "grad_norm": 0.3028976562245556, + "learning_rate": 2.3507103867496107e-07, + "loss": 0.0807, + "step": 10740 + }, + { + "epoch": 0.91, + "grad_norm": 0.23623535731175438, + "learning_rate": 2.3465772050810632e-07, + "loss": 0.0769, + "step": 10741 + }, + { + "epoch": 0.91, + "grad_norm": 0.20982305609758156, + "learning_rate": 2.3424475729170392e-07, + "loss": 0.0572, + "step": 10742 + }, + { + "epoch": 0.91, + "grad_norm": 0.3545383002188968, + "learning_rate": 2.3383214905651542e-07, + "loss": 0.1008, + "step": 10743 + }, + { + "epoch": 0.91, + "grad_norm": 0.5684953917210803, + "learning_rate": 2.334198958332734e-07, + "loss": 0.1328, + "step": 10744 + }, + { + "epoch": 0.91, + "grad_norm": 0.3151669729825578, + "learning_rate": 2.3300799765268445e-07, + "loss": 0.0809, + "step": 10745 + }, + { + "epoch": 0.91, + "grad_norm": 0.45486957500255654, + "learning_rate": 2.3259645454542956e-07, + "loss": 0.1128, + "step": 10746 + }, + { + "epoch": 0.91, + "grad_norm": 0.3232602887947338, + "learning_rate": 2.321852665421631e-07, + "loss": 0.0889, + "step": 10747 + }, + { + "epoch": 0.91, + "grad_norm": 0.5640674518744295, + "learning_rate": 2.3177443367351338e-07, + "loss": 0.1338, + "step": 10748 + }, + { + "epoch": 0.91, + "grad_norm": 0.3821172908457758, + "learning_rate": 2.3136395597007976e-07, + "loss": 0.0923, + "step": 10749 + }, + { + "epoch": 0.91, + "grad_norm": 0.43088544534987544, + "learning_rate": 2.3095383346243837e-07, + "loss": 0.0878, + "step": 10750 + }, + { + "epoch": 0.91, + "grad_norm": 0.41029279011108094, + "learning_rate": 2.30544066181137e-07, + "loss": 0.1207, + "step": 10751 + }, + { + "epoch": 0.91, + "grad_norm": 0.25537402727854897, + "learning_rate": 2.3013465415669845e-07, + "loss": 0.0665, + "step": 10752 + }, + { + "epoch": 0.91, + "grad_norm": 0.41650577359191204, + "learning_rate": 2.297255974196161e-07, + "loss": 0.1124, + "step": 10753 + }, + { + "epoch": 0.91, + "grad_norm": 0.29977458824375897, + "learning_rate": 2.293168960003611e-07, + "loss": 0.0611, + "step": 10754 + }, + { + "epoch": 0.91, + "grad_norm": 0.36985380465025514, + "learning_rate": 2.2890854992937528e-07, + "loss": 0.1063, + "step": 10755 + }, + { + "epoch": 0.91, + "grad_norm": 0.2866156418507742, + "learning_rate": 2.285005592370737e-07, + "loss": 0.0785, + "step": 10756 + }, + { + "epoch": 0.91, + "grad_norm": 0.31616866269159105, + "learning_rate": 2.2809292395384652e-07, + "loss": 0.1045, + "step": 10757 + }, + { + "epoch": 0.91, + "grad_norm": 0.2785008336608391, + "learning_rate": 2.276856441100572e-07, + "loss": 0.0791, + "step": 10758 + }, + { + "epoch": 0.91, + "grad_norm": 0.4713637643662592, + "learning_rate": 2.2727871973604155e-07, + "loss": 0.0839, + "step": 10759 + }, + { + "epoch": 0.91, + "grad_norm": 0.33857644626116934, + "learning_rate": 2.2687215086210967e-07, + "loss": 0.0716, + "step": 10760 + }, + { + "epoch": 0.91, + "grad_norm": 0.24868341592862014, + "learning_rate": 2.2646593751854573e-07, + "loss": 0.0595, + "step": 10761 + }, + { + "epoch": 0.91, + "grad_norm": 0.23295129361557404, + "learning_rate": 2.2606007973560716e-07, + "loss": 0.0471, + "step": 10762 + }, + { + "epoch": 0.91, + "grad_norm": 0.3220780616468485, + "learning_rate": 2.2565457754352372e-07, + "loss": 0.0503, + "step": 10763 + }, + { + "epoch": 0.91, + "grad_norm": 0.29355657181633643, + "learning_rate": 2.2524943097250008e-07, + "loss": 0.0753, + "step": 10764 + }, + { + "epoch": 0.91, + "grad_norm": 0.3479929942629928, + "learning_rate": 2.248446400527138e-07, + "loss": 0.0713, + "step": 10765 + }, + { + "epoch": 0.91, + "grad_norm": 0.45609974581004376, + "learning_rate": 2.244402048143163e-07, + "loss": 0.1157, + "step": 10766 + }, + { + "epoch": 0.91, + "grad_norm": 0.2980593545857858, + "learning_rate": 2.2403612528743125e-07, + "loss": 0.0567, + "step": 10767 + }, + { + "epoch": 0.91, + "grad_norm": 0.3035160381075223, + "learning_rate": 2.2363240150215848e-07, + "loss": 0.0722, + "step": 10768 + }, + { + "epoch": 0.91, + "grad_norm": 0.21947652143348073, + "learning_rate": 2.232290334885684e-07, + "loss": 0.0637, + "step": 10769 + }, + { + "epoch": 0.91, + "grad_norm": 0.3706976801572213, + "learning_rate": 2.2282602127670638e-07, + "loss": 0.0626, + "step": 10770 + }, + { + "epoch": 0.91, + "grad_norm": 0.348425051174002, + "learning_rate": 2.224233648965918e-07, + "loss": 0.0765, + "step": 10771 + }, + { + "epoch": 0.91, + "grad_norm": 0.5410431415644702, + "learning_rate": 2.220210643782167e-07, + "loss": 0.1062, + "step": 10772 + }, + { + "epoch": 0.91, + "grad_norm": 0.3225189081429699, + "learning_rate": 2.216191197515466e-07, + "loss": 0.0895, + "step": 10773 + }, + { + "epoch": 0.91, + "grad_norm": 0.36975850408544747, + "learning_rate": 2.2121753104651922e-07, + "loss": 0.0872, + "step": 10774 + }, + { + "epoch": 0.91, + "grad_norm": 0.19529394023574156, + "learning_rate": 2.2081629829305006e-07, + "loss": 0.0553, + "step": 10775 + }, + { + "epoch": 0.91, + "grad_norm": 0.2622552167330584, + "learning_rate": 2.2041542152102357e-07, + "loss": 0.06, + "step": 10776 + }, + { + "epoch": 0.91, + "grad_norm": 0.3675975920097866, + "learning_rate": 2.2001490076029864e-07, + "loss": 0.0652, + "step": 10777 + }, + { + "epoch": 0.91, + "grad_norm": 0.26944115649759387, + "learning_rate": 2.1961473604071028e-07, + "loss": 0.0776, + "step": 10778 + }, + { + "epoch": 0.91, + "grad_norm": 0.22129977581100824, + "learning_rate": 2.1921492739206463e-07, + "loss": 0.0601, + "step": 10779 + }, + { + "epoch": 0.91, + "grad_norm": 0.40932445802003786, + "learning_rate": 2.188154748441401e-07, + "loss": 0.1011, + "step": 10780 + }, + { + "epoch": 0.91, + "grad_norm": 0.23854481823817053, + "learning_rate": 2.1841637842669238e-07, + "loss": 0.0491, + "step": 10781 + }, + { + "epoch": 0.91, + "grad_norm": 0.2998955767529141, + "learning_rate": 2.1801763816944709e-07, + "loss": 0.0573, + "step": 10782 + }, + { + "epoch": 0.91, + "grad_norm": 0.4084412468752217, + "learning_rate": 2.176192541021055e-07, + "loss": 0.0774, + "step": 10783 + }, + { + "epoch": 0.91, + "grad_norm": 0.17009352006554865, + "learning_rate": 2.1722122625434105e-07, + "loss": 0.0378, + "step": 10784 + }, + { + "epoch": 0.91, + "grad_norm": 0.3319436128204301, + "learning_rate": 2.1682355465580174e-07, + "loss": 0.0702, + "step": 10785 + }, + { + "epoch": 0.91, + "grad_norm": 0.4237771006846492, + "learning_rate": 2.1642623933610773e-07, + "loss": 0.1172, + "step": 10786 + }, + { + "epoch": 0.91, + "grad_norm": 0.3927803088198924, + "learning_rate": 2.160292803248537e-07, + "loss": 0.1032, + "step": 10787 + }, + { + "epoch": 0.91, + "grad_norm": 0.48859552387621963, + "learning_rate": 2.1563267765160823e-07, + "loss": 0.1305, + "step": 10788 + }, + { + "epoch": 0.91, + "grad_norm": 0.3673575174905861, + "learning_rate": 2.1523643134591154e-07, + "loss": 0.0747, + "step": 10789 + }, + { + "epoch": 0.91, + "grad_norm": 0.4364790867669465, + "learning_rate": 2.1484054143727895e-07, + "loss": 0.0536, + "step": 10790 + }, + { + "epoch": 0.91, + "grad_norm": 0.4610076375718733, + "learning_rate": 2.1444500795519795e-07, + "loss": 0.0972, + "step": 10791 + }, + { + "epoch": 0.91, + "grad_norm": 0.27598321985357693, + "learning_rate": 2.1404983092913112e-07, + "loss": 0.0637, + "step": 10792 + }, + { + "epoch": 0.91, + "grad_norm": 0.9370747809669794, + "learning_rate": 2.136550103885132e-07, + "loss": 0.0915, + "step": 10793 + }, + { + "epoch": 0.91, + "grad_norm": 0.37028228726295187, + "learning_rate": 2.1326054636275185e-07, + "loss": 0.0835, + "step": 10794 + }, + { + "epoch": 0.91, + "grad_norm": 0.30345761105397, + "learning_rate": 2.128664388812307e-07, + "loss": 0.0674, + "step": 10795 + }, + { + "epoch": 0.91, + "grad_norm": 0.2820681808035894, + "learning_rate": 2.1247268797330412e-07, + "loss": 0.0685, + "step": 10796 + }, + { + "epoch": 0.91, + "grad_norm": 0.31159278748779146, + "learning_rate": 2.1207929366830082e-07, + "loss": 0.075, + "step": 10797 + }, + { + "epoch": 0.91, + "grad_norm": 0.4162972842982443, + "learning_rate": 2.116862559955235e-07, + "loss": 0.0844, + "step": 10798 + }, + { + "epoch": 0.91, + "grad_norm": 0.3341701735618537, + "learning_rate": 2.1129357498424818e-07, + "loss": 0.0517, + "step": 10799 + }, + { + "epoch": 0.91, + "grad_norm": 0.3134727448543742, + "learning_rate": 2.1090125066372369e-07, + "loss": 0.1079, + "step": 10800 + }, + { + "epoch": 0.91, + "grad_norm": 0.13005172474605234, + "learning_rate": 2.105092830631722e-07, + "loss": 0.026, + "step": 10801 + }, + { + "epoch": 0.91, + "grad_norm": 0.5168911581019014, + "learning_rate": 2.1011767221179035e-07, + "loss": 0.1157, + "step": 10802 + }, + { + "epoch": 0.91, + "grad_norm": 0.4963179543856095, + "learning_rate": 2.097264181387476e-07, + "loss": 0.0914, + "step": 10803 + }, + { + "epoch": 0.91, + "grad_norm": 0.3000011557774194, + "learning_rate": 2.093355208731862e-07, + "loss": 0.0562, + "step": 10804 + }, + { + "epoch": 0.91, + "grad_norm": 0.3423850936825139, + "learning_rate": 2.089449804442234e-07, + "loss": 0.0583, + "step": 10805 + }, + { + "epoch": 0.91, + "grad_norm": 0.40032511427242506, + "learning_rate": 2.085547968809476e-07, + "loss": 0.0917, + "step": 10806 + }, + { + "epoch": 0.91, + "grad_norm": 0.48629328656186543, + "learning_rate": 2.0816497021242388e-07, + "loss": 0.1022, + "step": 10807 + }, + { + "epoch": 0.91, + "grad_norm": 0.23613425211046524, + "learning_rate": 2.0777550046768679e-07, + "loss": 0.0525, + "step": 10808 + }, + { + "epoch": 0.91, + "grad_norm": 0.7181709009559915, + "learning_rate": 2.073863876757476e-07, + "loss": 0.0711, + "step": 10809 + }, + { + "epoch": 0.91, + "grad_norm": 0.45097673588275816, + "learning_rate": 2.0699763186558919e-07, + "loss": 0.0977, + "step": 10810 + }, + { + "epoch": 0.91, + "grad_norm": 0.41253262812203184, + "learning_rate": 2.0660923306616843e-07, + "loss": 0.108, + "step": 10811 + }, + { + "epoch": 0.91, + "grad_norm": 0.2458095715756952, + "learning_rate": 2.062211913064155e-07, + "loss": 0.0632, + "step": 10812 + }, + { + "epoch": 0.91, + "grad_norm": 0.2858059811956463, + "learning_rate": 2.058335066152345e-07, + "loss": 0.0649, + "step": 10813 + }, + { + "epoch": 0.91, + "grad_norm": 0.25240986836099916, + "learning_rate": 2.054461790215012e-07, + "loss": 0.0657, + "step": 10814 + }, + { + "epoch": 0.91, + "grad_norm": 0.44864381854731467, + "learning_rate": 2.0505920855406757e-07, + "loss": 0.0885, + "step": 10815 + }, + { + "epoch": 0.91, + "grad_norm": 0.2893213554504105, + "learning_rate": 2.0467259524175663e-07, + "loss": 0.0802, + "step": 10816 + }, + { + "epoch": 0.91, + "grad_norm": 0.472797234700328, + "learning_rate": 2.0428633911336537e-07, + "loss": 0.0916, + "step": 10817 + }, + { + "epoch": 0.91, + "grad_norm": 0.2815168299084547, + "learning_rate": 2.0390044019766409e-07, + "loss": 0.1039, + "step": 10818 + }, + { + "epoch": 0.91, + "grad_norm": 0.27420039753303493, + "learning_rate": 2.0351489852339813e-07, + "loss": 0.0689, + "step": 10819 + }, + { + "epoch": 0.91, + "grad_norm": 0.2934587813782035, + "learning_rate": 2.0312971411928451e-07, + "loss": 0.0941, + "step": 10820 + }, + { + "epoch": 0.91, + "grad_norm": 0.5198588389521508, + "learning_rate": 2.027448870140125e-07, + "loss": 0.0921, + "step": 10821 + }, + { + "epoch": 0.91, + "grad_norm": 0.3194354551861137, + "learning_rate": 2.023604172362481e-07, + "loss": 0.062, + "step": 10822 + }, + { + "epoch": 0.91, + "grad_norm": 0.21742643432787762, + "learning_rate": 2.0197630481462838e-07, + "loss": 0.0492, + "step": 10823 + }, + { + "epoch": 0.91, + "grad_norm": 0.40320347123352607, + "learning_rate": 2.0159254977776376e-07, + "loss": 0.0793, + "step": 10824 + }, + { + "epoch": 0.91, + "grad_norm": 0.32151464874824387, + "learning_rate": 2.012091521542381e-07, + "loss": 0.0571, + "step": 10825 + }, + { + "epoch": 0.91, + "grad_norm": 0.3188341861624577, + "learning_rate": 2.008261119726107e-07, + "loss": 0.0684, + "step": 10826 + }, + { + "epoch": 0.91, + "grad_norm": 0.37037750728114366, + "learning_rate": 2.0044342926141158e-07, + "loss": 0.0611, + "step": 10827 + }, + { + "epoch": 0.91, + "grad_norm": 0.3821049797498922, + "learning_rate": 2.0006110404914457e-07, + "loss": 0.0871, + "step": 10828 + }, + { + "epoch": 0.91, + "grad_norm": 0.5153336178514163, + "learning_rate": 1.9967913636428914e-07, + "loss": 0.0888, + "step": 10829 + }, + { + "epoch": 0.91, + "grad_norm": 0.2831283792782594, + "learning_rate": 1.9929752623529587e-07, + "loss": 0.0741, + "step": 10830 + }, + { + "epoch": 0.91, + "grad_norm": 0.21697811813416326, + "learning_rate": 1.9891627369058807e-07, + "loss": 0.0551, + "step": 10831 + }, + { + "epoch": 0.91, + "grad_norm": 0.5860499456732063, + "learning_rate": 1.9853537875856477e-07, + "loss": 0.1087, + "step": 10832 + }, + { + "epoch": 0.91, + "grad_norm": 0.47063105174915953, + "learning_rate": 1.9815484146759768e-07, + "loss": 0.1516, + "step": 10833 + }, + { + "epoch": 0.91, + "grad_norm": 0.4221928876832875, + "learning_rate": 1.9777466184603022e-07, + "loss": 0.0834, + "step": 10834 + }, + { + "epoch": 0.91, + "grad_norm": 0.35425686659273553, + "learning_rate": 1.973948399221809e-07, + "loss": 0.0977, + "step": 10835 + }, + { + "epoch": 0.91, + "grad_norm": 0.4648591204857386, + "learning_rate": 1.9701537572434147e-07, + "loss": 0.1097, + "step": 10836 + }, + { + "epoch": 0.91, + "grad_norm": 0.3855660883995771, + "learning_rate": 1.966362692807766e-07, + "loss": 0.076, + "step": 10837 + }, + { + "epoch": 0.91, + "grad_norm": 0.6796185621988937, + "learning_rate": 1.9625752061972315e-07, + "loss": 0.0729, + "step": 10838 + }, + { + "epoch": 0.91, + "grad_norm": 0.45776135025254255, + "learning_rate": 1.9587912976939406e-07, + "loss": 0.0896, + "step": 10839 + }, + { + "epoch": 0.91, + "grad_norm": 0.27343725983218053, + "learning_rate": 1.9550109675797347e-07, + "loss": 0.0921, + "step": 10840 + }, + { + "epoch": 0.91, + "grad_norm": 0.4005520350515181, + "learning_rate": 1.9512342161361998e-07, + "loss": 0.1207, + "step": 10841 + }, + { + "epoch": 0.91, + "grad_norm": 0.5427669953809398, + "learning_rate": 1.947461043644633e-07, + "loss": 0.1341, + "step": 10842 + }, + { + "epoch": 0.91, + "grad_norm": 0.37338738931436405, + "learning_rate": 1.9436914503861036e-07, + "loss": 0.118, + "step": 10843 + }, + { + "epoch": 0.91, + "grad_norm": 0.37260938137062105, + "learning_rate": 1.9399254366413822e-07, + "loss": 0.0916, + "step": 10844 + }, + { + "epoch": 0.91, + "grad_norm": 0.3745274183965353, + "learning_rate": 1.9361630026909772e-07, + "loss": 0.0684, + "step": 10845 + }, + { + "epoch": 0.91, + "grad_norm": 0.28433820458532916, + "learning_rate": 1.9324041488151534e-07, + "loss": 0.0743, + "step": 10846 + }, + { + "epoch": 0.91, + "grad_norm": 0.5438397861938997, + "learning_rate": 1.9286488752938814e-07, + "loss": 0.1242, + "step": 10847 + }, + { + "epoch": 0.91, + "grad_norm": 0.22433509642694524, + "learning_rate": 1.9248971824068762e-07, + "loss": 0.0602, + "step": 10848 + }, + { + "epoch": 0.91, + "grad_norm": 0.3110333744521675, + "learning_rate": 1.9211490704335867e-07, + "loss": 0.0878, + "step": 10849 + }, + { + "epoch": 0.91, + "grad_norm": 0.2621688801504687, + "learning_rate": 1.9174045396531948e-07, + "loss": 0.0789, + "step": 10850 + }, + { + "epoch": 0.91, + "grad_norm": 0.23631606543764658, + "learning_rate": 1.9136635903446167e-07, + "loss": 0.0553, + "step": 10851 + }, + { + "epoch": 0.91, + "grad_norm": 0.3146764824473073, + "learning_rate": 1.90992622278649e-07, + "loss": 0.0696, + "step": 10852 + }, + { + "epoch": 0.91, + "grad_norm": 0.7276494102161084, + "learning_rate": 1.9061924372572094e-07, + "loss": 0.1321, + "step": 10853 + }, + { + "epoch": 0.91, + "grad_norm": 0.22965072117380494, + "learning_rate": 1.9024622340348852e-07, + "loss": 0.0567, + "step": 10854 + }, + { + "epoch": 0.91, + "grad_norm": 0.2956052290240375, + "learning_rate": 1.8987356133973567e-07, + "loss": 0.085, + "step": 10855 + }, + { + "epoch": 0.91, + "grad_norm": 0.34987882429469974, + "learning_rate": 1.8950125756222183e-07, + "loss": 0.0886, + "step": 10856 + }, + { + "epoch": 0.91, + "grad_norm": 0.4881286148236791, + "learning_rate": 1.8912931209867758e-07, + "loss": 0.1018, + "step": 10857 + }, + { + "epoch": 0.91, + "grad_norm": 0.3576147675942974, + "learning_rate": 1.887577249768069e-07, + "loss": 0.1271, + "step": 10858 + }, + { + "epoch": 0.91, + "grad_norm": 0.19334740333300582, + "learning_rate": 1.8838649622428819e-07, + "loss": 0.0714, + "step": 10859 + }, + { + "epoch": 0.92, + "grad_norm": 0.28771729860249684, + "learning_rate": 1.8801562586877377e-07, + "loss": 0.0801, + "step": 10860 + }, + { + "epoch": 0.92, + "grad_norm": 0.2783994197897828, + "learning_rate": 1.876451139378871e-07, + "loss": 0.0815, + "step": 10861 + }, + { + "epoch": 0.92, + "grad_norm": 0.41085767117799715, + "learning_rate": 1.8727496045922611e-07, + "loss": 0.0931, + "step": 10862 + }, + { + "epoch": 0.92, + "grad_norm": 0.33912183552615693, + "learning_rate": 1.8690516546036263e-07, + "loss": 0.0986, + "step": 10863 + }, + { + "epoch": 0.92, + "grad_norm": 0.21884426210098282, + "learning_rate": 1.8653572896884075e-07, + "loss": 0.0373, + "step": 10864 + }, + { + "epoch": 0.92, + "grad_norm": 0.3255546789654701, + "learning_rate": 1.8616665101217846e-07, + "loss": 0.0326, + "step": 10865 + }, + { + "epoch": 0.92, + "grad_norm": 0.24705694909537113, + "learning_rate": 1.8579793161786595e-07, + "loss": 0.0592, + "step": 10866 + }, + { + "epoch": 0.92, + "grad_norm": 0.3190315028443304, + "learning_rate": 1.854295708133691e-07, + "loss": 0.0861, + "step": 10867 + }, + { + "epoch": 0.92, + "grad_norm": 0.24384461393340837, + "learning_rate": 1.8506156862612423e-07, + "loss": 0.0749, + "step": 10868 + }, + { + "epoch": 0.92, + "grad_norm": 0.25555627543443904, + "learning_rate": 1.8469392508354277e-07, + "loss": 0.0733, + "step": 10869 + }, + { + "epoch": 0.92, + "grad_norm": 0.33043674400772677, + "learning_rate": 1.843266402130095e-07, + "loss": 0.0912, + "step": 10870 + }, + { + "epoch": 0.92, + "grad_norm": 0.32432216897747956, + "learning_rate": 1.839597140418814e-07, + "loss": 0.098, + "step": 10871 + }, + { + "epoch": 0.92, + "grad_norm": 0.22766680346013396, + "learning_rate": 1.8359314659748883e-07, + "loss": 0.0566, + "step": 10872 + }, + { + "epoch": 0.92, + "grad_norm": 0.3079250214782278, + "learning_rate": 1.832269379071372e-07, + "loss": 0.0591, + "step": 10873 + }, + { + "epoch": 0.92, + "grad_norm": 0.28622843857989033, + "learning_rate": 1.8286108799810187e-07, + "loss": 0.0914, + "step": 10874 + }, + { + "epoch": 0.92, + "grad_norm": 0.29928928214107514, + "learning_rate": 1.8249559689763497e-07, + "loss": 0.0795, + "step": 10875 + }, + { + "epoch": 0.92, + "grad_norm": 0.49324924849899376, + "learning_rate": 1.821304646329608e-07, + "loss": 0.1077, + "step": 10876 + }, + { + "epoch": 0.92, + "grad_norm": 0.35200481842786346, + "learning_rate": 1.8176569123127542e-07, + "loss": 0.1003, + "step": 10877 + }, + { + "epoch": 0.92, + "grad_norm": 0.5150574270795837, + "learning_rate": 1.8140127671974872e-07, + "loss": 0.1158, + "step": 10878 + }, + { + "epoch": 0.92, + "grad_norm": 0.30613341361403507, + "learning_rate": 1.8103722112552625e-07, + "loss": 0.0648, + "step": 10879 + }, + { + "epoch": 0.92, + "grad_norm": 0.15308659037169078, + "learning_rate": 1.8067352447572405e-07, + "loss": 0.0259, + "step": 10880 + }, + { + "epoch": 0.92, + "grad_norm": 0.3529978620391495, + "learning_rate": 1.8031018679743217e-07, + "loss": 0.0828, + "step": 10881 + }, + { + "epoch": 0.92, + "grad_norm": 0.2841708934665366, + "learning_rate": 1.7994720811771506e-07, + "loss": 0.0648, + "step": 10882 + }, + { + "epoch": 0.92, + "grad_norm": 0.49609183772861, + "learning_rate": 1.7958458846360772e-07, + "loss": 0.1132, + "step": 10883 + }, + { + "epoch": 0.92, + "grad_norm": 0.34161258445454934, + "learning_rate": 1.7922232786212134e-07, + "loss": 0.1031, + "step": 10884 + }, + { + "epoch": 0.92, + "grad_norm": 0.2929430792918475, + "learning_rate": 1.788604263402399e-07, + "loss": 0.0831, + "step": 10885 + }, + { + "epoch": 0.92, + "grad_norm": 0.3834876021791396, + "learning_rate": 1.7849888392491799e-07, + "loss": 0.069, + "step": 10886 + }, + { + "epoch": 0.92, + "grad_norm": 0.2735056011925727, + "learning_rate": 1.7813770064308732e-07, + "loss": 0.0767, + "step": 10887 + }, + { + "epoch": 0.92, + "grad_norm": 0.44478265137258893, + "learning_rate": 1.7777687652165031e-07, + "loss": 0.0942, + "step": 10888 + }, + { + "epoch": 0.92, + "grad_norm": 0.21625502462686888, + "learning_rate": 1.7741641158748213e-07, + "loss": 0.0587, + "step": 10889 + }, + { + "epoch": 0.92, + "grad_norm": 0.6513067873716356, + "learning_rate": 1.7705630586743404e-07, + "loss": 0.1086, + "step": 10890 + }, + { + "epoch": 0.92, + "grad_norm": 0.38419730527094703, + "learning_rate": 1.7669655938832797e-07, + "loss": 0.07, + "step": 10891 + }, + { + "epoch": 0.92, + "grad_norm": 0.7610889784368245, + "learning_rate": 1.7633717217695968e-07, + "loss": 0.1368, + "step": 10892 + }, + { + "epoch": 0.92, + "grad_norm": 0.23693384629111947, + "learning_rate": 1.7597814426009884e-07, + "loss": 0.0701, + "step": 10893 + }, + { + "epoch": 0.92, + "grad_norm": 0.21713260005515417, + "learning_rate": 1.7561947566448746e-07, + "loss": 0.0478, + "step": 10894 + }, + { + "epoch": 0.92, + "grad_norm": 0.3439506667074959, + "learning_rate": 1.7526116641684243e-07, + "loss": 0.0944, + "step": 10895 + }, + { + "epoch": 0.92, + "grad_norm": 0.24885671747089583, + "learning_rate": 1.749032165438508e-07, + "loss": 0.0731, + "step": 10896 + }, + { + "epoch": 0.92, + "grad_norm": 0.3039888176124646, + "learning_rate": 1.7454562607217674e-07, + "loss": 0.0893, + "step": 10897 + }, + { + "epoch": 0.92, + "grad_norm": 0.5286856861986966, + "learning_rate": 1.741883950284551e-07, + "loss": 0.1275, + "step": 10898 + }, + { + "epoch": 0.92, + "grad_norm": 0.3756186840212498, + "learning_rate": 1.7383152343929345e-07, + "loss": 0.0739, + "step": 10899 + }, + { + "epoch": 0.92, + "grad_norm": 0.3466034373804129, + "learning_rate": 1.7347501133127442e-07, + "loss": 0.1077, + "step": 10900 + }, + { + "epoch": 0.92, + "grad_norm": 0.2897371083908611, + "learning_rate": 1.7311885873095345e-07, + "loss": 0.0989, + "step": 10901 + }, + { + "epoch": 0.92, + "grad_norm": 0.48578090650974554, + "learning_rate": 1.7276306566485879e-07, + "loss": 0.1299, + "step": 10902 + }, + { + "epoch": 0.92, + "grad_norm": 0.30265813612148157, + "learning_rate": 1.724076321594914e-07, + "loss": 0.0895, + "step": 10903 + }, + { + "epoch": 0.92, + "grad_norm": 0.3773675116387481, + "learning_rate": 1.7205255824132627e-07, + "loss": 0.0991, + "step": 10904 + }, + { + "epoch": 0.92, + "grad_norm": 0.31704541276376935, + "learning_rate": 1.7169784393681166e-07, + "loss": 0.0779, + "step": 10905 + }, + { + "epoch": 0.92, + "grad_norm": 0.5931366756271921, + "learning_rate": 1.7134348927236866e-07, + "loss": 0.0664, + "step": 10906 + }, + { + "epoch": 0.92, + "grad_norm": 0.39926128924057525, + "learning_rate": 1.7098949427439115e-07, + "loss": 0.0888, + "step": 10907 + }, + { + "epoch": 0.92, + "grad_norm": 0.5026792589542943, + "learning_rate": 1.7063585896924806e-07, + "loss": 0.1328, + "step": 10908 + }, + { + "epoch": 0.92, + "grad_norm": 0.3368691878771733, + "learning_rate": 1.7028258338327885e-07, + "loss": 0.0676, + "step": 10909 + }, + { + "epoch": 0.92, + "grad_norm": 0.2613155750065307, + "learning_rate": 1.699296675427975e-07, + "loss": 0.0582, + "step": 10910 + }, + { + "epoch": 0.92, + "grad_norm": 0.3402108387927932, + "learning_rate": 1.695771114740924e-07, + "loss": 0.0939, + "step": 10911 + }, + { + "epoch": 0.92, + "grad_norm": 0.3250831335260662, + "learning_rate": 1.692249152034231e-07, + "loss": 0.0903, + "step": 10912 + }, + { + "epoch": 0.92, + "grad_norm": 0.23631057901512023, + "learning_rate": 1.6887307875702307e-07, + "loss": 0.0547, + "step": 10913 + }, + { + "epoch": 0.92, + "grad_norm": 0.2548952061065161, + "learning_rate": 1.6852160216110026e-07, + "loss": 0.0596, + "step": 10914 + }, + { + "epoch": 0.92, + "grad_norm": 0.4531824297082557, + "learning_rate": 1.6817048544183368e-07, + "loss": 0.1163, + "step": 10915 + }, + { + "epoch": 0.92, + "grad_norm": 0.23602436320047168, + "learning_rate": 1.678197286253769e-07, + "loss": 0.0575, + "step": 10916 + }, + { + "epoch": 0.92, + "grad_norm": 0.2604961325603366, + "learning_rate": 1.6746933173785563e-07, + "loss": 0.07, + "step": 10917 + }, + { + "epoch": 0.92, + "grad_norm": 0.45600108595951705, + "learning_rate": 1.671192948053707e-07, + "loss": 0.0855, + "step": 10918 + }, + { + "epoch": 0.92, + "grad_norm": 0.3172426653068614, + "learning_rate": 1.667696178539946e-07, + "loss": 0.0852, + "step": 10919 + }, + { + "epoch": 0.92, + "grad_norm": 0.18480010702860972, + "learning_rate": 1.6642030090977202e-07, + "loss": 0.0553, + "step": 10920 + }, + { + "epoch": 0.92, + "grad_norm": 0.38001014959891893, + "learning_rate": 1.6607134399872382e-07, + "loss": 0.0744, + "step": 10921 + }, + { + "epoch": 0.92, + "grad_norm": 0.2746768944561804, + "learning_rate": 1.65722747146842e-07, + "loss": 0.0209, + "step": 10922 + }, + { + "epoch": 0.92, + "grad_norm": 0.4323663419430902, + "learning_rate": 1.6537451038009077e-07, + "loss": 0.0805, + "step": 10923 + }, + { + "epoch": 0.92, + "grad_norm": 0.26041174242435644, + "learning_rate": 1.6502663372441053e-07, + "loss": 0.0862, + "step": 10924 + }, + { + "epoch": 0.92, + "grad_norm": 0.351208112802025, + "learning_rate": 1.6467911720571216e-07, + "loss": 0.0596, + "step": 10925 + }, + { + "epoch": 0.92, + "grad_norm": 0.5413161115406405, + "learning_rate": 1.6433196084988113e-07, + "loss": 0.1199, + "step": 10926 + }, + { + "epoch": 0.92, + "grad_norm": 0.36451183211082616, + "learning_rate": 1.6398516468277503e-07, + "loss": 0.0823, + "step": 10927 + }, + { + "epoch": 0.92, + "grad_norm": 0.42996920408962125, + "learning_rate": 1.63638728730226e-07, + "loss": 0.0993, + "step": 10928 + }, + { + "epoch": 0.92, + "grad_norm": 0.4802627540909113, + "learning_rate": 1.6329265301803843e-07, + "loss": 0.1061, + "step": 10929 + }, + { + "epoch": 0.92, + "grad_norm": 0.3489318677176047, + "learning_rate": 1.6294693757198943e-07, + "loss": 0.0878, + "step": 10930 + }, + { + "epoch": 0.92, + "grad_norm": 0.4166750667584952, + "learning_rate": 1.626015824178312e-07, + "loss": 0.0963, + "step": 10931 + }, + { + "epoch": 0.92, + "grad_norm": 0.3907724352636322, + "learning_rate": 1.6225658758128703e-07, + "loss": 0.0518, + "step": 10932 + }, + { + "epoch": 0.92, + "grad_norm": 0.3015459053800912, + "learning_rate": 1.6191195308805417e-07, + "loss": 0.0548, + "step": 10933 + }, + { + "epoch": 0.92, + "grad_norm": 0.4361947951910891, + "learning_rate": 1.615676789638021e-07, + "loss": 0.0825, + "step": 10934 + }, + { + "epoch": 0.92, + "grad_norm": 0.34419642519696336, + "learning_rate": 1.6122376523417638e-07, + "loss": 0.057, + "step": 10935 + }, + { + "epoch": 0.92, + "grad_norm": 0.3325474144844211, + "learning_rate": 1.6088021192479207e-07, + "loss": 0.0831, + "step": 10936 + }, + { + "epoch": 0.92, + "grad_norm": 0.3027950083915135, + "learning_rate": 1.605370190612393e-07, + "loss": 0.0621, + "step": 10937 + }, + { + "epoch": 0.92, + "grad_norm": 0.40322321290470015, + "learning_rate": 1.601941866690815e-07, + "loss": 0.0801, + "step": 10938 + }, + { + "epoch": 0.92, + "grad_norm": 0.3970684844148039, + "learning_rate": 1.5985171477385485e-07, + "loss": 0.079, + "step": 10939 + }, + { + "epoch": 0.92, + "grad_norm": 0.19992423798987713, + "learning_rate": 1.5950960340106848e-07, + "loss": 0.0509, + "step": 10940 + }, + { + "epoch": 0.92, + "grad_norm": 0.42758933245322533, + "learning_rate": 1.591678525762047e-07, + "loss": 0.089, + "step": 10941 + }, + { + "epoch": 0.92, + "grad_norm": 0.24037967554726927, + "learning_rate": 1.5882646232471877e-07, + "loss": 0.0636, + "step": 10942 + }, + { + "epoch": 0.92, + "grad_norm": 0.2921442944739254, + "learning_rate": 1.5848543267203976e-07, + "loss": 0.0542, + "step": 10943 + }, + { + "epoch": 0.92, + "grad_norm": 0.4419292576801984, + "learning_rate": 1.5814476364357013e-07, + "loss": 0.0971, + "step": 10944 + }, + { + "epoch": 0.92, + "grad_norm": 0.3598501085061918, + "learning_rate": 1.5780445526468457e-07, + "loss": 0.1075, + "step": 10945 + }, + { + "epoch": 0.92, + "grad_norm": 0.21631671745008493, + "learning_rate": 1.5746450756073062e-07, + "loss": 0.0462, + "step": 10946 + }, + { + "epoch": 0.92, + "grad_norm": 0.44433941711797487, + "learning_rate": 1.571249205570302e-07, + "loss": 0.0699, + "step": 10947 + }, + { + "epoch": 0.92, + "grad_norm": 0.3694008390036305, + "learning_rate": 1.5678569427887756e-07, + "loss": 0.1017, + "step": 10948 + }, + { + "epoch": 0.92, + "grad_norm": 0.1973451981976213, + "learning_rate": 1.5644682875154026e-07, + "loss": 0.0655, + "step": 10949 + }, + { + "epoch": 0.92, + "grad_norm": 0.3170268448730254, + "learning_rate": 1.5610832400025921e-07, + "loss": 0.0804, + "step": 10950 + }, + { + "epoch": 0.92, + "grad_norm": 0.21664474621761837, + "learning_rate": 1.5577018005024702e-07, + "loss": 0.0411, + "step": 10951 + }, + { + "epoch": 0.92, + "grad_norm": 0.5123724062690334, + "learning_rate": 1.554323969266919e-07, + "loss": 0.0941, + "step": 10952 + }, + { + "epoch": 0.92, + "grad_norm": 0.1882500177831335, + "learning_rate": 1.550949746547542e-07, + "loss": 0.0492, + "step": 10953 + }, + { + "epoch": 0.92, + "grad_norm": 0.40682578384981505, + "learning_rate": 1.5475791325956556e-07, + "loss": 0.1132, + "step": 10954 + }, + { + "epoch": 0.92, + "grad_norm": 0.23282010426795893, + "learning_rate": 1.5442121276623369e-07, + "loss": 0.0563, + "step": 10955 + }, + { + "epoch": 0.92, + "grad_norm": 0.3986976654197232, + "learning_rate": 1.5408487319983734e-07, + "loss": 0.0396, + "step": 10956 + }, + { + "epoch": 0.92, + "grad_norm": 0.6821326980939498, + "learning_rate": 1.5374889458542875e-07, + "loss": 0.0988, + "step": 10957 + }, + { + "epoch": 0.92, + "grad_norm": 0.3568034030439121, + "learning_rate": 1.5341327694803455e-07, + "loss": 0.0681, + "step": 10958 + }, + { + "epoch": 0.92, + "grad_norm": 0.2673461419668013, + "learning_rate": 1.5307802031265305e-07, + "loss": 0.0826, + "step": 10959 + }, + { + "epoch": 0.92, + "grad_norm": 0.39532692543599574, + "learning_rate": 1.52743124704256e-07, + "loss": 0.0718, + "step": 10960 + }, + { + "epoch": 0.92, + "grad_norm": 0.49744765993420337, + "learning_rate": 1.5240859014778787e-07, + "loss": 0.0946, + "step": 10961 + }, + { + "epoch": 0.92, + "grad_norm": 0.3844361127497604, + "learning_rate": 1.5207441666816758e-07, + "loss": 0.0828, + "step": 10962 + }, + { + "epoch": 0.92, + "grad_norm": 0.5035057548365465, + "learning_rate": 1.5174060429028582e-07, + "loss": 0.1349, + "step": 10963 + }, + { + "epoch": 0.92, + "grad_norm": 0.22898304472490458, + "learning_rate": 1.514071530390071e-07, + "loss": 0.0386, + "step": 10964 + }, + { + "epoch": 0.92, + "grad_norm": 0.3564599031160037, + "learning_rate": 1.5107406293916883e-07, + "loss": 0.0859, + "step": 10965 + }, + { + "epoch": 0.92, + "grad_norm": 0.25951859024984947, + "learning_rate": 1.5074133401558165e-07, + "loss": 0.0818, + "step": 10966 + }, + { + "epoch": 0.92, + "grad_norm": 0.24153081994501283, + "learning_rate": 1.504089662930286e-07, + "loss": 0.0415, + "step": 10967 + }, + { + "epoch": 0.92, + "grad_norm": 0.28175278736023496, + "learning_rate": 1.5007695979626646e-07, + "loss": 0.058, + "step": 10968 + }, + { + "epoch": 0.92, + "grad_norm": 0.20701268775280623, + "learning_rate": 1.4974531455002605e-07, + "loss": 0.0746, + "step": 10969 + }, + { + "epoch": 0.92, + "grad_norm": 0.2905471592784245, + "learning_rate": 1.4941403057900927e-07, + "loss": 0.0929, + "step": 10970 + }, + { + "epoch": 0.92, + "grad_norm": 0.2807491074182197, + "learning_rate": 1.4908310790789137e-07, + "loss": 0.0835, + "step": 10971 + }, + { + "epoch": 0.92, + "grad_norm": 0.21969335684062477, + "learning_rate": 1.487525465613232e-07, + "loss": 0.0639, + "step": 10972 + }, + { + "epoch": 0.92, + "grad_norm": 0.2971679042824664, + "learning_rate": 1.4842234656392562e-07, + "loss": 0.0492, + "step": 10973 + }, + { + "epoch": 0.92, + "grad_norm": 0.4987830549264698, + "learning_rate": 1.4809250794029452e-07, + "loss": 0.1133, + "step": 10974 + }, + { + "epoch": 0.92, + "grad_norm": 0.33337451637349785, + "learning_rate": 1.4776303071499687e-07, + "loss": 0.0795, + "step": 10975 + }, + { + "epoch": 0.92, + "grad_norm": 0.3033378143128062, + "learning_rate": 1.4743391491257585e-07, + "loss": 0.0868, + "step": 10976 + }, + { + "epoch": 0.92, + "grad_norm": 0.48357695957854385, + "learning_rate": 1.4710516055754521e-07, + "loss": 0.1008, + "step": 10977 + }, + { + "epoch": 0.93, + "grad_norm": 0.34691315527683, + "learning_rate": 1.4677676767439198e-07, + "loss": 0.0867, + "step": 10978 + }, + { + "epoch": 0.93, + "grad_norm": 0.349506733591542, + "learning_rate": 1.4644873628757773e-07, + "loss": 0.1008, + "step": 10979 + }, + { + "epoch": 0.93, + "grad_norm": 0.3993853506533474, + "learning_rate": 1.4612106642153568e-07, + "loss": 0.0863, + "step": 10980 + }, + { + "epoch": 0.93, + "grad_norm": 0.5637088142549584, + "learning_rate": 1.4579375810067186e-07, + "loss": 0.1152, + "step": 10981 + }, + { + "epoch": 0.93, + "grad_norm": 0.2800794203106138, + "learning_rate": 1.454668113493668e-07, + "loss": 0.0784, + "step": 10982 + }, + { + "epoch": 0.93, + "grad_norm": 0.317480010413139, + "learning_rate": 1.4514022619197431e-07, + "loss": 0.0692, + "step": 10983 + }, + { + "epoch": 0.93, + "grad_norm": 0.23131663785230341, + "learning_rate": 1.4481400265281888e-07, + "loss": 0.0393, + "step": 10984 + }, + { + "epoch": 0.93, + "grad_norm": 0.3063102832241636, + "learning_rate": 1.4448814075619988e-07, + "loss": 0.0878, + "step": 10985 + }, + { + "epoch": 0.93, + "grad_norm": 0.330661163671372, + "learning_rate": 1.4416264052638961e-07, + "loss": 0.054, + "step": 10986 + }, + { + "epoch": 0.93, + "grad_norm": 0.29342077034986697, + "learning_rate": 1.4383750198763368e-07, + "loss": 0.0865, + "step": 10987 + }, + { + "epoch": 0.93, + "grad_norm": 0.28997534567421346, + "learning_rate": 1.4351272516414882e-07, + "loss": 0.0555, + "step": 10988 + }, + { + "epoch": 0.93, + "grad_norm": 0.5302516164253191, + "learning_rate": 1.4318831008012845e-07, + "loss": 0.1007, + "step": 10989 + }, + { + "epoch": 0.93, + "grad_norm": 0.35094217572194647, + "learning_rate": 1.4286425675973549e-07, + "loss": 0.0897, + "step": 10990 + }, + { + "epoch": 0.93, + "grad_norm": 0.287059338660618, + "learning_rate": 1.4254056522710779e-07, + "loss": 0.0387, + "step": 10991 + }, + { + "epoch": 0.93, + "grad_norm": 0.2870486821576297, + "learning_rate": 1.4221723550635446e-07, + "loss": 0.0533, + "step": 10992 + }, + { + "epoch": 0.93, + "grad_norm": 0.3076179063433011, + "learning_rate": 1.4189426762156122e-07, + "loss": 0.0733, + "step": 10993 + }, + { + "epoch": 0.93, + "grad_norm": 0.24929750374699525, + "learning_rate": 1.4157166159678326e-07, + "loss": 0.0821, + "step": 10994 + }, + { + "epoch": 0.93, + "grad_norm": 0.22563166378918179, + "learning_rate": 1.4124941745605024e-07, + "loss": 0.0478, + "step": 10995 + }, + { + "epoch": 0.93, + "grad_norm": 0.2634247334143495, + "learning_rate": 1.4092753522336466e-07, + "loss": 0.0496, + "step": 10996 + }, + { + "epoch": 0.93, + "grad_norm": 0.22570587237133935, + "learning_rate": 1.4060601492270288e-07, + "loss": 0.075, + "step": 10997 + }, + { + "epoch": 0.93, + "grad_norm": 0.2643235702530205, + "learning_rate": 1.4028485657801295e-07, + "loss": 0.0638, + "step": 10998 + }, + { + "epoch": 0.93, + "grad_norm": 0.38302164938698036, + "learning_rate": 1.3996406021321686e-07, + "loss": 0.0823, + "step": 10999 + }, + { + "epoch": 0.93, + "grad_norm": 0.32411950697647246, + "learning_rate": 1.3964362585220993e-07, + "loss": 0.084, + "step": 11000 + }, + { + "epoch": 0.93, + "grad_norm": 0.3690642630777387, + "learning_rate": 1.393235535188586e-07, + "loss": 0.1092, + "step": 11001 + }, + { + "epoch": 0.93, + "grad_norm": 0.5611291920937465, + "learning_rate": 1.39003843237005e-07, + "loss": 0.1298, + "step": 11002 + }, + { + "epoch": 0.93, + "grad_norm": 0.3971075828971253, + "learning_rate": 1.386844950304622e-07, + "loss": 0.0881, + "step": 11003 + }, + { + "epoch": 0.93, + "grad_norm": 0.2640944537587143, + "learning_rate": 1.3836550892301792e-07, + "loss": 0.0681, + "step": 11004 + }, + { + "epoch": 0.93, + "grad_norm": 0.31481012095030914, + "learning_rate": 1.3804688493843087e-07, + "loss": 0.0739, + "step": 11005 + }, + { + "epoch": 0.93, + "grad_norm": 0.2593427515773532, + "learning_rate": 1.3772862310043543e-07, + "loss": 0.0861, + "step": 11006 + }, + { + "epoch": 0.93, + "grad_norm": 0.5013116804945846, + "learning_rate": 1.374107234327371e-07, + "loss": 0.0921, + "step": 11007 + }, + { + "epoch": 0.93, + "grad_norm": 0.2734727757925456, + "learning_rate": 1.3709318595901465e-07, + "loss": 0.0786, + "step": 11008 + }, + { + "epoch": 0.93, + "grad_norm": 0.32302142223359825, + "learning_rate": 1.3677601070292034e-07, + "loss": 0.0958, + "step": 11009 + }, + { + "epoch": 0.93, + "grad_norm": 0.3339957079727807, + "learning_rate": 1.3645919768807858e-07, + "loss": 0.0908, + "step": 11010 + }, + { + "epoch": 0.93, + "grad_norm": 0.21727381421509548, + "learning_rate": 1.361427469380888e-07, + "loss": 0.0403, + "step": 11011 + }, + { + "epoch": 0.93, + "grad_norm": 0.2710627617893132, + "learning_rate": 1.3582665847652053e-07, + "loss": 0.0837, + "step": 11012 + }, + { + "epoch": 0.93, + "grad_norm": 0.7516050556880568, + "learning_rate": 1.3551093232691937e-07, + "loss": 0.132, + "step": 11013 + }, + { + "epoch": 0.93, + "grad_norm": 0.3416181980584646, + "learning_rate": 1.351955685128009e-07, + "loss": 0.0609, + "step": 11014 + }, + { + "epoch": 0.93, + "grad_norm": 0.3079639260409698, + "learning_rate": 1.348805670576564e-07, + "loss": 0.1092, + "step": 11015 + }, + { + "epoch": 0.93, + "grad_norm": 0.2008957145760628, + "learning_rate": 1.3456592798494928e-07, + "loss": 0.0601, + "step": 11016 + }, + { + "epoch": 0.93, + "grad_norm": 0.3215262886148315, + "learning_rate": 1.342516513181147e-07, + "loss": 0.0611, + "step": 11017 + }, + { + "epoch": 0.93, + "grad_norm": 0.4316006367952478, + "learning_rate": 1.3393773708056225e-07, + "loss": 0.0684, + "step": 11018 + }, + { + "epoch": 0.93, + "grad_norm": 0.26739681515823316, + "learning_rate": 1.3362418529567322e-07, + "loss": 0.0508, + "step": 11019 + }, + { + "epoch": 0.93, + "grad_norm": 0.2067938739321848, + "learning_rate": 1.3331099598680453e-07, + "loss": 0.038, + "step": 11020 + }, + { + "epoch": 0.93, + "grad_norm": 0.5479743881268239, + "learning_rate": 1.3299816917728303e-07, + "loss": 0.1112, + "step": 11021 + }, + { + "epoch": 0.93, + "grad_norm": 0.5689304550789344, + "learning_rate": 1.326857048904101e-07, + "loss": 0.0743, + "step": 11022 + }, + { + "epoch": 0.93, + "grad_norm": 0.4624647981447354, + "learning_rate": 1.3237360314946045e-07, + "loss": 0.076, + "step": 11023 + }, + { + "epoch": 0.93, + "grad_norm": 0.28509041814383523, + "learning_rate": 1.3206186397768107e-07, + "loss": 0.0699, + "step": 11024 + }, + { + "epoch": 0.93, + "grad_norm": 0.29341525083605907, + "learning_rate": 1.3175048739829167e-07, + "loss": 0.072, + "step": 11025 + }, + { + "epoch": 0.93, + "grad_norm": 0.2571658055766883, + "learning_rate": 1.3143947343448482e-07, + "loss": 0.0754, + "step": 11026 + }, + { + "epoch": 0.93, + "grad_norm": 0.4036928123104843, + "learning_rate": 1.311288221094287e-07, + "loss": 0.0789, + "step": 11027 + }, + { + "epoch": 0.93, + "grad_norm": 0.3502379375605657, + "learning_rate": 1.3081853344626082e-07, + "loss": 0.0873, + "step": 11028 + }, + { + "epoch": 0.93, + "grad_norm": 0.366677074693949, + "learning_rate": 1.305086074680928e-07, + "loss": 0.0783, + "step": 11029 + }, + { + "epoch": 0.93, + "grad_norm": 0.27978024354036973, + "learning_rate": 1.3019904419801165e-07, + "loss": 0.0502, + "step": 11030 + }, + { + "epoch": 0.93, + "grad_norm": 0.5302028471515052, + "learning_rate": 1.2988984365907397e-07, + "loss": 0.1365, + "step": 11031 + }, + { + "epoch": 0.93, + "grad_norm": 0.38606569544698016, + "learning_rate": 1.2958100587431076e-07, + "loss": 0.1038, + "step": 11032 + }, + { + "epoch": 0.93, + "grad_norm": 0.3095819248433764, + "learning_rate": 1.2927253086672698e-07, + "loss": 0.0688, + "step": 11033 + }, + { + "epoch": 0.93, + "grad_norm": 0.27863117005278804, + "learning_rate": 1.2896441865929976e-07, + "loss": 0.0763, + "step": 11034 + }, + { + "epoch": 0.93, + "grad_norm": 0.26107616828004504, + "learning_rate": 1.28656669274978e-07, + "loss": 0.0634, + "step": 11035 + }, + { + "epoch": 0.93, + "grad_norm": 0.20922024806940337, + "learning_rate": 1.2834928273668446e-07, + "loss": 0.0467, + "step": 11036 + }, + { + "epoch": 0.93, + "grad_norm": 0.31224538630757276, + "learning_rate": 1.2804225906731693e-07, + "loss": 0.0769, + "step": 11037 + }, + { + "epoch": 0.93, + "grad_norm": 1.0982170682710088, + "learning_rate": 1.2773559828974268e-07, + "loss": 0.0771, + "step": 11038 + }, + { + "epoch": 0.93, + "grad_norm": 0.42984613220516227, + "learning_rate": 1.2742930042680402e-07, + "loss": 0.1025, + "step": 11039 + }, + { + "epoch": 0.93, + "grad_norm": 0.3211660310350537, + "learning_rate": 1.27123365501316e-07, + "loss": 0.0648, + "step": 11040 + }, + { + "epoch": 0.93, + "grad_norm": 0.3725324455960209, + "learning_rate": 1.268177935360665e-07, + "loss": 0.1124, + "step": 11041 + }, + { + "epoch": 0.93, + "grad_norm": 0.47762734935196544, + "learning_rate": 1.265125845538162e-07, + "loss": 0.0854, + "step": 11042 + }, + { + "epoch": 0.93, + "grad_norm": 0.4949293571046767, + "learning_rate": 1.2620773857729807e-07, + "loss": 0.1227, + "step": 11043 + }, + { + "epoch": 0.93, + "grad_norm": 0.26003897704203677, + "learning_rate": 1.2590325562922002e-07, + "loss": 0.0723, + "step": 11044 + }, + { + "epoch": 0.93, + "grad_norm": 0.3492841026661677, + "learning_rate": 1.2559913573226168e-07, + "loss": 0.107, + "step": 11045 + }, + { + "epoch": 0.93, + "grad_norm": 0.2920559640824417, + "learning_rate": 1.2529537890907438e-07, + "loss": 0.0473, + "step": 11046 + }, + { + "epoch": 0.93, + "grad_norm": 0.38668183035908504, + "learning_rate": 1.249919851822845e-07, + "loss": 0.0994, + "step": 11047 + }, + { + "epoch": 0.93, + "grad_norm": 0.36097518629495084, + "learning_rate": 1.2468895457449115e-07, + "loss": 0.0787, + "step": 11048 + }, + { + "epoch": 0.93, + "grad_norm": 0.30465758635246637, + "learning_rate": 1.2438628710826462e-07, + "loss": 0.1228, + "step": 11049 + }, + { + "epoch": 0.93, + "grad_norm": 0.2193426357381622, + "learning_rate": 1.2408398280615075e-07, + "loss": 0.0447, + "step": 11050 + }, + { + "epoch": 0.93, + "grad_norm": 0.3357814261508535, + "learning_rate": 1.2378204169066599e-07, + "loss": 0.0939, + "step": 11051 + }, + { + "epoch": 0.93, + "grad_norm": 0.33715502988057505, + "learning_rate": 1.2348046378430068e-07, + "loss": 0.0776, + "step": 11052 + }, + { + "epoch": 0.93, + "grad_norm": 0.24794686955078973, + "learning_rate": 1.2317924910951794e-07, + "loss": 0.061, + "step": 11053 + }, + { + "epoch": 0.93, + "grad_norm": 0.31391790070466, + "learning_rate": 1.2287839768875431e-07, + "loss": 0.07, + "step": 11054 + }, + { + "epoch": 0.93, + "grad_norm": 0.24467535575192373, + "learning_rate": 1.2257790954441962e-07, + "loss": 0.0254, + "step": 11055 + }, + { + "epoch": 0.93, + "grad_norm": 0.3228074477683021, + "learning_rate": 1.2227778469889429e-07, + "loss": 0.0867, + "step": 11056 + }, + { + "epoch": 0.93, + "grad_norm": 0.2411432887187595, + "learning_rate": 1.2197802317453544e-07, + "loss": 0.047, + "step": 11057 + }, + { + "epoch": 0.93, + "grad_norm": 0.4206553135367485, + "learning_rate": 1.2167862499366968e-07, + "loss": 0.1286, + "step": 11058 + }, + { + "epoch": 0.93, + "grad_norm": 0.3173055861147901, + "learning_rate": 1.2137959017859802e-07, + "loss": 0.0784, + "step": 11059 + }, + { + "epoch": 0.93, + "grad_norm": 0.23820894682227708, + "learning_rate": 1.2108091875159432e-07, + "loss": 0.0719, + "step": 11060 + }, + { + "epoch": 0.93, + "grad_norm": 0.3714655031189375, + "learning_rate": 1.2078261073490581e-07, + "loss": 0.1096, + "step": 11061 + }, + { + "epoch": 0.93, + "grad_norm": 0.3060146658426092, + "learning_rate": 1.204846661507525e-07, + "loss": 0.0915, + "step": 11062 + }, + { + "epoch": 0.93, + "grad_norm": 0.33058244785948837, + "learning_rate": 1.2018708502132602e-07, + "loss": 0.0601, + "step": 11063 + }, + { + "epoch": 0.93, + "grad_norm": 0.29436704459557533, + "learning_rate": 1.198898673687926e-07, + "loss": 0.096, + "step": 11064 + }, + { + "epoch": 0.93, + "grad_norm": 0.2034673746554507, + "learning_rate": 1.195930132152906e-07, + "loss": 0.0683, + "step": 11065 + }, + { + "epoch": 0.93, + "grad_norm": 0.2856781654113205, + "learning_rate": 1.1929652258293122e-07, + "loss": 0.0908, + "step": 11066 + }, + { + "epoch": 0.93, + "grad_norm": 0.30265539699690486, + "learning_rate": 1.1900039549379904e-07, + "loss": 0.0406, + "step": 11067 + }, + { + "epoch": 0.93, + "grad_norm": 0.4239218909472435, + "learning_rate": 1.1870463196995196e-07, + "loss": 0.1285, + "step": 11068 + }, + { + "epoch": 0.93, + "grad_norm": 0.27239438551441, + "learning_rate": 1.1840923203341903e-07, + "loss": 0.0899, + "step": 11069 + }, + { + "epoch": 0.93, + "grad_norm": 0.47260266589926764, + "learning_rate": 1.1811419570620375e-07, + "loss": 0.1129, + "step": 11070 + }, + { + "epoch": 0.93, + "grad_norm": 0.49296411168890064, + "learning_rate": 1.1781952301028243e-07, + "loss": 0.1371, + "step": 11071 + }, + { + "epoch": 0.93, + "grad_norm": 0.23522108228345823, + "learning_rate": 1.1752521396760419e-07, + "loss": 0.0509, + "step": 11072 + }, + { + "epoch": 0.93, + "grad_norm": 0.2938075118739258, + "learning_rate": 1.1723126860008981e-07, + "loss": 0.0762, + "step": 11073 + }, + { + "epoch": 0.93, + "grad_norm": 0.18859819434479183, + "learning_rate": 1.1693768692963569e-07, + "loss": 0.0694, + "step": 11074 + }, + { + "epoch": 0.93, + "grad_norm": 0.2331799941321318, + "learning_rate": 1.1664446897810821e-07, + "loss": 0.0692, + "step": 11075 + }, + { + "epoch": 0.93, + "grad_norm": 0.2775195088717485, + "learning_rate": 1.1635161476734824e-07, + "loss": 0.062, + "step": 11076 + }, + { + "epoch": 0.93, + "grad_norm": 0.40283451705409695, + "learning_rate": 1.1605912431916943e-07, + "loss": 0.0725, + "step": 11077 + }, + { + "epoch": 0.93, + "grad_norm": 0.22180221832919114, + "learning_rate": 1.157669976553577e-07, + "loss": 0.0564, + "step": 11078 + }, + { + "epoch": 0.93, + "grad_norm": 0.27655848838424457, + "learning_rate": 1.1547523479767287e-07, + "loss": 0.0726, + "step": 11079 + }, + { + "epoch": 0.93, + "grad_norm": 0.2589735677437674, + "learning_rate": 1.1518383576784753e-07, + "loss": 0.0589, + "step": 11080 + }, + { + "epoch": 0.93, + "grad_norm": 0.3955122149407037, + "learning_rate": 1.1489280058758601e-07, + "loss": 0.084, + "step": 11081 + }, + { + "epoch": 0.93, + "grad_norm": 0.3190993815374948, + "learning_rate": 1.1460212927856595e-07, + "loss": 0.092, + "step": 11082 + }, + { + "epoch": 0.93, + "grad_norm": 0.5942341938638771, + "learning_rate": 1.143118218624395e-07, + "loss": 0.0883, + "step": 11083 + }, + { + "epoch": 0.93, + "grad_norm": 0.1862497302499822, + "learning_rate": 1.140218783608299e-07, + "loss": 0.04, + "step": 11084 + }, + { + "epoch": 0.93, + "grad_norm": 0.3063210335201512, + "learning_rate": 1.1373229879533376e-07, + "loss": 0.0891, + "step": 11085 + }, + { + "epoch": 0.93, + "grad_norm": 0.34530531220014465, + "learning_rate": 1.1344308318752051e-07, + "loss": 0.1037, + "step": 11086 + }, + { + "epoch": 0.93, + "grad_norm": 0.24713037225827278, + "learning_rate": 1.1315423155893179e-07, + "loss": 0.0705, + "step": 11087 + }, + { + "epoch": 0.93, + "grad_norm": 0.3231677939429711, + "learning_rate": 1.1286574393108485e-07, + "loss": 0.0571, + "step": 11088 + }, + { + "epoch": 0.93, + "grad_norm": 0.31207249153400085, + "learning_rate": 1.1257762032546693e-07, + "loss": 0.0702, + "step": 11089 + }, + { + "epoch": 0.93, + "grad_norm": 0.4573930505270725, + "learning_rate": 1.1228986076353865e-07, + "loss": 0.0937, + "step": 11090 + }, + { + "epoch": 0.93, + "grad_norm": 0.2137385492851112, + "learning_rate": 1.1200246526673509e-07, + "loss": 0.0467, + "step": 11091 + }, + { + "epoch": 0.93, + "grad_norm": 0.22265368858365467, + "learning_rate": 1.1171543385646244e-07, + "loss": 0.0538, + "step": 11092 + }, + { + "epoch": 0.93, + "grad_norm": 0.40744981240597195, + "learning_rate": 1.1142876655410084e-07, + "loss": 0.1155, + "step": 11093 + }, + { + "epoch": 0.93, + "grad_norm": 0.2914035617508047, + "learning_rate": 1.1114246338100209e-07, + "loss": 0.0613, + "step": 11094 + }, + { + "epoch": 0.93, + "grad_norm": 0.299713112638648, + "learning_rate": 1.1085652435849303e-07, + "loss": 0.083, + "step": 11095 + }, + { + "epoch": 0.93, + "grad_norm": 0.4469237316739102, + "learning_rate": 1.1057094950787162e-07, + "loss": 0.0747, + "step": 11096 + }, + { + "epoch": 0.94, + "grad_norm": 0.3487351181843528, + "learning_rate": 1.1028573885040806e-07, + "loss": 0.0882, + "step": 11097 + }, + { + "epoch": 0.94, + "grad_norm": 0.24219512939207158, + "learning_rate": 1.1000089240734757e-07, + "loss": 0.056, + "step": 11098 + }, + { + "epoch": 0.94, + "grad_norm": 0.33748241318513894, + "learning_rate": 1.0971641019990764e-07, + "loss": 0.0827, + "step": 11099 + }, + { + "epoch": 0.94, + "grad_norm": 0.6583702746094607, + "learning_rate": 1.0943229224927687e-07, + "loss": 0.0906, + "step": 11100 + }, + { + "epoch": 0.94, + "grad_norm": 0.2484965385955238, + "learning_rate": 1.091485385766189e-07, + "loss": 0.0522, + "step": 11101 + }, + { + "epoch": 0.94, + "grad_norm": 0.19693323631136342, + "learning_rate": 1.0886514920306902e-07, + "loss": 0.0489, + "step": 11102 + }, + { + "epoch": 0.94, + "grad_norm": 0.25318500687297296, + "learning_rate": 1.085821241497359e-07, + "loss": 0.0547, + "step": 11103 + }, + { + "epoch": 0.94, + "grad_norm": 0.43407562696556895, + "learning_rate": 1.0829946343770048e-07, + "loss": 0.1305, + "step": 11104 + }, + { + "epoch": 0.94, + "grad_norm": 0.2887839597129628, + "learning_rate": 1.0801716708801757e-07, + "loss": 0.0414, + "step": 11105 + }, + { + "epoch": 0.94, + "grad_norm": 0.31810485943793565, + "learning_rate": 1.0773523512171368e-07, + "loss": 0.0755, + "step": 11106 + }, + { + "epoch": 0.94, + "grad_norm": 0.5098641948806104, + "learning_rate": 1.074536675597887e-07, + "loss": 0.1176, + "step": 11107 + }, + { + "epoch": 0.94, + "grad_norm": 0.2641917895628593, + "learning_rate": 1.0717246442321638e-07, + "loss": 0.0686, + "step": 11108 + }, + { + "epoch": 0.94, + "grad_norm": 0.23773646542488674, + "learning_rate": 1.068916257329422e-07, + "loss": 0.0671, + "step": 11109 + }, + { + "epoch": 0.94, + "grad_norm": 0.45564730259135444, + "learning_rate": 1.0661115150988332e-07, + "loss": 0.0955, + "step": 11110 + }, + { + "epoch": 0.94, + "grad_norm": 0.6361904933325756, + "learning_rate": 1.0633104177493192e-07, + "loss": 0.0989, + "step": 11111 + }, + { + "epoch": 0.94, + "grad_norm": 0.40220513493448534, + "learning_rate": 1.0605129654895296e-07, + "loss": 0.0838, + "step": 11112 + }, + { + "epoch": 0.94, + "grad_norm": 0.28176553003860905, + "learning_rate": 1.0577191585278257e-07, + "loss": 0.072, + "step": 11113 + }, + { + "epoch": 0.94, + "grad_norm": 0.34817836981770933, + "learning_rate": 1.0549289970723021e-07, + "loss": 0.1194, + "step": 11114 + }, + { + "epoch": 0.94, + "grad_norm": 0.4596741557188952, + "learning_rate": 1.0521424813307979e-07, + "loss": 0.098, + "step": 11115 + }, + { + "epoch": 0.94, + "grad_norm": 0.30019521486968, + "learning_rate": 1.0493596115108695e-07, + "loss": 0.0831, + "step": 11116 + }, + { + "epoch": 0.94, + "grad_norm": 0.3730464457498851, + "learning_rate": 1.0465803878197899e-07, + "loss": 0.0846, + "step": 11117 + }, + { + "epoch": 0.94, + "grad_norm": 0.3825698974745048, + "learning_rate": 1.043804810464577e-07, + "loss": 0.0843, + "step": 11118 + }, + { + "epoch": 0.94, + "grad_norm": 0.19465036026403382, + "learning_rate": 1.0410328796519764e-07, + "loss": 0.0418, + "step": 11119 + }, + { + "epoch": 0.94, + "grad_norm": 0.4297390873607323, + "learning_rate": 1.0382645955884508e-07, + "loss": 0.1392, + "step": 11120 + }, + { + "epoch": 0.94, + "grad_norm": 0.19914969879815567, + "learning_rate": 1.0354999584802017e-07, + "loss": 0.0543, + "step": 11121 + }, + { + "epoch": 0.94, + "grad_norm": 0.3368781128553592, + "learning_rate": 1.0327389685331535e-07, + "loss": 0.0742, + "step": 11122 + }, + { + "epoch": 0.94, + "grad_norm": 0.2561567565555721, + "learning_rate": 1.0299816259529638e-07, + "loss": 0.0449, + "step": 11123 + }, + { + "epoch": 0.94, + "grad_norm": 0.442824555581664, + "learning_rate": 1.0272279309450129e-07, + "loss": 0.0722, + "step": 11124 + }, + { + "epoch": 0.94, + "grad_norm": 0.39412310736001593, + "learning_rate": 1.0244778837144198e-07, + "loss": 0.1188, + "step": 11125 + }, + { + "epoch": 0.94, + "grad_norm": 0.22899019583356575, + "learning_rate": 1.0217314844660097e-07, + "loss": 0.045, + "step": 11126 + }, + { + "epoch": 0.94, + "grad_norm": 0.24268415690934625, + "learning_rate": 1.0189887334043635e-07, + "loss": 0.0507, + "step": 11127 + }, + { + "epoch": 0.94, + "grad_norm": 0.28323668343850633, + "learning_rate": 1.0162496307337677e-07, + "loss": 0.0374, + "step": 11128 + }, + { + "epoch": 0.94, + "grad_norm": 0.35385262776531995, + "learning_rate": 1.0135141766582535e-07, + "loss": 0.0329, + "step": 11129 + }, + { + "epoch": 0.94, + "grad_norm": 0.3649155454418222, + "learning_rate": 1.010782371381569e-07, + "loss": 0.0601, + "step": 11130 + }, + { + "epoch": 0.94, + "grad_norm": 0.31108798852298297, + "learning_rate": 1.0080542151071959e-07, + "loss": 0.0771, + "step": 11131 + }, + { + "epoch": 0.94, + "grad_norm": 0.734070394559944, + "learning_rate": 1.0053297080383496e-07, + "loss": 0.12, + "step": 11132 + }, + { + "epoch": 0.94, + "grad_norm": 0.228704914015767, + "learning_rate": 1.0026088503779564e-07, + "loss": 0.0588, + "step": 11133 + }, + { + "epoch": 0.94, + "grad_norm": 0.5435423084885356, + "learning_rate": 9.998916423286876e-08, + "loss": 0.0978, + "step": 11134 + }, + { + "epoch": 0.94, + "grad_norm": 0.4121767810379705, + "learning_rate": 9.971780840929368e-08, + "loss": 0.0804, + "step": 11135 + }, + { + "epoch": 0.94, + "grad_norm": 0.3759780590336709, + "learning_rate": 9.944681758728258e-08, + "loss": 0.0851, + "step": 11136 + }, + { + "epoch": 0.94, + "grad_norm": 0.33390015919531246, + "learning_rate": 9.917619178702043e-08, + "loss": 0.0793, + "step": 11137 + }, + { + "epoch": 0.94, + "grad_norm": 0.3245391746862259, + "learning_rate": 9.890593102866442e-08, + "loss": 0.0757, + "step": 11138 + }, + { + "epoch": 0.94, + "grad_norm": 0.4999889183245336, + "learning_rate": 9.863603533234622e-08, + "loss": 0.0925, + "step": 11139 + }, + { + "epoch": 0.94, + "grad_norm": 0.21210101211848256, + "learning_rate": 9.83665047181681e-08, + "loss": 0.0462, + "step": 11140 + }, + { + "epoch": 0.94, + "grad_norm": 0.17663036225687423, + "learning_rate": 9.809733920620679e-08, + "loss": 0.0495, + "step": 11141 + }, + { + "epoch": 0.94, + "grad_norm": 0.4829498091272957, + "learning_rate": 9.782853881651233e-08, + "loss": 0.1027, + "step": 11142 + }, + { + "epoch": 0.94, + "grad_norm": 0.2360634002781989, + "learning_rate": 9.756010356910484e-08, + "loss": 0.0523, + "step": 11143 + }, + { + "epoch": 0.94, + "grad_norm": 0.5424902744133441, + "learning_rate": 9.729203348397997e-08, + "loss": 0.068, + "step": 11144 + }, + { + "epoch": 0.94, + "grad_norm": 0.43894418538831637, + "learning_rate": 9.702432858110455e-08, + "loss": 0.087, + "step": 11145 + }, + { + "epoch": 0.94, + "grad_norm": 0.2020666164691161, + "learning_rate": 9.675698888041873e-08, + "loss": 0.0399, + "step": 11146 + }, + { + "epoch": 0.94, + "grad_norm": 0.36028601050765807, + "learning_rate": 9.649001440183658e-08, + "loss": 0.0859, + "step": 11147 + }, + { + "epoch": 0.94, + "grad_norm": 0.25490210576013106, + "learning_rate": 9.622340516524332e-08, + "loss": 0.0413, + "step": 11148 + }, + { + "epoch": 0.94, + "grad_norm": 0.5328712387866651, + "learning_rate": 9.595716119049692e-08, + "loss": 0.126, + "step": 11149 + }, + { + "epoch": 0.94, + "grad_norm": 0.5973336135376869, + "learning_rate": 9.569128249742932e-08, + "loss": 0.0597, + "step": 11150 + }, + { + "epoch": 0.94, + "grad_norm": 0.46205782605054385, + "learning_rate": 9.542576910584467e-08, + "loss": 0.1418, + "step": 11151 + }, + { + "epoch": 0.94, + "grad_norm": 0.3655348174753182, + "learning_rate": 9.516062103552049e-08, + "loss": 0.0957, + "step": 11152 + }, + { + "epoch": 0.94, + "grad_norm": 0.4592514359092967, + "learning_rate": 9.489583830620597e-08, + "loss": 0.1065, + "step": 11153 + }, + { + "epoch": 0.94, + "grad_norm": 0.2877827933482134, + "learning_rate": 9.463142093762367e-08, + "loss": 0.0663, + "step": 11154 + }, + { + "epoch": 0.94, + "grad_norm": 0.23285224799903795, + "learning_rate": 9.436736894946841e-08, + "loss": 0.0711, + "step": 11155 + }, + { + "epoch": 0.94, + "grad_norm": 0.33547084570254815, + "learning_rate": 9.410368236140943e-08, + "loss": 0.0701, + "step": 11156 + }, + { + "epoch": 0.94, + "grad_norm": 0.4975086414040959, + "learning_rate": 9.384036119308771e-08, + "loss": 0.0871, + "step": 11157 + }, + { + "epoch": 0.94, + "grad_norm": 0.2675973146701914, + "learning_rate": 9.357740546411531e-08, + "loss": 0.0891, + "step": 11158 + }, + { + "epoch": 0.94, + "grad_norm": 0.28660399788452495, + "learning_rate": 9.331481519408048e-08, + "loss": 0.0457, + "step": 11159 + }, + { + "epoch": 0.94, + "grad_norm": 0.3596893520389731, + "learning_rate": 9.305259040254144e-08, + "loss": 0.1055, + "step": 11160 + }, + { + "epoch": 0.94, + "grad_norm": 0.37220464402171766, + "learning_rate": 9.279073110903092e-08, + "loss": 0.0772, + "step": 11161 + }, + { + "epoch": 0.94, + "grad_norm": 0.2898081359962696, + "learning_rate": 9.252923733305274e-08, + "loss": 0.0642, + "step": 11162 + }, + { + "epoch": 0.94, + "grad_norm": 0.5619680933823189, + "learning_rate": 9.226810909408579e-08, + "loss": 0.0923, + "step": 11163 + }, + { + "epoch": 0.94, + "grad_norm": 0.3438638219043938, + "learning_rate": 9.200734641157949e-08, + "loss": 0.0596, + "step": 11164 + }, + { + "epoch": 0.94, + "grad_norm": 0.5385374795559705, + "learning_rate": 9.174694930495664e-08, + "loss": 0.1107, + "step": 11165 + }, + { + "epoch": 0.94, + "grad_norm": 0.29259661100403284, + "learning_rate": 9.148691779361452e-08, + "loss": 0.0738, + "step": 11166 + }, + { + "epoch": 0.94, + "grad_norm": 0.32241658110500787, + "learning_rate": 9.122725189692039e-08, + "loss": 0.0692, + "step": 11167 + }, + { + "epoch": 0.94, + "grad_norm": 0.4442942848518488, + "learning_rate": 9.096795163421657e-08, + "loss": 0.0688, + "step": 11168 + }, + { + "epoch": 0.94, + "grad_norm": 0.22061675103132072, + "learning_rate": 9.07090170248165e-08, + "loss": 0.0494, + "step": 11169 + }, + { + "epoch": 0.94, + "grad_norm": 0.24935385543925348, + "learning_rate": 9.045044808800806e-08, + "loss": 0.0621, + "step": 11170 + }, + { + "epoch": 0.94, + "grad_norm": 0.26149146069001444, + "learning_rate": 9.019224484305033e-08, + "loss": 0.0683, + "step": 11171 + }, + { + "epoch": 0.94, + "grad_norm": 0.20905267479499953, + "learning_rate": 8.993440730917569e-08, + "loss": 0.0552, + "step": 11172 + }, + { + "epoch": 0.94, + "grad_norm": 0.44374962044494315, + "learning_rate": 8.967693550559042e-08, + "loss": 0.1094, + "step": 11173 + }, + { + "epoch": 0.94, + "grad_norm": 0.32352189648419505, + "learning_rate": 8.941982945147087e-08, + "loss": 0.0693, + "step": 11174 + }, + { + "epoch": 0.94, + "grad_norm": 0.37343560174673135, + "learning_rate": 8.916308916596894e-08, + "loss": 0.0829, + "step": 11175 + }, + { + "epoch": 0.94, + "grad_norm": 0.3347283913269959, + "learning_rate": 8.890671466820821e-08, + "loss": 0.0877, + "step": 11176 + }, + { + "epoch": 0.94, + "grad_norm": 0.406050802050187, + "learning_rate": 8.865070597728453e-08, + "loss": 0.104, + "step": 11177 + }, + { + "epoch": 0.94, + "grad_norm": 0.24734928450503685, + "learning_rate": 8.839506311226654e-08, + "loss": 0.075, + "step": 11178 + }, + { + "epoch": 0.94, + "grad_norm": 0.307723288515105, + "learning_rate": 8.813978609219676e-08, + "loss": 0.0794, + "step": 11179 + }, + { + "epoch": 0.94, + "grad_norm": 0.3121892422367239, + "learning_rate": 8.788487493608944e-08, + "loss": 0.0887, + "step": 11180 + }, + { + "epoch": 0.94, + "grad_norm": 0.19868817136489092, + "learning_rate": 8.763032966293161e-08, + "loss": 0.0469, + "step": 11181 + }, + { + "epoch": 0.94, + "grad_norm": 0.33222524979361934, + "learning_rate": 8.737615029168367e-08, + "loss": 0.0777, + "step": 11182 + }, + { + "epoch": 0.94, + "grad_norm": 0.27398542395398534, + "learning_rate": 8.712233684127824e-08, + "loss": 0.067, + "step": 11183 + }, + { + "epoch": 0.94, + "grad_norm": 1.1366121371041498, + "learning_rate": 8.686888933062076e-08, + "loss": 0.1125, + "step": 11184 + }, + { + "epoch": 0.94, + "grad_norm": 0.4938426717019866, + "learning_rate": 8.661580777858947e-08, + "loss": 0.1099, + "step": 11185 + }, + { + "epoch": 0.94, + "grad_norm": 0.32501706599396374, + "learning_rate": 8.636309220403538e-08, + "loss": 0.0395, + "step": 11186 + }, + { + "epoch": 0.94, + "grad_norm": 0.59213436980137, + "learning_rate": 8.611074262578234e-08, + "loss": 0.0715, + "step": 11187 + }, + { + "epoch": 0.94, + "grad_norm": 0.19225049871954866, + "learning_rate": 8.585875906262697e-08, + "loss": 0.0356, + "step": 11188 + }, + { + "epoch": 0.94, + "grad_norm": 0.28075366090648485, + "learning_rate": 8.56071415333376e-08, + "loss": 0.0796, + "step": 11189 + }, + { + "epoch": 0.94, + "grad_norm": 0.34416319446156557, + "learning_rate": 8.535589005665701e-08, + "loss": 0.0676, + "step": 11190 + }, + { + "epoch": 0.94, + "grad_norm": 0.3300993629931398, + "learning_rate": 8.510500465129967e-08, + "loss": 0.0997, + "step": 11191 + }, + { + "epoch": 0.94, + "grad_norm": 0.23165686311412675, + "learning_rate": 8.485448533595287e-08, + "loss": 0.0604, + "step": 11192 + }, + { + "epoch": 0.94, + "grad_norm": 0.38879980953372406, + "learning_rate": 8.460433212927721e-08, + "loss": 0.0691, + "step": 11193 + }, + { + "epoch": 0.94, + "grad_norm": 0.23818055724998383, + "learning_rate": 8.435454504990503e-08, + "loss": 0.0707, + "step": 11194 + }, + { + "epoch": 0.94, + "grad_norm": 0.2671763339942266, + "learning_rate": 8.4105124116442e-08, + "loss": 0.0623, + "step": 11195 + }, + { + "epoch": 0.94, + "grad_norm": 0.20741074287060438, + "learning_rate": 8.385606934746604e-08, + "loss": 0.045, + "step": 11196 + }, + { + "epoch": 0.94, + "grad_norm": 0.431272790347049, + "learning_rate": 8.360738076152953e-08, + "loss": 0.1092, + "step": 11197 + }, + { + "epoch": 0.94, + "grad_norm": 0.47105150349570835, + "learning_rate": 8.335905837715485e-08, + "loss": 0.1013, + "step": 11198 + }, + { + "epoch": 0.94, + "grad_norm": 0.28014076928953896, + "learning_rate": 8.311110221283947e-08, + "loss": 0.0531, + "step": 11199 + }, + { + "epoch": 0.94, + "grad_norm": 0.3032661527429069, + "learning_rate": 8.286351228705192e-08, + "loss": 0.0559, + "step": 11200 + }, + { + "epoch": 0.94, + "grad_norm": 0.46947631512391585, + "learning_rate": 8.261628861823467e-08, + "loss": 0.085, + "step": 11201 + }, + { + "epoch": 0.94, + "grad_norm": 0.28421430748229026, + "learning_rate": 8.23694312248019e-08, + "loss": 0.0747, + "step": 11202 + }, + { + "epoch": 0.94, + "grad_norm": 0.31799200737196076, + "learning_rate": 8.21229401251411e-08, + "loss": 0.0966, + "step": 11203 + }, + { + "epoch": 0.94, + "grad_norm": 0.280505804045926, + "learning_rate": 8.18768153376126e-08, + "loss": 0.0655, + "step": 11204 + }, + { + "epoch": 0.94, + "grad_norm": 0.3987247531982659, + "learning_rate": 8.163105688054896e-08, + "loss": 0.1045, + "step": 11205 + }, + { + "epoch": 0.94, + "grad_norm": 0.2809256498676631, + "learning_rate": 8.138566477225607e-08, + "loss": 0.0603, + "step": 11206 + }, + { + "epoch": 0.94, + "grad_norm": 0.5603739508641362, + "learning_rate": 8.114063903101154e-08, + "loss": 0.135, + "step": 11207 + }, + { + "epoch": 0.94, + "grad_norm": 0.2675628912681819, + "learning_rate": 8.089597967506746e-08, + "loss": 0.0525, + "step": 11208 + }, + { + "epoch": 0.94, + "grad_norm": 0.2214586840481978, + "learning_rate": 8.065168672264589e-08, + "loss": 0.0637, + "step": 11209 + }, + { + "epoch": 0.94, + "grad_norm": 0.30433944747903424, + "learning_rate": 8.040776019194396e-08, + "loss": 0.08, + "step": 11210 + }, + { + "epoch": 0.94, + "grad_norm": 0.32926647978052526, + "learning_rate": 8.016420010113158e-08, + "loss": 0.0888, + "step": 11211 + }, + { + "epoch": 0.94, + "grad_norm": 0.23041305092221398, + "learning_rate": 7.992100646834921e-08, + "loss": 0.0391, + "step": 11212 + }, + { + "epoch": 0.94, + "grad_norm": 0.35414899545237477, + "learning_rate": 7.96781793117124e-08, + "loss": 0.0924, + "step": 11213 + }, + { + "epoch": 0.94, + "grad_norm": 0.2385904184812834, + "learning_rate": 7.943571864930667e-08, + "loss": 0.1019, + "step": 11214 + }, + { + "epoch": 0.94, + "grad_norm": 0.33139354274541266, + "learning_rate": 7.919362449919421e-08, + "loss": 0.0793, + "step": 11215 + }, + { + "epoch": 0.95, + "grad_norm": 0.32304354652793965, + "learning_rate": 7.895189687940563e-08, + "loss": 0.0705, + "step": 11216 + }, + { + "epoch": 0.95, + "grad_norm": 0.36539358690478235, + "learning_rate": 7.871053580794763e-08, + "loss": 0.0745, + "step": 11217 + }, + { + "epoch": 0.95, + "grad_norm": 0.38962341800656736, + "learning_rate": 7.846954130279693e-08, + "loss": 0.0829, + "step": 11218 + }, + { + "epoch": 0.95, + "grad_norm": 0.2877837003591167, + "learning_rate": 7.82289133819053e-08, + "loss": 0.0673, + "step": 11219 + }, + { + "epoch": 0.95, + "grad_norm": 0.26208692568085346, + "learning_rate": 7.798865206319561e-08, + "loss": 0.0384, + "step": 11220 + }, + { + "epoch": 0.95, + "grad_norm": 0.19061227930757443, + "learning_rate": 7.774875736456356e-08, + "loss": 0.0625, + "step": 11221 + }, + { + "epoch": 0.95, + "grad_norm": 0.21371070667596545, + "learning_rate": 7.750922930387871e-08, + "loss": 0.0607, + "step": 11222 + }, + { + "epoch": 0.95, + "grad_norm": 0.3625241741603625, + "learning_rate": 7.727006789898184e-08, + "loss": 0.0844, + "step": 11223 + }, + { + "epoch": 0.95, + "grad_norm": 0.2377337032398751, + "learning_rate": 7.703127316768754e-08, + "loss": 0.0977, + "step": 11224 + }, + { + "epoch": 0.95, + "grad_norm": 0.3076311041441613, + "learning_rate": 7.679284512778218e-08, + "loss": 0.0869, + "step": 11225 + }, + { + "epoch": 0.95, + "grad_norm": 0.24889487445787453, + "learning_rate": 7.655478379702597e-08, + "loss": 0.0778, + "step": 11226 + }, + { + "epoch": 0.95, + "grad_norm": 0.3041811454513362, + "learning_rate": 7.631708919314973e-08, + "loss": 0.0739, + "step": 11227 + }, + { + "epoch": 0.95, + "grad_norm": 0.3837462729933248, + "learning_rate": 7.607976133385986e-08, + "loss": 0.055, + "step": 11228 + }, + { + "epoch": 0.95, + "grad_norm": 0.22703436802581847, + "learning_rate": 7.584280023683333e-08, + "loss": 0.053, + "step": 11229 + }, + { + "epoch": 0.95, + "grad_norm": 0.317863921746909, + "learning_rate": 7.560620591972046e-08, + "loss": 0.0701, + "step": 11230 + }, + { + "epoch": 0.95, + "grad_norm": 0.3093262347744054, + "learning_rate": 7.536997840014382e-08, + "loss": 0.0931, + "step": 11231 + }, + { + "epoch": 0.95, + "grad_norm": 0.19468723014563755, + "learning_rate": 7.513411769569933e-08, + "loss": 0.0563, + "step": 11232 + }, + { + "epoch": 0.95, + "grad_norm": 0.457678307666195, + "learning_rate": 7.489862382395519e-08, + "loss": 0.0929, + "step": 11233 + }, + { + "epoch": 0.95, + "grad_norm": 0.4859982354400338, + "learning_rate": 7.466349680245289e-08, + "loss": 0.0986, + "step": 11234 + }, + { + "epoch": 0.95, + "grad_norm": 0.23526808004245472, + "learning_rate": 7.442873664870509e-08, + "loss": 0.0597, + "step": 11235 + }, + { + "epoch": 0.95, + "grad_norm": 0.49011815495816713, + "learning_rate": 7.419434338019893e-08, + "loss": 0.0911, + "step": 11236 + }, + { + "epoch": 0.95, + "grad_norm": 0.3729100628997059, + "learning_rate": 7.39603170143921e-08, + "loss": 0.0903, + "step": 11237 + }, + { + "epoch": 0.95, + "grad_norm": 0.3063423895087612, + "learning_rate": 7.372665756871789e-08, + "loss": 0.0758, + "step": 11238 + }, + { + "epoch": 0.95, + "grad_norm": 0.24011249765651202, + "learning_rate": 7.349336506058014e-08, + "loss": 0.076, + "step": 11239 + }, + { + "epoch": 0.95, + "grad_norm": 0.5580741005454749, + "learning_rate": 7.326043950735495e-08, + "loss": 0.1224, + "step": 11240 + }, + { + "epoch": 0.95, + "grad_norm": 0.22135717131903207, + "learning_rate": 7.30278809263929e-08, + "loss": 0.0752, + "step": 11241 + }, + { + "epoch": 0.95, + "grad_norm": 0.29234339820697935, + "learning_rate": 7.279568933501624e-08, + "loss": 0.0711, + "step": 11242 + }, + { + "epoch": 0.95, + "grad_norm": 0.35048190751767727, + "learning_rate": 7.256386475051947e-08, + "loss": 0.0721, + "step": 11243 + }, + { + "epoch": 0.95, + "grad_norm": 0.23415601153607596, + "learning_rate": 7.233240719016988e-08, + "loss": 0.0501, + "step": 11244 + }, + { + "epoch": 0.95, + "grad_norm": 0.17600387466280934, + "learning_rate": 7.210131667120924e-08, + "loss": 0.0259, + "step": 11245 + }, + { + "epoch": 0.95, + "grad_norm": 0.404367456852045, + "learning_rate": 7.187059321084933e-08, + "loss": 0.0674, + "step": 11246 + }, + { + "epoch": 0.95, + "grad_norm": 0.6044465104869069, + "learning_rate": 7.164023682627641e-08, + "loss": 0.0912, + "step": 11247 + }, + { + "epoch": 0.95, + "grad_norm": 0.34610343308176733, + "learning_rate": 7.141024753464843e-08, + "loss": 0.1046, + "step": 11248 + }, + { + "epoch": 0.95, + "grad_norm": 0.32783883795532953, + "learning_rate": 7.118062535309611e-08, + "loss": 0.082, + "step": 11249 + }, + { + "epoch": 0.95, + "grad_norm": 0.3069930541123221, + "learning_rate": 7.095137029872301e-08, + "loss": 0.0841, + "step": 11250 + }, + { + "epoch": 0.95, + "grad_norm": 0.27487476985542186, + "learning_rate": 7.072248238860602e-08, + "loss": 0.068, + "step": 11251 + }, + { + "epoch": 0.95, + "grad_norm": 0.45129739842020855, + "learning_rate": 7.049396163979427e-08, + "loss": 0.1189, + "step": 11252 + }, + { + "epoch": 0.95, + "grad_norm": 0.23670679181427776, + "learning_rate": 7.026580806930805e-08, + "loss": 0.0647, + "step": 11253 + }, + { + "epoch": 0.95, + "grad_norm": 0.5537412171512616, + "learning_rate": 7.003802169414209e-08, + "loss": 0.078, + "step": 11254 + }, + { + "epoch": 0.95, + "grad_norm": 0.3229884078768127, + "learning_rate": 6.981060253126392e-08, + "loss": 0.0823, + "step": 11255 + }, + { + "epoch": 0.95, + "grad_norm": 0.37962925504819794, + "learning_rate": 6.95835505976128e-08, + "loss": 0.0776, + "step": 11256 + }, + { + "epoch": 0.95, + "grad_norm": 0.2829364961804394, + "learning_rate": 6.935686591010015e-08, + "loss": 0.0734, + "step": 11257 + }, + { + "epoch": 0.95, + "grad_norm": 0.29742728719668166, + "learning_rate": 6.91305484856114e-08, + "loss": 0.0599, + "step": 11258 + }, + { + "epoch": 0.95, + "grad_norm": 0.33796634688926985, + "learning_rate": 6.89045983410036e-08, + "loss": 0.0757, + "step": 11259 + }, + { + "epoch": 0.95, + "grad_norm": 0.5165733335515028, + "learning_rate": 6.867901549310774e-08, + "loss": 0.1179, + "step": 11260 + }, + { + "epoch": 0.95, + "grad_norm": 0.4098000336506173, + "learning_rate": 6.845379995872481e-08, + "loss": 0.0948, + "step": 11261 + }, + { + "epoch": 0.95, + "grad_norm": 0.2680233413513996, + "learning_rate": 6.822895175463195e-08, + "loss": 0.0588, + "step": 11262 + }, + { + "epoch": 0.95, + "grad_norm": 0.44541560692359705, + "learning_rate": 6.800447089757689e-08, + "loss": 0.1417, + "step": 11263 + }, + { + "epoch": 0.95, + "grad_norm": 0.356844719759819, + "learning_rate": 6.778035740427902e-08, + "loss": 0.0904, + "step": 11264 + }, + { + "epoch": 0.95, + "grad_norm": 0.29956693348692304, + "learning_rate": 6.755661129143276e-08, + "loss": 0.0387, + "step": 11265 + }, + { + "epoch": 0.95, + "grad_norm": 0.32630871221460434, + "learning_rate": 6.733323257570368e-08, + "loss": 0.0822, + "step": 11266 + }, + { + "epoch": 0.95, + "grad_norm": 0.4803437241753069, + "learning_rate": 6.711022127373012e-08, + "loss": 0.1288, + "step": 11267 + }, + { + "epoch": 0.95, + "grad_norm": 0.3278616547359571, + "learning_rate": 6.688757740212382e-08, + "loss": 0.0807, + "step": 11268 + }, + { + "epoch": 0.95, + "grad_norm": 0.509827095141917, + "learning_rate": 6.666530097746814e-08, + "loss": 0.1139, + "step": 11269 + }, + { + "epoch": 0.95, + "grad_norm": 0.3528096830949959, + "learning_rate": 6.644339201631933e-08, + "loss": 0.0961, + "step": 11270 + }, + { + "epoch": 0.95, + "grad_norm": 0.2263115088115778, + "learning_rate": 6.622185053520691e-08, + "loss": 0.0644, + "step": 11271 + }, + { + "epoch": 0.95, + "grad_norm": 0.3074063761545856, + "learning_rate": 6.600067655063269e-08, + "loss": 0.047, + "step": 11272 + }, + { + "epoch": 0.95, + "grad_norm": 0.27142098272127957, + "learning_rate": 6.577987007907071e-08, + "loss": 0.066, + "step": 11273 + }, + { + "epoch": 0.95, + "grad_norm": 0.32247256335687213, + "learning_rate": 6.555943113696783e-08, + "loss": 0.0846, + "step": 11274 + }, + { + "epoch": 0.95, + "grad_norm": 0.2766507074880423, + "learning_rate": 6.533935974074369e-08, + "loss": 0.0677, + "step": 11275 + }, + { + "epoch": 0.95, + "grad_norm": 0.3706051400807488, + "learning_rate": 6.511965590679126e-08, + "loss": 0.0788, + "step": 11276 + }, + { + "epoch": 0.95, + "grad_norm": 0.5334665315574001, + "learning_rate": 6.490031965147415e-08, + "loss": 0.0927, + "step": 11277 + }, + { + "epoch": 0.95, + "grad_norm": 0.37372903024697623, + "learning_rate": 6.468135099112982e-08, + "loss": 0.0964, + "step": 11278 + }, + { + "epoch": 0.95, + "grad_norm": 0.39637452293583175, + "learning_rate": 6.446274994206969e-08, + "loss": 0.1148, + "step": 11279 + }, + { + "epoch": 0.95, + "grad_norm": 0.39408032747648475, + "learning_rate": 6.424451652057517e-08, + "loss": 0.056, + "step": 11280 + }, + { + "epoch": 0.95, + "grad_norm": 0.2306670636104225, + "learning_rate": 6.402665074290215e-08, + "loss": 0.058, + "step": 11281 + }, + { + "epoch": 0.95, + "grad_norm": 0.41757942126120007, + "learning_rate": 6.380915262527765e-08, + "loss": 0.0845, + "step": 11282 + }, + { + "epoch": 0.95, + "grad_norm": 0.30454904877306804, + "learning_rate": 6.35920221839037e-08, + "loss": 0.065, + "step": 11283 + }, + { + "epoch": 0.95, + "grad_norm": 0.5498314056471072, + "learning_rate": 6.337525943495237e-08, + "loss": 0.1256, + "step": 11284 + }, + { + "epoch": 0.95, + "grad_norm": 0.30493502280316315, + "learning_rate": 6.315886439456964e-08, + "loss": 0.0449, + "step": 11285 + }, + { + "epoch": 0.95, + "grad_norm": 0.4580618055995727, + "learning_rate": 6.294283707887372e-08, + "loss": 0.0887, + "step": 11286 + }, + { + "epoch": 0.95, + "grad_norm": 0.28530437728878005, + "learning_rate": 6.272717750395618e-08, + "loss": 0.0658, + "step": 11287 + }, + { + "epoch": 0.95, + "grad_norm": 0.2645329507799146, + "learning_rate": 6.251188568587973e-08, + "loss": 0.081, + "step": 11288 + }, + { + "epoch": 0.95, + "grad_norm": 0.31904548399563337, + "learning_rate": 6.229696164068156e-08, + "loss": 0.0714, + "step": 11289 + }, + { + "epoch": 0.95, + "grad_norm": 0.2053897971750405, + "learning_rate": 6.208240538436993e-08, + "loss": 0.0504, + "step": 11290 + }, + { + "epoch": 0.95, + "grad_norm": 0.3165104364030358, + "learning_rate": 6.186821693292599e-08, + "loss": 0.0787, + "step": 11291 + }, + { + "epoch": 0.95, + "grad_norm": 0.28628400124775477, + "learning_rate": 6.165439630230363e-08, + "loss": 0.0723, + "step": 11292 + }, + { + "epoch": 0.95, + "grad_norm": 0.3514415429696445, + "learning_rate": 6.144094350843066e-08, + "loss": 0.0562, + "step": 11293 + }, + { + "epoch": 0.95, + "grad_norm": 0.6287995790262101, + "learning_rate": 6.122785856720492e-08, + "loss": 0.1061, + "step": 11294 + }, + { + "epoch": 0.95, + "grad_norm": 0.34477923905358565, + "learning_rate": 6.101514149449872e-08, + "loss": 0.0685, + "step": 11295 + }, + { + "epoch": 0.95, + "grad_norm": 0.21466988875106274, + "learning_rate": 6.08027923061566e-08, + "loss": 0.0689, + "step": 11296 + }, + { + "epoch": 0.95, + "grad_norm": 0.3399869100395212, + "learning_rate": 6.059081101799535e-08, + "loss": 0.0799, + "step": 11297 + }, + { + "epoch": 0.95, + "grad_norm": 0.5183571381734173, + "learning_rate": 6.037919764580457e-08, + "loss": 0.076, + "step": 11298 + }, + { + "epoch": 0.95, + "grad_norm": 0.3056311888900178, + "learning_rate": 6.016795220534721e-08, + "loss": 0.0847, + "step": 11299 + }, + { + "epoch": 0.95, + "grad_norm": 0.3469973640215041, + "learning_rate": 5.995707471235679e-08, + "loss": 0.0855, + "step": 11300 + }, + { + "epoch": 0.95, + "grad_norm": 0.4321331394795447, + "learning_rate": 5.97465651825413e-08, + "loss": 0.0739, + "step": 11301 + }, + { + "epoch": 0.95, + "grad_norm": 0.24505976539533014, + "learning_rate": 5.953642363158096e-08, + "loss": 0.0715, + "step": 11302 + }, + { + "epoch": 0.95, + "grad_norm": 0.4520016443839452, + "learning_rate": 5.9326650075128276e-08, + "loss": 0.0945, + "step": 11303 + }, + { + "epoch": 0.95, + "grad_norm": 0.47271434561429265, + "learning_rate": 5.9117244528808516e-08, + "loss": 0.0989, + "step": 11304 + }, + { + "epoch": 0.95, + "grad_norm": 0.3890941574697573, + "learning_rate": 5.890820700821809e-08, + "loss": 0.0917, + "step": 11305 + }, + { + "epoch": 0.95, + "grad_norm": 0.3616456320561048, + "learning_rate": 5.869953752892954e-08, + "loss": 0.088, + "step": 11306 + }, + { + "epoch": 0.95, + "grad_norm": 0.4044091054774401, + "learning_rate": 5.8491236106483774e-08, + "loss": 0.0935, + "step": 11307 + }, + { + "epoch": 0.95, + "grad_norm": 0.3332779043928037, + "learning_rate": 5.828330275639782e-08, + "loss": 0.1093, + "step": 11308 + }, + { + "epoch": 0.95, + "grad_norm": 0.6236353017227955, + "learning_rate": 5.807573749415873e-08, + "loss": 0.1011, + "step": 11309 + }, + { + "epoch": 0.95, + "grad_norm": 0.320432391036484, + "learning_rate": 5.786854033522748e-08, + "loss": 0.071, + "step": 11310 + }, + { + "epoch": 0.95, + "grad_norm": 0.34481205906138357, + "learning_rate": 5.7661711295037836e-08, + "loss": 0.0689, + "step": 11311 + }, + { + "epoch": 0.95, + "grad_norm": 0.3350922360027979, + "learning_rate": 5.745525038899469e-08, + "loss": 0.0888, + "step": 11312 + }, + { + "epoch": 0.95, + "grad_norm": 0.4635520634090538, + "learning_rate": 5.724915763247685e-08, + "loss": 0.0512, + "step": 11313 + }, + { + "epoch": 0.95, + "grad_norm": 0.2639679656239182, + "learning_rate": 5.704343304083593e-08, + "loss": 0.0736, + "step": 11314 + }, + { + "epoch": 0.95, + "grad_norm": 0.16673479432900848, + "learning_rate": 5.683807662939467e-08, + "loss": 0.0317, + "step": 11315 + }, + { + "epoch": 0.95, + "grad_norm": 0.3974567446005665, + "learning_rate": 5.663308841344972e-08, + "loss": 0.092, + "step": 11316 + }, + { + "epoch": 0.95, + "grad_norm": 0.3610695959370713, + "learning_rate": 5.6428468408269435e-08, + "loss": 0.0677, + "step": 11317 + }, + { + "epoch": 0.95, + "grad_norm": 0.3203274888251374, + "learning_rate": 5.622421662909494e-08, + "loss": 0.0641, + "step": 11318 + }, + { + "epoch": 0.95, + "grad_norm": 0.17055499379425346, + "learning_rate": 5.6020333091140743e-08, + "loss": 0.0543, + "step": 11319 + }, + { + "epoch": 0.95, + "grad_norm": 0.3333436329644468, + "learning_rate": 5.5816817809593026e-08, + "loss": 0.0799, + "step": 11320 + }, + { + "epoch": 0.95, + "grad_norm": 0.35037645775671616, + "learning_rate": 5.5613670799610774e-08, + "loss": 0.1119, + "step": 11321 + }, + { + "epoch": 0.95, + "grad_norm": 0.5187449920728862, + "learning_rate": 5.541089207632522e-08, + "loss": 0.1084, + "step": 11322 + }, + { + "epoch": 0.95, + "grad_norm": 0.32895272099749806, + "learning_rate": 5.520848165484094e-08, + "loss": 0.0872, + "step": 11323 + }, + { + "epoch": 0.95, + "grad_norm": 0.42981996460968186, + "learning_rate": 5.500643955023477e-08, + "loss": 0.0711, + "step": 11324 + }, + { + "epoch": 0.95, + "grad_norm": 0.17831005122805985, + "learning_rate": 5.480476577755522e-08, + "loss": 0.0551, + "step": 11325 + }, + { + "epoch": 0.95, + "grad_norm": 0.3045963024840234, + "learning_rate": 5.4603460351824734e-08, + "loss": 0.082, + "step": 11326 + }, + { + "epoch": 0.95, + "grad_norm": 0.35660415002327656, + "learning_rate": 5.440252328803741e-08, + "loss": 0.0682, + "step": 11327 + }, + { + "epoch": 0.95, + "grad_norm": 0.34131128869374466, + "learning_rate": 5.420195460116073e-08, + "loss": 0.0869, + "step": 11328 + }, + { + "epoch": 0.95, + "grad_norm": 0.26880446199797353, + "learning_rate": 5.400175430613386e-08, + "loss": 0.0818, + "step": 11329 + }, + { + "epoch": 0.95, + "grad_norm": 0.3434558816849806, + "learning_rate": 5.3801922417868745e-08, + "loss": 0.0965, + "step": 11330 + }, + { + "epoch": 0.95, + "grad_norm": 0.36512128709644526, + "learning_rate": 5.360245895125016e-08, + "loss": 0.0956, + "step": 11331 + }, + { + "epoch": 0.95, + "grad_norm": 0.304488699632571, + "learning_rate": 5.34033639211351e-08, + "loss": 0.0824, + "step": 11332 + }, + { + "epoch": 0.95, + "grad_norm": 0.8430855127807839, + "learning_rate": 5.320463734235393e-08, + "loss": 0.1699, + "step": 11333 + }, + { + "epoch": 0.96, + "grad_norm": 0.3038620563934288, + "learning_rate": 5.300627922970869e-08, + "loss": 0.0699, + "step": 11334 + }, + { + "epoch": 0.96, + "grad_norm": 0.30726140059862755, + "learning_rate": 5.280828959797424e-08, + "loss": 0.0846, + "step": 11335 + }, + { + "epoch": 0.96, + "grad_norm": 0.3455810027705089, + "learning_rate": 5.2610668461897105e-08, + "loss": 0.0947, + "step": 11336 + }, + { + "epoch": 0.96, + "grad_norm": 0.24333341421881813, + "learning_rate": 5.241341583619886e-08, + "loss": 0.0542, + "step": 11337 + }, + { + "epoch": 0.96, + "grad_norm": 0.30168058041806944, + "learning_rate": 5.2216531735571064e-08, + "loss": 0.0804, + "step": 11338 + }, + { + "epoch": 0.96, + "grad_norm": 0.2400316864465075, + "learning_rate": 5.2020016174678665e-08, + "loss": 0.0489, + "step": 11339 + }, + { + "epoch": 0.96, + "grad_norm": 0.458949354416097, + "learning_rate": 5.18238691681594e-08, + "loss": 0.0604, + "step": 11340 + }, + { + "epoch": 0.96, + "grad_norm": 0.3867263906489827, + "learning_rate": 5.1628090730624355e-08, + "loss": 0.0936, + "step": 11341 + }, + { + "epoch": 0.96, + "grad_norm": 0.224722510843846, + "learning_rate": 5.143268087665465e-08, + "loss": 0.0641, + "step": 11342 + }, + { + "epoch": 0.96, + "grad_norm": 0.3375220001873892, + "learning_rate": 5.123763962080697e-08, + "loss": 0.0735, + "step": 11343 + }, + { + "epoch": 0.96, + "grad_norm": 0.3029390357604747, + "learning_rate": 5.104296697760858e-08, + "loss": 0.0725, + "step": 11344 + }, + { + "epoch": 0.96, + "grad_norm": 0.1926658989417735, + "learning_rate": 5.0848662961560106e-08, + "loss": 0.0392, + "step": 11345 + }, + { + "epoch": 0.96, + "grad_norm": 0.3775448902036218, + "learning_rate": 5.0654727587133303e-08, + "loss": 0.0602, + "step": 11346 + }, + { + "epoch": 0.96, + "grad_norm": 0.4649038124308176, + "learning_rate": 5.046116086877495e-08, + "loss": 0.1244, + "step": 11347 + }, + { + "epoch": 0.96, + "grad_norm": 0.2687562557865079, + "learning_rate": 5.02679628209024e-08, + "loss": 0.0804, + "step": 11348 + }, + { + "epoch": 0.96, + "grad_norm": 0.35274356978120736, + "learning_rate": 5.0075133457906934e-08, + "loss": 0.0693, + "step": 11349 + }, + { + "epoch": 0.96, + "grad_norm": 0.2150622420007374, + "learning_rate": 4.988267279414982e-08, + "loss": 0.0545, + "step": 11350 + }, + { + "epoch": 0.96, + "grad_norm": 0.2727031397930467, + "learning_rate": 4.96905808439685e-08, + "loss": 0.068, + "step": 11351 + }, + { + "epoch": 0.96, + "grad_norm": 0.33424933316428324, + "learning_rate": 4.949885762167039e-08, + "loss": 0.0672, + "step": 11352 + }, + { + "epoch": 0.96, + "grad_norm": 0.44138481099346616, + "learning_rate": 4.930750314153632e-08, + "loss": 0.0557, + "step": 11353 + }, + { + "epoch": 0.96, + "grad_norm": 0.4013189476395214, + "learning_rate": 4.911651741781875e-08, + "loss": 0.1293, + "step": 11354 + }, + { + "epoch": 0.96, + "grad_norm": 0.3881659899305356, + "learning_rate": 4.8925900464744656e-08, + "loss": 0.0968, + "step": 11355 + }, + { + "epoch": 0.96, + "grad_norm": 0.4638157795291874, + "learning_rate": 4.8735652296511006e-08, + "loss": 0.0827, + "step": 11356 + }, + { + "epoch": 0.96, + "grad_norm": 0.2250924702952408, + "learning_rate": 4.854577292728979e-08, + "loss": 0.0593, + "step": 11357 + }, + { + "epoch": 0.96, + "grad_norm": 0.25333487880204586, + "learning_rate": 4.8356262371223595e-08, + "loss": 0.0561, + "step": 11358 + }, + { + "epoch": 0.96, + "grad_norm": 0.2964428085683523, + "learning_rate": 4.816712064242834e-08, + "loss": 0.0959, + "step": 11359 + }, + { + "epoch": 0.96, + "grad_norm": 0.276022621851475, + "learning_rate": 4.797834775499222e-08, + "loss": 0.0796, + "step": 11360 + }, + { + "epoch": 0.96, + "grad_norm": 0.28001006366069237, + "learning_rate": 4.778994372297674e-08, + "loss": 0.0444, + "step": 11361 + }, + { + "epoch": 0.96, + "grad_norm": 0.3495570352991, + "learning_rate": 4.760190856041513e-08, + "loss": 0.0759, + "step": 11362 + }, + { + "epoch": 0.96, + "grad_norm": 1.0046541657819519, + "learning_rate": 4.741424228131286e-08, + "loss": 0.122, + "step": 11363 + }, + { + "epoch": 0.96, + "grad_norm": 0.43400250443587873, + "learning_rate": 4.7226944899649296e-08, + "loss": 0.118, + "step": 11364 + }, + { + "epoch": 0.96, + "grad_norm": 0.16205431210960036, + "learning_rate": 4.704001642937439e-08, + "loss": 0.0412, + "step": 11365 + }, + { + "epoch": 0.96, + "grad_norm": 0.31436112819749146, + "learning_rate": 4.6853456884412004e-08, + "loss": 0.0837, + "step": 11366 + }, + { + "epoch": 0.96, + "grad_norm": 0.34379047261929285, + "learning_rate": 4.666726627865881e-08, + "loss": 0.0968, + "step": 11367 + }, + { + "epoch": 0.96, + "grad_norm": 0.4771284870334238, + "learning_rate": 4.648144462598314e-08, + "loss": 0.1117, + "step": 11368 + }, + { + "epoch": 0.96, + "grad_norm": 0.31492308559775567, + "learning_rate": 4.6295991940225603e-08, + "loss": 0.0559, + "step": 11369 + }, + { + "epoch": 0.96, + "grad_norm": 0.5790130886573577, + "learning_rate": 4.6110908235199595e-08, + "loss": 0.0824, + "step": 11370 + }, + { + "epoch": 0.96, + "grad_norm": 0.4433207158290598, + "learning_rate": 4.592619352469241e-08, + "loss": 0.1202, + "step": 11371 + }, + { + "epoch": 0.96, + "grad_norm": 0.2857450995265564, + "learning_rate": 4.574184782246138e-08, + "loss": 0.0612, + "step": 11372 + }, + { + "epoch": 0.96, + "grad_norm": 0.35139243849597784, + "learning_rate": 4.555787114223831e-08, + "loss": 0.0868, + "step": 11373 + }, + { + "epoch": 0.96, + "grad_norm": 0.3795460743533305, + "learning_rate": 4.537426349772722e-08, + "loss": 0.1096, + "step": 11374 + }, + { + "epoch": 0.96, + "grad_norm": 0.30537323141618544, + "learning_rate": 4.5191024902603297e-08, + "loss": 0.0899, + "step": 11375 + }, + { + "epoch": 0.96, + "grad_norm": 0.38482976203550645, + "learning_rate": 4.5008155370516174e-08, + "loss": 0.0924, + "step": 11376 + }, + { + "epoch": 0.96, + "grad_norm": 0.25073260386258756, + "learning_rate": 4.482565491508606e-08, + "loss": 0.0843, + "step": 11377 + }, + { + "epoch": 0.96, + "grad_norm": 0.2802971419188257, + "learning_rate": 4.464352354990764e-08, + "loss": 0.0639, + "step": 11378 + }, + { + "epoch": 0.96, + "grad_norm": 0.36716505550610323, + "learning_rate": 4.4461761288546735e-08, + "loss": 0.1027, + "step": 11379 + }, + { + "epoch": 0.96, + "grad_norm": 0.4472378823494747, + "learning_rate": 4.4280368144541396e-08, + "loss": 0.074, + "step": 11380 + }, + { + "epoch": 0.96, + "grad_norm": 0.27146838768130216, + "learning_rate": 4.409934413140415e-08, + "loss": 0.0485, + "step": 11381 + }, + { + "epoch": 0.96, + "grad_norm": 0.31941422477928805, + "learning_rate": 4.391868926261755e-08, + "loss": 0.0596, + "step": 11382 + }, + { + "epoch": 0.96, + "grad_norm": 0.3278798804123493, + "learning_rate": 4.373840355163805e-08, + "loss": 0.0718, + "step": 11383 + }, + { + "epoch": 0.96, + "grad_norm": 0.4530233184171188, + "learning_rate": 4.3558487011895464e-08, + "loss": 0.0799, + "step": 11384 + }, + { + "epoch": 0.96, + "grad_norm": 0.3788073125037581, + "learning_rate": 4.337893965678963e-08, + "loss": 0.0844, + "step": 11385 + }, + { + "epoch": 0.96, + "grad_norm": 0.3639434449392199, + "learning_rate": 4.319976149969485e-08, + "loss": 0.0634, + "step": 11386 + }, + { + "epoch": 0.96, + "grad_norm": 0.2553009224275852, + "learning_rate": 4.302095255395711e-08, + "loss": 0.0629, + "step": 11387 + }, + { + "epoch": 0.96, + "grad_norm": 0.305499144133457, + "learning_rate": 4.2842512832895754e-08, + "loss": 0.0676, + "step": 11388 + }, + { + "epoch": 0.96, + "grad_norm": 0.5192241469953546, + "learning_rate": 4.2664442349801273e-08, + "loss": 0.1343, + "step": 11389 + }, + { + "epoch": 0.96, + "grad_norm": 0.3865900443191973, + "learning_rate": 4.2486741117938044e-08, + "loss": 0.112, + "step": 11390 + }, + { + "epoch": 0.96, + "grad_norm": 0.5474079939428419, + "learning_rate": 4.2309409150541604e-08, + "loss": 0.1175, + "step": 11391 + }, + { + "epoch": 0.96, + "grad_norm": 0.4026515787344727, + "learning_rate": 4.213244646082137e-08, + "loss": 0.0919, + "step": 11392 + }, + { + "epoch": 0.96, + "grad_norm": 0.3829483578009171, + "learning_rate": 4.1955853061958486e-08, + "loss": 0.0822, + "step": 11393 + }, + { + "epoch": 0.96, + "grad_norm": 0.2243559674568583, + "learning_rate": 4.1779628967105745e-08, + "loss": 0.0734, + "step": 11394 + }, + { + "epoch": 0.96, + "grad_norm": 0.27013529964705557, + "learning_rate": 4.160377418939043e-08, + "loss": 0.0815, + "step": 11395 + }, + { + "epoch": 0.96, + "grad_norm": 0.31698877448499074, + "learning_rate": 4.142828874191096e-08, + "loss": 0.106, + "step": 11396 + }, + { + "epoch": 0.96, + "grad_norm": 0.2828775329304314, + "learning_rate": 4.125317263773798e-08, + "loss": 0.0856, + "step": 11397 + }, + { + "epoch": 0.96, + "grad_norm": 0.29835151369007096, + "learning_rate": 4.107842588991606e-08, + "loss": 0.0633, + "step": 11398 + }, + { + "epoch": 0.96, + "grad_norm": 0.2713615608164801, + "learning_rate": 4.0904048511460347e-08, + "loss": 0.0452, + "step": 11399 + }, + { + "epoch": 0.96, + "grad_norm": 0.27501725023227286, + "learning_rate": 4.073004051535989e-08, + "loss": 0.0628, + "step": 11400 + }, + { + "epoch": 0.96, + "grad_norm": 0.31856142878808086, + "learning_rate": 4.055640191457599e-08, + "loss": 0.0656, + "step": 11401 + }, + { + "epoch": 0.96, + "grad_norm": 0.23178081114532081, + "learning_rate": 4.038313272204275e-08, + "loss": 0.0752, + "step": 11402 + }, + { + "epoch": 0.96, + "grad_norm": 0.42608663349695775, + "learning_rate": 4.02102329506654e-08, + "loss": 0.0916, + "step": 11403 + }, + { + "epoch": 0.96, + "grad_norm": 0.347003711742509, + "learning_rate": 4.003770261332252e-08, + "loss": 0.058, + "step": 11404 + }, + { + "epoch": 0.96, + "grad_norm": 0.24440182950855546, + "learning_rate": 3.98655417228655e-08, + "loss": 0.0659, + "step": 11405 + }, + { + "epoch": 0.96, + "grad_norm": 0.19158227399747982, + "learning_rate": 3.969375029211797e-08, + "loss": 0.0433, + "step": 11406 + }, + { + "epoch": 0.96, + "grad_norm": 0.3341533937365476, + "learning_rate": 3.952232833387582e-08, + "loss": 0.055, + "step": 11407 + }, + { + "epoch": 0.96, + "grad_norm": 0.23059763784526194, + "learning_rate": 3.9351275860907146e-08, + "loss": 0.0693, + "step": 11408 + }, + { + "epoch": 0.96, + "grad_norm": 0.19838686076757472, + "learning_rate": 3.918059288595399e-08, + "loss": 0.0423, + "step": 11409 + }, + { + "epoch": 0.96, + "grad_norm": 0.30479441979708793, + "learning_rate": 3.901027942172897e-08, + "loss": 0.0753, + "step": 11410 + }, + { + "epoch": 0.96, + "grad_norm": 0.4155106453694752, + "learning_rate": 3.884033548091804e-08, + "loss": 0.1015, + "step": 11411 + }, + { + "epoch": 0.96, + "grad_norm": 0.2870839132161629, + "learning_rate": 3.8670761076180505e-08, + "loss": 0.0972, + "step": 11412 + }, + { + "epoch": 0.96, + "grad_norm": 0.3375010673678632, + "learning_rate": 3.850155622014573e-08, + "loss": 0.0793, + "step": 11413 + }, + { + "epoch": 0.96, + "grad_norm": 0.3222396983060046, + "learning_rate": 3.8332720925418064e-08, + "loss": 0.0869, + "step": 11414 + }, + { + "epoch": 0.96, + "grad_norm": 0.3717009578882101, + "learning_rate": 3.8164255204573566e-08, + "loss": 0.1002, + "step": 11415 + }, + { + "epoch": 0.96, + "grad_norm": 0.29335721506650886, + "learning_rate": 3.799615907016052e-08, + "loss": 0.0956, + "step": 11416 + }, + { + "epoch": 0.96, + "grad_norm": 0.2965948335629752, + "learning_rate": 3.782843253469892e-08, + "loss": 0.0914, + "step": 11417 + }, + { + "epoch": 0.96, + "grad_norm": 0.28385170346303873, + "learning_rate": 3.7661075610682104e-08, + "loss": 0.0508, + "step": 11418 + }, + { + "epoch": 0.96, + "grad_norm": 0.3172264753490098, + "learning_rate": 3.749408831057622e-08, + "loss": 0.0765, + "step": 11419 + }, + { + "epoch": 0.96, + "grad_norm": 0.1819224411021191, + "learning_rate": 3.732747064681963e-08, + "loss": 0.0372, + "step": 11420 + }, + { + "epoch": 0.96, + "grad_norm": 0.45227763654834774, + "learning_rate": 3.716122263182298e-08, + "loss": 0.0827, + "step": 11421 + }, + { + "epoch": 0.96, + "grad_norm": 0.19256002111206616, + "learning_rate": 3.6995344277968584e-08, + "loss": 0.0413, + "step": 11422 + }, + { + "epoch": 0.96, + "grad_norm": 0.34501944133766654, + "learning_rate": 3.6829835597612664e-08, + "loss": 0.078, + "step": 11423 + }, + { + "epoch": 0.96, + "grad_norm": 0.4492894490879793, + "learning_rate": 3.666469660308369e-08, + "loss": 0.1071, + "step": 11424 + }, + { + "epoch": 0.96, + "grad_norm": 0.23785379162235312, + "learning_rate": 3.6499927306681284e-08, + "loss": 0.0412, + "step": 11425 + }, + { + "epoch": 0.96, + "grad_norm": 0.2543995415106295, + "learning_rate": 3.6335527720678945e-08, + "loss": 0.0832, + "step": 11426 + }, + { + "epoch": 0.96, + "grad_norm": 0.3442318256900812, + "learning_rate": 3.617149785732188e-08, + "loss": 0.0397, + "step": 11427 + }, + { + "epoch": 0.96, + "grad_norm": 0.33131528303423136, + "learning_rate": 3.60078377288281e-08, + "loss": 0.0866, + "step": 11428 + }, + { + "epoch": 0.96, + "grad_norm": 0.5195593079643612, + "learning_rate": 3.584454734738785e-08, + "loss": 0.1303, + "step": 11429 + }, + { + "epoch": 0.96, + "grad_norm": 0.21342066166493776, + "learning_rate": 3.568162672516418e-08, + "loss": 0.0652, + "step": 11430 + }, + { + "epoch": 0.96, + "grad_norm": 0.7086892704107729, + "learning_rate": 3.5519075874292376e-08, + "loss": 0.1394, + "step": 11431 + }, + { + "epoch": 0.96, + "grad_norm": 0.21496705789083373, + "learning_rate": 3.535689480687998e-08, + "loss": 0.0478, + "step": 11432 + }, + { + "epoch": 0.96, + "grad_norm": 0.5863225671315513, + "learning_rate": 3.519508353500733e-08, + "loss": 0.1033, + "step": 11433 + }, + { + "epoch": 0.96, + "grad_norm": 0.3741328718146961, + "learning_rate": 3.5033642070726457e-08, + "loss": 0.0692, + "step": 11434 + }, + { + "epoch": 0.96, + "grad_norm": 0.27093058300583583, + "learning_rate": 3.487257042606329e-08, + "loss": 0.0721, + "step": 11435 + }, + { + "epoch": 0.96, + "grad_norm": 0.24102486483142388, + "learning_rate": 3.4711868613015457e-08, + "loss": 0.0723, + "step": 11436 + }, + { + "epoch": 0.96, + "grad_norm": 0.361043161509741, + "learning_rate": 3.455153664355226e-08, + "loss": 0.1138, + "step": 11437 + }, + { + "epoch": 0.96, + "grad_norm": 0.34571186902891393, + "learning_rate": 3.439157452961639e-08, + "loss": 0.0953, + "step": 11438 + }, + { + "epoch": 0.96, + "grad_norm": 0.5790109348841226, + "learning_rate": 3.4231982283123296e-08, + "loss": 0.1199, + "step": 11439 + }, + { + "epoch": 0.96, + "grad_norm": 0.24737178493504416, + "learning_rate": 3.4072759915959594e-08, + "loss": 0.0827, + "step": 11440 + }, + { + "epoch": 0.96, + "grad_norm": 0.4066289355514357, + "learning_rate": 3.391390743998524e-08, + "loss": 0.1047, + "step": 11441 + }, + { + "epoch": 0.96, + "grad_norm": 0.28347114644906707, + "learning_rate": 3.3755424867032424e-08, + "loss": 0.0897, + "step": 11442 + }, + { + "epoch": 0.96, + "grad_norm": 0.25358990304960355, + "learning_rate": 3.359731220890672e-08, + "loss": 0.0686, + "step": 11443 + }, + { + "epoch": 0.96, + "grad_norm": 0.36827734514629457, + "learning_rate": 3.34395694773848e-08, + "loss": 0.0589, + "step": 11444 + }, + { + "epoch": 0.96, + "grad_norm": 0.38108222768607725, + "learning_rate": 3.328219668421506e-08, + "loss": 0.0852, + "step": 11445 + }, + { + "epoch": 0.96, + "grad_norm": 0.44714109758132564, + "learning_rate": 3.312519384112145e-08, + "loss": 0.0923, + "step": 11446 + }, + { + "epoch": 0.96, + "grad_norm": 0.3102352687356454, + "learning_rate": 3.296856095979739e-08, + "loss": 0.0717, + "step": 11447 + }, + { + "epoch": 0.96, + "grad_norm": 0.30853595692004665, + "learning_rate": 3.2812298051909665e-08, + "loss": 0.0974, + "step": 11448 + }, + { + "epoch": 0.96, + "grad_norm": 0.38497342614820634, + "learning_rate": 3.26564051290984e-08, + "loss": 0.0693, + "step": 11449 + }, + { + "epoch": 0.96, + "grad_norm": 0.4639917283424007, + "learning_rate": 3.250088220297487e-08, + "loss": 0.1143, + "step": 11450 + }, + { + "epoch": 0.96, + "grad_norm": 0.2761422054016099, + "learning_rate": 3.23457292851237e-08, + "loss": 0.0863, + "step": 11451 + }, + { + "epoch": 0.96, + "grad_norm": 0.2550914587048357, + "learning_rate": 3.219094638710119e-08, + "loss": 0.0821, + "step": 11452 + }, + { + "epoch": 0.97, + "grad_norm": 0.540475332838519, + "learning_rate": 3.203653352043645e-08, + "loss": 0.1101, + "step": 11453 + }, + { + "epoch": 0.97, + "grad_norm": 0.35515013916666804, + "learning_rate": 3.1882490696631406e-08, + "loss": 0.1226, + "step": 11454 + }, + { + "epoch": 0.97, + "grad_norm": 0.18411805147902935, + "learning_rate": 3.172881792715965e-08, + "loss": 0.0503, + "step": 11455 + }, + { + "epoch": 0.97, + "grad_norm": 0.23449252729901343, + "learning_rate": 3.157551522346758e-08, + "loss": 0.0553, + "step": 11456 + }, + { + "epoch": 0.97, + "grad_norm": 0.3115322918366016, + "learning_rate": 3.1422582596974946e-08, + "loss": 0.0678, + "step": 11457 + }, + { + "epoch": 0.97, + "grad_norm": 0.21309368512204982, + "learning_rate": 3.127002005907209e-08, + "loss": 0.0481, + "step": 11458 + }, + { + "epoch": 0.97, + "grad_norm": 0.33521428753774857, + "learning_rate": 3.1117827621123254e-08, + "loss": 0.087, + "step": 11459 + }, + { + "epoch": 0.97, + "grad_norm": 0.38819730925766033, + "learning_rate": 3.0966005294464915e-08, + "loss": 0.0878, + "step": 11460 + }, + { + "epoch": 0.97, + "grad_norm": 0.27438590140715324, + "learning_rate": 3.081455309040471e-08, + "loss": 0.0713, + "step": 11461 + }, + { + "epoch": 0.97, + "grad_norm": 0.3621791055278678, + "learning_rate": 3.0663471020224714e-08, + "loss": 0.0885, + "step": 11462 + }, + { + "epoch": 0.97, + "grad_norm": 0.3121781877821618, + "learning_rate": 3.0512759095177594e-08, + "loss": 0.0808, + "step": 11463 + }, + { + "epoch": 0.97, + "grad_norm": 0.3802480447638035, + "learning_rate": 3.036241732648937e-08, + "loss": 0.0765, + "step": 11464 + }, + { + "epoch": 0.97, + "grad_norm": 0.3494434942111629, + "learning_rate": 3.021244572535886e-08, + "loss": 0.103, + "step": 11465 + }, + { + "epoch": 0.97, + "grad_norm": 0.6263959216132077, + "learning_rate": 3.0062844302956586e-08, + "loss": 0.117, + "step": 11466 + }, + { + "epoch": 0.97, + "grad_norm": 0.30290312226107696, + "learning_rate": 2.991361307042584e-08, + "loss": 0.0887, + "step": 11467 + }, + { + "epoch": 0.97, + "grad_norm": 0.30156162515867974, + "learning_rate": 2.9764752038881627e-08, + "loss": 0.0832, + "step": 11468 + }, + { + "epoch": 0.97, + "grad_norm": 0.24069645419334346, + "learning_rate": 2.9616261219412856e-08, + "loss": 0.053, + "step": 11469 + }, + { + "epoch": 0.97, + "grad_norm": 0.31810159605174715, + "learning_rate": 2.9468140623079012e-08, + "loss": 0.0588, + "step": 11470 + }, + { + "epoch": 0.97, + "grad_norm": 0.3137005872257044, + "learning_rate": 2.9320390260914046e-08, + "loss": 0.0806, + "step": 11471 + }, + { + "epoch": 0.97, + "grad_norm": 0.47357518913960495, + "learning_rate": 2.9173010143922487e-08, + "loss": 0.0736, + "step": 11472 + }, + { + "epoch": 0.97, + "grad_norm": 0.326841670977742, + "learning_rate": 2.9026000283082776e-08, + "loss": 0.1028, + "step": 11473 + }, + { + "epoch": 0.97, + "grad_norm": 0.4108330679511492, + "learning_rate": 2.8879360689344492e-08, + "loss": 0.1119, + "step": 11474 + }, + { + "epoch": 0.97, + "grad_norm": 0.31509106420968125, + "learning_rate": 2.8733091373630007e-08, + "loss": 0.0957, + "step": 11475 + }, + { + "epoch": 0.97, + "grad_norm": 0.500468889654519, + "learning_rate": 2.8587192346835046e-08, + "loss": 0.1113, + "step": 11476 + }, + { + "epoch": 0.97, + "grad_norm": 0.2600530867703393, + "learning_rate": 2.8441663619826477e-08, + "loss": 0.0855, + "step": 11477 + }, + { + "epoch": 0.97, + "grad_norm": 0.2400533743940749, + "learning_rate": 2.829650520344396e-08, + "loss": 0.0587, + "step": 11478 + }, + { + "epoch": 0.97, + "grad_norm": 0.24730895228165162, + "learning_rate": 2.815171710850051e-08, + "loss": 0.0615, + "step": 11479 + }, + { + "epoch": 0.97, + "grad_norm": 0.26653627030677, + "learning_rate": 2.8007299345779727e-08, + "loss": 0.0572, + "step": 11480 + }, + { + "epoch": 0.97, + "grad_norm": 0.34223765757323427, + "learning_rate": 2.7863251926040223e-08, + "loss": 0.0688, + "step": 11481 + }, + { + "epoch": 0.97, + "grad_norm": 0.40197556147643376, + "learning_rate": 2.7719574860009534e-08, + "loss": 0.1034, + "step": 11482 + }, + { + "epoch": 0.97, + "grad_norm": 0.38240479099749586, + "learning_rate": 2.7576268158391317e-08, + "loss": 0.0974, + "step": 11483 + }, + { + "epoch": 0.97, + "grad_norm": 0.388620167106343, + "learning_rate": 2.74333318318587e-08, + "loss": 0.0825, + "step": 11484 + }, + { + "epoch": 0.97, + "grad_norm": 0.30764787184791, + "learning_rate": 2.729076589105928e-08, + "loss": 0.0642, + "step": 11485 + }, + { + "epoch": 0.97, + "grad_norm": 0.35943874571087586, + "learning_rate": 2.7148570346611225e-08, + "loss": 0.0954, + "step": 11486 + }, + { + "epoch": 0.97, + "grad_norm": 0.2663779601904052, + "learning_rate": 2.7006745209107176e-08, + "loss": 0.0757, + "step": 11487 + }, + { + "epoch": 0.97, + "grad_norm": 0.3553796898844576, + "learning_rate": 2.686529048911035e-08, + "loss": 0.0931, + "step": 11488 + }, + { + "epoch": 0.97, + "grad_norm": 0.46927607908348656, + "learning_rate": 2.6724206197157875e-08, + "loss": 0.1248, + "step": 11489 + }, + { + "epoch": 0.97, + "grad_norm": 0.2730521614156334, + "learning_rate": 2.6583492343757454e-08, + "loss": 0.0692, + "step": 11490 + }, + { + "epoch": 0.97, + "grad_norm": 0.31381966069078626, + "learning_rate": 2.6443148939390705e-08, + "loss": 0.064, + "step": 11491 + }, + { + "epoch": 0.97, + "grad_norm": 0.32772199054595486, + "learning_rate": 2.630317599451149e-08, + "loss": 0.0901, + "step": 11492 + }, + { + "epoch": 0.97, + "grad_norm": 0.37816602273875527, + "learning_rate": 2.6163573519545904e-08, + "loss": 0.0784, + "step": 11493 + }, + { + "epoch": 0.97, + "grad_norm": 0.331847945439843, + "learning_rate": 2.602434152489175e-08, + "loss": 0.063, + "step": 11494 + }, + { + "epoch": 0.97, + "grad_norm": 0.4007527061843292, + "learning_rate": 2.5885480020920173e-08, + "loss": 0.0905, + "step": 11495 + }, + { + "epoch": 0.97, + "grad_norm": 0.21585089844164831, + "learning_rate": 2.5746989017974567e-08, + "loss": 0.0585, + "step": 11496 + }, + { + "epoch": 0.97, + "grad_norm": 0.3515074925231003, + "learning_rate": 2.5608868526370568e-08, + "loss": 0.0999, + "step": 11497 + }, + { + "epoch": 0.97, + "grad_norm": 0.47047882031664207, + "learning_rate": 2.5471118556396056e-08, + "loss": 0.1227, + "step": 11498 + }, + { + "epoch": 0.97, + "grad_norm": 0.3077140393327332, + "learning_rate": 2.5333739118310607e-08, + "loss": 0.0815, + "step": 11499 + }, + { + "epoch": 0.97, + "grad_norm": 0.24090473496678147, + "learning_rate": 2.519673022234881e-08, + "loss": 0.0897, + "step": 11500 + }, + { + "epoch": 0.97, + "grad_norm": 0.42061866033185114, + "learning_rate": 2.5060091878714167e-08, + "loss": 0.097, + "step": 11501 + }, + { + "epoch": 0.97, + "grad_norm": 0.18005878418477725, + "learning_rate": 2.4923824097585202e-08, + "loss": 0.054, + "step": 11502 + }, + { + "epoch": 0.97, + "grad_norm": 0.32607927631017386, + "learning_rate": 2.4787926889112134e-08, + "loss": 0.1094, + "step": 11503 + }, + { + "epoch": 0.97, + "grad_norm": 0.4369335721510544, + "learning_rate": 2.4652400263416864e-08, + "loss": 0.0954, + "step": 11504 + }, + { + "epoch": 0.97, + "grad_norm": 0.26396388373818613, + "learning_rate": 2.451724423059465e-08, + "loss": 0.0669, + "step": 11505 + }, + { + "epoch": 0.97, + "grad_norm": 0.25641996677247253, + "learning_rate": 2.4382458800711882e-08, + "loss": 0.0593, + "step": 11506 + }, + { + "epoch": 0.97, + "grad_norm": 0.28026813077011703, + "learning_rate": 2.4248043983808867e-08, + "loss": 0.0827, + "step": 11507 + }, + { + "epoch": 0.97, + "grad_norm": 0.3502519443175744, + "learning_rate": 2.411399978989759e-08, + "loss": 0.0812, + "step": 11508 + }, + { + "epoch": 0.97, + "grad_norm": 0.20350464725794706, + "learning_rate": 2.398032622896229e-08, + "loss": 0.0604, + "step": 11509 + }, + { + "epoch": 0.97, + "grad_norm": 0.36021756932694515, + "learning_rate": 2.384702331095945e-08, + "loss": 0.0676, + "step": 11510 + }, + { + "epoch": 0.97, + "grad_norm": 0.2006316583053726, + "learning_rate": 2.37140910458189e-08, + "loss": 0.0712, + "step": 11511 + }, + { + "epoch": 0.97, + "grad_norm": 0.3640464624018539, + "learning_rate": 2.3581529443441608e-08, + "loss": 0.1099, + "step": 11512 + }, + { + "epoch": 0.97, + "grad_norm": 0.3037238310506292, + "learning_rate": 2.3449338513701904e-08, + "loss": 0.0764, + "step": 11513 + }, + { + "epoch": 0.97, + "grad_norm": 0.3206207545015379, + "learning_rate": 2.3317518266445793e-08, + "loss": 0.0869, + "step": 11514 + }, + { + "epoch": 0.97, + "grad_norm": 0.2973364896136456, + "learning_rate": 2.318606871149265e-08, + "loss": 0.0867, + "step": 11515 + }, + { + "epoch": 0.97, + "grad_norm": 0.4563819497232023, + "learning_rate": 2.305498985863297e-08, + "loss": 0.0819, + "step": 11516 + }, + { + "epoch": 0.97, + "grad_norm": 0.2841969646588255, + "learning_rate": 2.2924281717630613e-08, + "loss": 0.0935, + "step": 11517 + }, + { + "epoch": 0.97, + "grad_norm": 0.2192323421880098, + "learning_rate": 2.2793944298221128e-08, + "loss": 0.0569, + "step": 11518 + }, + { + "epoch": 0.97, + "grad_norm": 0.4873349525226456, + "learning_rate": 2.2663977610112852e-08, + "loss": 0.1165, + "step": 11519 + }, + { + "epoch": 0.97, + "grad_norm": 0.7466334148146463, + "learning_rate": 2.2534381662987492e-08, + "loss": 0.0984, + "step": 11520 + }, + { + "epoch": 0.97, + "grad_norm": 0.269561793362833, + "learning_rate": 2.240515646649677e-08, + "loss": 0.0598, + "step": 11521 + }, + { + "epoch": 0.97, + "grad_norm": 0.4321745938994596, + "learning_rate": 2.2276302030266318e-08, + "loss": 0.0918, + "step": 11522 + }, + { + "epoch": 0.97, + "grad_norm": 0.35306318159092687, + "learning_rate": 2.2147818363894568e-08, + "loss": 0.102, + "step": 11523 + }, + { + "epoch": 0.97, + "grad_norm": 0.39154363362699474, + "learning_rate": 2.2019705476951646e-08, + "loss": 0.0719, + "step": 11524 + }, + { + "epoch": 0.97, + "grad_norm": 0.28021996191133947, + "learning_rate": 2.1891963378979918e-08, + "loss": 0.0903, + "step": 11525 + }, + { + "epoch": 0.97, + "grad_norm": 0.28381826649616543, + "learning_rate": 2.1764592079493995e-08, + "loss": 0.0727, + "step": 11526 + }, + { + "epoch": 0.97, + "grad_norm": 0.28098312596080793, + "learning_rate": 2.1637591587981844e-08, + "loss": 0.0684, + "step": 11527 + }, + { + "epoch": 0.97, + "grad_norm": 0.6029841444074632, + "learning_rate": 2.1510961913903115e-08, + "loss": 0.1255, + "step": 11528 + }, + { + "epoch": 0.97, + "grad_norm": 0.3274631943879163, + "learning_rate": 2.138470306668916e-08, + "loss": 0.0431, + "step": 11529 + }, + { + "epoch": 0.97, + "grad_norm": 0.22485210333720665, + "learning_rate": 2.1258815055745785e-08, + "loss": 0.0429, + "step": 11530 + }, + { + "epoch": 0.97, + "grad_norm": 0.4381206931078247, + "learning_rate": 2.1133297890448822e-08, + "loss": 0.0869, + "step": 11531 + }, + { + "epoch": 0.97, + "grad_norm": 0.4297777043068957, + "learning_rate": 2.1008151580148017e-08, + "loss": 0.0621, + "step": 11532 + }, + { + "epoch": 0.97, + "grad_norm": 0.3256433004477139, + "learning_rate": 2.08833761341648e-08, + "loss": 0.0672, + "step": 11533 + }, + { + "epoch": 0.97, + "grad_norm": 0.33207108673669844, + "learning_rate": 2.0758971561793406e-08, + "loss": 0.0503, + "step": 11534 + }, + { + "epoch": 0.97, + "grad_norm": 0.41434776562921183, + "learning_rate": 2.0634937872299754e-08, + "loss": 0.1095, + "step": 11535 + }, + { + "epoch": 0.97, + "grad_norm": 0.26255068569039547, + "learning_rate": 2.051127507492312e-08, + "loss": 0.0994, + "step": 11536 + }, + { + "epoch": 0.97, + "grad_norm": 0.49876291100700343, + "learning_rate": 2.038798317887447e-08, + "loss": 0.1305, + "step": 11537 + }, + { + "epoch": 0.97, + "grad_norm": 0.35195645737122633, + "learning_rate": 2.0265062193337014e-08, + "loss": 0.1106, + "step": 11538 + }, + { + "epoch": 0.97, + "grad_norm": 0.3061923993169574, + "learning_rate": 2.0142512127466763e-08, + "loss": 0.0568, + "step": 11539 + }, + { + "epoch": 0.97, + "grad_norm": 0.3383181090125839, + "learning_rate": 2.0020332990391967e-08, + "loss": 0.0699, + "step": 11540 + }, + { + "epoch": 0.97, + "grad_norm": 0.25599434500929386, + "learning_rate": 1.989852479121368e-08, + "loss": 0.0566, + "step": 11541 + }, + { + "epoch": 0.97, + "grad_norm": 0.3821385913076168, + "learning_rate": 1.9777087539004093e-08, + "loss": 0.0709, + "step": 11542 + }, + { + "epoch": 0.97, + "grad_norm": 0.34559006744891935, + "learning_rate": 1.9656021242808742e-08, + "loss": 0.0656, + "step": 11543 + }, + { + "epoch": 0.97, + "grad_norm": 0.45413643937277454, + "learning_rate": 1.9535325911645974e-08, + "loss": 0.0793, + "step": 11544 + }, + { + "epoch": 0.97, + "grad_norm": 0.4554363825208631, + "learning_rate": 1.941500155450582e-08, + "loss": 0.0855, + "step": 11545 + }, + { + "epoch": 0.97, + "grad_norm": 0.23241215282092142, + "learning_rate": 1.9295048180349994e-08, + "loss": 0.0727, + "step": 11546 + }, + { + "epoch": 0.97, + "grad_norm": 0.2577645239870664, + "learning_rate": 1.917546579811358e-08, + "loss": 0.0705, + "step": 11547 + }, + { + "epoch": 0.97, + "grad_norm": 0.7350815724816152, + "learning_rate": 1.9056254416704446e-08, + "loss": 0.1312, + "step": 11548 + }, + { + "epoch": 0.97, + "grad_norm": 0.46447700153543364, + "learning_rate": 1.8937414045001046e-08, + "loss": 0.0982, + "step": 11549 + }, + { + "epoch": 0.97, + "grad_norm": 0.35007205211580567, + "learning_rate": 1.88189446918563e-08, + "loss": 0.0655, + "step": 11550 + }, + { + "epoch": 0.97, + "grad_norm": 0.28472921696865283, + "learning_rate": 1.8700846366093705e-08, + "loss": 0.0789, + "step": 11551 + }, + { + "epoch": 0.97, + "grad_norm": 0.49510503638709313, + "learning_rate": 1.858311907651067e-08, + "loss": 0.0845, + "step": 11552 + }, + { + "epoch": 0.97, + "grad_norm": 0.1784477893714195, + "learning_rate": 1.8465762831875734e-08, + "loss": 0.0423, + "step": 11553 + }, + { + "epoch": 0.97, + "grad_norm": 0.5431164617101403, + "learning_rate": 1.8348777640930236e-08, + "loss": 0.0945, + "step": 11554 + }, + { + "epoch": 0.97, + "grad_norm": 0.3925571091483308, + "learning_rate": 1.8232163512388325e-08, + "loss": 0.0862, + "step": 11555 + }, + { + "epoch": 0.97, + "grad_norm": 0.34210563674857997, + "learning_rate": 1.8115920454935264e-08, + "loss": 0.0991, + "step": 11556 + }, + { + "epoch": 0.97, + "grad_norm": 0.3945014467416165, + "learning_rate": 1.80000484772308e-08, + "loss": 0.0825, + "step": 11557 + }, + { + "epoch": 0.97, + "grad_norm": 0.30706276657426446, + "learning_rate": 1.7884547587904144e-08, + "loss": 0.0952, + "step": 11558 + }, + { + "epoch": 0.97, + "grad_norm": 0.18054078498119652, + "learning_rate": 1.7769417795560074e-08, + "loss": 0.0359, + "step": 11559 + }, + { + "epoch": 0.97, + "grad_norm": 0.2818013613223306, + "learning_rate": 1.76546591087734e-08, + "loss": 0.0817, + "step": 11560 + }, + { + "epoch": 0.97, + "grad_norm": 0.47933782292267946, + "learning_rate": 1.7540271536091726e-08, + "loss": 0.0763, + "step": 11561 + }, + { + "epoch": 0.97, + "grad_norm": 0.258962066596929, + "learning_rate": 1.742625508603546e-08, + "loss": 0.0416, + "step": 11562 + }, + { + "epoch": 0.97, + "grad_norm": 0.3765095489717722, + "learning_rate": 1.731260976709781e-08, + "loss": 0.1272, + "step": 11563 + }, + { + "epoch": 0.97, + "grad_norm": 0.4749963916557255, + "learning_rate": 1.7199335587743114e-08, + "loss": 0.1197, + "step": 11564 + }, + { + "epoch": 0.97, + "grad_norm": 0.8406285668901154, + "learning_rate": 1.7086432556408515e-08, + "loss": 0.1386, + "step": 11565 + }, + { + "epoch": 0.97, + "grad_norm": 0.18078421383041954, + "learning_rate": 1.6973900681504506e-08, + "loss": 0.0535, + "step": 11566 + }, + { + "epoch": 0.97, + "grad_norm": 0.23152975325523695, + "learning_rate": 1.686173997141216e-08, + "loss": 0.081, + "step": 11567 + }, + { + "epoch": 0.97, + "grad_norm": 0.29758257104991653, + "learning_rate": 1.6749950434487018e-08, + "loss": 0.0864, + "step": 11568 + }, + { + "epoch": 0.97, + "grad_norm": 0.24545763016054287, + "learning_rate": 1.6638532079054638e-08, + "loss": 0.0693, + "step": 11569 + }, + { + "epoch": 0.97, + "grad_norm": 0.4482030379949024, + "learning_rate": 1.6527484913414494e-08, + "loss": 0.0847, + "step": 11570 + }, + { + "epoch": 0.97, + "grad_norm": 0.2631957732348104, + "learning_rate": 1.6416808945838304e-08, + "loss": 0.0623, + "step": 11571 + }, + { + "epoch": 0.98, + "grad_norm": 0.4276227964281277, + "learning_rate": 1.6306504184570027e-08, + "loss": 0.1005, + "step": 11572 + }, + { + "epoch": 0.98, + "grad_norm": 0.35335301594794294, + "learning_rate": 1.6196570637825317e-08, + "loss": 0.0743, + "step": 11573 + }, + { + "epoch": 0.98, + "grad_norm": 0.38034496661137235, + "learning_rate": 1.608700831379262e-08, + "loss": 0.0789, + "step": 11574 + }, + { + "epoch": 0.98, + "grad_norm": 0.2171940583455382, + "learning_rate": 1.5977817220633184e-08, + "loss": 0.0623, + "step": 11575 + }, + { + "epoch": 0.98, + "grad_norm": 0.3147256449362975, + "learning_rate": 1.5868997366479955e-08, + "loss": 0.0642, + "step": 11576 + }, + { + "epoch": 0.98, + "grad_norm": 0.29207929128927024, + "learning_rate": 1.576054875943811e-08, + "loss": 0.0753, + "step": 11577 + }, + { + "epoch": 0.98, + "grad_norm": 0.3026095848318076, + "learning_rate": 1.5652471407586743e-08, + "loss": 0.0911, + "step": 11578 + }, + { + "epoch": 0.98, + "grad_norm": 0.47134602121213226, + "learning_rate": 1.5544765318974975e-08, + "loss": 0.1266, + "step": 11579 + }, + { + "epoch": 0.98, + "grad_norm": 0.42788347066781407, + "learning_rate": 1.5437430501625272e-08, + "loss": 0.0958, + "step": 11580 + }, + { + "epoch": 0.98, + "grad_norm": 0.2111782848657347, + "learning_rate": 1.5330466963532908e-08, + "loss": 0.0579, + "step": 11581 + }, + { + "epoch": 0.98, + "grad_norm": 0.3683916541020962, + "learning_rate": 1.5223874712665954e-08, + "loss": 0.0729, + "step": 11582 + }, + { + "epoch": 0.98, + "grad_norm": 0.49798363479012253, + "learning_rate": 1.511765375696306e-08, + "loss": 0.1201, + "step": 11583 + }, + { + "epoch": 0.98, + "grad_norm": 0.2527166619435074, + "learning_rate": 1.5011804104335668e-08, + "loss": 0.0647, + "step": 11584 + }, + { + "epoch": 0.98, + "grad_norm": 0.32205308461034304, + "learning_rate": 1.4906325762669705e-08, + "loss": 0.0806, + "step": 11585 + }, + { + "epoch": 0.98, + "grad_norm": 0.327638273724475, + "learning_rate": 1.4801218739820544e-08, + "loss": 0.0901, + "step": 11586 + }, + { + "epoch": 0.98, + "grad_norm": 0.35203439740372067, + "learning_rate": 1.4696483043617482e-08, + "loss": 0.1009, + "step": 11587 + }, + { + "epoch": 0.98, + "grad_norm": 0.3212084621534735, + "learning_rate": 1.4592118681862056e-08, + "loss": 0.0749, + "step": 11588 + }, + { + "epoch": 0.98, + "grad_norm": 0.35593589279390087, + "learning_rate": 1.448812566232749e-08, + "loss": 0.0905, + "step": 11589 + }, + { + "epoch": 0.98, + "grad_norm": 0.37372612001663696, + "learning_rate": 1.4384503992760368e-08, + "loss": 0.0914, + "step": 11590 + }, + { + "epoch": 0.98, + "grad_norm": 0.2711063095822574, + "learning_rate": 1.428125368087896e-08, + "loss": 0.0627, + "step": 11591 + }, + { + "epoch": 0.98, + "grad_norm": 0.19430711217725277, + "learning_rate": 1.4178374734373224e-08, + "loss": 0.0535, + "step": 11592 + }, + { + "epoch": 0.98, + "grad_norm": 0.38588194515337526, + "learning_rate": 1.4075867160907031e-08, + "loss": 0.0882, + "step": 11593 + }, + { + "epoch": 0.98, + "grad_norm": 0.3224665385214907, + "learning_rate": 1.3973730968115384e-08, + "loss": 0.1002, + "step": 11594 + }, + { + "epoch": 0.98, + "grad_norm": 0.24794471194302883, + "learning_rate": 1.3871966163605533e-08, + "loss": 0.0716, + "step": 11595 + }, + { + "epoch": 0.98, + "grad_norm": 0.36005254407483, + "learning_rate": 1.3770572754958633e-08, + "loss": 0.1072, + "step": 11596 + }, + { + "epoch": 0.98, + "grad_norm": 0.5281959621898545, + "learning_rate": 1.3669550749726424e-08, + "loss": 0.1104, + "step": 11597 + }, + { + "epoch": 0.98, + "grad_norm": 0.5774251795584968, + "learning_rate": 1.3568900155432885e-08, + "loss": 0.1243, + "step": 11598 + }, + { + "epoch": 0.98, + "grad_norm": 0.19457642851823678, + "learning_rate": 1.3468620979576464e-08, + "loss": 0.0393, + "step": 11599 + }, + { + "epoch": 0.98, + "grad_norm": 0.2944445371860034, + "learning_rate": 1.336871322962563e-08, + "loss": 0.097, + "step": 11600 + }, + { + "epoch": 0.98, + "grad_norm": 0.4060100705979313, + "learning_rate": 1.3269176913022208e-08, + "loss": 0.0825, + "step": 11601 + }, + { + "epoch": 0.98, + "grad_norm": 0.27558439411468527, + "learning_rate": 1.3170012037180268e-08, + "loss": 0.0604, + "step": 11602 + }, + { + "epoch": 0.98, + "grad_norm": 0.21685231105899788, + "learning_rate": 1.307121860948668e-08, + "loss": 0.0479, + "step": 11603 + }, + { + "epoch": 0.98, + "grad_norm": 0.20056050413425275, + "learning_rate": 1.2972796637299444e-08, + "loss": 0.0596, + "step": 11604 + }, + { + "epoch": 0.98, + "grad_norm": 0.5038161893165299, + "learning_rate": 1.2874746127949921e-08, + "loss": 0.0762, + "step": 11605 + }, + { + "epoch": 0.98, + "grad_norm": 0.2628471631309903, + "learning_rate": 1.2777067088741158e-08, + "loss": 0.0674, + "step": 11606 + }, + { + "epoch": 0.98, + "grad_norm": 0.5642598774161195, + "learning_rate": 1.2679759526949554e-08, + "loss": 0.1118, + "step": 11607 + }, + { + "epoch": 0.98, + "grad_norm": 0.3937694414584248, + "learning_rate": 1.2582823449822646e-08, + "loss": 0.1063, + "step": 11608 + }, + { + "epoch": 0.98, + "grad_norm": 0.2729646177002026, + "learning_rate": 1.248625886458077e-08, + "loss": 0.0713, + "step": 11609 + }, + { + "epoch": 0.98, + "grad_norm": 0.29737997021788937, + "learning_rate": 1.239006577841706e-08, + "loss": 0.0784, + "step": 11610 + }, + { + "epoch": 0.98, + "grad_norm": 0.4031927010267624, + "learning_rate": 1.2294244198495786e-08, + "loss": 0.1046, + "step": 11611 + }, + { + "epoch": 0.98, + "grad_norm": 0.3003093190190323, + "learning_rate": 1.2198794131955128e-08, + "loss": 0.0638, + "step": 11612 + }, + { + "epoch": 0.98, + "grad_norm": 0.35325147498237114, + "learning_rate": 1.2103715585904396e-08, + "loss": 0.0951, + "step": 11613 + }, + { + "epoch": 0.98, + "grad_norm": 0.2567778764752918, + "learning_rate": 1.200900856742515e-08, + "loss": 0.0967, + "step": 11614 + }, + { + "epoch": 0.98, + "grad_norm": 0.28619974689598177, + "learning_rate": 1.1914673083572303e-08, + "loss": 0.0656, + "step": 11615 + }, + { + "epoch": 0.98, + "grad_norm": 0.5024672333439948, + "learning_rate": 1.1820709141372454e-08, + "loss": 0.0991, + "step": 11616 + }, + { + "epoch": 0.98, + "grad_norm": 0.30415771250022955, + "learning_rate": 1.1727116747825008e-08, + "loss": 0.0819, + "step": 11617 + }, + { + "epoch": 0.98, + "grad_norm": 0.36811349359618833, + "learning_rate": 1.1633895909899939e-08, + "loss": 0.0876, + "step": 11618 + }, + { + "epoch": 0.98, + "grad_norm": 0.34453175114356055, + "learning_rate": 1.1541046634541696e-08, + "loss": 0.0877, + "step": 11619 + }, + { + "epoch": 0.98, + "grad_norm": 0.2682487894434015, + "learning_rate": 1.144856892866697e-08, + "loss": 0.0735, + "step": 11620 + }, + { + "epoch": 0.98, + "grad_norm": 0.15773572265678204, + "learning_rate": 1.1356462799162471e-08, + "loss": 0.0153, + "step": 11621 + }, + { + "epoch": 0.98, + "grad_norm": 0.6186343269358531, + "learning_rate": 1.1264728252889933e-08, + "loss": 0.1171, + "step": 11622 + }, + { + "epoch": 0.98, + "grad_norm": 0.46354534830920635, + "learning_rate": 1.1173365296682226e-08, + "loss": 0.1268, + "step": 11623 + }, + { + "epoch": 0.98, + "grad_norm": 0.2404725928518511, + "learning_rate": 1.1082373937343904e-08, + "loss": 0.0875, + "step": 11624 + }, + { + "epoch": 0.98, + "grad_norm": 0.2748765723076189, + "learning_rate": 1.0991754181653436e-08, + "loss": 0.0581, + "step": 11625 + }, + { + "epoch": 0.98, + "grad_norm": 0.39829795422306863, + "learning_rate": 1.0901506036359866e-08, + "loss": 0.0793, + "step": 11626 + }, + { + "epoch": 0.98, + "grad_norm": 0.34222473327070596, + "learning_rate": 1.081162950818615e-08, + "loss": 0.0676, + "step": 11627 + }, + { + "epoch": 0.98, + "grad_norm": 0.31160889584153484, + "learning_rate": 1.0722124603825823e-08, + "loss": 0.0733, + "step": 11628 + }, + { + "epoch": 0.98, + "grad_norm": 0.2841515650662639, + "learning_rate": 1.0632991329946885e-08, + "loss": 0.0994, + "step": 11629 + }, + { + "epoch": 0.98, + "grad_norm": 0.23118007751351166, + "learning_rate": 1.0544229693187913e-08, + "loss": 0.0631, + "step": 11630 + }, + { + "epoch": 0.98, + "grad_norm": 0.31827814344621286, + "learning_rate": 1.0455839700160286e-08, + "loss": 0.0852, + "step": 11631 + }, + { + "epoch": 0.98, + "grad_norm": 0.36395846163622325, + "learning_rate": 1.0367821357447627e-08, + "loss": 0.0587, + "step": 11632 + }, + { + "epoch": 0.98, + "grad_norm": 0.3400252682964885, + "learning_rate": 1.0280174671606912e-08, + "loss": 0.0561, + "step": 11633 + }, + { + "epoch": 0.98, + "grad_norm": 0.24890837008972586, + "learning_rate": 1.01928996491657e-08, + "loss": 0.0562, + "step": 11634 + }, + { + "epoch": 0.98, + "grad_norm": 0.2876885800084201, + "learning_rate": 1.0105996296625454e-08, + "loss": 0.0786, + "step": 11635 + }, + { + "epoch": 0.98, + "grad_norm": 0.6265329043142973, + "learning_rate": 1.0019464620458775e-08, + "loss": 0.0925, + "step": 11636 + }, + { + "epoch": 0.98, + "grad_norm": 0.27305983999333133, + "learning_rate": 9.933304627111063e-09, + "loss": 0.0636, + "step": 11637 + }, + { + "epoch": 0.98, + "grad_norm": 0.251064137937191, + "learning_rate": 9.847516322999962e-09, + "loss": 0.0414, + "step": 11638 + }, + { + "epoch": 0.98, + "grad_norm": 0.16489430912878564, + "learning_rate": 9.762099714515915e-09, + "loss": 0.0423, + "step": 11639 + }, + { + "epoch": 0.98, + "grad_norm": 0.18434499767430468, + "learning_rate": 9.677054808021058e-09, + "loss": 0.0457, + "step": 11640 + }, + { + "epoch": 0.98, + "grad_norm": 0.18950452226970704, + "learning_rate": 9.592381609849766e-09, + "loss": 0.0338, + "step": 11641 + }, + { + "epoch": 0.98, + "grad_norm": 0.34940353908481575, + "learning_rate": 9.50808012630866e-09, + "loss": 0.076, + "step": 11642 + }, + { + "epoch": 0.98, + "grad_norm": 0.3468091242355309, + "learning_rate": 9.424150363678275e-09, + "loss": 0.0459, + "step": 11643 + }, + { + "epoch": 0.98, + "grad_norm": 0.1991970139491377, + "learning_rate": 9.340592328209163e-09, + "loss": 0.0549, + "step": 11644 + }, + { + "epoch": 0.98, + "grad_norm": 0.2209420340354997, + "learning_rate": 9.257406026125238e-09, + "loss": 0.0609, + "step": 11645 + }, + { + "epoch": 0.98, + "grad_norm": 0.5451416592577246, + "learning_rate": 9.174591463623206e-09, + "loss": 0.1665, + "step": 11646 + }, + { + "epoch": 0.98, + "grad_norm": 0.2784633723637343, + "learning_rate": 9.092148646871469e-09, + "loss": 0.0745, + "step": 11647 + }, + { + "epoch": 0.98, + "grad_norm": 0.24837036139916127, + "learning_rate": 9.010077582010667e-09, + "loss": 0.0634, + "step": 11648 + }, + { + "epoch": 0.98, + "grad_norm": 0.5513257189644777, + "learning_rate": 8.928378275153693e-09, + "loss": 0.1166, + "step": 11649 + }, + { + "epoch": 0.98, + "grad_norm": 0.35458166894293935, + "learning_rate": 8.847050732386231e-09, + "loss": 0.0941, + "step": 11650 + }, + { + "epoch": 0.98, + "grad_norm": 0.2470265884225025, + "learning_rate": 8.766094959766214e-09, + "loss": 0.0478, + "step": 11651 + }, + { + "epoch": 0.98, + "grad_norm": 0.318524790578844, + "learning_rate": 8.685510963323262e-09, + "loss": 0.0732, + "step": 11652 + }, + { + "epoch": 0.98, + "grad_norm": 0.5335469697968668, + "learning_rate": 8.605298749060353e-09, + "loss": 0.0997, + "step": 11653 + }, + { + "epoch": 0.98, + "grad_norm": 0.292504353987712, + "learning_rate": 8.525458322951596e-09, + "loss": 0.0774, + "step": 11654 + }, + { + "epoch": 0.98, + "grad_norm": 0.2267313409990472, + "learning_rate": 8.445989690944457e-09, + "loss": 0.0782, + "step": 11655 + }, + { + "epoch": 0.98, + "grad_norm": 0.3102383269599712, + "learning_rate": 8.366892858957532e-09, + "loss": 0.0371, + "step": 11656 + }, + { + "epoch": 0.98, + "grad_norm": 0.3813447834389275, + "learning_rate": 8.288167832882777e-09, + "loss": 0.0839, + "step": 11657 + }, + { + "epoch": 0.98, + "grad_norm": 0.2293364294351581, + "learning_rate": 8.209814618584944e-09, + "loss": 0.0546, + "step": 11658 + }, + { + "epoch": 0.98, + "grad_norm": 0.3212925121479801, + "learning_rate": 8.13183322189881e-09, + "loss": 0.1134, + "step": 11659 + }, + { + "epoch": 0.98, + "grad_norm": 0.30201757307799637, + "learning_rate": 8.054223648633618e-09, + "loss": 0.0671, + "step": 11660 + }, + { + "epoch": 0.98, + "grad_norm": 0.5008833382816787, + "learning_rate": 7.976985904569746e-09, + "loss": 0.1078, + "step": 11661 + }, + { + "epoch": 0.98, + "grad_norm": 0.35644120764923065, + "learning_rate": 7.900119995460919e-09, + "loss": 0.0479, + "step": 11662 + }, + { + "epoch": 0.98, + "grad_norm": 0.21798623519603036, + "learning_rate": 7.823625927032563e-09, + "loss": 0.0435, + "step": 11663 + }, + { + "epoch": 0.98, + "grad_norm": 0.3693477655023786, + "learning_rate": 7.74750370498234e-09, + "loss": 0.0303, + "step": 11664 + }, + { + "epoch": 0.98, + "grad_norm": 0.5515599762152675, + "learning_rate": 7.671753334979604e-09, + "loss": 0.1245, + "step": 11665 + }, + { + "epoch": 0.98, + "grad_norm": 0.30871027878409035, + "learning_rate": 7.596374822667063e-09, + "loss": 0.0724, + "step": 11666 + }, + { + "epoch": 0.98, + "grad_norm": 0.23277483085736977, + "learning_rate": 7.52136817365967e-09, + "loss": 0.039, + "step": 11667 + }, + { + "epoch": 0.98, + "grad_norm": 0.2306364768487206, + "learning_rate": 7.44673339354407e-09, + "loss": 0.0667, + "step": 11668 + }, + { + "epoch": 0.98, + "grad_norm": 0.4225360651366093, + "learning_rate": 7.372470487879146e-09, + "loss": 0.0941, + "step": 11669 + }, + { + "epoch": 0.98, + "grad_norm": 0.308623947888744, + "learning_rate": 7.298579462197142e-09, + "loss": 0.0985, + "step": 11670 + }, + { + "epoch": 0.98, + "grad_norm": 0.2777964899887318, + "learning_rate": 7.225060322001986e-09, + "loss": 0.0667, + "step": 11671 + }, + { + "epoch": 0.98, + "grad_norm": 0.38665690749011766, + "learning_rate": 7.151913072768746e-09, + "loss": 0.1126, + "step": 11672 + }, + { + "epoch": 0.98, + "grad_norm": 0.22935740037088323, + "learning_rate": 7.079137719946949e-09, + "loss": 0.0576, + "step": 11673 + }, + { + "epoch": 0.98, + "grad_norm": 0.3293188296095383, + "learning_rate": 7.006734268956706e-09, + "loss": 0.0566, + "step": 11674 + }, + { + "epoch": 0.98, + "grad_norm": 0.3715346070788178, + "learning_rate": 6.934702725190923e-09, + "loss": 0.0859, + "step": 11675 + }, + { + "epoch": 0.98, + "grad_norm": 0.24745405478479526, + "learning_rate": 6.863043094015864e-09, + "loss": 0.0759, + "step": 11676 + }, + { + "epoch": 0.98, + "grad_norm": 0.31282545410458285, + "learning_rate": 6.791755380768372e-09, + "loss": 0.0794, + "step": 11677 + }, + { + "epoch": 0.98, + "grad_norm": 0.3696731280795269, + "learning_rate": 6.7208395907580884e-09, + "loss": 0.1033, + "step": 11678 + }, + { + "epoch": 0.98, + "grad_norm": 0.34498518477033374, + "learning_rate": 6.650295729268008e-09, + "loss": 0.0848, + "step": 11679 + }, + { + "epoch": 0.98, + "grad_norm": 0.3424817770552819, + "learning_rate": 6.580123801552263e-09, + "loss": 0.087, + "step": 11680 + }, + { + "epoch": 0.98, + "grad_norm": 0.2998104658487225, + "learning_rate": 6.510323812837782e-09, + "loss": 0.0931, + "step": 11681 + }, + { + "epoch": 0.98, + "grad_norm": 0.34499255623758235, + "learning_rate": 6.440895768323741e-09, + "loss": 0.1026, + "step": 11682 + }, + { + "epoch": 0.98, + "grad_norm": 0.1898615298303383, + "learning_rate": 6.371839673181001e-09, + "loss": 0.0419, + "step": 11683 + }, + { + "epoch": 0.98, + "grad_norm": 0.20100455650589155, + "learning_rate": 6.303155532553784e-09, + "loss": 0.0592, + "step": 11684 + }, + { + "epoch": 0.98, + "grad_norm": 0.21830741780046703, + "learning_rate": 6.234843351557995e-09, + "loss": 0.0632, + "step": 11685 + }, + { + "epoch": 0.98, + "grad_norm": 0.34962770773632357, + "learning_rate": 6.166903135282343e-09, + "loss": 0.0989, + "step": 11686 + }, + { + "epoch": 0.98, + "grad_norm": 0.23244997523901723, + "learning_rate": 6.0993348887866674e-09, + "loss": 0.0328, + "step": 11687 + }, + { + "epoch": 0.98, + "grad_norm": 0.22874942758022632, + "learning_rate": 6.032138617104166e-09, + "loss": 0.0426, + "step": 11688 + }, + { + "epoch": 0.98, + "grad_norm": 0.38510228914862726, + "learning_rate": 5.965314325239724e-09, + "loss": 0.0899, + "step": 11689 + }, + { + "epoch": 0.99, + "grad_norm": 0.4449176584090638, + "learning_rate": 5.898862018171026e-09, + "loss": 0.0872, + "step": 11690 + }, + { + "epoch": 0.99, + "grad_norm": 0.32093066418897115, + "learning_rate": 5.832781700848556e-09, + "loss": 0.074, + "step": 11691 + }, + { + "epoch": 0.99, + "grad_norm": 0.3708209093807455, + "learning_rate": 5.767073378193377e-09, + "loss": 0.0618, + "step": 11692 + }, + { + "epoch": 0.99, + "grad_norm": 0.1847916385801939, + "learning_rate": 5.701737055099909e-09, + "loss": 0.0422, + "step": 11693 + }, + { + "epoch": 0.99, + "grad_norm": 0.5758226941015413, + "learning_rate": 5.636772736434814e-09, + "loss": 0.133, + "step": 11694 + }, + { + "epoch": 0.99, + "grad_norm": 0.22094610434679887, + "learning_rate": 5.5721804270375545e-09, + "loss": 0.0645, + "step": 11695 + }, + { + "epoch": 0.99, + "grad_norm": 0.31302511008902073, + "learning_rate": 5.507960131719281e-09, + "loss": 0.0707, + "step": 11696 + }, + { + "epoch": 0.99, + "grad_norm": 0.34221648898517515, + "learning_rate": 5.444111855262835e-09, + "loss": 0.1116, + "step": 11697 + }, + { + "epoch": 0.99, + "grad_norm": 0.29108362659481507, + "learning_rate": 5.380635602424411e-09, + "loss": 0.0928, + "step": 11698 + }, + { + "epoch": 0.99, + "grad_norm": 0.32518192527661716, + "learning_rate": 5.3175313779318945e-09, + "loss": 0.0614, + "step": 11699 + }, + { + "epoch": 0.99, + "grad_norm": 0.2087886676579728, + "learning_rate": 5.25479918648597e-09, + "loss": 0.0568, + "step": 11700 + }, + { + "epoch": 0.99, + "grad_norm": 0.23743955714051537, + "learning_rate": 5.192439032759011e-09, + "loss": 0.0628, + "step": 11701 + }, + { + "epoch": 0.99, + "grad_norm": 0.21791651637671183, + "learning_rate": 5.1304509213967455e-09, + "loss": 0.0509, + "step": 11702 + }, + { + "epoch": 0.99, + "grad_norm": 0.30839713114936845, + "learning_rate": 5.068834857014926e-09, + "loss": 0.0658, + "step": 11703 + }, + { + "epoch": 0.99, + "grad_norm": 0.38117575824190525, + "learning_rate": 5.007590844204324e-09, + "loss": 0.0741, + "step": 11704 + }, + { + "epoch": 0.99, + "grad_norm": 0.2693265829443724, + "learning_rate": 4.946718887526292e-09, + "loss": 0.0798, + "step": 11705 + }, + { + "epoch": 0.99, + "grad_norm": 0.25886199458226505, + "learning_rate": 4.886218991514979e-09, + "loss": 0.1014, + "step": 11706 + }, + { + "epoch": 0.99, + "grad_norm": 0.20190110017395557, + "learning_rate": 4.826091160676782e-09, + "loss": 0.0454, + "step": 11707 + }, + { + "epoch": 0.99, + "grad_norm": 0.2908591937595396, + "learning_rate": 4.766335399490341e-09, + "loss": 0.0552, + "step": 11708 + }, + { + "epoch": 0.99, + "grad_norm": 0.2826969543259274, + "learning_rate": 4.706951712406538e-09, + "loss": 0.0992, + "step": 11709 + }, + { + "epoch": 0.99, + "grad_norm": 0.2424049771811536, + "learning_rate": 4.647940103848503e-09, + "loss": 0.0948, + "step": 11710 + }, + { + "epoch": 0.99, + "grad_norm": 0.32801346753312116, + "learning_rate": 4.589300578212163e-09, + "loss": 0.1164, + "step": 11711 + }, + { + "epoch": 0.99, + "grad_norm": 0.19767721035560054, + "learning_rate": 4.5310331398651374e-09, + "loss": 0.0562, + "step": 11712 + }, + { + "epoch": 0.99, + "grad_norm": 0.3023838447955846, + "learning_rate": 4.473137793147842e-09, + "loss": 0.0786, + "step": 11713 + }, + { + "epoch": 0.99, + "grad_norm": 0.1813769833934584, + "learning_rate": 4.4156145423718264e-09, + "loss": 0.0527, + "step": 11714 + }, + { + "epoch": 0.99, + "grad_norm": 0.3001139513292004, + "learning_rate": 4.358463391822554e-09, + "loss": 0.105, + "step": 11715 + }, + { + "epoch": 0.99, + "grad_norm": 0.23605412898680925, + "learning_rate": 4.301684345756619e-09, + "loss": 0.0521, + "step": 11716 + }, + { + "epoch": 0.99, + "grad_norm": 0.26957254624627247, + "learning_rate": 4.245277408403414e-09, + "loss": 0.062, + "step": 11717 + }, + { + "epoch": 0.99, + "grad_norm": 0.2960311829191653, + "learning_rate": 4.189242583964581e-09, + "loss": 0.0905, + "step": 11718 + }, + { + "epoch": 0.99, + "grad_norm": 0.3222039325305927, + "learning_rate": 4.133579876613447e-09, + "loss": 0.0981, + "step": 11719 + }, + { + "epoch": 0.99, + "grad_norm": 0.274492102600491, + "learning_rate": 4.07828929049614e-09, + "loss": 0.0784, + "step": 11720 + }, + { + "epoch": 0.99, + "grad_norm": 0.3507789854021571, + "learning_rate": 4.0233708297315874e-09, + "loss": 0.1123, + "step": 11721 + }, + { + "epoch": 0.99, + "grad_norm": 0.3582079842573663, + "learning_rate": 3.968824498409851e-09, + "loss": 0.0684, + "step": 11722 + }, + { + "epoch": 0.99, + "grad_norm": 0.38003107181484863, + "learning_rate": 3.914650300594347e-09, + "loss": 0.089, + "step": 11723 + }, + { + "epoch": 0.99, + "grad_norm": 0.3635450475906826, + "learning_rate": 3.8608482403196255e-09, + "loss": 0.1099, + "step": 11724 + }, + { + "epoch": 0.99, + "grad_norm": 0.3152907723099724, + "learning_rate": 3.807418321594147e-09, + "loss": 0.1083, + "step": 11725 + }, + { + "epoch": 0.99, + "grad_norm": 0.524929835984492, + "learning_rate": 3.754360548396951e-09, + "loss": 0.1209, + "step": 11726 + }, + { + "epoch": 0.99, + "grad_norm": 0.36260212920868823, + "learning_rate": 3.7016749246798766e-09, + "loss": 0.0767, + "step": 11727 + }, + { + "epoch": 0.99, + "grad_norm": 0.24489584199606804, + "learning_rate": 3.6493614543681166e-09, + "loss": 0.0578, + "step": 11728 + }, + { + "epoch": 0.99, + "grad_norm": 0.31041872957734284, + "learning_rate": 3.597420141357444e-09, + "loss": 0.0589, + "step": 11729 + }, + { + "epoch": 0.99, + "grad_norm": 0.3437706940663031, + "learning_rate": 3.545850989517541e-09, + "loss": 0.0859, + "step": 11730 + }, + { + "epoch": 0.99, + "grad_norm": 0.18754634991027802, + "learning_rate": 3.494654002688669e-09, + "loss": 0.0524, + "step": 11731 + }, + { + "epoch": 0.99, + "grad_norm": 0.2837435599278237, + "learning_rate": 3.4438291846849994e-09, + "loss": 0.0509, + "step": 11732 + }, + { + "epoch": 0.99, + "grad_norm": 0.3745836738960185, + "learning_rate": 3.393376539291837e-09, + "loss": 0.104, + "step": 11733 + }, + { + "epoch": 0.99, + "grad_norm": 0.35939347803926025, + "learning_rate": 3.343296070267843e-09, + "loss": 0.112, + "step": 11734 + }, + { + "epoch": 0.99, + "grad_norm": 0.3880796222436005, + "learning_rate": 3.2935877813422557e-09, + "loss": 0.0778, + "step": 11735 + }, + { + "epoch": 0.99, + "grad_norm": 0.25449122423019377, + "learning_rate": 3.244251676218779e-09, + "loss": 0.0825, + "step": 11736 + }, + { + "epoch": 0.99, + "grad_norm": 0.1876866328454259, + "learning_rate": 3.195287758571142e-09, + "loss": 0.0514, + "step": 11737 + }, + { + "epoch": 0.99, + "grad_norm": 0.4659302365464322, + "learning_rate": 3.146696032047536e-09, + "loss": 0.125, + "step": 11738 + }, + { + "epoch": 0.99, + "grad_norm": 0.21965715903888344, + "learning_rate": 3.098476500266734e-09, + "loss": 0.0747, + "step": 11739 + }, + { + "epoch": 0.99, + "grad_norm": 0.4616749118902757, + "learning_rate": 3.050629166820307e-09, + "loss": 0.0897, + "step": 11740 + }, + { + "epoch": 0.99, + "grad_norm": 0.2924198614917721, + "learning_rate": 3.003154035272071e-09, + "loss": 0.0832, + "step": 11741 + }, + { + "epoch": 0.99, + "grad_norm": 0.3195176685653182, + "learning_rate": 2.956051109159197e-09, + "loss": 0.0768, + "step": 11742 + }, + { + "epoch": 0.99, + "grad_norm": 0.2905713077334311, + "learning_rate": 2.9093203919894342e-09, + "loss": 0.0753, + "step": 11743 + }, + { + "epoch": 0.99, + "grad_norm": 0.22611534215757356, + "learning_rate": 2.862961887243332e-09, + "loss": 0.0444, + "step": 11744 + }, + { + "epoch": 0.99, + "grad_norm": 0.29157363493723704, + "learning_rate": 2.8169755983747938e-09, + "loss": 0.0723, + "step": 11745 + }, + { + "epoch": 0.99, + "grad_norm": 0.2623712901175991, + "learning_rate": 2.771361528808303e-09, + "loss": 0.0667, + "step": 11746 + }, + { + "epoch": 0.99, + "grad_norm": 0.2898198342386482, + "learning_rate": 2.726119681942252e-09, + "loss": 0.0897, + "step": 11747 + }, + { + "epoch": 0.99, + "grad_norm": 0.3878889869242105, + "learning_rate": 2.6812500611456128e-09, + "loss": 0.1351, + "step": 11748 + }, + { + "epoch": 0.99, + "grad_norm": 0.4742886126537848, + "learning_rate": 2.6367526697612667e-09, + "loss": 0.0726, + "step": 11749 + }, + { + "epoch": 0.99, + "grad_norm": 0.31844921512361263, + "learning_rate": 2.5926275111032296e-09, + "loss": 0.0744, + "step": 11750 + }, + { + "epoch": 0.99, + "grad_norm": 0.29980898060181776, + "learning_rate": 2.5488745884583166e-09, + "loss": 0.0838, + "step": 11751 + }, + { + "epoch": 0.99, + "grad_norm": 0.44363154521735015, + "learning_rate": 2.5054939050855877e-09, + "loss": 0.0739, + "step": 11752 + }, + { + "epoch": 0.99, + "grad_norm": 0.30603154280587114, + "learning_rate": 2.4624854642163467e-09, + "loss": 0.0709, + "step": 11753 + }, + { + "epoch": 0.99, + "grad_norm": 0.6724538549797336, + "learning_rate": 2.419849269053587e-09, + "loss": 0.1416, + "step": 11754 + }, + { + "epoch": 0.99, + "grad_norm": 0.3049014330178256, + "learning_rate": 2.3775853227736566e-09, + "loss": 0.0457, + "step": 11755 + }, + { + "epoch": 0.99, + "grad_norm": 0.5560289687569162, + "learning_rate": 2.3356936285251486e-09, + "loss": 0.133, + "step": 11756 + }, + { + "epoch": 0.99, + "grad_norm": 0.304402795354917, + "learning_rate": 2.294174189427234e-09, + "loss": 0.0966, + "step": 11757 + }, + { + "epoch": 0.99, + "grad_norm": 0.3847321658310377, + "learning_rate": 2.2530270085724394e-09, + "loss": 0.0769, + "step": 11758 + }, + { + "epoch": 0.99, + "grad_norm": 0.34332344273527193, + "learning_rate": 2.2122520890272005e-09, + "loss": 0.0968, + "step": 11759 + }, + { + "epoch": 0.99, + "grad_norm": 0.1429218124893918, + "learning_rate": 2.171849433826867e-09, + "loss": 0.0359, + "step": 11760 + }, + { + "epoch": 0.99, + "grad_norm": 0.29721986374660564, + "learning_rate": 2.131819045982364e-09, + "loss": 0.0736, + "step": 11761 + }, + { + "epoch": 0.99, + "grad_norm": 0.26007282749663907, + "learning_rate": 2.0921609284746402e-09, + "loss": 0.0563, + "step": 11762 + }, + { + "epoch": 0.99, + "grad_norm": 0.8124114389026214, + "learning_rate": 2.052875084257444e-09, + "loss": 0.0538, + "step": 11763 + }, + { + "epoch": 0.99, + "grad_norm": 0.22827876270234074, + "learning_rate": 2.0139615162573234e-09, + "loss": 0.0584, + "step": 11764 + }, + { + "epoch": 0.99, + "grad_norm": 0.34518652103434205, + "learning_rate": 1.9754202273730706e-09, + "loss": 0.0854, + "step": 11765 + }, + { + "epoch": 0.99, + "grad_norm": 0.3758834220443019, + "learning_rate": 1.937251220474612e-09, + "loss": 0.056, + "step": 11766 + }, + { + "epoch": 0.99, + "grad_norm": 0.34780667092615575, + "learning_rate": 1.8994544984057838e-09, + "loss": 0.0922, + "step": 11767 + }, + { + "epoch": 0.99, + "grad_norm": 0.35528637347309605, + "learning_rate": 1.862030063982112e-09, + "loss": 0.0942, + "step": 11768 + }, + { + "epoch": 0.99, + "grad_norm": 0.3799984134402679, + "learning_rate": 1.824977919990256e-09, + "loss": 0.082, + "step": 11769 + }, + { + "epoch": 0.99, + "grad_norm": 0.5781437266226297, + "learning_rate": 1.788298069190786e-09, + "loss": 0.1434, + "step": 11770 + }, + { + "epoch": 0.99, + "grad_norm": 0.3113911224875263, + "learning_rate": 1.7519905143154048e-09, + "loss": 0.0884, + "step": 11771 + }, + { + "epoch": 0.99, + "grad_norm": 0.37165810657128423, + "learning_rate": 1.7160552580686163e-09, + "loss": 0.0581, + "step": 11772 + }, + { + "epoch": 0.99, + "grad_norm": 0.3071167657486759, + "learning_rate": 1.680492303127723e-09, + "loss": 0.0736, + "step": 11773 + }, + { + "epoch": 0.99, + "grad_norm": 0.4172984737921329, + "learning_rate": 1.6453016521406074e-09, + "loss": 0.0389, + "step": 11774 + }, + { + "epoch": 0.99, + "grad_norm": 0.32295976498516704, + "learning_rate": 1.6104833077290605e-09, + "loss": 0.0685, + "step": 11775 + }, + { + "epoch": 0.99, + "grad_norm": 0.42797967652606334, + "learning_rate": 1.5760372724871186e-09, + "loss": 0.1185, + "step": 11776 + }, + { + "epoch": 0.99, + "grad_norm": 0.4633207679690751, + "learning_rate": 1.541963548979397e-09, + "loss": 0.0915, + "step": 11777 + }, + { + "epoch": 0.99, + "grad_norm": 0.4072940743861392, + "learning_rate": 1.5082621397444208e-09, + "loss": 0.1355, + "step": 11778 + }, + { + "epoch": 0.99, + "grad_norm": 0.23769197711983342, + "learning_rate": 1.474933047292404e-09, + "loss": 0.0584, + "step": 11779 + }, + { + "epoch": 0.99, + "grad_norm": 0.2066007336165977, + "learning_rate": 1.4419762741058052e-09, + "loss": 0.0583, + "step": 11780 + }, + { + "epoch": 0.99, + "grad_norm": 0.23935670955594482, + "learning_rate": 1.4093918226398829e-09, + "loss": 0.0618, + "step": 11781 + }, + { + "epoch": 0.99, + "grad_norm": 0.24753221993947386, + "learning_rate": 1.3771796953210292e-09, + "loss": 0.0673, + "step": 11782 + }, + { + "epoch": 0.99, + "grad_norm": 0.5860998978953742, + "learning_rate": 1.3453398945495467e-09, + "loss": 0.1269, + "step": 11783 + }, + { + "epoch": 0.99, + "grad_norm": 0.603867184496569, + "learning_rate": 1.313872422695761e-09, + "loss": 0.0883, + "step": 11784 + }, + { + "epoch": 0.99, + "grad_norm": 0.32797923400167506, + "learning_rate": 1.2827772821044638e-09, + "loss": 0.0982, + "step": 11785 + }, + { + "epoch": 0.99, + "grad_norm": 0.8177253745068733, + "learning_rate": 1.252054475092135e-09, + "loss": 0.1328, + "step": 11786 + }, + { + "epoch": 0.99, + "grad_norm": 0.3881613445963822, + "learning_rate": 1.221704003945834e-09, + "loss": 0.0895, + "step": 11787 + }, + { + "epoch": 0.99, + "grad_norm": 0.4614403955053557, + "learning_rate": 1.1917258709276402e-09, + "loss": 0.1132, + "step": 11788 + }, + { + "epoch": 0.99, + "grad_norm": 0.2756829686052264, + "learning_rate": 1.1621200782696573e-09, + "loss": 0.0956, + "step": 11789 + }, + { + "epoch": 0.99, + "grad_norm": 0.5739813669314355, + "learning_rate": 1.1328866281773432e-09, + "loss": 0.1109, + "step": 11790 + }, + { + "epoch": 0.99, + "grad_norm": 0.2540557745636962, + "learning_rate": 1.1040255228284002e-09, + "loss": 0.0532, + "step": 11791 + }, + { + "epoch": 0.99, + "grad_norm": 0.373205882408839, + "learning_rate": 1.0755367643722203e-09, + "loss": 0.1051, + "step": 11792 + }, + { + "epoch": 0.99, + "grad_norm": 0.4720354757525241, + "learning_rate": 1.0474203549309946e-09, + "loss": 0.1056, + "step": 11793 + }, + { + "epoch": 0.99, + "grad_norm": 0.3671025661759038, + "learning_rate": 1.0196762965991592e-09, + "loss": 0.0867, + "step": 11794 + }, + { + "epoch": 0.99, + "grad_norm": 0.3108969214941454, + "learning_rate": 9.923045914428387e-10, + "loss": 0.0546, + "step": 11795 + }, + { + "epoch": 0.99, + "grad_norm": 0.42579980270451384, + "learning_rate": 9.653052415009578e-10, + "loss": 0.1094, + "step": 11796 + }, + { + "epoch": 0.99, + "grad_norm": 0.2067672153837607, + "learning_rate": 9.386782487846857e-10, + "loss": 0.0644, + "step": 11797 + }, + { + "epoch": 0.99, + "grad_norm": 0.15849490249565845, + "learning_rate": 9.124236152774358e-10, + "loss": 0.0112, + "step": 11798 + }, + { + "epoch": 0.99, + "grad_norm": 0.3676063998633194, + "learning_rate": 8.865413429348658e-10, + "loss": 0.0774, + "step": 11799 + }, + { + "epoch": 0.99, + "grad_norm": 0.5118447288748603, + "learning_rate": 8.610314336843229e-10, + "loss": 0.055, + "step": 11800 + }, + { + "epoch": 0.99, + "grad_norm": 0.32184649405325855, + "learning_rate": 8.358938894265089e-10, + "loss": 0.091, + "step": 11801 + }, + { + "epoch": 0.99, + "grad_norm": 0.48191383943045984, + "learning_rate": 8.1112871203326e-10, + "loss": 0.096, + "step": 11802 + }, + { + "epoch": 0.99, + "grad_norm": 0.4378390971315926, + "learning_rate": 7.867359033497668e-10, + "loss": 0.0959, + "step": 11803 + }, + { + "epoch": 0.99, + "grad_norm": 0.3224403064762407, + "learning_rate": 7.627154651929092e-10, + "loss": 0.0776, + "step": 11804 + }, + { + "epoch": 0.99, + "grad_norm": 0.2113921458945713, + "learning_rate": 7.390673993518116e-10, + "loss": 0.0926, + "step": 11805 + }, + { + "epoch": 0.99, + "grad_norm": 0.3689104176921736, + "learning_rate": 7.157917075878429e-10, + "loss": 0.1036, + "step": 11806 + }, + { + "epoch": 0.99, + "grad_norm": 0.3229002645865447, + "learning_rate": 6.928883916346163e-10, + "loss": 0.0708, + "step": 11807 + }, + { + "epoch": 0.99, + "grad_norm": 0.4798771990897857, + "learning_rate": 6.703574531979895e-10, + "loss": 0.1192, + "step": 11808 + }, + { + "epoch": 1.0, + "grad_norm": 0.6060549837871293, + "learning_rate": 6.481988939566197e-10, + "loss": 0.0802, + "step": 11809 + }, + { + "epoch": 1.0, + "grad_norm": 0.3112226618217803, + "learning_rate": 6.264127155602983e-10, + "loss": 0.0845, + "step": 11810 + }, + { + "epoch": 1.0, + "grad_norm": 0.25791594707481535, + "learning_rate": 6.049989196327266e-10, + "loss": 0.0778, + "step": 11811 + }, + { + "epoch": 1.0, + "grad_norm": 0.2072148950472449, + "learning_rate": 5.839575077681847e-10, + "loss": 0.0613, + "step": 11812 + }, + { + "epoch": 1.0, + "grad_norm": 0.40416325552104143, + "learning_rate": 5.632884815343076e-10, + "loss": 0.0906, + "step": 11813 + }, + { + "epoch": 1.0, + "grad_norm": 0.3188732296165041, + "learning_rate": 5.429918424709745e-10, + "loss": 0.0951, + "step": 11814 + }, + { + "epoch": 1.0, + "grad_norm": 0.49423694435251964, + "learning_rate": 5.230675920891992e-10, + "loss": 0.1411, + "step": 11815 + }, + { + "epoch": 1.0, + "grad_norm": 0.27927466824910835, + "learning_rate": 5.035157318733497e-10, + "loss": 0.0631, + "step": 11816 + }, + { + "epoch": 1.0, + "grad_norm": 0.20511027595949646, + "learning_rate": 4.843362632800386e-10, + "loss": 0.0389, + "step": 11817 + }, + { + "epoch": 1.0, + "grad_norm": 0.26438975230668127, + "learning_rate": 4.6552918773812296e-10, + "loss": 0.0703, + "step": 11818 + }, + { + "epoch": 1.0, + "grad_norm": 0.40528210505853113, + "learning_rate": 4.47094506647594e-10, + "loss": 0.1062, + "step": 11819 + }, + { + "epoch": 1.0, + "grad_norm": 0.4291638868690063, + "learning_rate": 4.290322213817977e-10, + "loss": 0.1282, + "step": 11820 + }, + { + "epoch": 1.0, + "grad_norm": 0.35866197170922853, + "learning_rate": 4.1134233328687934e-10, + "loss": 0.07, + "step": 11821 + }, + { + "epoch": 1.0, + "grad_norm": 0.35621801850970936, + "learning_rate": 3.940248436795635e-10, + "loss": 0.0929, + "step": 11822 + }, + { + "epoch": 1.0, + "grad_norm": 0.2747052331328941, + "learning_rate": 3.7707975384992933e-10, + "loss": 0.0839, + "step": 11823 + }, + { + "epoch": 1.0, + "grad_norm": 0.2665092032953628, + "learning_rate": 3.6050706506085556e-10, + "loss": 0.0534, + "step": 11824 + }, + { + "epoch": 1.0, + "grad_norm": 0.1927948336476631, + "learning_rate": 3.4430677854579984e-10, + "loss": 0.0607, + "step": 11825 + }, + { + "epoch": 1.0, + "grad_norm": 0.2877933525528284, + "learning_rate": 3.2847889551268497e-10, + "loss": 0.0791, + "step": 11826 + }, + { + "epoch": 1.0, + "grad_norm": 0.2910236027602921, + "learning_rate": 3.1302341713890236e-10, + "loss": 0.0702, + "step": 11827 + }, + { + "epoch": 1.0, + "grad_norm": 0.31989387698803146, + "learning_rate": 2.979403445768636e-10, + "loss": 0.0767, + "step": 11828 + }, + { + "epoch": 1.0, + "grad_norm": 0.2795830480951221, + "learning_rate": 2.8322967894955924e-10, + "loss": 0.0655, + "step": 11829 + }, + { + "epoch": 1.0, + "grad_norm": 0.2895280812639416, + "learning_rate": 2.6889142135277935e-10, + "loss": 0.0721, + "step": 11830 + }, + { + "epoch": 1.0, + "grad_norm": 0.2765020444213322, + "learning_rate": 2.5492557285455855e-10, + "loss": 0.0433, + "step": 11831 + }, + { + "epoch": 1.0, + "grad_norm": 0.22811937424412942, + "learning_rate": 2.413321344951758e-10, + "loss": 0.0468, + "step": 11832 + }, + { + "epoch": 1.0, + "grad_norm": 0.36940869497529055, + "learning_rate": 2.2811110728715446e-10, + "loss": 0.1222, + "step": 11833 + }, + { + "epoch": 1.0, + "grad_norm": 0.23832616470487086, + "learning_rate": 2.152624922152624e-10, + "loss": 0.0697, + "step": 11834 + }, + { + "epoch": 1.0, + "grad_norm": 0.34869234414938416, + "learning_rate": 2.0278629023651187e-10, + "loss": 0.1378, + "step": 11835 + }, + { + "epoch": 1.0, + "grad_norm": 0.23666897967073502, + "learning_rate": 1.906825022801595e-10, + "loss": 0.0478, + "step": 11836 + }, + { + "epoch": 1.0, + "grad_norm": 0.2768783721044072, + "learning_rate": 1.789511292477064e-10, + "loss": 0.0889, + "step": 11837 + }, + { + "epoch": 1.0, + "grad_norm": 0.47324168891344626, + "learning_rate": 1.6759217201289812e-10, + "loss": 0.1094, + "step": 11838 + }, + { + "epoch": 1.0, + "grad_norm": 0.20960050846261782, + "learning_rate": 1.566056314222797e-10, + "loss": 0.0588, + "step": 11839 + }, + { + "epoch": 1.0, + "grad_norm": 0.2168419461505182, + "learning_rate": 1.4599150829408548e-10, + "loss": 0.0493, + "step": 11840 + }, + { + "epoch": 1.0, + "grad_norm": 0.3725211561619002, + "learning_rate": 1.357498034187943e-10, + "loss": 0.114, + "step": 11841 + }, + { + "epoch": 1.0, + "grad_norm": 0.45960117822304525, + "learning_rate": 1.2588051755912934e-10, + "loss": 0.1351, + "step": 11842 + }, + { + "epoch": 1.0, + "grad_norm": 0.39346999596671195, + "learning_rate": 1.1638365145005825e-10, + "loss": 0.0908, + "step": 11843 + }, + { + "epoch": 1.0, + "grad_norm": 0.23300106828992764, + "learning_rate": 1.0725920579934823e-10, + "loss": 0.0574, + "step": 11844 + }, + { + "epoch": 1.0, + "grad_norm": 0.26171702441622763, + "learning_rate": 9.850718128645576e-11, + "loss": 0.0534, + "step": 11845 + }, + { + "epoch": 1.0, + "grad_norm": 0.2676382409966388, + "learning_rate": 9.012757856308173e-11, + "loss": 0.0615, + "step": 11846 + }, + { + "epoch": 1.0, + "grad_norm": 0.21658409419396413, + "learning_rate": 8.212039825428175e-11, + "loss": 0.0564, + "step": 11847 + }, + { + "epoch": 1.0, + "grad_norm": 0.25757406244161735, + "learning_rate": 7.448564095513533e-11, + "loss": 0.0522, + "step": 11848 + }, + { + "epoch": 1.0, + "grad_norm": 0.3447296366322997, + "learning_rate": 6.7223307235742e-11, + "loss": 0.0615, + "step": 11849 + }, + { + "epoch": 1.0, + "grad_norm": 0.25626614718999186, + "learning_rate": 6.033339763567014e-11, + "loss": 0.077, + "step": 11850 + }, + { + "epoch": 1.0, + "grad_norm": 0.5193133874999188, + "learning_rate": 5.381591266895303e-11, + "loss": 0.102, + "step": 11851 + }, + { + "epoch": 1.0, + "grad_norm": 0.29202820410701, + "learning_rate": 4.767085282075812e-11, + "loss": 0.0912, + "step": 11852 + }, + { + "epoch": 1.0, + "grad_norm": 0.3404326817516068, + "learning_rate": 4.1898218549607515e-11, + "loss": 0.0488, + "step": 11853 + }, + { + "epoch": 1.0, + "grad_norm": 0.262533264660476, + "learning_rate": 3.649801028404731e-11, + "loss": 0.0782, + "step": 11854 + }, + { + "epoch": 1.0, + "grad_norm": 0.5546942607194442, + "learning_rate": 3.147022842708847e-11, + "loss": 0.1401, + "step": 11855 + }, + { + "epoch": 1.0, + "grad_norm": 0.3351340884470931, + "learning_rate": 2.6814873353431248e-11, + "loss": 0.0975, + "step": 11856 + }, + { + "epoch": 1.0, + "grad_norm": 0.3395039146296503, + "learning_rate": 2.2531945409465238e-11, + "loss": 0.0513, + "step": 11857 + }, + { + "epoch": 1.0, + "grad_norm": 0.45189046368524854, + "learning_rate": 1.862144491437956e-11, + "loss": 0.0922, + "step": 11858 + }, + { + "epoch": 1.0, + "grad_norm": 0.15549157182987108, + "learning_rate": 1.5083372159607756e-11, + "loss": 0.0314, + "step": 11859 + }, + { + "epoch": 1.0, + "grad_norm": 0.5875523599919215, + "learning_rate": 1.1917727408272684e-11, + "loss": 0.12, + "step": 11860 + }, + { + "epoch": 1.0, + "grad_norm": 0.2973782432299746, + "learning_rate": 9.124510896851845e-12, + "loss": 0.0644, + "step": 11861 + }, + { + "epoch": 1.0, + "grad_norm": 0.3297972389410682, + "learning_rate": 6.703722832401838e-12, + "loss": 0.1083, + "step": 11862 + }, + { + "epoch": 1.0, + "grad_norm": 0.8574829781047126, + "learning_rate": 4.655363395889012e-12, + "loss": 0.1404, + "step": 11863 + }, + { + "epoch": 1.0, + "grad_norm": 0.4144200820695359, + "learning_rate": 2.9794327399690347e-12, + "loss": 0.0433, + "step": 11864 + }, + { + "epoch": 1.0, + "grad_norm": 0.6252006663130905, + "learning_rate": 1.6759309889868847e-12, + "loss": 0.0852, + "step": 11865 + }, + { + "epoch": 1.0, + "grad_norm": 0.5665002798454156, + "learning_rate": 7.448582406421878e-13, + "loss": 0.0929, + "step": 11866 + }, + { + "epoch": 1.0, + "grad_norm": 0.24785827729567578, + "learning_rate": 1.8621456376877177e-13, + "loss": 0.0672, + "step": 11867 + }, + { + "epoch": 1.0, + "grad_norm": 0.35324169525443694, + "learning_rate": 0.0, + "loss": 0.0631, + "step": 11868 + }, + { + "epoch": 1.0, + "step": 11868, + "total_flos": 452060205940736.0, + "train_loss": 0.08946360924744487, + "train_runtime": 32513.5487, + "train_samples_per_second": 0.73, + "train_steps_per_second": 0.365 + } + ], + "logging_steps": 1.0, + "max_steps": 11868, + "num_input_tokens_seen": 0, + "num_train_epochs": 1, + "save_steps": 50000, + "total_flos": 452060205940736.0, + "train_batch_size": 1, + "trial_name": null, + "trial_params": null +} diff --git a/llavaqwen2-google_siglip-so400m-patch14-384-Qwen_Qwen2-7B-Instruct-AG_v5_3_split06_all_mm_tune_olora256_512_llm/README.md b/llavaqwen2-google_siglip-so400m-patch14-384-Qwen_Qwen2-7B-Instruct-AG_v5_3_split06_all_mm_tune_olora256_512_llm/README.md new file mode 100644 index 0000000000000000000000000000000000000000..d4576fe074287232d3836bf69c21d3f2593290d9 --- /dev/null +++ b/llavaqwen2-google_siglip-so400m-patch14-384-Qwen_Qwen2-7B-Instruct-AG_v5_3_split06_all_mm_tune_olora256_512_llm/README.md @@ -0,0 +1,9 @@ +--- +library_name: peft +--- +## Training procedure + +### Framework versions + + +- PEFT 0.4.0 diff --git a/llavaqwen2-google_siglip-so400m-patch14-384-Qwen_Qwen2-7B-Instruct-AG_v5_3_split06_all_mm_tune_olora256_512_llm/adapter_config.json b/llavaqwen2-google_siglip-so400m-patch14-384-Qwen_Qwen2-7B-Instruct-AG_v5_3_split06_all_mm_tune_olora256_512_llm/adapter_config.json new file mode 100644 index 0000000000000000000000000000000000000000..522aa87957d3a02332bdcf639384a9346e2c385e --- /dev/null +++ b/llavaqwen2-google_siglip-so400m-patch14-384-Qwen_Qwen2-7B-Instruct-AG_v5_3_split06_all_mm_tune_olora256_512_llm/adapter_config.json @@ -0,0 +1,26 @@ +{ + "auto_mapping": null, + "base_model_name_or_path": "lmms-lab/LLaVA-Video-7B-Qwen2", + "bias": "none", + "fan_in_fan_out": false, + "inference_mode": true, + "init_lora_weights": "olora", + "layers_pattern": null, + "layers_to_transform": null, + "lora_alpha": 512, + "lora_dropout": 0.05, + "modules_to_save": null, + "peft_type": "LORA", + "r": 256, + "revision": null, + "target_modules": [ + "up_proj", + "v_proj", + "down_proj", + "k_proj", + "o_proj", + "q_proj", + "gate_proj" + ], + "task_type": "CAUSAL_LM" +} \ No newline at end of file diff --git a/llavaqwen2-google_siglip-so400m-patch14-384-Qwen_Qwen2-7B-Instruct-AG_v5_3_split06_all_mm_tune_olora256_512_llm/adapter_model.bin b/llavaqwen2-google_siglip-so400m-patch14-384-Qwen_Qwen2-7B-Instruct-AG_v5_3_split06_all_mm_tune_olora256_512_llm/adapter_model.bin new file mode 100644 index 0000000000000000000000000000000000000000..44715c42e785279c52bced63ae9d7c31e64040b4 --- /dev/null +++ b/llavaqwen2-google_siglip-so400m-patch14-384-Qwen_Qwen2-7B-Instruct-AG_v5_3_split06_all_mm_tune_olora256_512_llm/adapter_model.bin @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:acbd28cec4b78cb2d066189c430142cb98fff08ae2e35307dba74e28fe87f02d +size 1384057050 diff --git a/llavaqwen2-google_siglip-so400m-patch14-384-Qwen_Qwen2-7B-Instruct-AG_v5_3_split06_all_mm_tune_olora256_512_llm/config.json b/llavaqwen2-google_siglip-so400m-patch14-384-Qwen_Qwen2-7B-Instruct-AG_v5_3_split06_all_mm_tune_olora256_512_llm/config.json new file mode 100644 index 0000000000000000000000000000000000000000..1a67f02d0063c3de7740207b9ab2a3eb7be1cbe3 --- /dev/null +++ b/llavaqwen2-google_siglip-so400m-patch14-384-Qwen_Qwen2-7B-Instruct-AG_v5_3_split06_all_mm_tune_olora256_512_llm/config.json @@ -0,0 +1,221 @@ +{ + "_name_or_path": "lmms-lab/LLaVA-Video-7B-Qwen2", + "add_faster_video": false, + "add_time_instruction": true, + "architectures": [ + "LlavaQwenForCausalLM" + ], + "attention_dropout": 0.0, + "bos_token_id": 151643, + "eos_token_id": 151645, + "faster_token_stride": 10, + "force_sample": true, + "hidden_act": "silu", + "hidden_size": 3584, + "ignore_index": -100, + "image_aspect_ratio": "anyres_max_9", + "image_crop_resolution": null, + "image_grid_pinpoints": [ + [ + 384, + 384 + ], + [ + 384, + 768 + ], + [ + 384, + 1152 + ], + [ + 384, + 1536 + ], + [ + 384, + 1920 + ], + [ + 384, + 2304 + ], + [ + 768, + 384 + ], + [ + 768, + 768 + ], + [ + 768, + 1152 + ], + [ + 768, + 1536 + ], + [ + 768, + 1920 + ], + [ + 768, + 2304 + ], + [ + 1152, + 384 + ], + [ + 1152, + 768 + ], + [ + 1152, + 1152 + ], + [ + 1152, + 1536 + ], + [ + 1152, + 1920 + ], + [ + 1152, + 2304 + ], + [ + 1536, + 384 + ], + [ + 1536, + 768 + ], + [ + 1536, + 1152 + ], + [ + 1536, + 1536 + ], + [ + 1536, + 1920 + ], + [ + 1536, + 2304 + ], + [ + 1920, + 384 + ], + [ + 1920, + 768 + ], + [ + 1920, + 1152 + ], + [ + 1920, + 1536 + ], + [ + 1920, + 1920 + ], + [ + 1920, + 2304 + ], + [ + 2304, + 384 + ], + [ + 2304, + 768 + ], + [ + 2304, + 1152 + ], + [ + 2304, + 1536 + ], + [ + 2304, + 1920 + ], + [ + 2304, + 2304 + ] + ], + "image_split_resolution": null, + "image_token_index": 151646, + "initializer_range": 0.02, + "intermediate_size": 18944, + "max_position_embeddings": 32768, + "max_window_layers": 28, + "mm_hidden_size": 1152, + "mm_newline_position": "grid", + "mm_patch_merge_type": "spatial_unpad", + "mm_projector_lr": 2e-05, + "mm_projector_type": "mlp2x_gelu", + "mm_resampler_type": null, + "mm_spatial_pool_mode": "bilinear", + "mm_spatial_pool_stride": 2, + "mm_tunable_parts": "mm_vision_tower,mm_mlp_adapter,mm_language_model", + "mm_use_im_patch_token": false, + "mm_use_im_start_end": false, + "mm_vision_select_feature": "patch", + "mm_vision_select_layer": -2, + "mm_vision_tower": "google/siglip-so400m-patch14-384", + "mm_vision_tower_lr": null, + "model_type": "llava", + "num_attention_heads": 28, + "num_hidden_layers": 28, + "num_key_value_heads": 4, + "pos_skipping_range": 4096, + "projector_hidden_act": "gelu", + "rms_norm_eps": 1e-06, + "rope_scaling": null, + "rope_theta": 1000000.0, + "sliding_window": 131072, + "text_config": { + "model_type": "llama" + }, + "tie_word_embeddings": false, + "tokenizer_model_max_length": 32768, + "tokenizer_padding_side": "right", + "torch_dtype": "bfloat16", + "transformers_version": "4.40.0.dev0", + "use_cache": true, + "use_mm_proj": true, + "use_pos_skipping": false, + "use_sliding_window": false, + "vision_config": { + "hidden_size": 1024, + "image_size": 336, + "intermediate_size": 4096, + "model_type": "clip_vision_model", + "num_attention_heads": 16, + "num_hidden_layers": 24, + "patch_size": 14, + "projection_dim": 768, + "vocab_size": 32000 + }, + "vision_feature_layer": -2, + "vision_feature_select_strategy": "default", + "vision_tower_pretrained": null +} diff --git a/llavaqwen2-google_siglip-so400m-patch14-384-Qwen_Qwen2-7B-Instruct-AG_v5_3_split06_all_mm_tune_olora256_512_llm/generation_config.json b/llavaqwen2-google_siglip-so400m-patch14-384-Qwen_Qwen2-7B-Instruct-AG_v5_3_split06_all_mm_tune_olora256_512_llm/generation_config.json new file mode 100644 index 0000000000000000000000000000000000000000..19a297221acb87418d4388a3decef2282c6d7316 --- /dev/null +++ b/llavaqwen2-google_siglip-so400m-patch14-384-Qwen_Qwen2-7B-Instruct-AG_v5_3_split06_all_mm_tune_olora256_512_llm/generation_config.json @@ -0,0 +1,14 @@ +{ + "bos_token_id": 151643, + "do_sample": true, + "eos_token_id": [ + 151645, + 151643 + ], + "pad_token_id": 151643, + "repetition_penalty": 1.05, + "temperature": 0.7, + "top_k": 20, + "top_p": 0.8, + "transformers_version": "4.40.0.dev0" +} diff --git a/llavaqwen2-google_siglip-so400m-patch14-384-Qwen_Qwen2-7B-Instruct-AG_v5_3_split06_all_mm_tune_olora256_512_llm/non_lora_trainables.bin b/llavaqwen2-google_siglip-so400m-patch14-384-Qwen_Qwen2-7B-Instruct-AG_v5_3_split06_all_mm_tune_olora256_512_llm/non_lora_trainables.bin new file mode 100644 index 0000000000000000000000000000000000000000..76e56a68946b28450ca5e8d8b7eb2d0005823ce8 --- /dev/null +++ b/llavaqwen2-google_siglip-so400m-patch14-384-Qwen_Qwen2-7B-Instruct-AG_v5_3_split06_all_mm_tune_olora256_512_llm/non_lora_trainables.bin @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:55e845b34e8e2d9fb8f5b43868586b3bbc65e76f0957e62ba1f6478d5254b4e1 +size 33964208 diff --git a/llavaqwen2-google_siglip-so400m-patch14-384-Qwen_Qwen2-7B-Instruct-AG_v5_3_split06_all_mm_tune_olora256_512_llm/trainer_state.json b/llavaqwen2-google_siglip-so400m-patch14-384-Qwen_Qwen2-7B-Instruct-AG_v5_3_split06_all_mm_tune_olora256_512_llm/trainer_state.json new file mode 100644 index 0000000000000000000000000000000000000000..b545131aa0b283015d4d31c8e9aedc97d05183c9 --- /dev/null +++ b/llavaqwen2-google_siglip-so400m-patch14-384-Qwen_Qwen2-7B-Instruct-AG_v5_3_split06_all_mm_tune_olora256_512_llm/trainer_state.json @@ -0,0 +1,24530 @@ +{ + "best_metric": null, + "best_model_checkpoint": null, + "epoch": 1.0, + "eval_steps": 500, + "global_step": 3500, + "is_hyper_param_search": false, + "is_local_process_zero": true, + "is_world_process_zero": true, + "log_history": [ + { + "epoch": 0.0, + "grad_norm": 2.6559280289916924, + "learning_rate": 9.523809523809525e-08, + "loss": 0.7224, + "step": 1 + }, + { + "epoch": 0.0, + "grad_norm": 2.0096688648254406, + "learning_rate": 1.904761904761905e-07, + "loss": 0.4464, + "step": 2 + }, + { + "epoch": 0.0, + "grad_norm": 1.9032770462485882, + "learning_rate": 2.8571428571428575e-07, + "loss": 0.5203, + "step": 3 + }, + { + "epoch": 0.0, + "grad_norm": 1.9667735520114558, + "learning_rate": 3.80952380952381e-07, + "loss": 0.511, + "step": 4 + }, + { + "epoch": 0.0, + "grad_norm": 1.9814548453165757, + "learning_rate": 4.7619047619047623e-07, + "loss": 0.4734, + "step": 5 + }, + { + "epoch": 0.0, + "grad_norm": 2.1984625427959634, + "learning_rate": 5.714285714285715e-07, + "loss": 0.4914, + "step": 6 + }, + { + "epoch": 0.0, + "grad_norm": 2.0612080516412417, + "learning_rate": 6.666666666666667e-07, + "loss": 0.4076, + "step": 7 + }, + { + "epoch": 0.0, + "grad_norm": 1.792849022681951, + "learning_rate": 7.61904761904762e-07, + "loss": 0.5003, + "step": 8 + }, + { + "epoch": 0.0, + "grad_norm": 2.175499560707213, + "learning_rate": 8.571428571428572e-07, + "loss": 0.5635, + "step": 9 + }, + { + "epoch": 0.0, + "grad_norm": 3.0311328561733566, + "learning_rate": 9.523809523809525e-07, + "loss": 0.7983, + "step": 10 + }, + { + "epoch": 0.0, + "grad_norm": 1.8360302365975791, + "learning_rate": 1.0476190476190478e-06, + "loss": 0.5042, + "step": 11 + }, + { + "epoch": 0.0, + "grad_norm": 1.383169128978254, + "learning_rate": 1.142857142857143e-06, + "loss": 0.3251, + "step": 12 + }, + { + "epoch": 0.0, + "grad_norm": 1.9034153840049772, + "learning_rate": 1.2380952380952382e-06, + "loss": 0.465, + "step": 13 + }, + { + "epoch": 0.0, + "grad_norm": 4.775896938987523, + "learning_rate": 1.3333333333333334e-06, + "loss": 1.2175, + "step": 14 + }, + { + "epoch": 0.0, + "grad_norm": 2.30536972073011, + "learning_rate": 1.4285714285714286e-06, + "loss": 0.6381, + "step": 15 + }, + { + "epoch": 0.0, + "grad_norm": 1.722675415522589, + "learning_rate": 1.523809523809524e-06, + "loss": 0.4503, + "step": 16 + }, + { + "epoch": 0.0, + "grad_norm": 1.614486736580859, + "learning_rate": 1.6190476190476193e-06, + "loss": 0.4714, + "step": 17 + }, + { + "epoch": 0.01, + "grad_norm": 1.5196186528148072, + "learning_rate": 1.7142857142857145e-06, + "loss": 0.4606, + "step": 18 + }, + { + "epoch": 0.01, + "grad_norm": 1.497045818500941, + "learning_rate": 1.8095238095238097e-06, + "loss": 0.4179, + "step": 19 + }, + { + "epoch": 0.01, + "grad_norm": 2.0436549709147536, + "learning_rate": 1.904761904761905e-06, + "loss": 0.6045, + "step": 20 + }, + { + "epoch": 0.01, + "grad_norm": 1.6123040371644173, + "learning_rate": 2.0000000000000003e-06, + "loss": 0.4135, + "step": 21 + }, + { + "epoch": 0.01, + "grad_norm": 2.2013571425957115, + "learning_rate": 2.0952380952380955e-06, + "loss": 0.5183, + "step": 22 + }, + { + "epoch": 0.01, + "grad_norm": 1.6030687602677367, + "learning_rate": 2.1904761904761908e-06, + "loss": 0.4749, + "step": 23 + }, + { + "epoch": 0.01, + "grad_norm": 1.9166891584638006, + "learning_rate": 2.285714285714286e-06, + "loss": 0.5479, + "step": 24 + }, + { + "epoch": 0.01, + "grad_norm": 1.2055675952706235, + "learning_rate": 2.380952380952381e-06, + "loss": 0.3448, + "step": 25 + }, + { + "epoch": 0.01, + "grad_norm": 1.7046011655338569, + "learning_rate": 2.4761904761904764e-06, + "loss": 0.4165, + "step": 26 + }, + { + "epoch": 0.01, + "grad_norm": 1.3200060583775266, + "learning_rate": 2.571428571428571e-06, + "loss": 0.3484, + "step": 27 + }, + { + "epoch": 0.01, + "grad_norm": 2.156488808318543, + "learning_rate": 2.666666666666667e-06, + "loss": 0.593, + "step": 28 + }, + { + "epoch": 0.01, + "grad_norm": 1.3724071169263563, + "learning_rate": 2.7619047619047625e-06, + "loss": 0.3157, + "step": 29 + }, + { + "epoch": 0.01, + "grad_norm": 1.0618373948373527, + "learning_rate": 2.8571428571428573e-06, + "loss": 0.2136, + "step": 30 + }, + { + "epoch": 0.01, + "grad_norm": 1.2814610028366067, + "learning_rate": 2.9523809523809525e-06, + "loss": 0.3498, + "step": 31 + }, + { + "epoch": 0.01, + "grad_norm": 1.5198689941228432, + "learning_rate": 3.047619047619048e-06, + "loss": 0.3199, + "step": 32 + }, + { + "epoch": 0.01, + "grad_norm": 1.0259596723635873, + "learning_rate": 3.142857142857143e-06, + "loss": 0.2961, + "step": 33 + }, + { + "epoch": 0.01, + "grad_norm": 1.313082934999788, + "learning_rate": 3.2380952380952385e-06, + "loss": 0.2747, + "step": 34 + }, + { + "epoch": 0.01, + "grad_norm": 1.340821897956387, + "learning_rate": 3.3333333333333333e-06, + "loss": 0.2578, + "step": 35 + }, + { + "epoch": 0.01, + "grad_norm": 1.2018022664265944, + "learning_rate": 3.428571428571429e-06, + "loss": 0.2718, + "step": 36 + }, + { + "epoch": 0.01, + "grad_norm": 1.3186914769581646, + "learning_rate": 3.523809523809524e-06, + "loss": 0.2489, + "step": 37 + }, + { + "epoch": 0.01, + "grad_norm": 1.2992416650203018, + "learning_rate": 3.6190476190476194e-06, + "loss": 0.2993, + "step": 38 + }, + { + "epoch": 0.01, + "grad_norm": 2.772125365448492, + "learning_rate": 3.7142857142857146e-06, + "loss": 0.3999, + "step": 39 + }, + { + "epoch": 0.01, + "grad_norm": 0.8770991535129182, + "learning_rate": 3.80952380952381e-06, + "loss": 0.1851, + "step": 40 + }, + { + "epoch": 0.01, + "grad_norm": 0.8745028120708158, + "learning_rate": 3.9047619047619055e-06, + "loss": 0.1878, + "step": 41 + }, + { + "epoch": 0.01, + "grad_norm": 1.321115347273228, + "learning_rate": 4.000000000000001e-06, + "loss": 0.1735, + "step": 42 + }, + { + "epoch": 0.01, + "grad_norm": 1.0741926723658475, + "learning_rate": 4.095238095238096e-06, + "loss": 0.2032, + "step": 43 + }, + { + "epoch": 0.01, + "grad_norm": 1.1082442933568981, + "learning_rate": 4.190476190476191e-06, + "loss": 0.2134, + "step": 44 + }, + { + "epoch": 0.01, + "grad_norm": 1.0796947842685276, + "learning_rate": 4.2857142857142855e-06, + "loss": 0.2507, + "step": 45 + }, + { + "epoch": 0.01, + "grad_norm": 2.4517323089964154, + "learning_rate": 4.3809523809523815e-06, + "loss": 0.3937, + "step": 46 + }, + { + "epoch": 0.01, + "grad_norm": 1.5717801997670124, + "learning_rate": 4.476190476190477e-06, + "loss": 0.1819, + "step": 47 + }, + { + "epoch": 0.01, + "grad_norm": 2.1780618394564493, + "learning_rate": 4.571428571428572e-06, + "loss": 0.2757, + "step": 48 + }, + { + "epoch": 0.01, + "grad_norm": 0.9538752405202381, + "learning_rate": 4.666666666666667e-06, + "loss": 0.1732, + "step": 49 + }, + { + "epoch": 0.01, + "grad_norm": 1.5594028468303063, + "learning_rate": 4.761904761904762e-06, + "loss": 0.1659, + "step": 50 + }, + { + "epoch": 0.01, + "grad_norm": 0.9013709697820569, + "learning_rate": 4.857142857142858e-06, + "loss": 0.1457, + "step": 51 + }, + { + "epoch": 0.01, + "grad_norm": 9.316396604855784, + "learning_rate": 4.952380952380953e-06, + "loss": 0.2838, + "step": 52 + }, + { + "epoch": 0.02, + "grad_norm": 1.0144195208183198, + "learning_rate": 5.047619047619048e-06, + "loss": 0.2073, + "step": 53 + }, + { + "epoch": 0.02, + "grad_norm": 0.7753323685166975, + "learning_rate": 5.142857142857142e-06, + "loss": 0.1442, + "step": 54 + }, + { + "epoch": 0.02, + "grad_norm": 1.8248824555139371, + "learning_rate": 5.2380952380952384e-06, + "loss": 0.2562, + "step": 55 + }, + { + "epoch": 0.02, + "grad_norm": 0.7063235284172091, + "learning_rate": 5.333333333333334e-06, + "loss": 0.1598, + "step": 56 + }, + { + "epoch": 0.02, + "grad_norm": 0.9732468260011179, + "learning_rate": 5.428571428571429e-06, + "loss": 0.1233, + "step": 57 + }, + { + "epoch": 0.02, + "grad_norm": 2.165700586222653, + "learning_rate": 5.523809523809525e-06, + "loss": 0.2323, + "step": 58 + }, + { + "epoch": 0.02, + "grad_norm": 2.3168009929121847, + "learning_rate": 5.619047619047619e-06, + "loss": 0.2518, + "step": 59 + }, + { + "epoch": 0.02, + "grad_norm": 1.233880179354882, + "learning_rate": 5.7142857142857145e-06, + "loss": 0.1262, + "step": 60 + }, + { + "epoch": 0.02, + "grad_norm": 1.0209711977795521, + "learning_rate": 5.8095238095238106e-06, + "loss": 0.1771, + "step": 61 + }, + { + "epoch": 0.02, + "grad_norm": 1.2692656195439345, + "learning_rate": 5.904761904761905e-06, + "loss": 0.1833, + "step": 62 + }, + { + "epoch": 0.02, + "grad_norm": 1.2779359909698755, + "learning_rate": 6e-06, + "loss": 0.1395, + "step": 63 + }, + { + "epoch": 0.02, + "grad_norm": 0.9967245397604485, + "learning_rate": 6.095238095238096e-06, + "loss": 0.1365, + "step": 64 + }, + { + "epoch": 0.02, + "grad_norm": 1.094172667670302, + "learning_rate": 6.1904761904761914e-06, + "loss": 0.1587, + "step": 65 + }, + { + "epoch": 0.02, + "grad_norm": 2.837919991285461, + "learning_rate": 6.285714285714286e-06, + "loss": 0.3666, + "step": 66 + }, + { + "epoch": 0.02, + "grad_norm": 0.5700070650907721, + "learning_rate": 6.380952380952381e-06, + "loss": 0.1186, + "step": 67 + }, + { + "epoch": 0.02, + "grad_norm": 1.3198936509025527, + "learning_rate": 6.476190476190477e-06, + "loss": 0.1711, + "step": 68 + }, + { + "epoch": 0.02, + "grad_norm": 1.8045994016978977, + "learning_rate": 6.571428571428572e-06, + "loss": 0.209, + "step": 69 + }, + { + "epoch": 0.02, + "grad_norm": 0.6758420381151559, + "learning_rate": 6.666666666666667e-06, + "loss": 0.1143, + "step": 70 + }, + { + "epoch": 0.02, + "grad_norm": 0.8025693699018324, + "learning_rate": 6.761904761904763e-06, + "loss": 0.1159, + "step": 71 + }, + { + "epoch": 0.02, + "grad_norm": 0.8422640734211968, + "learning_rate": 6.857142857142858e-06, + "loss": 0.085, + "step": 72 + }, + { + "epoch": 0.02, + "grad_norm": 0.6451656843101513, + "learning_rate": 6.952380952380952e-06, + "loss": 0.0848, + "step": 73 + }, + { + "epoch": 0.02, + "grad_norm": 1.5481845662341531, + "learning_rate": 7.047619047619048e-06, + "loss": 0.1838, + "step": 74 + }, + { + "epoch": 0.02, + "grad_norm": 0.6814230718344307, + "learning_rate": 7.1428571428571436e-06, + "loss": 0.0924, + "step": 75 + }, + { + "epoch": 0.02, + "grad_norm": 0.6534419780740991, + "learning_rate": 7.238095238095239e-06, + "loss": 0.0892, + "step": 76 + }, + { + "epoch": 0.02, + "grad_norm": 0.6311335037710308, + "learning_rate": 7.333333333333333e-06, + "loss": 0.138, + "step": 77 + }, + { + "epoch": 0.02, + "grad_norm": 0.7107753773840428, + "learning_rate": 7.428571428571429e-06, + "loss": 0.1209, + "step": 78 + }, + { + "epoch": 0.02, + "grad_norm": 1.18785073387853, + "learning_rate": 7.523809523809524e-06, + "loss": 0.1696, + "step": 79 + }, + { + "epoch": 0.02, + "grad_norm": 0.8769210319622615, + "learning_rate": 7.61904761904762e-06, + "loss": 0.1147, + "step": 80 + }, + { + "epoch": 0.02, + "grad_norm": 2.9972481202089707, + "learning_rate": 7.714285714285716e-06, + "loss": 0.2136, + "step": 81 + }, + { + "epoch": 0.02, + "grad_norm": 0.871714731096346, + "learning_rate": 7.809523809523811e-06, + "loss": 0.1671, + "step": 82 + }, + { + "epoch": 0.02, + "grad_norm": 0.9040197324423482, + "learning_rate": 7.904761904761904e-06, + "loss": 0.1314, + "step": 83 + }, + { + "epoch": 0.02, + "grad_norm": 0.8353400083573649, + "learning_rate": 8.000000000000001e-06, + "loss": 0.1492, + "step": 84 + }, + { + "epoch": 0.02, + "grad_norm": 2.1965167723105985, + "learning_rate": 8.095238095238097e-06, + "loss": 0.1899, + "step": 85 + }, + { + "epoch": 0.02, + "grad_norm": 0.6970684913235916, + "learning_rate": 8.190476190476192e-06, + "loss": 0.1167, + "step": 86 + }, + { + "epoch": 0.02, + "grad_norm": 1.0004972692904666, + "learning_rate": 8.285714285714287e-06, + "loss": 0.1797, + "step": 87 + }, + { + "epoch": 0.03, + "grad_norm": 0.682149243805965, + "learning_rate": 8.380952380952382e-06, + "loss": 0.14, + "step": 88 + }, + { + "epoch": 0.03, + "grad_norm": 0.6844206391813945, + "learning_rate": 8.476190476190477e-06, + "loss": 0.0611, + "step": 89 + }, + { + "epoch": 0.03, + "grad_norm": 0.9019633116200761, + "learning_rate": 8.571428571428571e-06, + "loss": 0.1573, + "step": 90 + }, + { + "epoch": 0.03, + "grad_norm": 1.0176943533714722, + "learning_rate": 8.666666666666668e-06, + "loss": 0.1504, + "step": 91 + }, + { + "epoch": 0.03, + "grad_norm": 0.7358111333939853, + "learning_rate": 8.761904761904763e-06, + "loss": 0.1435, + "step": 92 + }, + { + "epoch": 0.03, + "grad_norm": 0.5235847912021016, + "learning_rate": 8.857142857142858e-06, + "loss": 0.0699, + "step": 93 + }, + { + "epoch": 0.03, + "grad_norm": 1.0174063008813663, + "learning_rate": 8.952380952380953e-06, + "loss": 0.1368, + "step": 94 + }, + { + "epoch": 0.03, + "grad_norm": 0.7967069556065175, + "learning_rate": 9.047619047619049e-06, + "loss": 0.1081, + "step": 95 + }, + { + "epoch": 0.03, + "grad_norm": 1.3654155000995285, + "learning_rate": 9.142857142857144e-06, + "loss": 0.1395, + "step": 96 + }, + { + "epoch": 0.03, + "grad_norm": 1.6357033696427046, + "learning_rate": 9.238095238095239e-06, + "loss": 0.1652, + "step": 97 + }, + { + "epoch": 0.03, + "grad_norm": 0.8980953495205466, + "learning_rate": 9.333333333333334e-06, + "loss": 0.1171, + "step": 98 + }, + { + "epoch": 0.03, + "grad_norm": 0.8913864148939407, + "learning_rate": 9.42857142857143e-06, + "loss": 0.1599, + "step": 99 + }, + { + "epoch": 0.03, + "grad_norm": 1.544543553668204, + "learning_rate": 9.523809523809525e-06, + "loss": 0.1868, + "step": 100 + }, + { + "epoch": 0.03, + "grad_norm": 0.8894016880965862, + "learning_rate": 9.61904761904762e-06, + "loss": 0.086, + "step": 101 + }, + { + "epoch": 0.03, + "grad_norm": 0.9990694939814734, + "learning_rate": 9.714285714285715e-06, + "loss": 0.1195, + "step": 102 + }, + { + "epoch": 0.03, + "grad_norm": 0.8996897056834422, + "learning_rate": 9.80952380952381e-06, + "loss": 0.0993, + "step": 103 + }, + { + "epoch": 0.03, + "grad_norm": 0.7239166665180842, + "learning_rate": 9.904761904761906e-06, + "loss": 0.0803, + "step": 104 + }, + { + "epoch": 0.03, + "grad_norm": 0.5666278173832199, + "learning_rate": 1e-05, + "loss": 0.0864, + "step": 105 + }, + { + "epoch": 0.03, + "grad_norm": 0.6233138913957039, + "learning_rate": 9.999997859278526e-06, + "loss": 0.1164, + "step": 106 + }, + { + "epoch": 0.03, + "grad_norm": 1.883625227676478, + "learning_rate": 9.999991437115933e-06, + "loss": 0.214, + "step": 107 + }, + { + "epoch": 0.03, + "grad_norm": 0.9319416016251865, + "learning_rate": 9.999980733517723e-06, + "loss": 0.1669, + "step": 108 + }, + { + "epoch": 0.03, + "grad_norm": 0.6660261953169586, + "learning_rate": 9.999965748493058e-06, + "loss": 0.0889, + "step": 109 + }, + { + "epoch": 0.03, + "grad_norm": 0.8812531589326963, + "learning_rate": 9.999946482054771e-06, + "loss": 0.1087, + "step": 110 + }, + { + "epoch": 0.03, + "grad_norm": 1.013938509272412, + "learning_rate": 9.999922934219364e-06, + "loss": 0.16, + "step": 111 + }, + { + "epoch": 0.03, + "grad_norm": 0.779924825342806, + "learning_rate": 9.999895105006995e-06, + "loss": 0.1171, + "step": 112 + }, + { + "epoch": 0.03, + "grad_norm": 1.5518981538318566, + "learning_rate": 9.999862994441496e-06, + "loss": 0.1687, + "step": 113 + }, + { + "epoch": 0.03, + "grad_norm": 0.48999398014184997, + "learning_rate": 9.999826602550361e-06, + "loss": 0.0823, + "step": 114 + }, + { + "epoch": 0.03, + "grad_norm": 1.0379184831245838, + "learning_rate": 9.999785929364756e-06, + "loss": 0.0916, + "step": 115 + }, + { + "epoch": 0.03, + "grad_norm": 0.7814552871771392, + "learning_rate": 9.999740974919506e-06, + "loss": 0.1048, + "step": 116 + }, + { + "epoch": 0.03, + "grad_norm": 1.0459600546375207, + "learning_rate": 9.999691739253106e-06, + "loss": 0.1067, + "step": 117 + }, + { + "epoch": 0.03, + "grad_norm": 0.8687944034663304, + "learning_rate": 9.999638222407715e-06, + "loss": 0.0879, + "step": 118 + }, + { + "epoch": 0.03, + "grad_norm": 1.9992034541083337, + "learning_rate": 9.99958042442916e-06, + "loss": 0.1812, + "step": 119 + }, + { + "epoch": 0.03, + "grad_norm": 0.6549484173810968, + "learning_rate": 9.999518345366933e-06, + "loss": 0.1085, + "step": 120 + }, + { + "epoch": 0.03, + "grad_norm": 0.9620760826195979, + "learning_rate": 9.999451985274188e-06, + "loss": 0.1608, + "step": 121 + }, + { + "epoch": 0.03, + "grad_norm": 0.7995800634969509, + "learning_rate": 9.999381344207753e-06, + "loss": 0.0873, + "step": 122 + }, + { + "epoch": 0.04, + "grad_norm": 0.6696605238491989, + "learning_rate": 9.999306422228117e-06, + "loss": 0.1115, + "step": 123 + }, + { + "epoch": 0.04, + "grad_norm": 0.7509671990302248, + "learning_rate": 9.99922721939943e-06, + "loss": 0.1154, + "step": 124 + }, + { + "epoch": 0.04, + "grad_norm": 0.9825079833026981, + "learning_rate": 9.999143735789518e-06, + "loss": 0.1376, + "step": 125 + }, + { + "epoch": 0.04, + "grad_norm": 1.0799052477502589, + "learning_rate": 9.999055971469864e-06, + "loss": 0.1994, + "step": 126 + }, + { + "epoch": 0.04, + "grad_norm": 0.6075380890888262, + "learning_rate": 9.99896392651562e-06, + "loss": 0.1142, + "step": 127 + }, + { + "epoch": 0.04, + "grad_norm": 0.9286626226849513, + "learning_rate": 9.998867601005604e-06, + "loss": 0.1005, + "step": 128 + }, + { + "epoch": 0.04, + "grad_norm": 3.487131521035367, + "learning_rate": 9.998766995022297e-06, + "loss": 0.1936, + "step": 129 + }, + { + "epoch": 0.04, + "grad_norm": 0.6084846497507734, + "learning_rate": 9.99866210865185e-06, + "loss": 0.1044, + "step": 130 + }, + { + "epoch": 0.04, + "grad_norm": 0.7328283271168189, + "learning_rate": 9.99855294198407e-06, + "loss": 0.1114, + "step": 131 + }, + { + "epoch": 0.04, + "grad_norm": 0.9095013063381632, + "learning_rate": 9.998439495112439e-06, + "loss": 0.0944, + "step": 132 + }, + { + "epoch": 0.04, + "grad_norm": 1.8867638564351887, + "learning_rate": 9.998321768134101e-06, + "loss": 0.1078, + "step": 133 + }, + { + "epoch": 0.04, + "grad_norm": 0.5672820705386512, + "learning_rate": 9.998199761149865e-06, + "loss": 0.0866, + "step": 134 + }, + { + "epoch": 0.04, + "grad_norm": 0.725001728131153, + "learning_rate": 9.9980734742642e-06, + "loss": 0.0989, + "step": 135 + }, + { + "epoch": 0.04, + "grad_norm": 0.775556653527424, + "learning_rate": 9.997942907585247e-06, + "loss": 0.1185, + "step": 136 + }, + { + "epoch": 0.04, + "grad_norm": 1.0529065649602682, + "learning_rate": 9.99780806122481e-06, + "loss": 0.1798, + "step": 137 + }, + { + "epoch": 0.04, + "grad_norm": 0.5803911239780993, + "learning_rate": 9.997668935298353e-06, + "loss": 0.1033, + "step": 138 + }, + { + "epoch": 0.04, + "grad_norm": 0.9432373674731988, + "learning_rate": 9.997525529925008e-06, + "loss": 0.1244, + "step": 139 + }, + { + "epoch": 0.04, + "grad_norm": 0.41233635150763165, + "learning_rate": 9.997377845227577e-06, + "loss": 0.0681, + "step": 140 + }, + { + "epoch": 0.04, + "grad_norm": 0.7619125890832402, + "learning_rate": 9.997225881332513e-06, + "loss": 0.1189, + "step": 141 + }, + { + "epoch": 0.04, + "grad_norm": 1.731063900905247, + "learning_rate": 9.997069638369945e-06, + "loss": 0.1726, + "step": 142 + }, + { + "epoch": 0.04, + "grad_norm": 0.7350723280008996, + "learning_rate": 9.996909116473663e-06, + "loss": 0.132, + "step": 143 + }, + { + "epoch": 0.04, + "grad_norm": 0.885499903881568, + "learning_rate": 9.996744315781119e-06, + "loss": 0.1003, + "step": 144 + }, + { + "epoch": 0.04, + "grad_norm": 0.6107676956417163, + "learning_rate": 9.996575236433428e-06, + "loss": 0.0939, + "step": 145 + }, + { + "epoch": 0.04, + "grad_norm": 0.6855064867583867, + "learning_rate": 9.996401878575374e-06, + "loss": 0.1145, + "step": 146 + }, + { + "epoch": 0.04, + "grad_norm": 0.7232702300952882, + "learning_rate": 9.9962242423554e-06, + "loss": 0.1199, + "step": 147 + }, + { + "epoch": 0.04, + "grad_norm": 0.6744310151901042, + "learning_rate": 9.996042327925613e-06, + "loss": 0.0989, + "step": 148 + }, + { + "epoch": 0.04, + "grad_norm": 0.5170328702922494, + "learning_rate": 9.995856135441785e-06, + "loss": 0.0878, + "step": 149 + }, + { + "epoch": 0.04, + "grad_norm": 0.7400293830583283, + "learning_rate": 9.995665665063349e-06, + "loss": 0.1073, + "step": 150 + }, + { + "epoch": 0.04, + "grad_norm": 1.338869327388438, + "learning_rate": 9.995470916953405e-06, + "loss": 0.112, + "step": 151 + }, + { + "epoch": 0.04, + "grad_norm": 0.9744608681284141, + "learning_rate": 9.995271891278712e-06, + "loss": 0.1578, + "step": 152 + }, + { + "epoch": 0.04, + "grad_norm": 0.7245415196591839, + "learning_rate": 9.995068588209695e-06, + "loss": 0.1492, + "step": 153 + }, + { + "epoch": 0.04, + "grad_norm": 2.1864074350226086, + "learning_rate": 9.99486100792044e-06, + "loss": 0.151, + "step": 154 + }, + { + "epoch": 0.04, + "grad_norm": 1.0998672949312496, + "learning_rate": 9.994649150588694e-06, + "loss": 0.1328, + "step": 155 + }, + { + "epoch": 0.04, + "grad_norm": 0.7878447133281667, + "learning_rate": 9.994433016395868e-06, + "loss": 0.1173, + "step": 156 + }, + { + "epoch": 0.04, + "grad_norm": 1.179693118007151, + "learning_rate": 9.994212605527036e-06, + "loss": 0.1761, + "step": 157 + }, + { + "epoch": 0.05, + "grad_norm": 0.7837054114786819, + "learning_rate": 9.993987918170934e-06, + "loss": 0.0911, + "step": 158 + }, + { + "epoch": 0.05, + "grad_norm": 0.9016876537829598, + "learning_rate": 9.993758954519957e-06, + "loss": 0.1287, + "step": 159 + }, + { + "epoch": 0.05, + "grad_norm": 0.5654627349734971, + "learning_rate": 9.993525714770167e-06, + "loss": 0.1209, + "step": 160 + }, + { + "epoch": 0.05, + "grad_norm": 0.9325146601897986, + "learning_rate": 9.993288199121283e-06, + "loss": 0.1456, + "step": 161 + }, + { + "epoch": 0.05, + "grad_norm": 0.8011664414955603, + "learning_rate": 9.993046407776687e-06, + "loss": 0.0878, + "step": 162 + }, + { + "epoch": 0.05, + "grad_norm": 0.5265780114164411, + "learning_rate": 9.992800340943421e-06, + "loss": 0.09, + "step": 163 + }, + { + "epoch": 0.05, + "grad_norm": 0.7413159658677737, + "learning_rate": 9.992549998832192e-06, + "loss": 0.1522, + "step": 164 + }, + { + "epoch": 0.05, + "grad_norm": 6.64797029238668, + "learning_rate": 9.992295381657361e-06, + "loss": 0.2691, + "step": 165 + }, + { + "epoch": 0.05, + "grad_norm": 0.7338495948785921, + "learning_rate": 9.992036489636959e-06, + "loss": 0.1198, + "step": 166 + }, + { + "epoch": 0.05, + "grad_norm": 0.6518044519539281, + "learning_rate": 9.991773322992668e-06, + "loss": 0.1048, + "step": 167 + }, + { + "epoch": 0.05, + "grad_norm": 1.2989102160862043, + "learning_rate": 9.991505881949837e-06, + "loss": 0.1561, + "step": 168 + }, + { + "epoch": 0.05, + "grad_norm": 0.5184978052251222, + "learning_rate": 9.991234166737472e-06, + "loss": 0.0994, + "step": 169 + }, + { + "epoch": 0.05, + "grad_norm": 0.8114606660007323, + "learning_rate": 9.990958177588236e-06, + "loss": 0.0921, + "step": 170 + }, + { + "epoch": 0.05, + "grad_norm": 0.5405227025965889, + "learning_rate": 9.990677914738462e-06, + "loss": 0.1154, + "step": 171 + }, + { + "epoch": 0.05, + "grad_norm": 0.6854086725894425, + "learning_rate": 9.990393378428133e-06, + "loss": 0.0664, + "step": 172 + }, + { + "epoch": 0.05, + "grad_norm": 0.607785348904251, + "learning_rate": 9.990104568900894e-06, + "loss": 0.1144, + "step": 173 + }, + { + "epoch": 0.05, + "grad_norm": 1.3499216602173765, + "learning_rate": 9.989811486404047e-06, + "loss": 0.1298, + "step": 174 + }, + { + "epoch": 0.05, + "grad_norm": 0.6640856742707212, + "learning_rate": 9.98951413118856e-06, + "loss": 0.1199, + "step": 175 + }, + { + "epoch": 0.05, + "grad_norm": 0.7258902684307403, + "learning_rate": 9.989212503509051e-06, + "loss": 0.1582, + "step": 176 + }, + { + "epoch": 0.05, + "grad_norm": 0.6374866990595666, + "learning_rate": 9.9889066036238e-06, + "loss": 0.123, + "step": 177 + }, + { + "epoch": 0.05, + "grad_norm": 0.6473255925101078, + "learning_rate": 9.988596431794749e-06, + "loss": 0.0846, + "step": 178 + }, + { + "epoch": 0.05, + "grad_norm": 0.7056436245707706, + "learning_rate": 9.988281988287494e-06, + "loss": 0.1242, + "step": 179 + }, + { + "epoch": 0.05, + "grad_norm": 0.5768846856508759, + "learning_rate": 9.987963273371287e-06, + "loss": 0.0912, + "step": 180 + }, + { + "epoch": 0.05, + "grad_norm": 2.2463463020099645, + "learning_rate": 9.987640287319041e-06, + "loss": 0.2249, + "step": 181 + }, + { + "epoch": 0.05, + "grad_norm": 0.7353683654933066, + "learning_rate": 9.987313030407325e-06, + "loss": 0.1127, + "step": 182 + }, + { + "epoch": 0.05, + "grad_norm": 2.1174838155664126, + "learning_rate": 9.986981502916364e-06, + "loss": 0.1731, + "step": 183 + }, + { + "epoch": 0.05, + "grad_norm": 1.2635041592029812, + "learning_rate": 9.986645705130046e-06, + "loss": 0.2071, + "step": 184 + }, + { + "epoch": 0.05, + "grad_norm": 0.9739326159678383, + "learning_rate": 9.986305637335907e-06, + "loss": 0.1323, + "step": 185 + }, + { + "epoch": 0.05, + "grad_norm": 1.1746961119873447, + "learning_rate": 9.985961299825144e-06, + "loss": 0.1474, + "step": 186 + }, + { + "epoch": 0.05, + "grad_norm": 1.0224320471521582, + "learning_rate": 9.985612692892608e-06, + "loss": 0.1555, + "step": 187 + }, + { + "epoch": 0.05, + "grad_norm": 0.7115155142503841, + "learning_rate": 9.985259816836809e-06, + "loss": 0.1235, + "step": 188 + }, + { + "epoch": 0.05, + "grad_norm": 1.3880477340080042, + "learning_rate": 9.984902671959911e-06, + "loss": 0.0968, + "step": 189 + }, + { + "epoch": 0.05, + "grad_norm": 0.5229836416130678, + "learning_rate": 9.984541258567732e-06, + "loss": 0.091, + "step": 190 + }, + { + "epoch": 0.05, + "grad_norm": 0.9859563440891488, + "learning_rate": 9.984175576969746e-06, + "loss": 0.1875, + "step": 191 + }, + { + "epoch": 0.05, + "grad_norm": 0.8937978532083918, + "learning_rate": 9.983805627479081e-06, + "loss": 0.1528, + "step": 192 + }, + { + "epoch": 0.06, + "grad_norm": 0.6714263410693669, + "learning_rate": 9.983431410412525e-06, + "loss": 0.1248, + "step": 193 + }, + { + "epoch": 0.06, + "grad_norm": 0.6051274231913827, + "learning_rate": 9.98305292609051e-06, + "loss": 0.0963, + "step": 194 + }, + { + "epoch": 0.06, + "grad_norm": 1.2791179950988925, + "learning_rate": 9.982670174837131e-06, + "loss": 0.1546, + "step": 195 + }, + { + "epoch": 0.06, + "grad_norm": 0.5526581263372871, + "learning_rate": 9.982283156980133e-06, + "loss": 0.0974, + "step": 196 + }, + { + "epoch": 0.06, + "grad_norm": 0.5469715280915156, + "learning_rate": 9.981891872850915e-06, + "loss": 0.0827, + "step": 197 + }, + { + "epoch": 0.06, + "grad_norm": 2.3269280337142058, + "learning_rate": 9.98149632278453e-06, + "loss": 0.1549, + "step": 198 + }, + { + "epoch": 0.06, + "grad_norm": 0.6792026352415177, + "learning_rate": 9.98109650711968e-06, + "loss": 0.1237, + "step": 199 + }, + { + "epoch": 0.06, + "grad_norm": 0.5262828220255815, + "learning_rate": 9.980692426198728e-06, + "loss": 0.0865, + "step": 200 + }, + { + "epoch": 0.06, + "grad_norm": 0.5655283422306094, + "learning_rate": 9.98028408036768e-06, + "loss": 0.0797, + "step": 201 + }, + { + "epoch": 0.06, + "grad_norm": 0.49899879221140164, + "learning_rate": 9.979871469976197e-06, + "loss": 0.0872, + "step": 202 + }, + { + "epoch": 0.06, + "grad_norm": 2.023023912058545, + "learning_rate": 9.979454595377594e-06, + "loss": 0.1188, + "step": 203 + }, + { + "epoch": 0.06, + "grad_norm": 0.9585597431837045, + "learning_rate": 9.979033456928834e-06, + "loss": 0.0951, + "step": 204 + }, + { + "epoch": 0.06, + "grad_norm": 0.7593341762338881, + "learning_rate": 9.97860805499054e-06, + "loss": 0.1618, + "step": 205 + }, + { + "epoch": 0.06, + "grad_norm": 0.9878764238767545, + "learning_rate": 9.97817838992697e-06, + "loss": 0.1287, + "step": 206 + }, + { + "epoch": 0.06, + "grad_norm": 0.6226373819190023, + "learning_rate": 9.977744462106048e-06, + "loss": 0.1225, + "step": 207 + }, + { + "epoch": 0.06, + "grad_norm": 0.9282611042570976, + "learning_rate": 9.977306271899336e-06, + "loss": 0.1514, + "step": 208 + }, + { + "epoch": 0.06, + "grad_norm": 0.6124797469277562, + "learning_rate": 9.976863819682055e-06, + "loss": 0.11, + "step": 209 + }, + { + "epoch": 0.06, + "grad_norm": 0.7966969818313082, + "learning_rate": 9.97641710583307e-06, + "loss": 0.1481, + "step": 210 + }, + { + "epoch": 0.06, + "grad_norm": 0.6708286500114512, + "learning_rate": 9.9759661307349e-06, + "loss": 0.1071, + "step": 211 + }, + { + "epoch": 0.06, + "grad_norm": 1.3359148785200574, + "learning_rate": 9.975510894773705e-06, + "loss": 0.139, + "step": 212 + }, + { + "epoch": 0.06, + "grad_norm": 1.1732557367642897, + "learning_rate": 9.975051398339302e-06, + "loss": 0.1458, + "step": 213 + }, + { + "epoch": 0.06, + "grad_norm": 1.0954552864142109, + "learning_rate": 9.97458764182515e-06, + "loss": 0.1851, + "step": 214 + }, + { + "epoch": 0.06, + "grad_norm": 0.5668284518872119, + "learning_rate": 9.97411962562836e-06, + "loss": 0.1049, + "step": 215 + }, + { + "epoch": 0.06, + "grad_norm": 0.413407712524076, + "learning_rate": 9.97364735014969e-06, + "loss": 0.0924, + "step": 216 + }, + { + "epoch": 0.06, + "grad_norm": 0.6846820606475784, + "learning_rate": 9.973170815793543e-06, + "loss": 0.1567, + "step": 217 + }, + { + "epoch": 0.06, + "grad_norm": 0.7104955831606532, + "learning_rate": 9.97269002296797e-06, + "loss": 0.0961, + "step": 218 + }, + { + "epoch": 0.06, + "grad_norm": 0.9088643628647692, + "learning_rate": 9.972204972084667e-06, + "loss": 0.1577, + "step": 219 + }, + { + "epoch": 0.06, + "grad_norm": 1.2063379291826812, + "learning_rate": 9.971715663558978e-06, + "loss": 0.1569, + "step": 220 + }, + { + "epoch": 0.06, + "grad_norm": 0.6950750179869627, + "learning_rate": 9.971222097809895e-06, + "loss": 0.1179, + "step": 221 + }, + { + "epoch": 0.06, + "grad_norm": 0.6239492884456627, + "learning_rate": 9.97072427526005e-06, + "loss": 0.0964, + "step": 222 + }, + { + "epoch": 0.06, + "grad_norm": 1.074717542517687, + "learning_rate": 9.970222196335724e-06, + "loss": 0.1077, + "step": 223 + }, + { + "epoch": 0.06, + "grad_norm": 0.9295098911783852, + "learning_rate": 9.969715861466839e-06, + "loss": 0.1238, + "step": 224 + }, + { + "epoch": 0.06, + "grad_norm": 0.7725413999922282, + "learning_rate": 9.969205271086969e-06, + "loss": 0.157, + "step": 225 + }, + { + "epoch": 0.06, + "grad_norm": 1.0391873748724498, + "learning_rate": 9.96869042563332e-06, + "loss": 0.1547, + "step": 226 + }, + { + "epoch": 0.06, + "grad_norm": 0.7404030523492874, + "learning_rate": 9.968171325546754e-06, + "loss": 0.1077, + "step": 227 + }, + { + "epoch": 0.07, + "grad_norm": 1.0163578185462376, + "learning_rate": 9.967647971271769e-06, + "loss": 0.2065, + "step": 228 + }, + { + "epoch": 0.07, + "grad_norm": 0.4508114700967041, + "learning_rate": 9.967120363256503e-06, + "loss": 0.0665, + "step": 229 + }, + { + "epoch": 0.07, + "grad_norm": 2.5555517780970542, + "learning_rate": 9.966588501952747e-06, + "loss": 0.0767, + "step": 230 + }, + { + "epoch": 0.07, + "grad_norm": 0.7702021933859821, + "learning_rate": 9.966052387815923e-06, + "loss": 0.1436, + "step": 231 + }, + { + "epoch": 0.07, + "grad_norm": 0.8275984175997171, + "learning_rate": 9.965512021305104e-06, + "loss": 0.1149, + "step": 232 + }, + { + "epoch": 0.07, + "grad_norm": 0.4995501608117536, + "learning_rate": 9.964967402882995e-06, + "loss": 0.0996, + "step": 233 + }, + { + "epoch": 0.07, + "grad_norm": 0.5660501473102331, + "learning_rate": 9.96441853301595e-06, + "loss": 0.1113, + "step": 234 + }, + { + "epoch": 0.07, + "grad_norm": 0.8238187403852497, + "learning_rate": 9.963865412173958e-06, + "loss": 0.1186, + "step": 235 + }, + { + "epoch": 0.07, + "grad_norm": 0.7875749448428275, + "learning_rate": 9.96330804083065e-06, + "loss": 0.1162, + "step": 236 + }, + { + "epoch": 0.07, + "grad_norm": 1.0298150673502589, + "learning_rate": 9.962746419463298e-06, + "loss": 0.1724, + "step": 237 + }, + { + "epoch": 0.07, + "grad_norm": 0.7735969323058663, + "learning_rate": 9.962180548552812e-06, + "loss": 0.1237, + "step": 238 + }, + { + "epoch": 0.07, + "grad_norm": 0.930852149800409, + "learning_rate": 9.96161042858374e-06, + "loss": 0.169, + "step": 239 + }, + { + "epoch": 0.07, + "grad_norm": 0.5392105608782614, + "learning_rate": 9.961036060044268e-06, + "loss": 0.0713, + "step": 240 + }, + { + "epoch": 0.07, + "grad_norm": 0.7987543249051764, + "learning_rate": 9.960457443426227e-06, + "loss": 0.1622, + "step": 241 + }, + { + "epoch": 0.07, + "grad_norm": 0.5520057170888215, + "learning_rate": 9.959874579225071e-06, + "loss": 0.1214, + "step": 242 + }, + { + "epoch": 0.07, + "grad_norm": 0.7714202480896567, + "learning_rate": 9.959287467939905e-06, + "loss": 0.1012, + "step": 243 + }, + { + "epoch": 0.07, + "grad_norm": 0.7018270463464699, + "learning_rate": 9.958696110073467e-06, + "loss": 0.1301, + "step": 244 + }, + { + "epoch": 0.07, + "grad_norm": 0.38352681428144836, + "learning_rate": 9.958100506132127e-06, + "loss": 0.0897, + "step": 245 + }, + { + "epoch": 0.07, + "grad_norm": 1.0087660315330838, + "learning_rate": 9.957500656625894e-06, + "loss": 0.0879, + "step": 246 + }, + { + "epoch": 0.07, + "grad_norm": 0.8437066569382713, + "learning_rate": 9.956896562068414e-06, + "loss": 0.1374, + "step": 247 + }, + { + "epoch": 0.07, + "grad_norm": 1.0657285913367234, + "learning_rate": 9.956288222976967e-06, + "loss": 0.1117, + "step": 248 + }, + { + "epoch": 0.07, + "grad_norm": 0.7761536227390066, + "learning_rate": 9.955675639872465e-06, + "loss": 0.102, + "step": 249 + }, + { + "epoch": 0.07, + "grad_norm": 0.6343701991714403, + "learning_rate": 9.955058813279454e-06, + "loss": 0.1198, + "step": 250 + }, + { + "epoch": 0.07, + "grad_norm": 1.304116025053766, + "learning_rate": 9.954437743726119e-06, + "loss": 0.1093, + "step": 251 + }, + { + "epoch": 0.07, + "grad_norm": 0.8200651903998959, + "learning_rate": 9.953812431744274e-06, + "loss": 0.079, + "step": 252 + }, + { + "epoch": 0.07, + "grad_norm": 0.6296748927291802, + "learning_rate": 9.953182877869367e-06, + "loss": 0.0982, + "step": 253 + }, + { + "epoch": 0.07, + "grad_norm": 0.5565623273658198, + "learning_rate": 9.952549082640477e-06, + "loss": 0.0789, + "step": 254 + }, + { + "epoch": 0.07, + "grad_norm": 0.4177577620070906, + "learning_rate": 9.951911046600313e-06, + "loss": 0.0502, + "step": 255 + }, + { + "epoch": 0.07, + "grad_norm": 0.6088489128825829, + "learning_rate": 9.951268770295223e-06, + "loss": 0.1344, + "step": 256 + }, + { + "epoch": 0.07, + "grad_norm": 1.0327549751969483, + "learning_rate": 9.950622254275177e-06, + "loss": 0.1345, + "step": 257 + }, + { + "epoch": 0.07, + "grad_norm": 0.8541531254505703, + "learning_rate": 9.949971499093783e-06, + "loss": 0.1761, + "step": 258 + }, + { + "epoch": 0.07, + "grad_norm": 0.8841526214312907, + "learning_rate": 9.94931650530827e-06, + "loss": 0.098, + "step": 259 + }, + { + "epoch": 0.07, + "grad_norm": 0.9100164150700225, + "learning_rate": 9.948657273479508e-06, + "loss": 0.1796, + "step": 260 + }, + { + "epoch": 0.07, + "grad_norm": 0.7086473035157004, + "learning_rate": 9.947993804171984e-06, + "loss": 0.1044, + "step": 261 + }, + { + "epoch": 0.07, + "grad_norm": 0.7222672977162455, + "learning_rate": 9.947326097953822e-06, + "loss": 0.135, + "step": 262 + }, + { + "epoch": 0.08, + "grad_norm": 0.8827591249828824, + "learning_rate": 9.94665415539677e-06, + "loss": 0.1175, + "step": 263 + }, + { + "epoch": 0.08, + "grad_norm": 0.4623982686609934, + "learning_rate": 9.945977977076206e-06, + "loss": 0.1072, + "step": 264 + }, + { + "epoch": 0.08, + "grad_norm": 0.4658633145163734, + "learning_rate": 9.945297563571135e-06, + "loss": 0.0803, + "step": 265 + }, + { + "epoch": 0.08, + "grad_norm": 0.5577025537296104, + "learning_rate": 9.944612915464183e-06, + "loss": 0.0987, + "step": 266 + }, + { + "epoch": 0.08, + "grad_norm": 0.9151149107638324, + "learning_rate": 9.94392403334161e-06, + "loss": 0.1404, + "step": 267 + }, + { + "epoch": 0.08, + "grad_norm": 0.6881193923523389, + "learning_rate": 9.943230917793297e-06, + "loss": 0.085, + "step": 268 + }, + { + "epoch": 0.08, + "grad_norm": 1.1367716264658339, + "learning_rate": 9.942533569412751e-06, + "loss": 0.1158, + "step": 269 + }, + { + "epoch": 0.08, + "grad_norm": 0.6198641287768313, + "learning_rate": 9.941831988797104e-06, + "loss": 0.1357, + "step": 270 + }, + { + "epoch": 0.08, + "grad_norm": 0.5584956978186149, + "learning_rate": 9.94112617654711e-06, + "loss": 0.0635, + "step": 271 + }, + { + "epoch": 0.08, + "grad_norm": 0.7710831743353788, + "learning_rate": 9.940416133267147e-06, + "loss": 0.1128, + "step": 272 + }, + { + "epoch": 0.08, + "grad_norm": 0.5757367164546061, + "learning_rate": 9.93970185956522e-06, + "loss": 0.1268, + "step": 273 + }, + { + "epoch": 0.08, + "grad_norm": 0.78935651150386, + "learning_rate": 9.938983356052953e-06, + "loss": 0.0948, + "step": 274 + }, + { + "epoch": 0.08, + "grad_norm": 1.1938775083474606, + "learning_rate": 9.938260623345591e-06, + "loss": 0.1125, + "step": 275 + }, + { + "epoch": 0.08, + "grad_norm": 0.5934987143439762, + "learning_rate": 9.937533662062002e-06, + "loss": 0.1266, + "step": 276 + }, + { + "epoch": 0.08, + "grad_norm": 1.634122518529506, + "learning_rate": 9.936802472824675e-06, + "loss": 0.1645, + "step": 277 + }, + { + "epoch": 0.08, + "grad_norm": 0.5640956349326679, + "learning_rate": 9.93606705625972e-06, + "loss": 0.1316, + "step": 278 + }, + { + "epoch": 0.08, + "grad_norm": 0.8946303659615805, + "learning_rate": 9.935327412996863e-06, + "loss": 0.1282, + "step": 279 + }, + { + "epoch": 0.08, + "grad_norm": 0.40685726080093326, + "learning_rate": 9.934583543669454e-06, + "loss": 0.067, + "step": 280 + }, + { + "epoch": 0.08, + "grad_norm": 0.6363030971304591, + "learning_rate": 9.933835448914459e-06, + "loss": 0.1211, + "step": 281 + }, + { + "epoch": 0.08, + "grad_norm": 0.37747426502340686, + "learning_rate": 9.933083129372467e-06, + "loss": 0.069, + "step": 282 + }, + { + "epoch": 0.08, + "grad_norm": 1.0026041546645483, + "learning_rate": 9.932326585687675e-06, + "loss": 0.1103, + "step": 283 + }, + { + "epoch": 0.08, + "grad_norm": 0.6237086866445896, + "learning_rate": 9.931565818507904e-06, + "loss": 0.1459, + "step": 284 + }, + { + "epoch": 0.08, + "grad_norm": 0.5198324908614264, + "learning_rate": 9.930800828484593e-06, + "loss": 0.0949, + "step": 285 + }, + { + "epoch": 0.08, + "grad_norm": 1.52901423899486, + "learning_rate": 9.930031616272791e-06, + "loss": 0.1416, + "step": 286 + }, + { + "epoch": 0.08, + "grad_norm": 0.768077322285948, + "learning_rate": 9.929258182531167e-06, + "loss": 0.1069, + "step": 287 + }, + { + "epoch": 0.08, + "grad_norm": 0.43670884798857934, + "learning_rate": 9.928480527922004e-06, + "loss": 0.1021, + "step": 288 + }, + { + "epoch": 0.08, + "grad_norm": 0.9349301943026924, + "learning_rate": 9.9276986531112e-06, + "loss": 0.1397, + "step": 289 + }, + { + "epoch": 0.08, + "grad_norm": 0.46386221384249404, + "learning_rate": 9.926912558768261e-06, + "loss": 0.056, + "step": 290 + }, + { + "epoch": 0.08, + "grad_norm": 0.794965781068542, + "learning_rate": 9.926122245566315e-06, + "loss": 0.1069, + "step": 291 + }, + { + "epoch": 0.08, + "grad_norm": 0.6382219405542284, + "learning_rate": 9.925327714182098e-06, + "loss": 0.1126, + "step": 292 + }, + { + "epoch": 0.08, + "grad_norm": 0.6005837617420782, + "learning_rate": 9.924528965295955e-06, + "loss": 0.0878, + "step": 293 + }, + { + "epoch": 0.08, + "grad_norm": 1.1359604968686856, + "learning_rate": 9.923725999591846e-06, + "loss": 0.1481, + "step": 294 + }, + { + "epoch": 0.08, + "grad_norm": 0.4115889109907375, + "learning_rate": 9.922918817757346e-06, + "loss": 0.0788, + "step": 295 + }, + { + "epoch": 0.08, + "grad_norm": 0.7082289559538197, + "learning_rate": 9.92210742048363e-06, + "loss": 0.122, + "step": 296 + }, + { + "epoch": 0.08, + "grad_norm": 1.2019615043832632, + "learning_rate": 9.921291808465492e-06, + "loss": 0.1827, + "step": 297 + }, + { + "epoch": 0.09, + "grad_norm": 0.7559303887743698, + "learning_rate": 9.920471982401328e-06, + "loss": 0.1124, + "step": 298 + }, + { + "epoch": 0.09, + "grad_norm": 0.479984057936419, + "learning_rate": 9.91964794299315e-06, + "loss": 0.102, + "step": 299 + }, + { + "epoch": 0.09, + "grad_norm": 0.44094109838821893, + "learning_rate": 9.918819690946568e-06, + "loss": 0.0815, + "step": 300 + }, + { + "epoch": 0.09, + "grad_norm": 0.39589012538937873, + "learning_rate": 9.917987226970811e-06, + "loss": 0.1, + "step": 301 + }, + { + "epoch": 0.09, + "grad_norm": 0.5132120943206462, + "learning_rate": 9.917150551778702e-06, + "loss": 0.0835, + "step": 302 + }, + { + "epoch": 0.09, + "grad_norm": 0.4717486642036433, + "learning_rate": 9.916309666086682e-06, + "loss": 0.0935, + "step": 303 + }, + { + "epoch": 0.09, + "grad_norm": 0.7547071809281382, + "learning_rate": 9.91546457061479e-06, + "loss": 0.1458, + "step": 304 + }, + { + "epoch": 0.09, + "grad_norm": 0.3951791292961316, + "learning_rate": 9.914615266086668e-06, + "loss": 0.0738, + "step": 305 + }, + { + "epoch": 0.09, + "grad_norm": 0.4726660966593668, + "learning_rate": 9.91376175322957e-06, + "loss": 0.0852, + "step": 306 + }, + { + "epoch": 0.09, + "grad_norm": 0.9099276522464423, + "learning_rate": 9.912904032774351e-06, + "loss": 0.1884, + "step": 307 + }, + { + "epoch": 0.09, + "grad_norm": 0.6985873768508254, + "learning_rate": 9.912042105455462e-06, + "loss": 0.1273, + "step": 308 + }, + { + "epoch": 0.09, + "grad_norm": 0.7065357650409732, + "learning_rate": 9.911175972010965e-06, + "loss": 0.1125, + "step": 309 + }, + { + "epoch": 0.09, + "grad_norm": 0.6304748039856666, + "learning_rate": 9.910305633182518e-06, + "loss": 0.117, + "step": 310 + }, + { + "epoch": 0.09, + "grad_norm": 1.045398760806309, + "learning_rate": 9.909431089715384e-06, + "loss": 0.131, + "step": 311 + }, + { + "epoch": 0.09, + "grad_norm": 0.6531289786555421, + "learning_rate": 9.908552342358424e-06, + "loss": 0.1659, + "step": 312 + }, + { + "epoch": 0.09, + "grad_norm": 0.4342512824963871, + "learning_rate": 9.9076693918641e-06, + "loss": 0.1085, + "step": 313 + }, + { + "epoch": 0.09, + "grad_norm": 0.5572430880433067, + "learning_rate": 9.906782238988471e-06, + "loss": 0.0838, + "step": 314 + }, + { + "epoch": 0.09, + "grad_norm": 0.6234216153610849, + "learning_rate": 9.905890884491196e-06, + "loss": 0.082, + "step": 315 + }, + { + "epoch": 0.09, + "grad_norm": 0.4573970084251661, + "learning_rate": 9.904995329135533e-06, + "loss": 0.0803, + "step": 316 + }, + { + "epoch": 0.09, + "grad_norm": 0.5842062537706539, + "learning_rate": 9.904095573688334e-06, + "loss": 0.106, + "step": 317 + }, + { + "epoch": 0.09, + "grad_norm": 0.6506065102648938, + "learning_rate": 9.903191618920052e-06, + "loss": 0.1377, + "step": 318 + }, + { + "epoch": 0.09, + "grad_norm": 0.42497362854593346, + "learning_rate": 9.902283465604731e-06, + "loss": 0.0594, + "step": 319 + }, + { + "epoch": 0.09, + "grad_norm": 0.8118712222286686, + "learning_rate": 9.901371114520014e-06, + "loss": 0.095, + "step": 320 + }, + { + "epoch": 0.09, + "grad_norm": 0.5546005108949749, + "learning_rate": 9.900454566447133e-06, + "loss": 0.1164, + "step": 321 + }, + { + "epoch": 0.09, + "grad_norm": 2.642648777360291, + "learning_rate": 9.899533822170922e-06, + "loss": 0.1053, + "step": 322 + }, + { + "epoch": 0.09, + "grad_norm": 0.7432919092539422, + "learning_rate": 9.898608882479803e-06, + "loss": 0.1503, + "step": 323 + }, + { + "epoch": 0.09, + "grad_norm": 1.7774030160511798, + "learning_rate": 9.897679748165789e-06, + "loss": 0.1801, + "step": 324 + }, + { + "epoch": 0.09, + "grad_norm": 0.6358064884579139, + "learning_rate": 9.89674642002449e-06, + "loss": 0.1638, + "step": 325 + }, + { + "epoch": 0.09, + "grad_norm": 0.5941447561513429, + "learning_rate": 9.895808898855102e-06, + "loss": 0.1336, + "step": 326 + }, + { + "epoch": 0.09, + "grad_norm": 1.0164412114525911, + "learning_rate": 9.894867185460416e-06, + "loss": 0.1635, + "step": 327 + }, + { + "epoch": 0.09, + "grad_norm": 1.125245132533889, + "learning_rate": 9.893921280646806e-06, + "loss": 0.1492, + "step": 328 + }, + { + "epoch": 0.09, + "grad_norm": 0.919610050412465, + "learning_rate": 9.892971185224244e-06, + "loss": 0.1193, + "step": 329 + }, + { + "epoch": 0.09, + "grad_norm": 0.9071384652106338, + "learning_rate": 9.892016900006284e-06, + "loss": 0.1122, + "step": 330 + }, + { + "epoch": 0.09, + "grad_norm": 0.9191595453235826, + "learning_rate": 9.891058425810072e-06, + "loss": 0.1706, + "step": 331 + }, + { + "epoch": 0.09, + "grad_norm": 0.40444134298820555, + "learning_rate": 9.890095763456335e-06, + "loss": 0.102, + "step": 332 + }, + { + "epoch": 0.1, + "grad_norm": 0.5795507667218293, + "learning_rate": 9.88912891376939e-06, + "loss": 0.1184, + "step": 333 + }, + { + "epoch": 0.1, + "grad_norm": 0.5159918037293236, + "learning_rate": 9.888157877577142e-06, + "loss": 0.1555, + "step": 334 + }, + { + "epoch": 0.1, + "grad_norm": 0.7181342541230561, + "learning_rate": 9.887182655711078e-06, + "loss": 0.1283, + "step": 335 + }, + { + "epoch": 0.1, + "grad_norm": 0.5437508320370097, + "learning_rate": 9.886203249006265e-06, + "loss": 0.0821, + "step": 336 + }, + { + "epoch": 0.1, + "grad_norm": 1.4871363822091073, + "learning_rate": 9.885219658301364e-06, + "loss": 0.1376, + "step": 337 + }, + { + "epoch": 0.1, + "grad_norm": 0.49116677149229987, + "learning_rate": 9.884231884438607e-06, + "loss": 0.1075, + "step": 338 + }, + { + "epoch": 0.1, + "grad_norm": 0.6662276223253167, + "learning_rate": 9.883239928263817e-06, + "loss": 0.1418, + "step": 339 + }, + { + "epoch": 0.1, + "grad_norm": 0.7085309618070171, + "learning_rate": 9.882243790626393e-06, + "loss": 0.1491, + "step": 340 + }, + { + "epoch": 0.1, + "grad_norm": 0.4442275206621628, + "learning_rate": 9.881243472379318e-06, + "loss": 0.1132, + "step": 341 + }, + { + "epoch": 0.1, + "grad_norm": 0.8460443182094871, + "learning_rate": 9.880238974379151e-06, + "loss": 0.1599, + "step": 342 + }, + { + "epoch": 0.1, + "grad_norm": 1.1208304678283527, + "learning_rate": 9.879230297486034e-06, + "loss": 0.1617, + "step": 343 + }, + { + "epoch": 0.1, + "grad_norm": 0.6311369962916906, + "learning_rate": 9.878217442563684e-06, + "loss": 0.073, + "step": 344 + }, + { + "epoch": 0.1, + "grad_norm": 0.607057662787762, + "learning_rate": 9.877200410479399e-06, + "loss": 0.0814, + "step": 345 + }, + { + "epoch": 0.1, + "grad_norm": 0.6660247333119897, + "learning_rate": 9.87617920210405e-06, + "loss": 0.1606, + "step": 346 + }, + { + "epoch": 0.1, + "grad_norm": 1.1808944353739446, + "learning_rate": 9.875153818312087e-06, + "loss": 0.181, + "step": 347 + }, + { + "epoch": 0.1, + "grad_norm": 0.544513639908992, + "learning_rate": 9.874124259981535e-06, + "loss": 0.0945, + "step": 348 + }, + { + "epoch": 0.1, + "grad_norm": 0.3040066447276715, + "learning_rate": 9.873090527993991e-06, + "loss": 0.093, + "step": 349 + }, + { + "epoch": 0.1, + "grad_norm": 1.4463180589496372, + "learning_rate": 9.872052623234632e-06, + "loss": 0.1903, + "step": 350 + }, + { + "epoch": 0.1, + "grad_norm": 0.7270762926748952, + "learning_rate": 9.871010546592199e-06, + "loss": 0.1297, + "step": 351 + }, + { + "epoch": 0.1, + "grad_norm": 0.5589878973559803, + "learning_rate": 9.869964298959013e-06, + "loss": 0.1001, + "step": 352 + }, + { + "epoch": 0.1, + "grad_norm": 0.5514658148916033, + "learning_rate": 9.868913881230964e-06, + "loss": 0.108, + "step": 353 + }, + { + "epoch": 0.1, + "grad_norm": 0.39676003138608923, + "learning_rate": 9.867859294307512e-06, + "loss": 0.0845, + "step": 354 + }, + { + "epoch": 0.1, + "grad_norm": 0.8529347030868513, + "learning_rate": 9.866800539091688e-06, + "loss": 0.144, + "step": 355 + }, + { + "epoch": 0.1, + "grad_norm": 0.43061480486765114, + "learning_rate": 9.865737616490092e-06, + "loss": 0.0703, + "step": 356 + }, + { + "epoch": 0.1, + "grad_norm": 0.7716491329388591, + "learning_rate": 9.864670527412891e-06, + "loss": 0.1309, + "step": 357 + }, + { + "epoch": 0.1, + "grad_norm": 0.7008222888423032, + "learning_rate": 9.863599272773825e-06, + "loss": 0.1516, + "step": 358 + }, + { + "epoch": 0.1, + "grad_norm": 0.5992066599365296, + "learning_rate": 9.862523853490193e-06, + "loss": 0.0902, + "step": 359 + }, + { + "epoch": 0.1, + "grad_norm": 0.40837576918803054, + "learning_rate": 9.861444270482869e-06, + "loss": 0.0707, + "step": 360 + }, + { + "epoch": 0.1, + "grad_norm": 0.3809539796674921, + "learning_rate": 9.860360524676282e-06, + "loss": 0.0618, + "step": 361 + }, + { + "epoch": 0.1, + "grad_norm": 0.8405820843448246, + "learning_rate": 9.859272616998435e-06, + "loss": 0.1492, + "step": 362 + }, + { + "epoch": 0.1, + "grad_norm": 0.6296256234106073, + "learning_rate": 9.85818054838089e-06, + "loss": 0.1309, + "step": 363 + }, + { + "epoch": 0.1, + "grad_norm": 0.34282425587313214, + "learning_rate": 9.857084319758772e-06, + "loss": 0.0816, + "step": 364 + }, + { + "epoch": 0.1, + "grad_norm": 0.43307965248951774, + "learning_rate": 9.855983932070771e-06, + "loss": 0.0864, + "step": 365 + }, + { + "epoch": 0.1, + "grad_norm": 0.3853600765959287, + "learning_rate": 9.854879386259133e-06, + "loss": 0.0845, + "step": 366 + }, + { + "epoch": 0.1, + "grad_norm": 0.6166113837784739, + "learning_rate": 9.853770683269672e-06, + "loss": 0.115, + "step": 367 + }, + { + "epoch": 0.11, + "grad_norm": 0.523962182786505, + "learning_rate": 9.852657824051756e-06, + "loss": 0.0883, + "step": 368 + }, + { + "epoch": 0.11, + "grad_norm": 0.48823667476209875, + "learning_rate": 9.851540809558314e-06, + "loss": 0.0686, + "step": 369 + }, + { + "epoch": 0.11, + "grad_norm": 0.5456778299346657, + "learning_rate": 9.85041964074583e-06, + "loss": 0.0864, + "step": 370 + }, + { + "epoch": 0.11, + "grad_norm": 1.3571644499775195, + "learning_rate": 9.849294318574353e-06, + "loss": 0.1546, + "step": 371 + }, + { + "epoch": 0.11, + "grad_norm": 0.6307858386939773, + "learning_rate": 9.84816484400748e-06, + "loss": 0.0954, + "step": 372 + }, + { + "epoch": 0.11, + "grad_norm": 1.189465103768443, + "learning_rate": 9.847031218012366e-06, + "loss": 0.1403, + "step": 373 + }, + { + "epoch": 0.11, + "grad_norm": 0.6678800569095343, + "learning_rate": 9.845893441559726e-06, + "loss": 0.1179, + "step": 374 + }, + { + "epoch": 0.11, + "grad_norm": 0.48783364991932904, + "learning_rate": 9.844751515623824e-06, + "loss": 0.0581, + "step": 375 + }, + { + "epoch": 0.11, + "grad_norm": 0.7854315167901365, + "learning_rate": 9.843605441182476e-06, + "loss": 0.1179, + "step": 376 + }, + { + "epoch": 0.11, + "grad_norm": 0.8646577020314478, + "learning_rate": 9.842455219217054e-06, + "loss": 0.1421, + "step": 377 + }, + { + "epoch": 0.11, + "grad_norm": 0.6472659091717089, + "learning_rate": 9.841300850712479e-06, + "loss": 0.1106, + "step": 378 + }, + { + "epoch": 0.11, + "grad_norm": 0.4881078709914351, + "learning_rate": 9.840142336657225e-06, + "loss": 0.0784, + "step": 379 + }, + { + "epoch": 0.11, + "grad_norm": 0.4071832924612178, + "learning_rate": 9.838979678043314e-06, + "loss": 0.0925, + "step": 380 + }, + { + "epoch": 0.11, + "grad_norm": 0.5635616531793785, + "learning_rate": 9.837812875866317e-06, + "loss": 0.0882, + "step": 381 + }, + { + "epoch": 0.11, + "grad_norm": 0.8490835049175628, + "learning_rate": 9.836641931125352e-06, + "loss": 0.1173, + "step": 382 + }, + { + "epoch": 0.11, + "grad_norm": 0.46353250195994344, + "learning_rate": 9.835466844823089e-06, + "loss": 0.0914, + "step": 383 + }, + { + "epoch": 0.11, + "grad_norm": 0.44003404589085987, + "learning_rate": 9.834287617965737e-06, + "loss": 0.0917, + "step": 384 + }, + { + "epoch": 0.11, + "grad_norm": 0.4382669571034664, + "learning_rate": 9.833104251563058e-06, + "loss": 0.0922, + "step": 385 + }, + { + "epoch": 0.11, + "grad_norm": 0.6622412790128882, + "learning_rate": 9.831916746628352e-06, + "loss": 0.124, + "step": 386 + }, + { + "epoch": 0.11, + "grad_norm": 0.8740197276848335, + "learning_rate": 9.830725104178467e-06, + "loss": 0.1218, + "step": 387 + }, + { + "epoch": 0.11, + "grad_norm": 0.3418488166373136, + "learning_rate": 9.829529325233795e-06, + "loss": 0.0539, + "step": 388 + }, + { + "epoch": 0.11, + "grad_norm": 0.8820005412720228, + "learning_rate": 9.828329410818265e-06, + "loss": 0.094, + "step": 389 + }, + { + "epoch": 0.11, + "grad_norm": 0.6348576839659562, + "learning_rate": 9.827125361959353e-06, + "loss": 0.1595, + "step": 390 + }, + { + "epoch": 0.11, + "grad_norm": 0.7776623517993152, + "learning_rate": 9.825917179688069e-06, + "loss": 0.1286, + "step": 391 + }, + { + "epoch": 0.11, + "grad_norm": 0.3652804147520181, + "learning_rate": 9.824704865038967e-06, + "loss": 0.0893, + "step": 392 + }, + { + "epoch": 0.11, + "grad_norm": 0.6098195027818251, + "learning_rate": 9.82348841905014e-06, + "loss": 0.1259, + "step": 393 + }, + { + "epoch": 0.11, + "grad_norm": 0.6818619351526606, + "learning_rate": 9.822267842763214e-06, + "loss": 0.1382, + "step": 394 + }, + { + "epoch": 0.11, + "grad_norm": 0.41464173850333236, + "learning_rate": 9.821043137223356e-06, + "loss": 0.0875, + "step": 395 + }, + { + "epoch": 0.11, + "grad_norm": 0.7801461228744722, + "learning_rate": 9.819814303479268e-06, + "loss": 0.1142, + "step": 396 + }, + { + "epoch": 0.11, + "grad_norm": 0.6547149840231421, + "learning_rate": 9.818581342583184e-06, + "loss": 0.0917, + "step": 397 + }, + { + "epoch": 0.11, + "grad_norm": 0.5861194330529879, + "learning_rate": 9.817344255590878e-06, + "loss": 0.1201, + "step": 398 + }, + { + "epoch": 0.11, + "grad_norm": 0.557594547924238, + "learning_rate": 9.816103043561648e-06, + "loss": 0.0807, + "step": 399 + }, + { + "epoch": 0.11, + "grad_norm": 0.2760885179588771, + "learning_rate": 9.814857707558334e-06, + "loss": 0.0492, + "step": 400 + }, + { + "epoch": 0.11, + "grad_norm": 0.4693531378375744, + "learning_rate": 9.813608248647303e-06, + "loss": 0.0882, + "step": 401 + }, + { + "epoch": 0.11, + "grad_norm": 0.5731728776422602, + "learning_rate": 9.812354667898452e-06, + "loss": 0.0913, + "step": 402 + }, + { + "epoch": 0.12, + "grad_norm": 0.5198319925114208, + "learning_rate": 9.811096966385208e-06, + "loss": 0.1292, + "step": 403 + }, + { + "epoch": 0.12, + "grad_norm": 0.8579699415577815, + "learning_rate": 9.809835145184523e-06, + "loss": 0.1655, + "step": 404 + }, + { + "epoch": 0.12, + "grad_norm": 0.7003979193764588, + "learning_rate": 9.808569205376885e-06, + "loss": 0.1396, + "step": 405 + }, + { + "epoch": 0.12, + "grad_norm": 1.378968657022864, + "learning_rate": 9.807299148046301e-06, + "loss": 0.1405, + "step": 406 + }, + { + "epoch": 0.12, + "grad_norm": 0.4222883847606399, + "learning_rate": 9.806024974280308e-06, + "loss": 0.0845, + "step": 407 + }, + { + "epoch": 0.12, + "grad_norm": 0.5636086148045607, + "learning_rate": 9.804746685169964e-06, + "loss": 0.1077, + "step": 408 + }, + { + "epoch": 0.12, + "grad_norm": 1.0657574716078748, + "learning_rate": 9.803464281809856e-06, + "loss": 0.1338, + "step": 409 + }, + { + "epoch": 0.12, + "grad_norm": 0.4145608930789154, + "learning_rate": 9.802177765298091e-06, + "loss": 0.1138, + "step": 410 + }, + { + "epoch": 0.12, + "grad_norm": 0.577848034737305, + "learning_rate": 9.800887136736297e-06, + "loss": 0.0831, + "step": 411 + }, + { + "epoch": 0.12, + "grad_norm": 0.589083321715794, + "learning_rate": 9.799592397229626e-06, + "loss": 0.1157, + "step": 412 + }, + { + "epoch": 0.12, + "grad_norm": 0.37488285060769505, + "learning_rate": 9.798293547886748e-06, + "loss": 0.0834, + "step": 413 + }, + { + "epoch": 0.12, + "grad_norm": 0.882238271623321, + "learning_rate": 9.796990589819852e-06, + "loss": 0.1014, + "step": 414 + }, + { + "epoch": 0.12, + "grad_norm": 0.8510571435671381, + "learning_rate": 9.795683524144649e-06, + "loss": 0.0881, + "step": 415 + }, + { + "epoch": 0.12, + "grad_norm": 1.0433820395292264, + "learning_rate": 9.794372351980361e-06, + "loss": 0.159, + "step": 416 + }, + { + "epoch": 0.12, + "grad_norm": 0.5690948101339289, + "learning_rate": 9.793057074449732e-06, + "loss": 0.083, + "step": 417 + }, + { + "epoch": 0.12, + "grad_norm": 0.5635245868859875, + "learning_rate": 9.791737692679017e-06, + "loss": 0.0943, + "step": 418 + }, + { + "epoch": 0.12, + "grad_norm": 0.48896492458869784, + "learning_rate": 9.790414207797992e-06, + "loss": 0.1345, + "step": 419 + }, + { + "epoch": 0.12, + "grad_norm": 0.5080777969890101, + "learning_rate": 9.789086620939936e-06, + "loss": 0.0959, + "step": 420 + }, + { + "epoch": 0.12, + "grad_norm": 1.1796292883213124, + "learning_rate": 9.78775493324165e-06, + "loss": 0.1619, + "step": 421 + }, + { + "epoch": 0.12, + "grad_norm": 0.9168967688087062, + "learning_rate": 9.786419145843444e-06, + "loss": 0.1372, + "step": 422 + }, + { + "epoch": 0.12, + "grad_norm": 0.8450339717912574, + "learning_rate": 9.785079259889134e-06, + "loss": 0.1329, + "step": 423 + }, + { + "epoch": 0.12, + "grad_norm": 0.7086652838313081, + "learning_rate": 9.783735276526052e-06, + "loss": 0.1039, + "step": 424 + }, + { + "epoch": 0.12, + "grad_norm": 0.6658796117357779, + "learning_rate": 9.782387196905034e-06, + "loss": 0.1575, + "step": 425 + }, + { + "epoch": 0.12, + "grad_norm": 0.6668359446272558, + "learning_rate": 9.781035022180428e-06, + "loss": 0.1045, + "step": 426 + }, + { + "epoch": 0.12, + "grad_norm": 0.5404750055461947, + "learning_rate": 9.779678753510082e-06, + "loss": 0.1249, + "step": 427 + }, + { + "epoch": 0.12, + "grad_norm": 0.6652977074482611, + "learning_rate": 9.778318392055354e-06, + "loss": 0.092, + "step": 428 + }, + { + "epoch": 0.12, + "grad_norm": 0.6220798892890224, + "learning_rate": 9.776953938981107e-06, + "loss": 0.0951, + "step": 429 + }, + { + "epoch": 0.12, + "grad_norm": 0.7757080092841647, + "learning_rate": 9.775585395455708e-06, + "loss": 0.1066, + "step": 430 + }, + { + "epoch": 0.12, + "grad_norm": 0.5810251634899445, + "learning_rate": 9.774212762651023e-06, + "loss": 0.1342, + "step": 431 + }, + { + "epoch": 0.12, + "grad_norm": 0.5469304876227978, + "learning_rate": 9.772836041742423e-06, + "loss": 0.1351, + "step": 432 + }, + { + "epoch": 0.12, + "grad_norm": 0.5860406866309947, + "learning_rate": 9.771455233908778e-06, + "loss": 0.1137, + "step": 433 + }, + { + "epoch": 0.12, + "grad_norm": 0.5141147693391175, + "learning_rate": 9.770070340332457e-06, + "loss": 0.1005, + "step": 434 + }, + { + "epoch": 0.12, + "grad_norm": 0.6755000524610992, + "learning_rate": 9.76868136219933e-06, + "loss": 0.1028, + "step": 435 + }, + { + "epoch": 0.12, + "grad_norm": 0.35347845941056094, + "learning_rate": 9.767288300698762e-06, + "loss": 0.0929, + "step": 436 + }, + { + "epoch": 0.12, + "grad_norm": 0.3531724115204999, + "learning_rate": 9.76589115702362e-06, + "loss": 0.0692, + "step": 437 + }, + { + "epoch": 0.13, + "grad_norm": 0.3476338504585057, + "learning_rate": 9.764489932370254e-06, + "loss": 0.0632, + "step": 438 + }, + { + "epoch": 0.13, + "grad_norm": 0.5519961970875953, + "learning_rate": 9.763084627938524e-06, + "loss": 0.1219, + "step": 439 + }, + { + "epoch": 0.13, + "grad_norm": 1.1119189227138462, + "learning_rate": 9.761675244931772e-06, + "loss": 0.1654, + "step": 440 + }, + { + "epoch": 0.13, + "grad_norm": 0.670153662815508, + "learning_rate": 9.76026178455684e-06, + "loss": 0.1381, + "step": 441 + }, + { + "epoch": 0.13, + "grad_norm": 0.42626841703523144, + "learning_rate": 9.758844248024053e-06, + "loss": 0.1204, + "step": 442 + }, + { + "epoch": 0.13, + "grad_norm": 0.5487977644272218, + "learning_rate": 9.757422636547236e-06, + "loss": 0.1061, + "step": 443 + }, + { + "epoch": 0.13, + "grad_norm": 0.31267067712952507, + "learning_rate": 9.755996951343696e-06, + "loss": 0.0741, + "step": 444 + }, + { + "epoch": 0.13, + "grad_norm": 0.6992283598924897, + "learning_rate": 9.754567193634232e-06, + "loss": 0.0992, + "step": 445 + }, + { + "epoch": 0.13, + "grad_norm": 0.6011903547464283, + "learning_rate": 9.75313336464313e-06, + "loss": 0.1503, + "step": 446 + }, + { + "epoch": 0.13, + "grad_norm": 0.3818976630825426, + "learning_rate": 9.751695465598161e-06, + "loss": 0.0697, + "step": 447 + }, + { + "epoch": 0.13, + "grad_norm": 0.5769177820484979, + "learning_rate": 9.75025349773058e-06, + "loss": 0.1103, + "step": 448 + }, + { + "epoch": 0.13, + "grad_norm": 0.5619632209692563, + "learning_rate": 9.748807462275129e-06, + "loss": 0.1048, + "step": 449 + }, + { + "epoch": 0.13, + "grad_norm": 1.106285406278163, + "learning_rate": 9.747357360470033e-06, + "loss": 0.1484, + "step": 450 + }, + { + "epoch": 0.13, + "grad_norm": 0.4834893229443317, + "learning_rate": 9.745903193556994e-06, + "loss": 0.0645, + "step": 451 + }, + { + "epoch": 0.13, + "grad_norm": 0.9196394433430203, + "learning_rate": 9.7444449627812e-06, + "loss": 0.0938, + "step": 452 + }, + { + "epoch": 0.13, + "grad_norm": 0.7994197717698222, + "learning_rate": 9.742982669391321e-06, + "loss": 0.1251, + "step": 453 + }, + { + "epoch": 0.13, + "grad_norm": 0.5959381420740328, + "learning_rate": 9.741516314639496e-06, + "loss": 0.1066, + "step": 454 + }, + { + "epoch": 0.13, + "grad_norm": 0.57828463794402, + "learning_rate": 9.740045899781353e-06, + "loss": 0.1151, + "step": 455 + }, + { + "epoch": 0.13, + "grad_norm": 0.5054311221075344, + "learning_rate": 9.73857142607599e-06, + "loss": 0.1233, + "step": 456 + }, + { + "epoch": 0.13, + "grad_norm": 0.6722451213544615, + "learning_rate": 9.737092894785978e-06, + "loss": 0.1213, + "step": 457 + }, + { + "epoch": 0.13, + "grad_norm": 0.6038351017174377, + "learning_rate": 9.735610307177374e-06, + "loss": 0.0783, + "step": 458 + }, + { + "epoch": 0.13, + "grad_norm": 0.5161675228685113, + "learning_rate": 9.734123664519695e-06, + "loss": 0.1081, + "step": 459 + }, + { + "epoch": 0.13, + "grad_norm": 0.3431757659984759, + "learning_rate": 9.732632968085937e-06, + "loss": 0.0634, + "step": 460 + }, + { + "epoch": 0.13, + "grad_norm": 0.49936599615693683, + "learning_rate": 9.731138219152566e-06, + "loss": 0.0998, + "step": 461 + }, + { + "epoch": 0.13, + "grad_norm": 0.6865581145913818, + "learning_rate": 9.729639418999524e-06, + "loss": 0.1101, + "step": 462 + }, + { + "epoch": 0.13, + "grad_norm": 0.9076153368211043, + "learning_rate": 9.728136568910209e-06, + "loss": 0.1028, + "step": 463 + }, + { + "epoch": 0.13, + "grad_norm": 0.8907933608275562, + "learning_rate": 9.7266296701715e-06, + "loss": 0.161, + "step": 464 + }, + { + "epoch": 0.13, + "grad_norm": 1.1085238664126051, + "learning_rate": 9.725118724073732e-06, + "loss": 0.1504, + "step": 465 + }, + { + "epoch": 0.13, + "grad_norm": 0.6157213503360681, + "learning_rate": 9.723603731910715e-06, + "loss": 0.1169, + "step": 466 + }, + { + "epoch": 0.13, + "grad_norm": 0.5561371323209214, + "learning_rate": 9.72208469497972e-06, + "loss": 0.1235, + "step": 467 + }, + { + "epoch": 0.13, + "grad_norm": 0.4442295890258957, + "learning_rate": 9.720561614581475e-06, + "loss": 0.1019, + "step": 468 + }, + { + "epoch": 0.13, + "grad_norm": 0.3757130394438029, + "learning_rate": 9.719034492020183e-06, + "loss": 0.0695, + "step": 469 + }, + { + "epoch": 0.13, + "grad_norm": 0.7063966445456047, + "learning_rate": 9.717503328603499e-06, + "loss": 0.1342, + "step": 470 + }, + { + "epoch": 0.13, + "grad_norm": 0.9155691047755337, + "learning_rate": 9.71596812564254e-06, + "loss": 0.1061, + "step": 471 + }, + { + "epoch": 0.13, + "grad_norm": 2.9842753198583956, + "learning_rate": 9.714428884451886e-06, + "loss": 0.1689, + "step": 472 + }, + { + "epoch": 0.14, + "grad_norm": 0.6728142908542221, + "learning_rate": 9.712885606349567e-06, + "loss": 0.0818, + "step": 473 + }, + { + "epoch": 0.14, + "grad_norm": 1.1648373090904836, + "learning_rate": 9.711338292657078e-06, + "loss": 0.1476, + "step": 474 + }, + { + "epoch": 0.14, + "grad_norm": 0.5377512468558412, + "learning_rate": 9.709786944699364e-06, + "loss": 0.1165, + "step": 475 + }, + { + "epoch": 0.14, + "grad_norm": 0.5056295002520085, + "learning_rate": 9.708231563804828e-06, + "loss": 0.1046, + "step": 476 + }, + { + "epoch": 0.14, + "grad_norm": 0.5148043840776015, + "learning_rate": 9.706672151305324e-06, + "loss": 0.0667, + "step": 477 + }, + { + "epoch": 0.14, + "grad_norm": 0.6616997554763696, + "learning_rate": 9.70510870853616e-06, + "loss": 0.0847, + "step": 478 + }, + { + "epoch": 0.14, + "grad_norm": 0.5431856851371992, + "learning_rate": 9.703541236836094e-06, + "loss": 0.1055, + "step": 479 + }, + { + "epoch": 0.14, + "grad_norm": 0.5847676253243631, + "learning_rate": 9.701969737547332e-06, + "loss": 0.1157, + "step": 480 + }, + { + "epoch": 0.14, + "grad_norm": 0.46457743196055556, + "learning_rate": 9.700394212015533e-06, + "loss": 0.0875, + "step": 481 + }, + { + "epoch": 0.14, + "grad_norm": 0.6501521201959152, + "learning_rate": 9.698814661589804e-06, + "loss": 0.1197, + "step": 482 + }, + { + "epoch": 0.14, + "grad_norm": 0.5799873171236264, + "learning_rate": 9.697231087622691e-06, + "loss": 0.0995, + "step": 483 + }, + { + "epoch": 0.14, + "grad_norm": 0.7622329310948606, + "learning_rate": 9.695643491470192e-06, + "loss": 0.1341, + "step": 484 + }, + { + "epoch": 0.14, + "grad_norm": 0.3683783284062726, + "learning_rate": 9.694051874491748e-06, + "loss": 0.0901, + "step": 485 + }, + { + "epoch": 0.14, + "grad_norm": 0.43007029560479687, + "learning_rate": 9.692456238050245e-06, + "loss": 0.0731, + "step": 486 + }, + { + "epoch": 0.14, + "grad_norm": 0.4154136935256243, + "learning_rate": 9.690856583512002e-06, + "loss": 0.0768, + "step": 487 + }, + { + "epoch": 0.14, + "grad_norm": 0.9367202073860206, + "learning_rate": 9.689252912246792e-06, + "loss": 0.1284, + "step": 488 + }, + { + "epoch": 0.14, + "grad_norm": 0.4594907018923105, + "learning_rate": 9.687645225627817e-06, + "loss": 0.0815, + "step": 489 + }, + { + "epoch": 0.14, + "grad_norm": 0.6653529192373276, + "learning_rate": 9.68603352503172e-06, + "loss": 0.1173, + "step": 490 + }, + { + "epoch": 0.14, + "grad_norm": 0.4147604844498556, + "learning_rate": 9.68441781183858e-06, + "loss": 0.1052, + "step": 491 + }, + { + "epoch": 0.14, + "grad_norm": 0.4719536721308244, + "learning_rate": 9.68279808743192e-06, + "loss": 0.09, + "step": 492 + }, + { + "epoch": 0.14, + "grad_norm": 1.3180783346206244, + "learning_rate": 9.681174353198687e-06, + "loss": 0.1377, + "step": 493 + }, + { + "epoch": 0.14, + "grad_norm": 0.30528484487601437, + "learning_rate": 9.679546610529267e-06, + "loss": 0.0573, + "step": 494 + }, + { + "epoch": 0.14, + "grad_norm": 0.5632376819736666, + "learning_rate": 9.677914860817476e-06, + "loss": 0.0851, + "step": 495 + }, + { + "epoch": 0.14, + "grad_norm": 0.5931974641813189, + "learning_rate": 9.676279105460567e-06, + "loss": 0.0908, + "step": 496 + }, + { + "epoch": 0.14, + "grad_norm": 0.6363861396511807, + "learning_rate": 9.674639345859213e-06, + "loss": 0.1158, + "step": 497 + }, + { + "epoch": 0.14, + "grad_norm": 0.4182628237754652, + "learning_rate": 9.672995583417526e-06, + "loss": 0.0883, + "step": 498 + }, + { + "epoch": 0.14, + "grad_norm": 0.5090871258925077, + "learning_rate": 9.671347819543039e-06, + "loss": 0.1205, + "step": 499 + }, + { + "epoch": 0.14, + "grad_norm": 1.1133692096561043, + "learning_rate": 9.669696055646713e-06, + "loss": 0.0881, + "step": 500 + }, + { + "epoch": 0.14, + "grad_norm": 0.4833512750349395, + "learning_rate": 9.668040293142937e-06, + "loss": 0.1377, + "step": 501 + }, + { + "epoch": 0.14, + "grad_norm": 0.3457146551502528, + "learning_rate": 9.666380533449517e-06, + "loss": 0.0589, + "step": 502 + }, + { + "epoch": 0.14, + "grad_norm": 0.50304511972244, + "learning_rate": 9.664716777987691e-06, + "loss": 0.1298, + "step": 503 + }, + { + "epoch": 0.14, + "grad_norm": 0.4533229168514692, + "learning_rate": 9.663049028182112e-06, + "loss": 0.0664, + "step": 504 + }, + { + "epoch": 0.14, + "grad_norm": 0.5655907790886503, + "learning_rate": 9.661377285460856e-06, + "loss": 0.1235, + "step": 505 + }, + { + "epoch": 0.14, + "grad_norm": 0.716552282403697, + "learning_rate": 9.659701551255415e-06, + "loss": 0.1111, + "step": 506 + }, + { + "epoch": 0.14, + "grad_norm": 0.7380319278342574, + "learning_rate": 9.658021827000702e-06, + "loss": 0.1298, + "step": 507 + }, + { + "epoch": 0.15, + "grad_norm": 0.371408881428638, + "learning_rate": 9.656338114135045e-06, + "loss": 0.0718, + "step": 508 + }, + { + "epoch": 0.15, + "grad_norm": 0.587187527009652, + "learning_rate": 9.65465041410019e-06, + "loss": 0.1381, + "step": 509 + }, + { + "epoch": 0.15, + "grad_norm": 0.6578247223161997, + "learning_rate": 9.652958728341296e-06, + "loss": 0.1205, + "step": 510 + }, + { + "epoch": 0.15, + "grad_norm": 0.39107218775046754, + "learning_rate": 9.651263058306932e-06, + "loss": 0.1052, + "step": 511 + }, + { + "epoch": 0.15, + "grad_norm": 0.6450822857601923, + "learning_rate": 9.64956340544908e-06, + "loss": 0.0798, + "step": 512 + }, + { + "epoch": 0.15, + "grad_norm": 0.8606836095404234, + "learning_rate": 9.647859771223137e-06, + "loss": 0.1416, + "step": 513 + }, + { + "epoch": 0.15, + "grad_norm": 0.5190929098082754, + "learning_rate": 9.646152157087903e-06, + "loss": 0.1125, + "step": 514 + }, + { + "epoch": 0.15, + "grad_norm": 0.4424401320689641, + "learning_rate": 9.644440564505589e-06, + "loss": 0.1151, + "step": 515 + }, + { + "epoch": 0.15, + "grad_norm": 1.1476742980713732, + "learning_rate": 9.64272499494181e-06, + "loss": 0.109, + "step": 516 + }, + { + "epoch": 0.15, + "grad_norm": 0.5946358724686177, + "learning_rate": 9.641005449865593e-06, + "loss": 0.1347, + "step": 517 + }, + { + "epoch": 0.15, + "grad_norm": 0.76152838679737, + "learning_rate": 9.639281930749363e-06, + "loss": 0.1563, + "step": 518 + }, + { + "epoch": 0.15, + "grad_norm": 0.47081786131225006, + "learning_rate": 9.637554439068949e-06, + "loss": 0.0578, + "step": 519 + }, + { + "epoch": 0.15, + "grad_norm": 1.1990414757673342, + "learning_rate": 9.635822976303582e-06, + "loss": 0.1554, + "step": 520 + }, + { + "epoch": 0.15, + "grad_norm": 1.448209401779092, + "learning_rate": 9.634087543935894e-06, + "loss": 0.1159, + "step": 521 + }, + { + "epoch": 0.15, + "grad_norm": 0.4996623490869529, + "learning_rate": 9.632348143451918e-06, + "loss": 0.106, + "step": 522 + }, + { + "epoch": 0.15, + "grad_norm": 0.7512792764643562, + "learning_rate": 9.630604776341082e-06, + "loss": 0.0803, + "step": 523 + }, + { + "epoch": 0.15, + "grad_norm": 0.3930266590077223, + "learning_rate": 9.628857444096209e-06, + "loss": 0.074, + "step": 524 + }, + { + "epoch": 0.15, + "grad_norm": 0.3841164058938006, + "learning_rate": 9.627106148213521e-06, + "loss": 0.0724, + "step": 525 + }, + { + "epoch": 0.15, + "grad_norm": 1.0102148139482265, + "learning_rate": 9.625350890192634e-06, + "loss": 0.178, + "step": 526 + }, + { + "epoch": 0.15, + "grad_norm": 0.8634464485184761, + "learning_rate": 9.623591671536554e-06, + "loss": 0.1394, + "step": 527 + }, + { + "epoch": 0.15, + "grad_norm": 0.33868117647467416, + "learning_rate": 9.621828493751681e-06, + "loss": 0.0795, + "step": 528 + }, + { + "epoch": 0.15, + "grad_norm": 0.5374296491118014, + "learning_rate": 9.620061358347805e-06, + "loss": 0.1129, + "step": 529 + }, + { + "epoch": 0.15, + "grad_norm": 0.5393537120310781, + "learning_rate": 9.6182902668381e-06, + "loss": 0.1291, + "step": 530 + }, + { + "epoch": 0.15, + "grad_norm": 0.6608501322157044, + "learning_rate": 9.616515220739134e-06, + "loss": 0.1484, + "step": 531 + }, + { + "epoch": 0.15, + "grad_norm": 0.8839234605871841, + "learning_rate": 9.61473622157086e-06, + "loss": 0.1588, + "step": 532 + }, + { + "epoch": 0.15, + "grad_norm": 0.48316007077989875, + "learning_rate": 9.61295327085661e-06, + "loss": 0.1078, + "step": 533 + }, + { + "epoch": 0.15, + "grad_norm": 0.8887117248606631, + "learning_rate": 9.611166370123109e-06, + "loss": 0.1719, + "step": 534 + }, + { + "epoch": 0.15, + "grad_norm": 0.4890278731456642, + "learning_rate": 9.60937552090046e-06, + "loss": 0.1185, + "step": 535 + }, + { + "epoch": 0.15, + "grad_norm": 1.2707751582206246, + "learning_rate": 9.607580724722141e-06, + "loss": 0.1571, + "step": 536 + }, + { + "epoch": 0.15, + "grad_norm": 0.42484621527305594, + "learning_rate": 9.605781983125023e-06, + "loss": 0.0993, + "step": 537 + }, + { + "epoch": 0.15, + "grad_norm": 0.7346021626296555, + "learning_rate": 9.603979297649344e-06, + "loss": 0.1441, + "step": 538 + }, + { + "epoch": 0.15, + "grad_norm": 0.44585308353686126, + "learning_rate": 9.602172669838721e-06, + "loss": 0.061, + "step": 539 + }, + { + "epoch": 0.15, + "grad_norm": 0.34525815165431484, + "learning_rate": 9.600362101240153e-06, + "loss": 0.11, + "step": 540 + }, + { + "epoch": 0.15, + "grad_norm": 0.6344262740370257, + "learning_rate": 9.598547593404007e-06, + "loss": 0.1526, + "step": 541 + }, + { + "epoch": 0.15, + "grad_norm": 0.393301102748828, + "learning_rate": 9.596729147884024e-06, + "loss": 0.0947, + "step": 542 + }, + { + "epoch": 0.16, + "grad_norm": 0.776761271158952, + "learning_rate": 9.594906766237322e-06, + "loss": 0.102, + "step": 543 + }, + { + "epoch": 0.16, + "grad_norm": 0.3314025179857403, + "learning_rate": 9.593080450024382e-06, + "loss": 0.0585, + "step": 544 + }, + { + "epoch": 0.16, + "grad_norm": 0.5065702669137127, + "learning_rate": 9.591250200809061e-06, + "loss": 0.1066, + "step": 545 + }, + { + "epoch": 0.16, + "grad_norm": 0.7828868868785803, + "learning_rate": 9.589416020158577e-06, + "loss": 0.1324, + "step": 546 + }, + { + "epoch": 0.16, + "grad_norm": 0.5464686564794162, + "learning_rate": 9.58757790964352e-06, + "loss": 0.0948, + "step": 547 + }, + { + "epoch": 0.16, + "grad_norm": 0.564572501433098, + "learning_rate": 9.585735870837842e-06, + "loss": 0.0813, + "step": 548 + }, + { + "epoch": 0.16, + "grad_norm": 0.45973238827855767, + "learning_rate": 9.583889905318863e-06, + "loss": 0.0839, + "step": 549 + }, + { + "epoch": 0.16, + "grad_norm": 1.6380586075479202, + "learning_rate": 9.582040014667258e-06, + "loss": 0.0813, + "step": 550 + }, + { + "epoch": 0.16, + "grad_norm": 0.3444860266036195, + "learning_rate": 9.58018620046707e-06, + "loss": 0.0741, + "step": 551 + }, + { + "epoch": 0.16, + "grad_norm": 0.8103692222613961, + "learning_rate": 9.578328464305697e-06, + "loss": 0.1618, + "step": 552 + }, + { + "epoch": 0.16, + "grad_norm": 0.30131060469646403, + "learning_rate": 9.5764668077739e-06, + "loss": 0.0595, + "step": 553 + }, + { + "epoch": 0.16, + "grad_norm": 0.7401579131128008, + "learning_rate": 9.57460123246579e-06, + "loss": 0.1484, + "step": 554 + }, + { + "epoch": 0.16, + "grad_norm": 0.5854614695945147, + "learning_rate": 9.57273173997884e-06, + "loss": 0.0956, + "step": 555 + }, + { + "epoch": 0.16, + "grad_norm": 0.4661820227358909, + "learning_rate": 9.570858331913877e-06, + "loss": 0.067, + "step": 556 + }, + { + "epoch": 0.16, + "grad_norm": 0.5485571108713299, + "learning_rate": 9.568981009875078e-06, + "loss": 0.1233, + "step": 557 + }, + { + "epoch": 0.16, + "grad_norm": 0.4455101604075415, + "learning_rate": 9.56709977546997e-06, + "loss": 0.1008, + "step": 558 + }, + { + "epoch": 0.16, + "grad_norm": 0.46579763917353784, + "learning_rate": 9.565214630309437e-06, + "loss": 0.1065, + "step": 559 + }, + { + "epoch": 0.16, + "grad_norm": 0.8301614026074754, + "learning_rate": 9.563325576007702e-06, + "loss": 0.1845, + "step": 560 + }, + { + "epoch": 0.16, + "grad_norm": 0.4677538331059423, + "learning_rate": 9.561432614182344e-06, + "loss": 0.1058, + "step": 561 + }, + { + "epoch": 0.16, + "grad_norm": 0.7161536191110569, + "learning_rate": 9.559535746454285e-06, + "loss": 0.1176, + "step": 562 + }, + { + "epoch": 0.16, + "grad_norm": 0.48154650097884616, + "learning_rate": 9.55763497444779e-06, + "loss": 0.08, + "step": 563 + }, + { + "epoch": 0.16, + "grad_norm": 0.411701023144594, + "learning_rate": 9.555730299790467e-06, + "loss": 0.0787, + "step": 564 + }, + { + "epoch": 0.16, + "grad_norm": 0.7955992073989979, + "learning_rate": 9.553821724113268e-06, + "loss": 0.0849, + "step": 565 + }, + { + "epoch": 0.16, + "grad_norm": 0.4804114124676437, + "learning_rate": 9.551909249050489e-06, + "loss": 0.1282, + "step": 566 + }, + { + "epoch": 0.16, + "grad_norm": 0.5893379062898435, + "learning_rate": 9.549992876239753e-06, + "loss": 0.1583, + "step": 567 + }, + { + "epoch": 0.16, + "grad_norm": 0.781153614929025, + "learning_rate": 9.548072607322035e-06, + "loss": 0.1309, + "step": 568 + }, + { + "epoch": 0.16, + "grad_norm": 1.092893497688046, + "learning_rate": 9.546148443941634e-06, + "loss": 0.1531, + "step": 569 + }, + { + "epoch": 0.16, + "grad_norm": 0.931755470555799, + "learning_rate": 9.544220387746193e-06, + "loss": 0.1008, + "step": 570 + }, + { + "epoch": 0.16, + "grad_norm": 0.37262268518549607, + "learning_rate": 9.54228844038668e-06, + "loss": 0.0809, + "step": 571 + }, + { + "epoch": 0.16, + "grad_norm": 0.8229771839333507, + "learning_rate": 9.540352603517407e-06, + "loss": 0.1297, + "step": 572 + }, + { + "epoch": 0.16, + "grad_norm": 0.42212543135085095, + "learning_rate": 9.538412878796001e-06, + "loss": 0.0814, + "step": 573 + }, + { + "epoch": 0.16, + "grad_norm": 0.7491251188140415, + "learning_rate": 9.536469267883432e-06, + "loss": 0.1239, + "step": 574 + }, + { + "epoch": 0.16, + "grad_norm": 0.5777544621930815, + "learning_rate": 9.534521772443989e-06, + "loss": 0.1275, + "step": 575 + }, + { + "epoch": 0.16, + "grad_norm": 0.4013350836379305, + "learning_rate": 9.532570394145292e-06, + "loss": 0.0945, + "step": 576 + }, + { + "epoch": 0.16, + "grad_norm": 0.7159831233187581, + "learning_rate": 9.530615134658278e-06, + "loss": 0.0986, + "step": 577 + }, + { + "epoch": 0.17, + "grad_norm": 0.40874424417924476, + "learning_rate": 9.528655995657222e-06, + "loss": 0.063, + "step": 578 + }, + { + "epoch": 0.17, + "grad_norm": 0.48580392186182875, + "learning_rate": 9.526692978819706e-06, + "loss": 0.102, + "step": 579 + }, + { + "epoch": 0.17, + "grad_norm": 0.5600308979814005, + "learning_rate": 9.524726085826645e-06, + "loss": 0.0907, + "step": 580 + }, + { + "epoch": 0.17, + "grad_norm": 0.6358393813816943, + "learning_rate": 9.52275531836226e-06, + "loss": 0.1216, + "step": 581 + }, + { + "epoch": 0.17, + "grad_norm": 0.5206580267327036, + "learning_rate": 9.5207806781141e-06, + "loss": 0.0985, + "step": 582 + }, + { + "epoch": 0.17, + "grad_norm": 0.3799897525179568, + "learning_rate": 9.518802166773028e-06, + "loss": 0.0885, + "step": 583 + }, + { + "epoch": 0.17, + "grad_norm": 0.2813068295543023, + "learning_rate": 9.51681978603322e-06, + "loss": 0.0508, + "step": 584 + }, + { + "epoch": 0.17, + "grad_norm": 0.3522678471071529, + "learning_rate": 9.514833537592167e-06, + "loss": 0.07, + "step": 585 + }, + { + "epoch": 0.17, + "grad_norm": 0.45470245267937753, + "learning_rate": 9.512843423150666e-06, + "loss": 0.0813, + "step": 586 + }, + { + "epoch": 0.17, + "grad_norm": 0.6978287764047405, + "learning_rate": 9.510849444412835e-06, + "loss": 0.1432, + "step": 587 + }, + { + "epoch": 0.17, + "grad_norm": 0.5919384658602628, + "learning_rate": 9.508851603086094e-06, + "loss": 0.0984, + "step": 588 + }, + { + "epoch": 0.17, + "grad_norm": 0.4508118904856478, + "learning_rate": 9.506849900881169e-06, + "loss": 0.1059, + "step": 589 + }, + { + "epoch": 0.17, + "grad_norm": 0.7844312756182817, + "learning_rate": 9.504844339512096e-06, + "loss": 0.1245, + "step": 590 + }, + { + "epoch": 0.17, + "grad_norm": 0.35067890977456656, + "learning_rate": 9.502834920696216e-06, + "loss": 0.0813, + "step": 591 + }, + { + "epoch": 0.17, + "grad_norm": 0.4916953871743061, + "learning_rate": 9.50082164615417e-06, + "loss": 0.073, + "step": 592 + }, + { + "epoch": 0.17, + "grad_norm": 0.3728444034470237, + "learning_rate": 9.498804517609903e-06, + "loss": 0.0875, + "step": 593 + }, + { + "epoch": 0.17, + "grad_norm": 0.35651214288355376, + "learning_rate": 9.496783536790658e-06, + "loss": 0.0765, + "step": 594 + }, + { + "epoch": 0.17, + "grad_norm": 0.31375965590627275, + "learning_rate": 9.494758705426978e-06, + "loss": 0.0528, + "step": 595 + }, + { + "epoch": 0.17, + "grad_norm": 1.5356005361879115, + "learning_rate": 9.492730025252704e-06, + "loss": 0.0594, + "step": 596 + }, + { + "epoch": 0.17, + "grad_norm": 0.5648101065684547, + "learning_rate": 9.490697498004968e-06, + "loss": 0.1364, + "step": 597 + }, + { + "epoch": 0.17, + "grad_norm": 0.7114698023455776, + "learning_rate": 9.488661125424205e-06, + "loss": 0.1575, + "step": 598 + }, + { + "epoch": 0.17, + "grad_norm": 0.6531081938971721, + "learning_rate": 9.486620909254136e-06, + "loss": 0.1356, + "step": 599 + }, + { + "epoch": 0.17, + "grad_norm": 0.7131636474188195, + "learning_rate": 9.484576851241774e-06, + "loss": 0.1292, + "step": 600 + }, + { + "epoch": 0.17, + "grad_norm": 0.35752307596870525, + "learning_rate": 9.482528953137422e-06, + "loss": 0.0472, + "step": 601 + }, + { + "epoch": 0.17, + "grad_norm": 0.5298837090546342, + "learning_rate": 9.480477216694674e-06, + "loss": 0.0987, + "step": 602 + }, + { + "epoch": 0.17, + "grad_norm": 0.5747005138242258, + "learning_rate": 9.478421643670406e-06, + "loss": 0.1086, + "step": 603 + }, + { + "epoch": 0.17, + "grad_norm": 0.45450892628952655, + "learning_rate": 9.476362235824783e-06, + "loss": 0.0609, + "step": 604 + }, + { + "epoch": 0.17, + "grad_norm": 0.45746596718396365, + "learning_rate": 9.474298994921252e-06, + "loss": 0.1131, + "step": 605 + }, + { + "epoch": 0.17, + "grad_norm": 0.67000800703229, + "learning_rate": 9.472231922726544e-06, + "loss": 0.1091, + "step": 606 + }, + { + "epoch": 0.17, + "grad_norm": 0.47934571718907415, + "learning_rate": 9.470161021010667e-06, + "loss": 0.1048, + "step": 607 + }, + { + "epoch": 0.17, + "grad_norm": 0.4194734317041921, + "learning_rate": 9.468086291546913e-06, + "loss": 0.1048, + "step": 608 + }, + { + "epoch": 0.17, + "grad_norm": 0.6524051051139869, + "learning_rate": 9.466007736111846e-06, + "loss": 0.1262, + "step": 609 + }, + { + "epoch": 0.17, + "grad_norm": 0.6188530913371063, + "learning_rate": 9.463925356485313e-06, + "loss": 0.139, + "step": 610 + }, + { + "epoch": 0.17, + "grad_norm": 0.5035072039530757, + "learning_rate": 9.46183915445043e-06, + "loss": 0.1044, + "step": 611 + }, + { + "epoch": 0.17, + "grad_norm": 0.41982057323405875, + "learning_rate": 9.459749131793588e-06, + "loss": 0.1046, + "step": 612 + }, + { + "epoch": 0.18, + "grad_norm": 0.4670070943276553, + "learning_rate": 9.457655290304449e-06, + "loss": 0.0803, + "step": 613 + }, + { + "epoch": 0.18, + "grad_norm": 0.5615915375373741, + "learning_rate": 9.455557631775946e-06, + "loss": 0.1195, + "step": 614 + }, + { + "epoch": 0.18, + "grad_norm": 0.4503642746310023, + "learning_rate": 9.45345615800428e-06, + "loss": 0.0769, + "step": 615 + }, + { + "epoch": 0.18, + "grad_norm": 0.36719871467858317, + "learning_rate": 9.451350870788922e-06, + "loss": 0.0987, + "step": 616 + }, + { + "epoch": 0.18, + "grad_norm": 0.3667326749266243, + "learning_rate": 9.4492417719326e-06, + "loss": 0.0801, + "step": 617 + }, + { + "epoch": 0.18, + "grad_norm": 0.3488566704855922, + "learning_rate": 9.447128863241315e-06, + "loss": 0.1018, + "step": 618 + }, + { + "epoch": 0.18, + "grad_norm": 0.6368301632708415, + "learning_rate": 9.445012146524326e-06, + "loss": 0.1721, + "step": 619 + }, + { + "epoch": 0.18, + "grad_norm": 0.47525937384417927, + "learning_rate": 9.442891623594153e-06, + "loss": 0.1081, + "step": 620 + }, + { + "epoch": 0.18, + "grad_norm": 0.33093487651641074, + "learning_rate": 9.440767296266576e-06, + "loss": 0.0706, + "step": 621 + }, + { + "epoch": 0.18, + "grad_norm": 0.36351252074473267, + "learning_rate": 9.43863916636063e-06, + "loss": 0.0473, + "step": 622 + }, + { + "epoch": 0.18, + "grad_norm": 0.530672538411266, + "learning_rate": 9.436507235698613e-06, + "loss": 0.1207, + "step": 623 + }, + { + "epoch": 0.18, + "grad_norm": 0.7421951703191766, + "learning_rate": 9.434371506106068e-06, + "loss": 0.1246, + "step": 624 + }, + { + "epoch": 0.18, + "grad_norm": 0.5299383321675147, + "learning_rate": 9.432231979411799e-06, + "loss": 0.0752, + "step": 625 + }, + { + "epoch": 0.18, + "grad_norm": 0.5208153217297306, + "learning_rate": 9.430088657447856e-06, + "loss": 0.1453, + "step": 626 + }, + { + "epoch": 0.18, + "grad_norm": 0.4653774048923359, + "learning_rate": 9.427941542049543e-06, + "loss": 0.1156, + "step": 627 + }, + { + "epoch": 0.18, + "grad_norm": 0.43697657357216074, + "learning_rate": 9.42579063505541e-06, + "loss": 0.0883, + "step": 628 + }, + { + "epoch": 0.18, + "grad_norm": 0.6115594859303495, + "learning_rate": 9.423635938307256e-06, + "loss": 0.1428, + "step": 629 + }, + { + "epoch": 0.18, + "grad_norm": 0.41136842035064647, + "learning_rate": 9.421477453650118e-06, + "loss": 0.0984, + "step": 630 + }, + { + "epoch": 0.18, + "grad_norm": 0.4929570243812377, + "learning_rate": 9.419315182932287e-06, + "loss": 0.0927, + "step": 631 + }, + { + "epoch": 0.18, + "grad_norm": 0.5358411779767442, + "learning_rate": 9.417149128005288e-06, + "loss": 0.0679, + "step": 632 + }, + { + "epoch": 0.18, + "grad_norm": 0.5663438385482606, + "learning_rate": 9.414979290723891e-06, + "loss": 0.1458, + "step": 633 + }, + { + "epoch": 0.18, + "grad_norm": 0.3177486118184364, + "learning_rate": 9.412805672946102e-06, + "loss": 0.055, + "step": 634 + }, + { + "epoch": 0.18, + "grad_norm": 0.5447180871590981, + "learning_rate": 9.410628276533163e-06, + "loss": 0.098, + "step": 635 + }, + { + "epoch": 0.18, + "grad_norm": 0.48325418160280365, + "learning_rate": 9.408447103349556e-06, + "loss": 0.0826, + "step": 636 + }, + { + "epoch": 0.18, + "grad_norm": 0.5790689085859455, + "learning_rate": 9.406262155262995e-06, + "loss": 0.0737, + "step": 637 + }, + { + "epoch": 0.18, + "grad_norm": 0.4039519034914288, + "learning_rate": 9.404073434144424e-06, + "loss": 0.0809, + "step": 638 + }, + { + "epoch": 0.18, + "grad_norm": 0.4461995600952947, + "learning_rate": 9.401880941868024e-06, + "loss": 0.1006, + "step": 639 + }, + { + "epoch": 0.18, + "grad_norm": 0.45319261228159363, + "learning_rate": 9.399684680311197e-06, + "loss": 0.1137, + "step": 640 + }, + { + "epoch": 0.18, + "grad_norm": 0.7312794169976998, + "learning_rate": 9.397484651354578e-06, + "loss": 0.0816, + "step": 641 + }, + { + "epoch": 0.18, + "grad_norm": 0.42358036696294604, + "learning_rate": 9.395280856882026e-06, + "loss": 0.1044, + "step": 642 + }, + { + "epoch": 0.18, + "grad_norm": 0.7109159286986856, + "learning_rate": 9.393073298780627e-06, + "loss": 0.104, + "step": 643 + }, + { + "epoch": 0.18, + "grad_norm": 1.2283321580262665, + "learning_rate": 9.390861978940687e-06, + "loss": 0.058, + "step": 644 + }, + { + "epoch": 0.18, + "grad_norm": 0.5407706797610025, + "learning_rate": 9.388646899255733e-06, + "loss": 0.0865, + "step": 645 + }, + { + "epoch": 0.18, + "grad_norm": 0.5305289253000061, + "learning_rate": 9.386428061622513e-06, + "loss": 0.1291, + "step": 646 + }, + { + "epoch": 0.18, + "grad_norm": 0.4608935788913227, + "learning_rate": 9.384205467940993e-06, + "loss": 0.1044, + "step": 647 + }, + { + "epoch": 0.19, + "grad_norm": 0.4187809752103644, + "learning_rate": 9.381979120114354e-06, + "loss": 0.0986, + "step": 648 + }, + { + "epoch": 0.19, + "grad_norm": 1.40910352465299, + "learning_rate": 9.379749020048992e-06, + "loss": 0.1548, + "step": 649 + }, + { + "epoch": 0.19, + "grad_norm": 0.4904574292595726, + "learning_rate": 9.377515169654518e-06, + "loss": 0.0779, + "step": 650 + }, + { + "epoch": 0.19, + "grad_norm": 0.27826549346693685, + "learning_rate": 9.37527757084375e-06, + "loss": 0.0335, + "step": 651 + }, + { + "epoch": 0.19, + "grad_norm": 0.8832023529321562, + "learning_rate": 9.373036225532719e-06, + "loss": 0.0866, + "step": 652 + }, + { + "epoch": 0.19, + "grad_norm": 0.8272570531872325, + "learning_rate": 9.370791135640665e-06, + "loss": 0.1265, + "step": 653 + }, + { + "epoch": 0.19, + "grad_norm": 0.683414814615958, + "learning_rate": 9.368542303090032e-06, + "loss": 0.1107, + "step": 654 + }, + { + "epoch": 0.19, + "grad_norm": 0.4537591785209198, + "learning_rate": 9.366289729806468e-06, + "loss": 0.0909, + "step": 655 + }, + { + "epoch": 0.19, + "grad_norm": 0.3611325743188263, + "learning_rate": 9.36403341771883e-06, + "loss": 0.0776, + "step": 656 + }, + { + "epoch": 0.19, + "grad_norm": 0.7715225557983743, + "learning_rate": 9.361773368759167e-06, + "loss": 0.1438, + "step": 657 + }, + { + "epoch": 0.19, + "grad_norm": 0.8229300546981989, + "learning_rate": 9.359509584862735e-06, + "loss": 0.0597, + "step": 658 + }, + { + "epoch": 0.19, + "grad_norm": 0.39645132593923904, + "learning_rate": 9.35724206796799e-06, + "loss": 0.0917, + "step": 659 + }, + { + "epoch": 0.19, + "grad_norm": 0.7286126262325562, + "learning_rate": 9.354970820016576e-06, + "loss": 0.0756, + "step": 660 + }, + { + "epoch": 0.19, + "grad_norm": 0.5107501028216207, + "learning_rate": 9.35269584295334e-06, + "loss": 0.0908, + "step": 661 + }, + { + "epoch": 0.19, + "grad_norm": 0.5590486582264846, + "learning_rate": 9.350417138726315e-06, + "loss": 0.1172, + "step": 662 + }, + { + "epoch": 0.19, + "grad_norm": 1.0684748553819996, + "learning_rate": 9.348134709286732e-06, + "loss": 0.1464, + "step": 663 + }, + { + "epoch": 0.19, + "grad_norm": 0.308743027902976, + "learning_rate": 9.345848556589011e-06, + "loss": 0.0602, + "step": 664 + }, + { + "epoch": 0.19, + "grad_norm": 0.556289940922841, + "learning_rate": 9.343558682590757e-06, + "loss": 0.1205, + "step": 665 + }, + { + "epoch": 0.19, + "grad_norm": 0.4636388866468853, + "learning_rate": 9.34126508925276e-06, + "loss": 0.1141, + "step": 666 + }, + { + "epoch": 0.19, + "grad_norm": 0.2961531246516484, + "learning_rate": 9.338967778539003e-06, + "loss": 0.0587, + "step": 667 + }, + { + "epoch": 0.19, + "grad_norm": 0.6908136801369276, + "learning_rate": 9.336666752416643e-06, + "loss": 0.1248, + "step": 668 + }, + { + "epoch": 0.19, + "grad_norm": 0.7189385091006617, + "learning_rate": 9.334362012856025e-06, + "loss": 0.0899, + "step": 669 + }, + { + "epoch": 0.19, + "grad_norm": 0.34565430257336976, + "learning_rate": 9.332053561830669e-06, + "loss": 0.0779, + "step": 670 + }, + { + "epoch": 0.19, + "grad_norm": 0.4313753350170771, + "learning_rate": 9.329741401317275e-06, + "loss": 0.0497, + "step": 671 + }, + { + "epoch": 0.19, + "grad_norm": 0.5036708096897136, + "learning_rate": 9.327425533295725e-06, + "loss": 0.078, + "step": 672 + }, + { + "epoch": 0.19, + "grad_norm": 0.6683806773452055, + "learning_rate": 9.325105959749062e-06, + "loss": 0.1216, + "step": 673 + }, + { + "epoch": 0.19, + "grad_norm": 0.6964085263385951, + "learning_rate": 9.322782682663516e-06, + "loss": 0.1461, + "step": 674 + }, + { + "epoch": 0.19, + "grad_norm": 0.306454261021034, + "learning_rate": 9.320455704028482e-06, + "loss": 0.0757, + "step": 675 + }, + { + "epoch": 0.19, + "grad_norm": 0.7014155264403084, + "learning_rate": 9.318125025836524e-06, + "loss": 0.1415, + "step": 676 + }, + { + "epoch": 0.19, + "grad_norm": 0.3516675920524297, + "learning_rate": 9.315790650083376e-06, + "loss": 0.0648, + "step": 677 + }, + { + "epoch": 0.19, + "grad_norm": 0.20928109827686087, + "learning_rate": 9.313452578767937e-06, + "loss": 0.0372, + "step": 678 + }, + { + "epoch": 0.19, + "grad_norm": 0.5061408737680055, + "learning_rate": 9.31111081389227e-06, + "loss": 0.0905, + "step": 679 + }, + { + "epoch": 0.19, + "grad_norm": 0.5234568909313362, + "learning_rate": 9.308765357461604e-06, + "loss": 0.0942, + "step": 680 + }, + { + "epoch": 0.19, + "grad_norm": 0.5961880907562055, + "learning_rate": 9.306416211484323e-06, + "loss": 0.0715, + "step": 681 + }, + { + "epoch": 0.19, + "grad_norm": 0.4436182651534206, + "learning_rate": 9.304063377971978e-06, + "loss": 0.1086, + "step": 682 + }, + { + "epoch": 0.2, + "grad_norm": 0.8996073654251279, + "learning_rate": 9.30170685893927e-06, + "loss": 0.1333, + "step": 683 + }, + { + "epoch": 0.2, + "grad_norm": 0.46052297409844706, + "learning_rate": 9.299346656404061e-06, + "loss": 0.0797, + "step": 684 + }, + { + "epoch": 0.2, + "grad_norm": 0.6021166938690489, + "learning_rate": 9.296982772387366e-06, + "loss": 0.1053, + "step": 685 + }, + { + "epoch": 0.2, + "grad_norm": 0.2674255269406825, + "learning_rate": 9.29461520891335e-06, + "loss": 0.0663, + "step": 686 + }, + { + "epoch": 0.2, + "grad_norm": 0.3045359359469913, + "learning_rate": 9.292243968009332e-06, + "loss": 0.0763, + "step": 687 + }, + { + "epoch": 0.2, + "grad_norm": 0.5834508891312872, + "learning_rate": 9.289869051705777e-06, + "loss": 0.1361, + "step": 688 + }, + { + "epoch": 0.2, + "grad_norm": 0.6611334537227191, + "learning_rate": 9.287490462036301e-06, + "loss": 0.1085, + "step": 689 + }, + { + "epoch": 0.2, + "grad_norm": 0.5636213905563565, + "learning_rate": 9.285108201037663e-06, + "loss": 0.0558, + "step": 690 + }, + { + "epoch": 0.2, + "grad_norm": 0.7057820765338262, + "learning_rate": 9.282722270749764e-06, + "loss": 0.107, + "step": 691 + }, + { + "epoch": 0.2, + "grad_norm": 0.8620729301203439, + "learning_rate": 9.280332673215651e-06, + "loss": 0.1444, + "step": 692 + }, + { + "epoch": 0.2, + "grad_norm": 0.6183703632004979, + "learning_rate": 9.277939410481507e-06, + "loss": 0.099, + "step": 693 + }, + { + "epoch": 0.2, + "grad_norm": 0.7057134143581547, + "learning_rate": 9.275542484596658e-06, + "loss": 0.1041, + "step": 694 + }, + { + "epoch": 0.2, + "grad_norm": 0.2628334930423759, + "learning_rate": 9.27314189761356e-06, + "loss": 0.0642, + "step": 695 + }, + { + "epoch": 0.2, + "grad_norm": 0.6088153193058466, + "learning_rate": 9.270737651587813e-06, + "loss": 0.1295, + "step": 696 + }, + { + "epoch": 0.2, + "grad_norm": 0.744151412836345, + "learning_rate": 9.268329748578144e-06, + "loss": 0.1621, + "step": 697 + }, + { + "epoch": 0.2, + "grad_norm": 0.5618375618090972, + "learning_rate": 9.265918190646413e-06, + "loss": 0.1311, + "step": 698 + }, + { + "epoch": 0.2, + "grad_norm": 0.6350495934711705, + "learning_rate": 9.263502979857608e-06, + "loss": 0.1249, + "step": 699 + }, + { + "epoch": 0.2, + "grad_norm": 0.7429219334181577, + "learning_rate": 9.261084118279846e-06, + "loss": 0.1236, + "step": 700 + }, + { + "epoch": 0.2, + "grad_norm": 0.6834825143828912, + "learning_rate": 9.258661607984374e-06, + "loss": 0.1622, + "step": 701 + }, + { + "epoch": 0.2, + "grad_norm": 0.4665496520433196, + "learning_rate": 9.256235451045558e-06, + "loss": 0.0687, + "step": 702 + }, + { + "epoch": 0.2, + "grad_norm": 0.43041703878012194, + "learning_rate": 9.253805649540888e-06, + "loss": 0.0915, + "step": 703 + }, + { + "epoch": 0.2, + "grad_norm": 0.4323839035861413, + "learning_rate": 9.251372205550975e-06, + "loss": 0.0915, + "step": 704 + }, + { + "epoch": 0.2, + "grad_norm": 0.5734425170958729, + "learning_rate": 9.248935121159552e-06, + "loss": 0.1654, + "step": 705 + }, + { + "epoch": 0.2, + "grad_norm": 0.37270001001552855, + "learning_rate": 9.246494398453462e-06, + "loss": 0.0889, + "step": 706 + }, + { + "epoch": 0.2, + "grad_norm": 0.4087403745286263, + "learning_rate": 9.244050039522673e-06, + "loss": 0.1161, + "step": 707 + }, + { + "epoch": 0.2, + "grad_norm": 0.37248115377091445, + "learning_rate": 9.241602046460259e-06, + "loss": 0.0912, + "step": 708 + }, + { + "epoch": 0.2, + "grad_norm": 0.3926233305358984, + "learning_rate": 9.239150421362409e-06, + "loss": 0.0778, + "step": 709 + }, + { + "epoch": 0.2, + "grad_norm": 0.43737638808273627, + "learning_rate": 9.23669516632842e-06, + "loss": 0.0643, + "step": 710 + }, + { + "epoch": 0.2, + "grad_norm": 0.684772974774738, + "learning_rate": 9.2342362834607e-06, + "loss": 0.1126, + "step": 711 + }, + { + "epoch": 0.2, + "grad_norm": 0.5215146223231149, + "learning_rate": 9.231773774864764e-06, + "loss": 0.1385, + "step": 712 + }, + { + "epoch": 0.2, + "grad_norm": 0.5365368655609294, + "learning_rate": 9.229307642649227e-06, + "loss": 0.0933, + "step": 713 + }, + { + "epoch": 0.2, + "grad_norm": 0.4725361435553599, + "learning_rate": 9.226837888925813e-06, + "loss": 0.0765, + "step": 714 + }, + { + "epoch": 0.2, + "grad_norm": 0.3904654870756912, + "learning_rate": 9.224364515809344e-06, + "loss": 0.103, + "step": 715 + }, + { + "epoch": 0.2, + "grad_norm": 1.1742846534420257, + "learning_rate": 9.221887525417737e-06, + "loss": 0.1431, + "step": 716 + }, + { + "epoch": 0.2, + "grad_norm": 0.7353839708273526, + "learning_rate": 9.219406919872014e-06, + "loss": 0.1401, + "step": 717 + }, + { + "epoch": 0.21, + "grad_norm": 0.3523709264724253, + "learning_rate": 9.216922701296289e-06, + "loss": 0.0586, + "step": 718 + }, + { + "epoch": 0.21, + "grad_norm": 0.3411766976346362, + "learning_rate": 9.214434871817769e-06, + "loss": 0.0769, + "step": 719 + }, + { + "epoch": 0.21, + "grad_norm": 1.2009156501089908, + "learning_rate": 9.211943433566755e-06, + "loss": 0.1282, + "step": 720 + }, + { + "epoch": 0.21, + "grad_norm": 0.5582439527709884, + "learning_rate": 9.209448388676636e-06, + "loss": 0.1446, + "step": 721 + }, + { + "epoch": 0.21, + "grad_norm": 0.4252995547914209, + "learning_rate": 9.20694973928389e-06, + "loss": 0.0921, + "step": 722 + }, + { + "epoch": 0.21, + "grad_norm": 0.37333649499305366, + "learning_rate": 9.204447487528085e-06, + "loss": 0.0922, + "step": 723 + }, + { + "epoch": 0.21, + "grad_norm": 0.3364452080789254, + "learning_rate": 9.20194163555187e-06, + "loss": 0.0688, + "step": 724 + }, + { + "epoch": 0.21, + "grad_norm": 0.841864169024913, + "learning_rate": 9.199432185500972e-06, + "loss": 0.1456, + "step": 725 + }, + { + "epoch": 0.21, + "grad_norm": 0.6276528959560425, + "learning_rate": 9.196919139524213e-06, + "loss": 0.0969, + "step": 726 + }, + { + "epoch": 0.21, + "grad_norm": 0.6504575615211694, + "learning_rate": 9.194402499773478e-06, + "loss": 0.1332, + "step": 727 + }, + { + "epoch": 0.21, + "grad_norm": 1.189817192486516, + "learning_rate": 9.191882268403743e-06, + "loss": 0.1409, + "step": 728 + }, + { + "epoch": 0.21, + "grad_norm": 0.5374540414308913, + "learning_rate": 9.189358447573047e-06, + "loss": 0.1292, + "step": 729 + }, + { + "epoch": 0.21, + "grad_norm": 0.3552774265718138, + "learning_rate": 9.186831039442514e-06, + "loss": 0.0677, + "step": 730 + }, + { + "epoch": 0.21, + "grad_norm": 0.6988232672750744, + "learning_rate": 9.184300046176333e-06, + "loss": 0.1037, + "step": 731 + }, + { + "epoch": 0.21, + "grad_norm": 0.5337454702787745, + "learning_rate": 9.181765469941765e-06, + "loss": 0.1033, + "step": 732 + }, + { + "epoch": 0.21, + "grad_norm": 0.5276214220760512, + "learning_rate": 9.17922731290914e-06, + "loss": 0.1027, + "step": 733 + }, + { + "epoch": 0.21, + "grad_norm": 0.35890532134151637, + "learning_rate": 9.176685577251851e-06, + "loss": 0.0808, + "step": 734 + }, + { + "epoch": 0.21, + "grad_norm": 0.4825929864132039, + "learning_rate": 9.174140265146356e-06, + "loss": 0.1101, + "step": 735 + }, + { + "epoch": 0.21, + "grad_norm": 0.7291551338337156, + "learning_rate": 9.17159137877218e-06, + "loss": 0.17, + "step": 736 + }, + { + "epoch": 0.21, + "grad_norm": 0.750892141102736, + "learning_rate": 9.169038920311904e-06, + "loss": 0.1176, + "step": 737 + }, + { + "epoch": 0.21, + "grad_norm": 0.6081493782398585, + "learning_rate": 9.166482891951167e-06, + "loss": 0.1273, + "step": 738 + }, + { + "epoch": 0.21, + "grad_norm": 0.7329853233032924, + "learning_rate": 9.163923295878671e-06, + "loss": 0.1187, + "step": 739 + }, + { + "epoch": 0.21, + "grad_norm": 0.4699516253473333, + "learning_rate": 9.161360134286166e-06, + "loss": 0.0968, + "step": 740 + }, + { + "epoch": 0.21, + "grad_norm": 0.3577997147083906, + "learning_rate": 9.158793409368457e-06, + "loss": 0.0828, + "step": 741 + }, + { + "epoch": 0.21, + "grad_norm": 0.5160848593239924, + "learning_rate": 9.156223123323405e-06, + "loss": 0.1055, + "step": 742 + }, + { + "epoch": 0.21, + "grad_norm": 0.5661886219563604, + "learning_rate": 9.153649278351912e-06, + "loss": 0.1159, + "step": 743 + }, + { + "epoch": 0.21, + "grad_norm": 0.5506370893870068, + "learning_rate": 9.151071876657938e-06, + "loss": 0.0952, + "step": 744 + }, + { + "epoch": 0.21, + "grad_norm": 0.49667306715332216, + "learning_rate": 9.148490920448476e-06, + "loss": 0.1312, + "step": 745 + }, + { + "epoch": 0.21, + "grad_norm": 0.6028460999658366, + "learning_rate": 9.145906411933576e-06, + "loss": 0.1018, + "step": 746 + }, + { + "epoch": 0.21, + "grad_norm": 0.7536199812633009, + "learning_rate": 9.143318353326316e-06, + "loss": 0.1419, + "step": 747 + }, + { + "epoch": 0.21, + "grad_norm": 0.3783696079417029, + "learning_rate": 9.140726746842827e-06, + "loss": 0.0511, + "step": 748 + }, + { + "epoch": 0.21, + "grad_norm": 0.4995011041289576, + "learning_rate": 9.13813159470227e-06, + "loss": 0.1112, + "step": 749 + }, + { + "epoch": 0.21, + "grad_norm": 0.5643956899537229, + "learning_rate": 9.135532899126844e-06, + "loss": 0.1254, + "step": 750 + }, + { + "epoch": 0.21, + "grad_norm": 0.42022233300605943, + "learning_rate": 9.132930662341783e-06, + "loss": 0.0708, + "step": 751 + }, + { + "epoch": 0.21, + "grad_norm": 0.5380086554905054, + "learning_rate": 9.130324886575351e-06, + "loss": 0.123, + "step": 752 + }, + { + "epoch": 0.22, + "grad_norm": 0.4400408465782521, + "learning_rate": 9.127715574058847e-06, + "loss": 0.0716, + "step": 753 + }, + { + "epoch": 0.22, + "grad_norm": 0.37534544197208, + "learning_rate": 9.125102727026592e-06, + "loss": 0.1099, + "step": 754 + }, + { + "epoch": 0.22, + "grad_norm": 0.6991202727796607, + "learning_rate": 9.122486347715937e-06, + "loss": 0.1367, + "step": 755 + }, + { + "epoch": 0.22, + "grad_norm": 0.41246221387968546, + "learning_rate": 9.119866438367263e-06, + "loss": 0.0825, + "step": 756 + }, + { + "epoch": 0.22, + "grad_norm": 0.2983856613021006, + "learning_rate": 9.117243001223963e-06, + "loss": 0.0793, + "step": 757 + }, + { + "epoch": 0.22, + "grad_norm": 0.4286379751579008, + "learning_rate": 9.11461603853246e-06, + "loss": 0.0718, + "step": 758 + }, + { + "epoch": 0.22, + "grad_norm": 0.30316378745638267, + "learning_rate": 9.111985552542188e-06, + "loss": 0.0703, + "step": 759 + }, + { + "epoch": 0.22, + "grad_norm": 0.36592788858390773, + "learning_rate": 9.109351545505607e-06, + "loss": 0.0647, + "step": 760 + }, + { + "epoch": 0.22, + "grad_norm": 0.5676181326297737, + "learning_rate": 9.106714019678185e-06, + "loss": 0.0779, + "step": 761 + }, + { + "epoch": 0.22, + "grad_norm": 0.4835641579608436, + "learning_rate": 9.104072977318403e-06, + "loss": 0.0641, + "step": 762 + }, + { + "epoch": 0.22, + "grad_norm": 0.5508660668429857, + "learning_rate": 9.101428420687759e-06, + "loss": 0.0954, + "step": 763 + }, + { + "epoch": 0.22, + "grad_norm": 0.36934478548767896, + "learning_rate": 9.098780352050756e-06, + "loss": 0.083, + "step": 764 + }, + { + "epoch": 0.22, + "grad_norm": 0.8650730665456191, + "learning_rate": 9.096128773674902e-06, + "loss": 0.1289, + "step": 765 + }, + { + "epoch": 0.22, + "grad_norm": 0.48078309658654167, + "learning_rate": 9.093473687830718e-06, + "loss": 0.1137, + "step": 766 + }, + { + "epoch": 0.22, + "grad_norm": 0.556726383146375, + "learning_rate": 9.090815096791719e-06, + "loss": 0.1399, + "step": 767 + }, + { + "epoch": 0.22, + "grad_norm": 0.5062090329432174, + "learning_rate": 9.08815300283443e-06, + "loss": 0.1002, + "step": 768 + }, + { + "epoch": 0.22, + "grad_norm": 0.3857594535941769, + "learning_rate": 9.08548740823837e-06, + "loss": 0.0979, + "step": 769 + }, + { + "epoch": 0.22, + "grad_norm": 0.42728429643783955, + "learning_rate": 9.082818315286054e-06, + "loss": 0.0873, + "step": 770 + }, + { + "epoch": 0.22, + "grad_norm": 0.4544981939951406, + "learning_rate": 9.080145726263003e-06, + "loss": 0.0838, + "step": 771 + }, + { + "epoch": 0.22, + "grad_norm": 0.6013625857886737, + "learning_rate": 9.077469643457719e-06, + "loss": 0.1116, + "step": 772 + }, + { + "epoch": 0.22, + "grad_norm": 0.6236641276359599, + "learning_rate": 9.074790069161703e-06, + "loss": 0.1359, + "step": 773 + }, + { + "epoch": 0.22, + "grad_norm": 0.5043621989493956, + "learning_rate": 9.072107005669444e-06, + "loss": 0.1426, + "step": 774 + }, + { + "epoch": 0.22, + "grad_norm": 0.4141078190743088, + "learning_rate": 9.069420455278418e-06, + "loss": 0.1124, + "step": 775 + }, + { + "epoch": 0.22, + "grad_norm": 0.3135133375118733, + "learning_rate": 9.066730420289088e-06, + "loss": 0.0566, + "step": 776 + }, + { + "epoch": 0.22, + "grad_norm": 0.34710548595849217, + "learning_rate": 9.0640369030049e-06, + "loss": 0.0884, + "step": 777 + }, + { + "epoch": 0.22, + "grad_norm": 0.5349865562937722, + "learning_rate": 9.061339905732282e-06, + "loss": 0.1306, + "step": 778 + }, + { + "epoch": 0.22, + "grad_norm": 0.5997926521709435, + "learning_rate": 9.058639430780642e-06, + "loss": 0.1752, + "step": 779 + }, + { + "epoch": 0.22, + "grad_norm": 0.3936916454698406, + "learning_rate": 9.055935480462366e-06, + "loss": 0.0914, + "step": 780 + }, + { + "epoch": 0.22, + "grad_norm": 0.3228969309279008, + "learning_rate": 9.053228057092817e-06, + "loss": 0.0814, + "step": 781 + }, + { + "epoch": 0.22, + "grad_norm": 0.24771035108866218, + "learning_rate": 9.050517162990329e-06, + "loss": 0.0531, + "step": 782 + }, + { + "epoch": 0.22, + "grad_norm": 0.47770744193018433, + "learning_rate": 9.04780280047621e-06, + "loss": 0.1322, + "step": 783 + }, + { + "epoch": 0.22, + "grad_norm": 0.5372195573790007, + "learning_rate": 9.045084971874738e-06, + "loss": 0.0865, + "step": 784 + }, + { + "epoch": 0.22, + "grad_norm": 0.8220141399078558, + "learning_rate": 9.042363679513158e-06, + "loss": 0.1439, + "step": 785 + }, + { + "epoch": 0.22, + "grad_norm": 0.5064666522255065, + "learning_rate": 9.039638925721683e-06, + "loss": 0.1297, + "step": 786 + }, + { + "epoch": 0.22, + "grad_norm": 0.6144017014891907, + "learning_rate": 9.036910712833488e-06, + "loss": 0.1163, + "step": 787 + }, + { + "epoch": 0.23, + "grad_norm": 0.4727152509436765, + "learning_rate": 9.03417904318471e-06, + "loss": 0.1106, + "step": 788 + }, + { + "epoch": 0.23, + "grad_norm": 0.5433509786648136, + "learning_rate": 9.031443919114447e-06, + "loss": 0.1118, + "step": 789 + }, + { + "epoch": 0.23, + "grad_norm": 0.6010335333394724, + "learning_rate": 9.028705342964752e-06, + "loss": 0.139, + "step": 790 + }, + { + "epoch": 0.23, + "grad_norm": 0.32402803259529556, + "learning_rate": 9.025963317080641e-06, + "loss": 0.0852, + "step": 791 + }, + { + "epoch": 0.23, + "grad_norm": 0.5616724571525001, + "learning_rate": 9.023217843810078e-06, + "loss": 0.1097, + "step": 792 + }, + { + "epoch": 0.23, + "grad_norm": 0.4277656347530322, + "learning_rate": 9.02046892550398e-06, + "loss": 0.0922, + "step": 793 + }, + { + "epoch": 0.23, + "grad_norm": 0.5865243436306229, + "learning_rate": 9.017716564516213e-06, + "loss": 0.0731, + "step": 794 + }, + { + "epoch": 0.23, + "grad_norm": 0.389559883434402, + "learning_rate": 9.014960763203592e-06, + "loss": 0.0986, + "step": 795 + }, + { + "epoch": 0.23, + "grad_norm": 0.5123520630529207, + "learning_rate": 9.012201523925883e-06, + "loss": 0.083, + "step": 796 + }, + { + "epoch": 0.23, + "grad_norm": 0.6005092454859025, + "learning_rate": 9.009438849045787e-06, + "loss": 0.1422, + "step": 797 + }, + { + "epoch": 0.23, + "grad_norm": 0.7752913470234684, + "learning_rate": 9.006672740928952e-06, + "loss": 0.1155, + "step": 798 + }, + { + "epoch": 0.23, + "grad_norm": 0.30730440396061093, + "learning_rate": 9.003903201943963e-06, + "loss": 0.0668, + "step": 799 + }, + { + "epoch": 0.23, + "grad_norm": 0.3973499918699411, + "learning_rate": 9.001130234462348e-06, + "loss": 0.1166, + "step": 800 + }, + { + "epoch": 0.23, + "grad_norm": 0.43049515189178245, + "learning_rate": 8.998353840858565e-06, + "loss": 0.0933, + "step": 801 + }, + { + "epoch": 0.23, + "grad_norm": 0.6827994479548175, + "learning_rate": 8.995574023510007e-06, + "loss": 0.119, + "step": 802 + }, + { + "epoch": 0.23, + "grad_norm": 0.48279321530860336, + "learning_rate": 8.992790784797004e-06, + "loss": 0.0967, + "step": 803 + }, + { + "epoch": 0.23, + "grad_norm": 0.5756268706713004, + "learning_rate": 8.99000412710281e-06, + "loss": 0.094, + "step": 804 + }, + { + "epoch": 0.23, + "grad_norm": 0.5182683702974817, + "learning_rate": 8.987214052813605e-06, + "loss": 0.1155, + "step": 805 + }, + { + "epoch": 0.23, + "grad_norm": 0.42356209279728074, + "learning_rate": 8.984420564318501e-06, + "loss": 0.0938, + "step": 806 + }, + { + "epoch": 0.23, + "grad_norm": 1.069788771610959, + "learning_rate": 8.98162366400953e-06, + "loss": 0.1668, + "step": 807 + }, + { + "epoch": 0.23, + "grad_norm": 0.4550416977253712, + "learning_rate": 8.978823354281645e-06, + "loss": 0.1007, + "step": 808 + }, + { + "epoch": 0.23, + "grad_norm": 0.7524921906573384, + "learning_rate": 8.97601963753272e-06, + "loss": 0.119, + "step": 809 + }, + { + "epoch": 0.23, + "grad_norm": 0.32381408525164634, + "learning_rate": 8.973212516163545e-06, + "loss": 0.0755, + "step": 810 + }, + { + "epoch": 0.23, + "grad_norm": 0.7182312122919299, + "learning_rate": 8.970401992577828e-06, + "loss": 0.091, + "step": 811 + }, + { + "epoch": 0.23, + "grad_norm": 0.505126853830724, + "learning_rate": 8.967588069182184e-06, + "loss": 0.0636, + "step": 812 + }, + { + "epoch": 0.23, + "grad_norm": 0.6576481777177007, + "learning_rate": 8.964770748386149e-06, + "loss": 0.1209, + "step": 813 + }, + { + "epoch": 0.23, + "grad_norm": 7.692326398992742, + "learning_rate": 8.961950032602157e-06, + "loss": 0.2793, + "step": 814 + }, + { + "epoch": 0.23, + "grad_norm": 0.6519662848792224, + "learning_rate": 8.959125924245559e-06, + "loss": 0.1369, + "step": 815 + }, + { + "epoch": 0.23, + "grad_norm": 0.6731876604424843, + "learning_rate": 8.956298425734606e-06, + "loss": 0.113, + "step": 816 + }, + { + "epoch": 0.23, + "grad_norm": 0.631265442700046, + "learning_rate": 8.95346753949045e-06, + "loss": 0.1409, + "step": 817 + }, + { + "epoch": 0.23, + "grad_norm": 0.4359598787752724, + "learning_rate": 8.95063326793715e-06, + "loss": 0.0561, + "step": 818 + }, + { + "epoch": 0.23, + "grad_norm": 1.5325726032748053, + "learning_rate": 8.947795613501658e-06, + "loss": 0.1352, + "step": 819 + }, + { + "epoch": 0.23, + "grad_norm": 0.36788253328153797, + "learning_rate": 8.944954578613826e-06, + "loss": 0.1112, + "step": 820 + }, + { + "epoch": 0.23, + "grad_norm": 0.5167892458757593, + "learning_rate": 8.942110165706402e-06, + "loss": 0.0984, + "step": 821 + }, + { + "epoch": 0.23, + "grad_norm": 0.5279357960312524, + "learning_rate": 8.939262377215019e-06, + "loss": 0.1245, + "step": 822 + }, + { + "epoch": 0.24, + "grad_norm": 0.3902659964321454, + "learning_rate": 8.936411215578211e-06, + "loss": 0.097, + "step": 823 + }, + { + "epoch": 0.24, + "grad_norm": 0.4741580517118749, + "learning_rate": 8.933556683237392e-06, + "loss": 0.0967, + "step": 824 + }, + { + "epoch": 0.24, + "grad_norm": 0.5879684966050115, + "learning_rate": 8.930698782636868e-06, + "loss": 0.125, + "step": 825 + }, + { + "epoch": 0.24, + "grad_norm": 0.295025135184848, + "learning_rate": 8.927837516223824e-06, + "loss": 0.0856, + "step": 826 + }, + { + "epoch": 0.24, + "grad_norm": 0.8211083633805236, + "learning_rate": 8.924972886448331e-06, + "loss": 0.1354, + "step": 827 + }, + { + "epoch": 0.24, + "grad_norm": 0.4820327564594707, + "learning_rate": 8.922104895763339e-06, + "loss": 0.1181, + "step": 828 + }, + { + "epoch": 0.24, + "grad_norm": 0.36098422965878196, + "learning_rate": 8.919233546624677e-06, + "loss": 0.0807, + "step": 829 + }, + { + "epoch": 0.24, + "grad_norm": 0.3319217140421602, + "learning_rate": 8.916358841491046e-06, + "loss": 0.0769, + "step": 830 + }, + { + "epoch": 0.24, + "grad_norm": 0.40274034029099054, + "learning_rate": 8.913480782824025e-06, + "loss": 0.1067, + "step": 831 + }, + { + "epoch": 0.24, + "grad_norm": 0.4767912541059932, + "learning_rate": 8.91059937308806e-06, + "loss": 0.1207, + "step": 832 + }, + { + "epoch": 0.24, + "grad_norm": 0.4908848791522766, + "learning_rate": 8.907714614750473e-06, + "loss": 0.0925, + "step": 833 + }, + { + "epoch": 0.24, + "grad_norm": 0.5120088266126959, + "learning_rate": 8.904826510281448e-06, + "loss": 0.1213, + "step": 834 + }, + { + "epoch": 0.24, + "grad_norm": 0.5826862448715896, + "learning_rate": 8.901935062154035e-06, + "loss": 0.1064, + "step": 835 + }, + { + "epoch": 0.24, + "grad_norm": 0.45695283672301107, + "learning_rate": 8.899040272844149e-06, + "loss": 0.0915, + "step": 836 + }, + { + "epoch": 0.24, + "grad_norm": 0.38119870235578857, + "learning_rate": 8.896142144830565e-06, + "loss": 0.0889, + "step": 837 + }, + { + "epoch": 0.24, + "grad_norm": 0.5054610878270641, + "learning_rate": 8.893240680594916e-06, + "loss": 0.1025, + "step": 838 + }, + { + "epoch": 0.24, + "grad_norm": 0.44381009273730515, + "learning_rate": 8.890335882621697e-06, + "loss": 0.0896, + "step": 839 + }, + { + "epoch": 0.24, + "grad_norm": 0.34372743762996427, + "learning_rate": 8.887427753398249e-06, + "loss": 0.0924, + "step": 840 + }, + { + "epoch": 0.24, + "grad_norm": 0.588425547513867, + "learning_rate": 8.88451629541477e-06, + "loss": 0.1044, + "step": 841 + }, + { + "epoch": 0.24, + "grad_norm": 0.45866727011355307, + "learning_rate": 8.881601511164306e-06, + "loss": 0.0641, + "step": 842 + }, + { + "epoch": 0.24, + "grad_norm": 0.5856834534904065, + "learning_rate": 8.87868340314276e-06, + "loss": 0.091, + "step": 843 + }, + { + "epoch": 0.24, + "grad_norm": 0.5321123695622559, + "learning_rate": 8.87576197384887e-06, + "loss": 0.0782, + "step": 844 + }, + { + "epoch": 0.24, + "grad_norm": 0.379052329623936, + "learning_rate": 8.872837225784227e-06, + "loss": 0.0861, + "step": 845 + }, + { + "epoch": 0.24, + "grad_norm": 0.5868727341308448, + "learning_rate": 8.869909161453254e-06, + "loss": 0.1258, + "step": 846 + }, + { + "epoch": 0.24, + "grad_norm": 0.3688506033382923, + "learning_rate": 8.866977783363219e-06, + "loss": 0.0621, + "step": 847 + }, + { + "epoch": 0.24, + "grad_norm": 0.4016791149564159, + "learning_rate": 8.864043094024233e-06, + "loss": 0.0849, + "step": 848 + }, + { + "epoch": 0.24, + "grad_norm": 0.43053240748250243, + "learning_rate": 8.861105095949234e-06, + "loss": 0.0921, + "step": 849 + }, + { + "epoch": 0.24, + "grad_norm": 0.629742603869501, + "learning_rate": 8.858163791653994e-06, + "loss": 0.1201, + "step": 850 + }, + { + "epoch": 0.24, + "grad_norm": 0.43952511190365295, + "learning_rate": 8.855219183657122e-06, + "loss": 0.079, + "step": 851 + }, + { + "epoch": 0.24, + "grad_norm": 0.5248966692778887, + "learning_rate": 8.85227127448005e-06, + "loss": 0.1004, + "step": 852 + }, + { + "epoch": 0.24, + "grad_norm": 0.4758828808847174, + "learning_rate": 8.84932006664704e-06, + "loss": 0.117, + "step": 853 + }, + { + "epoch": 0.24, + "grad_norm": 0.36892765193767757, + "learning_rate": 8.846365562685178e-06, + "loss": 0.0968, + "step": 854 + }, + { + "epoch": 0.24, + "grad_norm": 0.5009491205345755, + "learning_rate": 8.84340776512437e-06, + "loss": 0.1131, + "step": 855 + }, + { + "epoch": 0.24, + "grad_norm": 0.9089159097110372, + "learning_rate": 8.840446676497344e-06, + "loss": 0.1165, + "step": 856 + }, + { + "epoch": 0.24, + "grad_norm": 0.4161456090989337, + "learning_rate": 8.837482299339652e-06, + "loss": 0.1069, + "step": 857 + }, + { + "epoch": 0.25, + "grad_norm": 1.0317820640217963, + "learning_rate": 8.83451463618965e-06, + "loss": 0.0809, + "step": 858 + }, + { + "epoch": 0.25, + "grad_norm": 0.5800774784418494, + "learning_rate": 8.831543689588517e-06, + "loss": 0.1071, + "step": 859 + }, + { + "epoch": 0.25, + "grad_norm": 0.4007033123605079, + "learning_rate": 8.82856946208024e-06, + "loss": 0.0969, + "step": 860 + }, + { + "epoch": 0.25, + "grad_norm": 0.3946324231265691, + "learning_rate": 8.825591956211614e-06, + "loss": 0.084, + "step": 861 + }, + { + "epoch": 0.25, + "grad_norm": 0.5111307803372946, + "learning_rate": 8.822611174532248e-06, + "loss": 0.1191, + "step": 862 + }, + { + "epoch": 0.25, + "grad_norm": 0.6711085048466602, + "learning_rate": 8.819627119594549e-06, + "loss": 0.11, + "step": 863 + }, + { + "epoch": 0.25, + "grad_norm": 0.6188807241146488, + "learning_rate": 8.816639793953727e-06, + "loss": 0.0849, + "step": 864 + }, + { + "epoch": 0.25, + "grad_norm": 0.6968074134324962, + "learning_rate": 8.8136492001678e-06, + "loss": 0.1275, + "step": 865 + }, + { + "epoch": 0.25, + "grad_norm": 0.5287857662766221, + "learning_rate": 8.810655340797574e-06, + "loss": 0.0807, + "step": 866 + }, + { + "epoch": 0.25, + "grad_norm": 0.3558440872750076, + "learning_rate": 8.807658218406658e-06, + "loss": 0.0755, + "step": 867 + }, + { + "epoch": 0.25, + "grad_norm": 0.3331007464909322, + "learning_rate": 8.804657835561456e-06, + "loss": 0.1036, + "step": 868 + }, + { + "epoch": 0.25, + "grad_norm": 0.4497950761853012, + "learning_rate": 8.801654194831159e-06, + "loss": 0.0757, + "step": 869 + }, + { + "epoch": 0.25, + "grad_norm": 0.5087848866630862, + "learning_rate": 8.798647298787754e-06, + "loss": 0.0893, + "step": 870 + }, + { + "epoch": 0.25, + "grad_norm": 0.45894665768788273, + "learning_rate": 8.795637150006007e-06, + "loss": 0.1201, + "step": 871 + }, + { + "epoch": 0.25, + "grad_norm": 0.354858690530361, + "learning_rate": 8.792623751063476e-06, + "loss": 0.0733, + "step": 872 + }, + { + "epoch": 0.25, + "grad_norm": 0.6719833618670081, + "learning_rate": 8.789607104540502e-06, + "loss": 0.16, + "step": 873 + }, + { + "epoch": 0.25, + "grad_norm": 0.44781858187738477, + "learning_rate": 8.786587213020202e-06, + "loss": 0.0757, + "step": 874 + }, + { + "epoch": 0.25, + "grad_norm": 0.45570009334354245, + "learning_rate": 8.783564079088478e-06, + "loss": 0.0983, + "step": 875 + }, + { + "epoch": 0.25, + "grad_norm": 0.5766552732083019, + "learning_rate": 8.780537705334e-06, + "loss": 0.0833, + "step": 876 + }, + { + "epoch": 0.25, + "grad_norm": 0.3320289562900106, + "learning_rate": 8.777508094348222e-06, + "loss": 0.0785, + "step": 877 + }, + { + "epoch": 0.25, + "grad_norm": 0.6086385199833315, + "learning_rate": 8.774475248725366e-06, + "loss": 0.1006, + "step": 878 + }, + { + "epoch": 0.25, + "grad_norm": 0.48025884615280756, + "learning_rate": 8.771439171062417e-06, + "loss": 0.1355, + "step": 879 + }, + { + "epoch": 0.25, + "grad_norm": 0.4580674740736033, + "learning_rate": 8.76839986395914e-06, + "loss": 0.1165, + "step": 880 + }, + { + "epoch": 0.25, + "grad_norm": 0.3298155358437963, + "learning_rate": 8.765357330018056e-06, + "loss": 0.0939, + "step": 881 + }, + { + "epoch": 0.25, + "grad_norm": 0.3922558669302778, + "learning_rate": 8.762311571844453e-06, + "loss": 0.1046, + "step": 882 + }, + { + "epoch": 0.25, + "grad_norm": 0.7558459488933367, + "learning_rate": 8.759262592046378e-06, + "loss": 0.0952, + "step": 883 + }, + { + "epoch": 0.25, + "grad_norm": 0.4258296794792277, + "learning_rate": 8.756210393234636e-06, + "loss": 0.0799, + "step": 884 + }, + { + "epoch": 0.25, + "grad_norm": 0.3555262486853882, + "learning_rate": 8.753154978022795e-06, + "loss": 0.0826, + "step": 885 + }, + { + "epoch": 0.25, + "grad_norm": 0.7015025200179268, + "learning_rate": 8.75009634902717e-06, + "loss": 0.108, + "step": 886 + }, + { + "epoch": 0.25, + "grad_norm": 0.7530615840579308, + "learning_rate": 8.747034508866828e-06, + "loss": 0.21, + "step": 887 + }, + { + "epoch": 0.25, + "grad_norm": 0.5315933652038956, + "learning_rate": 8.74396946016359e-06, + "loss": 0.0863, + "step": 888 + }, + { + "epoch": 0.25, + "grad_norm": 1.0945944899723976, + "learning_rate": 8.74090120554202e-06, + "loss": 0.0902, + "step": 889 + }, + { + "epoch": 0.25, + "grad_norm": 0.5683446077431457, + "learning_rate": 8.737829747629432e-06, + "loss": 0.0809, + "step": 890 + }, + { + "epoch": 0.25, + "grad_norm": 0.29614996842243607, + "learning_rate": 8.73475508905588e-06, + "loss": 0.0735, + "step": 891 + }, + { + "epoch": 0.25, + "grad_norm": 0.46585249998082157, + "learning_rate": 8.731677232454159e-06, + "loss": 0.0861, + "step": 892 + }, + { + "epoch": 0.26, + "grad_norm": 0.46383857614209745, + "learning_rate": 8.728596180459799e-06, + "loss": 0.0833, + "step": 893 + }, + { + "epoch": 0.26, + "grad_norm": 0.387209190084108, + "learning_rate": 8.725511935711074e-06, + "loss": 0.0739, + "step": 894 + }, + { + "epoch": 0.26, + "grad_norm": 0.5959494360000862, + "learning_rate": 8.722424500848988e-06, + "loss": 0.1125, + "step": 895 + }, + { + "epoch": 0.26, + "grad_norm": 0.4229318299158801, + "learning_rate": 8.719333878517274e-06, + "loss": 0.117, + "step": 896 + }, + { + "epoch": 0.26, + "grad_norm": 0.4931293812890095, + "learning_rate": 8.716240071362394e-06, + "loss": 0.1337, + "step": 897 + }, + { + "epoch": 0.26, + "grad_norm": 0.34618356814511986, + "learning_rate": 8.713143082033546e-06, + "loss": 0.0839, + "step": 898 + }, + { + "epoch": 0.26, + "grad_norm": 0.7007740491097011, + "learning_rate": 8.710042913182642e-06, + "loss": 0.1235, + "step": 899 + }, + { + "epoch": 0.26, + "grad_norm": 0.39627229141713144, + "learning_rate": 8.706939567464322e-06, + "loss": 0.0882, + "step": 900 + }, + { + "epoch": 0.26, + "grad_norm": 0.44060741286425664, + "learning_rate": 8.703833047535946e-06, + "loss": 0.1002, + "step": 901 + }, + { + "epoch": 0.26, + "grad_norm": 0.3488303699913442, + "learning_rate": 8.700723356057593e-06, + "loss": 0.0698, + "step": 902 + }, + { + "epoch": 0.26, + "grad_norm": 0.5795997255716634, + "learning_rate": 8.697610495692055e-06, + "loss": 0.1081, + "step": 903 + }, + { + "epoch": 0.26, + "grad_norm": 0.6542499486577853, + "learning_rate": 8.694494469104839e-06, + "loss": 0.0741, + "step": 904 + }, + { + "epoch": 0.26, + "grad_norm": 0.4638422976582731, + "learning_rate": 8.691375278964161e-06, + "loss": 0.0892, + "step": 905 + }, + { + "epoch": 0.26, + "grad_norm": 0.263318870009058, + "learning_rate": 8.688252927940951e-06, + "loss": 0.0715, + "step": 906 + }, + { + "epoch": 0.26, + "grad_norm": 0.3974683063077847, + "learning_rate": 8.685127418708843e-06, + "loss": 0.0891, + "step": 907 + }, + { + "epoch": 0.26, + "grad_norm": 0.49445402902436825, + "learning_rate": 8.68199875394417e-06, + "loss": 0.077, + "step": 908 + }, + { + "epoch": 0.26, + "grad_norm": 0.4449436932278595, + "learning_rate": 8.678866936325978e-06, + "loss": 0.0768, + "step": 909 + }, + { + "epoch": 0.26, + "grad_norm": 0.4215841710611692, + "learning_rate": 8.675731968536004e-06, + "loss": 0.0832, + "step": 910 + }, + { + "epoch": 0.26, + "grad_norm": 0.407905652057663, + "learning_rate": 8.672593853258683e-06, + "loss": 0.0913, + "step": 911 + }, + { + "epoch": 0.26, + "grad_norm": 0.574925733644576, + "learning_rate": 8.66945259318115e-06, + "loss": 0.093, + "step": 912 + }, + { + "epoch": 0.26, + "grad_norm": 0.6442804473503869, + "learning_rate": 8.66630819099323e-06, + "loss": 0.0873, + "step": 913 + }, + { + "epoch": 0.26, + "grad_norm": 0.7524676718124926, + "learning_rate": 8.663160649387437e-06, + "loss": 0.1657, + "step": 914 + }, + { + "epoch": 0.26, + "grad_norm": 0.34435275238634266, + "learning_rate": 8.660009971058977e-06, + "loss": 0.0915, + "step": 915 + }, + { + "epoch": 0.26, + "grad_norm": 0.35403476880377843, + "learning_rate": 8.656856158705739e-06, + "loss": 0.0709, + "step": 916 + }, + { + "epoch": 0.26, + "grad_norm": 0.382013326900908, + "learning_rate": 8.653699215028298e-06, + "loss": 0.0887, + "step": 917 + }, + { + "epoch": 0.26, + "grad_norm": 0.2969627627438944, + "learning_rate": 8.650539142729906e-06, + "loss": 0.0568, + "step": 918 + }, + { + "epoch": 0.26, + "grad_norm": 0.23465454383079634, + "learning_rate": 8.647375944516498e-06, + "loss": 0.0311, + "step": 919 + }, + { + "epoch": 0.26, + "grad_norm": 0.5186825503251482, + "learning_rate": 8.644209623096686e-06, + "loss": 0.1222, + "step": 920 + }, + { + "epoch": 0.26, + "grad_norm": 0.4625313033137186, + "learning_rate": 8.641040181181755e-06, + "loss": 0.0929, + "step": 921 + }, + { + "epoch": 0.26, + "grad_norm": 0.5628437161261859, + "learning_rate": 8.637867621485658e-06, + "loss": 0.104, + "step": 922 + }, + { + "epoch": 0.26, + "grad_norm": 0.39877521396518395, + "learning_rate": 8.634691946725026e-06, + "loss": 0.083, + "step": 923 + }, + { + "epoch": 0.26, + "grad_norm": 0.4711420645294162, + "learning_rate": 8.63151315961915e-06, + "loss": 0.0914, + "step": 924 + }, + { + "epoch": 0.26, + "grad_norm": 0.42262596072091024, + "learning_rate": 8.628331262889992e-06, + "loss": 0.0926, + "step": 925 + }, + { + "epoch": 0.26, + "grad_norm": 0.2583481971720481, + "learning_rate": 8.625146259262171e-06, + "loss": 0.0497, + "step": 926 + }, + { + "epoch": 0.26, + "grad_norm": 0.42162082330689993, + "learning_rate": 8.621958151462972e-06, + "loss": 0.1033, + "step": 927 + }, + { + "epoch": 0.27, + "grad_norm": 0.39189691045759195, + "learning_rate": 8.618766942222334e-06, + "loss": 0.0963, + "step": 928 + }, + { + "epoch": 0.27, + "grad_norm": 0.5795328923387679, + "learning_rate": 8.615572634272853e-06, + "loss": 0.0996, + "step": 929 + }, + { + "epoch": 0.27, + "grad_norm": 0.5871640470001664, + "learning_rate": 8.612375230349779e-06, + "loss": 0.0997, + "step": 930 + }, + { + "epoch": 0.27, + "grad_norm": 1.3740854127694093, + "learning_rate": 8.609174733191012e-06, + "loss": 0.1078, + "step": 931 + }, + { + "epoch": 0.27, + "grad_norm": 0.3675860631682876, + "learning_rate": 8.6059711455371e-06, + "loss": 0.0919, + "step": 932 + }, + { + "epoch": 0.27, + "grad_norm": 0.8034782523140173, + "learning_rate": 8.602764470131241e-06, + "loss": 0.115, + "step": 933 + }, + { + "epoch": 0.27, + "grad_norm": 0.5684175011157698, + "learning_rate": 8.599554709719273e-06, + "loss": 0.0553, + "step": 934 + }, + { + "epoch": 0.27, + "grad_norm": 0.4621484777944867, + "learning_rate": 8.596341867049677e-06, + "loss": 0.0608, + "step": 935 + }, + { + "epoch": 0.27, + "grad_norm": 0.7909439828218208, + "learning_rate": 8.593125944873575e-06, + "loss": 0.1247, + "step": 936 + }, + { + "epoch": 0.27, + "grad_norm": 0.4000895031285387, + "learning_rate": 8.589906945944722e-06, + "loss": 0.0717, + "step": 937 + }, + { + "epoch": 0.27, + "grad_norm": 0.5060977475727799, + "learning_rate": 8.586684873019513e-06, + "loss": 0.119, + "step": 938 + }, + { + "epoch": 0.27, + "grad_norm": 0.449585847206327, + "learning_rate": 8.583459728856972e-06, + "loss": 0.0956, + "step": 939 + }, + { + "epoch": 0.27, + "grad_norm": 0.6216744445303553, + "learning_rate": 8.58023151621875e-06, + "loss": 0.1282, + "step": 940 + }, + { + "epoch": 0.27, + "grad_norm": 0.36326462305814344, + "learning_rate": 8.577000237869131e-06, + "loss": 0.0674, + "step": 941 + }, + { + "epoch": 0.27, + "grad_norm": 0.4122286497124454, + "learning_rate": 8.573765896575022e-06, + "loss": 0.072, + "step": 942 + }, + { + "epoch": 0.27, + "grad_norm": 0.36702351763564395, + "learning_rate": 8.570528495105952e-06, + "loss": 0.0956, + "step": 943 + }, + { + "epoch": 0.27, + "grad_norm": 0.3055341362571203, + "learning_rate": 8.567288036234071e-06, + "loss": 0.0782, + "step": 944 + }, + { + "epoch": 0.27, + "grad_norm": 0.5808103442156552, + "learning_rate": 8.564044522734147e-06, + "loss": 0.087, + "step": 945 + }, + { + "epoch": 0.27, + "grad_norm": 0.3718400606813192, + "learning_rate": 8.560797957383564e-06, + "loss": 0.0889, + "step": 946 + }, + { + "epoch": 0.27, + "grad_norm": 1.228691714152912, + "learning_rate": 8.557548342962318e-06, + "loss": 0.1751, + "step": 947 + }, + { + "epoch": 0.27, + "grad_norm": 0.30425561951883184, + "learning_rate": 8.554295682253016e-06, + "loss": 0.0844, + "step": 948 + }, + { + "epoch": 0.27, + "grad_norm": 0.5269394360838388, + "learning_rate": 8.551039978040877e-06, + "loss": 0.1292, + "step": 949 + }, + { + "epoch": 0.27, + "grad_norm": 0.3307915158146362, + "learning_rate": 8.54778123311372e-06, + "loss": 0.0721, + "step": 950 + }, + { + "epoch": 0.27, + "grad_norm": 0.6568645637523918, + "learning_rate": 8.544519450261976e-06, + "loss": 0.0465, + "step": 951 + }, + { + "epoch": 0.27, + "grad_norm": 0.3854610769587849, + "learning_rate": 8.541254632278667e-06, + "loss": 0.1248, + "step": 952 + }, + { + "epoch": 0.27, + "grad_norm": 0.44736870573530385, + "learning_rate": 8.537986781959423e-06, + "loss": 0.1002, + "step": 953 + }, + { + "epoch": 0.27, + "grad_norm": 0.4998194788485879, + "learning_rate": 8.534715902102463e-06, + "loss": 0.0828, + "step": 954 + }, + { + "epoch": 0.27, + "grad_norm": 0.5464549611667345, + "learning_rate": 8.531441995508609e-06, + "loss": 0.1342, + "step": 955 + }, + { + "epoch": 0.27, + "grad_norm": 0.7232068112164314, + "learning_rate": 8.528165064981266e-06, + "loss": 0.0999, + "step": 956 + }, + { + "epoch": 0.27, + "grad_norm": 0.29674152937097625, + "learning_rate": 8.524885113326435e-06, + "loss": 0.0752, + "step": 957 + }, + { + "epoch": 0.27, + "grad_norm": 0.4164296072770713, + "learning_rate": 8.5216021433527e-06, + "loss": 0.0838, + "step": 958 + }, + { + "epoch": 0.27, + "grad_norm": 0.43165949526934533, + "learning_rate": 8.518316157871232e-06, + "loss": 0.1017, + "step": 959 + }, + { + "epoch": 0.27, + "grad_norm": 0.659632022701993, + "learning_rate": 8.515027159695781e-06, + "loss": 0.1027, + "step": 960 + }, + { + "epoch": 0.27, + "grad_norm": 0.4494197565037416, + "learning_rate": 8.511735151642678e-06, + "loss": 0.1264, + "step": 961 + }, + { + "epoch": 0.27, + "grad_norm": 0.6056482137264181, + "learning_rate": 8.508440136530833e-06, + "loss": 0.1275, + "step": 962 + }, + { + "epoch": 0.28, + "grad_norm": 0.6873658026634043, + "learning_rate": 8.505142117181732e-06, + "loss": 0.0999, + "step": 963 + }, + { + "epoch": 0.28, + "grad_norm": 0.6179202466779846, + "learning_rate": 8.501841096419428e-06, + "loss": 0.0961, + "step": 964 + }, + { + "epoch": 0.28, + "grad_norm": 0.31766812424786744, + "learning_rate": 8.498537077070548e-06, + "loss": 0.0805, + "step": 965 + }, + { + "epoch": 0.28, + "grad_norm": 0.47004606745251126, + "learning_rate": 8.495230061964289e-06, + "loss": 0.0889, + "step": 966 + }, + { + "epoch": 0.28, + "grad_norm": 0.38399650573413024, + "learning_rate": 8.491920053932406e-06, + "loss": 0.0606, + "step": 967 + }, + { + "epoch": 0.28, + "grad_norm": 0.5825205963634896, + "learning_rate": 8.488607055809223e-06, + "loss": 0.0706, + "step": 968 + }, + { + "epoch": 0.28, + "grad_norm": 0.45989814660217077, + "learning_rate": 8.485291070431625e-06, + "loss": 0.0948, + "step": 969 + }, + { + "epoch": 0.28, + "grad_norm": 0.6746008813092472, + "learning_rate": 8.481972100639049e-06, + "loss": 0.1472, + "step": 970 + }, + { + "epoch": 0.28, + "grad_norm": 0.3041188595130515, + "learning_rate": 8.478650149273493e-06, + "loss": 0.0709, + "step": 971 + }, + { + "epoch": 0.28, + "grad_norm": 0.4066723941499203, + "learning_rate": 8.475325219179502e-06, + "loss": 0.0808, + "step": 972 + }, + { + "epoch": 0.28, + "grad_norm": 0.4273194677897243, + "learning_rate": 8.471997313204183e-06, + "loss": 0.0861, + "step": 973 + }, + { + "epoch": 0.28, + "grad_norm": 0.3656756351013927, + "learning_rate": 8.468666434197177e-06, + "loss": 0.0818, + "step": 974 + }, + { + "epoch": 0.28, + "grad_norm": 0.7699240643770021, + "learning_rate": 8.465332585010682e-06, + "loss": 0.0904, + "step": 975 + }, + { + "epoch": 0.28, + "grad_norm": 0.49151957054172846, + "learning_rate": 8.461995768499433e-06, + "loss": 0.071, + "step": 976 + }, + { + "epoch": 0.28, + "grad_norm": 0.586020044524259, + "learning_rate": 8.458655987520708e-06, + "loss": 0.1369, + "step": 977 + }, + { + "epoch": 0.28, + "grad_norm": 0.36776011299451683, + "learning_rate": 8.455313244934324e-06, + "loss": 0.0748, + "step": 978 + }, + { + "epoch": 0.28, + "grad_norm": 0.7619473882290455, + "learning_rate": 8.451967543602635e-06, + "loss": 0.1337, + "step": 979 + }, + { + "epoch": 0.28, + "grad_norm": 0.5303405477984192, + "learning_rate": 8.448618886390523e-06, + "loss": 0.0969, + "step": 980 + }, + { + "epoch": 0.28, + "grad_norm": 0.31130010552918846, + "learning_rate": 8.445267276165406e-06, + "loss": 0.0563, + "step": 981 + }, + { + "epoch": 0.28, + "grad_norm": 0.7283887396492364, + "learning_rate": 8.441912715797231e-06, + "loss": 0.1205, + "step": 982 + }, + { + "epoch": 0.28, + "grad_norm": 0.5454777059466462, + "learning_rate": 8.43855520815847e-06, + "loss": 0.141, + "step": 983 + }, + { + "epoch": 0.28, + "grad_norm": 0.49972607460894874, + "learning_rate": 8.435194756124118e-06, + "loss": 0.1073, + "step": 984 + }, + { + "epoch": 0.28, + "grad_norm": 0.43957090825164086, + "learning_rate": 8.431831362571692e-06, + "loss": 0.127, + "step": 985 + }, + { + "epoch": 0.28, + "grad_norm": 0.3446840899666005, + "learning_rate": 8.428465030381227e-06, + "loss": 0.0793, + "step": 986 + }, + { + "epoch": 0.28, + "grad_norm": 0.5881703597812649, + "learning_rate": 8.425095762435274e-06, + "loss": 0.0844, + "step": 987 + }, + { + "epoch": 0.28, + "grad_norm": 0.5494944307253974, + "learning_rate": 8.4217235616189e-06, + "loss": 0.0808, + "step": 988 + }, + { + "epoch": 0.28, + "grad_norm": 0.7162758642553052, + "learning_rate": 8.418348430819681e-06, + "loss": 0.1203, + "step": 989 + }, + { + "epoch": 0.28, + "grad_norm": 0.39597458879945174, + "learning_rate": 8.414970372927705e-06, + "loss": 0.0744, + "step": 990 + }, + { + "epoch": 0.28, + "grad_norm": 0.38118073928129964, + "learning_rate": 8.411589390835561e-06, + "loss": 0.0963, + "step": 991 + }, + { + "epoch": 0.28, + "grad_norm": 0.48131735200581544, + "learning_rate": 8.408205487438351e-06, + "loss": 0.105, + "step": 992 + }, + { + "epoch": 0.28, + "grad_norm": 0.6537539054016474, + "learning_rate": 8.404818665633666e-06, + "loss": 0.0946, + "step": 993 + }, + { + "epoch": 0.28, + "grad_norm": 0.6881627111782112, + "learning_rate": 8.401428928321607e-06, + "loss": 0.108, + "step": 994 + }, + { + "epoch": 0.28, + "grad_norm": 0.6937707564738863, + "learning_rate": 8.398036278404768e-06, + "loss": 0.074, + "step": 995 + }, + { + "epoch": 0.28, + "grad_norm": 0.42510209933061105, + "learning_rate": 8.394640718788234e-06, + "loss": 0.076, + "step": 996 + }, + { + "epoch": 0.28, + "grad_norm": 0.39062853225418204, + "learning_rate": 8.391242252379586e-06, + "loss": 0.0945, + "step": 997 + }, + { + "epoch": 0.29, + "grad_norm": 0.47000992816828807, + "learning_rate": 8.387840882088889e-06, + "loss": 0.0949, + "step": 998 + }, + { + "epoch": 0.29, + "grad_norm": 0.3630042614344615, + "learning_rate": 8.384436610828702e-06, + "loss": 0.0894, + "step": 999 + }, + { + "epoch": 0.29, + "grad_norm": 0.4473125165778053, + "learning_rate": 8.38102944151406e-06, + "loss": 0.0729, + "step": 1000 + }, + { + "epoch": 0.29, + "grad_norm": 0.2925190783459522, + "learning_rate": 8.377619377062483e-06, + "loss": 0.0498, + "step": 1001 + }, + { + "epoch": 0.29, + "grad_norm": 0.39742597520065365, + "learning_rate": 8.374206420393973e-06, + "loss": 0.0623, + "step": 1002 + }, + { + "epoch": 0.29, + "grad_norm": 0.4478276878530916, + "learning_rate": 8.370790574431005e-06, + "loss": 0.1058, + "step": 1003 + }, + { + "epoch": 0.29, + "grad_norm": 0.42328523998064277, + "learning_rate": 8.367371842098528e-06, + "loss": 0.1258, + "step": 1004 + }, + { + "epoch": 0.29, + "grad_norm": 1.5080160093202588, + "learning_rate": 8.363950226323963e-06, + "loss": 0.1033, + "step": 1005 + }, + { + "epoch": 0.29, + "grad_norm": 0.33864954751696186, + "learning_rate": 8.360525730037201e-06, + "loss": 0.0551, + "step": 1006 + }, + { + "epoch": 0.29, + "grad_norm": 0.36966845092822054, + "learning_rate": 8.357098356170603e-06, + "loss": 0.1034, + "step": 1007 + }, + { + "epoch": 0.29, + "grad_norm": 0.8101625761239845, + "learning_rate": 8.353668107658984e-06, + "loss": 0.1378, + "step": 1008 + }, + { + "epoch": 0.29, + "grad_norm": 0.36101811088061997, + "learning_rate": 8.35023498743963e-06, + "loss": 0.0811, + "step": 1009 + }, + { + "epoch": 0.29, + "grad_norm": 0.44657333797588306, + "learning_rate": 8.346798998452283e-06, + "loss": 0.087, + "step": 1010 + }, + { + "epoch": 0.29, + "grad_norm": 0.48977663941340943, + "learning_rate": 8.343360143639138e-06, + "loss": 0.0931, + "step": 1011 + }, + { + "epoch": 0.29, + "grad_norm": 0.5098982142975803, + "learning_rate": 8.339918425944851e-06, + "loss": 0.0953, + "step": 1012 + }, + { + "epoch": 0.29, + "grad_norm": 0.3262565161128692, + "learning_rate": 8.336473848316524e-06, + "loss": 0.0631, + "step": 1013 + }, + { + "epoch": 0.29, + "grad_norm": 0.35421779112164864, + "learning_rate": 8.33302641370371e-06, + "loss": 0.0977, + "step": 1014 + }, + { + "epoch": 0.29, + "grad_norm": 0.2869545712405884, + "learning_rate": 8.329576125058406e-06, + "loss": 0.0639, + "step": 1015 + }, + { + "epoch": 0.29, + "grad_norm": 0.20287072416268473, + "learning_rate": 8.326122985335057e-06, + "loss": 0.0685, + "step": 1016 + }, + { + "epoch": 0.29, + "grad_norm": 1.7583154745559284, + "learning_rate": 8.322666997490547e-06, + "loss": 0.0875, + "step": 1017 + }, + { + "epoch": 0.29, + "grad_norm": 0.4885199266295902, + "learning_rate": 8.319208164484197e-06, + "loss": 0.1041, + "step": 1018 + }, + { + "epoch": 0.29, + "grad_norm": 0.4043930830511361, + "learning_rate": 8.315746489277769e-06, + "loss": 0.1157, + "step": 1019 + }, + { + "epoch": 0.29, + "grad_norm": 0.648302343309106, + "learning_rate": 8.312281974835452e-06, + "loss": 0.1407, + "step": 1020 + }, + { + "epoch": 0.29, + "grad_norm": 0.5627686264207775, + "learning_rate": 8.308814624123875e-06, + "loss": 0.1281, + "step": 1021 + }, + { + "epoch": 0.29, + "grad_norm": 0.7904679232456217, + "learning_rate": 8.305344440112089e-06, + "loss": 0.1918, + "step": 1022 + }, + { + "epoch": 0.29, + "grad_norm": 1.1811965424431194, + "learning_rate": 8.30187142577157e-06, + "loss": 0.1582, + "step": 1023 + }, + { + "epoch": 0.29, + "grad_norm": 0.4097653619564071, + "learning_rate": 8.298395584076225e-06, + "loss": 0.0797, + "step": 1024 + }, + { + "epoch": 0.29, + "grad_norm": 0.5762426286667174, + "learning_rate": 8.294916918002377e-06, + "loss": 0.1297, + "step": 1025 + }, + { + "epoch": 0.29, + "grad_norm": 0.6371882149026293, + "learning_rate": 8.291435430528762e-06, + "loss": 0.1266, + "step": 1026 + }, + { + "epoch": 0.29, + "grad_norm": 0.6461947680286652, + "learning_rate": 8.287951124636546e-06, + "loss": 0.1466, + "step": 1027 + }, + { + "epoch": 0.29, + "grad_norm": 0.33244417086603184, + "learning_rate": 8.284464003309298e-06, + "loss": 0.1034, + "step": 1028 + }, + { + "epoch": 0.29, + "grad_norm": 0.4759563814993332, + "learning_rate": 8.280974069532999e-06, + "loss": 0.1027, + "step": 1029 + }, + { + "epoch": 0.29, + "grad_norm": 0.45943737713144167, + "learning_rate": 8.277481326296039e-06, + "loss": 0.1161, + "step": 1030 + }, + { + "epoch": 0.29, + "grad_norm": 0.29575566208843385, + "learning_rate": 8.273985776589215e-06, + "loss": 0.053, + "step": 1031 + }, + { + "epoch": 0.29, + "grad_norm": 0.4466903877659513, + "learning_rate": 8.270487423405727e-06, + "loss": 0.1189, + "step": 1032 + }, + { + "epoch": 0.3, + "grad_norm": 0.5105463418911009, + "learning_rate": 8.266986269741173e-06, + "loss": 0.0778, + "step": 1033 + }, + { + "epoch": 0.3, + "grad_norm": 0.31254158962842304, + "learning_rate": 8.263482318593553e-06, + "loss": 0.0893, + "step": 1034 + }, + { + "epoch": 0.3, + "grad_norm": 0.8478579221449796, + "learning_rate": 8.259975572963257e-06, + "loss": 0.123, + "step": 1035 + }, + { + "epoch": 0.3, + "grad_norm": 0.36709753626424013, + "learning_rate": 8.256466035853077e-06, + "loss": 0.0497, + "step": 1036 + }, + { + "epoch": 0.3, + "grad_norm": 0.6835638168250348, + "learning_rate": 8.252953710268185e-06, + "loss": 0.0725, + "step": 1037 + }, + { + "epoch": 0.3, + "grad_norm": 0.4340829342289024, + "learning_rate": 8.249438599216149e-06, + "loss": 0.0809, + "step": 1038 + }, + { + "epoch": 0.3, + "grad_norm": 0.36138727045471125, + "learning_rate": 8.245920705706913e-06, + "loss": 0.0889, + "step": 1039 + }, + { + "epoch": 0.3, + "grad_norm": 0.2993361892304179, + "learning_rate": 8.242400032752813e-06, + "loss": 0.0526, + "step": 1040 + }, + { + "epoch": 0.3, + "grad_norm": 0.5094376855780399, + "learning_rate": 8.238876583368563e-06, + "loss": 0.1006, + "step": 1041 + }, + { + "epoch": 0.3, + "grad_norm": 0.5634589100474235, + "learning_rate": 8.235350360571249e-06, + "loss": 0.1157, + "step": 1042 + }, + { + "epoch": 0.3, + "grad_norm": 0.47421182884319807, + "learning_rate": 8.231821367380335e-06, + "loss": 0.1418, + "step": 1043 + }, + { + "epoch": 0.3, + "grad_norm": 0.3645236056670461, + "learning_rate": 8.228289606817658e-06, + "loss": 0.0614, + "step": 1044 + }, + { + "epoch": 0.3, + "grad_norm": 0.4875507064892114, + "learning_rate": 8.224755081907427e-06, + "loss": 0.0763, + "step": 1045 + }, + { + "epoch": 0.3, + "grad_norm": 0.7407249078045226, + "learning_rate": 8.221217795676213e-06, + "loss": 0.1152, + "step": 1046 + }, + { + "epoch": 0.3, + "grad_norm": 0.3649004315172086, + "learning_rate": 8.217677751152954e-06, + "loss": 0.0814, + "step": 1047 + }, + { + "epoch": 0.3, + "grad_norm": 0.536336536250725, + "learning_rate": 8.21413495136895e-06, + "loss": 0.1196, + "step": 1048 + }, + { + "epoch": 0.3, + "grad_norm": 0.2855081985137709, + "learning_rate": 8.21058939935786e-06, + "loss": 0.029, + "step": 1049 + }, + { + "epoch": 0.3, + "grad_norm": 0.2680957073915973, + "learning_rate": 8.207041098155701e-06, + "loss": 0.0786, + "step": 1050 + }, + { + "epoch": 0.3, + "grad_norm": 0.45054152748401677, + "learning_rate": 8.20349005080084e-06, + "loss": 0.1169, + "step": 1051 + }, + { + "epoch": 0.3, + "grad_norm": 0.6139110124025449, + "learning_rate": 8.199936260334e-06, + "loss": 0.1165, + "step": 1052 + }, + { + "epoch": 0.3, + "grad_norm": 0.33867977009829214, + "learning_rate": 8.196379729798252e-06, + "loss": 0.1048, + "step": 1053 + }, + { + "epoch": 0.3, + "grad_norm": 0.37942391245149126, + "learning_rate": 8.192820462239012e-06, + "loss": 0.1012, + "step": 1054 + }, + { + "epoch": 0.3, + "grad_norm": 0.5645099693984074, + "learning_rate": 8.189258460704039e-06, + "loss": 0.1294, + "step": 1055 + }, + { + "epoch": 0.3, + "grad_norm": 0.4853097526050573, + "learning_rate": 8.185693728243435e-06, + "loss": 0.1012, + "step": 1056 + }, + { + "epoch": 0.3, + "grad_norm": 0.9300273392049366, + "learning_rate": 8.182126267909642e-06, + "loss": 0.1265, + "step": 1057 + }, + { + "epoch": 0.3, + "grad_norm": 0.6999893711864746, + "learning_rate": 8.178556082757431e-06, + "loss": 0.1152, + "step": 1058 + }, + { + "epoch": 0.3, + "grad_norm": 0.9138579044020679, + "learning_rate": 8.174983175843915e-06, + "loss": 0.1931, + "step": 1059 + }, + { + "epoch": 0.3, + "grad_norm": 0.41861793606440134, + "learning_rate": 8.171407550228532e-06, + "loss": 0.1189, + "step": 1060 + }, + { + "epoch": 0.3, + "grad_norm": 0.4241523484325529, + "learning_rate": 8.167829208973048e-06, + "loss": 0.0863, + "step": 1061 + }, + { + "epoch": 0.3, + "grad_norm": 0.6830500366547438, + "learning_rate": 8.164248155141557e-06, + "loss": 0.1394, + "step": 1062 + }, + { + "epoch": 0.3, + "grad_norm": 0.4446345716747391, + "learning_rate": 8.160664391800475e-06, + "loss": 0.1029, + "step": 1063 + }, + { + "epoch": 0.3, + "grad_norm": 0.4953977838410171, + "learning_rate": 8.157077922018537e-06, + "loss": 0.1023, + "step": 1064 + }, + { + "epoch": 0.3, + "grad_norm": 0.32633677272271966, + "learning_rate": 8.153488748866795e-06, + "loss": 0.0624, + "step": 1065 + }, + { + "epoch": 0.3, + "grad_norm": 0.8318169681199975, + "learning_rate": 8.14989687541862e-06, + "loss": 0.1282, + "step": 1066 + }, + { + "epoch": 0.3, + "grad_norm": 0.3466502879782476, + "learning_rate": 8.146302304749689e-06, + "loss": 0.0852, + "step": 1067 + }, + { + "epoch": 0.31, + "grad_norm": 0.28109079631108735, + "learning_rate": 8.142705039937994e-06, + "loss": 0.0665, + "step": 1068 + }, + { + "epoch": 0.31, + "grad_norm": 0.7215831667805335, + "learning_rate": 8.139105084063832e-06, + "loss": 0.1144, + "step": 1069 + }, + { + "epoch": 0.31, + "grad_norm": 0.31182985041749733, + "learning_rate": 8.135502440209803e-06, + "loss": 0.0874, + "step": 1070 + }, + { + "epoch": 0.31, + "grad_norm": 0.4850093138487995, + "learning_rate": 8.13189711146081e-06, + "loss": 0.128, + "step": 1071 + }, + { + "epoch": 0.31, + "grad_norm": 0.2797988827907325, + "learning_rate": 8.128289100904056e-06, + "loss": 0.0691, + "step": 1072 + }, + { + "epoch": 0.31, + "grad_norm": 0.8220920414966311, + "learning_rate": 8.124678411629037e-06, + "loss": 0.1486, + "step": 1073 + }, + { + "epoch": 0.31, + "grad_norm": 1.0613437851569856, + "learning_rate": 8.121065046727547e-06, + "loss": 0.1489, + "step": 1074 + }, + { + "epoch": 0.31, + "grad_norm": 0.5075715468063482, + "learning_rate": 8.117449009293668e-06, + "loss": 0.1168, + "step": 1075 + }, + { + "epoch": 0.31, + "grad_norm": 0.8245510726304975, + "learning_rate": 8.113830302423773e-06, + "loss": 0.1081, + "step": 1076 + }, + { + "epoch": 0.31, + "grad_norm": 0.34847096383222875, + "learning_rate": 8.110208929216518e-06, + "loss": 0.0714, + "step": 1077 + }, + { + "epoch": 0.31, + "grad_norm": 0.513751607536443, + "learning_rate": 8.106584892772844e-06, + "loss": 0.1279, + "step": 1078 + }, + { + "epoch": 0.31, + "grad_norm": 0.47401253237123436, + "learning_rate": 8.102958196195972e-06, + "loss": 0.1397, + "step": 1079 + }, + { + "epoch": 0.31, + "grad_norm": 0.40449775116622433, + "learning_rate": 8.0993288425914e-06, + "loss": 0.0794, + "step": 1080 + }, + { + "epoch": 0.31, + "grad_norm": 0.5951573885001263, + "learning_rate": 8.095696835066906e-06, + "loss": 0.15, + "step": 1081 + }, + { + "epoch": 0.31, + "grad_norm": 0.5569632760651496, + "learning_rate": 8.092062176732531e-06, + "loss": 0.1026, + "step": 1082 + }, + { + "epoch": 0.31, + "grad_norm": 0.6521827209880674, + "learning_rate": 8.088424870700595e-06, + "loss": 0.1117, + "step": 1083 + }, + { + "epoch": 0.31, + "grad_norm": 0.6260366045494986, + "learning_rate": 8.084784920085682e-06, + "loss": 0.1141, + "step": 1084 + }, + { + "epoch": 0.31, + "grad_norm": 0.37305490577484035, + "learning_rate": 8.081142328004638e-06, + "loss": 0.0886, + "step": 1085 + }, + { + "epoch": 0.31, + "grad_norm": 0.49597637338839584, + "learning_rate": 8.077497097576573e-06, + "loss": 0.1086, + "step": 1086 + }, + { + "epoch": 0.31, + "grad_norm": 0.8158622365930779, + "learning_rate": 8.073849231922859e-06, + "loss": 0.1661, + "step": 1087 + }, + { + "epoch": 0.31, + "grad_norm": 0.458905363150963, + "learning_rate": 8.070198734167119e-06, + "loss": 0.0989, + "step": 1088 + }, + { + "epoch": 0.31, + "grad_norm": 0.4259627728408611, + "learning_rate": 8.066545607435232e-06, + "loss": 0.1298, + "step": 1089 + }, + { + "epoch": 0.31, + "grad_norm": 0.33877904788872976, + "learning_rate": 8.062889854855334e-06, + "loss": 0.1014, + "step": 1090 + }, + { + "epoch": 0.31, + "grad_norm": 0.26955903561824496, + "learning_rate": 8.059231479557797e-06, + "loss": 0.0526, + "step": 1091 + }, + { + "epoch": 0.31, + "grad_norm": 0.4960593268108397, + "learning_rate": 8.055570484675252e-06, + "loss": 0.1127, + "step": 1092 + }, + { + "epoch": 0.31, + "grad_norm": 0.5296626194371825, + "learning_rate": 8.051906873342563e-06, + "loss": 0.1129, + "step": 1093 + }, + { + "epoch": 0.31, + "grad_norm": 0.24575969028576672, + "learning_rate": 8.048240648696842e-06, + "loss": 0.0442, + "step": 1094 + }, + { + "epoch": 0.31, + "grad_norm": 0.3513992957638845, + "learning_rate": 8.044571813877431e-06, + "loss": 0.0905, + "step": 1095 + }, + { + "epoch": 0.31, + "grad_norm": 0.3203419179094259, + "learning_rate": 8.040900372025916e-06, + "loss": 0.0793, + "step": 1096 + }, + { + "epoch": 0.31, + "grad_norm": 0.4700084002956758, + "learning_rate": 8.03722632628611e-06, + "loss": 0.109, + "step": 1097 + }, + { + "epoch": 0.31, + "grad_norm": 0.2483215033554251, + "learning_rate": 8.033549679804052e-06, + "loss": 0.0561, + "step": 1098 + }, + { + "epoch": 0.31, + "grad_norm": 0.3597357354623874, + "learning_rate": 8.029870435728018e-06, + "loss": 0.0996, + "step": 1099 + }, + { + "epoch": 0.31, + "grad_norm": 0.3000531772010142, + "learning_rate": 8.0261885972085e-06, + "loss": 0.0623, + "step": 1100 + }, + { + "epoch": 0.31, + "grad_norm": 0.41445398585338367, + "learning_rate": 8.022504167398214e-06, + "loss": 0.0942, + "step": 1101 + }, + { + "epoch": 0.31, + "grad_norm": 0.5169367410952866, + "learning_rate": 8.018817149452096e-06, + "loss": 0.0746, + "step": 1102 + }, + { + "epoch": 0.32, + "grad_norm": 0.44796645479944325, + "learning_rate": 8.015127546527299e-06, + "loss": 0.0998, + "step": 1103 + }, + { + "epoch": 0.32, + "grad_norm": 0.44025839130861805, + "learning_rate": 8.011435361783184e-06, + "loss": 0.1114, + "step": 1104 + }, + { + "epoch": 0.32, + "grad_norm": 0.3937437646224052, + "learning_rate": 8.007740598381329e-06, + "loss": 0.0594, + "step": 1105 + }, + { + "epoch": 0.32, + "grad_norm": 0.30715817255303246, + "learning_rate": 8.004043259485519e-06, + "loss": 0.0619, + "step": 1106 + }, + { + "epoch": 0.32, + "grad_norm": 0.3406652493401725, + "learning_rate": 8.000343348261741e-06, + "loss": 0.078, + "step": 1107 + }, + { + "epoch": 0.32, + "grad_norm": 0.3548624704379509, + "learning_rate": 7.996640867878188e-06, + "loss": 0.0767, + "step": 1108 + }, + { + "epoch": 0.32, + "grad_norm": 0.34396569921394493, + "learning_rate": 7.99293582150525e-06, + "loss": 0.0535, + "step": 1109 + }, + { + "epoch": 0.32, + "grad_norm": 0.5047146478790318, + "learning_rate": 7.989228212315516e-06, + "loss": 0.0967, + "step": 1110 + }, + { + "epoch": 0.32, + "grad_norm": 0.19463895010201257, + "learning_rate": 7.985518043483774e-06, + "loss": 0.0522, + "step": 1111 + }, + { + "epoch": 0.32, + "grad_norm": 0.27575738138782074, + "learning_rate": 7.981805318186992e-06, + "loss": 0.0653, + "step": 1112 + }, + { + "epoch": 0.32, + "grad_norm": 0.44116358775158315, + "learning_rate": 7.978090039604342e-06, + "loss": 0.0946, + "step": 1113 + }, + { + "epoch": 0.32, + "grad_norm": 0.30929616820651784, + "learning_rate": 7.974372210917168e-06, + "loss": 0.053, + "step": 1114 + }, + { + "epoch": 0.32, + "grad_norm": 0.47226923600043097, + "learning_rate": 7.970651835309009e-06, + "loss": 0.0934, + "step": 1115 + }, + { + "epoch": 0.32, + "grad_norm": 0.3661054432573276, + "learning_rate": 7.966928915965578e-06, + "loss": 0.0551, + "step": 1116 + }, + { + "epoch": 0.32, + "grad_norm": 0.5489439870804758, + "learning_rate": 7.963203456074767e-06, + "loss": 0.1272, + "step": 1117 + }, + { + "epoch": 0.32, + "grad_norm": 0.3052756891544375, + "learning_rate": 7.959475458826647e-06, + "loss": 0.0638, + "step": 1118 + }, + { + "epoch": 0.32, + "grad_norm": 0.5124683390357987, + "learning_rate": 7.95574492741346e-06, + "loss": 0.108, + "step": 1119 + }, + { + "epoch": 0.32, + "grad_norm": 0.4310133133606615, + "learning_rate": 7.952011865029614e-06, + "loss": 0.0785, + "step": 1120 + }, + { + "epoch": 0.32, + "grad_norm": 0.4294669892547384, + "learning_rate": 7.94827627487169e-06, + "loss": 0.0714, + "step": 1121 + }, + { + "epoch": 0.32, + "grad_norm": 0.40917875591040775, + "learning_rate": 7.944538160138435e-06, + "loss": 0.104, + "step": 1122 + }, + { + "epoch": 0.32, + "grad_norm": 0.36318268664891995, + "learning_rate": 7.940797524030748e-06, + "loss": 0.1028, + "step": 1123 + }, + { + "epoch": 0.32, + "grad_norm": 0.25870197230093644, + "learning_rate": 7.937054369751696e-06, + "loss": 0.0613, + "step": 1124 + }, + { + "epoch": 0.32, + "grad_norm": 0.5492907488895081, + "learning_rate": 7.933308700506497e-06, + "loss": 0.1046, + "step": 1125 + }, + { + "epoch": 0.32, + "grad_norm": 0.33197899047198387, + "learning_rate": 7.929560519502528e-06, + "loss": 0.0656, + "step": 1126 + }, + { + "epoch": 0.32, + "grad_norm": 0.6347481219081285, + "learning_rate": 7.925809829949312e-06, + "loss": 0.0981, + "step": 1127 + }, + { + "epoch": 0.32, + "grad_norm": 0.34912028643554205, + "learning_rate": 7.922056635058522e-06, + "loss": 0.0651, + "step": 1128 + }, + { + "epoch": 0.32, + "grad_norm": 0.5078039159530806, + "learning_rate": 7.918300938043974e-06, + "loss": 0.1305, + "step": 1129 + }, + { + "epoch": 0.32, + "grad_norm": 0.5697315782808061, + "learning_rate": 7.914542742121632e-06, + "loss": 0.1025, + "step": 1130 + }, + { + "epoch": 0.32, + "grad_norm": 0.32530947380410763, + "learning_rate": 7.910782050509596e-06, + "loss": 0.0383, + "step": 1131 + }, + { + "epoch": 0.32, + "grad_norm": 0.32869399919836406, + "learning_rate": 7.9070188664281e-06, + "loss": 0.0593, + "step": 1132 + }, + { + "epoch": 0.32, + "grad_norm": 0.33694692372972257, + "learning_rate": 7.903253193099516e-06, + "loss": 0.0837, + "step": 1133 + }, + { + "epoch": 0.32, + "grad_norm": 0.2904129730568934, + "learning_rate": 7.89948503374835e-06, + "loss": 0.0819, + "step": 1134 + }, + { + "epoch": 0.32, + "grad_norm": 0.49514352849483495, + "learning_rate": 7.895714391601232e-06, + "loss": 0.0783, + "step": 1135 + }, + { + "epoch": 0.32, + "grad_norm": 0.34327959303358785, + "learning_rate": 7.891941269886922e-06, + "loss": 0.0928, + "step": 1136 + }, + { + "epoch": 0.32, + "grad_norm": 0.5075477782555474, + "learning_rate": 7.888165671836297e-06, + "loss": 0.0748, + "step": 1137 + }, + { + "epoch": 0.33, + "grad_norm": 0.3502041423172233, + "learning_rate": 7.884387600682362e-06, + "loss": 0.0789, + "step": 1138 + }, + { + "epoch": 0.33, + "grad_norm": 0.39308755449196536, + "learning_rate": 7.880607059660235e-06, + "loss": 0.0823, + "step": 1139 + }, + { + "epoch": 0.33, + "grad_norm": 0.4244428577656742, + "learning_rate": 7.87682405200715e-06, + "loss": 0.0993, + "step": 1140 + }, + { + "epoch": 0.33, + "grad_norm": 0.43510702450394106, + "learning_rate": 7.873038580962453e-06, + "loss": 0.0891, + "step": 1141 + }, + { + "epoch": 0.33, + "grad_norm": 0.4456295982152556, + "learning_rate": 7.869250649767601e-06, + "loss": 0.1099, + "step": 1142 + }, + { + "epoch": 0.33, + "grad_norm": 0.39182860845353573, + "learning_rate": 7.865460261666155e-06, + "loss": 0.0798, + "step": 1143 + }, + { + "epoch": 0.33, + "grad_norm": 0.5344187460056024, + "learning_rate": 7.861667419903783e-06, + "loss": 0.0985, + "step": 1144 + }, + { + "epoch": 0.33, + "grad_norm": 0.7891369466523669, + "learning_rate": 7.857872127728248e-06, + "loss": 0.0819, + "step": 1145 + }, + { + "epoch": 0.33, + "grad_norm": 0.5215596996530159, + "learning_rate": 7.854074388389421e-06, + "loss": 0.0814, + "step": 1146 + }, + { + "epoch": 0.33, + "grad_norm": 0.46852218611317137, + "learning_rate": 7.850274205139258e-06, + "loss": 0.0667, + "step": 1147 + }, + { + "epoch": 0.33, + "grad_norm": 0.5449716291985394, + "learning_rate": 7.846471581231814e-06, + "loss": 0.0671, + "step": 1148 + }, + { + "epoch": 0.33, + "grad_norm": 0.5031026351604613, + "learning_rate": 7.842666519923235e-06, + "loss": 0.0975, + "step": 1149 + }, + { + "epoch": 0.33, + "grad_norm": 0.37185250654919283, + "learning_rate": 7.838859024471747e-06, + "loss": 0.0683, + "step": 1150 + }, + { + "epoch": 0.33, + "grad_norm": 0.6139521239306095, + "learning_rate": 7.835049098137669e-06, + "loss": 0.1509, + "step": 1151 + }, + { + "epoch": 0.33, + "grad_norm": 0.40880878757373446, + "learning_rate": 7.831236744183395e-06, + "loss": 0.1124, + "step": 1152 + }, + { + "epoch": 0.33, + "grad_norm": 1.2575037307931132, + "learning_rate": 7.827421965873403e-06, + "loss": 0.1537, + "step": 1153 + }, + { + "epoch": 0.33, + "grad_norm": 0.6546122947184678, + "learning_rate": 7.823604766474239e-06, + "loss": 0.1225, + "step": 1154 + }, + { + "epoch": 0.33, + "grad_norm": 0.5498410531971489, + "learning_rate": 7.819785149254534e-06, + "loss": 0.1383, + "step": 1155 + }, + { + "epoch": 0.33, + "grad_norm": 0.6197056198658327, + "learning_rate": 7.815963117484977e-06, + "loss": 0.0945, + "step": 1156 + }, + { + "epoch": 0.33, + "grad_norm": 0.4924690545451198, + "learning_rate": 7.812138674438332e-06, + "loss": 0.1279, + "step": 1157 + }, + { + "epoch": 0.33, + "grad_norm": 0.6229791928315794, + "learning_rate": 7.808311823389428e-06, + "loss": 0.099, + "step": 1158 + }, + { + "epoch": 0.33, + "grad_norm": 0.7343873627959406, + "learning_rate": 7.80448256761515e-06, + "loss": 0.14, + "step": 1159 + }, + { + "epoch": 0.33, + "grad_norm": 0.40830450132964397, + "learning_rate": 7.80065091039445e-06, + "loss": 0.0654, + "step": 1160 + }, + { + "epoch": 0.33, + "grad_norm": 0.5631745495802228, + "learning_rate": 7.79681685500833e-06, + "loss": 0.0971, + "step": 1161 + }, + { + "epoch": 0.33, + "grad_norm": 0.32978136538134334, + "learning_rate": 7.792980404739849e-06, + "loss": 0.0787, + "step": 1162 + }, + { + "epoch": 0.33, + "grad_norm": 0.39506083561245803, + "learning_rate": 7.789141562874114e-06, + "loss": 0.0561, + "step": 1163 + }, + { + "epoch": 0.33, + "grad_norm": 0.2966677165805511, + "learning_rate": 7.785300332698282e-06, + "loss": 0.0803, + "step": 1164 + }, + { + "epoch": 0.33, + "grad_norm": 0.4966997858510323, + "learning_rate": 7.781456717501557e-06, + "loss": 0.1164, + "step": 1165 + }, + { + "epoch": 0.33, + "grad_norm": 0.43113926216677567, + "learning_rate": 7.77761072057518e-06, + "loss": 0.1117, + "step": 1166 + }, + { + "epoch": 0.33, + "grad_norm": 0.4856468390318449, + "learning_rate": 7.773762345212434e-06, + "loss": 0.0683, + "step": 1167 + }, + { + "epoch": 0.33, + "grad_norm": 0.3968324268099564, + "learning_rate": 7.76991159470864e-06, + "loss": 0.0768, + "step": 1168 + }, + { + "epoch": 0.33, + "grad_norm": 0.5842974160225576, + "learning_rate": 7.766058472361154e-06, + "loss": 0.0605, + "step": 1169 + }, + { + "epoch": 0.33, + "grad_norm": 0.7117350909882223, + "learning_rate": 7.762202981469358e-06, + "loss": 0.1317, + "step": 1170 + }, + { + "epoch": 0.33, + "grad_norm": 0.42990855690076935, + "learning_rate": 7.758345125334665e-06, + "loss": 0.1125, + "step": 1171 + }, + { + "epoch": 0.33, + "grad_norm": 0.6010850452916343, + "learning_rate": 7.754484907260513e-06, + "loss": 0.1592, + "step": 1172 + }, + { + "epoch": 0.34, + "grad_norm": 0.5278953074527011, + "learning_rate": 7.750622330552365e-06, + "loss": 0.137, + "step": 1173 + }, + { + "epoch": 0.34, + "grad_norm": 0.2708849122181835, + "learning_rate": 7.746757398517696e-06, + "loss": 0.0611, + "step": 1174 + }, + { + "epoch": 0.34, + "grad_norm": 0.6684456893451128, + "learning_rate": 7.74289011446601e-06, + "loss": 0.0861, + "step": 1175 + }, + { + "epoch": 0.34, + "grad_norm": 0.3248118299458125, + "learning_rate": 7.739020481708816e-06, + "loss": 0.0752, + "step": 1176 + }, + { + "epoch": 0.34, + "grad_norm": 0.2840766394037429, + "learning_rate": 7.735148503559633e-06, + "loss": 0.0845, + "step": 1177 + }, + { + "epoch": 0.34, + "grad_norm": 0.3729838347067792, + "learning_rate": 7.731274183333995e-06, + "loss": 0.0722, + "step": 1178 + }, + { + "epoch": 0.34, + "grad_norm": 0.3818306176788432, + "learning_rate": 7.727397524349437e-06, + "loss": 0.0638, + "step": 1179 + }, + { + "epoch": 0.34, + "grad_norm": 0.32945247662636773, + "learning_rate": 7.7235185299255e-06, + "loss": 0.0896, + "step": 1180 + }, + { + "epoch": 0.34, + "grad_norm": 0.47214454470745265, + "learning_rate": 7.719637203383718e-06, + "loss": 0.1208, + "step": 1181 + }, + { + "epoch": 0.34, + "grad_norm": 0.313714152344336, + "learning_rate": 7.715753548047632e-06, + "loss": 0.0842, + "step": 1182 + }, + { + "epoch": 0.34, + "grad_norm": 0.6362640918581809, + "learning_rate": 7.711867567242769e-06, + "loss": 0.0806, + "step": 1183 + }, + { + "epoch": 0.34, + "grad_norm": 0.2668214451753407, + "learning_rate": 7.707979264296649e-06, + "loss": 0.0272, + "step": 1184 + }, + { + "epoch": 0.34, + "grad_norm": 0.35934940489629713, + "learning_rate": 7.704088642538782e-06, + "loss": 0.0791, + "step": 1185 + }, + { + "epoch": 0.34, + "grad_norm": 0.30190362982869606, + "learning_rate": 7.700195705300667e-06, + "loss": 0.0248, + "step": 1186 + }, + { + "epoch": 0.34, + "grad_norm": 0.7878973347362918, + "learning_rate": 7.696300455915775e-06, + "loss": 0.1631, + "step": 1187 + }, + { + "epoch": 0.34, + "grad_norm": 0.4697535069263179, + "learning_rate": 7.692402897719568e-06, + "loss": 0.0834, + "step": 1188 + }, + { + "epoch": 0.34, + "grad_norm": 0.4593108243237563, + "learning_rate": 7.68850303404948e-06, + "loss": 0.092, + "step": 1189 + }, + { + "epoch": 0.34, + "grad_norm": 0.5735805679363245, + "learning_rate": 7.68460086824492e-06, + "loss": 0.0949, + "step": 1190 + }, + { + "epoch": 0.34, + "grad_norm": 0.6014239675928695, + "learning_rate": 7.680696403647268e-06, + "loss": 0.1592, + "step": 1191 + }, + { + "epoch": 0.34, + "grad_norm": 0.36052541407201133, + "learning_rate": 7.676789643599871e-06, + "loss": 0.1029, + "step": 1192 + }, + { + "epoch": 0.34, + "grad_norm": 0.4811788369901242, + "learning_rate": 7.672880591448043e-06, + "loss": 0.0768, + "step": 1193 + }, + { + "epoch": 0.34, + "grad_norm": 1.7774597072896245, + "learning_rate": 7.668969250539063e-06, + "loss": 0.0671, + "step": 1194 + }, + { + "epoch": 0.34, + "grad_norm": 0.29186336807886026, + "learning_rate": 7.665055624222166e-06, + "loss": 0.0852, + "step": 1195 + }, + { + "epoch": 0.34, + "grad_norm": 0.637159785983851, + "learning_rate": 7.661139715848547e-06, + "loss": 0.0847, + "step": 1196 + }, + { + "epoch": 0.34, + "grad_norm": 0.7085992935208999, + "learning_rate": 7.657221528771352e-06, + "loss": 0.1229, + "step": 1197 + }, + { + "epoch": 0.34, + "grad_norm": 0.4036885370956982, + "learning_rate": 7.653301066345681e-06, + "loss": 0.0963, + "step": 1198 + }, + { + "epoch": 0.34, + "grad_norm": 0.6550127017484116, + "learning_rate": 7.649378331928581e-06, + "loss": 0.1257, + "step": 1199 + }, + { + "epoch": 0.34, + "grad_norm": 0.45172260752731824, + "learning_rate": 7.645453328879042e-06, + "loss": 0.1076, + "step": 1200 + }, + { + "epoch": 0.34, + "grad_norm": 0.33171439155135335, + "learning_rate": 7.641526060558005e-06, + "loss": 0.0782, + "step": 1201 + }, + { + "epoch": 0.34, + "grad_norm": 0.3023781409312154, + "learning_rate": 7.63759653032834e-06, + "loss": 0.077, + "step": 1202 + }, + { + "epoch": 0.34, + "grad_norm": 1.194190932076639, + "learning_rate": 7.633664741554863e-06, + "loss": 0.0655, + "step": 1203 + }, + { + "epoch": 0.34, + "grad_norm": 0.5366691119548006, + "learning_rate": 7.629730697604314e-06, + "loss": 0.1036, + "step": 1204 + }, + { + "epoch": 0.34, + "grad_norm": 0.4074123782121894, + "learning_rate": 7.625794401845376e-06, + "loss": 0.1139, + "step": 1205 + }, + { + "epoch": 0.34, + "grad_norm": 0.4496057248969518, + "learning_rate": 7.621855857648651e-06, + "loss": 0.116, + "step": 1206 + }, + { + "epoch": 0.34, + "grad_norm": 0.4215659597010978, + "learning_rate": 7.617915068386671e-06, + "loss": 0.1036, + "step": 1207 + }, + { + "epoch": 0.35, + "grad_norm": 0.3945657635817372, + "learning_rate": 7.613972037433886e-06, + "loss": 0.085, + "step": 1208 + }, + { + "epoch": 0.35, + "grad_norm": 0.3006720640134774, + "learning_rate": 7.610026768166674e-06, + "loss": 0.0695, + "step": 1209 + }, + { + "epoch": 0.35, + "grad_norm": 0.5542776181889553, + "learning_rate": 7.606079263963318e-06, + "loss": 0.1502, + "step": 1210 + }, + { + "epoch": 0.35, + "grad_norm": 0.6237299007530562, + "learning_rate": 7.602129528204023e-06, + "loss": 0.0802, + "step": 1211 + }, + { + "epoch": 0.35, + "grad_norm": 0.3318324299310615, + "learning_rate": 7.5981775642709056e-06, + "loss": 0.0765, + "step": 1212 + }, + { + "epoch": 0.35, + "grad_norm": 0.6140720417596165, + "learning_rate": 7.594223375547982e-06, + "loss": 0.1221, + "step": 1213 + }, + { + "epoch": 0.35, + "grad_norm": 0.39126371219320183, + "learning_rate": 7.590266965421182e-06, + "loss": 0.0855, + "step": 1214 + }, + { + "epoch": 0.35, + "grad_norm": 0.25712230229700495, + "learning_rate": 7.5863083372783365e-06, + "loss": 0.0582, + "step": 1215 + }, + { + "epoch": 0.35, + "grad_norm": 0.2968649920922504, + "learning_rate": 7.58234749450917e-06, + "loss": 0.0624, + "step": 1216 + }, + { + "epoch": 0.35, + "grad_norm": 0.5077027081562454, + "learning_rate": 7.5783844405053064e-06, + "loss": 0.0922, + "step": 1217 + }, + { + "epoch": 0.35, + "grad_norm": 1.0093736048263418, + "learning_rate": 7.574419178660269e-06, + "loss": 0.1619, + "step": 1218 + }, + { + "epoch": 0.35, + "grad_norm": 0.5041760420555579, + "learning_rate": 7.57045171236946e-06, + "loss": 0.0968, + "step": 1219 + }, + { + "epoch": 0.35, + "grad_norm": 0.32562538777616085, + "learning_rate": 7.566482045030179e-06, + "loss": 0.1013, + "step": 1220 + }, + { + "epoch": 0.35, + "grad_norm": 0.6546255816024102, + "learning_rate": 7.5625101800416055e-06, + "loss": 0.1335, + "step": 1221 + }, + { + "epoch": 0.35, + "grad_norm": 0.4318407082961103, + "learning_rate": 7.558536120804804e-06, + "loss": 0.1222, + "step": 1222 + }, + { + "epoch": 0.35, + "grad_norm": 0.2712329289615602, + "learning_rate": 7.554559870722714e-06, + "loss": 0.0619, + "step": 1223 + }, + { + "epoch": 0.35, + "grad_norm": 0.38976509691487654, + "learning_rate": 7.550581433200155e-06, + "loss": 0.0983, + "step": 1224 + }, + { + "epoch": 0.35, + "grad_norm": 0.39089260900309103, + "learning_rate": 7.546600811643816e-06, + "loss": 0.0704, + "step": 1225 + }, + { + "epoch": 0.35, + "grad_norm": 0.8971786888240914, + "learning_rate": 7.542618009462258e-06, + "loss": 0.1157, + "step": 1226 + }, + { + "epoch": 0.35, + "grad_norm": 0.4520304215432081, + "learning_rate": 7.538633030065909e-06, + "loss": 0.0899, + "step": 1227 + }, + { + "epoch": 0.35, + "grad_norm": 0.4206029596159851, + "learning_rate": 7.534645876867064e-06, + "loss": 0.1164, + "step": 1228 + }, + { + "epoch": 0.35, + "grad_norm": 0.35222251931872256, + "learning_rate": 7.530656553279873e-06, + "loss": 0.0895, + "step": 1229 + }, + { + "epoch": 0.35, + "grad_norm": 0.5895836229513604, + "learning_rate": 7.526665062720351e-06, + "loss": 0.1304, + "step": 1230 + }, + { + "epoch": 0.35, + "grad_norm": 0.4194301687103519, + "learning_rate": 7.522671408606363e-06, + "loss": 0.0969, + "step": 1231 + }, + { + "epoch": 0.35, + "grad_norm": 0.4186167818878908, + "learning_rate": 7.5186755943576324e-06, + "loss": 0.0957, + "step": 1232 + }, + { + "epoch": 0.35, + "grad_norm": 0.3626156880181813, + "learning_rate": 7.51467762339573e-06, + "loss": 0.0801, + "step": 1233 + }, + { + "epoch": 0.35, + "grad_norm": 0.48752646939344874, + "learning_rate": 7.510677499144068e-06, + "loss": 0.1013, + "step": 1234 + }, + { + "epoch": 0.35, + "grad_norm": 0.4865304274990717, + "learning_rate": 7.5066752250279104e-06, + "loss": 0.0828, + "step": 1235 + }, + { + "epoch": 0.35, + "grad_norm": 0.4398756534676093, + "learning_rate": 7.502670804474359e-06, + "loss": 0.0917, + "step": 1236 + }, + { + "epoch": 0.35, + "grad_norm": 0.4890809214016786, + "learning_rate": 7.498664240912354e-06, + "loss": 0.0881, + "step": 1237 + }, + { + "epoch": 0.35, + "grad_norm": 0.48589244001927956, + "learning_rate": 7.494655537772667e-06, + "loss": 0.0783, + "step": 1238 + }, + { + "epoch": 0.35, + "grad_norm": 0.4906795310797478, + "learning_rate": 7.490644698487909e-06, + "loss": 0.0824, + "step": 1239 + }, + { + "epoch": 0.35, + "grad_norm": 0.37049834734013626, + "learning_rate": 7.486631726492511e-06, + "loss": 0.0985, + "step": 1240 + }, + { + "epoch": 0.35, + "grad_norm": 0.3350345268942506, + "learning_rate": 7.482616625222741e-06, + "loss": 0.0652, + "step": 1241 + }, + { + "epoch": 0.35, + "grad_norm": 0.43138496833814577, + "learning_rate": 7.478599398116678e-06, + "loss": 0.0964, + "step": 1242 + }, + { + "epoch": 0.36, + "grad_norm": 0.4699342535775678, + "learning_rate": 7.474580048614233e-06, + "loss": 0.0704, + "step": 1243 + }, + { + "epoch": 0.36, + "grad_norm": 0.6043115294856498, + "learning_rate": 7.470558580157126e-06, + "loss": 0.1101, + "step": 1244 + }, + { + "epoch": 0.36, + "grad_norm": 0.3253143486775578, + "learning_rate": 7.466534996188897e-06, + "loss": 0.0824, + "step": 1245 + }, + { + "epoch": 0.36, + "grad_norm": 0.7001258558864883, + "learning_rate": 7.462509300154892e-06, + "loss": 0.0983, + "step": 1246 + }, + { + "epoch": 0.36, + "grad_norm": 0.49755871463826296, + "learning_rate": 7.45848149550227e-06, + "loss": 0.1227, + "step": 1247 + }, + { + "epoch": 0.36, + "grad_norm": 0.5260467794728861, + "learning_rate": 7.454451585679996e-06, + "loss": 0.081, + "step": 1248 + }, + { + "epoch": 0.36, + "grad_norm": 0.5080804721923634, + "learning_rate": 7.450419574138833e-06, + "loss": 0.1053, + "step": 1249 + }, + { + "epoch": 0.36, + "grad_norm": 0.3791342728629578, + "learning_rate": 7.446385464331349e-06, + "loss": 0.107, + "step": 1250 + }, + { + "epoch": 0.36, + "grad_norm": 0.5399108324070111, + "learning_rate": 7.442349259711904e-06, + "loss": 0.1181, + "step": 1251 + }, + { + "epoch": 0.36, + "grad_norm": 0.6639879999689687, + "learning_rate": 7.438310963736655e-06, + "loss": 0.1495, + "step": 1252 + }, + { + "epoch": 0.36, + "grad_norm": 1.1723017539269172, + "learning_rate": 7.434270579863549e-06, + "loss": 0.0658, + "step": 1253 + }, + { + "epoch": 0.36, + "grad_norm": 0.3034849286931736, + "learning_rate": 7.43022811155232e-06, + "loss": 0.0832, + "step": 1254 + }, + { + "epoch": 0.36, + "grad_norm": 0.36893262943136473, + "learning_rate": 7.426183562264487e-06, + "loss": 0.0971, + "step": 1255 + }, + { + "epoch": 0.36, + "grad_norm": 0.5974855328671015, + "learning_rate": 7.422136935463354e-06, + "loss": 0.1166, + "step": 1256 + }, + { + "epoch": 0.36, + "grad_norm": 0.3229655949631019, + "learning_rate": 7.418088234613997e-06, + "loss": 0.0785, + "step": 1257 + }, + { + "epoch": 0.36, + "grad_norm": 0.4571919749930792, + "learning_rate": 7.414037463183276e-06, + "loss": 0.0725, + "step": 1258 + }, + { + "epoch": 0.36, + "grad_norm": 0.4660452723925712, + "learning_rate": 7.4099846246398185e-06, + "loss": 0.0995, + "step": 1259 + }, + { + "epoch": 0.36, + "grad_norm": 0.7272549656849545, + "learning_rate": 7.405929722454026e-06, + "loss": 0.0774, + "step": 1260 + }, + { + "epoch": 0.36, + "grad_norm": 0.5476897323036802, + "learning_rate": 7.401872760098063e-06, + "loss": 0.1126, + "step": 1261 + }, + { + "epoch": 0.36, + "grad_norm": 0.3810391769235816, + "learning_rate": 7.3978137410458595e-06, + "loss": 0.11, + "step": 1262 + }, + { + "epoch": 0.36, + "grad_norm": 0.2522541561241933, + "learning_rate": 7.39375266877311e-06, + "loss": 0.0678, + "step": 1263 + }, + { + "epoch": 0.36, + "grad_norm": 0.6174401914519404, + "learning_rate": 7.3896895467572616e-06, + "loss": 0.1361, + "step": 1264 + }, + { + "epoch": 0.36, + "grad_norm": 0.5310372812239782, + "learning_rate": 7.385624378477521e-06, + "loss": 0.1132, + "step": 1265 + }, + { + "epoch": 0.36, + "grad_norm": 0.5532722658518993, + "learning_rate": 7.381557167414844e-06, + "loss": 0.1235, + "step": 1266 + }, + { + "epoch": 0.36, + "grad_norm": 0.5359325875315766, + "learning_rate": 7.3774879170519386e-06, + "loss": 0.1386, + "step": 1267 + }, + { + "epoch": 0.36, + "grad_norm": 0.7013688142544489, + "learning_rate": 7.373416630873255e-06, + "loss": 0.1025, + "step": 1268 + }, + { + "epoch": 0.36, + "grad_norm": 0.21878971083627835, + "learning_rate": 7.369343312364994e-06, + "loss": 0.0706, + "step": 1269 + }, + { + "epoch": 0.36, + "grad_norm": 0.5444098139359753, + "learning_rate": 7.365267965015086e-06, + "loss": 0.1428, + "step": 1270 + }, + { + "epoch": 0.36, + "grad_norm": 0.40046008322119825, + "learning_rate": 7.361190592313209e-06, + "loss": 0.0911, + "step": 1271 + }, + { + "epoch": 0.36, + "grad_norm": 0.4753066450054673, + "learning_rate": 7.357111197750768e-06, + "loss": 0.1104, + "step": 1272 + }, + { + "epoch": 0.36, + "grad_norm": 0.34302904252531163, + "learning_rate": 7.353029784820902e-06, + "loss": 0.0894, + "step": 1273 + }, + { + "epoch": 0.36, + "grad_norm": 0.5175561729935336, + "learning_rate": 7.348946357018479e-06, + "loss": 0.0685, + "step": 1274 + }, + { + "epoch": 0.36, + "grad_norm": 0.33407349978091094, + "learning_rate": 7.344860917840092e-06, + "loss": 0.0769, + "step": 1275 + }, + { + "epoch": 0.36, + "grad_norm": 0.537296939644874, + "learning_rate": 7.3407734707840575e-06, + "loss": 0.1415, + "step": 1276 + }, + { + "epoch": 0.36, + "grad_norm": 0.4714955294845538, + "learning_rate": 7.336684019350405e-06, + "loss": 0.0602, + "step": 1277 + }, + { + "epoch": 0.37, + "grad_norm": 0.5231687005390631, + "learning_rate": 7.332592567040889e-06, + "loss": 0.1012, + "step": 1278 + }, + { + "epoch": 0.37, + "grad_norm": 0.6451651770169565, + "learning_rate": 7.328499117358973e-06, + "loss": 0.1086, + "step": 1279 + }, + { + "epoch": 0.37, + "grad_norm": 0.4396753615304459, + "learning_rate": 7.324403673809831e-06, + "loss": 0.0974, + "step": 1280 + }, + { + "epoch": 0.37, + "grad_norm": 0.3351717334860073, + "learning_rate": 7.320306239900343e-06, + "loss": 0.0444, + "step": 1281 + }, + { + "epoch": 0.37, + "grad_norm": 0.6057477710603322, + "learning_rate": 7.316206819139098e-06, + "loss": 0.0557, + "step": 1282 + }, + { + "epoch": 0.37, + "grad_norm": 0.4853121311355041, + "learning_rate": 7.312105415036379e-06, + "loss": 0.1157, + "step": 1283 + }, + { + "epoch": 0.37, + "grad_norm": 0.3956609692436241, + "learning_rate": 7.308002031104177e-06, + "loss": 0.0959, + "step": 1284 + }, + { + "epoch": 0.37, + "grad_norm": 0.36579601627698144, + "learning_rate": 7.303896670856168e-06, + "loss": 0.0818, + "step": 1285 + }, + { + "epoch": 0.37, + "grad_norm": 0.5950403488333442, + "learning_rate": 7.299789337807727e-06, + "loss": 0.0962, + "step": 1286 + }, + { + "epoch": 0.37, + "grad_norm": 0.7302893406165183, + "learning_rate": 7.2956800354759165e-06, + "loss": 0.1507, + "step": 1287 + }, + { + "epoch": 0.37, + "grad_norm": 0.35248968306116857, + "learning_rate": 7.291568767379484e-06, + "loss": 0.0677, + "step": 1288 + }, + { + "epoch": 0.37, + "grad_norm": 0.35954962807442997, + "learning_rate": 7.287455537038865e-06, + "loss": 0.0789, + "step": 1289 + }, + { + "epoch": 0.37, + "grad_norm": 0.2545344927730057, + "learning_rate": 7.283340347976167e-06, + "loss": 0.0438, + "step": 1290 + }, + { + "epoch": 0.37, + "grad_norm": 0.6622217651425762, + "learning_rate": 7.279223203715183e-06, + "loss": 0.1154, + "step": 1291 + }, + { + "epoch": 0.37, + "grad_norm": 0.6497852254352842, + "learning_rate": 7.275104107781374e-06, + "loss": 0.1187, + "step": 1292 + }, + { + "epoch": 0.37, + "grad_norm": 0.2948404321113314, + "learning_rate": 7.270983063701878e-06, + "loss": 0.0834, + "step": 1293 + }, + { + "epoch": 0.37, + "grad_norm": 0.323389471633835, + "learning_rate": 7.2668600750054955e-06, + "loss": 0.0593, + "step": 1294 + }, + { + "epoch": 0.37, + "grad_norm": 0.6256941837415155, + "learning_rate": 7.262735145222696e-06, + "loss": 0.1153, + "step": 1295 + }, + { + "epoch": 0.37, + "grad_norm": 0.4091099856115394, + "learning_rate": 7.258608277885608e-06, + "loss": 0.1209, + "step": 1296 + }, + { + "epoch": 0.37, + "grad_norm": 0.4989328325430237, + "learning_rate": 7.254479476528023e-06, + "loss": 0.1174, + "step": 1297 + }, + { + "epoch": 0.37, + "grad_norm": 0.437548733354485, + "learning_rate": 7.250348744685385e-06, + "loss": 0.0895, + "step": 1298 + }, + { + "epoch": 0.37, + "grad_norm": 0.4457904798020956, + "learning_rate": 7.246216085894793e-06, + "loss": 0.0738, + "step": 1299 + }, + { + "epoch": 0.37, + "grad_norm": 0.4490462670693121, + "learning_rate": 7.242081503694996e-06, + "loss": 0.1226, + "step": 1300 + }, + { + "epoch": 0.37, + "grad_norm": 0.394034010362623, + "learning_rate": 7.237945001626388e-06, + "loss": 0.0882, + "step": 1301 + }, + { + "epoch": 0.37, + "grad_norm": 0.5396818419239576, + "learning_rate": 7.233806583231012e-06, + "loss": 0.0992, + "step": 1302 + }, + { + "epoch": 0.37, + "grad_norm": 0.42430139774657966, + "learning_rate": 7.229666252052545e-06, + "loss": 0.0947, + "step": 1303 + }, + { + "epoch": 0.37, + "grad_norm": 0.6830348155509963, + "learning_rate": 7.225524011636308e-06, + "loss": 0.144, + "step": 1304 + }, + { + "epoch": 0.37, + "grad_norm": 0.4371075646071382, + "learning_rate": 7.221379865529251e-06, + "loss": 0.0821, + "step": 1305 + }, + { + "epoch": 0.37, + "grad_norm": 0.453529637864777, + "learning_rate": 7.2172338172799625e-06, + "loss": 0.0848, + "step": 1306 + }, + { + "epoch": 0.37, + "grad_norm": 0.5704528490914308, + "learning_rate": 7.213085870438653e-06, + "loss": 0.0566, + "step": 1307 + }, + { + "epoch": 0.37, + "grad_norm": 0.6960155872822488, + "learning_rate": 7.208936028557165e-06, + "loss": 0.1474, + "step": 1308 + }, + { + "epoch": 0.37, + "grad_norm": 0.5817205230296579, + "learning_rate": 7.204784295188959e-06, + "loss": 0.1037, + "step": 1309 + }, + { + "epoch": 0.37, + "grad_norm": 0.40251750204790904, + "learning_rate": 7.200630673889118e-06, + "loss": 0.0951, + "step": 1310 + }, + { + "epoch": 0.37, + "grad_norm": 0.44423514359563776, + "learning_rate": 7.1964751682143385e-06, + "loss": 0.1336, + "step": 1311 + }, + { + "epoch": 0.37, + "grad_norm": 0.31689798590143897, + "learning_rate": 7.192317781722935e-06, + "loss": 0.0477, + "step": 1312 + }, + { + "epoch": 0.38, + "grad_norm": 0.4498326110158957, + "learning_rate": 7.18815851797483e-06, + "loss": 0.0755, + "step": 1313 + }, + { + "epoch": 0.38, + "grad_norm": 0.4214579949532243, + "learning_rate": 7.183997380531551e-06, + "loss": 0.0916, + "step": 1314 + }, + { + "epoch": 0.38, + "grad_norm": 0.3629707871148528, + "learning_rate": 7.179834372956236e-06, + "loss": 0.0457, + "step": 1315 + }, + { + "epoch": 0.38, + "grad_norm": 0.23890986120364538, + "learning_rate": 7.1756694988136165e-06, + "loss": 0.0317, + "step": 1316 + }, + { + "epoch": 0.38, + "grad_norm": 0.3178539575728721, + "learning_rate": 7.171502761670032e-06, + "loss": 0.0839, + "step": 1317 + }, + { + "epoch": 0.38, + "grad_norm": 0.36087840792972364, + "learning_rate": 7.167334165093407e-06, + "loss": 0.0576, + "step": 1318 + }, + { + "epoch": 0.38, + "grad_norm": 0.7533321546317311, + "learning_rate": 7.163163712653267e-06, + "loss": 0.072, + "step": 1319 + }, + { + "epoch": 0.38, + "grad_norm": 0.8250500485970101, + "learning_rate": 7.158991407920721e-06, + "loss": 0.0744, + "step": 1320 + }, + { + "epoch": 0.38, + "grad_norm": 0.22566404858687902, + "learning_rate": 7.154817254468467e-06, + "loss": 0.0532, + "step": 1321 + }, + { + "epoch": 0.38, + "grad_norm": 0.35811467808863606, + "learning_rate": 7.150641255870783e-06, + "loss": 0.089, + "step": 1322 + }, + { + "epoch": 0.38, + "grad_norm": 0.4505878258632678, + "learning_rate": 7.14646341570353e-06, + "loss": 0.0848, + "step": 1323 + }, + { + "epoch": 0.38, + "grad_norm": 0.34669718798707816, + "learning_rate": 7.142283737544146e-06, + "loss": 0.0839, + "step": 1324 + }, + { + "epoch": 0.38, + "grad_norm": 0.36144477138663855, + "learning_rate": 7.13810222497164e-06, + "loss": 0.0737, + "step": 1325 + }, + { + "epoch": 0.38, + "grad_norm": 0.34491353044678136, + "learning_rate": 7.133918881566594e-06, + "loss": 0.0974, + "step": 1326 + }, + { + "epoch": 0.38, + "grad_norm": 0.4263799438922573, + "learning_rate": 7.129733710911159e-06, + "loss": 0.0922, + "step": 1327 + }, + { + "epoch": 0.38, + "grad_norm": 0.2239485715232622, + "learning_rate": 7.125546716589046e-06, + "loss": 0.0212, + "step": 1328 + }, + { + "epoch": 0.38, + "grad_norm": 0.4832278651309879, + "learning_rate": 7.1213579021855325e-06, + "loss": 0.099, + "step": 1329 + }, + { + "epoch": 0.38, + "grad_norm": 0.5101730154765555, + "learning_rate": 7.117167271287453e-06, + "loss": 0.1017, + "step": 1330 + }, + { + "epoch": 0.38, + "grad_norm": 0.40492803572216257, + "learning_rate": 7.112974827483193e-06, + "loss": 0.1275, + "step": 1331 + }, + { + "epoch": 0.38, + "grad_norm": 0.43584343014282445, + "learning_rate": 7.108780574362699e-06, + "loss": 0.1082, + "step": 1332 + }, + { + "epoch": 0.38, + "grad_norm": 0.43533816162835104, + "learning_rate": 7.104584515517459e-06, + "loss": 0.0986, + "step": 1333 + }, + { + "epoch": 0.38, + "grad_norm": 0.2963252094256265, + "learning_rate": 7.100386654540512e-06, + "loss": 0.0628, + "step": 1334 + }, + { + "epoch": 0.38, + "grad_norm": 0.4259754685969777, + "learning_rate": 7.096186995026439e-06, + "loss": 0.0915, + "step": 1335 + }, + { + "epoch": 0.38, + "grad_norm": 0.2668788584851861, + "learning_rate": 7.091985540571358e-06, + "loss": 0.0635, + "step": 1336 + }, + { + "epoch": 0.38, + "grad_norm": 0.47725951760375, + "learning_rate": 7.0877822947729265e-06, + "loss": 0.1054, + "step": 1337 + }, + { + "epoch": 0.38, + "grad_norm": 0.3467052183988419, + "learning_rate": 7.083577261230341e-06, + "loss": 0.0859, + "step": 1338 + }, + { + "epoch": 0.38, + "grad_norm": 0.577650273671718, + "learning_rate": 7.079370443544318e-06, + "loss": 0.1355, + "step": 1339 + }, + { + "epoch": 0.38, + "grad_norm": 0.45582154635810423, + "learning_rate": 7.07516184531711e-06, + "loss": 0.101, + "step": 1340 + }, + { + "epoch": 0.38, + "grad_norm": 0.3014023955248303, + "learning_rate": 7.07095147015249e-06, + "loss": 0.0802, + "step": 1341 + }, + { + "epoch": 0.38, + "grad_norm": 0.5655097720036469, + "learning_rate": 7.066739321655757e-06, + "loss": 0.0684, + "step": 1342 + }, + { + "epoch": 0.38, + "grad_norm": 0.41287951668563994, + "learning_rate": 7.062525403433723e-06, + "loss": 0.0954, + "step": 1343 + }, + { + "epoch": 0.38, + "grad_norm": 0.28403130016009487, + "learning_rate": 7.05830971909472e-06, + "loss": 0.084, + "step": 1344 + }, + { + "epoch": 0.38, + "grad_norm": 0.30073241076499924, + "learning_rate": 7.05409227224859e-06, + "loss": 0.083, + "step": 1345 + }, + { + "epoch": 0.38, + "grad_norm": 0.3742776309578899, + "learning_rate": 7.049873066506684e-06, + "loss": 0.0885, + "step": 1346 + }, + { + "epoch": 0.38, + "grad_norm": 0.444158701035714, + "learning_rate": 7.04565210548186e-06, + "loss": 0.1088, + "step": 1347 + }, + { + "epoch": 0.39, + "grad_norm": 0.6000393018205739, + "learning_rate": 7.041429392788477e-06, + "loss": 0.1029, + "step": 1348 + }, + { + "epoch": 0.39, + "grad_norm": 0.4965731557138154, + "learning_rate": 7.0372049320424e-06, + "loss": 0.0789, + "step": 1349 + }, + { + "epoch": 0.39, + "grad_norm": 0.6645267790695691, + "learning_rate": 7.032978726860981e-06, + "loss": 0.1253, + "step": 1350 + }, + { + "epoch": 0.39, + "grad_norm": 0.41647744727531605, + "learning_rate": 7.028750780863078e-06, + "loss": 0.0782, + "step": 1351 + }, + { + "epoch": 0.39, + "grad_norm": 0.49520716867368675, + "learning_rate": 7.024521097669026e-06, + "loss": 0.1119, + "step": 1352 + }, + { + "epoch": 0.39, + "grad_norm": 0.885596141103793, + "learning_rate": 7.020289680900658e-06, + "loss": 0.0834, + "step": 1353 + }, + { + "epoch": 0.39, + "grad_norm": 0.400198089989043, + "learning_rate": 7.0160565341812885e-06, + "loss": 0.0735, + "step": 1354 + }, + { + "epoch": 0.39, + "grad_norm": 0.465346235006083, + "learning_rate": 7.0118216611357125e-06, + "loss": 0.0857, + "step": 1355 + }, + { + "epoch": 0.39, + "grad_norm": 0.6727623425168257, + "learning_rate": 7.007585065390203e-06, + "loss": 0.1098, + "step": 1356 + }, + { + "epoch": 0.39, + "grad_norm": 0.4291856686417795, + "learning_rate": 7.003346750572506e-06, + "loss": 0.0712, + "step": 1357 + }, + { + "epoch": 0.39, + "grad_norm": 0.4471592586419728, + "learning_rate": 6.999106720311846e-06, + "loss": 0.0685, + "step": 1358 + }, + { + "epoch": 0.39, + "grad_norm": 0.2479345132477074, + "learning_rate": 6.99486497823891e-06, + "loss": 0.0783, + "step": 1359 + }, + { + "epoch": 0.39, + "grad_norm": 0.5948936709525752, + "learning_rate": 6.990621527985856e-06, + "loss": 0.0794, + "step": 1360 + }, + { + "epoch": 0.39, + "grad_norm": 0.5471409914024203, + "learning_rate": 6.9863763731862984e-06, + "loss": 0.1365, + "step": 1361 + }, + { + "epoch": 0.39, + "grad_norm": 0.5551858472302786, + "learning_rate": 6.9821295174753175e-06, + "loss": 0.0861, + "step": 1362 + }, + { + "epoch": 0.39, + "grad_norm": 0.5523454522986879, + "learning_rate": 6.9778809644894475e-06, + "loss": 0.1092, + "step": 1363 + }, + { + "epoch": 0.39, + "grad_norm": 0.2730055038951132, + "learning_rate": 6.973630717866675e-06, + "loss": 0.0525, + "step": 1364 + }, + { + "epoch": 0.39, + "grad_norm": 0.5811598336796037, + "learning_rate": 6.969378781246436e-06, + "loss": 0.0605, + "step": 1365 + }, + { + "epoch": 0.39, + "grad_norm": 0.3005758151587515, + "learning_rate": 6.965125158269619e-06, + "loss": 0.0455, + "step": 1366 + }, + { + "epoch": 0.39, + "grad_norm": 0.6381799184781491, + "learning_rate": 6.960869852578549e-06, + "loss": 0.1451, + "step": 1367 + }, + { + "epoch": 0.39, + "grad_norm": 0.5810543533037971, + "learning_rate": 6.956612867816999e-06, + "loss": 0.1039, + "step": 1368 + }, + { + "epoch": 0.39, + "grad_norm": 0.3466037402685036, + "learning_rate": 6.952354207630174e-06, + "loss": 0.0969, + "step": 1369 + }, + { + "epoch": 0.39, + "grad_norm": 0.4397121792763084, + "learning_rate": 6.948093875664719e-06, + "loss": 0.0933, + "step": 1370 + }, + { + "epoch": 0.39, + "grad_norm": 0.9929070614731769, + "learning_rate": 6.943831875568703e-06, + "loss": 0.1126, + "step": 1371 + }, + { + "epoch": 0.39, + "grad_norm": 0.35228268609643915, + "learning_rate": 6.939568210991633e-06, + "loss": 0.0849, + "step": 1372 + }, + { + "epoch": 0.39, + "grad_norm": 0.31320050463586374, + "learning_rate": 6.935302885584434e-06, + "loss": 0.0526, + "step": 1373 + }, + { + "epoch": 0.39, + "grad_norm": 0.31814123779422676, + "learning_rate": 6.931035902999454e-06, + "loss": 0.05, + "step": 1374 + }, + { + "epoch": 0.39, + "grad_norm": 0.5358566718177803, + "learning_rate": 6.926767266890466e-06, + "loss": 0.1168, + "step": 1375 + }, + { + "epoch": 0.39, + "grad_norm": 0.3585165835216518, + "learning_rate": 6.92249698091265e-06, + "loss": 0.117, + "step": 1376 + }, + { + "epoch": 0.39, + "grad_norm": 0.24361369021060725, + "learning_rate": 6.918225048722604e-06, + "loss": 0.0565, + "step": 1377 + }, + { + "epoch": 0.39, + "grad_norm": 0.5300060580289199, + "learning_rate": 6.9139514739783364e-06, + "loss": 0.1047, + "step": 1378 + }, + { + "epoch": 0.39, + "grad_norm": 0.3235397107082942, + "learning_rate": 6.9096762603392595e-06, + "loss": 0.0678, + "step": 1379 + }, + { + "epoch": 0.39, + "grad_norm": 0.43862995403368876, + "learning_rate": 6.90539941146619e-06, + "loss": 0.1168, + "step": 1380 + }, + { + "epoch": 0.39, + "grad_norm": 1.19116774365664, + "learning_rate": 6.901120931021345e-06, + "loss": 0.0935, + "step": 1381 + }, + { + "epoch": 0.39, + "grad_norm": 0.4868001872212555, + "learning_rate": 6.896840822668337e-06, + "loss": 0.0654, + "step": 1382 + }, + { + "epoch": 0.4, + "grad_norm": 0.3875830997642732, + "learning_rate": 6.892559090072177e-06, + "loss": 0.1, + "step": 1383 + }, + { + "epoch": 0.4, + "grad_norm": 0.6759822116009601, + "learning_rate": 6.888275736899262e-06, + "loss": 0.1395, + "step": 1384 + }, + { + "epoch": 0.4, + "grad_norm": 0.48997515323127844, + "learning_rate": 6.883990766817378e-06, + "loss": 0.0786, + "step": 1385 + }, + { + "epoch": 0.4, + "grad_norm": 0.38317298569150376, + "learning_rate": 6.8797041834956955e-06, + "loss": 0.081, + "step": 1386 + }, + { + "epoch": 0.4, + "grad_norm": 0.5123538753261575, + "learning_rate": 6.8754159906047706e-06, + "loss": 0.0973, + "step": 1387 + }, + { + "epoch": 0.4, + "grad_norm": 0.4060250824613997, + "learning_rate": 6.871126191816529e-06, + "loss": 0.1088, + "step": 1388 + }, + { + "epoch": 0.4, + "grad_norm": 0.6699260467691455, + "learning_rate": 6.866834790804278e-06, + "loss": 0.1409, + "step": 1389 + }, + { + "epoch": 0.4, + "grad_norm": 0.39217404810140766, + "learning_rate": 6.862541791242698e-06, + "loss": 0.0903, + "step": 1390 + }, + { + "epoch": 0.4, + "grad_norm": 0.3319049584807593, + "learning_rate": 6.858247196807833e-06, + "loss": 0.0907, + "step": 1391 + }, + { + "epoch": 0.4, + "grad_norm": 0.48402241987122746, + "learning_rate": 6.853951011177095e-06, + "loss": 0.0993, + "step": 1392 + }, + { + "epoch": 0.4, + "grad_norm": 0.43227601551456063, + "learning_rate": 6.849653238029261e-06, + "loss": 0.1217, + "step": 1393 + }, + { + "epoch": 0.4, + "grad_norm": 0.3561639464417621, + "learning_rate": 6.845353881044463e-06, + "loss": 0.0921, + "step": 1394 + }, + { + "epoch": 0.4, + "grad_norm": 0.5367946800369504, + "learning_rate": 6.84105294390419e-06, + "loss": 0.1309, + "step": 1395 + }, + { + "epoch": 0.4, + "grad_norm": 0.3147138255535657, + "learning_rate": 6.83675043029129e-06, + "loss": 0.0813, + "step": 1396 + }, + { + "epoch": 0.4, + "grad_norm": 0.42929135778736255, + "learning_rate": 6.832446343889952e-06, + "loss": 0.1122, + "step": 1397 + }, + { + "epoch": 0.4, + "grad_norm": 0.2808487389242168, + "learning_rate": 6.8281406883857194e-06, + "loss": 0.0924, + "step": 1398 + }, + { + "epoch": 0.4, + "grad_norm": 0.3213911099478094, + "learning_rate": 6.823833467465473e-06, + "loss": 0.0806, + "step": 1399 + }, + { + "epoch": 0.4, + "grad_norm": 0.3767345159241612, + "learning_rate": 6.819524684817439e-06, + "loss": 0.0874, + "step": 1400 + }, + { + "epoch": 0.4, + "grad_norm": 0.321662199553273, + "learning_rate": 6.8152143441311765e-06, + "loss": 0.0403, + "step": 1401 + }, + { + "epoch": 0.4, + "grad_norm": 0.3167712405692955, + "learning_rate": 6.810902449097584e-06, + "loss": 0.1029, + "step": 1402 + }, + { + "epoch": 0.4, + "grad_norm": 0.3093713780061254, + "learning_rate": 6.806589003408886e-06, + "loss": 0.0899, + "step": 1403 + }, + { + "epoch": 0.4, + "grad_norm": 0.7724714628357934, + "learning_rate": 6.802274010758637e-06, + "loss": 0.1413, + "step": 1404 + }, + { + "epoch": 0.4, + "grad_norm": 0.3961863310948659, + "learning_rate": 6.797957474841717e-06, + "loss": 0.0574, + "step": 1405 + }, + { + "epoch": 0.4, + "grad_norm": 0.3871784418101659, + "learning_rate": 6.793639399354324e-06, + "loss": 0.1117, + "step": 1406 + }, + { + "epoch": 0.4, + "grad_norm": 0.5945824916656249, + "learning_rate": 6.78931978799398e-06, + "loss": 0.0919, + "step": 1407 + }, + { + "epoch": 0.4, + "grad_norm": 0.30112716546299095, + "learning_rate": 6.784998644459517e-06, + "loss": 0.0491, + "step": 1408 + }, + { + "epoch": 0.4, + "grad_norm": 0.33613988205627376, + "learning_rate": 6.780675972451083e-06, + "loss": 0.0663, + "step": 1409 + }, + { + "epoch": 0.4, + "grad_norm": 0.3645211692038736, + "learning_rate": 6.776351775670129e-06, + "loss": 0.0865, + "step": 1410 + }, + { + "epoch": 0.4, + "grad_norm": 0.253234124055534, + "learning_rate": 6.7720260578194185e-06, + "loss": 0.0514, + "step": 1411 + }, + { + "epoch": 0.4, + "grad_norm": 0.3508724331773692, + "learning_rate": 6.767698822603013e-06, + "loss": 0.0804, + "step": 1412 + }, + { + "epoch": 0.4, + "grad_norm": 0.26034274444884775, + "learning_rate": 6.763370073726276e-06, + "loss": 0.0605, + "step": 1413 + }, + { + "epoch": 0.4, + "grad_norm": 0.6546642097245746, + "learning_rate": 6.7590398148958625e-06, + "loss": 0.1923, + "step": 1414 + }, + { + "epoch": 0.4, + "grad_norm": 0.40170447802361675, + "learning_rate": 6.754708049819728e-06, + "loss": 0.0923, + "step": 1415 + }, + { + "epoch": 0.4, + "grad_norm": 0.4235066268178241, + "learning_rate": 6.750374782207112e-06, + "loss": 0.0516, + "step": 1416 + }, + { + "epoch": 0.4, + "grad_norm": 0.3383309060193509, + "learning_rate": 6.74604001576854e-06, + "loss": 0.0898, + "step": 1417 + }, + { + "epoch": 0.41, + "grad_norm": 0.4183934545627158, + "learning_rate": 6.741703754215825e-06, + "loss": 0.1187, + "step": 1418 + }, + { + "epoch": 0.41, + "grad_norm": 0.29694276385261686, + "learning_rate": 6.7373660012620575e-06, + "loss": 0.0801, + "step": 1419 + }, + { + "epoch": 0.41, + "grad_norm": 0.35023179019387557, + "learning_rate": 6.733026760621607e-06, + "loss": 0.079, + "step": 1420 + }, + { + "epoch": 0.41, + "grad_norm": 0.4280954679718074, + "learning_rate": 6.728686036010115e-06, + "loss": 0.0545, + "step": 1421 + }, + { + "epoch": 0.41, + "grad_norm": 0.36828927733308037, + "learning_rate": 6.724343831144494e-06, + "loss": 0.1062, + "step": 1422 + }, + { + "epoch": 0.41, + "grad_norm": 0.275826926343428, + "learning_rate": 6.720000149742925e-06, + "loss": 0.0613, + "step": 1423 + }, + { + "epoch": 0.41, + "grad_norm": 0.4212043430155424, + "learning_rate": 6.715654995524853e-06, + "loss": 0.106, + "step": 1424 + }, + { + "epoch": 0.41, + "grad_norm": 0.2702659912031575, + "learning_rate": 6.711308372210983e-06, + "loss": 0.0876, + "step": 1425 + }, + { + "epoch": 0.41, + "grad_norm": 0.4948367930619922, + "learning_rate": 6.706960283523282e-06, + "loss": 0.0878, + "step": 1426 + }, + { + "epoch": 0.41, + "grad_norm": 0.5187744933374802, + "learning_rate": 6.702610733184965e-06, + "loss": 0.106, + "step": 1427 + }, + { + "epoch": 0.41, + "grad_norm": 0.48915435202940594, + "learning_rate": 6.698259724920503e-06, + "loss": 0.1079, + "step": 1428 + }, + { + "epoch": 0.41, + "grad_norm": 0.3784939134253547, + "learning_rate": 6.6939072624556155e-06, + "loss": 0.0674, + "step": 1429 + }, + { + "epoch": 0.41, + "grad_norm": 0.28085894067989636, + "learning_rate": 6.689553349517268e-06, + "loss": 0.0776, + "step": 1430 + }, + { + "epoch": 0.41, + "grad_norm": 0.89551810622946, + "learning_rate": 6.685197989833665e-06, + "loss": 0.1587, + "step": 1431 + }, + { + "epoch": 0.41, + "grad_norm": 0.41601809001145373, + "learning_rate": 6.68084118713425e-06, + "loss": 0.0905, + "step": 1432 + }, + { + "epoch": 0.41, + "grad_norm": 0.7815321799772386, + "learning_rate": 6.676482945149705e-06, + "loss": 0.1905, + "step": 1433 + }, + { + "epoch": 0.41, + "grad_norm": 0.7439023887847122, + "learning_rate": 6.672123267611942e-06, + "loss": 0.1396, + "step": 1434 + }, + { + "epoch": 0.41, + "grad_norm": 0.28561834571202693, + "learning_rate": 6.667762158254104e-06, + "loss": 0.0548, + "step": 1435 + }, + { + "epoch": 0.41, + "grad_norm": 0.37121560968314343, + "learning_rate": 6.663399620810559e-06, + "loss": 0.0713, + "step": 1436 + }, + { + "epoch": 0.41, + "grad_norm": 0.5599469503795745, + "learning_rate": 6.659035659016898e-06, + "loss": 0.1428, + "step": 1437 + }, + { + "epoch": 0.41, + "grad_norm": 0.4501706729423448, + "learning_rate": 6.654670276609932e-06, + "loss": 0.1264, + "step": 1438 + }, + { + "epoch": 0.41, + "grad_norm": 0.5229707122928983, + "learning_rate": 6.650303477327686e-06, + "loss": 0.1049, + "step": 1439 + }, + { + "epoch": 0.41, + "grad_norm": 0.859696260542617, + "learning_rate": 6.645935264909404e-06, + "loss": 0.118, + "step": 1440 + }, + { + "epoch": 0.41, + "grad_norm": 0.5737426947982244, + "learning_rate": 6.641565643095534e-06, + "loss": 0.1233, + "step": 1441 + }, + { + "epoch": 0.41, + "grad_norm": 0.5524141557794943, + "learning_rate": 6.637194615627733e-06, + "loss": 0.0829, + "step": 1442 + }, + { + "epoch": 0.41, + "grad_norm": 0.2981411094203284, + "learning_rate": 6.632822186248865e-06, + "loss": 0.0589, + "step": 1443 + }, + { + "epoch": 0.41, + "grad_norm": 0.40743741308632836, + "learning_rate": 6.628448358702988e-06, + "loss": 0.098, + "step": 1444 + }, + { + "epoch": 0.41, + "grad_norm": 0.4791202138734424, + "learning_rate": 6.6240731367353624e-06, + "loss": 0.1442, + "step": 1445 + }, + { + "epoch": 0.41, + "grad_norm": 0.7185057917214397, + "learning_rate": 6.619696524092439e-06, + "loss": 0.1549, + "step": 1446 + }, + { + "epoch": 0.41, + "grad_norm": 0.34993534334995197, + "learning_rate": 6.6153185245218645e-06, + "loss": 0.0995, + "step": 1447 + }, + { + "epoch": 0.41, + "grad_norm": 0.4304917727215731, + "learning_rate": 6.610939141772467e-06, + "loss": 0.1462, + "step": 1448 + }, + { + "epoch": 0.41, + "grad_norm": 0.5365579848682441, + "learning_rate": 6.6065583795942625e-06, + "loss": 0.1064, + "step": 1449 + }, + { + "epoch": 0.41, + "grad_norm": 0.26302471257781734, + "learning_rate": 6.602176241738449e-06, + "loss": 0.0799, + "step": 1450 + }, + { + "epoch": 0.41, + "grad_norm": 0.29495288160584304, + "learning_rate": 6.597792731957399e-06, + "loss": 0.0733, + "step": 1451 + }, + { + "epoch": 0.41, + "grad_norm": 0.5462077923402763, + "learning_rate": 6.593407854004666e-06, + "loss": 0.1174, + "step": 1452 + }, + { + "epoch": 0.42, + "grad_norm": 0.49316568084281137, + "learning_rate": 6.589021611634966e-06, + "loss": 0.1103, + "step": 1453 + }, + { + "epoch": 0.42, + "grad_norm": 0.5132963762830282, + "learning_rate": 6.584634008604191e-06, + "loss": 0.0898, + "step": 1454 + }, + { + "epoch": 0.42, + "grad_norm": 0.26852926328328786, + "learning_rate": 6.580245048669395e-06, + "loss": 0.0703, + "step": 1455 + }, + { + "epoch": 0.42, + "grad_norm": 0.195147398121515, + "learning_rate": 6.5758547355887944e-06, + "loss": 0.0509, + "step": 1456 + }, + { + "epoch": 0.42, + "grad_norm": 0.2298303915335172, + "learning_rate": 6.571463073121763e-06, + "loss": 0.0582, + "step": 1457 + }, + { + "epoch": 0.42, + "grad_norm": 0.3338043228952181, + "learning_rate": 6.567070065028833e-06, + "loss": 0.0654, + "step": 1458 + }, + { + "epoch": 0.42, + "grad_norm": 0.46569377694297515, + "learning_rate": 6.562675715071687e-06, + "loss": 0.1032, + "step": 1459 + }, + { + "epoch": 0.42, + "grad_norm": 0.23718009946674504, + "learning_rate": 6.558280027013155e-06, + "loss": 0.0574, + "step": 1460 + }, + { + "epoch": 0.42, + "grad_norm": 0.36490650797169855, + "learning_rate": 6.5538830046172175e-06, + "loss": 0.0872, + "step": 1461 + }, + { + "epoch": 0.42, + "grad_norm": 0.27201650887410783, + "learning_rate": 6.549484651648991e-06, + "loss": 0.0726, + "step": 1462 + }, + { + "epoch": 0.42, + "grad_norm": 0.8062505615723329, + "learning_rate": 6.545084971874738e-06, + "loss": 0.1241, + "step": 1463 + }, + { + "epoch": 0.42, + "grad_norm": 0.31937336599729654, + "learning_rate": 6.540683969061852e-06, + "loss": 0.0846, + "step": 1464 + }, + { + "epoch": 0.42, + "grad_norm": 0.3313888405197613, + "learning_rate": 6.536281646978863e-06, + "loss": 0.1053, + "step": 1465 + }, + { + "epoch": 0.42, + "grad_norm": 0.599728482274412, + "learning_rate": 6.5318780093954274e-06, + "loss": 0.1326, + "step": 1466 + }, + { + "epoch": 0.42, + "grad_norm": 0.41134979401673594, + "learning_rate": 6.527473060082332e-06, + "loss": 0.0952, + "step": 1467 + }, + { + "epoch": 0.42, + "grad_norm": 0.46696607191491313, + "learning_rate": 6.523066802811483e-06, + "loss": 0.0854, + "step": 1468 + }, + { + "epoch": 0.42, + "grad_norm": 0.7528784570553645, + "learning_rate": 6.5186592413559095e-06, + "loss": 0.149, + "step": 1469 + }, + { + "epoch": 0.42, + "grad_norm": 0.3896412581457436, + "learning_rate": 6.514250379489754e-06, + "loss": 0.0834, + "step": 1470 + }, + { + "epoch": 0.42, + "grad_norm": 0.3704572654202895, + "learning_rate": 6.509840220988277e-06, + "loss": 0.1132, + "step": 1471 + }, + { + "epoch": 0.42, + "grad_norm": 0.3044333766039238, + "learning_rate": 6.5054287696278455e-06, + "loss": 0.0467, + "step": 1472 + }, + { + "epoch": 0.42, + "grad_norm": 0.3783897809573116, + "learning_rate": 6.501016029185936e-06, + "loss": 0.0907, + "step": 1473 + }, + { + "epoch": 0.42, + "grad_norm": 0.30012817272031916, + "learning_rate": 6.496602003441127e-06, + "loss": 0.0789, + "step": 1474 + }, + { + "epoch": 0.42, + "grad_norm": 0.44966823770999975, + "learning_rate": 6.492186696173097e-06, + "loss": 0.0966, + "step": 1475 + }, + { + "epoch": 0.42, + "grad_norm": 0.3876318539712018, + "learning_rate": 6.4877701111626266e-06, + "loss": 0.0542, + "step": 1476 + }, + { + "epoch": 0.42, + "grad_norm": 0.33770978651482897, + "learning_rate": 6.483352252191585e-06, + "loss": 0.0642, + "step": 1477 + }, + { + "epoch": 0.42, + "grad_norm": 0.3674837351329926, + "learning_rate": 6.478933123042934e-06, + "loss": 0.0373, + "step": 1478 + }, + { + "epoch": 0.42, + "grad_norm": 0.30121807029193237, + "learning_rate": 6.474512727500725e-06, + "loss": 0.0827, + "step": 1479 + }, + { + "epoch": 0.42, + "grad_norm": 0.5511239489168589, + "learning_rate": 6.47009106935009e-06, + "loss": 0.1201, + "step": 1480 + }, + { + "epoch": 0.42, + "grad_norm": 0.5329639725909865, + "learning_rate": 6.465668152377247e-06, + "loss": 0.1094, + "step": 1481 + }, + { + "epoch": 0.42, + "grad_norm": 0.467998118357438, + "learning_rate": 6.461243980369489e-06, + "loss": 0.098, + "step": 1482 + }, + { + "epoch": 0.42, + "grad_norm": 0.27611797369931845, + "learning_rate": 6.456818557115182e-06, + "loss": 0.0666, + "step": 1483 + }, + { + "epoch": 0.42, + "grad_norm": 0.504297881148237, + "learning_rate": 6.452391886403767e-06, + "loss": 0.0926, + "step": 1484 + }, + { + "epoch": 0.42, + "grad_norm": 0.47484383288801263, + "learning_rate": 6.447963972025752e-06, + "loss": 0.0678, + "step": 1485 + }, + { + "epoch": 0.42, + "grad_norm": 0.7171332966002333, + "learning_rate": 6.443534817772707e-06, + "loss": 0.1475, + "step": 1486 + }, + { + "epoch": 0.42, + "grad_norm": 0.5516107285527765, + "learning_rate": 6.4391044274372706e-06, + "loss": 0.1202, + "step": 1487 + }, + { + "epoch": 0.43, + "grad_norm": 0.7818961227000709, + "learning_rate": 6.434672804813131e-06, + "loss": 0.1328, + "step": 1488 + }, + { + "epoch": 0.43, + "grad_norm": 0.38446665166782235, + "learning_rate": 6.430239953695039e-06, + "loss": 0.0565, + "step": 1489 + }, + { + "epoch": 0.43, + "grad_norm": 0.6605993525909495, + "learning_rate": 6.425805877878794e-06, + "loss": 0.0956, + "step": 1490 + }, + { + "epoch": 0.43, + "grad_norm": 0.6157783453818049, + "learning_rate": 6.421370581161244e-06, + "loss": 0.1527, + "step": 1491 + }, + { + "epoch": 0.43, + "grad_norm": 0.41730228265956526, + "learning_rate": 6.4169340673402814e-06, + "loss": 0.1015, + "step": 1492 + }, + { + "epoch": 0.43, + "grad_norm": 0.7657718379287314, + "learning_rate": 6.412496340214846e-06, + "loss": 0.0909, + "step": 1493 + }, + { + "epoch": 0.43, + "grad_norm": 0.4930339127764775, + "learning_rate": 6.4080574035849096e-06, + "loss": 0.0811, + "step": 1494 + }, + { + "epoch": 0.43, + "grad_norm": 0.6344653860002332, + "learning_rate": 6.403617261251485e-06, + "loss": 0.0954, + "step": 1495 + }, + { + "epoch": 0.43, + "grad_norm": 0.550946449107486, + "learning_rate": 6.399175917016613e-06, + "loss": 0.0728, + "step": 1496 + }, + { + "epoch": 0.43, + "grad_norm": 0.2713422016625722, + "learning_rate": 6.394733374683371e-06, + "loss": 0.0615, + "step": 1497 + }, + { + "epoch": 0.43, + "grad_norm": 0.9531303100198866, + "learning_rate": 6.390289638055851e-06, + "loss": 0.0648, + "step": 1498 + }, + { + "epoch": 0.43, + "grad_norm": 0.40299138120665573, + "learning_rate": 6.385844710939179e-06, + "loss": 0.0745, + "step": 1499 + }, + { + "epoch": 0.43, + "grad_norm": 0.5637010517456598, + "learning_rate": 6.381398597139492e-06, + "loss": 0.1182, + "step": 1500 + }, + { + "epoch": 0.43, + "grad_norm": 0.28354064126598905, + "learning_rate": 6.376951300463948e-06, + "loss": 0.0579, + "step": 1501 + }, + { + "epoch": 0.43, + "grad_norm": 0.5843975672098469, + "learning_rate": 6.372502824720716e-06, + "loss": 0.1196, + "step": 1502 + }, + { + "epoch": 0.43, + "grad_norm": 0.5052794192271838, + "learning_rate": 6.368053173718978e-06, + "loss": 0.1069, + "step": 1503 + }, + { + "epoch": 0.43, + "grad_norm": 0.5564935999984026, + "learning_rate": 6.363602351268913e-06, + "loss": 0.1397, + "step": 1504 + }, + { + "epoch": 0.43, + "grad_norm": 0.5095795974359075, + "learning_rate": 6.3591503611817155e-06, + "loss": 0.1064, + "step": 1505 + }, + { + "epoch": 0.43, + "grad_norm": 0.6050161213150755, + "learning_rate": 6.35469720726957e-06, + "loss": 0.1015, + "step": 1506 + }, + { + "epoch": 0.43, + "grad_norm": 0.2588092761354788, + "learning_rate": 6.350242893345664e-06, + "loss": 0.0446, + "step": 1507 + }, + { + "epoch": 0.43, + "grad_norm": 0.4729876848485526, + "learning_rate": 6.345787423224174e-06, + "loss": 0.1086, + "step": 1508 + }, + { + "epoch": 0.43, + "grad_norm": 0.2992442928232585, + "learning_rate": 6.341330800720269e-06, + "loss": 0.0793, + "step": 1509 + }, + { + "epoch": 0.43, + "grad_norm": 0.4261129994450151, + "learning_rate": 6.336873029650104e-06, + "loss": 0.089, + "step": 1510 + }, + { + "epoch": 0.43, + "grad_norm": 0.4443368196552463, + "learning_rate": 6.332414113830816e-06, + "loss": 0.1577, + "step": 1511 + }, + { + "epoch": 0.43, + "grad_norm": 0.27405575464330806, + "learning_rate": 6.3279540570805265e-06, + "loss": 0.049, + "step": 1512 + }, + { + "epoch": 0.43, + "grad_norm": 1.2245225957989012, + "learning_rate": 6.3234928632183276e-06, + "loss": 0.1352, + "step": 1513 + }, + { + "epoch": 0.43, + "grad_norm": 0.5191731693081545, + "learning_rate": 6.319030536064295e-06, + "loss": 0.1246, + "step": 1514 + }, + { + "epoch": 0.43, + "grad_norm": 0.4021125762969779, + "learning_rate": 6.3145670794394595e-06, + "loss": 0.093, + "step": 1515 + }, + { + "epoch": 0.43, + "grad_norm": 0.2847701154707854, + "learning_rate": 6.310102497165836e-06, + "loss": 0.0495, + "step": 1516 + }, + { + "epoch": 0.43, + "grad_norm": 0.2923960049003912, + "learning_rate": 6.305636793066392e-06, + "loss": 0.081, + "step": 1517 + }, + { + "epoch": 0.43, + "grad_norm": 0.617457696014792, + "learning_rate": 6.3011699709650586e-06, + "loss": 0.0788, + "step": 1518 + }, + { + "epoch": 0.43, + "grad_norm": 0.4874484095341976, + "learning_rate": 6.296702034686726e-06, + "loss": 0.0467, + "step": 1519 + }, + { + "epoch": 0.43, + "grad_norm": 0.26254787568666776, + "learning_rate": 6.292232988057235e-06, + "loss": 0.0705, + "step": 1520 + }, + { + "epoch": 0.43, + "grad_norm": 0.45079075618602954, + "learning_rate": 6.2877628349033825e-06, + "loss": 0.1232, + "step": 1521 + }, + { + "epoch": 0.43, + "grad_norm": 0.4535865482010611, + "learning_rate": 6.283291579052906e-06, + "loss": 0.0748, + "step": 1522 + }, + { + "epoch": 0.44, + "grad_norm": 0.3932746346044528, + "learning_rate": 6.2788192243344935e-06, + "loss": 0.0929, + "step": 1523 + }, + { + "epoch": 0.44, + "grad_norm": 0.5699864981861645, + "learning_rate": 6.27434577457777e-06, + "loss": 0.0876, + "step": 1524 + }, + { + "epoch": 0.44, + "grad_norm": 0.2960336863436895, + "learning_rate": 6.2698712336133e-06, + "loss": 0.075, + "step": 1525 + }, + { + "epoch": 0.44, + "grad_norm": 0.2852590643376082, + "learning_rate": 6.265395605272581e-06, + "loss": 0.0726, + "step": 1526 + }, + { + "epoch": 0.44, + "grad_norm": 0.30239596225304155, + "learning_rate": 6.260918893388045e-06, + "loss": 0.0838, + "step": 1527 + }, + { + "epoch": 0.44, + "grad_norm": 0.3464618142262354, + "learning_rate": 6.256441101793046e-06, + "loss": 0.0872, + "step": 1528 + }, + { + "epoch": 0.44, + "grad_norm": 0.3643088366043018, + "learning_rate": 6.251962234321869e-06, + "loss": 0.0881, + "step": 1529 + }, + { + "epoch": 0.44, + "grad_norm": 0.512353931165177, + "learning_rate": 6.247482294809712e-06, + "loss": 0.141, + "step": 1530 + }, + { + "epoch": 0.44, + "grad_norm": 0.3101410930114144, + "learning_rate": 6.243001287092704e-06, + "loss": 0.0491, + "step": 1531 + }, + { + "epoch": 0.44, + "grad_norm": 0.3687686011837704, + "learning_rate": 6.238519215007874e-06, + "loss": 0.1096, + "step": 1532 + }, + { + "epoch": 0.44, + "grad_norm": 0.34002579100593294, + "learning_rate": 6.234036082393171e-06, + "loss": 0.0914, + "step": 1533 + }, + { + "epoch": 0.44, + "grad_norm": 0.3508100813297846, + "learning_rate": 6.229551893087453e-06, + "loss": 0.099, + "step": 1534 + }, + { + "epoch": 0.44, + "grad_norm": 0.4404280287492867, + "learning_rate": 6.225066650930476e-06, + "loss": 0.1161, + "step": 1535 + }, + { + "epoch": 0.44, + "grad_norm": 0.5343769153846271, + "learning_rate": 6.2205803597629054e-06, + "loss": 0.103, + "step": 1536 + }, + { + "epoch": 0.44, + "grad_norm": 0.27792273844699805, + "learning_rate": 6.2160930234262976e-06, + "loss": 0.0686, + "step": 1537 + }, + { + "epoch": 0.44, + "grad_norm": 0.4600398166711245, + "learning_rate": 6.21160464576311e-06, + "loss": 0.1058, + "step": 1538 + }, + { + "epoch": 0.44, + "grad_norm": 0.43877281003570706, + "learning_rate": 6.207115230616689e-06, + "loss": 0.0945, + "step": 1539 + }, + { + "epoch": 0.44, + "grad_norm": 0.5050427102883335, + "learning_rate": 6.202624781831269e-06, + "loss": 0.122, + "step": 1540 + }, + { + "epoch": 0.44, + "grad_norm": 0.5784410529318813, + "learning_rate": 6.19813330325197e-06, + "loss": 0.1267, + "step": 1541 + }, + { + "epoch": 0.44, + "grad_norm": 0.4745251410287598, + "learning_rate": 6.193640798724794e-06, + "loss": 0.1139, + "step": 1542 + }, + { + "epoch": 0.44, + "grad_norm": 0.8324889040644113, + "learning_rate": 6.18914727209662e-06, + "loss": 0.1506, + "step": 1543 + }, + { + "epoch": 0.44, + "grad_norm": 0.570287586282885, + "learning_rate": 6.184652727215207e-06, + "loss": 0.0856, + "step": 1544 + }, + { + "epoch": 0.44, + "grad_norm": 0.5169236252677503, + "learning_rate": 6.18015716792918e-06, + "loss": 0.1102, + "step": 1545 + }, + { + "epoch": 0.44, + "grad_norm": 0.48575848359046936, + "learning_rate": 6.175660598088034e-06, + "loss": 0.092, + "step": 1546 + }, + { + "epoch": 0.44, + "grad_norm": 0.33627824765172376, + "learning_rate": 6.171163021542134e-06, + "loss": 0.0916, + "step": 1547 + }, + { + "epoch": 0.44, + "grad_norm": 0.39397531807969416, + "learning_rate": 6.1666644421427015e-06, + "loss": 0.07, + "step": 1548 + }, + { + "epoch": 0.44, + "grad_norm": 0.6584198092141058, + "learning_rate": 6.162164863741817e-06, + "loss": 0.1271, + "step": 1549 + }, + { + "epoch": 0.44, + "grad_norm": 0.6403512448013448, + "learning_rate": 6.157664290192421e-06, + "loss": 0.1301, + "step": 1550 + }, + { + "epoch": 0.44, + "grad_norm": 0.3767507345736296, + "learning_rate": 6.1531627253483025e-06, + "loss": 0.073, + "step": 1551 + }, + { + "epoch": 0.44, + "grad_norm": 0.34170917949479634, + "learning_rate": 6.148660173064098e-06, + "loss": 0.088, + "step": 1552 + }, + { + "epoch": 0.44, + "grad_norm": 0.4116452204062582, + "learning_rate": 6.144156637195296e-06, + "loss": 0.0668, + "step": 1553 + }, + { + "epoch": 0.44, + "grad_norm": 0.42638117894199135, + "learning_rate": 6.139652121598219e-06, + "loss": 0.0918, + "step": 1554 + }, + { + "epoch": 0.44, + "grad_norm": 0.3220134925525686, + "learning_rate": 6.135146630130033e-06, + "loss": 0.0539, + "step": 1555 + }, + { + "epoch": 0.44, + "grad_norm": 0.44043160920308577, + "learning_rate": 6.13064016664874e-06, + "loss": 0.108, + "step": 1556 + }, + { + "epoch": 0.44, + "grad_norm": 0.3188262864721422, + "learning_rate": 6.1261327350131726e-06, + "loss": 0.0688, + "step": 1557 + }, + { + "epoch": 0.45, + "grad_norm": 0.4866909580068489, + "learning_rate": 6.1216243390829926e-06, + "loss": 0.0987, + "step": 1558 + }, + { + "epoch": 0.45, + "grad_norm": 0.3897816169344711, + "learning_rate": 6.11711498271869e-06, + "loss": 0.113, + "step": 1559 + }, + { + "epoch": 0.45, + "grad_norm": 0.33934496018117577, + "learning_rate": 6.112604669781572e-06, + "loss": 0.0818, + "step": 1560 + }, + { + "epoch": 0.45, + "grad_norm": 0.47644992139601133, + "learning_rate": 6.108093404133772e-06, + "loss": 0.0799, + "step": 1561 + }, + { + "epoch": 0.45, + "grad_norm": 0.29261316731482023, + "learning_rate": 6.103581189638232e-06, + "loss": 0.0595, + "step": 1562 + }, + { + "epoch": 0.45, + "grad_norm": 0.41203875162572184, + "learning_rate": 6.099068030158712e-06, + "loss": 0.073, + "step": 1563 + }, + { + "epoch": 0.45, + "grad_norm": 0.2928798237478887, + "learning_rate": 6.094553929559778e-06, + "loss": 0.0478, + "step": 1564 + }, + { + "epoch": 0.45, + "grad_norm": 0.44021163728740104, + "learning_rate": 6.090038891706801e-06, + "loss": 0.1198, + "step": 1565 + }, + { + "epoch": 0.45, + "grad_norm": 0.291893015226659, + "learning_rate": 6.08552292046596e-06, + "loss": 0.0969, + "step": 1566 + }, + { + "epoch": 0.45, + "grad_norm": 0.24413792006020263, + "learning_rate": 6.081006019704227e-06, + "loss": 0.0499, + "step": 1567 + }, + { + "epoch": 0.45, + "grad_norm": 0.5508667269722093, + "learning_rate": 6.076488193289375e-06, + "loss": 0.1126, + "step": 1568 + }, + { + "epoch": 0.45, + "grad_norm": 0.2722830665827193, + "learning_rate": 6.071969445089965e-06, + "loss": 0.0641, + "step": 1569 + }, + { + "epoch": 0.45, + "grad_norm": 0.4868550402933533, + "learning_rate": 6.06744977897535e-06, + "loss": 0.1031, + "step": 1570 + }, + { + "epoch": 0.45, + "grad_norm": 0.607048435151134, + "learning_rate": 6.0629291988156676e-06, + "loss": 0.1195, + "step": 1571 + }, + { + "epoch": 0.45, + "grad_norm": 0.4025357090256397, + "learning_rate": 6.058407708481843e-06, + "loss": 0.1115, + "step": 1572 + }, + { + "epoch": 0.45, + "grad_norm": 0.5038586230503376, + "learning_rate": 6.053885311845571e-06, + "loss": 0.131, + "step": 1573 + }, + { + "epoch": 0.45, + "grad_norm": 0.34539097482969766, + "learning_rate": 6.049362012779333e-06, + "loss": 0.0633, + "step": 1574 + }, + { + "epoch": 0.45, + "grad_norm": 0.3174818014038329, + "learning_rate": 6.044837815156377e-06, + "loss": 0.0796, + "step": 1575 + }, + { + "epoch": 0.45, + "grad_norm": 0.5081576105379931, + "learning_rate": 6.04031272285072e-06, + "loss": 0.0929, + "step": 1576 + }, + { + "epoch": 0.45, + "grad_norm": 0.38081862575066916, + "learning_rate": 6.035786739737148e-06, + "loss": 0.0959, + "step": 1577 + }, + { + "epoch": 0.45, + "grad_norm": 0.4209994258378272, + "learning_rate": 6.0312598696912096e-06, + "loss": 0.0468, + "step": 1578 + }, + { + "epoch": 0.45, + "grad_norm": 0.4831905612133384, + "learning_rate": 6.026732116589211e-06, + "loss": 0.1195, + "step": 1579 + }, + { + "epoch": 0.45, + "grad_norm": 0.554745652322106, + "learning_rate": 6.022203484308217e-06, + "loss": 0.0795, + "step": 1580 + }, + { + "epoch": 0.45, + "grad_norm": 0.8422194257132133, + "learning_rate": 6.0176739767260415e-06, + "loss": 0.1266, + "step": 1581 + }, + { + "epoch": 0.45, + "grad_norm": 0.2629273377921984, + "learning_rate": 6.013143597721252e-06, + "loss": 0.0778, + "step": 1582 + }, + { + "epoch": 0.45, + "grad_norm": 0.46135694152117934, + "learning_rate": 6.008612351173159e-06, + "loss": 0.1339, + "step": 1583 + }, + { + "epoch": 0.45, + "grad_norm": 0.5323434048873409, + "learning_rate": 6.004080240961818e-06, + "loss": 0.0572, + "step": 1584 + }, + { + "epoch": 0.45, + "grad_norm": 0.35903437337253497, + "learning_rate": 5.999547270968024e-06, + "loss": 0.0696, + "step": 1585 + }, + { + "epoch": 0.45, + "grad_norm": 0.3355668525678312, + "learning_rate": 5.9950134450733045e-06, + "loss": 0.0958, + "step": 1586 + }, + { + "epoch": 0.45, + "grad_norm": 0.5217738759488709, + "learning_rate": 5.990478767159926e-06, + "loss": 0.06, + "step": 1587 + }, + { + "epoch": 0.45, + "grad_norm": 1.008012013800437, + "learning_rate": 5.985943241110881e-06, + "loss": 0.0652, + "step": 1588 + }, + { + "epoch": 0.45, + "grad_norm": 0.22873010594932353, + "learning_rate": 5.981406870809889e-06, + "loss": 0.0562, + "step": 1589 + }, + { + "epoch": 0.45, + "grad_norm": 0.3172153423836355, + "learning_rate": 5.976869660141389e-06, + "loss": 0.0641, + "step": 1590 + }, + { + "epoch": 0.45, + "grad_norm": 0.22765343451518014, + "learning_rate": 5.972331612990546e-06, + "loss": 0.0622, + "step": 1591 + }, + { + "epoch": 0.45, + "grad_norm": 0.44709331552672515, + "learning_rate": 5.967792733243239e-06, + "loss": 0.11, + "step": 1592 + }, + { + "epoch": 0.46, + "grad_norm": 0.7527874138092266, + "learning_rate": 5.963253024786053e-06, + "loss": 0.1596, + "step": 1593 + }, + { + "epoch": 0.46, + "grad_norm": 0.461994906572781, + "learning_rate": 5.958712491506295e-06, + "loss": 0.0871, + "step": 1594 + }, + { + "epoch": 0.46, + "grad_norm": 0.42772132151415737, + "learning_rate": 5.954171137291968e-06, + "loss": 0.1114, + "step": 1595 + }, + { + "epoch": 0.46, + "grad_norm": 0.3103664339231245, + "learning_rate": 5.949628966031785e-06, + "loss": 0.0807, + "step": 1596 + }, + { + "epoch": 0.46, + "grad_norm": 0.44769070484034534, + "learning_rate": 5.945085981615153e-06, + "loss": 0.1122, + "step": 1597 + }, + { + "epoch": 0.46, + "grad_norm": 0.273929006442074, + "learning_rate": 5.9405421879321775e-06, + "loss": 0.0522, + "step": 1598 + }, + { + "epoch": 0.46, + "grad_norm": 0.6064356448255825, + "learning_rate": 5.9359975888736575e-06, + "loss": 0.1255, + "step": 1599 + }, + { + "epoch": 0.46, + "grad_norm": 0.5084612825724896, + "learning_rate": 5.931452188331084e-06, + "loss": 0.0977, + "step": 1600 + }, + { + "epoch": 0.46, + "grad_norm": 0.5017692422046749, + "learning_rate": 5.9269059901966276e-06, + "loss": 0.1213, + "step": 1601 + }, + { + "epoch": 0.46, + "grad_norm": 0.5983936099829104, + "learning_rate": 5.922358998363148e-06, + "loss": 0.1326, + "step": 1602 + }, + { + "epoch": 0.46, + "grad_norm": 0.5481813785522884, + "learning_rate": 5.9178112167241805e-06, + "loss": 0.1221, + "step": 1603 + }, + { + "epoch": 0.46, + "grad_norm": 0.28442226686611066, + "learning_rate": 5.9132626491739434e-06, + "loss": 0.0912, + "step": 1604 + }, + { + "epoch": 0.46, + "grad_norm": 0.521759258678626, + "learning_rate": 5.908713299607318e-06, + "loss": 0.1018, + "step": 1605 + }, + { + "epoch": 0.46, + "grad_norm": 0.3343345513119317, + "learning_rate": 5.904163171919863e-06, + "loss": 0.0762, + "step": 1606 + }, + { + "epoch": 0.46, + "grad_norm": 0.4843724584068069, + "learning_rate": 5.8996122700077995e-06, + "loss": 0.0725, + "step": 1607 + }, + { + "epoch": 0.46, + "grad_norm": 0.5069068934350319, + "learning_rate": 5.8950605977680156e-06, + "loss": 0.1411, + "step": 1608 + }, + { + "epoch": 0.46, + "grad_norm": 0.8410660748434027, + "learning_rate": 5.890508159098054e-06, + "loss": 0.1587, + "step": 1609 + }, + { + "epoch": 0.46, + "grad_norm": 0.4906275464104598, + "learning_rate": 5.885954957896115e-06, + "loss": 0.0849, + "step": 1610 + }, + { + "epoch": 0.46, + "grad_norm": 0.41330337650584437, + "learning_rate": 5.8814009980610556e-06, + "loss": 0.0716, + "step": 1611 + }, + { + "epoch": 0.46, + "grad_norm": 0.34820977706585926, + "learning_rate": 5.8768462834923765e-06, + "loss": 0.1004, + "step": 1612 + }, + { + "epoch": 0.46, + "grad_norm": 0.29546894717527333, + "learning_rate": 5.87229081809023e-06, + "loss": 0.0455, + "step": 1613 + }, + { + "epoch": 0.46, + "grad_norm": 0.4006065237789604, + "learning_rate": 5.86773460575541e-06, + "loss": 0.0864, + "step": 1614 + }, + { + "epoch": 0.46, + "grad_norm": 0.371814294402616, + "learning_rate": 5.863177650389346e-06, + "loss": 0.0568, + "step": 1615 + }, + { + "epoch": 0.46, + "grad_norm": 0.2632650599541928, + "learning_rate": 5.85861995589411e-06, + "loss": 0.0801, + "step": 1616 + }, + { + "epoch": 0.46, + "grad_norm": 0.3400357185661022, + "learning_rate": 5.854061526172402e-06, + "loss": 0.0604, + "step": 1617 + }, + { + "epoch": 0.46, + "grad_norm": 0.2766265717611615, + "learning_rate": 5.849502365127555e-06, + "loss": 0.0538, + "step": 1618 + }, + { + "epoch": 0.46, + "grad_norm": 0.32857827802430734, + "learning_rate": 5.844942476663524e-06, + "loss": 0.0723, + "step": 1619 + }, + { + "epoch": 0.46, + "grad_norm": 0.34501487504396006, + "learning_rate": 5.840381864684892e-06, + "loss": 0.0568, + "step": 1620 + }, + { + "epoch": 0.46, + "grad_norm": 0.2920261742900055, + "learning_rate": 5.8358205330968565e-06, + "loss": 0.0889, + "step": 1621 + }, + { + "epoch": 0.46, + "grad_norm": 0.26203902687466885, + "learning_rate": 5.8312584858052366e-06, + "loss": 0.0574, + "step": 1622 + }, + { + "epoch": 0.46, + "grad_norm": 0.5179357804745687, + "learning_rate": 5.826695726716459e-06, + "loss": 0.1241, + "step": 1623 + }, + { + "epoch": 0.46, + "grad_norm": 0.5270969682793168, + "learning_rate": 5.822132259737565e-06, + "loss": 0.119, + "step": 1624 + }, + { + "epoch": 0.46, + "grad_norm": 0.43987989675138955, + "learning_rate": 5.817568088776195e-06, + "loss": 0.1053, + "step": 1625 + }, + { + "epoch": 0.46, + "grad_norm": 0.3553046774849037, + "learning_rate": 5.8130032177406e-06, + "loss": 0.0751, + "step": 1626 + }, + { + "epoch": 0.46, + "grad_norm": 0.38753349293150197, + "learning_rate": 5.808437650539625e-06, + "loss": 0.0901, + "step": 1627 + }, + { + "epoch": 0.47, + "grad_norm": 0.3550301524423936, + "learning_rate": 5.803871391082716e-06, + "loss": 0.076, + "step": 1628 + }, + { + "epoch": 0.47, + "grad_norm": 0.47481754395175735, + "learning_rate": 5.799304443279905e-06, + "loss": 0.1049, + "step": 1629 + }, + { + "epoch": 0.47, + "grad_norm": 0.5041031416435303, + "learning_rate": 5.794736811041821e-06, + "loss": 0.1303, + "step": 1630 + }, + { + "epoch": 0.47, + "grad_norm": 0.40276753935917553, + "learning_rate": 5.7901684982796716e-06, + "loss": 0.1103, + "step": 1631 + }, + { + "epoch": 0.47, + "grad_norm": 0.3539687959012876, + "learning_rate": 5.785599508905254e-06, + "loss": 0.0908, + "step": 1632 + }, + { + "epoch": 0.47, + "grad_norm": 0.36296725857750745, + "learning_rate": 5.7810298468309404e-06, + "loss": 0.0909, + "step": 1633 + }, + { + "epoch": 0.47, + "grad_norm": 0.4679603141707834, + "learning_rate": 5.776459515969681e-06, + "loss": 0.1416, + "step": 1634 + }, + { + "epoch": 0.47, + "grad_norm": 0.3552108408298559, + "learning_rate": 5.771888520234997e-06, + "loss": 0.0715, + "step": 1635 + }, + { + "epoch": 0.47, + "grad_norm": 0.45845237300279773, + "learning_rate": 5.767316863540979e-06, + "loss": 0.0903, + "step": 1636 + }, + { + "epoch": 0.47, + "grad_norm": 0.3333494974439071, + "learning_rate": 5.762744549802288e-06, + "loss": 0.0787, + "step": 1637 + }, + { + "epoch": 0.47, + "grad_norm": 0.3663448907549116, + "learning_rate": 5.75817158293414e-06, + "loss": 0.0858, + "step": 1638 + }, + { + "epoch": 0.47, + "grad_norm": 0.33256693244323554, + "learning_rate": 5.7535979668523175e-06, + "loss": 0.0868, + "step": 1639 + }, + { + "epoch": 0.47, + "grad_norm": 0.30085328869337624, + "learning_rate": 5.749023705473154e-06, + "loss": 0.0613, + "step": 1640 + }, + { + "epoch": 0.47, + "grad_norm": 0.31145374124401287, + "learning_rate": 5.744448802713537e-06, + "loss": 0.0643, + "step": 1641 + }, + { + "epoch": 0.47, + "grad_norm": 0.5108038140962555, + "learning_rate": 5.739873262490905e-06, + "loss": 0.0952, + "step": 1642 + }, + { + "epoch": 0.47, + "grad_norm": 0.5800985739639948, + "learning_rate": 5.7352970887232395e-06, + "loss": 0.1517, + "step": 1643 + }, + { + "epoch": 0.47, + "grad_norm": 0.3264374011859333, + "learning_rate": 5.730720285329067e-06, + "loss": 0.0551, + "step": 1644 + }, + { + "epoch": 0.47, + "grad_norm": 0.2857478479789638, + "learning_rate": 5.726142856227453e-06, + "loss": 0.0789, + "step": 1645 + }, + { + "epoch": 0.47, + "grad_norm": 0.42996187472922226, + "learning_rate": 5.721564805337994e-06, + "loss": 0.0867, + "step": 1646 + }, + { + "epoch": 0.47, + "grad_norm": 0.7146067840887904, + "learning_rate": 5.716986136580827e-06, + "loss": 0.102, + "step": 1647 + }, + { + "epoch": 0.47, + "grad_norm": 0.4221017557141259, + "learning_rate": 5.712406853876611e-06, + "loss": 0.0924, + "step": 1648 + }, + { + "epoch": 0.47, + "grad_norm": 0.44854720995597286, + "learning_rate": 5.7078269611465355e-06, + "loss": 0.1095, + "step": 1649 + }, + { + "epoch": 0.47, + "grad_norm": 0.5802845790628374, + "learning_rate": 5.703246462312307e-06, + "loss": 0.1045, + "step": 1650 + }, + { + "epoch": 0.47, + "grad_norm": 0.3941843737136875, + "learning_rate": 5.698665361296159e-06, + "loss": 0.0834, + "step": 1651 + }, + { + "epoch": 0.47, + "grad_norm": 0.3819219745328019, + "learning_rate": 5.694083662020835e-06, + "loss": 0.0957, + "step": 1652 + }, + { + "epoch": 0.47, + "grad_norm": 0.41415081906132956, + "learning_rate": 5.689501368409588e-06, + "loss": 0.1024, + "step": 1653 + }, + { + "epoch": 0.47, + "grad_norm": 0.4979139055435071, + "learning_rate": 5.684918484386188e-06, + "loss": 0.086, + "step": 1654 + }, + { + "epoch": 0.47, + "grad_norm": 0.3073305612027108, + "learning_rate": 5.680335013874903e-06, + "loss": 0.0687, + "step": 1655 + }, + { + "epoch": 0.47, + "grad_norm": 0.3042939310338812, + "learning_rate": 5.6757509608005104e-06, + "loss": 0.0628, + "step": 1656 + }, + { + "epoch": 0.47, + "grad_norm": 0.4752393760198132, + "learning_rate": 5.671166329088278e-06, + "loss": 0.1245, + "step": 1657 + }, + { + "epoch": 0.47, + "grad_norm": 0.6190283409663718, + "learning_rate": 5.666581122663978e-06, + "loss": 0.1435, + "step": 1658 + }, + { + "epoch": 0.47, + "grad_norm": 0.4020750810602729, + "learning_rate": 5.661995345453867e-06, + "loss": 0.1079, + "step": 1659 + }, + { + "epoch": 0.47, + "grad_norm": 0.4453041951082366, + "learning_rate": 5.657409001384695e-06, + "loss": 0.0819, + "step": 1660 + }, + { + "epoch": 0.47, + "grad_norm": 0.5326253661866354, + "learning_rate": 5.652822094383697e-06, + "loss": 0.1189, + "step": 1661 + }, + { + "epoch": 0.47, + "grad_norm": 0.6304797285655436, + "learning_rate": 5.6482346283785875e-06, + "loss": 0.1007, + "step": 1662 + }, + { + "epoch": 0.48, + "grad_norm": 0.345498055989681, + "learning_rate": 5.643646607297562e-06, + "loss": 0.068, + "step": 1663 + }, + { + "epoch": 0.48, + "grad_norm": 0.30374633281909824, + "learning_rate": 5.63905803506929e-06, + "loss": 0.0869, + "step": 1664 + }, + { + "epoch": 0.48, + "grad_norm": 0.3167988165275447, + "learning_rate": 5.634468915622915e-06, + "loss": 0.0779, + "step": 1665 + }, + { + "epoch": 0.48, + "grad_norm": 0.34570834283436513, + "learning_rate": 5.629879252888046e-06, + "loss": 0.0842, + "step": 1666 + }, + { + "epoch": 0.48, + "grad_norm": 0.28167780729170155, + "learning_rate": 5.625289050794761e-06, + "loss": 0.0797, + "step": 1667 + }, + { + "epoch": 0.48, + "grad_norm": 0.3643783364716649, + "learning_rate": 5.6206983132735946e-06, + "loss": 0.1014, + "step": 1668 + }, + { + "epoch": 0.48, + "grad_norm": 0.43282734392081657, + "learning_rate": 5.6161070442555465e-06, + "loss": 0.0943, + "step": 1669 + }, + { + "epoch": 0.48, + "grad_norm": 0.49469597540911775, + "learning_rate": 5.611515247672063e-06, + "loss": 0.128, + "step": 1670 + }, + { + "epoch": 0.48, + "grad_norm": 0.3600198581892039, + "learning_rate": 5.606922927455054e-06, + "loss": 0.0927, + "step": 1671 + }, + { + "epoch": 0.48, + "grad_norm": 0.4649005442976091, + "learning_rate": 5.602330087536865e-06, + "loss": 0.1025, + "step": 1672 + }, + { + "epoch": 0.48, + "grad_norm": 0.4889004397083401, + "learning_rate": 5.597736731850295e-06, + "loss": 0.1147, + "step": 1673 + }, + { + "epoch": 0.48, + "grad_norm": 0.5386122076842417, + "learning_rate": 5.593142864328581e-06, + "loss": 0.0858, + "step": 1674 + }, + { + "epoch": 0.48, + "grad_norm": 0.5213397170894769, + "learning_rate": 5.588548488905402e-06, + "loss": 0.105, + "step": 1675 + }, + { + "epoch": 0.48, + "grad_norm": 0.339827614017928, + "learning_rate": 5.583953609514865e-06, + "loss": 0.0575, + "step": 1676 + }, + { + "epoch": 0.48, + "grad_norm": 0.4752875943339977, + "learning_rate": 5.579358230091516e-06, + "loss": 0.0552, + "step": 1677 + }, + { + "epoch": 0.48, + "grad_norm": 0.6416464185421734, + "learning_rate": 5.574762354570326e-06, + "loss": 0.1486, + "step": 1678 + }, + { + "epoch": 0.48, + "grad_norm": 0.3603957152814472, + "learning_rate": 5.570165986886689e-06, + "loss": 0.1159, + "step": 1679 + }, + { + "epoch": 0.48, + "grad_norm": 0.39091485209605414, + "learning_rate": 5.5655691309764225e-06, + "loss": 0.0692, + "step": 1680 + }, + { + "epoch": 0.48, + "grad_norm": 0.4486385227761031, + "learning_rate": 5.560971790775762e-06, + "loss": 0.1328, + "step": 1681 + }, + { + "epoch": 0.48, + "grad_norm": 0.3301996352135462, + "learning_rate": 5.556373970221358e-06, + "loss": 0.0804, + "step": 1682 + }, + { + "epoch": 0.48, + "grad_norm": 0.48044802766468475, + "learning_rate": 5.551775673250271e-06, + "loss": 0.1247, + "step": 1683 + }, + { + "epoch": 0.48, + "grad_norm": 0.4630108842458435, + "learning_rate": 5.547176903799972e-06, + "loss": 0.0786, + "step": 1684 + }, + { + "epoch": 0.48, + "grad_norm": 0.5722662598875865, + "learning_rate": 5.542577665808332e-06, + "loss": 0.1361, + "step": 1685 + }, + { + "epoch": 0.48, + "grad_norm": 0.4614918847166946, + "learning_rate": 5.5379779632136284e-06, + "loss": 0.1133, + "step": 1686 + }, + { + "epoch": 0.48, + "grad_norm": 0.38964247144139563, + "learning_rate": 5.533377799954532e-06, + "loss": 0.1278, + "step": 1687 + }, + { + "epoch": 0.48, + "grad_norm": 0.381542864421644, + "learning_rate": 5.528777179970114e-06, + "loss": 0.1053, + "step": 1688 + }, + { + "epoch": 0.48, + "grad_norm": 0.5333533581918855, + "learning_rate": 5.524176107199828e-06, + "loss": 0.1012, + "step": 1689 + }, + { + "epoch": 0.48, + "grad_norm": 0.2590788715383924, + "learning_rate": 5.519574585583523e-06, + "loss": 0.0649, + "step": 1690 + }, + { + "epoch": 0.48, + "grad_norm": 0.5322182709840814, + "learning_rate": 5.514972619061427e-06, + "loss": 0.1026, + "step": 1691 + }, + { + "epoch": 0.48, + "grad_norm": 0.35366858025515724, + "learning_rate": 5.510370211574156e-06, + "loss": 0.0888, + "step": 1692 + }, + { + "epoch": 0.48, + "grad_norm": 0.3886167683604698, + "learning_rate": 5.505767367062695e-06, + "loss": 0.1082, + "step": 1693 + }, + { + "epoch": 0.48, + "grad_norm": 0.32878539158919307, + "learning_rate": 5.501164089468406e-06, + "loss": 0.0599, + "step": 1694 + }, + { + "epoch": 0.48, + "grad_norm": 0.449508249233077, + "learning_rate": 5.496560382733028e-06, + "loss": 0.08, + "step": 1695 + }, + { + "epoch": 0.48, + "grad_norm": 0.3450803689585386, + "learning_rate": 5.491956250798658e-06, + "loss": 0.101, + "step": 1696 + }, + { + "epoch": 0.48, + "grad_norm": 0.38664765688421115, + "learning_rate": 5.487351697607765e-06, + "loss": 0.0458, + "step": 1697 + }, + { + "epoch": 0.49, + "grad_norm": 0.6695114807096503, + "learning_rate": 5.482746727103174e-06, + "loss": 0.0793, + "step": 1698 + }, + { + "epoch": 0.49, + "grad_norm": 0.34257299124486956, + "learning_rate": 5.4781413432280685e-06, + "loss": 0.0911, + "step": 1699 + }, + { + "epoch": 0.49, + "grad_norm": 0.2698521025005036, + "learning_rate": 5.473535549925986e-06, + "loss": 0.0419, + "step": 1700 + }, + { + "epoch": 0.49, + "grad_norm": 0.4596661061762761, + "learning_rate": 5.4689293511408155e-06, + "loss": 0.1221, + "step": 1701 + }, + { + "epoch": 0.49, + "grad_norm": 0.6884284811535573, + "learning_rate": 5.464322750816791e-06, + "loss": 0.1311, + "step": 1702 + }, + { + "epoch": 0.49, + "grad_norm": 0.70145172750144, + "learning_rate": 5.459715752898494e-06, + "loss": 0.1311, + "step": 1703 + }, + { + "epoch": 0.49, + "grad_norm": 0.3882456722292968, + "learning_rate": 5.455108361330843e-06, + "loss": 0.0722, + "step": 1704 + }, + { + "epoch": 0.49, + "grad_norm": 0.7701544381387703, + "learning_rate": 5.450500580059095e-06, + "loss": 0.162, + "step": 1705 + }, + { + "epoch": 0.49, + "grad_norm": 0.39029789328923986, + "learning_rate": 5.445892413028839e-06, + "loss": 0.094, + "step": 1706 + }, + { + "epoch": 0.49, + "grad_norm": 0.2728642985894729, + "learning_rate": 5.441283864186e-06, + "loss": 0.0599, + "step": 1707 + }, + { + "epoch": 0.49, + "grad_norm": 0.5439590225895382, + "learning_rate": 5.43667493747682e-06, + "loss": 0.1258, + "step": 1708 + }, + { + "epoch": 0.49, + "grad_norm": 0.3635704725591149, + "learning_rate": 5.432065636847876e-06, + "loss": 0.0937, + "step": 1709 + }, + { + "epoch": 0.49, + "grad_norm": 0.44263235253746064, + "learning_rate": 5.427455966246057e-06, + "loss": 0.0911, + "step": 1710 + }, + { + "epoch": 0.49, + "grad_norm": 0.3852561623062434, + "learning_rate": 5.4228459296185696e-06, + "loss": 0.0943, + "step": 1711 + }, + { + "epoch": 0.49, + "grad_norm": 1.2710827522782717, + "learning_rate": 5.418235530912939e-06, + "loss": 0.1595, + "step": 1712 + }, + { + "epoch": 0.49, + "grad_norm": 0.38866509341925787, + "learning_rate": 5.413624774076993e-06, + "loss": 0.0485, + "step": 1713 + }, + { + "epoch": 0.49, + "grad_norm": 0.3768872146849069, + "learning_rate": 5.409013663058874e-06, + "loss": 0.0941, + "step": 1714 + }, + { + "epoch": 0.49, + "grad_norm": 0.4843679193799688, + "learning_rate": 5.404402201807022e-06, + "loss": 0.0787, + "step": 1715 + }, + { + "epoch": 0.49, + "grad_norm": 0.6257603145756986, + "learning_rate": 5.399790394270179e-06, + "loss": 0.1034, + "step": 1716 + }, + { + "epoch": 0.49, + "grad_norm": 0.3771225744028005, + "learning_rate": 5.395178244397382e-06, + "loss": 0.1121, + "step": 1717 + }, + { + "epoch": 0.49, + "grad_norm": 0.36688068268341084, + "learning_rate": 5.390565756137964e-06, + "loss": 0.0905, + "step": 1718 + }, + { + "epoch": 0.49, + "grad_norm": 0.33871418359254224, + "learning_rate": 5.385952933441545e-06, + "loss": 0.0691, + "step": 1719 + }, + { + "epoch": 0.49, + "grad_norm": 0.35771477481335084, + "learning_rate": 5.381339780258034e-06, + "loss": 0.0669, + "step": 1720 + }, + { + "epoch": 0.49, + "grad_norm": 0.7362815592084997, + "learning_rate": 5.376726300537618e-06, + "loss": 0.128, + "step": 1721 + }, + { + "epoch": 0.49, + "grad_norm": 0.2955159047101439, + "learning_rate": 5.372112498230771e-06, + "loss": 0.055, + "step": 1722 + }, + { + "epoch": 0.49, + "grad_norm": 0.3805913987843583, + "learning_rate": 5.367498377288236e-06, + "loss": 0.0606, + "step": 1723 + }, + { + "epoch": 0.49, + "grad_norm": 0.5074061802339812, + "learning_rate": 5.362883941661034e-06, + "loss": 0.1399, + "step": 1724 + }, + { + "epoch": 0.49, + "grad_norm": 0.4624573222358884, + "learning_rate": 5.358269195300454e-06, + "loss": 0.0929, + "step": 1725 + }, + { + "epoch": 0.49, + "grad_norm": 0.4293300336910408, + "learning_rate": 5.353654142158049e-06, + "loss": 0.0977, + "step": 1726 + }, + { + "epoch": 0.49, + "grad_norm": 0.39845874593890546, + "learning_rate": 5.349038786185639e-06, + "loss": 0.092, + "step": 1727 + }, + { + "epoch": 0.49, + "grad_norm": 0.3505025908314534, + "learning_rate": 5.3444231313352965e-06, + "loss": 0.0897, + "step": 1728 + }, + { + "epoch": 0.49, + "grad_norm": 0.42745382667252335, + "learning_rate": 5.339807181559359e-06, + "loss": 0.0938, + "step": 1729 + }, + { + "epoch": 0.49, + "grad_norm": 0.22585096868673774, + "learning_rate": 5.335190940810407e-06, + "loss": 0.0339, + "step": 1730 + }, + { + "epoch": 0.49, + "grad_norm": 0.8104057816148835, + "learning_rate": 5.330574413041278e-06, + "loss": 0.1438, + "step": 1731 + }, + { + "epoch": 0.49, + "grad_norm": 0.5442941487882357, + "learning_rate": 5.325957602205051e-06, + "loss": 0.1154, + "step": 1732 + }, + { + "epoch": 0.5, + "grad_norm": 0.3295448531274127, + "learning_rate": 5.321340512255047e-06, + "loss": 0.0724, + "step": 1733 + }, + { + "epoch": 0.5, + "grad_norm": 0.4071347177236589, + "learning_rate": 5.3167231471448296e-06, + "loss": 0.0483, + "step": 1734 + }, + { + "epoch": 0.5, + "grad_norm": 0.3818177023038044, + "learning_rate": 5.312105510828196e-06, + "loss": 0.0599, + "step": 1735 + }, + { + "epoch": 0.5, + "grad_norm": 0.40413321433612515, + "learning_rate": 5.307487607259175e-06, + "loss": 0.096, + "step": 1736 + }, + { + "epoch": 0.5, + "grad_norm": 0.27123886566658834, + "learning_rate": 5.302869440392022e-06, + "loss": 0.0602, + "step": 1737 + }, + { + "epoch": 0.5, + "grad_norm": 0.5408898387276906, + "learning_rate": 5.2982510141812245e-06, + "loss": 0.1086, + "step": 1738 + }, + { + "epoch": 0.5, + "grad_norm": 0.5563925108749879, + "learning_rate": 5.293632332581487e-06, + "loss": 0.1442, + "step": 1739 + }, + { + "epoch": 0.5, + "grad_norm": 0.4135835036813737, + "learning_rate": 5.289013399547732e-06, + "loss": 0.1028, + "step": 1740 + }, + { + "epoch": 0.5, + "grad_norm": 0.4467060010997873, + "learning_rate": 5.2843942190351e-06, + "loss": 0.0994, + "step": 1741 + }, + { + "epoch": 0.5, + "grad_norm": 0.8541430244209335, + "learning_rate": 5.2797747949989454e-06, + "loss": 0.1531, + "step": 1742 + }, + { + "epoch": 0.5, + "grad_norm": 0.6015915655857912, + "learning_rate": 5.275155131394825e-06, + "loss": 0.1221, + "step": 1743 + }, + { + "epoch": 0.5, + "grad_norm": 0.6835209558694298, + "learning_rate": 5.270535232178505e-06, + "loss": 0.1235, + "step": 1744 + }, + { + "epoch": 0.5, + "grad_norm": 0.4752525613133181, + "learning_rate": 5.265915101305952e-06, + "loss": 0.0737, + "step": 1745 + }, + { + "epoch": 0.5, + "grad_norm": 0.41208071341670405, + "learning_rate": 5.261294742733333e-06, + "loss": 0.0614, + "step": 1746 + }, + { + "epoch": 0.5, + "grad_norm": 0.31722740389800774, + "learning_rate": 5.256674160417006e-06, + "loss": 0.0607, + "step": 1747 + }, + { + "epoch": 0.5, + "grad_norm": 0.4338570013808057, + "learning_rate": 5.252053358313525e-06, + "loss": 0.1098, + "step": 1748 + }, + { + "epoch": 0.5, + "grad_norm": 0.47930442265747286, + "learning_rate": 5.247432340379628e-06, + "loss": 0.1117, + "step": 1749 + }, + { + "epoch": 0.5, + "grad_norm": 0.2795485943598554, + "learning_rate": 5.242811110572243e-06, + "loss": 0.0771, + "step": 1750 + }, + { + "epoch": 0.5, + "grad_norm": 0.8587182145827098, + "learning_rate": 5.238189672848472e-06, + "loss": 0.174, + "step": 1751 + }, + { + "epoch": 0.5, + "grad_norm": 0.3850768937056276, + "learning_rate": 5.233568031165603e-06, + "loss": 0.0831, + "step": 1752 + }, + { + "epoch": 0.5, + "grad_norm": 0.5811047288629524, + "learning_rate": 5.228946189481094e-06, + "loss": 0.0985, + "step": 1753 + }, + { + "epoch": 0.5, + "grad_norm": 0.4938779692662693, + "learning_rate": 5.224324151752575e-06, + "loss": 0.103, + "step": 1754 + }, + { + "epoch": 0.5, + "grad_norm": 0.40324228456050887, + "learning_rate": 5.219701921937845e-06, + "loss": 0.08, + "step": 1755 + }, + { + "epoch": 0.5, + "grad_norm": 0.4794459631357845, + "learning_rate": 5.215079503994866e-06, + "loss": 0.0994, + "step": 1756 + }, + { + "epoch": 0.5, + "grad_norm": 0.42398369004618675, + "learning_rate": 5.210456901881761e-06, + "loss": 0.1033, + "step": 1757 + }, + { + "epoch": 0.5, + "grad_norm": 0.39932128986874454, + "learning_rate": 5.2058341195568115e-06, + "loss": 0.084, + "step": 1758 + }, + { + "epoch": 0.5, + "grad_norm": 0.33495593223377956, + "learning_rate": 5.201211160978457e-06, + "loss": 0.0842, + "step": 1759 + }, + { + "epoch": 0.5, + "grad_norm": 0.5776028108630641, + "learning_rate": 5.196588030105278e-06, + "loss": 0.0797, + "step": 1760 + }, + { + "epoch": 0.5, + "grad_norm": 0.4644351553118459, + "learning_rate": 5.191964730896013e-06, + "loss": 0.0793, + "step": 1761 + }, + { + "epoch": 0.5, + "grad_norm": 0.42614134097942846, + "learning_rate": 5.187341267309539e-06, + "loss": 0.108, + "step": 1762 + }, + { + "epoch": 0.5, + "grad_norm": 0.4491794654933965, + "learning_rate": 5.182717643304876e-06, + "loss": 0.079, + "step": 1763 + }, + { + "epoch": 0.5, + "grad_norm": 0.29289051004587, + "learning_rate": 5.1780938628411795e-06, + "loss": 0.1037, + "step": 1764 + }, + { + "epoch": 0.5, + "grad_norm": 0.21499893698290443, + "learning_rate": 5.173469929877741e-06, + "loss": 0.0443, + "step": 1765 + }, + { + "epoch": 0.5, + "grad_norm": 0.35423226480291703, + "learning_rate": 5.168845848373979e-06, + "loss": 0.1047, + "step": 1766 + }, + { + "epoch": 0.5, + "grad_norm": 0.33833942312962095, + "learning_rate": 5.164221622289445e-06, + "loss": 0.0491, + "step": 1767 + }, + { + "epoch": 0.51, + "grad_norm": 0.588866523484762, + "learning_rate": 5.159597255583808e-06, + "loss": 0.0909, + "step": 1768 + }, + { + "epoch": 0.51, + "grad_norm": 0.1910207915071211, + "learning_rate": 5.154972752216865e-06, + "loss": 0.0629, + "step": 1769 + }, + { + "epoch": 0.51, + "grad_norm": 0.3875092536142003, + "learning_rate": 5.1503481161485206e-06, + "loss": 0.0757, + "step": 1770 + }, + { + "epoch": 0.51, + "grad_norm": 0.47769256631299484, + "learning_rate": 5.145723351338799e-06, + "loss": 0.117, + "step": 1771 + }, + { + "epoch": 0.51, + "grad_norm": 0.3002554362509087, + "learning_rate": 5.141098461747837e-06, + "loss": 0.0825, + "step": 1772 + }, + { + "epoch": 0.51, + "grad_norm": 0.4122361039490995, + "learning_rate": 5.136473451335869e-06, + "loss": 0.1097, + "step": 1773 + }, + { + "epoch": 0.51, + "grad_norm": 0.5074687911950858, + "learning_rate": 5.131848324063243e-06, + "loss": 0.1234, + "step": 1774 + }, + { + "epoch": 0.51, + "grad_norm": 0.26160101336100056, + "learning_rate": 5.127223083890402e-06, + "loss": 0.0723, + "step": 1775 + }, + { + "epoch": 0.51, + "grad_norm": 0.41088471896219075, + "learning_rate": 5.122597734777884e-06, + "loss": 0.0918, + "step": 1776 + }, + { + "epoch": 0.51, + "grad_norm": 0.41377911072083023, + "learning_rate": 5.1179722806863264e-06, + "loss": 0.1046, + "step": 1777 + }, + { + "epoch": 0.51, + "grad_norm": 0.40122840114603303, + "learning_rate": 5.11334672557645e-06, + "loss": 0.0643, + "step": 1778 + }, + { + "epoch": 0.51, + "grad_norm": 0.5176300532922882, + "learning_rate": 5.108721073409067e-06, + "loss": 0.0717, + "step": 1779 + }, + { + "epoch": 0.51, + "grad_norm": 0.44071189944171935, + "learning_rate": 5.104095328145069e-06, + "loss": 0.1066, + "step": 1780 + }, + { + "epoch": 0.51, + "grad_norm": 0.45067724330772735, + "learning_rate": 5.099469493745429e-06, + "loss": 0.0912, + "step": 1781 + }, + { + "epoch": 0.51, + "grad_norm": 0.304169288511332, + "learning_rate": 5.094843574171195e-06, + "loss": 0.0633, + "step": 1782 + }, + { + "epoch": 0.51, + "grad_norm": 0.4923736256734507, + "learning_rate": 5.0902175733834926e-06, + "loss": 0.1282, + "step": 1783 + }, + { + "epoch": 0.51, + "grad_norm": 0.3045856973069122, + "learning_rate": 5.08559149534351e-06, + "loss": 0.0645, + "step": 1784 + }, + { + "epoch": 0.51, + "grad_norm": 0.3108558213473051, + "learning_rate": 5.080965344012509e-06, + "loss": 0.0559, + "step": 1785 + }, + { + "epoch": 0.51, + "grad_norm": 0.5671978774407673, + "learning_rate": 5.076339123351805e-06, + "loss": 0.0905, + "step": 1786 + }, + { + "epoch": 0.51, + "grad_norm": 0.4856432131264074, + "learning_rate": 5.071712837322782e-06, + "loss": 0.1296, + "step": 1787 + }, + { + "epoch": 0.51, + "grad_norm": 0.3389170179946792, + "learning_rate": 5.067086489886873e-06, + "loss": 0.096, + "step": 1788 + }, + { + "epoch": 0.51, + "grad_norm": 0.5632118394665078, + "learning_rate": 5.06246008500557e-06, + "loss": 0.0964, + "step": 1789 + }, + { + "epoch": 0.51, + "grad_norm": 0.648512793509673, + "learning_rate": 5.0578336266404085e-06, + "loss": 0.122, + "step": 1790 + }, + { + "epoch": 0.51, + "grad_norm": 0.3258354727516121, + "learning_rate": 5.053207118752973e-06, + "loss": 0.068, + "step": 1791 + }, + { + "epoch": 0.51, + "grad_norm": 0.7180310667839394, + "learning_rate": 5.048580565304887e-06, + "loss": 0.1101, + "step": 1792 + }, + { + "epoch": 0.51, + "grad_norm": 0.2764883777437533, + "learning_rate": 5.043953970257819e-06, + "loss": 0.0713, + "step": 1793 + }, + { + "epoch": 0.51, + "grad_norm": 0.508040561230719, + "learning_rate": 5.0393273375734664e-06, + "loss": 0.1132, + "step": 1794 + }, + { + "epoch": 0.51, + "grad_norm": 0.3303481225347844, + "learning_rate": 5.034700671213565e-06, + "loss": 0.0791, + "step": 1795 + }, + { + "epoch": 0.51, + "grad_norm": 0.46792985989080804, + "learning_rate": 5.0300739751398744e-06, + "loss": 0.0796, + "step": 1796 + }, + { + "epoch": 0.51, + "grad_norm": 0.3827573561801135, + "learning_rate": 5.025447253314181e-06, + "loss": 0.1074, + "step": 1797 + }, + { + "epoch": 0.51, + "grad_norm": 0.36133266664408525, + "learning_rate": 5.020820509698296e-06, + "loss": 0.0908, + "step": 1798 + }, + { + "epoch": 0.51, + "grad_norm": 0.35464439574957596, + "learning_rate": 5.016193748254045e-06, + "loss": 0.0874, + "step": 1799 + }, + { + "epoch": 0.51, + "grad_norm": 0.4123590202063127, + "learning_rate": 5.0115669729432725e-06, + "loss": 0.0671, + "step": 1800 + }, + { + "epoch": 0.51, + "grad_norm": 0.44071860293528237, + "learning_rate": 5.006940187727832e-06, + "loss": 0.0894, + "step": 1801 + }, + { + "epoch": 0.51, + "grad_norm": 0.35522717609795534, + "learning_rate": 5.00231339656959e-06, + "loss": 0.1204, + "step": 1802 + }, + { + "epoch": 0.52, + "grad_norm": 0.38112553943593, + "learning_rate": 4.9976866034304116e-06, + "loss": 0.0683, + "step": 1803 + }, + { + "epoch": 0.52, + "grad_norm": 0.4659657997607652, + "learning_rate": 4.9930598122721695e-06, + "loss": 0.0628, + "step": 1804 + }, + { + "epoch": 0.52, + "grad_norm": 0.2967690693715893, + "learning_rate": 4.988433027056729e-06, + "loss": 0.0427, + "step": 1805 + }, + { + "epoch": 0.52, + "grad_norm": 0.44824372545590935, + "learning_rate": 4.983806251745958e-06, + "loss": 0.0956, + "step": 1806 + }, + { + "epoch": 0.52, + "grad_norm": 0.37800412975486114, + "learning_rate": 4.979179490301706e-06, + "loss": 0.1326, + "step": 1807 + }, + { + "epoch": 0.52, + "grad_norm": 0.7885437464260369, + "learning_rate": 4.9745527466858215e-06, + "loss": 0.1018, + "step": 1808 + }, + { + "epoch": 0.52, + "grad_norm": 0.4051763334362205, + "learning_rate": 4.969926024860127e-06, + "loss": 0.0654, + "step": 1809 + }, + { + "epoch": 0.52, + "grad_norm": 0.3781882626945609, + "learning_rate": 4.965299328786437e-06, + "loss": 0.0695, + "step": 1810 + }, + { + "epoch": 0.52, + "grad_norm": 0.3830674250490139, + "learning_rate": 4.960672662426535e-06, + "loss": 0.0909, + "step": 1811 + }, + { + "epoch": 0.52, + "grad_norm": 0.5041786500328712, + "learning_rate": 4.956046029742183e-06, + "loss": 0.1102, + "step": 1812 + }, + { + "epoch": 0.52, + "grad_norm": 0.5880617457405704, + "learning_rate": 4.951419434695115e-06, + "loss": 0.1123, + "step": 1813 + }, + { + "epoch": 0.52, + "grad_norm": 0.28522135886218086, + "learning_rate": 4.946792881247028e-06, + "loss": 0.0734, + "step": 1814 + }, + { + "epoch": 0.52, + "grad_norm": 0.3294796218255551, + "learning_rate": 4.942166373359593e-06, + "loss": 0.0801, + "step": 1815 + }, + { + "epoch": 0.52, + "grad_norm": 0.29668438106948947, + "learning_rate": 4.9375399149944305e-06, + "loss": 0.0702, + "step": 1816 + }, + { + "epoch": 0.52, + "grad_norm": 0.27321131758204753, + "learning_rate": 4.932913510113128e-06, + "loss": 0.0679, + "step": 1817 + }, + { + "epoch": 0.52, + "grad_norm": 0.3614485415983663, + "learning_rate": 4.9282871626772195e-06, + "loss": 0.0752, + "step": 1818 + }, + { + "epoch": 0.52, + "grad_norm": 0.3685190017228016, + "learning_rate": 4.923660876648197e-06, + "loss": 0.0686, + "step": 1819 + }, + { + "epoch": 0.52, + "grad_norm": 0.3290672961024479, + "learning_rate": 4.919034655987493e-06, + "loss": 0.0784, + "step": 1820 + }, + { + "epoch": 0.52, + "grad_norm": 0.5367577533454958, + "learning_rate": 4.914408504656491e-06, + "loss": 0.1332, + "step": 1821 + }, + { + "epoch": 0.52, + "grad_norm": 0.3384153629731904, + "learning_rate": 4.909782426616508e-06, + "loss": 0.0936, + "step": 1822 + }, + { + "epoch": 0.52, + "grad_norm": 0.29261661358632907, + "learning_rate": 4.9051564258288055e-06, + "loss": 0.0543, + "step": 1823 + }, + { + "epoch": 0.52, + "grad_norm": 0.6011233546440757, + "learning_rate": 4.900530506254573e-06, + "loss": 0.0882, + "step": 1824 + }, + { + "epoch": 0.52, + "grad_norm": 0.4096237989727531, + "learning_rate": 4.895904671854933e-06, + "loss": 0.1072, + "step": 1825 + }, + { + "epoch": 0.52, + "grad_norm": 0.4846911623910697, + "learning_rate": 4.8912789265909335e-06, + "loss": 0.1136, + "step": 1826 + }, + { + "epoch": 0.52, + "grad_norm": 0.42455388147756434, + "learning_rate": 4.886653274423551e-06, + "loss": 0.1202, + "step": 1827 + }, + { + "epoch": 0.52, + "grad_norm": 0.4470496073472172, + "learning_rate": 4.882027719313675e-06, + "loss": 0.115, + "step": 1828 + }, + { + "epoch": 0.52, + "grad_norm": 0.4737130846179845, + "learning_rate": 4.877402265222117e-06, + "loss": 0.1264, + "step": 1829 + }, + { + "epoch": 0.52, + "grad_norm": 0.5387389491188319, + "learning_rate": 4.872776916109601e-06, + "loss": 0.1138, + "step": 1830 + }, + { + "epoch": 0.52, + "grad_norm": 0.40017293609374827, + "learning_rate": 4.8681516759367595e-06, + "loss": 0.0819, + "step": 1831 + }, + { + "epoch": 0.52, + "grad_norm": 0.6302318750178969, + "learning_rate": 4.863526548664133e-06, + "loss": 0.0975, + "step": 1832 + }, + { + "epoch": 0.52, + "grad_norm": 0.4792126507643318, + "learning_rate": 4.858901538252166e-06, + "loss": 0.0995, + "step": 1833 + }, + { + "epoch": 0.52, + "grad_norm": 0.762190300077111, + "learning_rate": 4.8542766486612035e-06, + "loss": 0.0864, + "step": 1834 + }, + { + "epoch": 0.52, + "grad_norm": 0.4152887567328987, + "learning_rate": 4.849651883851482e-06, + "loss": 0.0966, + "step": 1835 + }, + { + "epoch": 0.52, + "grad_norm": 0.24764893449554634, + "learning_rate": 4.845027247783138e-06, + "loss": 0.0567, + "step": 1836 + }, + { + "epoch": 0.52, + "grad_norm": 0.39746673426372264, + "learning_rate": 4.840402744416193e-06, + "loss": 0.1, + "step": 1837 + }, + { + "epoch": 0.53, + "grad_norm": 0.3066734894909942, + "learning_rate": 4.8357783777105575e-06, + "loss": 0.0674, + "step": 1838 + }, + { + "epoch": 0.53, + "grad_norm": 0.40823531386248535, + "learning_rate": 4.831154151626022e-06, + "loss": 0.0943, + "step": 1839 + }, + { + "epoch": 0.53, + "grad_norm": 0.29000721163358345, + "learning_rate": 4.826530070122262e-06, + "loss": 0.0657, + "step": 1840 + }, + { + "epoch": 0.53, + "grad_norm": 0.45437564693786975, + "learning_rate": 4.821906137158822e-06, + "loss": 0.0965, + "step": 1841 + }, + { + "epoch": 0.53, + "grad_norm": 0.4105245817105171, + "learning_rate": 4.817282356695126e-06, + "loss": 0.0596, + "step": 1842 + }, + { + "epoch": 0.53, + "grad_norm": 0.3657179553981144, + "learning_rate": 4.812658732690463e-06, + "loss": 0.0913, + "step": 1843 + }, + { + "epoch": 0.53, + "grad_norm": 0.6054162389353315, + "learning_rate": 4.808035269103989e-06, + "loss": 0.1201, + "step": 1844 + }, + { + "epoch": 0.53, + "grad_norm": 0.5274034016500073, + "learning_rate": 4.8034119698947244e-06, + "loss": 0.1235, + "step": 1845 + }, + { + "epoch": 0.53, + "grad_norm": 0.7871638385148911, + "learning_rate": 4.798788839021546e-06, + "loss": 0.1209, + "step": 1846 + }, + { + "epoch": 0.53, + "grad_norm": 0.3949687348906425, + "learning_rate": 4.79416588044319e-06, + "loss": 0.0842, + "step": 1847 + }, + { + "epoch": 0.53, + "grad_norm": 0.33777294776552913, + "learning_rate": 4.7895430981182415e-06, + "loss": 0.0735, + "step": 1848 + }, + { + "epoch": 0.53, + "grad_norm": 0.3567016111842002, + "learning_rate": 4.784920496005137e-06, + "loss": 0.0698, + "step": 1849 + }, + { + "epoch": 0.53, + "grad_norm": 0.5219398402468832, + "learning_rate": 4.780298078062157e-06, + "loss": 0.0613, + "step": 1850 + }, + { + "epoch": 0.53, + "grad_norm": 0.4723601537455463, + "learning_rate": 4.775675848247427e-06, + "loss": 0.109, + "step": 1851 + }, + { + "epoch": 0.53, + "grad_norm": 0.4621040919355528, + "learning_rate": 4.771053810518908e-06, + "loss": 0.1276, + "step": 1852 + }, + { + "epoch": 0.53, + "grad_norm": 0.5518875067551766, + "learning_rate": 4.766431968834399e-06, + "loss": 0.1421, + "step": 1853 + }, + { + "epoch": 0.53, + "grad_norm": 0.45749371003943096, + "learning_rate": 4.76181032715153e-06, + "loss": 0.0731, + "step": 1854 + }, + { + "epoch": 0.53, + "grad_norm": 0.38560233895242374, + "learning_rate": 4.757188889427761e-06, + "loss": 0.0788, + "step": 1855 + }, + { + "epoch": 0.53, + "grad_norm": 0.3570216875704459, + "learning_rate": 4.7525676596203726e-06, + "loss": 0.0885, + "step": 1856 + }, + { + "epoch": 0.53, + "grad_norm": 0.5116624767912511, + "learning_rate": 4.747946641686475e-06, + "loss": 0.1175, + "step": 1857 + }, + { + "epoch": 0.53, + "grad_norm": 0.4595799526423975, + "learning_rate": 4.743325839582995e-06, + "loss": 0.0946, + "step": 1858 + }, + { + "epoch": 0.53, + "grad_norm": 0.4031297689490686, + "learning_rate": 4.738705257266667e-06, + "loss": 0.078, + "step": 1859 + }, + { + "epoch": 0.53, + "grad_norm": 0.412017964505888, + "learning_rate": 4.734084898694049e-06, + "loss": 0.1005, + "step": 1860 + }, + { + "epoch": 0.53, + "grad_norm": 0.2721539235413136, + "learning_rate": 4.729464767821496e-06, + "loss": 0.0865, + "step": 1861 + }, + { + "epoch": 0.53, + "grad_norm": 0.6004550191016197, + "learning_rate": 4.724844868605176e-06, + "loss": 0.1226, + "step": 1862 + }, + { + "epoch": 0.53, + "grad_norm": 0.5098773103913625, + "learning_rate": 4.720225205001056e-06, + "loss": 0.0415, + "step": 1863 + }, + { + "epoch": 0.53, + "grad_norm": 0.40815745605351283, + "learning_rate": 4.7156057809649e-06, + "loss": 0.0859, + "step": 1864 + }, + { + "epoch": 0.53, + "grad_norm": 0.3686636187580829, + "learning_rate": 4.710986600452269e-06, + "loss": 0.0814, + "step": 1865 + }, + { + "epoch": 0.53, + "grad_norm": 0.5084807939262025, + "learning_rate": 4.706367667418514e-06, + "loss": 0.109, + "step": 1866 + }, + { + "epoch": 0.53, + "grad_norm": 0.23827947558998486, + "learning_rate": 4.701748985818776e-06, + "loss": 0.064, + "step": 1867 + }, + { + "epoch": 0.53, + "grad_norm": 0.454437908383136, + "learning_rate": 4.697130559607978e-06, + "loss": 0.1086, + "step": 1868 + }, + { + "epoch": 0.53, + "grad_norm": 0.2937112277707689, + "learning_rate": 4.6925123927408265e-06, + "loss": 0.0724, + "step": 1869 + }, + { + "epoch": 0.53, + "grad_norm": 0.5127832515625701, + "learning_rate": 4.687894489171804e-06, + "loss": 0.0572, + "step": 1870 + }, + { + "epoch": 0.53, + "grad_norm": 0.7056047133510236, + "learning_rate": 4.68327685285517e-06, + "loss": 0.1115, + "step": 1871 + }, + { + "epoch": 0.53, + "grad_norm": 0.39573545656865444, + "learning_rate": 4.678659487744953e-06, + "loss": 0.1019, + "step": 1872 + }, + { + "epoch": 0.54, + "grad_norm": 0.4083774884476607, + "learning_rate": 4.67404239779495e-06, + "loss": 0.1011, + "step": 1873 + }, + { + "epoch": 0.54, + "grad_norm": 0.49253317487897935, + "learning_rate": 4.669425586958723e-06, + "loss": 0.1171, + "step": 1874 + }, + { + "epoch": 0.54, + "grad_norm": 0.3729798762255827, + "learning_rate": 4.664809059189594e-06, + "loss": 0.0734, + "step": 1875 + }, + { + "epoch": 0.54, + "grad_norm": 0.3317724517590702, + "learning_rate": 4.660192818440642e-06, + "loss": 0.0642, + "step": 1876 + }, + { + "epoch": 0.54, + "grad_norm": 0.6723221845107754, + "learning_rate": 4.6555768686647035e-06, + "loss": 0.0904, + "step": 1877 + }, + { + "epoch": 0.54, + "grad_norm": 0.47709826160936375, + "learning_rate": 4.650961213814362e-06, + "loss": 0.1241, + "step": 1878 + }, + { + "epoch": 0.54, + "grad_norm": 0.4713509969286293, + "learning_rate": 4.6463458578419504e-06, + "loss": 0.103, + "step": 1879 + }, + { + "epoch": 0.54, + "grad_norm": 0.39502541556809767, + "learning_rate": 4.641730804699547e-06, + "loss": 0.0943, + "step": 1880 + }, + { + "epoch": 0.54, + "grad_norm": 0.26264286988087127, + "learning_rate": 4.637116058338966e-06, + "loss": 0.0679, + "step": 1881 + }, + { + "epoch": 0.54, + "grad_norm": 0.332228478121911, + "learning_rate": 4.6325016227117655e-06, + "loss": 0.104, + "step": 1882 + }, + { + "epoch": 0.54, + "grad_norm": 0.3450264960357861, + "learning_rate": 4.627887501769231e-06, + "loss": 0.0698, + "step": 1883 + }, + { + "epoch": 0.54, + "grad_norm": 0.3570095299234063, + "learning_rate": 4.623273699462384e-06, + "loss": 0.0925, + "step": 1884 + }, + { + "epoch": 0.54, + "grad_norm": 1.025992789630151, + "learning_rate": 4.618660219741968e-06, + "loss": 0.1786, + "step": 1885 + }, + { + "epoch": 0.54, + "grad_norm": 0.3313978386930275, + "learning_rate": 4.614047066558457e-06, + "loss": 0.0977, + "step": 1886 + }, + { + "epoch": 0.54, + "grad_norm": 0.35813875580660093, + "learning_rate": 4.609434243862037e-06, + "loss": 0.1059, + "step": 1887 + }, + { + "epoch": 0.54, + "grad_norm": 0.4691471990597548, + "learning_rate": 4.60482175560262e-06, + "loss": 0.1158, + "step": 1888 + }, + { + "epoch": 0.54, + "grad_norm": 0.6836332521112931, + "learning_rate": 4.600209605729823e-06, + "loss": 0.1469, + "step": 1889 + }, + { + "epoch": 0.54, + "grad_norm": 0.2599298811639539, + "learning_rate": 4.59559779819298e-06, + "loss": 0.0797, + "step": 1890 + }, + { + "epoch": 0.54, + "grad_norm": 0.48795608016248054, + "learning_rate": 4.5909863369411275e-06, + "loss": 0.1116, + "step": 1891 + }, + { + "epoch": 0.54, + "grad_norm": 0.46588600797664814, + "learning_rate": 4.5863752259230085e-06, + "loss": 0.115, + "step": 1892 + }, + { + "epoch": 0.54, + "grad_norm": 0.3217361498668818, + "learning_rate": 4.581764469087064e-06, + "loss": 0.1014, + "step": 1893 + }, + { + "epoch": 0.54, + "grad_norm": 0.42944486698052714, + "learning_rate": 4.577154070381432e-06, + "loss": 0.0852, + "step": 1894 + }, + { + "epoch": 0.54, + "grad_norm": 0.4183007935702926, + "learning_rate": 4.572544033753945e-06, + "loss": 0.0915, + "step": 1895 + }, + { + "epoch": 0.54, + "grad_norm": 0.4803824728872886, + "learning_rate": 4.567934363152126e-06, + "loss": 0.081, + "step": 1896 + }, + { + "epoch": 0.54, + "grad_norm": 0.31338340306069945, + "learning_rate": 4.5633250625231806e-06, + "loss": 0.0825, + "step": 1897 + }, + { + "epoch": 0.54, + "grad_norm": 0.3241077163131851, + "learning_rate": 4.558716135814002e-06, + "loss": 0.0945, + "step": 1898 + }, + { + "epoch": 0.54, + "grad_norm": 0.5323237759812478, + "learning_rate": 4.554107586971162e-06, + "loss": 0.1013, + "step": 1899 + }, + { + "epoch": 0.54, + "grad_norm": 0.3229637312120239, + "learning_rate": 4.549499419940906e-06, + "loss": 0.0801, + "step": 1900 + }, + { + "epoch": 0.54, + "grad_norm": 0.40184114865363024, + "learning_rate": 4.544891638669159e-06, + "loss": 0.1009, + "step": 1901 + }, + { + "epoch": 0.54, + "grad_norm": 0.29438681164914965, + "learning_rate": 4.540284247101507e-06, + "loss": 0.0422, + "step": 1902 + }, + { + "epoch": 0.54, + "grad_norm": 0.38210881048051887, + "learning_rate": 4.53567724918321e-06, + "loss": 0.0932, + "step": 1903 + }, + { + "epoch": 0.54, + "grad_norm": 0.4300572949739541, + "learning_rate": 4.531070648859186e-06, + "loss": 0.0986, + "step": 1904 + }, + { + "epoch": 0.54, + "grad_norm": 0.27597139234993856, + "learning_rate": 4.526464450074016e-06, + "loss": 0.0832, + "step": 1905 + }, + { + "epoch": 0.54, + "grad_norm": 0.789166463160726, + "learning_rate": 4.521858656771933e-06, + "loss": 0.1544, + "step": 1906 + }, + { + "epoch": 0.54, + "grad_norm": 0.462613706943806, + "learning_rate": 4.517253272896827e-06, + "loss": 0.1217, + "step": 1907 + }, + { + "epoch": 0.55, + "grad_norm": 0.34773668379669725, + "learning_rate": 4.5126483023922354e-06, + "loss": 0.0915, + "step": 1908 + }, + { + "epoch": 0.55, + "grad_norm": 0.42662219449198857, + "learning_rate": 4.508043749201343e-06, + "loss": 0.0931, + "step": 1909 + }, + { + "epoch": 0.55, + "grad_norm": 0.4031899824903054, + "learning_rate": 4.503439617266974e-06, + "loss": 0.0724, + "step": 1910 + }, + { + "epoch": 0.55, + "grad_norm": 0.5801786850476682, + "learning_rate": 4.498835910531595e-06, + "loss": 0.1087, + "step": 1911 + }, + { + "epoch": 0.55, + "grad_norm": 0.2857207517449495, + "learning_rate": 4.494232632937308e-06, + "loss": 0.0618, + "step": 1912 + }, + { + "epoch": 0.55, + "grad_norm": 0.25686815839937577, + "learning_rate": 4.489629788425847e-06, + "loss": 0.0689, + "step": 1913 + }, + { + "epoch": 0.55, + "grad_norm": 0.45778970543379055, + "learning_rate": 4.485027380938574e-06, + "loss": 0.0666, + "step": 1914 + }, + { + "epoch": 0.55, + "grad_norm": 0.31223254043849147, + "learning_rate": 4.480425414416479e-06, + "loss": 0.0681, + "step": 1915 + }, + { + "epoch": 0.55, + "grad_norm": 0.2713060954370277, + "learning_rate": 4.475823892800174e-06, + "loss": 0.0851, + "step": 1916 + }, + { + "epoch": 0.55, + "grad_norm": 0.2373650242836078, + "learning_rate": 4.471222820029888e-06, + "loss": 0.0594, + "step": 1917 + }, + { + "epoch": 0.55, + "grad_norm": 0.29806082273090767, + "learning_rate": 4.4666222000454685e-06, + "loss": 0.0695, + "step": 1918 + }, + { + "epoch": 0.55, + "grad_norm": 0.3278947720919759, + "learning_rate": 4.462022036786372e-06, + "loss": 0.0612, + "step": 1919 + }, + { + "epoch": 0.55, + "grad_norm": 0.39896879891390513, + "learning_rate": 4.45742233419167e-06, + "loss": 0.0901, + "step": 1920 + }, + { + "epoch": 0.55, + "grad_norm": 0.42255087490743387, + "learning_rate": 4.452823096200029e-06, + "loss": 0.059, + "step": 1921 + }, + { + "epoch": 0.55, + "grad_norm": 0.4469970144861933, + "learning_rate": 4.4482243267497304e-06, + "loss": 0.1199, + "step": 1922 + }, + { + "epoch": 0.55, + "grad_norm": 0.49217727261777366, + "learning_rate": 4.443626029778643e-06, + "loss": 0.1083, + "step": 1923 + }, + { + "epoch": 0.55, + "grad_norm": 0.28235135435374265, + "learning_rate": 4.43902820922424e-06, + "loss": 0.0699, + "step": 1924 + }, + { + "epoch": 0.55, + "grad_norm": 0.4553360312689322, + "learning_rate": 4.434430869023579e-06, + "loss": 0.0723, + "step": 1925 + }, + { + "epoch": 0.55, + "grad_norm": 0.3516303902725191, + "learning_rate": 4.4298340131133135e-06, + "loss": 0.0555, + "step": 1926 + }, + { + "epoch": 0.55, + "grad_norm": 0.4504339813681332, + "learning_rate": 4.425237645429675e-06, + "loss": 0.1104, + "step": 1927 + }, + { + "epoch": 0.55, + "grad_norm": 0.33091913082621083, + "learning_rate": 4.420641769908485e-06, + "loss": 0.0665, + "step": 1928 + }, + { + "epoch": 0.55, + "grad_norm": 0.26775536702424624, + "learning_rate": 4.416046390485136e-06, + "loss": 0.0492, + "step": 1929 + }, + { + "epoch": 0.55, + "grad_norm": 0.44832676079468037, + "learning_rate": 4.4114515110946e-06, + "loss": 0.1076, + "step": 1930 + }, + { + "epoch": 0.55, + "grad_norm": 0.4178039232742665, + "learning_rate": 4.40685713567142e-06, + "loss": 0.0842, + "step": 1931 + }, + { + "epoch": 0.55, + "grad_norm": 0.2860223061574364, + "learning_rate": 4.402263268149707e-06, + "loss": 0.0599, + "step": 1932 + }, + { + "epoch": 0.55, + "grad_norm": 0.4050827275327297, + "learning_rate": 4.397669912463137e-06, + "loss": 0.1322, + "step": 1933 + }, + { + "epoch": 0.55, + "grad_norm": 0.5299214420573273, + "learning_rate": 4.393077072544948e-06, + "loss": 0.1088, + "step": 1934 + }, + { + "epoch": 0.55, + "grad_norm": 0.2887170502060338, + "learning_rate": 4.3884847523279374e-06, + "loss": 0.0837, + "step": 1935 + }, + { + "epoch": 0.55, + "grad_norm": 0.6404760499001632, + "learning_rate": 4.383892955744456e-06, + "loss": 0.1495, + "step": 1936 + }, + { + "epoch": 0.55, + "grad_norm": 0.3196506146253498, + "learning_rate": 4.379301686726407e-06, + "loss": 0.1011, + "step": 1937 + }, + { + "epoch": 0.55, + "grad_norm": 0.3140363793235934, + "learning_rate": 4.374710949205241e-06, + "loss": 0.0865, + "step": 1938 + }, + { + "epoch": 0.55, + "grad_norm": 0.702703285669765, + "learning_rate": 4.370120747111956e-06, + "loss": 0.0791, + "step": 1939 + }, + { + "epoch": 0.55, + "grad_norm": 0.39221731152640016, + "learning_rate": 4.365531084377087e-06, + "loss": 0.0864, + "step": 1940 + }, + { + "epoch": 0.55, + "grad_norm": 0.4947895718262526, + "learning_rate": 4.360941964930712e-06, + "loss": 0.1428, + "step": 1941 + }, + { + "epoch": 0.55, + "grad_norm": 0.7603743640074522, + "learning_rate": 4.35635339270244e-06, + "loss": 0.1211, + "step": 1942 + }, + { + "epoch": 0.56, + "grad_norm": 0.3384779650531658, + "learning_rate": 4.351765371621415e-06, + "loss": 0.0828, + "step": 1943 + }, + { + "epoch": 0.56, + "grad_norm": 0.26062740442709526, + "learning_rate": 4.347177905616306e-06, + "loss": 0.0456, + "step": 1944 + }, + { + "epoch": 0.56, + "grad_norm": 0.33304292863241364, + "learning_rate": 4.342590998615308e-06, + "loss": 0.1021, + "step": 1945 + }, + { + "epoch": 0.56, + "grad_norm": 0.5634434685367286, + "learning_rate": 4.338004654546136e-06, + "loss": 0.089, + "step": 1946 + }, + { + "epoch": 0.56, + "grad_norm": 0.2939575586372781, + "learning_rate": 4.333418877336024e-06, + "loss": 0.0561, + "step": 1947 + }, + { + "epoch": 0.56, + "grad_norm": 0.5901392544645243, + "learning_rate": 4.3288336709117246e-06, + "loss": 0.1279, + "step": 1948 + }, + { + "epoch": 0.56, + "grad_norm": 0.4660849817812266, + "learning_rate": 4.324249039199492e-06, + "loss": 0.0747, + "step": 1949 + }, + { + "epoch": 0.56, + "grad_norm": 0.3023997208506098, + "learning_rate": 4.319664986125099e-06, + "loss": 0.0599, + "step": 1950 + }, + { + "epoch": 0.56, + "grad_norm": 0.4394114231502662, + "learning_rate": 4.315081515613815e-06, + "loss": 0.0753, + "step": 1951 + }, + { + "epoch": 0.56, + "grad_norm": 0.30320345750113625, + "learning_rate": 4.3104986315904144e-06, + "loss": 0.0692, + "step": 1952 + }, + { + "epoch": 0.56, + "grad_norm": 0.4215626390647296, + "learning_rate": 4.3059163379791676e-06, + "loss": 0.0632, + "step": 1953 + }, + { + "epoch": 0.56, + "grad_norm": 0.3450477954279993, + "learning_rate": 4.301334638703843e-06, + "loss": 0.0853, + "step": 1954 + }, + { + "epoch": 0.56, + "grad_norm": 0.3439957224189273, + "learning_rate": 4.296753537687694e-06, + "loss": 0.0473, + "step": 1955 + }, + { + "epoch": 0.56, + "grad_norm": 0.5498819847692374, + "learning_rate": 4.292173038853468e-06, + "loss": 0.0957, + "step": 1956 + }, + { + "epoch": 0.56, + "grad_norm": 0.43969107091704507, + "learning_rate": 4.287593146123391e-06, + "loss": 0.1049, + "step": 1957 + }, + { + "epoch": 0.56, + "grad_norm": 0.42322994487068916, + "learning_rate": 4.283013863419176e-06, + "loss": 0.1158, + "step": 1958 + }, + { + "epoch": 0.56, + "grad_norm": 0.7882983420945847, + "learning_rate": 4.278435194662007e-06, + "loss": 0.1018, + "step": 1959 + }, + { + "epoch": 0.56, + "grad_norm": 0.2879325362757321, + "learning_rate": 4.27385714377255e-06, + "loss": 0.0454, + "step": 1960 + }, + { + "epoch": 0.56, + "grad_norm": 0.4926947096035045, + "learning_rate": 4.269279714670934e-06, + "loss": 0.097, + "step": 1961 + }, + { + "epoch": 0.56, + "grad_norm": 0.40498556832141896, + "learning_rate": 4.2647029112767605e-06, + "loss": 0.0935, + "step": 1962 + }, + { + "epoch": 0.56, + "grad_norm": 0.44690871261554077, + "learning_rate": 4.260126737509096e-06, + "loss": 0.0926, + "step": 1963 + }, + { + "epoch": 0.56, + "grad_norm": 0.4218838023368967, + "learning_rate": 4.2555511972864634e-06, + "loss": 0.0915, + "step": 1964 + }, + { + "epoch": 0.56, + "grad_norm": 0.49503167389875924, + "learning_rate": 4.250976294526847e-06, + "loss": 0.1332, + "step": 1965 + }, + { + "epoch": 0.56, + "grad_norm": 0.42508535712984674, + "learning_rate": 4.246402033147684e-06, + "loss": 0.0763, + "step": 1966 + }, + { + "epoch": 0.56, + "grad_norm": 0.42110059103624764, + "learning_rate": 4.24182841706586e-06, + "loss": 0.0633, + "step": 1967 + }, + { + "epoch": 0.56, + "grad_norm": 0.5772184031699668, + "learning_rate": 4.237255450197714e-06, + "loss": 0.1092, + "step": 1968 + }, + { + "epoch": 0.56, + "grad_norm": 0.2812932305462848, + "learning_rate": 4.232683136459021e-06, + "loss": 0.0877, + "step": 1969 + }, + { + "epoch": 0.56, + "grad_norm": 0.9032745882328743, + "learning_rate": 4.228111479765004e-06, + "loss": 0.1463, + "step": 1970 + }, + { + "epoch": 0.56, + "grad_norm": 0.42769977338437704, + "learning_rate": 4.22354048403032e-06, + "loss": 0.1148, + "step": 1971 + }, + { + "epoch": 0.56, + "grad_norm": 0.445114007386137, + "learning_rate": 4.2189701531690595e-06, + "loss": 0.0952, + "step": 1972 + }, + { + "epoch": 0.56, + "grad_norm": 0.5369118283803712, + "learning_rate": 4.214400491094746e-06, + "loss": 0.0951, + "step": 1973 + }, + { + "epoch": 0.56, + "grad_norm": 0.5754994017134322, + "learning_rate": 4.209831501720328e-06, + "loss": 0.1224, + "step": 1974 + }, + { + "epoch": 0.56, + "grad_norm": 0.4266059198848772, + "learning_rate": 4.20526318895818e-06, + "loss": 0.0713, + "step": 1975 + }, + { + "epoch": 0.56, + "grad_norm": 0.2870313247515032, + "learning_rate": 4.200695556720095e-06, + "loss": 0.0745, + "step": 1976 + }, + { + "epoch": 0.56, + "grad_norm": 0.4307908012439384, + "learning_rate": 4.196128608917284e-06, + "loss": 0.1243, + "step": 1977 + }, + { + "epoch": 0.57, + "grad_norm": 0.39299932618763994, + "learning_rate": 4.191562349460375e-06, + "loss": 0.0588, + "step": 1978 + }, + { + "epoch": 0.57, + "grad_norm": 0.5861435780886223, + "learning_rate": 4.1869967822594e-06, + "loss": 0.0869, + "step": 1979 + }, + { + "epoch": 0.57, + "grad_norm": 0.30422620184804433, + "learning_rate": 4.182431911223806e-06, + "loss": 0.0542, + "step": 1980 + }, + { + "epoch": 0.57, + "grad_norm": 0.311026370712942, + "learning_rate": 4.177867740262437e-06, + "loss": 0.0431, + "step": 1981 + }, + { + "epoch": 0.57, + "grad_norm": 0.3465167976558529, + "learning_rate": 4.173304273283541e-06, + "loss": 0.1026, + "step": 1982 + }, + { + "epoch": 0.57, + "grad_norm": 0.3469748721584691, + "learning_rate": 4.168741514194764e-06, + "loss": 0.0888, + "step": 1983 + }, + { + "epoch": 0.57, + "grad_norm": 0.5800733373315916, + "learning_rate": 4.1641794669031435e-06, + "loss": 0.1066, + "step": 1984 + }, + { + "epoch": 0.57, + "grad_norm": 0.3243784108523238, + "learning_rate": 4.159618135315109e-06, + "loss": 0.0686, + "step": 1985 + }, + { + "epoch": 0.57, + "grad_norm": 0.7434421684735129, + "learning_rate": 4.155057523336477e-06, + "loss": 0.1197, + "step": 1986 + }, + { + "epoch": 0.57, + "grad_norm": 0.7143013220770097, + "learning_rate": 4.1504976348724465e-06, + "loss": 0.1333, + "step": 1987 + }, + { + "epoch": 0.57, + "grad_norm": 0.458633239134645, + "learning_rate": 4.145938473827598e-06, + "loss": 0.0704, + "step": 1988 + }, + { + "epoch": 0.57, + "grad_norm": 0.32373332074690114, + "learning_rate": 4.141380044105891e-06, + "loss": 0.081, + "step": 1989 + }, + { + "epoch": 0.57, + "grad_norm": 0.3113298579819848, + "learning_rate": 4.1368223496106544e-06, + "loss": 0.0648, + "step": 1990 + }, + { + "epoch": 0.57, + "grad_norm": 0.46622683878776333, + "learning_rate": 4.1322653942445925e-06, + "loss": 0.1127, + "step": 1991 + }, + { + "epoch": 0.57, + "grad_norm": 0.5308035018252141, + "learning_rate": 4.127709181909771e-06, + "loss": 0.1024, + "step": 1992 + }, + { + "epoch": 0.57, + "grad_norm": 0.5468851125410406, + "learning_rate": 4.123153716507625e-06, + "loss": 0.1032, + "step": 1993 + }, + { + "epoch": 0.57, + "grad_norm": 0.33844503605315174, + "learning_rate": 4.118599001938947e-06, + "loss": 0.0941, + "step": 1994 + }, + { + "epoch": 0.57, + "grad_norm": 0.3068677222704898, + "learning_rate": 4.1140450421038865e-06, + "loss": 0.0779, + "step": 1995 + }, + { + "epoch": 0.57, + "grad_norm": 0.48738135575860847, + "learning_rate": 4.109491840901948e-06, + "loss": 0.1086, + "step": 1996 + }, + { + "epoch": 0.57, + "grad_norm": 0.28863622022466223, + "learning_rate": 4.104939402231986e-06, + "loss": 0.0662, + "step": 1997 + }, + { + "epoch": 0.57, + "grad_norm": 0.368527455061649, + "learning_rate": 4.100387729992201e-06, + "loss": 0.0493, + "step": 1998 + }, + { + "epoch": 0.57, + "grad_norm": 0.37516101208183883, + "learning_rate": 4.095836828080138e-06, + "loss": 0.0636, + "step": 1999 + }, + { + "epoch": 0.57, + "grad_norm": 0.2591543806192558, + "learning_rate": 4.091286700392683e-06, + "loss": 0.0429, + "step": 2000 + }, + { + "epoch": 0.57, + "grad_norm": 0.4028847043574642, + "learning_rate": 4.086737350826058e-06, + "loss": 0.0783, + "step": 2001 + }, + { + "epoch": 0.57, + "grad_norm": 0.4184077498856964, + "learning_rate": 4.08218878327582e-06, + "loss": 0.0786, + "step": 2002 + }, + { + "epoch": 0.57, + "grad_norm": 0.32607010234508654, + "learning_rate": 4.077641001636854e-06, + "loss": 0.0705, + "step": 2003 + }, + { + "epoch": 0.57, + "grad_norm": 0.29771386911145054, + "learning_rate": 4.073094009803374e-06, + "loss": 0.0918, + "step": 2004 + }, + { + "epoch": 0.57, + "grad_norm": 0.4315479249247474, + "learning_rate": 4.068547811668918e-06, + "loss": 0.0978, + "step": 2005 + }, + { + "epoch": 0.57, + "grad_norm": 0.5330447499926427, + "learning_rate": 4.064002411126343e-06, + "loss": 0.1061, + "step": 2006 + }, + { + "epoch": 0.57, + "grad_norm": 1.258613621124329, + "learning_rate": 4.059457812067823e-06, + "loss": 0.0641, + "step": 2007 + }, + { + "epoch": 0.57, + "grad_norm": 1.3220392034549073, + "learning_rate": 4.054914018384849e-06, + "loss": 0.1006, + "step": 2008 + }, + { + "epoch": 0.57, + "grad_norm": 0.36690079395341285, + "learning_rate": 4.050371033968216e-06, + "loss": 0.0729, + "step": 2009 + }, + { + "epoch": 0.57, + "grad_norm": 0.4473416973250645, + "learning_rate": 4.0458288627080325e-06, + "loss": 0.0851, + "step": 2010 + }, + { + "epoch": 0.57, + "grad_norm": 0.3053859056768928, + "learning_rate": 4.041287508493706e-06, + "loss": 0.0828, + "step": 2011 + }, + { + "epoch": 0.57, + "grad_norm": 0.40630274324795607, + "learning_rate": 4.0367469752139475e-06, + "loss": 0.0763, + "step": 2012 + }, + { + "epoch": 0.58, + "grad_norm": 0.3701731626037116, + "learning_rate": 4.032207266756764e-06, + "loss": 0.0885, + "step": 2013 + }, + { + "epoch": 0.58, + "grad_norm": 0.4265760298185957, + "learning_rate": 4.027668387009455e-06, + "loss": 0.0938, + "step": 2014 + }, + { + "epoch": 0.58, + "grad_norm": 0.28060969979017275, + "learning_rate": 4.0231303398586124e-06, + "loss": 0.0652, + "step": 2015 + }, + { + "epoch": 0.58, + "grad_norm": 0.29222129070474345, + "learning_rate": 4.018593129190113e-06, + "loss": 0.0643, + "step": 2016 + }, + { + "epoch": 0.58, + "grad_norm": 0.27138955197208636, + "learning_rate": 4.014056758889121e-06, + "loss": 0.0716, + "step": 2017 + }, + { + "epoch": 0.58, + "grad_norm": 0.6951548257280417, + "learning_rate": 4.009521232840075e-06, + "loss": 0.121, + "step": 2018 + }, + { + "epoch": 0.58, + "grad_norm": 0.7563475757578632, + "learning_rate": 4.004986554926697e-06, + "loss": 0.1071, + "step": 2019 + }, + { + "epoch": 0.58, + "grad_norm": 0.5375498548744064, + "learning_rate": 4.000452729031978e-06, + "loss": 0.0747, + "step": 2020 + }, + { + "epoch": 0.58, + "grad_norm": 0.22798813992235856, + "learning_rate": 3.995919759038184e-06, + "loss": 0.0466, + "step": 2021 + }, + { + "epoch": 0.58, + "grad_norm": 0.3595766444515047, + "learning_rate": 3.991387648826842e-06, + "loss": 0.0774, + "step": 2022 + }, + { + "epoch": 0.58, + "grad_norm": 0.2998093674841255, + "learning_rate": 3.98685640227875e-06, + "loss": 0.0473, + "step": 2023 + }, + { + "epoch": 0.58, + "grad_norm": 0.3985626427420547, + "learning_rate": 3.982326023273959e-06, + "loss": 0.1184, + "step": 2024 + }, + { + "epoch": 0.58, + "grad_norm": 0.6113417200642814, + "learning_rate": 3.977796515691785e-06, + "loss": 0.1277, + "step": 2025 + }, + { + "epoch": 0.58, + "grad_norm": 0.5718721177873624, + "learning_rate": 3.97326788341079e-06, + "loss": 0.1053, + "step": 2026 + }, + { + "epoch": 0.58, + "grad_norm": 0.5847265187586651, + "learning_rate": 3.968740130308792e-06, + "loss": 0.1068, + "step": 2027 + }, + { + "epoch": 0.58, + "grad_norm": 0.6206156584815474, + "learning_rate": 3.964213260262853e-06, + "loss": 0.1301, + "step": 2028 + }, + { + "epoch": 0.58, + "grad_norm": 0.501407405321876, + "learning_rate": 3.959687277149283e-06, + "loss": 0.1127, + "step": 2029 + }, + { + "epoch": 0.58, + "grad_norm": 0.4804538068281452, + "learning_rate": 3.955162184843625e-06, + "loss": 0.1281, + "step": 2030 + }, + { + "epoch": 0.58, + "grad_norm": 0.5231994878403236, + "learning_rate": 3.950637987220669e-06, + "loss": 0.1136, + "step": 2031 + }, + { + "epoch": 0.58, + "grad_norm": 0.37016488981910206, + "learning_rate": 3.94611468815443e-06, + "loss": 0.0686, + "step": 2032 + }, + { + "epoch": 0.58, + "grad_norm": 0.26871161684579375, + "learning_rate": 3.9415922915181595e-06, + "loss": 0.0305, + "step": 2033 + }, + { + "epoch": 0.58, + "grad_norm": 0.37256629953416204, + "learning_rate": 3.937070801184333e-06, + "loss": 0.0885, + "step": 2034 + }, + { + "epoch": 0.58, + "grad_norm": 0.3738101422663222, + "learning_rate": 3.932550221024651e-06, + "loss": 0.1334, + "step": 2035 + }, + { + "epoch": 0.58, + "grad_norm": 0.6029645029931219, + "learning_rate": 3.928030554910037e-06, + "loss": 0.136, + "step": 2036 + }, + { + "epoch": 0.58, + "grad_norm": 0.3554442316959907, + "learning_rate": 3.9235118067106255e-06, + "loss": 0.0938, + "step": 2037 + }, + { + "epoch": 0.58, + "grad_norm": 0.3235074713065256, + "learning_rate": 3.918993980295774e-06, + "loss": 0.0579, + "step": 2038 + }, + { + "epoch": 0.58, + "grad_norm": 0.4983600522619609, + "learning_rate": 3.914477079534041e-06, + "loss": 0.1103, + "step": 2039 + }, + { + "epoch": 0.58, + "grad_norm": 0.39863078748668757, + "learning_rate": 3.9099611082932e-06, + "loss": 0.0977, + "step": 2040 + }, + { + "epoch": 0.58, + "grad_norm": 0.28027634060902773, + "learning_rate": 3.9054460704402246e-06, + "loss": 0.0636, + "step": 2041 + }, + { + "epoch": 0.58, + "grad_norm": 0.3253098584369501, + "learning_rate": 3.90093196984129e-06, + "loss": 0.0623, + "step": 2042 + }, + { + "epoch": 0.58, + "grad_norm": 0.6229740514530065, + "learning_rate": 3.8964188103617685e-06, + "loss": 0.0998, + "step": 2043 + }, + { + "epoch": 0.58, + "grad_norm": 0.48643580185941077, + "learning_rate": 3.89190659586623e-06, + "loss": 0.112, + "step": 2044 + }, + { + "epoch": 0.58, + "grad_norm": 0.28979063553797807, + "learning_rate": 3.887395330218429e-06, + "loss": 0.0687, + "step": 2045 + }, + { + "epoch": 0.58, + "grad_norm": 0.535227874364318, + "learning_rate": 3.882885017281312e-06, + "loss": 0.1024, + "step": 2046 + }, + { + "epoch": 0.58, + "grad_norm": 0.5717070347300525, + "learning_rate": 3.87837566091701e-06, + "loss": 0.1171, + "step": 2047 + }, + { + "epoch": 0.59, + "grad_norm": 0.31107606437867763, + "learning_rate": 3.87386726498683e-06, + "loss": 0.0771, + "step": 2048 + }, + { + "epoch": 0.59, + "grad_norm": 0.23525510475650052, + "learning_rate": 3.869359833351263e-06, + "loss": 0.0406, + "step": 2049 + }, + { + "epoch": 0.59, + "grad_norm": 0.5419064036518809, + "learning_rate": 3.8648533698699695e-06, + "loss": 0.1364, + "step": 2050 + }, + { + "epoch": 0.59, + "grad_norm": 0.4316380279177835, + "learning_rate": 3.8603478784017845e-06, + "loss": 0.0964, + "step": 2051 + }, + { + "epoch": 0.59, + "grad_norm": 0.41189041144702077, + "learning_rate": 3.855843362804707e-06, + "loss": 0.0968, + "step": 2052 + }, + { + "epoch": 0.59, + "grad_norm": 0.5044277218391547, + "learning_rate": 3.851339826935904e-06, + "loss": 0.1223, + "step": 2053 + }, + { + "epoch": 0.59, + "grad_norm": 0.47221302636165313, + "learning_rate": 3.8468372746517e-06, + "loss": 0.0965, + "step": 2054 + }, + { + "epoch": 0.59, + "grad_norm": 0.44530159406031433, + "learning_rate": 3.842335709807582e-06, + "loss": 0.0863, + "step": 2055 + }, + { + "epoch": 0.59, + "grad_norm": 0.586630340755237, + "learning_rate": 3.8378351362581844e-06, + "loss": 0.1075, + "step": 2056 + }, + { + "epoch": 0.59, + "grad_norm": 0.2809788979845124, + "learning_rate": 3.833335557857302e-06, + "loss": 0.0637, + "step": 2057 + }, + { + "epoch": 0.59, + "grad_norm": 0.34647577554375586, + "learning_rate": 3.828836978457868e-06, + "loss": 0.0897, + "step": 2058 + }, + { + "epoch": 0.59, + "grad_norm": 0.3246781426625446, + "learning_rate": 3.824339401911967e-06, + "loss": 0.0655, + "step": 2059 + }, + { + "epoch": 0.59, + "grad_norm": 0.4237243542390314, + "learning_rate": 3.819842832070822e-06, + "loss": 0.0668, + "step": 2060 + }, + { + "epoch": 0.59, + "grad_norm": 0.49527451270191114, + "learning_rate": 3.815347272784795e-06, + "loss": 0.0912, + "step": 2061 + }, + { + "epoch": 0.59, + "grad_norm": 0.2359837032835801, + "learning_rate": 3.810852727903381e-06, + "loss": 0.0651, + "step": 2062 + }, + { + "epoch": 0.59, + "grad_norm": 0.7228845888573183, + "learning_rate": 3.806359201275209e-06, + "loss": 0.1581, + "step": 2063 + }, + { + "epoch": 0.59, + "grad_norm": 0.3270182167549771, + "learning_rate": 3.8018666967480333e-06, + "loss": 0.0881, + "step": 2064 + }, + { + "epoch": 0.59, + "grad_norm": 0.28759702320644553, + "learning_rate": 3.7973752181687336e-06, + "loss": 0.0687, + "step": 2065 + }, + { + "epoch": 0.59, + "grad_norm": 0.29294071260960564, + "learning_rate": 3.7928847693833136e-06, + "loss": 0.032, + "step": 2066 + }, + { + "epoch": 0.59, + "grad_norm": 0.37243145893279406, + "learning_rate": 3.7883953542368917e-06, + "loss": 0.066, + "step": 2067 + }, + { + "epoch": 0.59, + "grad_norm": 0.4596837041749248, + "learning_rate": 3.7839069765737024e-06, + "loss": 0.0874, + "step": 2068 + }, + { + "epoch": 0.59, + "grad_norm": 0.5068728692270521, + "learning_rate": 3.7794196402370962e-06, + "loss": 0.0876, + "step": 2069 + }, + { + "epoch": 0.59, + "grad_norm": 0.4575890766694308, + "learning_rate": 3.774933349069524e-06, + "loss": 0.1353, + "step": 2070 + }, + { + "epoch": 0.59, + "grad_norm": 0.5473491177990236, + "learning_rate": 3.7704481069125486e-06, + "loss": 0.1561, + "step": 2071 + }, + { + "epoch": 0.59, + "grad_norm": 0.5012890458465599, + "learning_rate": 3.7659639176068287e-06, + "loss": 0.1215, + "step": 2072 + }, + { + "epoch": 0.59, + "grad_norm": 0.4131563875581746, + "learning_rate": 3.761480784992127e-06, + "loss": 0.0933, + "step": 2073 + }, + { + "epoch": 0.59, + "grad_norm": 0.36967504989043437, + "learning_rate": 3.756998712907297e-06, + "loss": 0.0918, + "step": 2074 + }, + { + "epoch": 0.59, + "grad_norm": 0.47895162415247106, + "learning_rate": 3.752517705190287e-06, + "loss": 0.1049, + "step": 2075 + }, + { + "epoch": 0.59, + "grad_norm": 0.7174889321607305, + "learning_rate": 3.748037765678132e-06, + "loss": 0.1359, + "step": 2076 + }, + { + "epoch": 0.59, + "grad_norm": 0.2949635943803785, + "learning_rate": 3.743558898206955e-06, + "loss": 0.0727, + "step": 2077 + }, + { + "epoch": 0.59, + "grad_norm": 0.4494486899447512, + "learning_rate": 3.7390811066119552e-06, + "loss": 0.1244, + "step": 2078 + }, + { + "epoch": 0.59, + "grad_norm": 0.3855162317520461, + "learning_rate": 3.734604394727419e-06, + "loss": 0.0733, + "step": 2079 + }, + { + "epoch": 0.59, + "grad_norm": 0.26065067980205636, + "learning_rate": 3.7301287663867002e-06, + "loss": 0.0759, + "step": 2080 + }, + { + "epoch": 0.59, + "grad_norm": 0.4370956810185193, + "learning_rate": 3.7256542254222307e-06, + "loss": 0.0851, + "step": 2081 + }, + { + "epoch": 0.59, + "grad_norm": 0.34142654390983773, + "learning_rate": 3.7211807756655065e-06, + "loss": 0.0413, + "step": 2082 + }, + { + "epoch": 0.6, + "grad_norm": 0.2160190652719215, + "learning_rate": 3.7167084209470938e-06, + "loss": 0.0621, + "step": 2083 + }, + { + "epoch": 0.6, + "grad_norm": 0.7083407811772259, + "learning_rate": 3.7122371650966188e-06, + "loss": 0.1387, + "step": 2084 + }, + { + "epoch": 0.6, + "grad_norm": 0.27300976717333647, + "learning_rate": 3.7077670119427644e-06, + "loss": 0.0587, + "step": 2085 + }, + { + "epoch": 0.6, + "grad_norm": 0.47681884685724285, + "learning_rate": 3.703297965313275e-06, + "loss": 0.092, + "step": 2086 + }, + { + "epoch": 0.6, + "grad_norm": 0.6321938008876815, + "learning_rate": 3.6988300290349414e-06, + "loss": 0.1317, + "step": 2087 + }, + { + "epoch": 0.6, + "grad_norm": 0.2016558933938644, + "learning_rate": 3.694363206933609e-06, + "loss": 0.031, + "step": 2088 + }, + { + "epoch": 0.6, + "grad_norm": 0.3052856384218095, + "learning_rate": 3.6898975028341636e-06, + "loss": 0.0803, + "step": 2089 + }, + { + "epoch": 0.6, + "grad_norm": 0.4115355894563856, + "learning_rate": 3.68543292056054e-06, + "loss": 0.0701, + "step": 2090 + }, + { + "epoch": 0.6, + "grad_norm": 0.4755149189406155, + "learning_rate": 3.680969463935706e-06, + "loss": 0.0824, + "step": 2091 + }, + { + "epoch": 0.6, + "grad_norm": 0.44893729851957614, + "learning_rate": 3.676507136781672e-06, + "loss": 0.1018, + "step": 2092 + }, + { + "epoch": 0.6, + "grad_norm": 0.26040225037583753, + "learning_rate": 3.6720459429194743e-06, + "loss": 0.0463, + "step": 2093 + }, + { + "epoch": 0.6, + "grad_norm": 0.35296792517971115, + "learning_rate": 3.6675858861691848e-06, + "loss": 0.0993, + "step": 2094 + }, + { + "epoch": 0.6, + "grad_norm": 0.3225183136762917, + "learning_rate": 3.6631269703498974e-06, + "loss": 0.0726, + "step": 2095 + }, + { + "epoch": 0.6, + "grad_norm": 0.2979364180447945, + "learning_rate": 3.6586691992797318e-06, + "loss": 0.0702, + "step": 2096 + }, + { + "epoch": 0.6, + "grad_norm": 0.30125386069696103, + "learning_rate": 3.6542125767758263e-06, + "loss": 0.0664, + "step": 2097 + }, + { + "epoch": 0.6, + "grad_norm": 0.21973910774686486, + "learning_rate": 3.649757106654337e-06, + "loss": 0.053, + "step": 2098 + }, + { + "epoch": 0.6, + "grad_norm": 0.36142441994666674, + "learning_rate": 3.6453027927304313e-06, + "loss": 0.062, + "step": 2099 + }, + { + "epoch": 0.6, + "grad_norm": 0.3179639445416395, + "learning_rate": 3.6408496388182857e-06, + "loss": 0.0921, + "step": 2100 + }, + { + "epoch": 0.6, + "grad_norm": 0.37353396499274105, + "learning_rate": 3.636397648731088e-06, + "loss": 0.0799, + "step": 2101 + }, + { + "epoch": 0.6, + "grad_norm": 0.3391512824605172, + "learning_rate": 3.631946826281024e-06, + "loss": 0.0753, + "step": 2102 + }, + { + "epoch": 0.6, + "grad_norm": 0.5814300713636016, + "learning_rate": 3.627497175279285e-06, + "loss": 0.1163, + "step": 2103 + }, + { + "epoch": 0.6, + "grad_norm": 0.45910829179398005, + "learning_rate": 3.6230486995360524e-06, + "loss": 0.0694, + "step": 2104 + }, + { + "epoch": 0.6, + "grad_norm": 0.5575080215268169, + "learning_rate": 3.61860140286051e-06, + "loss": 0.114, + "step": 2105 + }, + { + "epoch": 0.6, + "grad_norm": 0.4390168966721899, + "learning_rate": 3.6141552890608224e-06, + "loss": 0.1202, + "step": 2106 + }, + { + "epoch": 0.6, + "grad_norm": 0.3800681344027182, + "learning_rate": 3.6097103619441505e-06, + "loss": 0.0884, + "step": 2107 + }, + { + "epoch": 0.6, + "grad_norm": 0.2101455047264348, + "learning_rate": 3.6052666253166306e-06, + "loss": 0.0475, + "step": 2108 + }, + { + "epoch": 0.6, + "grad_norm": 0.29183422779381585, + "learning_rate": 3.6008240829833873e-06, + "loss": 0.0611, + "step": 2109 + }, + { + "epoch": 0.6, + "grad_norm": 0.31568111702329904, + "learning_rate": 3.596382738748516e-06, + "loss": 0.072, + "step": 2110 + }, + { + "epoch": 0.6, + "grad_norm": 0.28554512922628006, + "learning_rate": 3.591942596415092e-06, + "loss": 0.0637, + "step": 2111 + }, + { + "epoch": 0.6, + "grad_norm": 0.3857264661032645, + "learning_rate": 3.5875036597851553e-06, + "loss": 0.091, + "step": 2112 + }, + { + "epoch": 0.6, + "grad_norm": 0.7699789192606351, + "learning_rate": 3.5830659326597194e-06, + "loss": 0.0688, + "step": 2113 + }, + { + "epoch": 0.6, + "grad_norm": 0.5046650432948563, + "learning_rate": 3.578629418838757e-06, + "loss": 0.0845, + "step": 2114 + }, + { + "epoch": 0.6, + "grad_norm": 0.3232395398561082, + "learning_rate": 3.574194122121207e-06, + "loss": 0.0571, + "step": 2115 + }, + { + "epoch": 0.6, + "grad_norm": 0.28310953285578133, + "learning_rate": 3.5697600463049626e-06, + "loss": 0.0781, + "step": 2116 + }, + { + "epoch": 0.6, + "grad_norm": 0.3089597690100642, + "learning_rate": 3.5653271951868695e-06, + "loss": 0.0877, + "step": 2117 + }, + { + "epoch": 0.61, + "grad_norm": 0.3348014203597471, + "learning_rate": 3.5608955725627315e-06, + "loss": 0.0544, + "step": 2118 + }, + { + "epoch": 0.61, + "grad_norm": 0.8053852606312272, + "learning_rate": 3.556465182227293e-06, + "loss": 0.1304, + "step": 2119 + }, + { + "epoch": 0.61, + "grad_norm": 0.5324945990152041, + "learning_rate": 3.55203602797425e-06, + "loss": 0.1281, + "step": 2120 + }, + { + "epoch": 0.61, + "grad_norm": 0.20157718142726572, + "learning_rate": 3.5476081135962335e-06, + "loss": 0.0506, + "step": 2121 + }, + { + "epoch": 0.61, + "grad_norm": 0.3980883363467904, + "learning_rate": 3.5431814428848195e-06, + "loss": 0.086, + "step": 2122 + }, + { + "epoch": 0.61, + "grad_norm": 0.6848620674157934, + "learning_rate": 3.538756019630513e-06, + "loss": 0.1021, + "step": 2123 + }, + { + "epoch": 0.61, + "grad_norm": 0.24304849186337293, + "learning_rate": 3.534331847622754e-06, + "loss": 0.0496, + "step": 2124 + }, + { + "epoch": 0.61, + "grad_norm": 0.6450678304980599, + "learning_rate": 3.52990893064991e-06, + "loss": 0.1042, + "step": 2125 + }, + { + "epoch": 0.61, + "grad_norm": 0.6959247328834508, + "learning_rate": 3.525487272499277e-06, + "loss": 0.1424, + "step": 2126 + }, + { + "epoch": 0.61, + "grad_norm": 0.5588994073367515, + "learning_rate": 3.5210668769570665e-06, + "loss": 0.1053, + "step": 2127 + }, + { + "epoch": 0.61, + "grad_norm": 0.48674739074483947, + "learning_rate": 3.516647747808417e-06, + "loss": 0.0928, + "step": 2128 + }, + { + "epoch": 0.61, + "grad_norm": 0.35485867318752906, + "learning_rate": 3.5122298888373742e-06, + "loss": 0.1112, + "step": 2129 + }, + { + "epoch": 0.61, + "grad_norm": 0.6166926786366669, + "learning_rate": 3.5078133038269034e-06, + "loss": 0.1541, + "step": 2130 + }, + { + "epoch": 0.61, + "grad_norm": 0.42556221404167865, + "learning_rate": 3.503397996558874e-06, + "loss": 0.089, + "step": 2131 + }, + { + "epoch": 0.61, + "grad_norm": 0.7448092765410511, + "learning_rate": 3.4989839708140655e-06, + "loss": 0.1801, + "step": 2132 + }, + { + "epoch": 0.61, + "grad_norm": 0.27981953417812216, + "learning_rate": 3.4945712303721558e-06, + "loss": 0.0749, + "step": 2133 + }, + { + "epoch": 0.61, + "grad_norm": 0.20423481686160258, + "learning_rate": 3.490159779011724e-06, + "loss": 0.0564, + "step": 2134 + }, + { + "epoch": 0.61, + "grad_norm": 0.43105399089281066, + "learning_rate": 3.4857496205102475e-06, + "loss": 0.0595, + "step": 2135 + }, + { + "epoch": 0.61, + "grad_norm": 0.4278895694978233, + "learning_rate": 3.481340758644092e-06, + "loss": 0.1133, + "step": 2136 + }, + { + "epoch": 0.61, + "grad_norm": 0.5413177432687966, + "learning_rate": 3.4769331971885188e-06, + "loss": 0.1183, + "step": 2137 + }, + { + "epoch": 0.61, + "grad_norm": 0.295541889286931, + "learning_rate": 3.4725269399176693e-06, + "loss": 0.082, + "step": 2138 + }, + { + "epoch": 0.61, + "grad_norm": 0.3775732730926014, + "learning_rate": 3.468121990604574e-06, + "loss": 0.0612, + "step": 2139 + }, + { + "epoch": 0.61, + "grad_norm": 0.48139120350221437, + "learning_rate": 3.463718353021138e-06, + "loss": 0.0895, + "step": 2140 + }, + { + "epoch": 0.61, + "grad_norm": 0.3874337431938493, + "learning_rate": 3.45931603093815e-06, + "loss": 0.0955, + "step": 2141 + }, + { + "epoch": 0.61, + "grad_norm": 0.43567749982613474, + "learning_rate": 3.4549150281252635e-06, + "loss": 0.0573, + "step": 2142 + }, + { + "epoch": 0.61, + "grad_norm": 0.272848024064493, + "learning_rate": 3.45051534835101e-06, + "loss": 0.0628, + "step": 2143 + }, + { + "epoch": 0.61, + "grad_norm": 0.35791505390401157, + "learning_rate": 3.4461169953827846e-06, + "loss": 0.0724, + "step": 2144 + }, + { + "epoch": 0.61, + "grad_norm": 0.4601821466912252, + "learning_rate": 3.441719972986846e-06, + "loss": 0.1207, + "step": 2145 + }, + { + "epoch": 0.61, + "grad_norm": 0.6526346090023954, + "learning_rate": 3.437324284928314e-06, + "loss": 0.1118, + "step": 2146 + }, + { + "epoch": 0.61, + "grad_norm": 0.6249212896997014, + "learning_rate": 3.4329299349711687e-06, + "loss": 0.1339, + "step": 2147 + }, + { + "epoch": 0.61, + "grad_norm": 0.47480506648556675, + "learning_rate": 3.4285369268782383e-06, + "loss": 0.1276, + "step": 2148 + }, + { + "epoch": 0.61, + "grad_norm": 0.6535323858490625, + "learning_rate": 3.4241452644112085e-06, + "loss": 0.1101, + "step": 2149 + }, + { + "epoch": 0.61, + "grad_norm": 0.4758298024601593, + "learning_rate": 3.4197549513306076e-06, + "loss": 0.1161, + "step": 2150 + }, + { + "epoch": 0.61, + "grad_norm": 0.2615492377403989, + "learning_rate": 3.4153659913958116e-06, + "loss": 0.0548, + "step": 2151 + }, + { + "epoch": 0.61, + "grad_norm": 0.31850448121328606, + "learning_rate": 3.4109783883650373e-06, + "loss": 0.069, + "step": 2152 + }, + { + "epoch": 0.62, + "grad_norm": 0.4594313366056636, + "learning_rate": 3.4065921459953365e-06, + "loss": 0.0795, + "step": 2153 + }, + { + "epoch": 0.62, + "grad_norm": 0.4106609140190271, + "learning_rate": 3.4022072680426027e-06, + "loss": 0.0744, + "step": 2154 + }, + { + "epoch": 0.62, + "grad_norm": 0.4732180186440131, + "learning_rate": 3.3978237582615535e-06, + "loss": 0.0687, + "step": 2155 + }, + { + "epoch": 0.62, + "grad_norm": 0.5910776142453107, + "learning_rate": 3.3934416204057396e-06, + "loss": 0.1225, + "step": 2156 + }, + { + "epoch": 0.62, + "grad_norm": 0.4936430605684156, + "learning_rate": 3.3890608582275355e-06, + "loss": 0.0754, + "step": 2157 + }, + { + "epoch": 0.62, + "grad_norm": 0.583288419944622, + "learning_rate": 3.384681475478139e-06, + "loss": 0.1412, + "step": 2158 + }, + { + "epoch": 0.62, + "grad_norm": 0.5287228494839469, + "learning_rate": 3.3803034759075624e-06, + "loss": 0.104, + "step": 2159 + }, + { + "epoch": 0.62, + "grad_norm": 0.3975751459912013, + "learning_rate": 3.37592686326464e-06, + "loss": 0.0774, + "step": 2160 + }, + { + "epoch": 0.62, + "grad_norm": 0.43572649667643254, + "learning_rate": 3.371551641297014e-06, + "loss": 0.0825, + "step": 2161 + }, + { + "epoch": 0.62, + "grad_norm": 0.3196413576474667, + "learning_rate": 3.367177813751137e-06, + "loss": 0.0629, + "step": 2162 + }, + { + "epoch": 0.62, + "grad_norm": 0.21128788446811378, + "learning_rate": 3.3628053843722674e-06, + "loss": 0.0559, + "step": 2163 + }, + { + "epoch": 0.62, + "grad_norm": 0.27702982294106715, + "learning_rate": 3.3584343569044673e-06, + "loss": 0.0851, + "step": 2164 + }, + { + "epoch": 0.62, + "grad_norm": 0.3875813962199568, + "learning_rate": 3.3540647350905985e-06, + "loss": 0.0622, + "step": 2165 + }, + { + "epoch": 0.62, + "grad_norm": 0.4497287018982541, + "learning_rate": 3.3496965226723156e-06, + "loss": 0.1178, + "step": 2166 + }, + { + "epoch": 0.62, + "grad_norm": 0.31694673104908516, + "learning_rate": 3.3453297233900713e-06, + "loss": 0.0649, + "step": 2167 + }, + { + "epoch": 0.62, + "grad_norm": 0.4098826564932513, + "learning_rate": 3.340964340983104e-06, + "loss": 0.0967, + "step": 2168 + }, + { + "epoch": 0.62, + "grad_norm": 0.26200242125710105, + "learning_rate": 3.336600379189444e-06, + "loss": 0.0531, + "step": 2169 + }, + { + "epoch": 0.62, + "grad_norm": 0.435971327270325, + "learning_rate": 3.3322378417458985e-06, + "loss": 0.1184, + "step": 2170 + }, + { + "epoch": 0.62, + "grad_norm": 0.3718658211792619, + "learning_rate": 3.327876732388061e-06, + "loss": 0.0503, + "step": 2171 + }, + { + "epoch": 0.62, + "grad_norm": 0.3756197979693583, + "learning_rate": 3.323517054850298e-06, + "loss": 0.1149, + "step": 2172 + }, + { + "epoch": 0.62, + "grad_norm": 0.3569703926571223, + "learning_rate": 3.3191588128657537e-06, + "loss": 0.088, + "step": 2173 + }, + { + "epoch": 0.62, + "grad_norm": 0.38788936508162164, + "learning_rate": 3.314802010166337e-06, + "loss": 0.1116, + "step": 2174 + }, + { + "epoch": 0.62, + "grad_norm": 0.8353656774664352, + "learning_rate": 3.3104466504827327e-06, + "loss": 0.1362, + "step": 2175 + }, + { + "epoch": 0.62, + "grad_norm": 0.4300659430461634, + "learning_rate": 3.3060927375443845e-06, + "loss": 0.0702, + "step": 2176 + }, + { + "epoch": 0.62, + "grad_norm": 0.2865548349541641, + "learning_rate": 3.3017402750794976e-06, + "loss": 0.0845, + "step": 2177 + }, + { + "epoch": 0.62, + "grad_norm": 0.9610315165190803, + "learning_rate": 3.2973892668150365e-06, + "loss": 0.1596, + "step": 2178 + }, + { + "epoch": 0.62, + "grad_norm": 0.429895758130708, + "learning_rate": 3.293039716476719e-06, + "loss": 0.1016, + "step": 2179 + }, + { + "epoch": 0.62, + "grad_norm": 0.367772969826401, + "learning_rate": 3.288691627789017e-06, + "loss": 0.0625, + "step": 2180 + }, + { + "epoch": 0.62, + "grad_norm": 0.3544993583005483, + "learning_rate": 3.2843450044751468e-06, + "loss": 0.0913, + "step": 2181 + }, + { + "epoch": 0.62, + "grad_norm": 0.6121129591343168, + "learning_rate": 3.279999850257076e-06, + "loss": 0.1027, + "step": 2182 + }, + { + "epoch": 0.62, + "grad_norm": 0.3790910928707313, + "learning_rate": 3.275656168855506e-06, + "loss": 0.0752, + "step": 2183 + }, + { + "epoch": 0.62, + "grad_norm": 0.44063285211477354, + "learning_rate": 3.271313963989886e-06, + "loss": 0.083, + "step": 2184 + }, + { + "epoch": 0.62, + "grad_norm": 0.43900728889382, + "learning_rate": 3.2669732393783944e-06, + "loss": 0.0853, + "step": 2185 + }, + { + "epoch": 0.62, + "grad_norm": 0.46661852778740603, + "learning_rate": 3.262633998737943e-06, + "loss": 0.1349, + "step": 2186 + }, + { + "epoch": 0.62, + "grad_norm": 0.4818974737283349, + "learning_rate": 3.258296245784176e-06, + "loss": 0.0918, + "step": 2187 + }, + { + "epoch": 0.63, + "grad_norm": 0.3958916826996518, + "learning_rate": 3.253959984231461e-06, + "loss": 0.062, + "step": 2188 + }, + { + "epoch": 0.63, + "grad_norm": 0.8232773554824766, + "learning_rate": 3.24962521779289e-06, + "loss": 0.0629, + "step": 2189 + }, + { + "epoch": 0.63, + "grad_norm": 0.4250745879875681, + "learning_rate": 3.2452919501802714e-06, + "loss": 0.0653, + "step": 2190 + }, + { + "epoch": 0.63, + "grad_norm": 0.4639581230707936, + "learning_rate": 3.240960185104137e-06, + "loss": 0.1221, + "step": 2191 + }, + { + "epoch": 0.63, + "grad_norm": 0.6113470051884101, + "learning_rate": 3.2366299262737246e-06, + "loss": 0.1047, + "step": 2192 + }, + { + "epoch": 0.63, + "grad_norm": 0.17230213607604017, + "learning_rate": 3.232301177396987e-06, + "loss": 0.0503, + "step": 2193 + }, + { + "epoch": 0.63, + "grad_norm": 0.5241393634202148, + "learning_rate": 3.227973942180581e-06, + "loss": 0.1253, + "step": 2194 + }, + { + "epoch": 0.63, + "grad_norm": 0.32115684457500066, + "learning_rate": 3.223648224329872e-06, + "loss": 0.084, + "step": 2195 + }, + { + "epoch": 0.63, + "grad_norm": 0.35818981969753083, + "learning_rate": 3.219324027548918e-06, + "loss": 0.0801, + "step": 2196 + }, + { + "epoch": 0.63, + "grad_norm": 0.4356633496199792, + "learning_rate": 3.215001355540483e-06, + "loss": 0.0964, + "step": 2197 + }, + { + "epoch": 0.63, + "grad_norm": 0.5290389502712896, + "learning_rate": 3.2106802120060197e-06, + "loss": 0.0986, + "step": 2198 + }, + { + "epoch": 0.63, + "grad_norm": 0.7535690752406217, + "learning_rate": 3.206360600645676e-06, + "loss": 0.1215, + "step": 2199 + }, + { + "epoch": 0.63, + "grad_norm": 0.33861654307401245, + "learning_rate": 3.202042525158284e-06, + "loss": 0.072, + "step": 2200 + }, + { + "epoch": 0.63, + "grad_norm": 0.5242903634387209, + "learning_rate": 3.197725989241364e-06, + "loss": 0.0874, + "step": 2201 + }, + { + "epoch": 0.63, + "grad_norm": 0.46914209904777004, + "learning_rate": 3.1934109965911166e-06, + "loss": 0.102, + "step": 2202 + }, + { + "epoch": 0.63, + "grad_norm": 0.7636962947952328, + "learning_rate": 3.1890975509024174e-06, + "loss": 0.1623, + "step": 2203 + }, + { + "epoch": 0.63, + "grad_norm": 0.3609409113896043, + "learning_rate": 3.184785655868825e-06, + "loss": 0.0725, + "step": 2204 + }, + { + "epoch": 0.63, + "grad_norm": 0.5225538568770066, + "learning_rate": 3.180475315182563e-06, + "loss": 0.1297, + "step": 2205 + }, + { + "epoch": 0.63, + "grad_norm": 0.46137712922314433, + "learning_rate": 3.1761665325345285e-06, + "loss": 0.0935, + "step": 2206 + }, + { + "epoch": 0.63, + "grad_norm": 0.2626466733851561, + "learning_rate": 3.1718593116142814e-06, + "loss": 0.0317, + "step": 2207 + }, + { + "epoch": 0.63, + "grad_norm": 0.30894544511701955, + "learning_rate": 3.167553656110048e-06, + "loss": 0.0945, + "step": 2208 + }, + { + "epoch": 0.63, + "grad_norm": 0.41646567609931173, + "learning_rate": 3.1632495697087105e-06, + "loss": 0.1083, + "step": 2209 + }, + { + "epoch": 0.63, + "grad_norm": 0.5049042907399488, + "learning_rate": 3.1589470560958104e-06, + "loss": 0.104, + "step": 2210 + }, + { + "epoch": 0.63, + "grad_norm": 0.21207648860814676, + "learning_rate": 3.1546461189555388e-06, + "loss": 0.0315, + "step": 2211 + }, + { + "epoch": 0.63, + "grad_norm": 0.39901941438593314, + "learning_rate": 3.1503467619707407e-06, + "loss": 0.0805, + "step": 2212 + }, + { + "epoch": 0.63, + "grad_norm": 0.383123390502709, + "learning_rate": 3.146048988822905e-06, + "loss": 0.0615, + "step": 2213 + }, + { + "epoch": 0.63, + "grad_norm": 0.3750701945577319, + "learning_rate": 3.1417528031921686e-06, + "loss": 0.0791, + "step": 2214 + }, + { + "epoch": 0.63, + "grad_norm": 0.24813876823351655, + "learning_rate": 3.1374582087573026e-06, + "loss": 0.0447, + "step": 2215 + }, + { + "epoch": 0.63, + "grad_norm": 0.2256052598341187, + "learning_rate": 3.133165209195722e-06, + "loss": 0.048, + "step": 2216 + }, + { + "epoch": 0.63, + "grad_norm": 0.264399776068169, + "learning_rate": 3.1288738081834734e-06, + "loss": 0.0658, + "step": 2217 + }, + { + "epoch": 0.63, + "grad_norm": 0.42808072206628417, + "learning_rate": 3.124584009395232e-06, + "loss": 0.0862, + "step": 2218 + }, + { + "epoch": 0.63, + "grad_norm": 0.3066496864333556, + "learning_rate": 3.1202958165043053e-06, + "loss": 0.0546, + "step": 2219 + }, + { + "epoch": 0.63, + "grad_norm": 0.4165697205719138, + "learning_rate": 3.1160092331826235e-06, + "loss": 0.1064, + "step": 2220 + }, + { + "epoch": 0.63, + "grad_norm": 0.43532672421992785, + "learning_rate": 3.11172426310074e-06, + "loss": 0.1159, + "step": 2221 + }, + { + "epoch": 0.63, + "grad_norm": 0.31113532383650017, + "learning_rate": 3.107440909927824e-06, + "loss": 0.0882, + "step": 2222 + }, + { + "epoch": 0.64, + "grad_norm": 0.3267355531934448, + "learning_rate": 3.1031591773316636e-06, + "loss": 0.0765, + "step": 2223 + }, + { + "epoch": 0.64, + "grad_norm": 0.41591711380489454, + "learning_rate": 3.0988790689786563e-06, + "loss": 0.0722, + "step": 2224 + }, + { + "epoch": 0.64, + "grad_norm": 0.2688967848001076, + "learning_rate": 3.0946005885338116e-06, + "loss": 0.0406, + "step": 2225 + }, + { + "epoch": 0.64, + "grad_norm": 0.515598215348305, + "learning_rate": 3.090323739660742e-06, + "loss": 0.1035, + "step": 2226 + }, + { + "epoch": 0.64, + "grad_norm": 0.3830700355400384, + "learning_rate": 3.0860485260216656e-06, + "loss": 0.0794, + "step": 2227 + }, + { + "epoch": 0.64, + "grad_norm": 0.4657863791560485, + "learning_rate": 3.081774951277397e-06, + "loss": 0.1038, + "step": 2228 + }, + { + "epoch": 0.64, + "grad_norm": 0.3447325976522972, + "learning_rate": 3.077503019087352e-06, + "loss": 0.0988, + "step": 2229 + }, + { + "epoch": 0.64, + "grad_norm": 0.45516248898991346, + "learning_rate": 3.073232733109536e-06, + "loss": 0.1096, + "step": 2230 + }, + { + "epoch": 0.64, + "grad_norm": 0.5297690308234168, + "learning_rate": 3.068964097000546e-06, + "loss": 0.0732, + "step": 2231 + }, + { + "epoch": 0.64, + "grad_norm": 0.29911509616116244, + "learning_rate": 3.064697114415567e-06, + "loss": 0.0619, + "step": 2232 + }, + { + "epoch": 0.64, + "grad_norm": 1.1595404958128304, + "learning_rate": 3.060431789008368e-06, + "loss": 0.098, + "step": 2233 + }, + { + "epoch": 0.64, + "grad_norm": 0.4594281388434096, + "learning_rate": 3.056168124431298e-06, + "loss": 0.1078, + "step": 2234 + }, + { + "epoch": 0.64, + "grad_norm": 0.4139696114976327, + "learning_rate": 3.0519061243352833e-06, + "loss": 0.105, + "step": 2235 + }, + { + "epoch": 0.64, + "grad_norm": 0.320372204104932, + "learning_rate": 3.0476457923698272e-06, + "loss": 0.0844, + "step": 2236 + }, + { + "epoch": 0.64, + "grad_norm": 0.4529211258444078, + "learning_rate": 3.043387132183002e-06, + "loss": 0.1162, + "step": 2237 + }, + { + "epoch": 0.64, + "grad_norm": 0.3511034466577657, + "learning_rate": 3.039130147421452e-06, + "loss": 0.0839, + "step": 2238 + }, + { + "epoch": 0.64, + "grad_norm": 0.33845853053609387, + "learning_rate": 3.0348748417303826e-06, + "loss": 0.0539, + "step": 2239 + }, + { + "epoch": 0.64, + "grad_norm": 0.6523629577883808, + "learning_rate": 3.0306212187535653e-06, + "loss": 0.1134, + "step": 2240 + }, + { + "epoch": 0.64, + "grad_norm": 0.2719026631173136, + "learning_rate": 3.026369282133327e-06, + "loss": 0.067, + "step": 2241 + }, + { + "epoch": 0.64, + "grad_norm": 0.4498298121008999, + "learning_rate": 3.022119035510554e-06, + "loss": 0.1098, + "step": 2242 + }, + { + "epoch": 0.64, + "grad_norm": 0.5022798077012678, + "learning_rate": 3.017870482524683e-06, + "loss": 0.1272, + "step": 2243 + }, + { + "epoch": 0.64, + "grad_norm": 0.2807535534499567, + "learning_rate": 3.0136236268137032e-06, + "loss": 0.0726, + "step": 2244 + }, + { + "epoch": 0.64, + "grad_norm": 0.5121126276985974, + "learning_rate": 3.0093784720141456e-06, + "loss": 0.0913, + "step": 2245 + }, + { + "epoch": 0.64, + "grad_norm": 0.26096862249592634, + "learning_rate": 3.005135021761091e-06, + "loss": 0.0724, + "step": 2246 + }, + { + "epoch": 0.64, + "grad_norm": 0.7872548946211441, + "learning_rate": 3.000893279688155e-06, + "loss": 0.1343, + "step": 2247 + }, + { + "epoch": 0.64, + "grad_norm": 0.3225511049616021, + "learning_rate": 2.996653249427496e-06, + "loss": 0.0654, + "step": 2248 + }, + { + "epoch": 0.64, + "grad_norm": 0.2511025912885566, + "learning_rate": 2.992414934609799e-06, + "loss": 0.0384, + "step": 2249 + }, + { + "epoch": 0.64, + "grad_norm": 0.2524792538389449, + "learning_rate": 2.988178338864289e-06, + "loss": 0.0614, + "step": 2250 + }, + { + "epoch": 0.64, + "grad_norm": 0.5460979812134171, + "learning_rate": 2.9839434658187128e-06, + "loss": 0.1359, + "step": 2251 + }, + { + "epoch": 0.64, + "grad_norm": 0.3406985894983384, + "learning_rate": 2.9797103190993424e-06, + "loss": 0.0801, + "step": 2252 + }, + { + "epoch": 0.64, + "grad_norm": 0.22919616400573684, + "learning_rate": 2.975478902330976e-06, + "loss": 0.0541, + "step": 2253 + }, + { + "epoch": 0.64, + "grad_norm": 0.4635200241574937, + "learning_rate": 2.9712492191369245e-06, + "loss": 0.0847, + "step": 2254 + }, + { + "epoch": 0.64, + "grad_norm": 0.6917975599765706, + "learning_rate": 2.9670212731390202e-06, + "loss": 0.1315, + "step": 2255 + }, + { + "epoch": 0.64, + "grad_norm": 0.2251775767650042, + "learning_rate": 2.9627950679576023e-06, + "loss": 0.0472, + "step": 2256 + }, + { + "epoch": 0.64, + "grad_norm": 0.6397004555752241, + "learning_rate": 2.958570607211525e-06, + "loss": 0.1295, + "step": 2257 + }, + { + "epoch": 0.65, + "grad_norm": 0.4237224899475914, + "learning_rate": 2.9543478945181425e-06, + "loss": 0.0561, + "step": 2258 + }, + { + "epoch": 0.65, + "grad_norm": 0.5939801535434834, + "learning_rate": 2.9501269334933193e-06, + "loss": 0.1463, + "step": 2259 + }, + { + "epoch": 0.65, + "grad_norm": 0.7527823900971068, + "learning_rate": 2.945907727751412e-06, + "loss": 0.101, + "step": 2260 + }, + { + "epoch": 0.65, + "grad_norm": 0.3487757755684883, + "learning_rate": 2.9416902809052817e-06, + "loss": 0.066, + "step": 2261 + }, + { + "epoch": 0.65, + "grad_norm": 0.5378627638414805, + "learning_rate": 2.9374745965662784e-06, + "loss": 0.0837, + "step": 2262 + }, + { + "epoch": 0.65, + "grad_norm": 0.760392574774493, + "learning_rate": 2.933260678344245e-06, + "loss": 0.1198, + "step": 2263 + }, + { + "epoch": 0.65, + "grad_norm": 0.32810580236675957, + "learning_rate": 2.9290485298475115e-06, + "loss": 0.1006, + "step": 2264 + }, + { + "epoch": 0.65, + "grad_norm": 0.2822204050564391, + "learning_rate": 2.924838154682893e-06, + "loss": 0.0705, + "step": 2265 + }, + { + "epoch": 0.65, + "grad_norm": 0.5341164505024624, + "learning_rate": 2.920629556455684e-06, + "loss": 0.0847, + "step": 2266 + }, + { + "epoch": 0.65, + "grad_norm": 0.3743597615263854, + "learning_rate": 2.916422738769663e-06, + "loss": 0.0841, + "step": 2267 + }, + { + "epoch": 0.65, + "grad_norm": 0.7059118764674842, + "learning_rate": 2.912217705227075e-06, + "loss": 0.1319, + "step": 2268 + }, + { + "epoch": 0.65, + "grad_norm": 0.35156988006072726, + "learning_rate": 2.908014459428644e-06, + "loss": 0.0672, + "step": 2269 + }, + { + "epoch": 0.65, + "grad_norm": 0.3914407366252117, + "learning_rate": 2.9038130049735634e-06, + "loss": 0.0769, + "step": 2270 + }, + { + "epoch": 0.65, + "grad_norm": 0.2595686182686397, + "learning_rate": 2.899613345459491e-06, + "loss": 0.0632, + "step": 2271 + }, + { + "epoch": 0.65, + "grad_norm": 0.2833529473405738, + "learning_rate": 2.8954154844825433e-06, + "loss": 0.0643, + "step": 2272 + }, + { + "epoch": 0.65, + "grad_norm": 0.47269186350984344, + "learning_rate": 2.8912194256373034e-06, + "loss": 0.1092, + "step": 2273 + }, + { + "epoch": 0.65, + "grad_norm": 0.46887301792249453, + "learning_rate": 2.8870251725168086e-06, + "loss": 0.0981, + "step": 2274 + }, + { + "epoch": 0.65, + "grad_norm": 0.29914548278429276, + "learning_rate": 2.882832728712551e-06, + "loss": 0.0647, + "step": 2275 + }, + { + "epoch": 0.65, + "grad_norm": 0.582651436150537, + "learning_rate": 2.8786420978144696e-06, + "loss": 0.1176, + "step": 2276 + }, + { + "epoch": 0.65, + "grad_norm": 0.37627485850216297, + "learning_rate": 2.8744532834109556e-06, + "loss": 0.0807, + "step": 2277 + }, + { + "epoch": 0.65, + "grad_norm": 0.9227049972258796, + "learning_rate": 2.870266289088842e-06, + "loss": 0.0877, + "step": 2278 + }, + { + "epoch": 0.65, + "grad_norm": 0.35587640966716405, + "learning_rate": 2.866081118433408e-06, + "loss": 0.0694, + "step": 2279 + }, + { + "epoch": 0.65, + "grad_norm": 0.462732949217908, + "learning_rate": 2.8618977750283605e-06, + "loss": 0.0932, + "step": 2280 + }, + { + "epoch": 0.65, + "grad_norm": 0.5029208423475465, + "learning_rate": 2.857716262455854e-06, + "loss": 0.1059, + "step": 2281 + }, + { + "epoch": 0.65, + "grad_norm": 0.25775354993098337, + "learning_rate": 2.8535365842964713e-06, + "loss": 0.0536, + "step": 2282 + }, + { + "epoch": 0.65, + "grad_norm": 0.6197971299923342, + "learning_rate": 2.849358744129218e-06, + "loss": 0.0819, + "step": 2283 + }, + { + "epoch": 0.65, + "grad_norm": 0.32764532195894824, + "learning_rate": 2.845182745531534e-06, + "loss": 0.1028, + "step": 2284 + }, + { + "epoch": 0.65, + "grad_norm": 0.540918453714262, + "learning_rate": 2.841008592079281e-06, + "loss": 0.1116, + "step": 2285 + }, + { + "epoch": 0.65, + "grad_norm": 0.49124903855960494, + "learning_rate": 2.8368362873467335e-06, + "loss": 0.0937, + "step": 2286 + }, + { + "epoch": 0.65, + "grad_norm": 0.5143010565119079, + "learning_rate": 2.832665834906593e-06, + "loss": 0.0796, + "step": 2287 + }, + { + "epoch": 0.65, + "grad_norm": 0.30782139294150074, + "learning_rate": 2.8284972383299694e-06, + "loss": 0.0743, + "step": 2288 + }, + { + "epoch": 0.65, + "grad_norm": 0.42864511735546884, + "learning_rate": 2.8243305011863843e-06, + "loss": 0.083, + "step": 2289 + }, + { + "epoch": 0.65, + "grad_norm": 0.38034608340888637, + "learning_rate": 2.8201656270437662e-06, + "loss": 0.0827, + "step": 2290 + }, + { + "epoch": 0.65, + "grad_norm": 0.4534563520465355, + "learning_rate": 2.81600261946845e-06, + "loss": 0.095, + "step": 2291 + }, + { + "epoch": 0.65, + "grad_norm": 0.3548952684845326, + "learning_rate": 2.8118414820251707e-06, + "loss": 0.0987, + "step": 2292 + }, + { + "epoch": 0.66, + "grad_norm": 0.28516233145113645, + "learning_rate": 2.8076822182770656e-06, + "loss": 0.09, + "step": 2293 + }, + { + "epoch": 0.66, + "grad_norm": 0.42426040806675386, + "learning_rate": 2.8035248317856615e-06, + "loss": 0.0991, + "step": 2294 + }, + { + "epoch": 0.66, + "grad_norm": 0.3887180211646909, + "learning_rate": 2.7993693261108823e-06, + "loss": 0.1032, + "step": 2295 + }, + { + "epoch": 0.66, + "grad_norm": 0.5686559544022615, + "learning_rate": 2.7952157048110406e-06, + "loss": 0.1218, + "step": 2296 + }, + { + "epoch": 0.66, + "grad_norm": 0.29805688973147837, + "learning_rate": 2.791063971442836e-06, + "loss": 0.0733, + "step": 2297 + }, + { + "epoch": 0.66, + "grad_norm": 0.355908684610805, + "learning_rate": 2.7869141295613472e-06, + "loss": 0.0913, + "step": 2298 + }, + { + "epoch": 0.66, + "grad_norm": 0.5548207284391696, + "learning_rate": 2.782766182720038e-06, + "loss": 0.1288, + "step": 2299 + }, + { + "epoch": 0.66, + "grad_norm": 0.5217284728277821, + "learning_rate": 2.7786201344707487e-06, + "loss": 0.1233, + "step": 2300 + }, + { + "epoch": 0.66, + "grad_norm": 0.4866602616651964, + "learning_rate": 2.774475988363694e-06, + "loss": 0.1037, + "step": 2301 + }, + { + "epoch": 0.66, + "grad_norm": 0.2904837015128024, + "learning_rate": 2.770333747947455e-06, + "loss": 0.0466, + "step": 2302 + }, + { + "epoch": 0.66, + "grad_norm": 0.33017920417950397, + "learning_rate": 2.7661934167689887e-06, + "loss": 0.1245, + "step": 2303 + }, + { + "epoch": 0.66, + "grad_norm": 0.3270457335369551, + "learning_rate": 2.762054998373613e-06, + "loss": 0.0462, + "step": 2304 + }, + { + "epoch": 0.66, + "grad_norm": 0.36402043517321275, + "learning_rate": 2.7579184963050056e-06, + "loss": 0.094, + "step": 2305 + }, + { + "epoch": 0.66, + "grad_norm": 0.23328938667347496, + "learning_rate": 2.753783914105208e-06, + "loss": 0.0527, + "step": 2306 + }, + { + "epoch": 0.66, + "grad_norm": 0.4116614024354482, + "learning_rate": 2.749651255314616e-06, + "loss": 0.0804, + "step": 2307 + }, + { + "epoch": 0.66, + "grad_norm": 0.35467948540901745, + "learning_rate": 2.7455205234719797e-06, + "loss": 0.1148, + "step": 2308 + }, + { + "epoch": 0.66, + "grad_norm": 0.3075646107856484, + "learning_rate": 2.741391722114394e-06, + "loss": 0.0664, + "step": 2309 + }, + { + "epoch": 0.66, + "grad_norm": 0.5760189831599258, + "learning_rate": 2.7372648547773063e-06, + "loss": 0.083, + "step": 2310 + }, + { + "epoch": 0.66, + "grad_norm": 0.4691585993981161, + "learning_rate": 2.733139924994505e-06, + "loss": 0.0907, + "step": 2311 + }, + { + "epoch": 0.66, + "grad_norm": 0.4152150789814372, + "learning_rate": 2.729016936298124e-06, + "loss": 0.1374, + "step": 2312 + }, + { + "epoch": 0.66, + "grad_norm": 0.37046898927892935, + "learning_rate": 2.7248958922186263e-06, + "loss": 0.1009, + "step": 2313 + }, + { + "epoch": 0.66, + "grad_norm": 0.5507208814623268, + "learning_rate": 2.720776796284818e-06, + "loss": 0.1398, + "step": 2314 + }, + { + "epoch": 0.66, + "grad_norm": 0.45677895593757234, + "learning_rate": 2.716659652023833e-06, + "loss": 0.0814, + "step": 2315 + }, + { + "epoch": 0.66, + "grad_norm": 0.4696547355345099, + "learning_rate": 2.7125444629611376e-06, + "loss": 0.1145, + "step": 2316 + }, + { + "epoch": 0.66, + "grad_norm": 0.262415017630031, + "learning_rate": 2.7084312326205164e-06, + "loss": 0.0743, + "step": 2317 + }, + { + "epoch": 0.66, + "grad_norm": 0.34551270325499794, + "learning_rate": 2.704319964524085e-06, + "loss": 0.1069, + "step": 2318 + }, + { + "epoch": 0.66, + "grad_norm": 0.42502155045237316, + "learning_rate": 2.700210662192276e-06, + "loss": 0.1037, + "step": 2319 + }, + { + "epoch": 0.66, + "grad_norm": 0.4374639820148765, + "learning_rate": 2.6961033291438343e-06, + "loss": 0.0675, + "step": 2320 + }, + { + "epoch": 0.66, + "grad_norm": 0.36941815964521674, + "learning_rate": 2.6919979688958255e-06, + "loss": 0.0922, + "step": 2321 + }, + { + "epoch": 0.66, + "grad_norm": 0.35262405423393256, + "learning_rate": 2.6878945849636206e-06, + "loss": 0.0954, + "step": 2322 + }, + { + "epoch": 0.66, + "grad_norm": 0.4429212320097036, + "learning_rate": 2.683793180860905e-06, + "loss": 0.1116, + "step": 2323 + }, + { + "epoch": 0.66, + "grad_norm": 0.4694217281221106, + "learning_rate": 2.6796937600996587e-06, + "loss": 0.0957, + "step": 2324 + }, + { + "epoch": 0.66, + "grad_norm": 0.3089160314002098, + "learning_rate": 2.6755963261901706e-06, + "loss": 0.0707, + "step": 2325 + }, + { + "epoch": 0.66, + "grad_norm": 0.8337121284683863, + "learning_rate": 2.671500882641027e-06, + "loss": 0.1809, + "step": 2326 + }, + { + "epoch": 0.66, + "grad_norm": 0.35030972559754975, + "learning_rate": 2.6674074329591127e-06, + "loss": 0.0749, + "step": 2327 + }, + { + "epoch": 0.67, + "grad_norm": 0.6028853006788526, + "learning_rate": 2.6633159806495967e-06, + "loss": 0.1404, + "step": 2328 + }, + { + "epoch": 0.67, + "grad_norm": 0.43589668081556143, + "learning_rate": 2.6592265292159446e-06, + "loss": 0.0852, + "step": 2329 + }, + { + "epoch": 0.67, + "grad_norm": 0.5621747812600117, + "learning_rate": 2.655139082159908e-06, + "loss": 0.117, + "step": 2330 + }, + { + "epoch": 0.67, + "grad_norm": 0.5545220254320469, + "learning_rate": 2.6510536429815224e-06, + "loss": 0.0997, + "step": 2331 + }, + { + "epoch": 0.67, + "grad_norm": 0.32132232425096724, + "learning_rate": 2.6469702151791e-06, + "loss": 0.0761, + "step": 2332 + }, + { + "epoch": 0.67, + "grad_norm": 0.34028601038042017, + "learning_rate": 2.6428888022492335e-06, + "loss": 0.0973, + "step": 2333 + }, + { + "epoch": 0.67, + "grad_norm": 0.3065868212890701, + "learning_rate": 2.6388094076867916e-06, + "loss": 0.0992, + "step": 2334 + }, + { + "epoch": 0.67, + "grad_norm": 0.31547156332143655, + "learning_rate": 2.634732034984915e-06, + "loss": 0.097, + "step": 2335 + }, + { + "epoch": 0.67, + "grad_norm": 0.39473869436419473, + "learning_rate": 2.6306566876350072e-06, + "loss": 0.0886, + "step": 2336 + }, + { + "epoch": 0.67, + "grad_norm": 0.4191660642892399, + "learning_rate": 2.6265833691267438e-06, + "loss": 0.0876, + "step": 2337 + }, + { + "epoch": 0.67, + "grad_norm": 0.4452460547410256, + "learning_rate": 2.622512082948063e-06, + "loss": 0.0907, + "step": 2338 + }, + { + "epoch": 0.67, + "grad_norm": 0.3975032750118112, + "learning_rate": 2.6184428325851576e-06, + "loss": 0.0554, + "step": 2339 + }, + { + "epoch": 0.67, + "grad_norm": 0.4047627494777148, + "learning_rate": 2.6143756215224803e-06, + "loss": 0.1015, + "step": 2340 + }, + { + "epoch": 0.67, + "grad_norm": 0.5047816744255545, + "learning_rate": 2.6103104532427392e-06, + "loss": 0.115, + "step": 2341 + }, + { + "epoch": 0.67, + "grad_norm": 0.42429120474061843, + "learning_rate": 2.606247331226892e-06, + "loss": 0.0852, + "step": 2342 + }, + { + "epoch": 0.67, + "grad_norm": 0.5137784528616098, + "learning_rate": 2.6021862589541413e-06, + "loss": 0.0951, + "step": 2343 + }, + { + "epoch": 0.67, + "grad_norm": 0.6173231686309556, + "learning_rate": 2.5981272399019384e-06, + "loss": 0.1037, + "step": 2344 + }, + { + "epoch": 0.67, + "grad_norm": 0.4402885133628901, + "learning_rate": 2.594070277545975e-06, + "loss": 0.1001, + "step": 2345 + }, + { + "epoch": 0.67, + "grad_norm": 0.47376847868750077, + "learning_rate": 2.590015375360183e-06, + "loss": 0.094, + "step": 2346 + }, + { + "epoch": 0.67, + "grad_norm": 0.6065813528577138, + "learning_rate": 2.585962536816725e-06, + "loss": 0.1145, + "step": 2347 + }, + { + "epoch": 0.67, + "grad_norm": 0.33636633356163, + "learning_rate": 2.581911765386004e-06, + "loss": 0.0589, + "step": 2348 + }, + { + "epoch": 0.67, + "grad_norm": 0.3881472042788369, + "learning_rate": 2.5778630645366477e-06, + "loss": 0.084, + "step": 2349 + }, + { + "epoch": 0.67, + "grad_norm": 0.5420653870323854, + "learning_rate": 2.5738164377355148e-06, + "loss": 0.0987, + "step": 2350 + }, + { + "epoch": 0.67, + "grad_norm": 0.5276164136281538, + "learning_rate": 2.569771888447682e-06, + "loss": 0.0973, + "step": 2351 + }, + { + "epoch": 0.67, + "grad_norm": 0.3759638370555702, + "learning_rate": 2.5657294201364526e-06, + "loss": 0.0692, + "step": 2352 + }, + { + "epoch": 0.67, + "grad_norm": 0.5737221650371205, + "learning_rate": 2.561689036263347e-06, + "loss": 0.1292, + "step": 2353 + }, + { + "epoch": 0.67, + "grad_norm": 1.616287565521532, + "learning_rate": 2.557650740288098e-06, + "loss": 0.1382, + "step": 2354 + }, + { + "epoch": 0.67, + "grad_norm": 0.33382954054747793, + "learning_rate": 2.5536145356686528e-06, + "loss": 0.0863, + "step": 2355 + }, + { + "epoch": 0.67, + "grad_norm": 0.5897948544876979, + "learning_rate": 2.5495804258611667e-06, + "loss": 0.1484, + "step": 2356 + }, + { + "epoch": 0.67, + "grad_norm": 0.2847946190578612, + "learning_rate": 2.545548414320006e-06, + "loss": 0.0624, + "step": 2357 + }, + { + "epoch": 0.67, + "grad_norm": 0.3132650762979806, + "learning_rate": 2.541518504497731e-06, + "loss": 0.0455, + "step": 2358 + }, + { + "epoch": 0.67, + "grad_norm": 0.5106073690364898, + "learning_rate": 2.5374906998451094e-06, + "loss": 0.0776, + "step": 2359 + }, + { + "epoch": 0.67, + "grad_norm": 0.5190182152117772, + "learning_rate": 2.5334650038111045e-06, + "loss": 0.0939, + "step": 2360 + }, + { + "epoch": 0.67, + "grad_norm": 0.32729181393164075, + "learning_rate": 2.5294414198428764e-06, + "loss": 0.0756, + "step": 2361 + }, + { + "epoch": 0.67, + "grad_norm": 0.30135334398108327, + "learning_rate": 2.525419951385769e-06, + "loss": 0.0653, + "step": 2362 + }, + { + "epoch": 0.68, + "grad_norm": 0.5707745801369596, + "learning_rate": 2.5214006018833225e-06, + "loss": 0.0994, + "step": 2363 + }, + { + "epoch": 0.68, + "grad_norm": 0.5057873530785689, + "learning_rate": 2.5173833747772614e-06, + "loss": 0.0739, + "step": 2364 + }, + { + "epoch": 0.68, + "grad_norm": 0.46545610006026966, + "learning_rate": 2.5133682735074904e-06, + "loss": 0.1025, + "step": 2365 + }, + { + "epoch": 0.68, + "grad_norm": 0.37332773727241425, + "learning_rate": 2.5093553015120937e-06, + "loss": 0.0291, + "step": 2366 + }, + { + "epoch": 0.68, + "grad_norm": 0.46681952599272564, + "learning_rate": 2.5053444622273336e-06, + "loss": 0.0605, + "step": 2367 + }, + { + "epoch": 0.68, + "grad_norm": 0.5989662159459328, + "learning_rate": 2.5013357590876496e-06, + "loss": 0.1448, + "step": 2368 + }, + { + "epoch": 0.68, + "grad_norm": 0.4341230373512625, + "learning_rate": 2.4973291955256427e-06, + "loss": 0.1006, + "step": 2369 + }, + { + "epoch": 0.68, + "grad_norm": 0.29255441779312863, + "learning_rate": 2.4933247749720912e-06, + "loss": 0.0699, + "step": 2370 + }, + { + "epoch": 0.68, + "grad_norm": 0.23899279770949503, + "learning_rate": 2.4893225008559335e-06, + "loss": 0.0459, + "step": 2371 + }, + { + "epoch": 0.68, + "grad_norm": 1.6315854138492232, + "learning_rate": 2.4853223766042737e-06, + "loss": 0.0829, + "step": 2372 + }, + { + "epoch": 0.68, + "grad_norm": 0.573114555979581, + "learning_rate": 2.4813244056423692e-06, + "loss": 0.0958, + "step": 2373 + }, + { + "epoch": 0.68, + "grad_norm": 0.44635644161624893, + "learning_rate": 2.477328591393638e-06, + "loss": 0.0758, + "step": 2374 + }, + { + "epoch": 0.68, + "grad_norm": 0.4551954292578954, + "learning_rate": 2.4733349372796506e-06, + "loss": 0.0987, + "step": 2375 + }, + { + "epoch": 0.68, + "grad_norm": 0.31487377604106886, + "learning_rate": 2.4693434467201293e-06, + "loss": 0.0774, + "step": 2376 + }, + { + "epoch": 0.68, + "grad_norm": 0.6363468084043964, + "learning_rate": 2.4653541231329387e-06, + "loss": 0.1264, + "step": 2377 + }, + { + "epoch": 0.68, + "grad_norm": 0.321675658045344, + "learning_rate": 2.461366969934092e-06, + "loss": 0.0885, + "step": 2378 + }, + { + "epoch": 0.68, + "grad_norm": 0.6557954818417866, + "learning_rate": 2.457381990537743e-06, + "loss": 0.0874, + "step": 2379 + }, + { + "epoch": 0.68, + "grad_norm": 0.31224448558205475, + "learning_rate": 2.4533991883561868e-06, + "loss": 0.0708, + "step": 2380 + }, + { + "epoch": 0.68, + "grad_norm": 0.3424106441845974, + "learning_rate": 2.4494185667998473e-06, + "loss": 0.0562, + "step": 2381 + }, + { + "epoch": 0.68, + "grad_norm": 0.26569710307490246, + "learning_rate": 2.4454401292772874e-06, + "loss": 0.0753, + "step": 2382 + }, + { + "epoch": 0.68, + "grad_norm": 0.34777337003477676, + "learning_rate": 2.4414638791951972e-06, + "loss": 0.0545, + "step": 2383 + }, + { + "epoch": 0.68, + "grad_norm": 0.3597508592783276, + "learning_rate": 2.437489819958396e-06, + "loss": 0.0544, + "step": 2384 + }, + { + "epoch": 0.68, + "grad_norm": 0.4589742365365593, + "learning_rate": 2.4335179549698233e-06, + "loss": 0.068, + "step": 2385 + }, + { + "epoch": 0.68, + "grad_norm": 0.5046809887906143, + "learning_rate": 2.429548287630542e-06, + "loss": 0.1102, + "step": 2386 + }, + { + "epoch": 0.68, + "grad_norm": 0.31299804205560783, + "learning_rate": 2.425580821339733e-06, + "loss": 0.0433, + "step": 2387 + }, + { + "epoch": 0.68, + "grad_norm": 0.26008699604595936, + "learning_rate": 2.421615559494693e-06, + "loss": 0.0552, + "step": 2388 + }, + { + "epoch": 0.68, + "grad_norm": 0.6297760568594132, + "learning_rate": 2.417652505490831e-06, + "loss": 0.1029, + "step": 2389 + }, + { + "epoch": 0.68, + "grad_norm": 0.4213946805287631, + "learning_rate": 2.4136916627216656e-06, + "loss": 0.0934, + "step": 2390 + }, + { + "epoch": 0.68, + "grad_norm": 0.390487020380129, + "learning_rate": 2.4097330345788184e-06, + "loss": 0.058, + "step": 2391 + }, + { + "epoch": 0.68, + "grad_norm": 0.7391042862420466, + "learning_rate": 2.4057766244520185e-06, + "loss": 0.1124, + "step": 2392 + }, + { + "epoch": 0.68, + "grad_norm": 0.316328593496384, + "learning_rate": 2.4018224357290952e-06, + "loss": 0.0647, + "step": 2393 + }, + { + "epoch": 0.68, + "grad_norm": 0.5438667023288013, + "learning_rate": 2.3978704717959777e-06, + "loss": 0.1048, + "step": 2394 + }, + { + "epoch": 0.68, + "grad_norm": 0.464045075823488, + "learning_rate": 2.393920736036683e-06, + "loss": 0.0812, + "step": 2395 + }, + { + "epoch": 0.68, + "grad_norm": 0.30258637973305913, + "learning_rate": 2.3899732318333275e-06, + "loss": 0.073, + "step": 2396 + }, + { + "epoch": 0.68, + "grad_norm": 0.30139675813601924, + "learning_rate": 2.3860279625661125e-06, + "loss": 0.0825, + "step": 2397 + }, + { + "epoch": 0.69, + "grad_norm": 0.6560813055550506, + "learning_rate": 2.3820849316133303e-06, + "loss": 0.102, + "step": 2398 + }, + { + "epoch": 0.69, + "grad_norm": 0.7185847389274075, + "learning_rate": 2.3781441423513497e-06, + "loss": 0.1199, + "step": 2399 + }, + { + "epoch": 0.69, + "grad_norm": 0.2171484352781935, + "learning_rate": 2.374205598154624e-06, + "loss": 0.0439, + "step": 2400 + }, + { + "epoch": 0.69, + "grad_norm": 0.575515360659286, + "learning_rate": 2.3702693023956853e-06, + "loss": 0.0949, + "step": 2401 + }, + { + "epoch": 0.69, + "grad_norm": 0.47632807746805766, + "learning_rate": 2.3663352584451394e-06, + "loss": 0.0777, + "step": 2402 + }, + { + "epoch": 0.69, + "grad_norm": 0.5535319658893976, + "learning_rate": 2.36240346967166e-06, + "loss": 0.0616, + "step": 2403 + }, + { + "epoch": 0.69, + "grad_norm": 0.980873831478703, + "learning_rate": 2.3584739394419953e-06, + "loss": 0.1417, + "step": 2404 + }, + { + "epoch": 0.69, + "grad_norm": 0.33218187698802715, + "learning_rate": 2.3545466711209585e-06, + "loss": 0.0918, + "step": 2405 + }, + { + "epoch": 0.69, + "grad_norm": 0.3762678115177452, + "learning_rate": 2.3506216680714204e-06, + "loss": 0.0813, + "step": 2406 + }, + { + "epoch": 0.69, + "grad_norm": 0.44929720481790353, + "learning_rate": 2.3466989336543196e-06, + "loss": 0.1212, + "step": 2407 + }, + { + "epoch": 0.69, + "grad_norm": 0.4881987037263634, + "learning_rate": 2.342778471228648e-06, + "loss": 0.0781, + "step": 2408 + }, + { + "epoch": 0.69, + "grad_norm": 0.37359657823670966, + "learning_rate": 2.3388602841514542e-06, + "loss": 0.0694, + "step": 2409 + }, + { + "epoch": 0.69, + "grad_norm": 0.45748035159517075, + "learning_rate": 2.3349443757778346e-06, + "loss": 0.0742, + "step": 2410 + }, + { + "epoch": 0.69, + "grad_norm": 0.3542304760666429, + "learning_rate": 2.3310307494609374e-06, + "loss": 0.0905, + "step": 2411 + }, + { + "epoch": 0.69, + "grad_norm": 0.5466585898427205, + "learning_rate": 2.3271194085519565e-06, + "loss": 0.087, + "step": 2412 + }, + { + "epoch": 0.69, + "grad_norm": 0.6021142086632345, + "learning_rate": 2.323210356400131e-06, + "loss": 0.1261, + "step": 2413 + }, + { + "epoch": 0.69, + "grad_norm": 0.47097148145120954, + "learning_rate": 2.3193035963527345e-06, + "loss": 0.1298, + "step": 2414 + }, + { + "epoch": 0.69, + "grad_norm": 0.21159878346072342, + "learning_rate": 2.315399131755081e-06, + "loss": 0.0548, + "step": 2415 + }, + { + "epoch": 0.69, + "grad_norm": 0.3219946254576339, + "learning_rate": 2.3114969659505197e-06, + "loss": 0.0839, + "step": 2416 + }, + { + "epoch": 0.69, + "grad_norm": 0.2530687573818007, + "learning_rate": 2.307597102280434e-06, + "loss": 0.0695, + "step": 2417 + }, + { + "epoch": 0.69, + "grad_norm": 0.5291999227454721, + "learning_rate": 2.303699544084227e-06, + "loss": 0.0776, + "step": 2418 + }, + { + "epoch": 0.69, + "grad_norm": 0.3675315539376898, + "learning_rate": 2.2998042946993355e-06, + "loss": 0.0819, + "step": 2419 + }, + { + "epoch": 0.69, + "grad_norm": 0.33178216560844864, + "learning_rate": 2.2959113574612204e-06, + "loss": 0.0737, + "step": 2420 + }, + { + "epoch": 0.69, + "grad_norm": 0.28283196379546766, + "learning_rate": 2.2920207357033535e-06, + "loss": 0.0645, + "step": 2421 + }, + { + "epoch": 0.69, + "grad_norm": 0.3188858291542534, + "learning_rate": 2.2881324327572336e-06, + "loss": 0.0709, + "step": 2422 + }, + { + "epoch": 0.69, + "grad_norm": 0.3255783974618697, + "learning_rate": 2.2842464519523692e-06, + "loss": 0.0771, + "step": 2423 + }, + { + "epoch": 0.69, + "grad_norm": 0.5174623259760657, + "learning_rate": 2.2803627966162833e-06, + "loss": 0.0921, + "step": 2424 + }, + { + "epoch": 0.69, + "grad_norm": 0.48022556995191873, + "learning_rate": 2.2764814700745025e-06, + "loss": 0.0902, + "step": 2425 + }, + { + "epoch": 0.69, + "grad_norm": 0.624728299602598, + "learning_rate": 2.272602475650563e-06, + "loss": 0.098, + "step": 2426 + }, + { + "epoch": 0.69, + "grad_norm": 0.24497852595002126, + "learning_rate": 2.2687258166660055e-06, + "loss": 0.0602, + "step": 2427 + }, + { + "epoch": 0.69, + "grad_norm": 0.5642659374889866, + "learning_rate": 2.2648514964403685e-06, + "loss": 0.0993, + "step": 2428 + }, + { + "epoch": 0.69, + "grad_norm": 0.4953658521208724, + "learning_rate": 2.260979518291186e-06, + "loss": 0.0739, + "step": 2429 + }, + { + "epoch": 0.69, + "grad_norm": 0.4888118571508726, + "learning_rate": 2.25710988553399e-06, + "loss": 0.0698, + "step": 2430 + }, + { + "epoch": 0.69, + "grad_norm": 0.34404975575745816, + "learning_rate": 2.253242601482303e-06, + "loss": 0.0618, + "step": 2431 + }, + { + "epoch": 0.69, + "grad_norm": 0.5492917518585991, + "learning_rate": 2.2493776694476376e-06, + "loss": 0.1028, + "step": 2432 + }, + { + "epoch": 0.7, + "grad_norm": 0.3704229676752792, + "learning_rate": 2.245515092739488e-06, + "loss": 0.0656, + "step": 2433 + }, + { + "epoch": 0.7, + "grad_norm": 0.26439338016716446, + "learning_rate": 2.241654874665336e-06, + "loss": 0.0818, + "step": 2434 + }, + { + "epoch": 0.7, + "grad_norm": 0.36120165175789193, + "learning_rate": 2.2377970185306424e-06, + "loss": 0.0798, + "step": 2435 + }, + { + "epoch": 0.7, + "grad_norm": 0.42707191592250243, + "learning_rate": 2.233941527638848e-06, + "loss": 0.0978, + "step": 2436 + }, + { + "epoch": 0.7, + "grad_norm": 0.2885428286429536, + "learning_rate": 2.23008840529136e-06, + "loss": 0.068, + "step": 2437 + }, + { + "epoch": 0.7, + "grad_norm": 0.5509856220690513, + "learning_rate": 2.2262376547875665e-06, + "loss": 0.1508, + "step": 2438 + }, + { + "epoch": 0.7, + "grad_norm": 0.3659854627199059, + "learning_rate": 2.222389279424823e-06, + "loss": 0.0588, + "step": 2439 + }, + { + "epoch": 0.7, + "grad_norm": 0.3313354240544666, + "learning_rate": 2.2185432824984455e-06, + "loss": 0.0627, + "step": 2440 + }, + { + "epoch": 0.7, + "grad_norm": 0.510604467026776, + "learning_rate": 2.214699667301719e-06, + "loss": 0.1026, + "step": 2441 + }, + { + "epoch": 0.7, + "grad_norm": 0.4047591204620746, + "learning_rate": 2.2108584371258875e-06, + "loss": 0.0535, + "step": 2442 + }, + { + "epoch": 0.7, + "grad_norm": 0.4356057849030091, + "learning_rate": 2.207019595260154e-06, + "loss": 0.0863, + "step": 2443 + }, + { + "epoch": 0.7, + "grad_norm": 0.39166512422836935, + "learning_rate": 2.2031831449916716e-06, + "loss": 0.0915, + "step": 2444 + }, + { + "epoch": 0.7, + "grad_norm": 0.4891860123956304, + "learning_rate": 2.1993490896055514e-06, + "loss": 0.1222, + "step": 2445 + }, + { + "epoch": 0.7, + "grad_norm": 0.29687866285229086, + "learning_rate": 2.19551743238485e-06, + "loss": 0.0742, + "step": 2446 + }, + { + "epoch": 0.7, + "grad_norm": 0.5393478987922212, + "learning_rate": 2.191688176610575e-06, + "loss": 0.1401, + "step": 2447 + }, + { + "epoch": 0.7, + "grad_norm": 0.3816829342957274, + "learning_rate": 2.1878613255616693e-06, + "loss": 0.0704, + "step": 2448 + }, + { + "epoch": 0.7, + "grad_norm": 0.26128072920496465, + "learning_rate": 2.1840368825150243e-06, + "loss": 0.069, + "step": 2449 + }, + { + "epoch": 0.7, + "grad_norm": 0.42514841528539127, + "learning_rate": 2.1802148507454675e-06, + "loss": 0.075, + "step": 2450 + }, + { + "epoch": 0.7, + "grad_norm": 0.356997507982979, + "learning_rate": 2.1763952335257622e-06, + "loss": 0.0736, + "step": 2451 + }, + { + "epoch": 0.7, + "grad_norm": 0.3273788304692756, + "learning_rate": 2.172578034126599e-06, + "loss": 0.084, + "step": 2452 + }, + { + "epoch": 0.7, + "grad_norm": 0.2841996763065209, + "learning_rate": 2.1687632558166055e-06, + "loss": 0.0719, + "step": 2453 + }, + { + "epoch": 0.7, + "grad_norm": 0.3224434379173398, + "learning_rate": 2.1649509018623334e-06, + "loss": 0.0895, + "step": 2454 + }, + { + "epoch": 0.7, + "grad_norm": 1.0355583441148388, + "learning_rate": 2.1611409755282542e-06, + "loss": 0.1385, + "step": 2455 + }, + { + "epoch": 0.7, + "grad_norm": 0.3504069134414007, + "learning_rate": 2.157333480076767e-06, + "loss": 0.086, + "step": 2456 + }, + { + "epoch": 0.7, + "grad_norm": 0.38088794633133544, + "learning_rate": 2.1535284187681866e-06, + "loss": 0.0982, + "step": 2457 + }, + { + "epoch": 0.7, + "grad_norm": 0.24279834366214054, + "learning_rate": 2.149725794860745e-06, + "loss": 0.041, + "step": 2458 + }, + { + "epoch": 0.7, + "grad_norm": 0.26569461877107126, + "learning_rate": 2.1459256116105815e-06, + "loss": 0.0595, + "step": 2459 + }, + { + "epoch": 0.7, + "grad_norm": 0.5549982536064857, + "learning_rate": 2.1421278722717524e-06, + "loss": 0.0644, + "step": 2460 + }, + { + "epoch": 0.7, + "grad_norm": 0.31356757379161765, + "learning_rate": 2.1383325800962187e-06, + "loss": 0.0639, + "step": 2461 + }, + { + "epoch": 0.7, + "grad_norm": 0.4153190752587748, + "learning_rate": 2.1345397383338466e-06, + "loss": 0.076, + "step": 2462 + }, + { + "epoch": 0.7, + "grad_norm": 0.6208101669149098, + "learning_rate": 2.1307493502324007e-06, + "loss": 0.1198, + "step": 2463 + }, + { + "epoch": 0.7, + "grad_norm": 0.476865981895542, + "learning_rate": 2.1269614190375477e-06, + "loss": 0.0803, + "step": 2464 + }, + { + "epoch": 0.7, + "grad_norm": 0.3277038078003154, + "learning_rate": 2.123175947992851e-06, + "loss": 0.0765, + "step": 2465 + }, + { + "epoch": 0.7, + "grad_norm": 0.34938994313075933, + "learning_rate": 2.119392940339767e-06, + "loss": 0.0716, + "step": 2466 + }, + { + "epoch": 0.7, + "grad_norm": 0.45887568894719905, + "learning_rate": 2.1156123993176398e-06, + "loss": 0.0843, + "step": 2467 + }, + { + "epoch": 0.71, + "grad_norm": 0.5093631762615431, + "learning_rate": 2.1118343281637042e-06, + "loss": 0.0825, + "step": 2468 + }, + { + "epoch": 0.71, + "grad_norm": 0.260934900231959, + "learning_rate": 2.1080587301130795e-06, + "loss": 0.0546, + "step": 2469 + }, + { + "epoch": 0.71, + "grad_norm": 0.4705286799504309, + "learning_rate": 2.1042856083987694e-06, + "loss": 0.1055, + "step": 2470 + }, + { + "epoch": 0.71, + "grad_norm": 0.5109345615450195, + "learning_rate": 2.1005149662516517e-06, + "loss": 0.1004, + "step": 2471 + }, + { + "epoch": 0.71, + "grad_norm": 0.27146351549399605, + "learning_rate": 2.096746806900485e-06, + "loss": 0.0617, + "step": 2472 + }, + { + "epoch": 0.71, + "grad_norm": 0.6455686348885666, + "learning_rate": 2.0929811335719037e-06, + "loss": 0.1418, + "step": 2473 + }, + { + "epoch": 0.71, + "grad_norm": 0.33708652872222555, + "learning_rate": 2.0892179494904076e-06, + "loss": 0.0685, + "step": 2474 + }, + { + "epoch": 0.71, + "grad_norm": 0.37054245087436677, + "learning_rate": 2.085457257878369e-06, + "loss": 0.0427, + "step": 2475 + }, + { + "epoch": 0.71, + "grad_norm": 0.2960352781684179, + "learning_rate": 2.0816990619560264e-06, + "loss": 0.0489, + "step": 2476 + }, + { + "epoch": 0.71, + "grad_norm": 0.5256612762305234, + "learning_rate": 2.077943364941481e-06, + "loss": 0.0941, + "step": 2477 + }, + { + "epoch": 0.71, + "grad_norm": 0.583439390213951, + "learning_rate": 2.07419017005069e-06, + "loss": 0.114, + "step": 2478 + }, + { + "epoch": 0.71, + "grad_norm": 0.4838188104210831, + "learning_rate": 2.0704394804974732e-06, + "loss": 0.0782, + "step": 2479 + }, + { + "epoch": 0.71, + "grad_norm": 0.516294044621413, + "learning_rate": 2.0666912994935034e-06, + "loss": 0.131, + "step": 2480 + }, + { + "epoch": 0.71, + "grad_norm": 0.5609974918952368, + "learning_rate": 2.0629456302483068e-06, + "loss": 0.1124, + "step": 2481 + }, + { + "epoch": 0.71, + "grad_norm": 0.39316597613776283, + "learning_rate": 2.059202475969254e-06, + "loss": 0.0864, + "step": 2482 + }, + { + "epoch": 0.71, + "grad_norm": 0.5216492741893036, + "learning_rate": 2.0554618398615674e-06, + "loss": 0.1055, + "step": 2483 + }, + { + "epoch": 0.71, + "grad_norm": 0.304966561786873, + "learning_rate": 2.0517237251283096e-06, + "loss": 0.0662, + "step": 2484 + }, + { + "epoch": 0.71, + "grad_norm": 0.3776164971594819, + "learning_rate": 2.0479881349703885e-06, + "loss": 0.09, + "step": 2485 + }, + { + "epoch": 0.71, + "grad_norm": 1.3502129832354215, + "learning_rate": 2.0442550725865434e-06, + "loss": 0.0872, + "step": 2486 + }, + { + "epoch": 0.71, + "grad_norm": 0.2679011132795074, + "learning_rate": 2.0405245411733548e-06, + "loss": 0.0731, + "step": 2487 + }, + { + "epoch": 0.71, + "grad_norm": 0.5659730523629836, + "learning_rate": 2.036796543925236e-06, + "loss": 0.1215, + "step": 2488 + }, + { + "epoch": 0.71, + "grad_norm": 0.26544866911837645, + "learning_rate": 2.0330710840344257e-06, + "loss": 0.0544, + "step": 2489 + }, + { + "epoch": 0.71, + "grad_norm": 0.426339964067119, + "learning_rate": 2.0293481646909934e-06, + "loss": 0.0761, + "step": 2490 + }, + { + "epoch": 0.71, + "grad_norm": 0.18642279906911316, + "learning_rate": 2.025627789082833e-06, + "loss": 0.0311, + "step": 2491 + }, + { + "epoch": 0.71, + "grad_norm": 0.573801440813466, + "learning_rate": 2.021909960395661e-06, + "loss": 0.108, + "step": 2492 + }, + { + "epoch": 0.71, + "grad_norm": 0.3164197409282355, + "learning_rate": 2.0181946818130073e-06, + "loss": 0.0743, + "step": 2493 + }, + { + "epoch": 0.71, + "grad_norm": 0.28230931710733365, + "learning_rate": 2.0144819565162265e-06, + "loss": 0.0707, + "step": 2494 + }, + { + "epoch": 0.71, + "grad_norm": 0.2776470109975148, + "learning_rate": 2.010771787684484e-06, + "loss": 0.0731, + "step": 2495 + }, + { + "epoch": 0.71, + "grad_norm": 0.49469879552584534, + "learning_rate": 2.0070641784947516e-06, + "loss": 0.0919, + "step": 2496 + }, + { + "epoch": 0.71, + "grad_norm": 0.4661691610632084, + "learning_rate": 2.0033591321218136e-06, + "loss": 0.0767, + "step": 2497 + }, + { + "epoch": 0.71, + "grad_norm": 0.47424428622317366, + "learning_rate": 1.999656651738259e-06, + "loss": 0.0427, + "step": 2498 + }, + { + "epoch": 0.71, + "grad_norm": 0.39612728899221655, + "learning_rate": 1.9959567405144825e-06, + "loss": 0.0935, + "step": 2499 + }, + { + "epoch": 0.71, + "grad_norm": 0.3083803254418959, + "learning_rate": 1.9922594016186716e-06, + "loss": 0.0647, + "step": 2500 + }, + { + "epoch": 0.71, + "grad_norm": 0.44479842277769166, + "learning_rate": 1.9885646382168165e-06, + "loss": 0.0866, + "step": 2501 + }, + { + "epoch": 0.71, + "grad_norm": 0.662541347825402, + "learning_rate": 1.984872453472702e-06, + "loss": 0.1136, + "step": 2502 + }, + { + "epoch": 0.72, + "grad_norm": 0.3126659292319549, + "learning_rate": 1.9811828505479046e-06, + "loss": 0.0607, + "step": 2503 + }, + { + "epoch": 0.72, + "grad_norm": 0.5218609887142224, + "learning_rate": 1.9774958326017867e-06, + "loss": 0.0864, + "step": 2504 + }, + { + "epoch": 0.72, + "grad_norm": 0.40934548562644574, + "learning_rate": 1.9738114027915007e-06, + "loss": 0.0929, + "step": 2505 + }, + { + "epoch": 0.72, + "grad_norm": 0.29386181225783486, + "learning_rate": 1.9701295642719836e-06, + "loss": 0.0707, + "step": 2506 + }, + { + "epoch": 0.72, + "grad_norm": 0.495108064171112, + "learning_rate": 1.9664503201959483e-06, + "loss": 0.0769, + "step": 2507 + }, + { + "epoch": 0.72, + "grad_norm": 0.28809812690886766, + "learning_rate": 1.9627736737138914e-06, + "loss": 0.0457, + "step": 2508 + }, + { + "epoch": 0.72, + "grad_norm": 0.38777632566314457, + "learning_rate": 1.9590996279740837e-06, + "loss": 0.0858, + "step": 2509 + }, + { + "epoch": 0.72, + "grad_norm": 0.4790592223308331, + "learning_rate": 1.9554281861225694e-06, + "loss": 0.1072, + "step": 2510 + }, + { + "epoch": 0.72, + "grad_norm": 0.4780180012717242, + "learning_rate": 1.9517593513031596e-06, + "loss": 0.09, + "step": 2511 + }, + { + "epoch": 0.72, + "grad_norm": 0.37817152714820146, + "learning_rate": 1.948093126657437e-06, + "loss": 0.112, + "step": 2512 + }, + { + "epoch": 0.72, + "grad_norm": 0.38182769355388146, + "learning_rate": 1.944429515324749e-06, + "loss": 0.0734, + "step": 2513 + }, + { + "epoch": 0.72, + "grad_norm": 0.4388759296646012, + "learning_rate": 1.940768520442204e-06, + "loss": 0.0856, + "step": 2514 + }, + { + "epoch": 0.72, + "grad_norm": 0.25436412200026715, + "learning_rate": 1.9371101451446685e-06, + "loss": 0.0573, + "step": 2515 + }, + { + "epoch": 0.72, + "grad_norm": 0.5931175586972784, + "learning_rate": 1.9334543925647676e-06, + "loss": 0.1285, + "step": 2516 + }, + { + "epoch": 0.72, + "grad_norm": 0.5238592325729333, + "learning_rate": 1.929801265832882e-06, + "loss": 0.1078, + "step": 2517 + }, + { + "epoch": 0.72, + "grad_norm": 0.35228657987798356, + "learning_rate": 1.9261507680771435e-06, + "loss": 0.0739, + "step": 2518 + }, + { + "epoch": 0.72, + "grad_norm": 0.4872524333058193, + "learning_rate": 1.9225029024234283e-06, + "loss": 0.11, + "step": 2519 + }, + { + "epoch": 0.72, + "grad_norm": 0.3897387784932764, + "learning_rate": 1.9188576719953635e-06, + "loss": 0.1008, + "step": 2520 + }, + { + "epoch": 0.72, + "grad_norm": 0.3625974938173454, + "learning_rate": 1.9152150799143205e-06, + "loss": 0.0941, + "step": 2521 + }, + { + "epoch": 0.72, + "grad_norm": 0.282595284072661, + "learning_rate": 1.911575129299406e-06, + "loss": 0.0852, + "step": 2522 + }, + { + "epoch": 0.72, + "grad_norm": 0.8018839374923361, + "learning_rate": 1.9079378232674694e-06, + "loss": 0.1595, + "step": 2523 + }, + { + "epoch": 0.72, + "grad_norm": 1.087252907192483, + "learning_rate": 1.904303164933095e-06, + "loss": 0.0973, + "step": 2524 + }, + { + "epoch": 0.72, + "grad_norm": 0.5931258894394622, + "learning_rate": 1.9006711574086006e-06, + "loss": 0.1251, + "step": 2525 + }, + { + "epoch": 0.72, + "grad_norm": 0.5540071030415847, + "learning_rate": 1.8970418038040294e-06, + "loss": 0.1252, + "step": 2526 + }, + { + "epoch": 0.72, + "grad_norm": 0.36403783381932675, + "learning_rate": 1.8934151072271573e-06, + "loss": 0.1087, + "step": 2527 + }, + { + "epoch": 0.72, + "grad_norm": 0.4849985474767569, + "learning_rate": 1.8897910707834827e-06, + "loss": 0.1204, + "step": 2528 + }, + { + "epoch": 0.72, + "grad_norm": 0.5593547938359725, + "learning_rate": 1.886169697576229e-06, + "loss": 0.1585, + "step": 2529 + }, + { + "epoch": 0.72, + "grad_norm": 0.5711921629050263, + "learning_rate": 1.8825509907063328e-06, + "loss": 0.0873, + "step": 2530 + }, + { + "epoch": 0.72, + "grad_norm": 0.3440541183818768, + "learning_rate": 1.8789349532724538e-06, + "loss": 0.023, + "step": 2531 + }, + { + "epoch": 0.72, + "grad_norm": 0.42056902290070336, + "learning_rate": 1.875321588370963e-06, + "loss": 0.0972, + "step": 2532 + }, + { + "epoch": 0.72, + "grad_norm": 0.5454169841754962, + "learning_rate": 1.871710899095946e-06, + "loss": 0.121, + "step": 2533 + }, + { + "epoch": 0.72, + "grad_norm": 0.6885413968558864, + "learning_rate": 1.8681028885391905e-06, + "loss": 0.0916, + "step": 2534 + }, + { + "epoch": 0.72, + "grad_norm": 0.35303891701106427, + "learning_rate": 1.8644975597901977e-06, + "loss": 0.0948, + "step": 2535 + }, + { + "epoch": 0.72, + "grad_norm": 0.37936485228068184, + "learning_rate": 1.8608949159361678e-06, + "loss": 0.0715, + "step": 2536 + }, + { + "epoch": 0.72, + "grad_norm": 0.2558958213488092, + "learning_rate": 1.8572949600620065e-06, + "loss": 0.0613, + "step": 2537 + }, + { + "epoch": 0.73, + "grad_norm": 0.479947753306411, + "learning_rate": 1.8536976952503117e-06, + "loss": 0.1268, + "step": 2538 + }, + { + "epoch": 0.73, + "grad_norm": 0.6083090733298329, + "learning_rate": 1.8501031245813812e-06, + "loss": 0.1272, + "step": 2539 + }, + { + "epoch": 0.73, + "grad_norm": 0.42330265842017006, + "learning_rate": 1.8465112511332068e-06, + "loss": 0.1344, + "step": 2540 + }, + { + "epoch": 0.73, + "grad_norm": 0.22908849834727446, + "learning_rate": 1.8429220779814654e-06, + "loss": 0.0506, + "step": 2541 + }, + { + "epoch": 0.73, + "grad_norm": 0.458204674075358, + "learning_rate": 1.8393356081995268e-06, + "loss": 0.1042, + "step": 2542 + }, + { + "epoch": 0.73, + "grad_norm": 0.45881719631610296, + "learning_rate": 1.8357518448584438e-06, + "loss": 0.1073, + "step": 2543 + }, + { + "epoch": 0.73, + "grad_norm": 0.5492049235325289, + "learning_rate": 1.8321707910269543e-06, + "loss": 0.107, + "step": 2544 + }, + { + "epoch": 0.73, + "grad_norm": 0.3855242374478838, + "learning_rate": 1.8285924497714702e-06, + "loss": 0.0735, + "step": 2545 + }, + { + "epoch": 0.73, + "grad_norm": 0.6934325475627683, + "learning_rate": 1.825016824156086e-06, + "loss": 0.0981, + "step": 2546 + }, + { + "epoch": 0.73, + "grad_norm": 0.5807867962657384, + "learning_rate": 1.821443917242569e-06, + "loss": 0.1551, + "step": 2547 + }, + { + "epoch": 0.73, + "grad_norm": 0.3945357997814155, + "learning_rate": 1.81787373209036e-06, + "loss": 0.1006, + "step": 2548 + }, + { + "epoch": 0.73, + "grad_norm": 0.7882629236339732, + "learning_rate": 1.814306271756565e-06, + "loss": 0.1191, + "step": 2549 + }, + { + "epoch": 0.73, + "grad_norm": 0.2720043278141304, + "learning_rate": 1.8107415392959615e-06, + "loss": 0.0473, + "step": 2550 + }, + { + "epoch": 0.73, + "grad_norm": 0.27683130579675463, + "learning_rate": 1.8071795377609886e-06, + "loss": 0.0551, + "step": 2551 + }, + { + "epoch": 0.73, + "grad_norm": 0.6525786380014297, + "learning_rate": 1.803620270201749e-06, + "loss": 0.1733, + "step": 2552 + }, + { + "epoch": 0.73, + "grad_norm": 0.4844662615536015, + "learning_rate": 1.8000637396660008e-06, + "loss": 0.0837, + "step": 2553 + }, + { + "epoch": 0.73, + "grad_norm": 0.50943545728967, + "learning_rate": 1.796509949199161e-06, + "loss": 0.108, + "step": 2554 + }, + { + "epoch": 0.73, + "grad_norm": 0.38158254719622997, + "learning_rate": 1.7929589018443016e-06, + "loss": 0.0937, + "step": 2555 + }, + { + "epoch": 0.73, + "grad_norm": 0.28357806200464436, + "learning_rate": 1.7894106006421413e-06, + "loss": 0.0663, + "step": 2556 + }, + { + "epoch": 0.73, + "grad_norm": 1.0821239900705801, + "learning_rate": 1.7858650486310514e-06, + "loss": 0.1272, + "step": 2557 + }, + { + "epoch": 0.73, + "grad_norm": 0.48254464476777337, + "learning_rate": 1.782322248847047e-06, + "loss": 0.1075, + "step": 2558 + }, + { + "epoch": 0.73, + "grad_norm": 0.4295029014847295, + "learning_rate": 1.778782204323789e-06, + "loss": 0.1138, + "step": 2559 + }, + { + "epoch": 0.73, + "grad_norm": 0.4753672790765889, + "learning_rate": 1.7752449180925746e-06, + "loss": 0.1429, + "step": 2560 + }, + { + "epoch": 0.73, + "grad_norm": 0.32048825085718274, + "learning_rate": 1.7717103931823427e-06, + "loss": 0.0665, + "step": 2561 + }, + { + "epoch": 0.73, + "grad_norm": 0.614838330598215, + "learning_rate": 1.7681786326196665e-06, + "loss": 0.1012, + "step": 2562 + }, + { + "epoch": 0.73, + "grad_norm": 0.41390375713775546, + "learning_rate": 1.7646496394287539e-06, + "loss": 0.0989, + "step": 2563 + }, + { + "epoch": 0.73, + "grad_norm": 0.5144818871779464, + "learning_rate": 1.7611234166314383e-06, + "loss": 0.1237, + "step": 2564 + }, + { + "epoch": 0.73, + "grad_norm": 0.5951516361411493, + "learning_rate": 1.7575999672471866e-06, + "loss": 0.1359, + "step": 2565 + }, + { + "epoch": 0.73, + "grad_norm": 0.4378968878280546, + "learning_rate": 1.7540792942930867e-06, + "loss": 0.0957, + "step": 2566 + }, + { + "epoch": 0.73, + "grad_norm": 0.3247311437329441, + "learning_rate": 1.750561400783854e-06, + "loss": 0.0907, + "step": 2567 + }, + { + "epoch": 0.73, + "grad_norm": 0.3465510699911141, + "learning_rate": 1.7470462897318158e-06, + "loss": 0.0722, + "step": 2568 + }, + { + "epoch": 0.73, + "grad_norm": 0.37834003760338325, + "learning_rate": 1.743533964146924e-06, + "loss": 0.0758, + "step": 2569 + }, + { + "epoch": 0.73, + "grad_norm": 0.4777199453984389, + "learning_rate": 1.7400244270367429e-06, + "loss": 0.09, + "step": 2570 + }, + { + "epoch": 0.73, + "grad_norm": 0.43798325624572815, + "learning_rate": 1.73651768140645e-06, + "loss": 0.0629, + "step": 2571 + }, + { + "epoch": 0.73, + "grad_norm": 0.5783430551555623, + "learning_rate": 1.7330137302588296e-06, + "loss": 0.0874, + "step": 2572 + }, + { + "epoch": 0.74, + "grad_norm": 0.525030866571335, + "learning_rate": 1.7295125765942756e-06, + "loss": 0.1089, + "step": 2573 + }, + { + "epoch": 0.74, + "grad_norm": 0.24218658107287364, + "learning_rate": 1.7260142234107884e-06, + "loss": 0.0503, + "step": 2574 + }, + { + "epoch": 0.74, + "grad_norm": 1.0969761535113411, + "learning_rate": 1.7225186737039639e-06, + "loss": 0.06, + "step": 2575 + }, + { + "epoch": 0.74, + "grad_norm": 0.28402634767935336, + "learning_rate": 1.7190259304670038e-06, + "loss": 0.075, + "step": 2576 + }, + { + "epoch": 0.74, + "grad_norm": 0.5489601653379527, + "learning_rate": 1.7155359966907031e-06, + "loss": 0.1, + "step": 2577 + }, + { + "epoch": 0.74, + "grad_norm": 0.47282715089742283, + "learning_rate": 1.7120488753634557e-06, + "loss": 0.0822, + "step": 2578 + }, + { + "epoch": 0.74, + "grad_norm": 0.5390442319351164, + "learning_rate": 1.7085645694712389e-06, + "loss": 0.0481, + "step": 2579 + }, + { + "epoch": 0.74, + "grad_norm": 0.4174594573499582, + "learning_rate": 1.7050830819976266e-06, + "loss": 0.1074, + "step": 2580 + }, + { + "epoch": 0.74, + "grad_norm": 0.36223049253439804, + "learning_rate": 1.701604415923776e-06, + "loss": 0.0666, + "step": 2581 + }, + { + "epoch": 0.74, + "grad_norm": 0.33486703002453844, + "learning_rate": 1.698128574228432e-06, + "loss": 0.0927, + "step": 2582 + }, + { + "epoch": 0.74, + "grad_norm": 0.30391198102043865, + "learning_rate": 1.6946555598879138e-06, + "loss": 0.0504, + "step": 2583 + }, + { + "epoch": 0.74, + "grad_norm": 0.3586138390448678, + "learning_rate": 1.6911853758761266e-06, + "loss": 0.0703, + "step": 2584 + }, + { + "epoch": 0.74, + "grad_norm": 0.4143796337534252, + "learning_rate": 1.6877180251645487e-06, + "loss": 0.0652, + "step": 2585 + }, + { + "epoch": 0.74, + "grad_norm": 0.5238135076267086, + "learning_rate": 1.6842535107222341e-06, + "loss": 0.1455, + "step": 2586 + }, + { + "epoch": 0.74, + "grad_norm": 0.5336304169816062, + "learning_rate": 1.6807918355158047e-06, + "loss": 0.1551, + "step": 2587 + }, + { + "epoch": 0.74, + "grad_norm": 0.4029823891279307, + "learning_rate": 1.6773330025094548e-06, + "loss": 0.0804, + "step": 2588 + }, + { + "epoch": 0.74, + "grad_norm": 0.3002081949142874, + "learning_rate": 1.6738770146649452e-06, + "loss": 0.0664, + "step": 2589 + }, + { + "epoch": 0.74, + "grad_norm": 0.30477472734831845, + "learning_rate": 1.6704238749415958e-06, + "loss": 0.0567, + "step": 2590 + }, + { + "epoch": 0.74, + "grad_norm": 0.2851830159767868, + "learning_rate": 1.6669735862962921e-06, + "loss": 0.0558, + "step": 2591 + }, + { + "epoch": 0.74, + "grad_norm": 0.24950351066145945, + "learning_rate": 1.6635261516834772e-06, + "loss": 0.0734, + "step": 2592 + }, + { + "epoch": 0.74, + "grad_norm": 0.6762482960226738, + "learning_rate": 1.6600815740551506e-06, + "loss": 0.1358, + "step": 2593 + }, + { + "epoch": 0.74, + "grad_norm": 0.44321232980048486, + "learning_rate": 1.6566398563608632e-06, + "loss": 0.0553, + "step": 2594 + }, + { + "epoch": 0.74, + "grad_norm": 0.33784167126787296, + "learning_rate": 1.653201001547719e-06, + "loss": 0.0742, + "step": 2595 + }, + { + "epoch": 0.74, + "grad_norm": 0.30014213361415476, + "learning_rate": 1.649765012560371e-06, + "loss": 0.076, + "step": 2596 + }, + { + "epoch": 0.74, + "grad_norm": 0.45292133178369026, + "learning_rate": 1.6463318923410183e-06, + "loss": 0.0869, + "step": 2597 + }, + { + "epoch": 0.74, + "grad_norm": 0.6184536306772263, + "learning_rate": 1.6429016438294e-06, + "loss": 0.0879, + "step": 2598 + }, + { + "epoch": 0.74, + "grad_norm": 0.495024335209103, + "learning_rate": 1.6394742699627974e-06, + "loss": 0.0912, + "step": 2599 + }, + { + "epoch": 0.74, + "grad_norm": 0.38168924285329525, + "learning_rate": 1.6360497736760383e-06, + "loss": 0.0553, + "step": 2600 + }, + { + "epoch": 0.74, + "grad_norm": 0.4806005582767, + "learning_rate": 1.6326281579014735e-06, + "loss": 0.0978, + "step": 2601 + }, + { + "epoch": 0.74, + "grad_norm": 0.29568765197846747, + "learning_rate": 1.629209425568996e-06, + "loss": 0.0691, + "step": 2602 + }, + { + "epoch": 0.74, + "grad_norm": 0.37174080360595985, + "learning_rate": 1.625793579606027e-06, + "loss": 0.0904, + "step": 2603 + }, + { + "epoch": 0.74, + "grad_norm": 0.6006605808807327, + "learning_rate": 1.6223806229375182e-06, + "loss": 0.095, + "step": 2604 + }, + { + "epoch": 0.74, + "grad_norm": 0.6992857364133147, + "learning_rate": 1.6189705584859422e-06, + "loss": 0.1207, + "step": 2605 + }, + { + "epoch": 0.74, + "grad_norm": 0.2736248669303282, + "learning_rate": 1.6155633891712996e-06, + "loss": 0.0585, + "step": 2606 + }, + { + "epoch": 0.74, + "grad_norm": 0.7911493275423609, + "learning_rate": 1.6121591179111123e-06, + "loss": 0.137, + "step": 2607 + }, + { + "epoch": 0.75, + "grad_norm": 0.625722077009125, + "learning_rate": 1.6087577476204157e-06, + "loss": 0.1127, + "step": 2608 + }, + { + "epoch": 0.75, + "grad_norm": 0.42699049142940376, + "learning_rate": 1.6053592812117664e-06, + "loss": 0.0883, + "step": 2609 + }, + { + "epoch": 0.75, + "grad_norm": 0.660223327398503, + "learning_rate": 1.6019637215952322e-06, + "loss": 0.1435, + "step": 2610 + }, + { + "epoch": 0.75, + "grad_norm": 0.31709778065732325, + "learning_rate": 1.5985710716783936e-06, + "loss": 0.0784, + "step": 2611 + }, + { + "epoch": 0.75, + "grad_norm": 0.48417144427391934, + "learning_rate": 1.5951813343663348e-06, + "loss": 0.104, + "step": 2612 + }, + { + "epoch": 0.75, + "grad_norm": 0.25883797075508036, + "learning_rate": 1.5917945125616501e-06, + "loss": 0.0387, + "step": 2613 + }, + { + "epoch": 0.75, + "grad_norm": 0.34870233021124686, + "learning_rate": 1.588410609164438e-06, + "loss": 0.0815, + "step": 2614 + }, + { + "epoch": 0.75, + "grad_norm": 0.5309655676696245, + "learning_rate": 1.5850296270722965e-06, + "loss": 0.0907, + "step": 2615 + }, + { + "epoch": 0.75, + "grad_norm": 0.42249690499901665, + "learning_rate": 1.5816515691803191e-06, + "loss": 0.0891, + "step": 2616 + }, + { + "epoch": 0.75, + "grad_norm": 0.3646035291729176, + "learning_rate": 1.5782764383811005e-06, + "loss": 0.081, + "step": 2617 + }, + { + "epoch": 0.75, + "grad_norm": 0.2889192192777551, + "learning_rate": 1.5749042375647261e-06, + "loss": 0.0622, + "step": 2618 + }, + { + "epoch": 0.75, + "grad_norm": 0.4339159041669743, + "learning_rate": 1.5715349696187747e-06, + "loss": 0.1031, + "step": 2619 + }, + { + "epoch": 0.75, + "grad_norm": 0.3869929594165701, + "learning_rate": 1.568168637428309e-06, + "loss": 0.0828, + "step": 2620 + }, + { + "epoch": 0.75, + "grad_norm": 0.47578851341027734, + "learning_rate": 1.5648052438758815e-06, + "loss": 0.0915, + "step": 2621 + }, + { + "epoch": 0.75, + "grad_norm": 0.2456645792508491, + "learning_rate": 1.5614447918415293e-06, + "loss": 0.0696, + "step": 2622 + }, + { + "epoch": 0.75, + "grad_norm": 0.4599299314364116, + "learning_rate": 1.5580872842027695e-06, + "loss": 0.0874, + "step": 2623 + }, + { + "epoch": 0.75, + "grad_norm": 0.4053157697879296, + "learning_rate": 1.554732723834595e-06, + "loss": 0.0957, + "step": 2624 + }, + { + "epoch": 0.75, + "grad_norm": 0.20457843183326466, + "learning_rate": 1.5513811136094786e-06, + "loss": 0.0411, + "step": 2625 + }, + { + "epoch": 0.75, + "grad_norm": 0.26154833938049865, + "learning_rate": 1.5480324563973682e-06, + "loss": 0.052, + "step": 2626 + }, + { + "epoch": 0.75, + "grad_norm": 0.3119819877266621, + "learning_rate": 1.544686755065677e-06, + "loss": 0.0575, + "step": 2627 + }, + { + "epoch": 0.75, + "grad_norm": 0.4989784511629701, + "learning_rate": 1.5413440124792927e-06, + "loss": 0.113, + "step": 2628 + }, + { + "epoch": 0.75, + "grad_norm": 0.4346739099405508, + "learning_rate": 1.5380042315005678e-06, + "loss": 0.0725, + "step": 2629 + }, + { + "epoch": 0.75, + "grad_norm": 0.6709816425775076, + "learning_rate": 1.5346674149893204e-06, + "loss": 0.1315, + "step": 2630 + }, + { + "epoch": 0.75, + "grad_norm": 0.9662191359456356, + "learning_rate": 1.5313335658028243e-06, + "loss": 0.1427, + "step": 2631 + }, + { + "epoch": 0.75, + "grad_norm": 0.5578108335434087, + "learning_rate": 1.5280026867958186e-06, + "loss": 0.1261, + "step": 2632 + }, + { + "epoch": 0.75, + "grad_norm": 0.5173757681366414, + "learning_rate": 1.5246747808204975e-06, + "loss": 0.1205, + "step": 2633 + }, + { + "epoch": 0.75, + "grad_norm": 0.24859148570689224, + "learning_rate": 1.52134985072651e-06, + "loss": 0.0643, + "step": 2634 + }, + { + "epoch": 0.75, + "grad_norm": 0.1771976967966618, + "learning_rate": 1.5180278993609527e-06, + "loss": 0.0391, + "step": 2635 + }, + { + "epoch": 0.75, + "grad_norm": 0.29787364829888674, + "learning_rate": 1.5147089295683764e-06, + "loss": 0.0654, + "step": 2636 + }, + { + "epoch": 0.75, + "grad_norm": 0.38107131734558986, + "learning_rate": 1.5113929441907765e-06, + "loss": 0.1127, + "step": 2637 + }, + { + "epoch": 0.75, + "grad_norm": 0.2732293353965668, + "learning_rate": 1.508079946067596e-06, + "loss": 0.0593, + "step": 2638 + }, + { + "epoch": 0.75, + "grad_norm": 0.33349814069711353, + "learning_rate": 1.5047699380357134e-06, + "loss": 0.0625, + "step": 2639 + }, + { + "epoch": 0.75, + "grad_norm": 0.38864589430654867, + "learning_rate": 1.5014629229294525e-06, + "loss": 0.0924, + "step": 2640 + }, + { + "epoch": 0.75, + "grad_norm": 0.3331088969764812, + "learning_rate": 1.4981589035805744e-06, + "loss": 0.07, + "step": 2641 + }, + { + "epoch": 0.75, + "grad_norm": 0.41250917034250284, + "learning_rate": 1.4948578828182702e-06, + "loss": 0.1086, + "step": 2642 + }, + { + "epoch": 0.76, + "grad_norm": 0.36533927786997383, + "learning_rate": 1.4915598634691676e-06, + "loss": 0.0921, + "step": 2643 + }, + { + "epoch": 0.76, + "grad_norm": 0.8328973222067414, + "learning_rate": 1.4882648483573225e-06, + "loss": 0.0829, + "step": 2644 + }, + { + "epoch": 0.76, + "grad_norm": 0.3114824784127253, + "learning_rate": 1.4849728403042213e-06, + "loss": 0.0738, + "step": 2645 + }, + { + "epoch": 0.76, + "grad_norm": 0.5497525059585238, + "learning_rate": 1.4816838421287693e-06, + "loss": 0.0961, + "step": 2646 + }, + { + "epoch": 0.76, + "grad_norm": 0.5519244639558591, + "learning_rate": 1.4783978566472996e-06, + "loss": 0.0901, + "step": 2647 + }, + { + "epoch": 0.76, + "grad_norm": 0.39311287049845434, + "learning_rate": 1.4751148866735643e-06, + "loss": 0.0456, + "step": 2648 + }, + { + "epoch": 0.76, + "grad_norm": 0.4883511060389514, + "learning_rate": 1.4718349350187345e-06, + "loss": 0.069, + "step": 2649 + }, + { + "epoch": 0.76, + "grad_norm": 0.5873910505805034, + "learning_rate": 1.4685580044913921e-06, + "loss": 0.0668, + "step": 2650 + }, + { + "epoch": 0.76, + "grad_norm": 0.43547640692040174, + "learning_rate": 1.4652840978975375e-06, + "loss": 0.0586, + "step": 2651 + }, + { + "epoch": 0.76, + "grad_norm": 0.4276000956755098, + "learning_rate": 1.4620132180405788e-06, + "loss": 0.0707, + "step": 2652 + }, + { + "epoch": 0.76, + "grad_norm": 0.37816521473506937, + "learning_rate": 1.4587453677213348e-06, + "loss": 0.1042, + "step": 2653 + }, + { + "epoch": 0.76, + "grad_norm": 0.39808430833250824, + "learning_rate": 1.4554805497380259e-06, + "loss": 0.0614, + "step": 2654 + }, + { + "epoch": 0.76, + "grad_norm": 0.7520466931538227, + "learning_rate": 1.4522187668862797e-06, + "loss": 0.1079, + "step": 2655 + }, + { + "epoch": 0.76, + "grad_norm": 0.34863470137890984, + "learning_rate": 1.4489600219591248e-06, + "loss": 0.0877, + "step": 2656 + }, + { + "epoch": 0.76, + "grad_norm": 0.6076465010479657, + "learning_rate": 1.4457043177469853e-06, + "loss": 0.1332, + "step": 2657 + }, + { + "epoch": 0.76, + "grad_norm": 0.6521913189689925, + "learning_rate": 1.4424516570376839e-06, + "loss": 0.0818, + "step": 2658 + }, + { + "epoch": 0.76, + "grad_norm": 0.39413640679159595, + "learning_rate": 1.4392020426164372e-06, + "loss": 0.0552, + "step": 2659 + }, + { + "epoch": 0.76, + "grad_norm": 0.470630071495147, + "learning_rate": 1.4359554772658551e-06, + "loss": 0.0789, + "step": 2660 + }, + { + "epoch": 0.76, + "grad_norm": 0.4960749156861106, + "learning_rate": 1.4327119637659303e-06, + "loss": 0.1069, + "step": 2661 + }, + { + "epoch": 0.76, + "grad_norm": 0.5254663328203152, + "learning_rate": 1.4294715048940488e-06, + "loss": 0.0772, + "step": 2662 + }, + { + "epoch": 0.76, + "grad_norm": 0.31937275119922653, + "learning_rate": 1.4262341034249788e-06, + "loss": 0.0673, + "step": 2663 + }, + { + "epoch": 0.76, + "grad_norm": 0.5169102555885261, + "learning_rate": 1.4229997621308706e-06, + "loss": 0.1103, + "step": 2664 + }, + { + "epoch": 0.76, + "grad_norm": 0.37006622430508596, + "learning_rate": 1.419768483781252e-06, + "loss": 0.0499, + "step": 2665 + }, + { + "epoch": 0.76, + "grad_norm": 0.2966919863409953, + "learning_rate": 1.4165402711430304e-06, + "loss": 0.0706, + "step": 2666 + }, + { + "epoch": 0.76, + "grad_norm": 0.5666527249676943, + "learning_rate": 1.4133151269804873e-06, + "loss": 0.1068, + "step": 2667 + }, + { + "epoch": 0.76, + "grad_norm": 0.3540776948842063, + "learning_rate": 1.4100930540552793e-06, + "loss": 0.0777, + "step": 2668 + }, + { + "epoch": 0.76, + "grad_norm": 0.4122198000786065, + "learning_rate": 1.4068740551264271e-06, + "loss": 0.0776, + "step": 2669 + }, + { + "epoch": 0.76, + "grad_norm": 0.5090814806700498, + "learning_rate": 1.4036581329503245e-06, + "loss": 0.0764, + "step": 2670 + }, + { + "epoch": 0.76, + "grad_norm": 0.320180169936241, + "learning_rate": 1.4004452902807287e-06, + "loss": 0.095, + "step": 2671 + }, + { + "epoch": 0.76, + "grad_norm": 0.4390591330106556, + "learning_rate": 1.3972355298687618e-06, + "loss": 0.0891, + "step": 2672 + }, + { + "epoch": 0.76, + "grad_norm": 0.34451407300811754, + "learning_rate": 1.3940288544629016e-06, + "loss": 0.0808, + "step": 2673 + }, + { + "epoch": 0.76, + "grad_norm": 0.28701435077378057, + "learning_rate": 1.39082526680899e-06, + "loss": 0.0826, + "step": 2674 + }, + { + "epoch": 0.76, + "grad_norm": 0.3053081800023214, + "learning_rate": 1.3876247696502238e-06, + "loss": 0.0716, + "step": 2675 + }, + { + "epoch": 0.76, + "grad_norm": 0.5194267560372032, + "learning_rate": 1.3844273657271484e-06, + "loss": 0.081, + "step": 2676 + }, + { + "epoch": 0.76, + "grad_norm": 0.387820089754962, + "learning_rate": 1.3812330577776673e-06, + "loss": 0.0796, + "step": 2677 + }, + { + "epoch": 0.77, + "grad_norm": 0.3732618211107229, + "learning_rate": 1.3780418485370284e-06, + "loss": 0.0913, + "step": 2678 + }, + { + "epoch": 0.77, + "grad_norm": 0.26776979055421385, + "learning_rate": 1.3748537407378304e-06, + "loss": 0.0673, + "step": 2679 + }, + { + "epoch": 0.77, + "grad_norm": 0.5440561516501896, + "learning_rate": 1.3716687371100096e-06, + "loss": 0.0989, + "step": 2680 + }, + { + "epoch": 0.77, + "grad_norm": 0.9010406860477679, + "learning_rate": 1.368486840380851e-06, + "loss": 0.1329, + "step": 2681 + }, + { + "epoch": 0.77, + "grad_norm": 0.3394260000622673, + "learning_rate": 1.3653080532749752e-06, + "loss": 0.1006, + "step": 2682 + }, + { + "epoch": 0.77, + "grad_norm": 0.6340984530569764, + "learning_rate": 1.3621323785143442e-06, + "loss": 0.1268, + "step": 2683 + }, + { + "epoch": 0.77, + "grad_norm": 0.2631174349729944, + "learning_rate": 1.358959818818248e-06, + "loss": 0.0731, + "step": 2684 + }, + { + "epoch": 0.77, + "grad_norm": 0.35241267836190354, + "learning_rate": 1.355790376903315e-06, + "loss": 0.0659, + "step": 2685 + }, + { + "epoch": 0.77, + "grad_norm": 0.28811053331300185, + "learning_rate": 1.3526240554835017e-06, + "loss": 0.0658, + "step": 2686 + }, + { + "epoch": 0.77, + "grad_norm": 0.6916709139010689, + "learning_rate": 1.3494608572700957e-06, + "loss": 0.1236, + "step": 2687 + }, + { + "epoch": 0.77, + "grad_norm": 0.32963589759427525, + "learning_rate": 1.3463007849717035e-06, + "loss": 0.0517, + "step": 2688 + }, + { + "epoch": 0.77, + "grad_norm": 0.5085408235852549, + "learning_rate": 1.343143841294261e-06, + "loss": 0.0887, + "step": 2689 + }, + { + "epoch": 0.77, + "grad_norm": 0.4967939438389619, + "learning_rate": 1.3399900289410245e-06, + "loss": 0.0866, + "step": 2690 + }, + { + "epoch": 0.77, + "grad_norm": 0.29057950619970113, + "learning_rate": 1.3368393506125644e-06, + "loss": 0.048, + "step": 2691 + }, + { + "epoch": 0.77, + "grad_norm": 0.3555035605248169, + "learning_rate": 1.3336918090067719e-06, + "loss": 0.0808, + "step": 2692 + }, + { + "epoch": 0.77, + "grad_norm": 0.3354657285502833, + "learning_rate": 1.3305474068188511e-06, + "loss": 0.0741, + "step": 2693 + }, + { + "epoch": 0.77, + "grad_norm": 0.49188656269912656, + "learning_rate": 1.3274061467413197e-06, + "loss": 0.1053, + "step": 2694 + }, + { + "epoch": 0.77, + "grad_norm": 0.544736178863369, + "learning_rate": 1.3242680314639995e-06, + "loss": 0.1306, + "step": 2695 + }, + { + "epoch": 0.77, + "grad_norm": 0.3153382932370989, + "learning_rate": 1.3211330636740238e-06, + "loss": 0.0634, + "step": 2696 + }, + { + "epoch": 0.77, + "grad_norm": 0.709153732467823, + "learning_rate": 1.31800124605583e-06, + "loss": 0.1374, + "step": 2697 + }, + { + "epoch": 0.77, + "grad_norm": 0.35550602803712794, + "learning_rate": 1.31487258129116e-06, + "loss": 0.0894, + "step": 2698 + }, + { + "epoch": 0.77, + "grad_norm": 0.4916818164442743, + "learning_rate": 1.3117470720590502e-06, + "loss": 0.1085, + "step": 2699 + }, + { + "epoch": 0.77, + "grad_norm": 0.7291601707842152, + "learning_rate": 1.30862472103584e-06, + "loss": 0.168, + "step": 2700 + }, + { + "epoch": 0.77, + "grad_norm": 0.4091074135324916, + "learning_rate": 1.305505530895163e-06, + "loss": 0.0869, + "step": 2701 + }, + { + "epoch": 0.77, + "grad_norm": 0.589806944907326, + "learning_rate": 1.3023895043079476e-06, + "loss": 0.1101, + "step": 2702 + }, + { + "epoch": 0.77, + "grad_norm": 0.4937937398097772, + "learning_rate": 1.2992766439424087e-06, + "loss": 0.1005, + "step": 2703 + }, + { + "epoch": 0.77, + "grad_norm": 0.25658833971200207, + "learning_rate": 1.2961669524640546e-06, + "loss": 0.0493, + "step": 2704 + }, + { + "epoch": 0.77, + "grad_norm": 0.628315964705664, + "learning_rate": 1.2930604325356793e-06, + "loss": 0.0886, + "step": 2705 + }, + { + "epoch": 0.77, + "grad_norm": 0.7835084995549616, + "learning_rate": 1.2899570868173594e-06, + "loss": 0.1345, + "step": 2706 + }, + { + "epoch": 0.77, + "grad_norm": 0.3607217679322699, + "learning_rate": 1.2868569179664553e-06, + "loss": 0.0909, + "step": 2707 + }, + { + "epoch": 0.77, + "grad_norm": 0.4026200411189742, + "learning_rate": 1.2837599286376068e-06, + "loss": 0.1031, + "step": 2708 + }, + { + "epoch": 0.77, + "grad_norm": 0.42305516993564185, + "learning_rate": 1.2806661214827286e-06, + "loss": 0.0747, + "step": 2709 + }, + { + "epoch": 0.77, + "grad_norm": 0.710974738363163, + "learning_rate": 1.277575499151013e-06, + "loss": 0.0961, + "step": 2710 + }, + { + "epoch": 0.77, + "grad_norm": 0.28907775814873893, + "learning_rate": 1.2744880642889252e-06, + "loss": 0.0978, + "step": 2711 + }, + { + "epoch": 0.77, + "grad_norm": 0.38625794475406194, + "learning_rate": 1.2714038195402012e-06, + "loss": 0.0776, + "step": 2712 + }, + { + "epoch": 0.78, + "grad_norm": 0.4519225312945511, + "learning_rate": 1.2683227675458425e-06, + "loss": 0.1095, + "step": 2713 + }, + { + "epoch": 0.78, + "grad_norm": 0.40309345074738945, + "learning_rate": 1.2652449109441201e-06, + "loss": 0.0916, + "step": 2714 + }, + { + "epoch": 0.78, + "grad_norm": 0.3047906846411138, + "learning_rate": 1.2621702523705676e-06, + "loss": 0.072, + "step": 2715 + }, + { + "epoch": 0.78, + "grad_norm": 0.6843646586237986, + "learning_rate": 1.2590987944579808e-06, + "loss": 0.087, + "step": 2716 + }, + { + "epoch": 0.78, + "grad_norm": 0.41241601177748055, + "learning_rate": 1.2560305398364114e-06, + "loss": 0.0708, + "step": 2717 + }, + { + "epoch": 0.78, + "grad_norm": 0.5304004162365995, + "learning_rate": 1.2529654911331728e-06, + "loss": 0.1086, + "step": 2718 + }, + { + "epoch": 0.78, + "grad_norm": 0.44528453113911076, + "learning_rate": 1.2499036509728307e-06, + "loss": 0.0896, + "step": 2719 + }, + { + "epoch": 0.78, + "grad_norm": 0.7207219633116309, + "learning_rate": 1.2468450219772054e-06, + "loss": 0.1107, + "step": 2720 + }, + { + "epoch": 0.78, + "grad_norm": 0.3741186812084746, + "learning_rate": 1.2437896067653638e-06, + "loss": 0.1031, + "step": 2721 + }, + { + "epoch": 0.78, + "grad_norm": 0.6283162647257376, + "learning_rate": 1.2407374079536239e-06, + "loss": 0.1327, + "step": 2722 + }, + { + "epoch": 0.78, + "grad_norm": 0.6475774510135026, + "learning_rate": 1.2376884281555485e-06, + "loss": 0.1085, + "step": 2723 + }, + { + "epoch": 0.78, + "grad_norm": 0.35222383764753795, + "learning_rate": 1.234642669981946e-06, + "loss": 0.086, + "step": 2724 + }, + { + "epoch": 0.78, + "grad_norm": 0.6922336687152492, + "learning_rate": 1.2316001360408614e-06, + "loss": 0.1373, + "step": 2725 + }, + { + "epoch": 0.78, + "grad_norm": 0.44725196472203255, + "learning_rate": 1.2285608289375833e-06, + "loss": 0.0615, + "step": 2726 + }, + { + "epoch": 0.78, + "grad_norm": 0.3213998336436028, + "learning_rate": 1.2255247512746372e-06, + "loss": 0.0642, + "step": 2727 + }, + { + "epoch": 0.78, + "grad_norm": 0.5776731153630892, + "learning_rate": 1.2224919056517786e-06, + "loss": 0.098, + "step": 2728 + }, + { + "epoch": 0.78, + "grad_norm": 0.40259312510282574, + "learning_rate": 1.2194622946660001e-06, + "loss": 0.0948, + "step": 2729 + }, + { + "epoch": 0.78, + "grad_norm": 0.686430169671714, + "learning_rate": 1.2164359209115235e-06, + "loss": 0.1243, + "step": 2730 + }, + { + "epoch": 0.78, + "grad_norm": 0.7633711815377928, + "learning_rate": 1.2134127869797991e-06, + "loss": 0.1514, + "step": 2731 + }, + { + "epoch": 0.78, + "grad_norm": 0.5600011653870505, + "learning_rate": 1.210392895459499e-06, + "loss": 0.1149, + "step": 2732 + }, + { + "epoch": 0.78, + "grad_norm": 0.2967848509388518, + "learning_rate": 1.2073762489365242e-06, + "loss": 0.076, + "step": 2733 + }, + { + "epoch": 0.78, + "grad_norm": 0.4489149559111999, + "learning_rate": 1.2043628499939935e-06, + "loss": 0.0992, + "step": 2734 + }, + { + "epoch": 0.78, + "grad_norm": 0.46295267641964, + "learning_rate": 1.2013527012122477e-06, + "loss": 0.0889, + "step": 2735 + }, + { + "epoch": 0.78, + "grad_norm": 0.8328140275817446, + "learning_rate": 1.1983458051688406e-06, + "loss": 0.0965, + "step": 2736 + }, + { + "epoch": 0.78, + "grad_norm": 0.41845875458614895, + "learning_rate": 1.1953421644385444e-06, + "loss": 0.0948, + "step": 2737 + }, + { + "epoch": 0.78, + "grad_norm": 0.49620564930349254, + "learning_rate": 1.192341781593342e-06, + "loss": 0.0816, + "step": 2738 + }, + { + "epoch": 0.78, + "grad_norm": 0.38504477754165445, + "learning_rate": 1.1893446592024283e-06, + "loss": 0.1031, + "step": 2739 + }, + { + "epoch": 0.78, + "grad_norm": 0.39016458551382455, + "learning_rate": 1.1863507998322022e-06, + "loss": 0.069, + "step": 2740 + }, + { + "epoch": 0.78, + "grad_norm": 0.22827157128847111, + "learning_rate": 1.1833602060462728e-06, + "loss": 0.0586, + "step": 2741 + }, + { + "epoch": 0.78, + "grad_norm": 0.8874348290726536, + "learning_rate": 1.1803728804054531e-06, + "loss": 0.1065, + "step": 2742 + }, + { + "epoch": 0.78, + "grad_norm": 0.3131888683682014, + "learning_rate": 1.177388825467753e-06, + "loss": 0.0751, + "step": 2743 + }, + { + "epoch": 0.78, + "grad_norm": 0.5194046067680366, + "learning_rate": 1.1744080437883859e-06, + "loss": 0.0593, + "step": 2744 + }, + { + "epoch": 0.78, + "grad_norm": 0.27003515092720987, + "learning_rate": 1.1714305379197616e-06, + "loss": 0.0821, + "step": 2745 + }, + { + "epoch": 0.78, + "grad_norm": 0.4808487609610235, + "learning_rate": 1.1684563104114855e-06, + "loss": 0.0853, + "step": 2746 + }, + { + "epoch": 0.78, + "grad_norm": 0.5099143639466569, + "learning_rate": 1.165485363810352e-06, + "loss": 0.1072, + "step": 2747 + }, + { + "epoch": 0.79, + "grad_norm": 0.319803676365498, + "learning_rate": 1.16251770066035e-06, + "loss": 0.0704, + "step": 2748 + }, + { + "epoch": 0.79, + "grad_norm": 0.365630626273584, + "learning_rate": 1.1595533235026553e-06, + "loss": 0.0597, + "step": 2749 + }, + { + "epoch": 0.79, + "grad_norm": 0.3697889630605743, + "learning_rate": 1.1565922348756324e-06, + "loss": 0.1002, + "step": 2750 + }, + { + "epoch": 0.79, + "grad_norm": 0.3913128920524271, + "learning_rate": 1.1536344373148245e-06, + "loss": 0.0925, + "step": 2751 + }, + { + "epoch": 0.79, + "grad_norm": 0.5866744171195533, + "learning_rate": 1.1506799333529612e-06, + "loss": 0.1068, + "step": 2752 + }, + { + "epoch": 0.79, + "grad_norm": 0.3769226940647506, + "learning_rate": 1.1477287255199505e-06, + "loss": 0.0579, + "step": 2753 + }, + { + "epoch": 0.79, + "grad_norm": 0.2894931067593348, + "learning_rate": 1.1447808163428797e-06, + "loss": 0.0452, + "step": 2754 + }, + { + "epoch": 0.79, + "grad_norm": 0.43340750097569425, + "learning_rate": 1.1418362083460067e-06, + "loss": 0.1164, + "step": 2755 + }, + { + "epoch": 0.79, + "grad_norm": 0.6733334804824626, + "learning_rate": 1.1388949040507675e-06, + "loss": 0.1172, + "step": 2756 + }, + { + "epoch": 0.79, + "grad_norm": 0.3527328121841945, + "learning_rate": 1.1359569059757687e-06, + "loss": 0.0743, + "step": 2757 + }, + { + "epoch": 0.79, + "grad_norm": 0.60009730725847, + "learning_rate": 1.133022216636781e-06, + "loss": 0.0965, + "step": 2758 + }, + { + "epoch": 0.79, + "grad_norm": 0.620437087256445, + "learning_rate": 1.130090838546748e-06, + "loss": 0.1133, + "step": 2759 + }, + { + "epoch": 0.79, + "grad_norm": 0.3329959959511501, + "learning_rate": 1.1271627742157743e-06, + "loss": 0.0662, + "step": 2760 + }, + { + "epoch": 0.79, + "grad_norm": 0.37892833642644586, + "learning_rate": 1.1242380261511304e-06, + "loss": 0.0842, + "step": 2761 + }, + { + "epoch": 0.79, + "grad_norm": 0.3817881696008621, + "learning_rate": 1.121316596857241e-06, + "loss": 0.0553, + "step": 2762 + }, + { + "epoch": 0.79, + "grad_norm": 0.24201730005673097, + "learning_rate": 1.1183984888356941e-06, + "loss": 0.0628, + "step": 2763 + }, + { + "epoch": 0.79, + "grad_norm": 0.35583340694806587, + "learning_rate": 1.1154837045852323e-06, + "loss": 0.0974, + "step": 2764 + }, + { + "epoch": 0.79, + "grad_norm": 0.34386430017657676, + "learning_rate": 1.1125722466017547e-06, + "loss": 0.0806, + "step": 2765 + }, + { + "epoch": 0.79, + "grad_norm": 0.5910163295261405, + "learning_rate": 1.1096641173783051e-06, + "loss": 0.0937, + "step": 2766 + }, + { + "epoch": 0.79, + "grad_norm": 0.4034154149418227, + "learning_rate": 1.1067593194050836e-06, + "loss": 0.1148, + "step": 2767 + }, + { + "epoch": 0.79, + "grad_norm": 0.5189484582631975, + "learning_rate": 1.1038578551694356e-06, + "loss": 0.0668, + "step": 2768 + }, + { + "epoch": 0.79, + "grad_norm": 0.3060186904872248, + "learning_rate": 1.100959727155853e-06, + "loss": 0.0667, + "step": 2769 + }, + { + "epoch": 0.79, + "grad_norm": 0.30294171126259456, + "learning_rate": 1.0980649378459668e-06, + "loss": 0.0731, + "step": 2770 + }, + { + "epoch": 0.79, + "grad_norm": 0.3874341784461755, + "learning_rate": 1.095173489718554e-06, + "loss": 0.09, + "step": 2771 + }, + { + "epoch": 0.79, + "grad_norm": 0.434440144321733, + "learning_rate": 1.092285385249528e-06, + "loss": 0.0855, + "step": 2772 + }, + { + "epoch": 0.79, + "grad_norm": 0.6359252146047165, + "learning_rate": 1.0894006269119418e-06, + "loss": 0.1389, + "step": 2773 + }, + { + "epoch": 0.79, + "grad_norm": 0.4430630849907701, + "learning_rate": 1.0865192171759775e-06, + "loss": 0.0776, + "step": 2774 + }, + { + "epoch": 0.79, + "grad_norm": 0.43476186080572843, + "learning_rate": 1.083641158508955e-06, + "loss": 0.0824, + "step": 2775 + }, + { + "epoch": 0.79, + "grad_norm": 1.1742104617023283, + "learning_rate": 1.0807664533753254e-06, + "loss": 0.0867, + "step": 2776 + }, + { + "epoch": 0.79, + "grad_norm": 0.334150985656815, + "learning_rate": 1.077895104236662e-06, + "loss": 0.085, + "step": 2777 + }, + { + "epoch": 0.79, + "grad_norm": 0.24993114212790044, + "learning_rate": 1.0750271135516699e-06, + "loss": 0.0591, + "step": 2778 + }, + { + "epoch": 0.79, + "grad_norm": 0.36716776788229694, + "learning_rate": 1.0721624837761768e-06, + "loss": 0.0766, + "step": 2779 + }, + { + "epoch": 0.79, + "grad_norm": 0.45376370633964364, + "learning_rate": 1.0693012173631346e-06, + "loss": 0.0637, + "step": 2780 + }, + { + "epoch": 0.79, + "grad_norm": 0.437864184116154, + "learning_rate": 1.0664433167626093e-06, + "loss": 0.0825, + "step": 2781 + }, + { + "epoch": 0.79, + "grad_norm": 0.370380840422749, + "learning_rate": 1.06358878442179e-06, + "loss": 0.0934, + "step": 2782 + }, + { + "epoch": 0.8, + "grad_norm": 0.2662176212691549, + "learning_rate": 1.0607376227849814e-06, + "loss": 0.0615, + "step": 2783 + }, + { + "epoch": 0.8, + "grad_norm": 0.3817764874201513, + "learning_rate": 1.0578898342936005e-06, + "loss": 0.1027, + "step": 2784 + }, + { + "epoch": 0.8, + "grad_norm": 0.2218744021163004, + "learning_rate": 1.055045421386175e-06, + "loss": 0.0579, + "step": 2785 + }, + { + "epoch": 0.8, + "grad_norm": 0.4552660320663978, + "learning_rate": 1.0522043864983428e-06, + "loss": 0.0851, + "step": 2786 + }, + { + "epoch": 0.8, + "grad_norm": 0.3729685725361397, + "learning_rate": 1.049366732062851e-06, + "loss": 0.0912, + "step": 2787 + }, + { + "epoch": 0.8, + "grad_norm": 0.7396419953001301, + "learning_rate": 1.0465324605095518e-06, + "loss": 0.1302, + "step": 2788 + }, + { + "epoch": 0.8, + "grad_norm": 0.3208893902931182, + "learning_rate": 1.043701574265396e-06, + "loss": 0.0734, + "step": 2789 + }, + { + "epoch": 0.8, + "grad_norm": 0.4076777130770347, + "learning_rate": 1.0408740757544416e-06, + "loss": 0.1091, + "step": 2790 + }, + { + "epoch": 0.8, + "grad_norm": 0.33703014908359386, + "learning_rate": 1.0380499673978445e-06, + "loss": 0.0313, + "step": 2791 + }, + { + "epoch": 0.8, + "grad_norm": 0.20277498351388953, + "learning_rate": 1.0352292516138534e-06, + "loss": 0.0632, + "step": 2792 + }, + { + "epoch": 0.8, + "grad_norm": 0.36471426024703923, + "learning_rate": 1.0324119308178166e-06, + "loss": 0.0855, + "step": 2793 + }, + { + "epoch": 0.8, + "grad_norm": 0.2687099659071702, + "learning_rate": 1.0295980074221734e-06, + "loss": 0.0535, + "step": 2794 + }, + { + "epoch": 0.8, + "grad_norm": 0.6404241662040591, + "learning_rate": 1.0267874838364561e-06, + "loss": 0.0761, + "step": 2795 + }, + { + "epoch": 0.8, + "grad_norm": 0.3952340607681106, + "learning_rate": 1.0239803624672812e-06, + "loss": 0.0924, + "step": 2796 + }, + { + "epoch": 0.8, + "grad_norm": 0.34656500673155227, + "learning_rate": 1.0211766457183559e-06, + "loss": 0.0727, + "step": 2797 + }, + { + "epoch": 0.8, + "grad_norm": 0.620946135308157, + "learning_rate": 1.0183763359904709e-06, + "loss": 0.0991, + "step": 2798 + }, + { + "epoch": 0.8, + "grad_norm": 0.3575695250399971, + "learning_rate": 1.0155794356815008e-06, + "loss": 0.049, + "step": 2799 + }, + { + "epoch": 0.8, + "grad_norm": 0.43396464675005203, + "learning_rate": 1.012785947186397e-06, + "loss": 0.0992, + "step": 2800 + }, + { + "epoch": 0.8, + "grad_norm": 0.5674349734622195, + "learning_rate": 1.0099958728971927e-06, + "loss": 0.0978, + "step": 2801 + }, + { + "epoch": 0.8, + "grad_norm": 0.5453517603974076, + "learning_rate": 1.007209215202997e-06, + "loss": 0.0801, + "step": 2802 + }, + { + "epoch": 0.8, + "grad_norm": 0.2618303304569744, + "learning_rate": 1.0044259764899944e-06, + "loss": 0.0616, + "step": 2803 + }, + { + "epoch": 0.8, + "grad_norm": 0.27981141303459556, + "learning_rate": 1.001646159141438e-06, + "loss": 0.0566, + "step": 2804 + }, + { + "epoch": 0.8, + "grad_norm": 0.34849611796154445, + "learning_rate": 9.988697655376544e-07, + "loss": 0.0783, + "step": 2805 + }, + { + "epoch": 0.8, + "grad_norm": 0.37035816726159837, + "learning_rate": 9.960967980560381e-07, + "loss": 0.0831, + "step": 2806 + }, + { + "epoch": 0.8, + "grad_norm": 0.9066553222055533, + "learning_rate": 9.933272590710508e-07, + "loss": 0.1539, + "step": 2807 + }, + { + "epoch": 0.8, + "grad_norm": 0.5795957607561886, + "learning_rate": 9.905611509542152e-07, + "loss": 0.0639, + "step": 2808 + }, + { + "epoch": 0.8, + "grad_norm": 0.3869834235549946, + "learning_rate": 9.877984760741178e-07, + "loss": 0.107, + "step": 2809 + }, + { + "epoch": 0.8, + "grad_norm": 0.4471845905449251, + "learning_rate": 9.850392367964085e-07, + "loss": 0.0869, + "step": 2810 + }, + { + "epoch": 0.8, + "grad_norm": 0.3496629403015358, + "learning_rate": 9.82283435483788e-07, + "loss": 0.07, + "step": 2811 + }, + { + "epoch": 0.8, + "grad_norm": 0.3740387045768065, + "learning_rate": 9.795310744960208e-07, + "loss": 0.0695, + "step": 2812 + }, + { + "epoch": 0.8, + "grad_norm": 0.7292364365628232, + "learning_rate": 9.767821561899228e-07, + "loss": 0.1467, + "step": 2813 + }, + { + "epoch": 0.8, + "grad_norm": 0.4994691523909614, + "learning_rate": 9.740366829193587e-07, + "loss": 0.0868, + "step": 2814 + }, + { + "epoch": 0.8, + "grad_norm": 0.38711287893410873, + "learning_rate": 9.71294657035247e-07, + "loss": 0.0938, + "step": 2815 + }, + { + "epoch": 0.8, + "grad_norm": 0.24490457740145347, + "learning_rate": 9.685560808855544e-07, + "loss": 0.0408, + "step": 2816 + }, + { + "epoch": 0.8, + "grad_norm": 0.5419294497471646, + "learning_rate": 9.658209568152916e-07, + "loss": 0.1329, + "step": 2817 + }, + { + "epoch": 0.81, + "grad_norm": 0.45460720715635144, + "learning_rate": 9.630892871665133e-07, + "loss": 0.111, + "step": 2818 + }, + { + "epoch": 0.81, + "grad_norm": 0.46255716715647505, + "learning_rate": 9.60361074278317e-07, + "loss": 0.0727, + "step": 2819 + }, + { + "epoch": 0.81, + "grad_norm": 0.4061108346522638, + "learning_rate": 9.576363204868417e-07, + "loss": 0.0513, + "step": 2820 + }, + { + "epoch": 0.81, + "grad_norm": 0.6744081389247782, + "learning_rate": 9.549150281252633e-07, + "loss": 0.122, + "step": 2821 + }, + { + "epoch": 0.81, + "grad_norm": 0.4360874287989033, + "learning_rate": 9.521971995237911e-07, + "loss": 0.1174, + "step": 2822 + }, + { + "epoch": 0.81, + "grad_norm": 0.2643017902959821, + "learning_rate": 9.494828370096715e-07, + "loss": 0.0643, + "step": 2823 + }, + { + "epoch": 0.81, + "grad_norm": 0.5564283230338363, + "learning_rate": 9.467719429071831e-07, + "loss": 0.099, + "step": 2824 + }, + { + "epoch": 0.81, + "grad_norm": 0.5505495465238062, + "learning_rate": 9.440645195376341e-07, + "loss": 0.0761, + "step": 2825 + }, + { + "epoch": 0.81, + "grad_norm": 0.30881846748015823, + "learning_rate": 9.413605692193584e-07, + "loss": 0.0837, + "step": 2826 + }, + { + "epoch": 0.81, + "grad_norm": 0.550776009963471, + "learning_rate": 9.386600942677182e-07, + "loss": 0.0695, + "step": 2827 + }, + { + "epoch": 0.81, + "grad_norm": 0.6265061663190412, + "learning_rate": 9.359630969951012e-07, + "loss": 0.1013, + "step": 2828 + }, + { + "epoch": 0.81, + "grad_norm": 0.3923154026245669, + "learning_rate": 9.332695797109132e-07, + "loss": 0.0494, + "step": 2829 + }, + { + "epoch": 0.81, + "grad_norm": 0.5266718204069095, + "learning_rate": 9.305795447215827e-07, + "loss": 0.1391, + "step": 2830 + }, + { + "epoch": 0.81, + "grad_norm": 0.34860035589051297, + "learning_rate": 9.278929943305564e-07, + "loss": 0.0686, + "step": 2831 + }, + { + "epoch": 0.81, + "grad_norm": 0.5124762529182002, + "learning_rate": 9.252099308382978e-07, + "loss": 0.0879, + "step": 2832 + }, + { + "epoch": 0.81, + "grad_norm": 0.5698073998785758, + "learning_rate": 9.225303565422817e-07, + "loss": 0.0959, + "step": 2833 + }, + { + "epoch": 0.81, + "grad_norm": 0.31438160295090767, + "learning_rate": 9.198542737369976e-07, + "loss": 0.0622, + "step": 2834 + }, + { + "epoch": 0.81, + "grad_norm": 0.6483003468673872, + "learning_rate": 9.171816847139447e-07, + "loss": 0.0963, + "step": 2835 + }, + { + "epoch": 0.81, + "grad_norm": 0.41165188927663543, + "learning_rate": 9.145125917616327e-07, + "loss": 0.0749, + "step": 2836 + }, + { + "epoch": 0.81, + "grad_norm": 0.4264223221349069, + "learning_rate": 9.118469971655713e-07, + "loss": 0.0961, + "step": 2837 + }, + { + "epoch": 0.81, + "grad_norm": 0.4301376449654001, + "learning_rate": 9.091849032082812e-07, + "loss": 0.1109, + "step": 2838 + }, + { + "epoch": 0.81, + "grad_norm": 0.576459655052675, + "learning_rate": 9.06526312169283e-07, + "loss": 0.1437, + "step": 2839 + }, + { + "epoch": 0.81, + "grad_norm": 0.1932529341020626, + "learning_rate": 9.038712263250982e-07, + "loss": 0.0204, + "step": 2840 + }, + { + "epoch": 0.81, + "grad_norm": 0.3484701734942682, + "learning_rate": 9.012196479492452e-07, + "loss": 0.0963, + "step": 2841 + }, + { + "epoch": 0.81, + "grad_norm": 0.6629192358786643, + "learning_rate": 8.985715793122407e-07, + "loss": 0.1323, + "step": 2842 + }, + { + "epoch": 0.81, + "grad_norm": 0.3534822491024164, + "learning_rate": 8.959270226815975e-07, + "loss": 0.085, + "step": 2843 + }, + { + "epoch": 0.81, + "grad_norm": 0.19751266473335766, + "learning_rate": 8.932859803218164e-07, + "loss": 0.0493, + "step": 2844 + }, + { + "epoch": 0.81, + "grad_norm": 0.3329729549344692, + "learning_rate": 8.906484544943933e-07, + "loss": 0.0901, + "step": 2845 + }, + { + "epoch": 0.81, + "grad_norm": 0.3335366646419601, + "learning_rate": 8.880144474578112e-07, + "loss": 0.0798, + "step": 2846 + }, + { + "epoch": 0.81, + "grad_norm": 0.5108606944550274, + "learning_rate": 8.853839614675419e-07, + "loss": 0.061, + "step": 2847 + }, + { + "epoch": 0.81, + "grad_norm": 0.4623437829139172, + "learning_rate": 8.827569987760376e-07, + "loss": 0.0543, + "step": 2848 + }, + { + "epoch": 0.81, + "grad_norm": 0.6842205149737411, + "learning_rate": 8.801335616327378e-07, + "loss": 0.0805, + "step": 2849 + }, + { + "epoch": 0.81, + "grad_norm": 0.3178162051179874, + "learning_rate": 8.775136522840622e-07, + "loss": 0.0655, + "step": 2850 + }, + { + "epoch": 0.81, + "grad_norm": 0.5848830188373092, + "learning_rate": 8.748972729734106e-07, + "loss": 0.1373, + "step": 2851 + }, + { + "epoch": 0.81, + "grad_norm": 0.4276547095637747, + "learning_rate": 8.722844259411556e-07, + "loss": 0.0956, + "step": 2852 + }, + { + "epoch": 0.82, + "grad_norm": 0.3585215992558086, + "learning_rate": 8.696751134246495e-07, + "loss": 0.0691, + "step": 2853 + }, + { + "epoch": 0.82, + "grad_norm": 0.40274959039113967, + "learning_rate": 8.670693376582178e-07, + "loss": 0.0698, + "step": 2854 + }, + { + "epoch": 0.82, + "grad_norm": 0.4248036823633964, + "learning_rate": 8.64467100873157e-07, + "loss": 0.0524, + "step": 2855 + }, + { + "epoch": 0.82, + "grad_norm": 0.3126984161887551, + "learning_rate": 8.618684052977305e-07, + "loss": 0.06, + "step": 2856 + }, + { + "epoch": 0.82, + "grad_norm": 0.39044460133519976, + "learning_rate": 8.592732531571729e-07, + "loss": 0.0793, + "step": 2857 + }, + { + "epoch": 0.82, + "grad_norm": 0.25780148761497845, + "learning_rate": 8.566816466736839e-07, + "loss": 0.0702, + "step": 2858 + }, + { + "epoch": 0.82, + "grad_norm": 0.4383894880905106, + "learning_rate": 8.540935880664264e-07, + "loss": 0.0866, + "step": 2859 + }, + { + "epoch": 0.82, + "grad_norm": 0.7016608638537952, + "learning_rate": 8.515090795515247e-07, + "loss": 0.1351, + "step": 2860 + }, + { + "epoch": 0.82, + "grad_norm": 0.5702103761570979, + "learning_rate": 8.489281233420638e-07, + "loss": 0.0721, + "step": 2861 + }, + { + "epoch": 0.82, + "grad_norm": 0.5198763043413318, + "learning_rate": 8.463507216480888e-07, + "loss": 0.1277, + "step": 2862 + }, + { + "epoch": 0.82, + "grad_norm": 0.30581526914788437, + "learning_rate": 8.437768766765975e-07, + "loss": 0.0745, + "step": 2863 + }, + { + "epoch": 0.82, + "grad_norm": 0.701597939475846, + "learning_rate": 8.412065906315442e-07, + "loss": 0.1264, + "step": 2864 + }, + { + "epoch": 0.82, + "grad_norm": 0.33795592011986664, + "learning_rate": 8.386398657138356e-07, + "loss": 0.1102, + "step": 2865 + }, + { + "epoch": 0.82, + "grad_norm": 0.32917807335753546, + "learning_rate": 8.360767041213313e-07, + "loss": 0.0669, + "step": 2866 + }, + { + "epoch": 0.82, + "grad_norm": 0.502718544738482, + "learning_rate": 8.335171080488335e-07, + "loss": 0.1058, + "step": 2867 + }, + { + "epoch": 0.82, + "grad_norm": 0.3955852977521546, + "learning_rate": 8.309610796880974e-07, + "loss": 0.0872, + "step": 2868 + }, + { + "epoch": 0.82, + "grad_norm": 0.2774387218426999, + "learning_rate": 8.284086212278203e-07, + "loss": 0.0546, + "step": 2869 + }, + { + "epoch": 0.82, + "grad_norm": 0.5783304515013379, + "learning_rate": 8.258597348536452e-07, + "loss": 0.1034, + "step": 2870 + }, + { + "epoch": 0.82, + "grad_norm": 0.4458221235508557, + "learning_rate": 8.23314422748151e-07, + "loss": 0.1201, + "step": 2871 + }, + { + "epoch": 0.82, + "grad_norm": 0.49341453415108794, + "learning_rate": 8.207726870908606e-07, + "loss": 0.1385, + "step": 2872 + }, + { + "epoch": 0.82, + "grad_norm": 0.3487671009900291, + "learning_rate": 8.182345300582345e-07, + "loss": 0.0707, + "step": 2873 + }, + { + "epoch": 0.82, + "grad_norm": 0.5522184102356418, + "learning_rate": 8.156999538236677e-07, + "loss": 0.097, + "step": 2874 + }, + { + "epoch": 0.82, + "grad_norm": 0.6330713775200014, + "learning_rate": 8.131689605574867e-07, + "loss": 0.1066, + "step": 2875 + }, + { + "epoch": 0.82, + "grad_norm": 0.3924522621969745, + "learning_rate": 8.106415524269534e-07, + "loss": 0.0787, + "step": 2876 + }, + { + "epoch": 0.82, + "grad_norm": 0.3224278855296004, + "learning_rate": 8.081177315962601e-07, + "loss": 0.0524, + "step": 2877 + }, + { + "epoch": 0.82, + "grad_norm": 0.47868413859796966, + "learning_rate": 8.055975002265232e-07, + "loss": 0.1041, + "step": 2878 + }, + { + "epoch": 0.82, + "grad_norm": 0.45484670557724943, + "learning_rate": 8.030808604757894e-07, + "loss": 0.0981, + "step": 2879 + }, + { + "epoch": 0.82, + "grad_norm": 0.32337393598167563, + "learning_rate": 8.005678144990281e-07, + "loss": 0.0843, + "step": 2880 + }, + { + "epoch": 0.82, + "grad_norm": 0.283998253163553, + "learning_rate": 7.980583644481332e-07, + "loss": 0.0691, + "step": 2881 + }, + { + "epoch": 0.82, + "grad_norm": 0.43132136388324704, + "learning_rate": 7.95552512471916e-07, + "loss": 0.0579, + "step": 2882 + }, + { + "epoch": 0.82, + "grad_norm": 0.48200739667118, + "learning_rate": 7.9305026071611e-07, + "loss": 0.078, + "step": 2883 + }, + { + "epoch": 0.82, + "grad_norm": 0.43047973112848487, + "learning_rate": 7.905516113233652e-07, + "loss": 0.0943, + "step": 2884 + }, + { + "epoch": 0.82, + "grad_norm": 0.5279160649395449, + "learning_rate": 7.880565664332473e-07, + "loss": 0.097, + "step": 2885 + }, + { + "epoch": 0.82, + "grad_norm": 0.3443432371418148, + "learning_rate": 7.85565128182233e-07, + "loss": 0.0776, + "step": 2886 + }, + { + "epoch": 0.82, + "grad_norm": 0.56806116067874, + "learning_rate": 7.830772987037127e-07, + "loss": 0.1325, + "step": 2887 + }, + { + "epoch": 0.83, + "grad_norm": 0.3497432221638485, + "learning_rate": 7.805930801279871e-07, + "loss": 0.0982, + "step": 2888 + }, + { + "epoch": 0.83, + "grad_norm": 0.6869156690954531, + "learning_rate": 7.781124745822649e-07, + "loss": 0.1377, + "step": 2889 + }, + { + "epoch": 0.83, + "grad_norm": 0.3190067588408853, + "learning_rate": 7.756354841906582e-07, + "loss": 0.0571, + "step": 2890 + }, + { + "epoch": 0.83, + "grad_norm": 0.2603420227812946, + "learning_rate": 7.731621110741871e-07, + "loss": 0.0723, + "step": 2891 + }, + { + "epoch": 0.83, + "grad_norm": 0.39173247556403507, + "learning_rate": 7.70692357350774e-07, + "loss": 0.101, + "step": 2892 + }, + { + "epoch": 0.83, + "grad_norm": 0.42356015496705873, + "learning_rate": 7.682262251352379e-07, + "loss": 0.0798, + "step": 2893 + }, + { + "epoch": 0.83, + "grad_norm": 0.3480654007170492, + "learning_rate": 7.657637165393011e-07, + "loss": 0.0873, + "step": 2894 + }, + { + "epoch": 0.83, + "grad_norm": 0.5106815053077439, + "learning_rate": 7.633048336715815e-07, + "loss": 0.097, + "step": 2895 + }, + { + "epoch": 0.83, + "grad_norm": 0.4017028545099516, + "learning_rate": 7.608495786375941e-07, + "loss": 0.0739, + "step": 2896 + }, + { + "epoch": 0.83, + "grad_norm": 0.3780038758141077, + "learning_rate": 7.583979535397428e-07, + "loss": 0.0761, + "step": 2897 + }, + { + "epoch": 0.83, + "grad_norm": 0.29093877108629496, + "learning_rate": 7.55949960477328e-07, + "loss": 0.0532, + "step": 2898 + }, + { + "epoch": 0.83, + "grad_norm": 0.3795134425124431, + "learning_rate": 7.535056015465381e-07, + "loss": 0.0698, + "step": 2899 + }, + { + "epoch": 0.83, + "grad_norm": 0.3404338539562814, + "learning_rate": 7.510648788404501e-07, + "loss": 0.0645, + "step": 2900 + }, + { + "epoch": 0.83, + "grad_norm": 0.275210598494758, + "learning_rate": 7.486277944490256e-07, + "loss": 0.0678, + "step": 2901 + }, + { + "epoch": 0.83, + "grad_norm": 0.3411486611699063, + "learning_rate": 7.461943504591129e-07, + "loss": 0.0709, + "step": 2902 + }, + { + "epoch": 0.83, + "grad_norm": 0.4445929264562924, + "learning_rate": 7.437645489544426e-07, + "loss": 0.0811, + "step": 2903 + }, + { + "epoch": 0.83, + "grad_norm": 0.4442615605419769, + "learning_rate": 7.41338392015627e-07, + "loss": 0.0826, + "step": 2904 + }, + { + "epoch": 0.83, + "grad_norm": 0.26484249727441866, + "learning_rate": 7.389158817201541e-07, + "loss": 0.0425, + "step": 2905 + }, + { + "epoch": 0.83, + "grad_norm": 0.6063348479835404, + "learning_rate": 7.364970201423938e-07, + "loss": 0.0882, + "step": 2906 + }, + { + "epoch": 0.83, + "grad_norm": 0.7413241247014105, + "learning_rate": 7.340818093535884e-07, + "loss": 0.1124, + "step": 2907 + }, + { + "epoch": 0.83, + "grad_norm": 0.4527187327280281, + "learning_rate": 7.316702514218577e-07, + "loss": 0.0566, + "step": 2908 + }, + { + "epoch": 0.83, + "grad_norm": 0.34783032196224395, + "learning_rate": 7.292623484121875e-07, + "loss": 0.0829, + "step": 2909 + }, + { + "epoch": 0.83, + "grad_norm": 0.426314062370534, + "learning_rate": 7.268581023864402e-07, + "loss": 0.0817, + "step": 2910 + }, + { + "epoch": 0.83, + "grad_norm": 0.5627538496006483, + "learning_rate": 7.244575154033451e-07, + "loss": 0.0899, + "step": 2911 + }, + { + "epoch": 0.83, + "grad_norm": 0.39816441651303663, + "learning_rate": 7.220605895184946e-07, + "loss": 0.0862, + "step": 2912 + }, + { + "epoch": 0.83, + "grad_norm": 0.47072614754320086, + "learning_rate": 7.196673267843507e-07, + "loss": 0.09, + "step": 2913 + }, + { + "epoch": 0.83, + "grad_norm": 0.322931872595961, + "learning_rate": 7.172777292502369e-07, + "loss": 0.0585, + "step": 2914 + }, + { + "epoch": 0.83, + "grad_norm": 0.78148842567208, + "learning_rate": 7.148917989623388e-07, + "loss": 0.0871, + "step": 2915 + }, + { + "epoch": 0.83, + "grad_norm": 0.3639198711606861, + "learning_rate": 7.125095379636998e-07, + "loss": 0.111, + "step": 2916 + }, + { + "epoch": 0.83, + "grad_norm": 0.5604431493609869, + "learning_rate": 7.101309482942226e-07, + "loss": 0.1237, + "step": 2917 + }, + { + "epoch": 0.83, + "grad_norm": 0.3332096412037399, + "learning_rate": 7.077560319906696e-07, + "loss": 0.063, + "step": 2918 + }, + { + "epoch": 0.83, + "grad_norm": 0.46800171988972583, + "learning_rate": 7.053847910866513e-07, + "loss": 0.1035, + "step": 2919 + }, + { + "epoch": 0.83, + "grad_norm": 0.97911375149235, + "learning_rate": 7.030172276126351e-07, + "loss": 0.1079, + "step": 2920 + }, + { + "epoch": 0.83, + "grad_norm": 0.3779810733674687, + "learning_rate": 7.00653343595939e-07, + "loss": 0.0664, + "step": 2921 + }, + { + "epoch": 0.83, + "grad_norm": 0.37881221141973503, + "learning_rate": 6.982931410607307e-07, + "loss": 0.046, + "step": 2922 + }, + { + "epoch": 0.84, + "grad_norm": 0.4587799402449723, + "learning_rate": 6.959366220280228e-07, + "loss": 0.0659, + "step": 2923 + }, + { + "epoch": 0.84, + "grad_norm": 0.3527440578984237, + "learning_rate": 6.935837885156765e-07, + "loss": 0.0605, + "step": 2924 + }, + { + "epoch": 0.84, + "grad_norm": 1.0532685327962303, + "learning_rate": 6.912346425383964e-07, + "loss": 0.0526, + "step": 2925 + }, + { + "epoch": 0.84, + "grad_norm": 0.36622834853909303, + "learning_rate": 6.888891861077301e-07, + "loss": 0.0827, + "step": 2926 + }, + { + "epoch": 0.84, + "grad_norm": 0.43796984358312346, + "learning_rate": 6.865474212320638e-07, + "loss": 0.1017, + "step": 2927 + }, + { + "epoch": 0.84, + "grad_norm": 0.38462566932411085, + "learning_rate": 6.842093499166241e-07, + "loss": 0.0894, + "step": 2928 + }, + { + "epoch": 0.84, + "grad_norm": 0.381073163413314, + "learning_rate": 6.818749741634773e-07, + "loss": 0.0975, + "step": 2929 + }, + { + "epoch": 0.84, + "grad_norm": 0.4702508945259308, + "learning_rate": 6.795442959715192e-07, + "loss": 0.0499, + "step": 2930 + }, + { + "epoch": 0.84, + "grad_norm": 0.4515601651038705, + "learning_rate": 6.772173173364843e-07, + "loss": 0.069, + "step": 2931 + }, + { + "epoch": 0.84, + "grad_norm": 0.27852764893931564, + "learning_rate": 6.748940402509386e-07, + "loss": 0.0861, + "step": 2932 + }, + { + "epoch": 0.84, + "grad_norm": 0.25234917442123944, + "learning_rate": 6.725744667042778e-07, + "loss": 0.0495, + "step": 2933 + }, + { + "epoch": 0.84, + "grad_norm": 0.4160714519667449, + "learning_rate": 6.702585986827248e-07, + "loss": 0.0546, + "step": 2934 + }, + { + "epoch": 0.84, + "grad_norm": 0.3131089315605834, + "learning_rate": 6.679464381693324e-07, + "loss": 0.0504, + "step": 2935 + }, + { + "epoch": 0.84, + "grad_norm": 0.3734963943172259, + "learning_rate": 6.656379871439761e-07, + "loss": 0.1037, + "step": 2936 + }, + { + "epoch": 0.84, + "grad_norm": 0.3673991544751655, + "learning_rate": 6.633332475833581e-07, + "loss": 0.0817, + "step": 2937 + }, + { + "epoch": 0.84, + "grad_norm": 0.25188060698912723, + "learning_rate": 6.610322214609982e-07, + "loss": 0.0328, + "step": 2938 + }, + { + "epoch": 0.84, + "grad_norm": 0.7295652337957627, + "learning_rate": 6.587349107472401e-07, + "loss": 0.1042, + "step": 2939 + }, + { + "epoch": 0.84, + "grad_norm": 0.3767937513035894, + "learning_rate": 6.564413174092443e-07, + "loss": 0.0985, + "step": 2940 + }, + { + "epoch": 0.84, + "grad_norm": 0.2979979711754606, + "learning_rate": 6.541514434109902e-07, + "loss": 0.0681, + "step": 2941 + }, + { + "epoch": 0.84, + "grad_norm": 0.37209889854597344, + "learning_rate": 6.518652907132683e-07, + "loss": 0.0644, + "step": 2942 + }, + { + "epoch": 0.84, + "grad_norm": 0.6889404031047961, + "learning_rate": 6.49582861273686e-07, + "loss": 0.0871, + "step": 2943 + }, + { + "epoch": 0.84, + "grad_norm": 0.38580637690100655, + "learning_rate": 6.473041570466631e-07, + "loss": 0.0988, + "step": 2944 + }, + { + "epoch": 0.84, + "grad_norm": 0.421392756298022, + "learning_rate": 6.450291799834257e-07, + "loss": 0.065, + "step": 2945 + }, + { + "epoch": 0.84, + "grad_norm": 0.37457346311721995, + "learning_rate": 6.427579320320116e-07, + "loss": 0.0814, + "step": 2946 + }, + { + "epoch": 0.84, + "grad_norm": 0.4362013612573185, + "learning_rate": 6.404904151372649e-07, + "loss": 0.0775, + "step": 2947 + }, + { + "epoch": 0.84, + "grad_norm": 0.31466494537643575, + "learning_rate": 6.382266312408347e-07, + "loss": 0.0603, + "step": 2948 + }, + { + "epoch": 0.84, + "grad_norm": 0.34666952369419024, + "learning_rate": 6.359665822811722e-07, + "loss": 0.055, + "step": 2949 + }, + { + "epoch": 0.84, + "grad_norm": 0.5589321142596126, + "learning_rate": 6.337102701935322e-07, + "loss": 0.1242, + "step": 2950 + }, + { + "epoch": 0.84, + "grad_norm": 0.35686956590657676, + "learning_rate": 6.31457696909969e-07, + "loss": 0.0774, + "step": 2951 + }, + { + "epoch": 0.84, + "grad_norm": 0.6666184390323098, + "learning_rate": 6.292088643593359e-07, + "loss": 0.08, + "step": 2952 + }, + { + "epoch": 0.84, + "grad_norm": 0.35116524371456787, + "learning_rate": 6.269637744672813e-07, + "loss": 0.061, + "step": 2953 + }, + { + "epoch": 0.84, + "grad_norm": 0.3684925250204042, + "learning_rate": 6.24722429156251e-07, + "loss": 0.0846, + "step": 2954 + }, + { + "epoch": 0.84, + "grad_norm": 0.49795523364453675, + "learning_rate": 6.224848303454828e-07, + "loss": 0.1028, + "step": 2955 + }, + { + "epoch": 0.84, + "grad_norm": 0.46939181035728417, + "learning_rate": 6.202509799510086e-07, + "loss": 0.0856, + "step": 2956 + }, + { + "epoch": 0.84, + "grad_norm": 0.4001252277978988, + "learning_rate": 6.180208798856463e-07, + "loss": 0.0928, + "step": 2957 + }, + { + "epoch": 0.85, + "grad_norm": 0.1974884057510179, + "learning_rate": 6.157945320590076e-07, + "loss": 0.036, + "step": 2958 + }, + { + "epoch": 0.85, + "grad_norm": 0.311837737945852, + "learning_rate": 6.135719383774869e-07, + "loss": 0.0563, + "step": 2959 + }, + { + "epoch": 0.85, + "grad_norm": 0.5622996076001634, + "learning_rate": 6.11353100744268e-07, + "loss": 0.1094, + "step": 2960 + }, + { + "epoch": 0.85, + "grad_norm": 0.40436782668162075, + "learning_rate": 6.091380210593145e-07, + "loss": 0.0886, + "step": 2961 + }, + { + "epoch": 0.85, + "grad_norm": 0.48470414454889127, + "learning_rate": 6.069267012193735e-07, + "loss": 0.1034, + "step": 2962 + }, + { + "epoch": 0.85, + "grad_norm": 0.559384360048733, + "learning_rate": 6.047191431179755e-07, + "loss": 0.1052, + "step": 2963 + }, + { + "epoch": 0.85, + "grad_norm": 0.2985281023166187, + "learning_rate": 6.025153486454238e-07, + "loss": 0.0798, + "step": 2964 + }, + { + "epoch": 0.85, + "grad_norm": 0.39376381859130344, + "learning_rate": 6.003153196888045e-07, + "loss": 0.0917, + "step": 2965 + }, + { + "epoch": 0.85, + "grad_norm": 0.41963611920222765, + "learning_rate": 5.981190581319773e-07, + "loss": 0.0977, + "step": 2966 + }, + { + "epoch": 0.85, + "grad_norm": 0.4950705151532809, + "learning_rate": 5.959265658555769e-07, + "loss": 0.099, + "step": 2967 + }, + { + "epoch": 0.85, + "grad_norm": 0.8195548106339083, + "learning_rate": 5.937378447370068e-07, + "loss": 0.1482, + "step": 2968 + }, + { + "epoch": 0.85, + "grad_norm": 0.6608634549739837, + "learning_rate": 5.915528966504453e-07, + "loss": 0.0977, + "step": 2969 + }, + { + "epoch": 0.85, + "grad_norm": 0.42591936441814277, + "learning_rate": 5.893717234668383e-07, + "loss": 0.1271, + "step": 2970 + }, + { + "epoch": 0.85, + "grad_norm": 0.3180127669552666, + "learning_rate": 5.87194327053901e-07, + "loss": 0.0767, + "step": 2971 + }, + { + "epoch": 0.85, + "grad_norm": 0.26117207730592873, + "learning_rate": 5.850207092761107e-07, + "loss": 0.0673, + "step": 2972 + }, + { + "epoch": 0.85, + "grad_norm": 0.24778096320432705, + "learning_rate": 5.828508719947123e-07, + "loss": 0.0669, + "step": 2973 + }, + { + "epoch": 0.85, + "grad_norm": 0.5001788346295303, + "learning_rate": 5.806848170677132e-07, + "loss": 0.0862, + "step": 2974 + }, + { + "epoch": 0.85, + "grad_norm": 0.2825532774947254, + "learning_rate": 5.785225463498828e-07, + "loss": 0.0458, + "step": 2975 + }, + { + "epoch": 0.85, + "grad_norm": 0.46497002069784876, + "learning_rate": 5.763640616927458e-07, + "loss": 0.0806, + "step": 2976 + }, + { + "epoch": 0.85, + "grad_norm": 0.3106646283993831, + "learning_rate": 5.742093649445901e-07, + "loss": 0.0609, + "step": 2977 + }, + { + "epoch": 0.85, + "grad_norm": 0.3883572399341558, + "learning_rate": 5.720584579504579e-07, + "loss": 0.0712, + "step": 2978 + }, + { + "epoch": 0.85, + "grad_norm": 0.4657594143842723, + "learning_rate": 5.699113425521452e-07, + "loss": 0.0978, + "step": 2979 + }, + { + "epoch": 0.85, + "grad_norm": 0.40682139926705757, + "learning_rate": 5.67768020588203e-07, + "loss": 0.0858, + "step": 2980 + }, + { + "epoch": 0.85, + "grad_norm": 1.6267414290126576, + "learning_rate": 5.656284938939332e-07, + "loss": 0.1127, + "step": 2981 + }, + { + "epoch": 0.85, + "grad_norm": 0.3272751913041415, + "learning_rate": 5.634927643013899e-07, + "loss": 0.0651, + "step": 2982 + }, + { + "epoch": 0.85, + "grad_norm": 0.5510241979365119, + "learning_rate": 5.613608336393711e-07, + "loss": 0.0865, + "step": 2983 + }, + { + "epoch": 0.85, + "grad_norm": 0.386240817194325, + "learning_rate": 5.592327037334261e-07, + "loss": 0.0884, + "step": 2984 + }, + { + "epoch": 0.85, + "grad_norm": 0.4599662420251313, + "learning_rate": 5.571083764058482e-07, + "loss": 0.079, + "step": 2985 + }, + { + "epoch": 0.85, + "grad_norm": 0.4086491572266885, + "learning_rate": 5.549878534756759e-07, + "loss": 0.0952, + "step": 2986 + }, + { + "epoch": 0.85, + "grad_norm": 0.3835854137487857, + "learning_rate": 5.528711367586864e-07, + "loss": 0.1178, + "step": 2987 + }, + { + "epoch": 0.85, + "grad_norm": 0.5224166158794535, + "learning_rate": 5.507582280674012e-07, + "loss": 0.0866, + "step": 2988 + }, + { + "epoch": 0.85, + "grad_norm": 0.626662330089551, + "learning_rate": 5.486491292110796e-07, + "loss": 0.1309, + "step": 2989 + }, + { + "epoch": 0.85, + "grad_norm": 0.49030610702431343, + "learning_rate": 5.465438419957209e-07, + "loss": 0.1277, + "step": 2990 + }, + { + "epoch": 0.85, + "grad_norm": 0.28726618585301955, + "learning_rate": 5.444423682240558e-07, + "loss": 0.0756, + "step": 2991 + }, + { + "epoch": 0.85, + "grad_norm": 0.31634354240371376, + "learning_rate": 5.423447096955531e-07, + "loss": 0.0793, + "step": 2992 + }, + { + "epoch": 0.86, + "grad_norm": 0.5936135712888403, + "learning_rate": 5.402508682064151e-07, + "loss": 0.06, + "step": 2993 + }, + { + "epoch": 0.86, + "grad_norm": 0.458107465639092, + "learning_rate": 5.381608455495724e-07, + "loss": 0.0792, + "step": 2994 + }, + { + "epoch": 0.86, + "grad_norm": 0.4006113400235521, + "learning_rate": 5.360746435146885e-07, + "loss": 0.0728, + "step": 2995 + }, + { + "epoch": 0.86, + "grad_norm": 0.33429001845102757, + "learning_rate": 5.339922638881545e-07, + "loss": 0.0788, + "step": 2996 + }, + { + "epoch": 0.86, + "grad_norm": 0.4748180018707747, + "learning_rate": 5.319137084530896e-07, + "loss": 0.0764, + "step": 2997 + }, + { + "epoch": 0.86, + "grad_norm": 0.6211664302256894, + "learning_rate": 5.298389789893343e-07, + "loss": 0.1283, + "step": 2998 + }, + { + "epoch": 0.86, + "grad_norm": 0.34368775168904847, + "learning_rate": 5.277680772734578e-07, + "loss": 0.0571, + "step": 2999 + }, + { + "epoch": 0.86, + "grad_norm": 0.3828658272501252, + "learning_rate": 5.257010050787487e-07, + "loss": 0.1067, + "step": 3000 + }, + { + "epoch": 0.86, + "grad_norm": 0.3007889578238413, + "learning_rate": 5.236377641752189e-07, + "loss": 0.1028, + "step": 3001 + }, + { + "epoch": 0.86, + "grad_norm": 0.4207146551719682, + "learning_rate": 5.215783563295956e-07, + "loss": 0.1075, + "step": 3002 + }, + { + "epoch": 0.86, + "grad_norm": 0.3473750429740535, + "learning_rate": 5.195227833053273e-07, + "loss": 0.0845, + "step": 3003 + }, + { + "epoch": 0.86, + "grad_norm": 0.43525881622309176, + "learning_rate": 5.174710468625782e-07, + "loss": 0.0893, + "step": 3004 + }, + { + "epoch": 0.86, + "grad_norm": 0.29119338252675736, + "learning_rate": 5.154231487582273e-07, + "loss": 0.073, + "step": 3005 + }, + { + "epoch": 0.86, + "grad_norm": 0.379529649945552, + "learning_rate": 5.133790907458652e-07, + "loss": 0.0527, + "step": 3006 + }, + { + "epoch": 0.86, + "grad_norm": 0.35829786161691707, + "learning_rate": 5.113388745757953e-07, + "loss": 0.0494, + "step": 3007 + }, + { + "epoch": 0.86, + "grad_norm": 0.3496510390374235, + "learning_rate": 5.093025019950321e-07, + "loss": 0.0776, + "step": 3008 + }, + { + "epoch": 0.86, + "grad_norm": 0.45181850433930404, + "learning_rate": 5.072699747472987e-07, + "loss": 0.0769, + "step": 3009 + }, + { + "epoch": 0.86, + "grad_norm": 0.4085586014306148, + "learning_rate": 5.05241294573024e-07, + "loss": 0.1025, + "step": 3010 + }, + { + "epoch": 0.86, + "grad_norm": 0.562440220038349, + "learning_rate": 5.032164632093439e-07, + "loss": 0.0699, + "step": 3011 + }, + { + "epoch": 0.86, + "grad_norm": 0.5193491169755903, + "learning_rate": 5.011954823900994e-07, + "loss": 0.1309, + "step": 3012 + }, + { + "epoch": 0.86, + "grad_norm": 0.31955848751933863, + "learning_rate": 4.991783538458312e-07, + "loss": 0.0653, + "step": 3013 + }, + { + "epoch": 0.86, + "grad_norm": 0.3471447646597856, + "learning_rate": 4.971650793037852e-07, + "loss": 0.0895, + "step": 3014 + }, + { + "epoch": 0.86, + "grad_norm": 0.49495044323085113, + "learning_rate": 4.951556604879049e-07, + "loss": 0.1276, + "step": 3015 + }, + { + "epoch": 0.86, + "grad_norm": 0.5339757502143804, + "learning_rate": 4.931500991188331e-07, + "loss": 0.0685, + "step": 3016 + }, + { + "epoch": 0.86, + "grad_norm": 0.400482854213073, + "learning_rate": 4.911483969139086e-07, + "loss": 0.0941, + "step": 3017 + }, + { + "epoch": 0.86, + "grad_norm": 0.32685845079501163, + "learning_rate": 4.891505555871656e-07, + "loss": 0.0708, + "step": 3018 + }, + { + "epoch": 0.86, + "grad_norm": 0.4032628689364356, + "learning_rate": 4.871565768493341e-07, + "loss": 0.044, + "step": 3019 + }, + { + "epoch": 0.86, + "grad_norm": 0.23237142422515322, + "learning_rate": 4.851664624078356e-07, + "loss": 0.0498, + "step": 3020 + }, + { + "epoch": 0.86, + "grad_norm": 0.443930951871871, + "learning_rate": 4.831802139667807e-07, + "loss": 0.1177, + "step": 3021 + }, + { + "epoch": 0.86, + "grad_norm": 0.5498928475161263, + "learning_rate": 4.811978332269718e-07, + "loss": 0.1225, + "step": 3022 + }, + { + "epoch": 0.86, + "grad_norm": 0.5755441923444777, + "learning_rate": 4.792193218858998e-07, + "loss": 0.0892, + "step": 3023 + }, + { + "epoch": 0.86, + "grad_norm": 0.4011200933670287, + "learning_rate": 4.772446816377408e-07, + "loss": 0.0976, + "step": 3024 + }, + { + "epoch": 0.86, + "grad_norm": 0.4912835564287914, + "learning_rate": 4.7527391417335647e-07, + "loss": 0.106, + "step": 3025 + }, + { + "epoch": 0.86, + "grad_norm": 0.48304625141090346, + "learning_rate": 4.733070211802926e-07, + "loss": 0.0892, + "step": 3026 + }, + { + "epoch": 0.86, + "grad_norm": 0.4162973613416196, + "learning_rate": 4.7134400434277915e-07, + "loss": 0.0785, + "step": 3027 + }, + { + "epoch": 0.87, + "grad_norm": 0.6162478027910685, + "learning_rate": 4.693848653417216e-07, + "loss": 0.081, + "step": 3028 + }, + { + "epoch": 0.87, + "grad_norm": 0.4033790143022547, + "learning_rate": 4.6742960585471066e-07, + "loss": 0.0828, + "step": 3029 + }, + { + "epoch": 0.87, + "grad_norm": 0.4833368881485651, + "learning_rate": 4.654782275560127e-07, + "loss": 0.0988, + "step": 3030 + }, + { + "epoch": 0.87, + "grad_norm": 0.5348699015250554, + "learning_rate": 4.6353073211656886e-07, + "loss": 0.1089, + "step": 3031 + }, + { + "epoch": 0.87, + "grad_norm": 0.41834073298044805, + "learning_rate": 4.615871212039991e-07, + "loss": 0.0831, + "step": 3032 + }, + { + "epoch": 0.87, + "grad_norm": 0.306588739500313, + "learning_rate": 4.5964739648259426e-07, + "loss": 0.0717, + "step": 3033 + }, + { + "epoch": 0.87, + "grad_norm": 0.2856522381562219, + "learning_rate": 4.577115596133197e-07, + "loss": 0.0585, + "step": 3034 + }, + { + "epoch": 0.87, + "grad_norm": 0.5410819550718816, + "learning_rate": 4.5577961225380886e-07, + "loss": 0.1096, + "step": 3035 + }, + { + "epoch": 0.87, + "grad_norm": 0.427611744954672, + "learning_rate": 4.538515560583673e-07, + "loss": 0.0558, + "step": 3036 + }, + { + "epoch": 0.87, + "grad_norm": 0.970277134252806, + "learning_rate": 4.519273926779666e-07, + "loss": 0.1109, + "step": 3037 + }, + { + "epoch": 0.87, + "grad_norm": 0.5654875809557892, + "learning_rate": 4.5000712376024826e-07, + "loss": 0.1136, + "step": 3038 + }, + { + "epoch": 0.87, + "grad_norm": 0.47511571753867105, + "learning_rate": 4.480907509495136e-07, + "loss": 0.1209, + "step": 3039 + }, + { + "epoch": 0.87, + "grad_norm": 0.35770035937260286, + "learning_rate": 4.4617827588673167e-07, + "loss": 0.0822, + "step": 3040 + }, + { + "epoch": 0.87, + "grad_norm": 0.37123942578895763, + "learning_rate": 4.442697002095342e-07, + "loss": 0.0961, + "step": 3041 + }, + { + "epoch": 0.87, + "grad_norm": 0.3367041494830472, + "learning_rate": 4.423650255522127e-07, + "loss": 0.0938, + "step": 3042 + }, + { + "epoch": 0.87, + "grad_norm": 0.48564194331727517, + "learning_rate": 4.404642535457165e-07, + "loss": 0.0626, + "step": 3043 + }, + { + "epoch": 0.87, + "grad_norm": 0.6338948134739405, + "learning_rate": 4.3856738581765703e-07, + "loss": 0.1139, + "step": 3044 + }, + { + "epoch": 0.87, + "grad_norm": 0.8487780302819357, + "learning_rate": 4.3667442399229985e-07, + "loss": 0.1229, + "step": 3045 + }, + { + "epoch": 0.87, + "grad_norm": 0.4002482373146379, + "learning_rate": 4.347853696905657e-07, + "loss": 0.0483, + "step": 3046 + }, + { + "epoch": 0.87, + "grad_norm": 0.5192388658579471, + "learning_rate": 4.329002245300307e-07, + "loss": 0.1042, + "step": 3047 + }, + { + "epoch": 0.87, + "grad_norm": 0.378020294946494, + "learning_rate": 4.3101899012492343e-07, + "loss": 0.1018, + "step": 3048 + }, + { + "epoch": 0.87, + "grad_norm": 0.7343787532260014, + "learning_rate": 4.2914166808612436e-07, + "loss": 0.1131, + "step": 3049 + }, + { + "epoch": 0.87, + "grad_norm": 0.8289430771710176, + "learning_rate": 4.272682600211608e-07, + "loss": 0.1076, + "step": 3050 + }, + { + "epoch": 0.87, + "grad_norm": 0.35776484260402475, + "learning_rate": 4.2539876753421194e-07, + "loss": 0.099, + "step": 3051 + }, + { + "epoch": 0.87, + "grad_norm": 0.2510200087052072, + "learning_rate": 4.2353319222610265e-07, + "loss": 0.0545, + "step": 3052 + }, + { + "epoch": 0.87, + "grad_norm": 0.6291154821345246, + "learning_rate": 4.2167153569430465e-07, + "loss": 0.1009, + "step": 3053 + }, + { + "epoch": 0.87, + "grad_norm": 0.9616173056350652, + "learning_rate": 4.1981379953293155e-07, + "loss": 0.0995, + "step": 3054 + }, + { + "epoch": 0.87, + "grad_norm": 0.4222400099548638, + "learning_rate": 4.179599853327426e-07, + "loss": 0.0789, + "step": 3055 + }, + { + "epoch": 0.87, + "grad_norm": 0.24391964187242662, + "learning_rate": 4.1611009468113806e-07, + "loss": 0.0304, + "step": 3056 + }, + { + "epoch": 0.87, + "grad_norm": 0.4679791473568192, + "learning_rate": 4.1426412916215795e-07, + "loss": 0.062, + "step": 3057 + }, + { + "epoch": 0.87, + "grad_norm": 0.5744433048375208, + "learning_rate": 4.1242209035648075e-07, + "loss": 0.0991, + "step": 3058 + }, + { + "epoch": 0.87, + "grad_norm": 0.3636600923831621, + "learning_rate": 4.1058397984142405e-07, + "loss": 0.0799, + "step": 3059 + }, + { + "epoch": 0.87, + "grad_norm": 0.5308972871030511, + "learning_rate": 4.0874979919094004e-07, + "loss": 0.1245, + "step": 3060 + }, + { + "epoch": 0.87, + "grad_norm": 0.5242302791407181, + "learning_rate": 4.06919549975619e-07, + "loss": 0.114, + "step": 3061 + }, + { + "epoch": 0.87, + "grad_norm": 0.1964078610675712, + "learning_rate": 4.050932337626795e-07, + "loss": 0.0487, + "step": 3062 + }, + { + "epoch": 0.88, + "grad_norm": 0.5361090093876446, + "learning_rate": 4.032708521159762e-07, + "loss": 0.0785, + "step": 3063 + }, + { + "epoch": 0.88, + "grad_norm": 0.5596155053367649, + "learning_rate": 4.014524065959952e-07, + "loss": 0.1127, + "step": 3064 + }, + { + "epoch": 0.88, + "grad_norm": 0.2744212241668214, + "learning_rate": 3.996378987598487e-07, + "loss": 0.0507, + "step": 3065 + }, + { + "epoch": 0.88, + "grad_norm": 0.6448222861745297, + "learning_rate": 3.9782733016128006e-07, + "loss": 0.1036, + "step": 3066 + }, + { + "epoch": 0.88, + "grad_norm": 0.27275233150122863, + "learning_rate": 3.960207023506579e-07, + "loss": 0.0345, + "step": 3067 + }, + { + "epoch": 0.88, + "grad_norm": 0.35955531617117187, + "learning_rate": 3.9421801687497873e-07, + "loss": 0.0943, + "step": 3068 + }, + { + "epoch": 0.88, + "grad_norm": 0.47438097218592085, + "learning_rate": 3.924192752778588e-07, + "loss": 0.0928, + "step": 3069 + }, + { + "epoch": 0.88, + "grad_norm": 0.5002486621511697, + "learning_rate": 3.906244790995423e-07, + "loss": 0.0986, + "step": 3070 + }, + { + "epoch": 0.88, + "grad_norm": 0.30826085522781366, + "learning_rate": 3.8883362987689134e-07, + "loss": 0.0383, + "step": 3071 + }, + { + "epoch": 0.88, + "grad_norm": 0.34721910690891533, + "learning_rate": 3.8704672914339104e-07, + "loss": 0.0656, + "step": 3072 + }, + { + "epoch": 0.88, + "grad_norm": 0.2324282059512588, + "learning_rate": 3.852637784291424e-07, + "loss": 0.0428, + "step": 3073 + }, + { + "epoch": 0.88, + "grad_norm": 0.5372066165704869, + "learning_rate": 3.83484779260867e-07, + "loss": 0.0941, + "step": 3074 + }, + { + "epoch": 0.88, + "grad_norm": 0.38097292742712063, + "learning_rate": 3.8170973316190074e-07, + "loss": 0.0893, + "step": 3075 + }, + { + "epoch": 0.88, + "grad_norm": 0.4076076612996428, + "learning_rate": 3.799386416521966e-07, + "loss": 0.1051, + "step": 3076 + }, + { + "epoch": 0.88, + "grad_norm": 0.664951475549366, + "learning_rate": 3.781715062483188e-07, + "loss": 0.1276, + "step": 3077 + }, + { + "epoch": 0.88, + "grad_norm": 0.4028588925299585, + "learning_rate": 3.7640832846344565e-07, + "loss": 0.1067, + "step": 3078 + }, + { + "epoch": 0.88, + "grad_norm": 0.4714048602084793, + "learning_rate": 3.746491098073668e-07, + "loss": 0.0732, + "step": 3079 + }, + { + "epoch": 0.88, + "grad_norm": 0.5652779938474279, + "learning_rate": 3.728938517864794e-07, + "loss": 0.1264, + "step": 3080 + }, + { + "epoch": 0.88, + "grad_norm": 0.5414256824462692, + "learning_rate": 3.7114255590379234e-07, + "loss": 0.0912, + "step": 3081 + }, + { + "epoch": 0.88, + "grad_norm": 0.48296713636325733, + "learning_rate": 3.693952236589199e-07, + "loss": 0.0659, + "step": 3082 + }, + { + "epoch": 0.88, + "grad_norm": 0.4225849330260804, + "learning_rate": 3.6765185654808357e-07, + "loss": 0.1001, + "step": 3083 + }, + { + "epoch": 0.88, + "grad_norm": 0.3123657679015782, + "learning_rate": 3.659124560641064e-07, + "loss": 0.0785, + "step": 3084 + }, + { + "epoch": 0.88, + "grad_norm": 0.4664992130368141, + "learning_rate": 3.641770236964193e-07, + "loss": 0.073, + "step": 3085 + }, + { + "epoch": 0.88, + "grad_norm": 0.24769732912850836, + "learning_rate": 3.6244556093105285e-07, + "loss": 0.0468, + "step": 3086 + }, + { + "epoch": 0.88, + "grad_norm": 0.4856025683797443, + "learning_rate": 3.60718069250639e-07, + "loss": 0.0854, + "step": 3087 + }, + { + "epoch": 0.88, + "grad_norm": 0.389891640849095, + "learning_rate": 3.5899455013440777e-07, + "loss": 0.0805, + "step": 3088 + }, + { + "epoch": 0.88, + "grad_norm": 0.4865077865198653, + "learning_rate": 3.5727500505819023e-07, + "loss": 0.0806, + "step": 3089 + }, + { + "epoch": 0.88, + "grad_norm": 0.3325968133787995, + "learning_rate": 3.555594354944125e-07, + "loss": 0.0802, + "step": 3090 + }, + { + "epoch": 0.88, + "grad_norm": 0.48090649055260104, + "learning_rate": 3.5384784291209896e-07, + "loss": 0.1066, + "step": 3091 + }, + { + "epoch": 0.88, + "grad_norm": 0.2947187497781367, + "learning_rate": 3.521402287768644e-07, + "loss": 0.0674, + "step": 3092 + }, + { + "epoch": 0.88, + "grad_norm": 0.27166754855906533, + "learning_rate": 3.504365945509203e-07, + "loss": 0.051, + "step": 3093 + }, + { + "epoch": 0.88, + "grad_norm": 0.44337475956250444, + "learning_rate": 3.4873694169306915e-07, + "loss": 0.1323, + "step": 3094 + }, + { + "epoch": 0.88, + "grad_norm": 0.615474051294563, + "learning_rate": 3.4704127165870514e-07, + "loss": 0.101, + "step": 3095 + }, + { + "epoch": 0.88, + "grad_norm": 0.6088624752605024, + "learning_rate": 3.453495858998102e-07, + "loss": 0.1513, + "step": 3096 + }, + { + "epoch": 0.88, + "grad_norm": 0.327733742344023, + "learning_rate": 3.4366188586495543e-07, + "loss": 0.0626, + "step": 3097 + }, + { + "epoch": 0.89, + "grad_norm": 0.6320767703722748, + "learning_rate": 3.419781729993005e-07, + "loss": 0.0704, + "step": 3098 + }, + { + "epoch": 0.89, + "grad_norm": 0.3548027088632169, + "learning_rate": 3.402984487445876e-07, + "loss": 0.0667, + "step": 3099 + }, + { + "epoch": 0.89, + "grad_norm": 0.717913730591346, + "learning_rate": 3.386227145391463e-07, + "loss": 0.1242, + "step": 3100 + }, + { + "epoch": 0.89, + "grad_norm": 0.5855094279203372, + "learning_rate": 3.369509718178887e-07, + "loss": 0.0948, + "step": 3101 + }, + { + "epoch": 0.89, + "grad_norm": 0.35693098689849084, + "learning_rate": 3.352832220123098e-07, + "loss": 0.0782, + "step": 3102 + }, + { + "epoch": 0.89, + "grad_norm": 0.27597356385516486, + "learning_rate": 3.336194665504833e-07, + "loss": 0.0567, + "step": 3103 + }, + { + "epoch": 0.89, + "grad_norm": 0.41699629268519856, + "learning_rate": 3.319597068570646e-07, + "loss": 0.0834, + "step": 3104 + }, + { + "epoch": 0.89, + "grad_norm": 0.4326512457963444, + "learning_rate": 3.303039443532874e-07, + "loss": 0.1048, + "step": 3105 + }, + { + "epoch": 0.89, + "grad_norm": 0.32748288346108995, + "learning_rate": 3.2865218045696256e-07, + "loss": 0.085, + "step": 3106 + }, + { + "epoch": 0.89, + "grad_norm": 0.43981623675700016, + "learning_rate": 3.2700441658247484e-07, + "loss": 0.0773, + "step": 3107 + }, + { + "epoch": 0.89, + "grad_norm": 0.46051008363093954, + "learning_rate": 3.2536065414078724e-07, + "loss": 0.0918, + "step": 3108 + }, + { + "epoch": 0.89, + "grad_norm": 0.5280500620248085, + "learning_rate": 3.237208945394343e-07, + "loss": 0.0928, + "step": 3109 + }, + { + "epoch": 0.89, + "grad_norm": 0.611026107449624, + "learning_rate": 3.220851391825247e-07, + "loss": 0.1458, + "step": 3110 + }, + { + "epoch": 0.89, + "grad_norm": 0.3527668707160877, + "learning_rate": 3.204533894707346e-07, + "loss": 0.1001, + "step": 3111 + }, + { + "epoch": 0.89, + "grad_norm": 0.3904280066329125, + "learning_rate": 3.18825646801314e-07, + "loss": 0.1256, + "step": 3112 + }, + { + "epoch": 0.89, + "grad_norm": 0.2625376396953162, + "learning_rate": 3.172019125680814e-07, + "loss": 0.0593, + "step": 3113 + }, + { + "epoch": 0.89, + "grad_norm": 0.32005754213725546, + "learning_rate": 3.1558218816142015e-07, + "loss": 0.0783, + "step": 3114 + }, + { + "epoch": 0.89, + "grad_norm": 0.3776388385135846, + "learning_rate": 3.1396647496828245e-07, + "loss": 0.0692, + "step": 3115 + }, + { + "epoch": 0.89, + "grad_norm": 0.34578952977950755, + "learning_rate": 3.12354774372185e-07, + "loss": 0.0886, + "step": 3116 + }, + { + "epoch": 0.89, + "grad_norm": 0.4051418101171471, + "learning_rate": 3.107470877532093e-07, + "loss": 0.0967, + "step": 3117 + }, + { + "epoch": 0.89, + "grad_norm": 0.40334526262593723, + "learning_rate": 3.0914341648799805e-07, + "loss": 0.0892, + "step": 3118 + }, + { + "epoch": 0.89, + "grad_norm": 0.5334893902788516, + "learning_rate": 3.0754376194975676e-07, + "loss": 0.1091, + "step": 3119 + }, + { + "epoch": 0.89, + "grad_norm": 0.5427067548895909, + "learning_rate": 3.0594812550825194e-07, + "loss": 0.0923, + "step": 3120 + }, + { + "epoch": 0.89, + "grad_norm": 0.4861593840855422, + "learning_rate": 3.0435650852980947e-07, + "loss": 0.0918, + "step": 3121 + }, + { + "epoch": 0.89, + "grad_norm": 0.5309956283735078, + "learning_rate": 3.0276891237731085e-07, + "loss": 0.1044, + "step": 3122 + }, + { + "epoch": 0.89, + "grad_norm": 0.6687981611951446, + "learning_rate": 3.0118533841019814e-07, + "loss": 0.1141, + "step": 3123 + }, + { + "epoch": 0.89, + "grad_norm": 0.44634404114813325, + "learning_rate": 2.996057879844666e-07, + "loss": 0.085, + "step": 3124 + }, + { + "epoch": 0.89, + "grad_norm": 0.3994425433847164, + "learning_rate": 2.980302624526693e-07, + "loss": 0.0844, + "step": 3125 + }, + { + "epoch": 0.89, + "grad_norm": 0.37461657518182806, + "learning_rate": 2.964587631639082e-07, + "loss": 0.112, + "step": 3126 + }, + { + "epoch": 0.89, + "grad_norm": 0.405516764115973, + "learning_rate": 2.948912914638413e-07, + "loss": 0.091, + "step": 3127 + }, + { + "epoch": 0.89, + "grad_norm": 0.29298656641682885, + "learning_rate": 2.933278486946772e-07, + "loss": 0.059, + "step": 3128 + }, + { + "epoch": 0.89, + "grad_norm": 0.37281228634911845, + "learning_rate": 2.917684361951728e-07, + "loss": 0.1029, + "step": 3129 + }, + { + "epoch": 0.89, + "grad_norm": 0.4085533070703712, + "learning_rate": 2.902130553006366e-07, + "loss": 0.0585, + "step": 3130 + }, + { + "epoch": 0.89, + "grad_norm": 0.5666335396313852, + "learning_rate": 2.886617073429232e-07, + "loss": 0.1125, + "step": 3131 + }, + { + "epoch": 0.89, + "grad_norm": 0.37081535824124257, + "learning_rate": 2.8711439365043394e-07, + "loss": 0.0937, + "step": 3132 + }, + { + "epoch": 0.9, + "grad_norm": 0.35616624004800407, + "learning_rate": 2.8557111554811555e-07, + "loss": 0.098, + "step": 3133 + }, + { + "epoch": 0.9, + "grad_norm": 0.6015374036138982, + "learning_rate": 2.840318743574599e-07, + "loss": 0.1356, + "step": 3134 + }, + { + "epoch": 0.9, + "grad_norm": 0.3933359066546871, + "learning_rate": 2.8249667139650215e-07, + "loss": 0.0774, + "step": 3135 + }, + { + "epoch": 0.9, + "grad_norm": 0.26557566617258205, + "learning_rate": 2.809655079798179e-07, + "loss": 0.0665, + "step": 3136 + }, + { + "epoch": 0.9, + "grad_norm": 0.3922047326006346, + "learning_rate": 2.7943838541852563e-07, + "loss": 0.0743, + "step": 3137 + }, + { + "epoch": 0.9, + "grad_norm": 1.2195651730364043, + "learning_rate": 2.7791530502028263e-07, + "loss": 0.0692, + "step": 3138 + }, + { + "epoch": 0.9, + "grad_norm": 0.33700214535511575, + "learning_rate": 2.763962680892862e-07, + "loss": 0.0839, + "step": 3139 + }, + { + "epoch": 0.9, + "grad_norm": 0.3916410619215585, + "learning_rate": 2.748812759262687e-07, + "loss": 0.0832, + "step": 3140 + }, + { + "epoch": 0.9, + "grad_norm": 0.3581159465712483, + "learning_rate": 2.7337032982850177e-07, + "loss": 0.0552, + "step": 3141 + }, + { + "epoch": 0.9, + "grad_norm": 0.41164379015605307, + "learning_rate": 2.7186343108979106e-07, + "loss": 0.077, + "step": 3142 + }, + { + "epoch": 0.9, + "grad_norm": 0.38996126568477396, + "learning_rate": 2.7036058100047723e-07, + "loss": 0.0579, + "step": 3143 + }, + { + "epoch": 0.9, + "grad_norm": 0.33228438098231394, + "learning_rate": 2.688617808474331e-07, + "loss": 0.0754, + "step": 3144 + }, + { + "epoch": 0.9, + "grad_norm": 0.4607563379599993, + "learning_rate": 2.6736703191406366e-07, + "loss": 0.0854, + "step": 3145 + }, + { + "epoch": 0.9, + "grad_norm": 0.39145788545942767, + "learning_rate": 2.658763354803062e-07, + "loss": 0.0962, + "step": 3146 + }, + { + "epoch": 0.9, + "grad_norm": 0.25668093475194925, + "learning_rate": 2.643896928226275e-07, + "loss": 0.0746, + "step": 3147 + }, + { + "epoch": 0.9, + "grad_norm": 0.46525565113147777, + "learning_rate": 2.62907105214022e-07, + "loss": 0.0799, + "step": 3148 + }, + { + "epoch": 0.9, + "grad_norm": 0.4859299085580294, + "learning_rate": 2.614285739240119e-07, + "loss": 0.0721, + "step": 3149 + }, + { + "epoch": 0.9, + "grad_norm": 0.3738322315883588, + "learning_rate": 2.599541002186479e-07, + "loss": 0.0693, + "step": 3150 + }, + { + "epoch": 0.9, + "grad_norm": 0.4962416550518549, + "learning_rate": 2.5848368536050437e-07, + "loss": 0.1139, + "step": 3151 + }, + { + "epoch": 0.9, + "grad_norm": 0.4352815152144947, + "learning_rate": 2.570173306086804e-07, + "loss": 0.1183, + "step": 3152 + }, + { + "epoch": 0.9, + "grad_norm": 0.22482784421018465, + "learning_rate": 2.5555503721879924e-07, + "loss": 0.0386, + "step": 3153 + }, + { + "epoch": 0.9, + "grad_norm": 0.607644090721368, + "learning_rate": 2.5409680644300716e-07, + "loss": 0.1257, + "step": 3154 + }, + { + "epoch": 0.9, + "grad_norm": 0.752535300456928, + "learning_rate": 2.5264263952996915e-07, + "loss": 0.1057, + "step": 3155 + }, + { + "epoch": 0.9, + "grad_norm": 0.4914630908602349, + "learning_rate": 2.5119253772487137e-07, + "loss": 0.0979, + "step": 3156 + }, + { + "epoch": 0.9, + "grad_norm": 0.7996483933800769, + "learning_rate": 2.497465022694207e-07, + "loss": 0.1053, + "step": 3157 + }, + { + "epoch": 0.9, + "grad_norm": 0.4469903126921449, + "learning_rate": 2.483045344018403e-07, + "loss": 0.1086, + "step": 3158 + }, + { + "epoch": 0.9, + "grad_norm": 0.35171566577316166, + "learning_rate": 2.468666353568705e-07, + "loss": 0.0604, + "step": 3159 + }, + { + "epoch": 0.9, + "grad_norm": 0.24449597579980728, + "learning_rate": 2.4543280636576795e-07, + "loss": 0.0561, + "step": 3160 + }, + { + "epoch": 0.9, + "grad_norm": 0.40197464292987445, + "learning_rate": 2.4400304865630443e-07, + "loss": 0.0779, + "step": 3161 + }, + { + "epoch": 0.9, + "grad_norm": 0.4963289897989669, + "learning_rate": 2.425773634527656e-07, + "loss": 0.1029, + "step": 3162 + }, + { + "epoch": 0.9, + "grad_norm": 0.474085929931268, + "learning_rate": 2.411557519759483e-07, + "loss": 0.0881, + "step": 3163 + }, + { + "epoch": 0.9, + "grad_norm": 0.5514450672530389, + "learning_rate": 2.397382154431621e-07, + "loss": 0.0865, + "step": 3164 + }, + { + "epoch": 0.9, + "grad_norm": 0.2864026441455067, + "learning_rate": 2.3832475506822937e-07, + "loss": 0.0848, + "step": 3165 + }, + { + "epoch": 0.9, + "grad_norm": 0.4852871054394636, + "learning_rate": 2.3691537206147752e-07, + "loss": 0.0837, + "step": 3166 + }, + { + "epoch": 0.9, + "grad_norm": 0.5361997184288021, + "learning_rate": 2.3551006762974615e-07, + "loss": 0.1035, + "step": 3167 + }, + { + "epoch": 0.91, + "grad_norm": 0.3813576245138359, + "learning_rate": 2.3410884297638214e-07, + "loss": 0.0962, + "step": 3168 + }, + { + "epoch": 0.91, + "grad_norm": 0.42105703849591514, + "learning_rate": 2.3271169930123794e-07, + "loss": 0.0903, + "step": 3169 + }, + { + "epoch": 0.91, + "grad_norm": 0.5499075512988395, + "learning_rate": 2.3131863780067043e-07, + "loss": 0.117, + "step": 3170 + }, + { + "epoch": 0.91, + "grad_norm": 0.5624073866287035, + "learning_rate": 2.2992965966754378e-07, + "loss": 0.1106, + "step": 3171 + }, + { + "epoch": 0.91, + "grad_norm": 0.2762019032785361, + "learning_rate": 2.2854476609122324e-07, + "loss": 0.0541, + "step": 3172 + }, + { + "epoch": 0.91, + "grad_norm": 0.3713599896848042, + "learning_rate": 2.2716395825757853e-07, + "loss": 0.1105, + "step": 3173 + }, + { + "epoch": 0.91, + "grad_norm": 0.3275689030994533, + "learning_rate": 2.2578723734897778e-07, + "loss": 0.0701, + "step": 3174 + }, + { + "epoch": 0.91, + "grad_norm": 0.7161842915206217, + "learning_rate": 2.2441460454429298e-07, + "loss": 0.1375, + "step": 3175 + }, + { + "epoch": 0.91, + "grad_norm": 0.7011596949713271, + "learning_rate": 2.230460610188928e-07, + "loss": 0.0902, + "step": 3176 + }, + { + "epoch": 0.91, + "grad_norm": 0.24589768083601782, + "learning_rate": 2.216816079446471e-07, + "loss": 0.0409, + "step": 3177 + }, + { + "epoch": 0.91, + "grad_norm": 0.35005746098384194, + "learning_rate": 2.2032124648992015e-07, + "loss": 0.071, + "step": 3178 + }, + { + "epoch": 0.91, + "grad_norm": 0.28128887155202625, + "learning_rate": 2.189649778195735e-07, + "loss": 0.0515, + "step": 3179 + }, + { + "epoch": 0.91, + "grad_norm": 0.6220102521075701, + "learning_rate": 2.1761280309496645e-07, + "loss": 0.1105, + "step": 3180 + }, + { + "epoch": 0.91, + "grad_norm": 0.5035768917234578, + "learning_rate": 2.1626472347394846e-07, + "loss": 0.1463, + "step": 3181 + }, + { + "epoch": 0.91, + "grad_norm": 0.3079254944344666, + "learning_rate": 2.149207401108666e-07, + "loss": 0.0946, + "step": 3182 + }, + { + "epoch": 0.91, + "grad_norm": 0.35225887804025297, + "learning_rate": 2.1358085415655706e-07, + "loss": 0.0733, + "step": 3183 + }, + { + "epoch": 0.91, + "grad_norm": 0.44659481711958765, + "learning_rate": 2.1224506675835033e-07, + "loss": 0.0954, + "step": 3184 + }, + { + "epoch": 0.91, + "grad_norm": 0.5715611052168519, + "learning_rate": 2.109133790600648e-07, + "loss": 0.0989, + "step": 3185 + }, + { + "epoch": 0.91, + "grad_norm": 0.43700031982172266, + "learning_rate": 2.0958579220200946e-07, + "loss": 0.1155, + "step": 3186 + }, + { + "epoch": 0.91, + "grad_norm": 0.4062292000931909, + "learning_rate": 2.0826230732098273e-07, + "loss": 0.0741, + "step": 3187 + }, + { + "epoch": 0.91, + "grad_norm": 0.42563861553701804, + "learning_rate": 2.0694292555026918e-07, + "loss": 0.0764, + "step": 3188 + }, + { + "epoch": 0.91, + "grad_norm": 0.5406280370247767, + "learning_rate": 2.056276480196401e-07, + "loss": 0.0976, + "step": 3189 + }, + { + "epoch": 0.91, + "grad_norm": 0.3888386079015403, + "learning_rate": 2.043164758553523e-07, + "loss": 0.1083, + "step": 3190 + }, + { + "epoch": 0.91, + "grad_norm": 0.3939488506044353, + "learning_rate": 2.0300941018014775e-07, + "loss": 0.1011, + "step": 3191 + }, + { + "epoch": 0.91, + "grad_norm": 0.5155873758611413, + "learning_rate": 2.0170645211325335e-07, + "loss": 0.095, + "step": 3192 + }, + { + "epoch": 0.91, + "grad_norm": 0.3307401139117292, + "learning_rate": 2.0040760277037497e-07, + "loss": 0.0594, + "step": 3193 + }, + { + "epoch": 0.91, + "grad_norm": 0.5379951042311034, + "learning_rate": 1.9911286326370404e-07, + "loss": 0.1394, + "step": 3194 + }, + { + "epoch": 0.91, + "grad_norm": 0.2784503017561588, + "learning_rate": 1.9782223470191043e-07, + "loss": 0.0627, + "step": 3195 + }, + { + "epoch": 0.91, + "grad_norm": 0.7306195747673293, + "learning_rate": 1.9653571819014504e-07, + "loss": 0.0586, + "step": 3196 + }, + { + "epoch": 0.91, + "grad_norm": 0.3736511783424484, + "learning_rate": 1.952533148300373e-07, + "loss": 0.0736, + "step": 3197 + }, + { + "epoch": 0.91, + "grad_norm": 0.5612025754942825, + "learning_rate": 1.9397502571969372e-07, + "loss": 0.1022, + "step": 3198 + }, + { + "epoch": 0.91, + "grad_norm": 0.5941467998400771, + "learning_rate": 1.9270085195370048e-07, + "loss": 0.1111, + "step": 3199 + }, + { + "epoch": 0.91, + "grad_norm": 0.30722840404377555, + "learning_rate": 1.9143079462311644e-07, + "loss": 0.0764, + "step": 3200 + }, + { + "epoch": 0.91, + "grad_norm": 0.2542069273644538, + "learning_rate": 1.9016485481547775e-07, + "loss": 0.0545, + "step": 3201 + }, + { + "epoch": 0.91, + "grad_norm": 0.2906094176902533, + "learning_rate": 1.88903033614794e-07, + "loss": 0.0542, + "step": 3202 + }, + { + "epoch": 0.92, + "grad_norm": 0.3805554005266994, + "learning_rate": 1.8764533210154866e-07, + "loss": 0.0343, + "step": 3203 + }, + { + "epoch": 0.92, + "grad_norm": 0.47367162811423613, + "learning_rate": 1.8639175135269694e-07, + "loss": 0.0983, + "step": 3204 + }, + { + "epoch": 0.92, + "grad_norm": 0.2976198291144996, + "learning_rate": 1.851422924416657e-07, + "loss": 0.0631, + "step": 3205 + }, + { + "epoch": 0.92, + "grad_norm": 0.6087985301665866, + "learning_rate": 1.838969564383525e-07, + "loss": 0.1004, + "step": 3206 + }, + { + "epoch": 0.92, + "grad_norm": 0.2820442871041826, + "learning_rate": 1.8265574440912482e-07, + "loss": 0.0835, + "step": 3207 + }, + { + "epoch": 0.92, + "grad_norm": 0.3762518008633823, + "learning_rate": 1.8141865741681696e-07, + "loss": 0.0632, + "step": 3208 + }, + { + "epoch": 0.92, + "grad_norm": 0.4503721909351601, + "learning_rate": 1.801856965207338e-07, + "loss": 0.1024, + "step": 3209 + }, + { + "epoch": 0.92, + "grad_norm": 0.4820321119434082, + "learning_rate": 1.789568627766447e-07, + "loss": 0.083, + "step": 3210 + }, + { + "epoch": 0.92, + "grad_norm": 0.5439868678076599, + "learning_rate": 1.7773215723678738e-07, + "loss": 0.0971, + "step": 3211 + }, + { + "epoch": 0.92, + "grad_norm": 0.4855445971609712, + "learning_rate": 1.7651158094986132e-07, + "loss": 0.0758, + "step": 3212 + }, + { + "epoch": 0.92, + "grad_norm": 0.4096660548585762, + "learning_rate": 1.7529513496103322e-07, + "loss": 0.1115, + "step": 3213 + }, + { + "epoch": 0.92, + "grad_norm": 0.3252218780472281, + "learning_rate": 1.7408282031193213e-07, + "loss": 0.1095, + "step": 3214 + }, + { + "epoch": 0.92, + "grad_norm": 0.5576200799213603, + "learning_rate": 1.7287463804064874e-07, + "loss": 0.1319, + "step": 3215 + }, + { + "epoch": 0.92, + "grad_norm": 0.30832358778532887, + "learning_rate": 1.7167058918173552e-07, + "loss": 0.0754, + "step": 3216 + }, + { + "epoch": 0.92, + "grad_norm": 0.3434327836068545, + "learning_rate": 1.7047067476620605e-07, + "loss": 0.0927, + "step": 3217 + }, + { + "epoch": 0.92, + "grad_norm": 0.3785415899616005, + "learning_rate": 1.69274895821534e-07, + "loss": 0.0739, + "step": 3218 + }, + { + "epoch": 0.92, + "grad_norm": 0.40350550111390837, + "learning_rate": 1.6808325337164976e-07, + "loss": 0.0953, + "step": 3219 + }, + { + "epoch": 0.92, + "grad_norm": 0.3168142233658187, + "learning_rate": 1.6689574843694433e-07, + "loss": 0.0615, + "step": 3220 + }, + { + "epoch": 0.92, + "grad_norm": 0.8192219085259064, + "learning_rate": 1.6571238203426431e-07, + "loss": 0.1105, + "step": 3221 + }, + { + "epoch": 0.92, + "grad_norm": 1.2060648279584498, + "learning_rate": 1.6453315517691304e-07, + "loss": 0.1924, + "step": 3222 + }, + { + "epoch": 0.92, + "grad_norm": 0.3242110053967352, + "learning_rate": 1.633580688746489e-07, + "loss": 0.0782, + "step": 3223 + }, + { + "epoch": 0.92, + "grad_norm": 0.3474303083418852, + "learning_rate": 1.6218712413368421e-07, + "loss": 0.0964, + "step": 3224 + }, + { + "epoch": 0.92, + "grad_norm": 0.49573276932793436, + "learning_rate": 1.6102032195668639e-07, + "loss": 0.117, + "step": 3225 + }, + { + "epoch": 0.92, + "grad_norm": 0.5592060687338686, + "learning_rate": 1.5985766334277565e-07, + "loss": 0.0706, + "step": 3226 + }, + { + "epoch": 0.92, + "grad_norm": 0.7275520899795545, + "learning_rate": 1.5869914928752117e-07, + "loss": 0.1233, + "step": 3227 + }, + { + "epoch": 0.92, + "grad_norm": 0.36510226191341333, + "learning_rate": 1.5754478078294667e-07, + "loss": 0.1008, + "step": 3228 + }, + { + "epoch": 0.92, + "grad_norm": 0.4295287535206983, + "learning_rate": 1.563945588175253e-07, + "loss": 0.0951, + "step": 3229 + }, + { + "epoch": 0.92, + "grad_norm": 0.2997562005991779, + "learning_rate": 1.5524848437617757e-07, + "loss": 0.0754, + "step": 3230 + }, + { + "epoch": 0.92, + "grad_norm": 0.2837746237666502, + "learning_rate": 1.5410655844027455e-07, + "loss": 0.0645, + "step": 3231 + }, + { + "epoch": 0.92, + "grad_norm": 0.5188797010105655, + "learning_rate": 1.529687819876341e-07, + "loss": 0.109, + "step": 3232 + }, + { + "epoch": 0.92, + "grad_norm": 0.4537171225884437, + "learning_rate": 1.518351559925224e-07, + "loss": 0.1158, + "step": 3233 + }, + { + "epoch": 0.92, + "grad_norm": 0.4030401708955543, + "learning_rate": 1.5070568142564912e-07, + "loss": 0.0894, + "step": 3234 + }, + { + "epoch": 0.92, + "grad_norm": 0.4316463361203101, + "learning_rate": 1.4958035925417002e-07, + "loss": 0.068, + "step": 3235 + }, + { + "epoch": 0.92, + "grad_norm": 0.4367746398417832, + "learning_rate": 1.4845919044168765e-07, + "loss": 0.1019, + "step": 3236 + }, + { + "epoch": 0.92, + "grad_norm": 0.31041811684435033, + "learning_rate": 1.4734217594824462e-07, + "loss": 0.0764, + "step": 3237 + }, + { + "epoch": 0.93, + "grad_norm": 0.44291887730989216, + "learning_rate": 1.4622931673032802e-07, + "loss": 0.1056, + "step": 3238 + }, + { + "epoch": 0.93, + "grad_norm": 0.5296446008071295, + "learning_rate": 1.4512061374086673e-07, + "loss": 0.0884, + "step": 3239 + }, + { + "epoch": 0.93, + "grad_norm": 0.38103576880475604, + "learning_rate": 1.4401606792923018e-07, + "loss": 0.0823, + "step": 3240 + }, + { + "epoch": 0.93, + "grad_norm": 0.34581749166633186, + "learning_rate": 1.4291568024122848e-07, + "loss": 0.0737, + "step": 3241 + }, + { + "epoch": 0.93, + "grad_norm": 0.4918685641044202, + "learning_rate": 1.418194516191107e-07, + "loss": 0.0802, + "step": 3242 + }, + { + "epoch": 0.93, + "grad_norm": 0.5656988173108841, + "learning_rate": 1.4072738300156542e-07, + "loss": 0.1261, + "step": 3243 + }, + { + "epoch": 0.93, + "grad_norm": 0.5050725988253879, + "learning_rate": 1.3963947532371847e-07, + "loss": 0.0749, + "step": 3244 + }, + { + "epoch": 0.93, + "grad_norm": 0.5657532368454418, + "learning_rate": 1.3855572951713247e-07, + "loss": 0.0827, + "step": 3245 + }, + { + "epoch": 0.93, + "grad_norm": 0.42601363507525336, + "learning_rate": 1.374761465098068e-07, + "loss": 0.0928, + "step": 3246 + }, + { + "epoch": 0.93, + "grad_norm": 0.4157097788132322, + "learning_rate": 1.3640072722617582e-07, + "loss": 0.0989, + "step": 3247 + }, + { + "epoch": 0.93, + "grad_norm": 0.3528585691193063, + "learning_rate": 1.3532947258710905e-07, + "loss": 0.0749, + "step": 3248 + }, + { + "epoch": 0.93, + "grad_norm": 0.3565371330254819, + "learning_rate": 1.342623835099094e-07, + "loss": 0.0968, + "step": 3249 + }, + { + "epoch": 0.93, + "grad_norm": 0.5161753236446334, + "learning_rate": 1.3319946090831372e-07, + "loss": 0.0847, + "step": 3250 + }, + { + "epoch": 0.93, + "grad_norm": 0.46508133632526183, + "learning_rate": 1.3214070569249005e-07, + "loss": 0.0954, + "step": 3251 + }, + { + "epoch": 0.93, + "grad_norm": 0.4372601058548169, + "learning_rate": 1.3108611876903764e-07, + "loss": 0.0775, + "step": 3252 + }, + { + "epoch": 0.93, + "grad_norm": 0.4836003045139583, + "learning_rate": 1.3003570104098807e-07, + "loss": 0.1008, + "step": 3253 + }, + { + "epoch": 0.93, + "grad_norm": 0.4218276298255375, + "learning_rate": 1.2898945340780234e-07, + "loss": 0.0726, + "step": 3254 + }, + { + "epoch": 0.93, + "grad_norm": 0.39651672447591346, + "learning_rate": 1.2794737676536993e-07, + "loss": 0.0666, + "step": 3255 + }, + { + "epoch": 0.93, + "grad_norm": 0.50122331583475, + "learning_rate": 1.2690947200600878e-07, + "loss": 0.0934, + "step": 3256 + }, + { + "epoch": 0.93, + "grad_norm": 0.49216263695395385, + "learning_rate": 1.258757400184657e-07, + "loss": 0.1262, + "step": 3257 + }, + { + "epoch": 0.93, + "grad_norm": 0.29244085203128617, + "learning_rate": 1.2484618168791318e-07, + "loss": 0.059, + "step": 3258 + }, + { + "epoch": 0.93, + "grad_norm": 0.35483072986087194, + "learning_rate": 1.2382079789595046e-07, + "loss": 0.0608, + "step": 3259 + }, + { + "epoch": 0.93, + "grad_norm": 0.47992746531576247, + "learning_rate": 1.2279958952060133e-07, + "loss": 0.0914, + "step": 3260 + }, + { + "epoch": 0.93, + "grad_norm": 0.5153281714533938, + "learning_rate": 1.2178255743631573e-07, + "loss": 0.0934, + "step": 3261 + }, + { + "epoch": 0.93, + "grad_norm": 0.2966278691312279, + "learning_rate": 1.2076970251396593e-07, + "loss": 0.0494, + "step": 3262 + }, + { + "epoch": 0.93, + "grad_norm": 0.4332412938004602, + "learning_rate": 1.1976102562084923e-07, + "loss": 0.0757, + "step": 3263 + }, + { + "epoch": 0.93, + "grad_norm": 0.2894091835267774, + "learning_rate": 1.1875652762068257e-07, + "loss": 0.0809, + "step": 3264 + }, + { + "epoch": 0.93, + "grad_norm": 0.4539431948922057, + "learning_rate": 1.1775620937360677e-07, + "loss": 0.1014, + "step": 3265 + }, + { + "epoch": 0.93, + "grad_norm": 0.4326588252693965, + "learning_rate": 1.1676007173618386e-07, + "loss": 0.0925, + "step": 3266 + }, + { + "epoch": 0.93, + "grad_norm": 0.5796840020659046, + "learning_rate": 1.1576811556139379e-07, + "loss": 0.1243, + "step": 3267 + }, + { + "epoch": 0.93, + "grad_norm": 0.4064585936446287, + "learning_rate": 1.1478034169863761e-07, + "loss": 0.0742, + "step": 3268 + }, + { + "epoch": 0.93, + "grad_norm": 0.7822521092715196, + "learning_rate": 1.1379675099373489e-07, + "loss": 0.108, + "step": 3269 + }, + { + "epoch": 0.93, + "grad_norm": 0.34478155691521306, + "learning_rate": 1.128173442889241e-07, + "loss": 0.0906, + "step": 3270 + }, + { + "epoch": 0.93, + "grad_norm": 0.3409959373970809, + "learning_rate": 1.118421224228583e-07, + "loss": 0.0794, + "step": 3271 + }, + { + "epoch": 0.93, + "grad_norm": 0.7497633233841792, + "learning_rate": 1.1087108623061005e-07, + "loss": 0.1793, + "step": 3272 + }, + { + "epoch": 0.94, + "grad_norm": 0.35876689533953393, + "learning_rate": 1.0990423654366589e-07, + "loss": 0.0905, + "step": 3273 + }, + { + "epoch": 0.94, + "grad_norm": 0.7473414230592106, + "learning_rate": 1.0894157418992913e-07, + "loss": 0.1061, + "step": 3274 + }, + { + "epoch": 0.94, + "grad_norm": 0.5565685234768728, + "learning_rate": 1.0798309999371537e-07, + "loss": 0.0935, + "step": 3275 + }, + { + "epoch": 0.94, + "grad_norm": 0.3876416799604804, + "learning_rate": 1.0702881477575589e-07, + "loss": 0.1132, + "step": 3276 + }, + { + "epoch": 0.94, + "grad_norm": 0.43566974002535763, + "learning_rate": 1.0607871935319424e-07, + "loss": 0.0821, + "step": 3277 + }, + { + "epoch": 0.94, + "grad_norm": 0.4047787715443791, + "learning_rate": 1.0513281453958634e-07, + "loss": 0.0936, + "step": 3278 + }, + { + "epoch": 0.94, + "grad_norm": 0.6282803767499353, + "learning_rate": 1.0419110114489872e-07, + "loss": 0.1171, + "step": 3279 + }, + { + "epoch": 0.94, + "grad_norm": 0.5866470912434046, + "learning_rate": 1.0325357997551133e-07, + "loss": 0.1036, + "step": 3280 + }, + { + "epoch": 0.94, + "grad_norm": 0.3486570549630734, + "learning_rate": 1.0232025183421201e-07, + "loss": 0.05, + "step": 3281 + }, + { + "epoch": 0.94, + "grad_norm": 0.3384301582813744, + "learning_rate": 1.0139111752019815e-07, + "loss": 0.0636, + "step": 3282 + }, + { + "epoch": 0.94, + "grad_norm": 0.3176755796300654, + "learning_rate": 1.004661778290783e-07, + "loss": 0.0698, + "step": 3283 + }, + { + "epoch": 0.94, + "grad_norm": 0.4258889461217338, + "learning_rate": 9.954543355286728e-08, + "loss": 0.0975, + "step": 3284 + }, + { + "epoch": 0.94, + "grad_norm": 0.7024887379112578, + "learning_rate": 9.862888547998828e-08, + "loss": 0.0737, + "step": 3285 + }, + { + "epoch": 0.94, + "grad_norm": 0.4206105340505765, + "learning_rate": 9.771653439527018e-08, + "loss": 0.102, + "step": 3286 + }, + { + "epoch": 0.94, + "grad_norm": 0.4195246951953242, + "learning_rate": 9.680838107994862e-08, + "loss": 0.0756, + "step": 3287 + }, + { + "epoch": 0.94, + "grad_norm": 0.3917028594702229, + "learning_rate": 9.590442631166596e-08, + "loss": 0.083, + "step": 3288 + }, + { + "epoch": 0.94, + "grad_norm": 0.29266001892902305, + "learning_rate": 9.500467086446808e-08, + "loss": 0.0437, + "step": 3289 + }, + { + "epoch": 0.94, + "grad_norm": 0.37412673788755235, + "learning_rate": 9.410911550880474e-08, + "loss": 0.0734, + "step": 3290 + }, + { + "epoch": 0.94, + "grad_norm": 0.46499662507825834, + "learning_rate": 9.321776101153036e-08, + "loss": 0.1064, + "step": 3291 + }, + { + "epoch": 0.94, + "grad_norm": 0.658447741871222, + "learning_rate": 9.2330608135901e-08, + "loss": 0.1305, + "step": 3292 + }, + { + "epoch": 0.94, + "grad_norm": 0.6157612226991941, + "learning_rate": 9.144765764157626e-08, + "loss": 0.1157, + "step": 3293 + }, + { + "epoch": 0.94, + "grad_norm": 0.36526823528116403, + "learning_rate": 9.056891028461634e-08, + "loss": 0.0761, + "step": 3294 + }, + { + "epoch": 0.94, + "grad_norm": 0.3600432633643026, + "learning_rate": 8.969436681748211e-08, + "loss": 0.0747, + "step": 3295 + }, + { + "epoch": 0.94, + "grad_norm": 0.3851054540579154, + "learning_rate": 8.882402798903567e-08, + "loss": 0.0709, + "step": 3296 + }, + { + "epoch": 0.94, + "grad_norm": 0.36745096125280796, + "learning_rate": 8.795789454453862e-08, + "loss": 0.063, + "step": 3297 + }, + { + "epoch": 0.94, + "grad_norm": 0.5655616240414351, + "learning_rate": 8.709596722564995e-08, + "loss": 0.1035, + "step": 3298 + }, + { + "epoch": 0.94, + "grad_norm": 0.2887872178091258, + "learning_rate": 8.623824677042869e-08, + "loss": 0.0686, + "step": 3299 + }, + { + "epoch": 0.94, + "grad_norm": 0.325500457544978, + "learning_rate": 8.53847339133318e-08, + "loss": 0.0864, + "step": 3300 + }, + { + "epoch": 0.94, + "grad_norm": 0.7261985904500424, + "learning_rate": 8.453542938521186e-08, + "loss": 0.0871, + "step": 3301 + }, + { + "epoch": 0.94, + "grad_norm": 0.33562648861250804, + "learning_rate": 8.369033391331827e-08, + "loss": 0.0856, + "step": 3302 + }, + { + "epoch": 0.94, + "grad_norm": 0.45394825171012915, + "learning_rate": 8.284944822129771e-08, + "loss": 0.1497, + "step": 3303 + }, + { + "epoch": 0.94, + "grad_norm": 0.4725440529741108, + "learning_rate": 8.201277302919086e-08, + "loss": 0.0777, + "step": 3304 + }, + { + "epoch": 0.94, + "grad_norm": 0.4171524438210051, + "learning_rate": 8.118030905343244e-08, + "loss": 0.1102, + "step": 3305 + }, + { + "epoch": 0.94, + "grad_norm": 0.4799686894869307, + "learning_rate": 8.035205700685167e-08, + "loss": 0.0815, + "step": 3306 + }, + { + "epoch": 0.94, + "grad_norm": 0.3405687698273351, + "learning_rate": 7.952801759867234e-08, + "loss": 0.069, + "step": 3307 + }, + { + "epoch": 0.95, + "grad_norm": 0.33108709791236074, + "learning_rate": 7.870819153450948e-08, + "loss": 0.0717, + "step": 3308 + }, + { + "epoch": 0.95, + "grad_norm": 0.34672819397009647, + "learning_rate": 7.789257951637097e-08, + "loss": 0.065, + "step": 3309 + }, + { + "epoch": 0.95, + "grad_norm": 0.3926708691791063, + "learning_rate": 7.708118224265538e-08, + "loss": 0.0908, + "step": 3310 + }, + { + "epoch": 0.95, + "grad_norm": 0.3420887632203836, + "learning_rate": 7.627400040815414e-08, + "loss": 0.0867, + "step": 3311 + }, + { + "epoch": 0.95, + "grad_norm": 0.4632687188261814, + "learning_rate": 7.547103470404715e-08, + "loss": 0.0789, + "step": 3312 + }, + { + "epoch": 0.95, + "grad_norm": 0.6222433898869117, + "learning_rate": 7.467228581790442e-08, + "loss": 0.1371, + "step": 3313 + }, + { + "epoch": 0.95, + "grad_norm": 0.246291151164361, + "learning_rate": 7.38777544336855e-08, + "loss": 0.0523, + "step": 3314 + }, + { + "epoch": 0.95, + "grad_norm": 0.3241067907932272, + "learning_rate": 7.308744123174006e-08, + "loss": 0.0698, + "step": 3315 + }, + { + "epoch": 0.95, + "grad_norm": 0.4332113502979993, + "learning_rate": 7.23013468888023e-08, + "loss": 0.059, + "step": 3316 + }, + { + "epoch": 0.95, + "grad_norm": 0.4236873711937894, + "learning_rate": 7.151947207799659e-08, + "loss": 0.0946, + "step": 3317 + }, + { + "epoch": 0.95, + "grad_norm": 0.6952784620458924, + "learning_rate": 7.074181746883402e-08, + "loss": 0.0957, + "step": 3318 + }, + { + "epoch": 0.95, + "grad_norm": 0.28276571411557155, + "learning_rate": 6.996838372721081e-08, + "loss": 0.0656, + "step": 3319 + }, + { + "epoch": 0.95, + "grad_norm": 0.5588409740863747, + "learning_rate": 6.919917151540944e-08, + "loss": 0.1218, + "step": 3320 + }, + { + "epoch": 0.95, + "grad_norm": 0.4612321770491532, + "learning_rate": 6.843418149209746e-08, + "loss": 0.0901, + "step": 3321 + }, + { + "epoch": 0.95, + "grad_norm": 0.4064045617447785, + "learning_rate": 6.7673414312327e-08, + "loss": 0.0931, + "step": 3322 + }, + { + "epoch": 0.95, + "grad_norm": 0.5689550638979906, + "learning_rate": 6.691687062753527e-08, + "loss": 0.0915, + "step": 3323 + }, + { + "epoch": 0.95, + "grad_norm": 0.5663080168611111, + "learning_rate": 6.616455108554077e-08, + "loss": 0.1018, + "step": 3324 + }, + { + "epoch": 0.95, + "grad_norm": 0.8013417354675473, + "learning_rate": 6.54164563305465e-08, + "loss": 0.1508, + "step": 3325 + }, + { + "epoch": 0.95, + "grad_norm": 0.4814055110657226, + "learning_rate": 6.467258700313783e-08, + "loss": 0.0787, + "step": 3326 + }, + { + "epoch": 0.95, + "grad_norm": 0.3147537830167229, + "learning_rate": 6.393294374028191e-08, + "loss": 0.0732, + "step": 3327 + }, + { + "epoch": 0.95, + "grad_norm": 0.2579453432870022, + "learning_rate": 6.319752717532546e-08, + "loss": 0.0722, + "step": 3328 + }, + { + "epoch": 0.95, + "grad_norm": 0.42759856071289837, + "learning_rate": 6.246633793799861e-08, + "loss": 0.0726, + "step": 3329 + }, + { + "epoch": 0.95, + "grad_norm": 0.49898341338214997, + "learning_rate": 6.173937665440943e-08, + "loss": 0.1104, + "step": 3330 + }, + { + "epoch": 0.95, + "grad_norm": 0.6096860816133118, + "learning_rate": 6.101664394704776e-08, + "loss": 0.102, + "step": 3331 + }, + { + "epoch": 0.95, + "grad_norm": 0.5523672671007938, + "learning_rate": 6.029814043478022e-08, + "loss": 0.1123, + "step": 3332 + }, + { + "epoch": 0.95, + "grad_norm": 0.5052568663859954, + "learning_rate": 5.958386673285299e-08, + "loss": 0.1082, + "step": 3333 + }, + { + "epoch": 0.95, + "grad_norm": 0.29911481746438745, + "learning_rate": 5.887382345289183e-08, + "loss": 0.0535, + "step": 3334 + }, + { + "epoch": 0.95, + "grad_norm": 1.0355043715505179, + "learning_rate": 5.816801120289761e-08, + "loss": 0.1142, + "step": 3335 + }, + { + "epoch": 0.95, + "grad_norm": 0.3985300031301009, + "learning_rate": 5.7466430587249654e-08, + "loss": 0.1013, + "step": 3336 + }, + { + "epoch": 0.95, + "grad_norm": 0.3168866278861751, + "learning_rate": 5.676908220670352e-08, + "loss": 0.0695, + "step": 3337 + }, + { + "epoch": 0.95, + "grad_norm": 0.31826716573248615, + "learning_rate": 5.607596665839099e-08, + "loss": 0.0832, + "step": 3338 + }, + { + "epoch": 0.95, + "grad_norm": 0.4231131202459236, + "learning_rate": 5.538708453581787e-08, + "loss": 0.1107, + "step": 3339 + }, + { + "epoch": 0.95, + "grad_norm": 0.5265555752924048, + "learning_rate": 5.47024364288673e-08, + "loss": 0.1225, + "step": 3340 + }, + { + "epoch": 0.95, + "grad_norm": 0.8986658213989515, + "learning_rate": 5.402202292379477e-08, + "loss": 0.1519, + "step": 3341 + }, + { + "epoch": 0.95, + "grad_norm": 0.9789127155337204, + "learning_rate": 5.334584460323089e-08, + "loss": 0.1313, + "step": 3342 + }, + { + "epoch": 0.96, + "grad_norm": 0.49907376320964203, + "learning_rate": 5.267390204617917e-08, + "loss": 0.0862, + "step": 3343 + }, + { + "epoch": 0.96, + "grad_norm": 0.5862843627476348, + "learning_rate": 5.200619582801714e-08, + "loss": 0.1009, + "step": 3344 + }, + { + "epoch": 0.96, + "grad_norm": 0.3247460361294157, + "learning_rate": 5.13427265204941e-08, + "loss": 0.0713, + "step": 3345 + }, + { + "epoch": 0.96, + "grad_norm": 0.5319999254878629, + "learning_rate": 5.068349469173006e-08, + "loss": 0.0985, + "step": 3346 + }, + { + "epoch": 0.96, + "grad_norm": 0.3623403173729547, + "learning_rate": 5.002850090621847e-08, + "loss": 0.0868, + "step": 3347 + }, + { + "epoch": 0.96, + "grad_norm": 0.3339024567836248, + "learning_rate": 4.937774572482235e-08, + "loss": 0.072, + "step": 3348 + }, + { + "epoch": 0.96, + "grad_norm": 0.44323003646036496, + "learning_rate": 4.8731229704777613e-08, + "loss": 0.0858, + "step": 3349 + }, + { + "epoch": 0.96, + "grad_norm": 0.487973680532394, + "learning_rate": 4.808895339968644e-08, + "loss": 0.073, + "step": 3350 + }, + { + "epoch": 0.96, + "grad_norm": 0.37731754414089147, + "learning_rate": 4.7450917359524454e-08, + "loss": 0.0582, + "step": 3351 + }, + { + "epoch": 0.96, + "grad_norm": 0.6750099556148973, + "learning_rate": 4.68171221306335e-08, + "loss": 0.091, + "step": 3352 + }, + { + "epoch": 0.96, + "grad_norm": 0.35522013923132423, + "learning_rate": 4.618756825572612e-08, + "loss": 0.0586, + "step": 3353 + }, + { + "epoch": 0.96, + "grad_norm": 0.31009866016671966, + "learning_rate": 4.556225627388111e-08, + "loss": 0.0742, + "step": 3354 + }, + { + "epoch": 0.96, + "grad_norm": 0.3063728360736794, + "learning_rate": 4.4941186720546257e-08, + "loss": 0.0668, + "step": 3355 + }, + { + "epoch": 0.96, + "grad_norm": 0.4919264632058933, + "learning_rate": 4.4324360127536715e-08, + "loss": 0.1048, + "step": 3356 + }, + { + "epoch": 0.96, + "grad_norm": 0.6002799573453805, + "learning_rate": 4.371177702303386e-08, + "loss": 0.0849, + "step": 3357 + }, + { + "epoch": 0.96, + "grad_norm": 0.43782652600416505, + "learning_rate": 4.310343793158589e-08, + "loss": 0.095, + "step": 3358 + }, + { + "epoch": 0.96, + "grad_norm": 0.7871045398422243, + "learning_rate": 4.2499343374106106e-08, + "loss": 0.057, + "step": 3359 + }, + { + "epoch": 0.96, + "grad_norm": 0.7023890818196845, + "learning_rate": 4.189949386787462e-08, + "loss": 0.1334, + "step": 3360 + }, + { + "epoch": 0.96, + "grad_norm": 0.3359971779330775, + "learning_rate": 4.1303889926534445e-08, + "loss": 0.0651, + "step": 3361 + }, + { + "epoch": 0.96, + "grad_norm": 0.5400735310857169, + "learning_rate": 4.07125320600954e-08, + "loss": 0.1259, + "step": 3362 + }, + { + "epoch": 0.96, + "grad_norm": 0.5405640670288637, + "learning_rate": 4.01254207749302e-08, + "loss": 0.1642, + "step": 3363 + }, + { + "epoch": 0.96, + "grad_norm": 0.5348866870220175, + "learning_rate": 3.9542556573775595e-08, + "loss": 0.1004, + "step": 3364 + }, + { + "epoch": 0.96, + "grad_norm": 0.24910636097863664, + "learning_rate": 3.896393995573178e-08, + "loss": 0.0616, + "step": 3365 + }, + { + "epoch": 0.96, + "grad_norm": 0.366873144328219, + "learning_rate": 3.8389571416260763e-08, + "loss": 0.0898, + "step": 3366 + }, + { + "epoch": 0.96, + "grad_norm": 0.39985874735107874, + "learning_rate": 3.781945144718912e-08, + "loss": 0.1133, + "step": 3367 + }, + { + "epoch": 0.96, + "grad_norm": 0.6632584084143394, + "learning_rate": 3.725358053670247e-08, + "loss": 0.1274, + "step": 3368 + }, + { + "epoch": 0.96, + "grad_norm": 0.3700240104742133, + "learning_rate": 3.669195916935042e-08, + "loss": 0.0742, + "step": 3369 + }, + { + "epoch": 0.96, + "grad_norm": 0.37909204222745296, + "learning_rate": 3.613458782604329e-08, + "loss": 0.1113, + "step": 3370 + }, + { + "epoch": 0.96, + "grad_norm": 0.22199649197537116, + "learning_rate": 3.5581466984051514e-08, + "loss": 0.0443, + "step": 3371 + }, + { + "epoch": 0.96, + "grad_norm": 0.270257742809934, + "learning_rate": 3.5032597117005684e-08, + "loss": 0.0656, + "step": 3372 + }, + { + "epoch": 0.96, + "grad_norm": 0.251242812550788, + "learning_rate": 3.4487978694897615e-08, + "loss": 0.0492, + "step": 3373 + }, + { + "epoch": 0.96, + "grad_norm": 0.3459479416212941, + "learning_rate": 3.394761218407705e-08, + "loss": 0.0942, + "step": 3374 + }, + { + "epoch": 0.96, + "grad_norm": 0.3633909239035125, + "learning_rate": 3.341149804725496e-08, + "loss": 0.071, + "step": 3375 + }, + { + "epoch": 0.96, + "grad_norm": 0.29610327368760175, + "learning_rate": 3.2879636743498036e-08, + "loss": 0.0658, + "step": 3376 + }, + { + "epoch": 0.96, + "grad_norm": 0.5563475551282037, + "learning_rate": 3.235202872823362e-08, + "loss": 0.1119, + "step": 3377 + }, + { + "epoch": 0.97, + "grad_norm": 0.48711414978677237, + "learning_rate": 3.1828674453247e-08, + "loss": 0.0973, + "step": 3378 + }, + { + "epoch": 0.97, + "grad_norm": 0.7148941865718158, + "learning_rate": 3.1309574366680805e-08, + "loss": 0.1406, + "step": 3379 + }, + { + "epoch": 0.97, + "grad_norm": 0.3068520706239714, + "learning_rate": 3.079472891303337e-08, + "loss": 0.104, + "step": 3380 + }, + { + "epoch": 0.97, + "grad_norm": 0.5064547131371717, + "learning_rate": 3.0284138533160924e-08, + "loss": 0.0996, + "step": 3381 + }, + { + "epoch": 0.97, + "grad_norm": 0.3391909003656588, + "learning_rate": 2.977780366427763e-08, + "loss": 0.0849, + "step": 3382 + }, + { + "epoch": 0.97, + "grad_norm": 0.4983722811439082, + "learning_rate": 2.9275724739951107e-08, + "loss": 0.1175, + "step": 3383 + }, + { + "epoch": 0.97, + "grad_norm": 0.36572172944746717, + "learning_rate": 2.8777902190105788e-08, + "loss": 0.0485, + "step": 3384 + }, + { + "epoch": 0.97, + "grad_norm": 0.38564736716257747, + "learning_rate": 2.8284336441021797e-08, + "loss": 0.1195, + "step": 3385 + }, + { + "epoch": 0.97, + "grad_norm": 0.7374921825896716, + "learning_rate": 2.779502791533384e-08, + "loss": 0.1746, + "step": 3386 + }, + { + "epoch": 0.97, + "grad_norm": 0.509760011740364, + "learning_rate": 2.73099770320312e-08, + "loss": 0.0702, + "step": 3387 + }, + { + "epoch": 0.97, + "grad_norm": 0.26007978575729523, + "learning_rate": 2.6829184206457194e-08, + "loss": 0.0854, + "step": 3388 + }, + { + "epoch": 0.97, + "grad_norm": 0.6364074706483942, + "learning_rate": 2.635264985030972e-08, + "loss": 0.0943, + "step": 3389 + }, + { + "epoch": 0.97, + "grad_norm": 0.33450087203858087, + "learning_rate": 2.5880374371639594e-08, + "loss": 0.0835, + "step": 3390 + }, + { + "epoch": 0.97, + "grad_norm": 0.40708895394714223, + "learning_rate": 2.5412358174850547e-08, + "loss": 0.0737, + "step": 3391 + }, + { + "epoch": 0.97, + "grad_norm": 0.43748443437299306, + "learning_rate": 2.4948601660699234e-08, + "loss": 0.1078, + "step": 3392 + }, + { + "epoch": 0.97, + "grad_norm": 0.5448005776903556, + "learning_rate": 2.4489105226295773e-08, + "loss": 0.1102, + "step": 3393 + }, + { + "epoch": 0.97, + "grad_norm": 0.3589647034407713, + "learning_rate": 2.4033869265102095e-08, + "loss": 0.0493, + "step": 3394 + }, + { + "epoch": 0.97, + "grad_norm": 0.4862344371988249, + "learning_rate": 2.358289416693027e-08, + "loss": 0.1112, + "step": 3395 + }, + { + "epoch": 0.97, + "grad_norm": 0.3247753245697565, + "learning_rate": 2.3136180317945845e-08, + "loss": 0.0573, + "step": 3396 + }, + { + "epoch": 0.97, + "grad_norm": 0.6654975992640926, + "learning_rate": 2.2693728100664503e-08, + "loss": 0.1228, + "step": 3397 + }, + { + "epoch": 0.97, + "grad_norm": 0.39955271232783485, + "learning_rate": 2.2255537893953737e-08, + "loss": 0.0932, + "step": 3398 + }, + { + "epoch": 0.97, + "grad_norm": 0.3349268757937497, + "learning_rate": 2.182161007303063e-08, + "loss": 0.0881, + "step": 3399 + }, + { + "epoch": 0.97, + "grad_norm": 0.4246192138527646, + "learning_rate": 2.1391945009461844e-08, + "loss": 0.1012, + "step": 3400 + }, + { + "epoch": 0.97, + "grad_norm": 0.5570969127838759, + "learning_rate": 2.09665430711653e-08, + "loss": 0.0847, + "step": 3401 + }, + { + "epoch": 0.97, + "grad_norm": 0.3703196996109987, + "learning_rate": 2.0545404622407396e-08, + "loss": 0.0733, + "step": 3402 + }, + { + "epoch": 0.97, + "grad_norm": 0.35354514712258345, + "learning_rate": 2.012853002380466e-08, + "loss": 0.0385, + "step": 3403 + }, + { + "epoch": 0.97, + "grad_norm": 0.5707313295190611, + "learning_rate": 1.9715919632322112e-08, + "loss": 0.1097, + "step": 3404 + }, + { + "epoch": 0.97, + "grad_norm": 0.7196503236563584, + "learning_rate": 1.930757380127324e-08, + "loss": 0.1063, + "step": 3405 + }, + { + "epoch": 0.97, + "grad_norm": 0.31695260719600904, + "learning_rate": 1.890349288031945e-08, + "loss": 0.0706, + "step": 3406 + }, + { + "epoch": 0.97, + "grad_norm": 0.5134340109642018, + "learning_rate": 1.8503677215470638e-08, + "loss": 0.0866, + "step": 3407 + }, + { + "epoch": 0.97, + "grad_norm": 0.3469534044839269, + "learning_rate": 1.8108127149085164e-08, + "loss": 0.0907, + "step": 3408 + }, + { + "epoch": 0.97, + "grad_norm": 0.23949055173243536, + "learning_rate": 1.7716843019867646e-08, + "loss": 0.0583, + "step": 3409 + }, + { + "epoch": 0.97, + "grad_norm": 0.47941271748203396, + "learning_rate": 1.7329825162870073e-08, + "loss": 0.0798, + "step": 3410 + }, + { + "epoch": 0.97, + "grad_norm": 0.48442996156809487, + "learning_rate": 1.694707390949124e-08, + "loss": 0.1072, + "step": 3411 + }, + { + "epoch": 0.97, + "grad_norm": 0.45628246575253745, + "learning_rate": 1.656858958747676e-08, + "loss": 0.1119, + "step": 3412 + }, + { + "epoch": 0.98, + "grad_norm": 0.5146618838428757, + "learning_rate": 1.6194372520919044e-08, + "loss": 0.0637, + "step": 3413 + }, + { + "epoch": 0.98, + "grad_norm": 0.4363049530506406, + "learning_rate": 1.5824423030255665e-08, + "loss": 0.0673, + "step": 3414 + }, + { + "epoch": 0.98, + "grad_norm": 0.4012837372101367, + "learning_rate": 1.545874143226933e-08, + "loss": 0.0857, + "step": 3415 + }, + { + "epoch": 0.98, + "grad_norm": 0.45617617905043395, + "learning_rate": 1.509732804009012e-08, + "loss": 0.0702, + "step": 3416 + }, + { + "epoch": 0.98, + "grad_norm": 0.38870183272881115, + "learning_rate": 1.474018316319159e-08, + "loss": 0.0587, + "step": 3417 + }, + { + "epoch": 0.98, + "grad_norm": 0.36078498147593285, + "learning_rate": 1.4387307107393e-08, + "loss": 0.0602, + "step": 3418 + }, + { + "epoch": 0.98, + "grad_norm": 0.6315291875414932, + "learning_rate": 1.4038700174857645e-08, + "loss": 0.1023, + "step": 3419 + }, + { + "epoch": 0.98, + "grad_norm": 0.3796270388545187, + "learning_rate": 1.3694362664094518e-08, + "loss": 0.0815, + "step": 3420 + }, + { + "epoch": 0.98, + "grad_norm": 0.4198817889154841, + "learning_rate": 1.3354294869954987e-08, + "loss": 0.0925, + "step": 3421 + }, + { + "epoch": 0.98, + "grad_norm": 0.34376713163103356, + "learning_rate": 1.3018497083636117e-08, + "loss": 0.0395, + "step": 3422 + }, + { + "epoch": 0.98, + "grad_norm": 0.3443455089540214, + "learning_rate": 1.268696959267679e-08, + "loss": 0.0675, + "step": 3423 + }, + { + "epoch": 0.98, + "grad_norm": 0.5244005996225471, + "learning_rate": 1.2359712680961588e-08, + "loss": 0.0837, + "step": 3424 + }, + { + "epoch": 0.98, + "grad_norm": 0.34067582097236015, + "learning_rate": 1.2036726628715245e-08, + "loss": 0.0595, + "step": 3425 + }, + { + "epoch": 0.98, + "grad_norm": 0.654286644862362, + "learning_rate": 1.1718011712507637e-08, + "loss": 0.1032, + "step": 3426 + }, + { + "epoch": 0.98, + "grad_norm": 0.5809334125193262, + "learning_rate": 1.140356820525157e-08, + "loss": 0.0819, + "step": 3427 + }, + { + "epoch": 0.98, + "grad_norm": 0.41627985970051984, + "learning_rate": 1.1093396376200549e-08, + "loss": 0.1077, + "step": 3428 + }, + { + "epoch": 0.98, + "grad_norm": 0.5133411959970431, + "learning_rate": 1.078749649095101e-08, + "loss": 0.0946, + "step": 3429 + }, + { + "epoch": 0.98, + "grad_norm": 0.32382094063284705, + "learning_rate": 1.0485868811441757e-08, + "loss": 0.0535, + "step": 3430 + }, + { + "epoch": 0.98, + "grad_norm": 0.575475400919534, + "learning_rate": 1.018851359595341e-08, + "loss": 0.0933, + "step": 3431 + }, + { + "epoch": 0.98, + "grad_norm": 0.25770742163761506, + "learning_rate": 9.895431099107845e-09, + "loss": 0.0484, + "step": 3432 + }, + { + "epoch": 0.98, + "grad_norm": 0.6130156592616252, + "learning_rate": 9.606621571867647e-09, + "loss": 0.0832, + "step": 3433 + }, + { + "epoch": 0.98, + "grad_norm": 0.40838636752123786, + "learning_rate": 9.322085261537772e-09, + "loss": 0.079, + "step": 3434 + }, + { + "epoch": 0.98, + "grad_norm": 0.37592866415572707, + "learning_rate": 9.041822411763323e-09, + "loss": 0.0919, + "step": 3435 + }, + { + "epoch": 0.98, + "grad_norm": 0.4348417259620382, + "learning_rate": 8.76583326253011e-09, + "loss": 0.0669, + "step": 3436 + }, + { + "epoch": 0.98, + "grad_norm": 0.49535041246122063, + "learning_rate": 8.494118050164646e-09, + "loss": 0.0851, + "step": 3437 + }, + { + "epoch": 0.98, + "grad_norm": 1.2600484154682323, + "learning_rate": 8.22667700733304e-09, + "loss": 0.1315, + "step": 3438 + }, + { + "epoch": 0.98, + "grad_norm": 0.45761398346885324, + "learning_rate": 7.963510363042103e-09, + "loss": 0.083, + "step": 3439 + }, + { + "epoch": 0.98, + "grad_norm": 0.8751324001451453, + "learning_rate": 7.704618342638804e-09, + "loss": 0.1298, + "step": 3440 + }, + { + "epoch": 0.98, + "grad_norm": 0.5002611858711786, + "learning_rate": 7.450001167809695e-09, + "loss": 0.1023, + "step": 3441 + }, + { + "epoch": 0.98, + "grad_norm": 0.591030695217879, + "learning_rate": 7.199659056579822e-09, + "loss": 0.1224, + "step": 3442 + }, + { + "epoch": 0.98, + "grad_norm": 0.5605203273893826, + "learning_rate": 6.953592223314376e-09, + "loss": 0.0871, + "step": 3443 + }, + { + "epoch": 0.98, + "grad_norm": 0.4037315182990444, + "learning_rate": 6.711800878718144e-09, + "loss": 0.1001, + "step": 3444 + }, + { + "epoch": 0.98, + "grad_norm": 0.5752254954729186, + "learning_rate": 6.4742852298338434e-09, + "loss": 0.1079, + "step": 3445 + }, + { + "epoch": 0.98, + "grad_norm": 0.3958881293253481, + "learning_rate": 6.241045480043229e-09, + "loss": 0.0721, + "step": 3446 + }, + { + "epoch": 0.98, + "grad_norm": 0.39285518812159115, + "learning_rate": 6.012081829067096e-09, + "loss": 0.0866, + "step": 3447 + }, + { + "epoch": 0.99, + "grad_norm": 0.40275387561550535, + "learning_rate": 5.787394472964725e-09, + "loss": 0.1046, + "step": 3448 + }, + { + "epoch": 0.99, + "grad_norm": 0.438490568732715, + "learning_rate": 5.566983604133325e-09, + "loss": 0.0689, + "step": 3449 + }, + { + "epoch": 0.99, + "grad_norm": 0.22473269343661573, + "learning_rate": 5.35084941130748e-09, + "loss": 0.0485, + "step": 3450 + }, + { + "epoch": 0.99, + "grad_norm": 0.6516198767415275, + "learning_rate": 5.138992079561367e-09, + "loss": 0.1205, + "step": 3451 + }, + { + "epoch": 0.99, + "grad_norm": 1.1032766914286642, + "learning_rate": 4.931411790304874e-09, + "loss": 0.1663, + "step": 3452 + }, + { + "epoch": 0.99, + "grad_norm": 0.2583927664869538, + "learning_rate": 4.728108721288038e-09, + "loss": 0.0848, + "step": 3453 + }, + { + "epoch": 0.99, + "grad_norm": 0.3356188968818276, + "learning_rate": 4.5290830465954946e-09, + "loss": 0.0649, + "step": 3454 + }, + { + "epoch": 0.99, + "grad_norm": 0.4091658339723283, + "learning_rate": 4.334334936652029e-09, + "loss": 0.105, + "step": 3455 + }, + { + "epoch": 0.99, + "grad_norm": 0.8466365415029252, + "learning_rate": 4.143864558217026e-09, + "loss": 0.1017, + "step": 3456 + }, + { + "epoch": 0.99, + "grad_norm": 0.31266377817181285, + "learning_rate": 3.957672074388908e-09, + "loss": 0.0763, + "step": 3457 + }, + { + "epoch": 0.99, + "grad_norm": 0.3668851444638233, + "learning_rate": 3.775757644601808e-09, + "loss": 0.1018, + "step": 3458 + }, + { + "epoch": 0.99, + "grad_norm": 0.26861290057771103, + "learning_rate": 3.5981214246266773e-09, + "loss": 0.0509, + "step": 3459 + }, + { + "epoch": 0.99, + "grad_norm": 0.5573965471791562, + "learning_rate": 3.4247635665723977e-09, + "loss": 0.0934, + "step": 3460 + }, + { + "epoch": 0.99, + "grad_norm": 0.5645186381180453, + "learning_rate": 3.255684218882449e-09, + "loss": 0.1307, + "step": 3461 + }, + { + "epoch": 0.99, + "grad_norm": 0.30226112122952675, + "learning_rate": 3.0908835263376845e-09, + "loss": 0.0591, + "step": 3462 + }, + { + "epoch": 0.99, + "grad_norm": 1.2376879396232217, + "learning_rate": 2.930361630055223e-09, + "loss": 0.1214, + "step": 3463 + }, + { + "epoch": 0.99, + "grad_norm": 0.4493319321934206, + "learning_rate": 2.7741186674878905e-09, + "loss": 0.0721, + "step": 3464 + }, + { + "epoch": 0.99, + "grad_norm": 0.26721974000859217, + "learning_rate": 2.6221547724253337e-09, + "loss": 0.0687, + "step": 3465 + }, + { + "epoch": 0.99, + "grad_norm": 0.38897379648915154, + "learning_rate": 2.4744700749923524e-09, + "loss": 0.092, + "step": 3466 + }, + { + "epoch": 0.99, + "grad_norm": 0.5296078663197749, + "learning_rate": 2.3310647016488996e-09, + "loss": 0.124, + "step": 3467 + }, + { + "epoch": 0.99, + "grad_norm": 0.47833822699019957, + "learning_rate": 2.1919387751917485e-09, + "loss": 0.0899, + "step": 3468 + }, + { + "epoch": 0.99, + "grad_norm": 0.4470979908901922, + "learning_rate": 2.057092414753381e-09, + "loss": 0.0723, + "step": 3469 + }, + { + "epoch": 0.99, + "grad_norm": 0.35133914818930667, + "learning_rate": 1.926525735800877e-09, + "loss": 0.0568, + "step": 3470 + }, + { + "epoch": 0.99, + "grad_norm": 0.27584097224744486, + "learning_rate": 1.800238850136471e-09, + "loss": 0.0552, + "step": 3471 + }, + { + "epoch": 0.99, + "grad_norm": 0.3517674209525712, + "learning_rate": 1.6782318658992159e-09, + "loss": 0.0509, + "step": 3472 + }, + { + "epoch": 0.99, + "grad_norm": 0.4149660412637721, + "learning_rate": 1.5605048875610985e-09, + "loss": 0.0923, + "step": 3473 + }, + { + "epoch": 0.99, + "grad_norm": 0.6636339719837038, + "learning_rate": 1.4470580159314795e-09, + "loss": 0.0778, + "step": 3474 + }, + { + "epoch": 0.99, + "grad_norm": 0.5387027081203459, + "learning_rate": 1.3378913481526534e-09, + "loss": 0.0989, + "step": 3475 + }, + { + "epoch": 0.99, + "grad_norm": 0.4298688281989752, + "learning_rate": 1.2330049777037334e-09, + "loss": 0.0847, + "step": 3476 + }, + { + "epoch": 0.99, + "grad_norm": 0.35398138709642324, + "learning_rate": 1.1323989943973213e-09, + "loss": 0.0616, + "step": 3477 + }, + { + "epoch": 0.99, + "grad_norm": 0.6395952838124319, + "learning_rate": 1.0360734843806175e-09, + "loss": 0.0998, + "step": 3478 + }, + { + "epoch": 0.99, + "grad_norm": 0.2491902941704778, + "learning_rate": 9.440285301370865e-10, + "loss": 0.0559, + "step": 3479 + }, + { + "epoch": 0.99, + "grad_norm": 0.4139740029135101, + "learning_rate": 8.562642104831265e-10, + "loss": 0.0547, + "step": 3480 + }, + { + "epoch": 0.99, + "grad_norm": 0.46733585602740035, + "learning_rate": 7.727806005702887e-10, + "loss": 0.0776, + "step": 3481 + }, + { + "epoch": 0.99, + "grad_norm": 0.5162713301266479, + "learning_rate": 6.935777718847237e-10, + "loss": 0.123, + "step": 3482 + }, + { + "epoch": 1.0, + "grad_norm": 0.4656719111645452, + "learning_rate": 6.186557922471803e-10, + "loss": 0.0828, + "step": 3483 + }, + { + "epoch": 1.0, + "grad_norm": 0.4034051607145749, + "learning_rate": 5.480147258118962e-10, + "loss": 0.0617, + "step": 3484 + }, + { + "epoch": 1.0, + "grad_norm": 0.3924679536845503, + "learning_rate": 4.816546330688177e-10, + "loss": 0.0877, + "step": 3485 + }, + { + "epoch": 1.0, + "grad_norm": 0.36789267903815426, + "learning_rate": 4.1957557084082447e-10, + "loss": 0.0727, + "step": 3486 + }, + { + "epoch": 1.0, + "grad_norm": 0.8386319740629019, + "learning_rate": 3.61777592285395e-10, + "loss": 0.1195, + "step": 3487 + }, + { + "epoch": 1.0, + "grad_norm": 1.017152865730572, + "learning_rate": 3.082607468946064e-10, + "loss": 0.0837, + "step": 3488 + }, + { + "epoch": 1.0, + "grad_norm": 0.38688824453684373, + "learning_rate": 2.590250804945793e-10, + "loss": 0.0869, + "step": 3489 + }, + { + "epoch": 1.0, + "grad_norm": 0.4058464346142467, + "learning_rate": 2.1407063524436777e-10, + "loss": 0.0595, + "step": 3490 + }, + { + "epoch": 1.0, + "grad_norm": 0.5868300953204145, + "learning_rate": 1.7339744963873474e-10, + "loss": 0.0709, + "step": 3491 + }, + { + "epoch": 1.0, + "grad_norm": 0.5930557259373533, + "learning_rate": 1.3700555850537645e-10, + "loss": 0.1052, + "step": 3492 + }, + { + "epoch": 1.0, + "grad_norm": 0.4733487558447307, + "learning_rate": 1.0489499300603279e-10, + "loss": 0.1137, + "step": 3493 + }, + { + "epoch": 1.0, + "grad_norm": 0.35835173300491624, + "learning_rate": 7.706578063704228e-11, + "loss": 0.0936, + "step": 3494 + }, + { + "epoch": 1.0, + "grad_norm": 0.3587238751658971, + "learning_rate": 5.351794522823195e-11, + "loss": 0.0553, + "step": 3495 + }, + { + "epoch": 1.0, + "grad_norm": 0.43350933398175695, + "learning_rate": 3.425150694291724e-11, + "loss": 0.0858, + "step": 3496 + }, + { + "epoch": 1.0, + "grad_norm": 0.27176137333516126, + "learning_rate": 1.926648227901229e-11, + "loss": 0.0606, + "step": 3497 + }, + { + "epoch": 1.0, + "grad_norm": 0.9522309126437695, + "learning_rate": 8.562884067919719e-12, + "loss": 0.178, + "step": 3498 + }, + { + "epoch": 1.0, + "grad_norm": 0.27174219401664274, + "learning_rate": 2.140721475085705e-12, + "loss": 0.0647, + "step": 3499 + }, + { + "epoch": 1.0, + "grad_norm": 0.2727162763560935, + "learning_rate": 0.0, + "loss": 0.0498, + "step": 3500 + }, + { + "epoch": 1.0, + "step": 3500, + "total_flos": 133144802129920.0, + "train_loss": 0.10024488587464606, + "train_runtime": 7976.3246, + "train_samples_per_second": 0.878, + "train_steps_per_second": 0.439 + } + ], + "logging_steps": 1.0, + "max_steps": 3500, + "num_input_tokens_seen": 0, + "num_train_epochs": 1, + "save_steps": 50000, + "total_flos": 133144802129920.0, + "train_batch_size": 1, + "trial_name": null, + "trial_params": null +}