diff --git a/single_dataset/gpt4o_conversations/VideoGameBunny_v1_1-Llama-3-8B-V-gpt4o_conversations_dataset_10000_epochs_1_lora/README.md b/single_dataset/gpt4o_conversations/VideoGameBunny_v1_1-Llama-3-8B-V-gpt4o_conversations_dataset_10000_epochs_1_lora/README.md new file mode 100644 index 0000000000000000000000000000000000000000..4f291d8d6fdb82fcff274f83bcc52b834347797c --- /dev/null +++ b/single_dataset/gpt4o_conversations/VideoGameBunny_v1_1-Llama-3-8B-V-gpt4o_conversations_dataset_10000_epochs_1_lora/README.md @@ -0,0 +1,202 @@ +--- +library_name: peft +base_model: ./weights/Bunny-v1_1-Llama-3-8B-V +--- + +# Model Card for Model ID + + + + + +## Model Details + +### Model Description + + + + + +- **Developed by:** [More Information Needed] +- **Funded by [optional]:** [More Information Needed] +- **Shared by [optional]:** [More Information Needed] +- **Model type:** [More Information Needed] +- **Language(s) (NLP):** [More Information Needed] +- **License:** [More Information Needed] +- **Finetuned from model [optional]:** [More Information Needed] + +### Model Sources [optional] + + + +- **Repository:** [More Information Needed] +- **Paper [optional]:** [More Information Needed] +- **Demo [optional]:** [More Information Needed] + +## Uses + + + +### Direct Use + + + +[More Information Needed] + +### Downstream Use [optional] + + + +[More Information Needed] + +### Out-of-Scope Use + + + +[More Information Needed] + +## Bias, Risks, and Limitations + + + +[More Information Needed] + +### Recommendations + + + +Users (both direct and downstream) should be made aware of the risks, biases and limitations of the model. More information needed for further recommendations. + +## How to Get Started with the Model + +Use the code below to get started with the model. + +[More Information Needed] + +## Training Details + +### Training Data + + + +[More Information Needed] + +### Training Procedure + + + +#### Preprocessing [optional] + +[More Information Needed] + + +#### Training Hyperparameters + +- **Training regime:** [More Information Needed] + +#### Speeds, Sizes, Times [optional] + + + +[More Information Needed] + +## Evaluation + + + +### Testing Data, Factors & Metrics + +#### Testing Data + + + +[More Information Needed] + +#### Factors + + + +[More Information Needed] + +#### Metrics + + + +[More Information Needed] + +### Results + +[More Information Needed] + +#### Summary + + + +## Model Examination [optional] + + + +[More Information Needed] + +## Environmental Impact + + + +Carbon emissions can be estimated using the [Machine Learning Impact calculator](https://mlco2.github.io/impact#compute) presented in [Lacoste et al. (2019)](https://arxiv.org/abs/1910.09700). + +- **Hardware Type:** [More Information Needed] +- **Hours used:** [More Information Needed] +- **Cloud Provider:** [More Information Needed] +- **Compute Region:** [More Information Needed] +- **Carbon Emitted:** [More Information Needed] + +## Technical Specifications [optional] + +### Model Architecture and Objective + +[More Information Needed] + +### Compute Infrastructure + +[More Information Needed] + +#### Hardware + +[More Information Needed] + +#### Software + +[More Information Needed] + +## Citation [optional] + + + +**BibTeX:** + +[More Information Needed] + +**APA:** + +[More Information Needed] + +## Glossary [optional] + + + +[More Information Needed] + +## More Information [optional] + +[More Information Needed] + +## Model Card Authors [optional] + +[More Information Needed] + +## Model Card Contact + +[More Information Needed] +### Framework versions + +- PEFT 0.11.1 \ No newline at end of file diff --git a/single_dataset/gpt4o_conversations/VideoGameBunny_v1_1-Llama-3-8B-V-gpt4o_conversations_dataset_10000_epochs_1_lora/adapter_config.json b/single_dataset/gpt4o_conversations/VideoGameBunny_v1_1-Llama-3-8B-V-gpt4o_conversations_dataset_10000_epochs_1_lora/adapter_config.json new file mode 100644 index 0000000000000000000000000000000000000000..592698da998163413a3ef51dc6ad9c7967cb4fb0 --- /dev/null +++ b/single_dataset/gpt4o_conversations/VideoGameBunny_v1_1-Llama-3-8B-V-gpt4o_conversations_dataset_10000_epochs_1_lora/adapter_config.json @@ -0,0 +1,34 @@ +{ + "alpha_pattern": {}, + "auto_mapping": null, + "base_model_name_or_path": "./weights/Bunny-v1_1-Llama-3-8B-V", + "bias": "none", + "fan_in_fan_out": false, + "inference_mode": true, + "init_lora_weights": true, + "layer_replication": null, + "layers_pattern": null, + "layers_to_transform": null, + "loftq_config": {}, + "lora_alpha": 256, + "lora_dropout": 0.1, + "megatron_config": null, + "megatron_core": "megatron.core", + "modules_to_save": null, + "peft_type": "LORA", + "r": 128, + "rank_pattern": {}, + "revision": null, + "target_modules": [ + "gate_proj", + "up_proj", + "q_proj", + "k_proj", + "v_proj", + "down_proj", + "o_proj" + ], + "task_type": "CAUSAL_LM", + "use_dora": false, + "use_rslora": false +} \ No newline at end of file diff --git a/single_dataset/gpt4o_conversations/VideoGameBunny_v1_1-Llama-3-8B-V-gpt4o_conversations_dataset_10000_epochs_1_lora/adapter_model.safetensors b/single_dataset/gpt4o_conversations/VideoGameBunny_v1_1-Llama-3-8B-V-gpt4o_conversations_dataset_10000_epochs_1_lora/adapter_model.safetensors new file mode 100644 index 0000000000000000000000000000000000000000..e538c5ac298c3438a38183b05d406f1246eed4f1 --- /dev/null +++ b/single_dataset/gpt4o_conversations/VideoGameBunny_v1_1-Llama-3-8B-V-gpt4o_conversations_dataset_10000_epochs_1_lora/adapter_model.safetensors @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:ba877586bdd14eeb401b52760ab6fcbfa98f6b86828c47f86282709dce6d6d1f +size 671150064 diff --git a/single_dataset/gpt4o_conversations/VideoGameBunny_v1_1-Llama-3-8B-V-gpt4o_conversations_dataset_10000_epochs_1_lora/config.json b/single_dataset/gpt4o_conversations/VideoGameBunny_v1_1-Llama-3-8B-V-gpt4o_conversations_dataset_10000_epochs_1_lora/config.json new file mode 100644 index 0000000000000000000000000000000000000000..ee7ed86f4ff6f3381a6bb24981934be89aa376e7 --- /dev/null +++ b/single_dataset/gpt4o_conversations/VideoGameBunny_v1_1-Llama-3-8B-V-gpt4o_conversations_dataset_10000_epochs_1_lora/config.json @@ -0,0 +1,45 @@ +{ + "_name_or_path": "./weights/Bunny-v1_1-Llama-3-8B-V", + "architectures": [ + "BunnyLlamaForCausalLM" + ], + "attention_bias": false, + "attention_dropout": 0.0, + "auto_map": { + "AutoConfig": "configuration_bunny_llama.BunnyLlamaConfig", + "AutoModelForCausalLM": "modeling_bunny_llama.BunnyLlamaForCausalLM" + }, + "bos_token_id": 128000, + "continuous_training": false, + "eos_token_id": 128001, + "freeze_mm_mlp_adapter": false, + "hidden_act": "silu", + "hidden_size": 4096, + "image_aspect_ratio": "pad", + "initializer_range": 0.02, + "intermediate_size": 14336, + "max_position_embeddings": 8192, + "mm_hidden_size": 3456, + "mm_projector_lr": null, + "mm_projector_type": "mlp2x_gelu", + "mm_vision_tower": "./weights/siglip-so400m-patch14-384", + "model_type": "bunny-llama", + "num_attention_heads": 32, + "num_hidden_layers": 32, + "num_key_value_heads": 8, + "pretraining_tp": 1, + "rms_norm_eps": 1e-05, + "rope_scaling": null, + "rope_theta": 500000.0, + "tie_word_embeddings": false, + "tokenizer_model_max_length": 2048, + "tokenizer_padding_side": "right", + "torch_dtype": "float16", + "transformers_version": "4.41.2", + "tune_mm_mlp_adapter": false, + "unfreeze_vision_tower": true, + "use_cache": true, + "use_mm_proj": true, + "use_s2": true, + "vocab_size": 128256 +} diff --git a/single_dataset/gpt4o_conversations/VideoGameBunny_v1_1-Llama-3-8B-V-gpt4o_conversations_dataset_10000_epochs_1_lora/non_lora_trainables.bin b/single_dataset/gpt4o_conversations/VideoGameBunny_v1_1-Llama-3-8B-V-gpt4o_conversations_dataset_10000_epochs_1_lora/non_lora_trainables.bin new file mode 100644 index 0000000000000000000000000000000000000000..e452c9fb3dfc074390e686ba821ef6aacd9d015f --- /dev/null +++ b/single_dataset/gpt4o_conversations/VideoGameBunny_v1_1-Llama-3-8B-V-gpt4o_conversations_dataset_10000_epochs_1_lora/non_lora_trainables.bin @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:d03b5e8dc93898c8165f95efc97f379e2efafe1851803ae36808a028dac30494 +size 918507402 diff --git a/single_dataset/gpt4o_conversations/VideoGameBunny_v1_1-Llama-3-8B-V-gpt4o_conversations_dataset_10000_epochs_1_lora/trainer_state.json b/single_dataset/gpt4o_conversations/VideoGameBunny_v1_1-Llama-3-8B-V-gpt4o_conversations_dataset_10000_epochs_1_lora/trainer_state.json new file mode 100644 index 0000000000000000000000000000000000000000..8e9d9a5f1cdd039c6f2c6b128ea87573e9624966 --- /dev/null +++ b/single_dataset/gpt4o_conversations/VideoGameBunny_v1_1-Llama-3-8B-V-gpt4o_conversations_dataset_10000_epochs_1_lora/trainer_state.json @@ -0,0 +1,4417 @@ +{ + "best_metric": null, + "best_model_checkpoint": null, + "epoch": 1.0, + "eval_steps": 500, + "global_step": 625, + "is_hyper_param_search": false, + "is_local_process_zero": true, + "is_world_process_zero": true, + "log_history": [ + { + "epoch": 0.0016, + "grad_norm": 1.0546417621099164, + "learning_rate": 1.0526315789473684e-05, + "loss": 1.3039, + "step": 1 + }, + { + "epoch": 0.0032, + "grad_norm": 0.9830943947720656, + "learning_rate": 2.105263157894737e-05, + "loss": 1.2847, + "step": 2 + }, + { + "epoch": 0.0048, + "grad_norm": 0.9632868978760455, + "learning_rate": 3.157894736842105e-05, + "loss": 1.3293, + "step": 3 + }, + { + "epoch": 0.0064, + "grad_norm": 0.8028815117384435, + "learning_rate": 4.210526315789474e-05, + "loss": 1.2719, + "step": 4 + }, + { + "epoch": 0.008, + "grad_norm": 0.7516661661974123, + "learning_rate": 5.2631578947368424e-05, + "loss": 1.2206, + "step": 5 + }, + { + "epoch": 0.0096, + "grad_norm": 0.8414762671394682, + "learning_rate": 6.31578947368421e-05, + "loss": 1.2424, + "step": 6 + }, + { + "epoch": 0.0112, + "grad_norm": 1.0719234020937745, + "learning_rate": 7.368421052631579e-05, + "loss": 1.2676, + "step": 7 + }, + { + "epoch": 0.0128, + "grad_norm": 1.0134242689859672, + "learning_rate": 8.421052631578948e-05, + "loss": 1.1404, + "step": 8 + }, + { + "epoch": 0.0144, + "grad_norm": 0.8225916239244949, + "learning_rate": 9.473684210526316e-05, + "loss": 1.1376, + "step": 9 + }, + { + "epoch": 0.016, + "grad_norm": 0.7974580027045621, + "learning_rate": 0.00010526315789473685, + "loss": 1.2182, + "step": 10 + }, + { + "epoch": 0.0176, + "grad_norm": 0.704284118137154, + "learning_rate": 0.00011578947368421053, + "loss": 1.1572, + "step": 11 + }, + { + "epoch": 0.0192, + "grad_norm": 0.6777979470385621, + "learning_rate": 0.0001263157894736842, + "loss": 1.1057, + "step": 12 + }, + { + "epoch": 0.0208, + "grad_norm": 0.6789337666737415, + "learning_rate": 0.0001368421052631579, + "loss": 1.1602, + "step": 13 + }, + { + "epoch": 0.0224, + "grad_norm": 0.7721367493509489, + "learning_rate": 0.00014736842105263158, + "loss": 1.2724, + "step": 14 + }, + { + "epoch": 0.024, + "grad_norm": 0.7706380455996645, + "learning_rate": 0.00015789473684210527, + "loss": 1.2377, + "step": 15 + }, + { + "epoch": 0.0256, + "grad_norm": 0.7784545094083802, + "learning_rate": 0.00016842105263157895, + "loss": 1.1413, + "step": 16 + }, + { + "epoch": 0.0272, + "grad_norm": 0.749242634067648, + "learning_rate": 0.00017894736842105264, + "loss": 1.1777, + "step": 17 + }, + { + "epoch": 0.0288, + "grad_norm": 0.7185554696627132, + "learning_rate": 0.00018947368421052632, + "loss": 1.1124, + "step": 18 + }, + { + "epoch": 0.0304, + "grad_norm": 0.6592195155815092, + "learning_rate": 0.0002, + "loss": 1.1949, + "step": 19 + }, + { + "epoch": 0.032, + "grad_norm": 0.631844503497304, + "learning_rate": 0.00019999865623437013, + "loss": 1.2025, + "step": 20 + }, + { + "epoch": 0.0336, + "grad_norm": 0.7375870476144077, + "learning_rate": 0.00019999462497359466, + "loss": 1.1719, + "step": 21 + }, + { + "epoch": 0.0352, + "grad_norm": 0.6787816889277812, + "learning_rate": 0.00019998790632601496, + "loss": 1.2066, + "step": 22 + }, + { + "epoch": 0.0368, + "grad_norm": 0.7576947108984312, + "learning_rate": 0.0001999785004721968, + "loss": 1.2111, + "step": 23 + }, + { + "epoch": 0.0384, + "grad_norm": 0.6636555852178934, + "learning_rate": 0.00019996640766492543, + "loss": 1.1047, + "step": 24 + }, + { + "epoch": 0.04, + "grad_norm": 0.6608205468484, + "learning_rate": 0.00019995162822919883, + "loss": 1.1325, + "step": 25 + }, + { + "epoch": 0.0416, + "grad_norm": 0.6580649276744608, + "learning_rate": 0.00019993416256221895, + "loss": 1.1296, + "step": 26 + }, + { + "epoch": 0.0432, + "grad_norm": 0.6181390277000566, + "learning_rate": 0.00019991401113338104, + "loss": 1.1378, + "step": 27 + }, + { + "epoch": 0.0448, + "grad_norm": 0.6590646065212499, + "learning_rate": 0.00019989117448426108, + "loss": 1.1263, + "step": 28 + }, + { + "epoch": 0.0464, + "grad_norm": 0.6047686546944601, + "learning_rate": 0.00019986565322860115, + "loss": 1.1123, + "step": 29 + }, + { + "epoch": 0.048, + "grad_norm": 0.6035684316870298, + "learning_rate": 0.00019983744805229296, + "loss": 1.1814, + "step": 30 + }, + { + "epoch": 0.0496, + "grad_norm": 0.6126950655591841, + "learning_rate": 0.00019980655971335945, + "loss": 1.0809, + "step": 31 + }, + { + "epoch": 0.0512, + "grad_norm": 0.6234630001312299, + "learning_rate": 0.00019977298904193437, + "loss": 1.1213, + "step": 32 + }, + { + "epoch": 0.0528, + "grad_norm": 0.6623103513703723, + "learning_rate": 0.00019973673694024, + "loss": 1.1816, + "step": 33 + }, + { + "epoch": 0.0544, + "grad_norm": 0.6621332778326454, + "learning_rate": 0.00019969780438256293, + "loss": 1.1123, + "step": 34 + }, + { + "epoch": 0.056, + "grad_norm": 0.6071891514167586, + "learning_rate": 0.0001996561924152278, + "loss": 1.167, + "step": 35 + }, + { + "epoch": 0.0576, + "grad_norm": 0.7156277519435071, + "learning_rate": 0.0001996119021565693, + "loss": 1.1658, + "step": 36 + }, + { + "epoch": 0.0592, + "grad_norm": 0.681368373048502, + "learning_rate": 0.0001995649347969019, + "loss": 1.1174, + "step": 37 + }, + { + "epoch": 0.0608, + "grad_norm": 0.7932939060254909, + "learning_rate": 0.00019951529159848805, + "loss": 1.0908, + "step": 38 + }, + { + "epoch": 0.0624, + "grad_norm": 0.6906987465114542, + "learning_rate": 0.00019946297389550433, + "loss": 1.1616, + "step": 39 + }, + { + "epoch": 0.064, + "grad_norm": 0.6874309548801129, + "learning_rate": 0.00019940798309400526, + "loss": 1.1605, + "step": 40 + }, + { + "epoch": 0.0656, + "grad_norm": 0.6282989911917458, + "learning_rate": 0.0001993503206718859, + "loss": 1.1743, + "step": 41 + }, + { + "epoch": 0.0672, + "grad_norm": 0.6744210571411562, + "learning_rate": 0.00019928998817884182, + "loss": 1.1307, + "step": 42 + }, + { + "epoch": 0.0688, + "grad_norm": 0.7070340415480011, + "learning_rate": 0.00019922698723632767, + "loss": 1.1741, + "step": 43 + }, + { + "epoch": 0.0704, + "grad_norm": 0.6457534010554816, + "learning_rate": 0.00019916131953751342, + "loss": 1.1518, + "step": 44 + }, + { + "epoch": 0.072, + "grad_norm": 0.6965489277838409, + "learning_rate": 0.00019909298684723904, + "loss": 1.2142, + "step": 45 + }, + { + "epoch": 0.0736, + "grad_norm": 0.6654401256010378, + "learning_rate": 0.00019902199100196697, + "loss": 1.1446, + "step": 46 + }, + { + "epoch": 0.0752, + "grad_norm": 0.7663072538177734, + "learning_rate": 0.00019894833390973266, + "loss": 1.1223, + "step": 47 + }, + { + "epoch": 0.0768, + "grad_norm": 0.792057852336374, + "learning_rate": 0.00019887201755009357, + "loss": 1.1632, + "step": 48 + }, + { + "epoch": 0.0784, + "grad_norm": 0.6436782377110332, + "learning_rate": 0.0001987930439740757, + "loss": 1.1619, + "step": 49 + }, + { + "epoch": 0.08, + "grad_norm": 0.6157428135263696, + "learning_rate": 0.00019871141530411853, + "loss": 1.0433, + "step": 50 + }, + { + "epoch": 0.0816, + "grad_norm": 0.6226083999846979, + "learning_rate": 0.0001986271337340182, + "loss": 1.0906, + "step": 51 + }, + { + "epoch": 0.0832, + "grad_norm": 0.6801892578820706, + "learning_rate": 0.00019854020152886814, + "loss": 1.1518, + "step": 52 + }, + { + "epoch": 0.0848, + "grad_norm": 0.7088306631426905, + "learning_rate": 0.0001984506210249986, + "loss": 1.1941, + "step": 53 + }, + { + "epoch": 0.0864, + "grad_norm": 0.6635052206019092, + "learning_rate": 0.00019835839462991361, + "loss": 1.1591, + "step": 54 + }, + { + "epoch": 0.088, + "grad_norm": 0.716943879979395, + "learning_rate": 0.00019826352482222638, + "loss": 1.1428, + "step": 55 + }, + { + "epoch": 0.0896, + "grad_norm": 0.6548821918063852, + "learning_rate": 0.00019816601415159263, + "loss": 1.1312, + "step": 56 + }, + { + "epoch": 0.0912, + "grad_norm": 0.6836426283530023, + "learning_rate": 0.0001980658652386421, + "loss": 1.145, + "step": 57 + }, + { + "epoch": 0.0928, + "grad_norm": 0.6770189268629901, + "learning_rate": 0.00019796308077490817, + "loss": 1.15, + "step": 58 + }, + { + "epoch": 0.0944, + "grad_norm": 0.6351435125678934, + "learning_rate": 0.00019785766352275542, + "loss": 1.1808, + "step": 59 + }, + { + "epoch": 0.096, + "grad_norm": 0.6273220292429621, + "learning_rate": 0.00019774961631530545, + "loss": 1.0943, + "step": 60 + }, + { + "epoch": 0.0976, + "grad_norm": 0.6203846312691548, + "learning_rate": 0.00019763894205636072, + "loss": 1.0967, + "step": 61 + }, + { + "epoch": 0.0992, + "grad_norm": 0.6271331300704993, + "learning_rate": 0.00019752564372032657, + "loss": 1.1334, + "step": 62 + }, + { + "epoch": 0.1008, + "grad_norm": 0.7096674154327739, + "learning_rate": 0.00019740972435213115, + "loss": 1.1947, + "step": 63 + }, + { + "epoch": 0.1024, + "grad_norm": 0.6330759609597704, + "learning_rate": 0.00019729118706714375, + "loss": 1.114, + "step": 64 + }, + { + "epoch": 0.104, + "grad_norm": 0.6100844626495261, + "learning_rate": 0.00019717003505109095, + "loss": 1.144, + "step": 65 + }, + { + "epoch": 0.1056, + "grad_norm": 0.6091114842719405, + "learning_rate": 0.00019704627155997108, + "loss": 1.1652, + "step": 66 + }, + { + "epoch": 0.1072, + "grad_norm": 0.6556844792427164, + "learning_rate": 0.00019691989991996663, + "loss": 1.1038, + "step": 67 + }, + { + "epoch": 0.1088, + "grad_norm": 0.6341993533021986, + "learning_rate": 0.0001967909235273549, + "loss": 1.0707, + "step": 68 + }, + { + "epoch": 0.1104, + "grad_norm": 0.6380101782483953, + "learning_rate": 0.00019665934584841682, + "loss": 1.2002, + "step": 69 + }, + { + "epoch": 0.112, + "grad_norm": 0.6361615932320083, + "learning_rate": 0.00019652517041934356, + "loss": 1.1521, + "step": 70 + }, + { + "epoch": 0.1136, + "grad_norm": 0.6700457625203912, + "learning_rate": 0.00019638840084614182, + "loss": 1.1307, + "step": 71 + }, + { + "epoch": 0.1152, + "grad_norm": 0.6359055014529901, + "learning_rate": 0.00019624904080453655, + "loss": 1.157, + "step": 72 + }, + { + "epoch": 0.1168, + "grad_norm": 0.6123561405755759, + "learning_rate": 0.00019610709403987246, + "loss": 1.1568, + "step": 73 + }, + { + "epoch": 0.1184, + "grad_norm": 0.7014447167671706, + "learning_rate": 0.00019596256436701324, + "loss": 1.1119, + "step": 74 + }, + { + "epoch": 0.12, + "grad_norm": 0.6320209813099228, + "learning_rate": 0.000195815455670239, + "loss": 1.1112, + "step": 75 + }, + { + "epoch": 0.1216, + "grad_norm": 0.6707111589277143, + "learning_rate": 0.00019566577190314197, + "loss": 1.1354, + "step": 76 + }, + { + "epoch": 0.1232, + "grad_norm": 0.7564097536830839, + "learning_rate": 0.0001955135170885202, + "loss": 1.1066, + "step": 77 + }, + { + "epoch": 0.1248, + "grad_norm": 0.6465610953572963, + "learning_rate": 0.00019535869531826937, + "loss": 1.1948, + "step": 78 + }, + { + "epoch": 0.1264, + "grad_norm": 0.6676598076887175, + "learning_rate": 0.00019520131075327298, + "loss": 1.1194, + "step": 79 + }, + { + "epoch": 0.128, + "grad_norm": 0.6000600146282332, + "learning_rate": 0.00019504136762329047, + "loss": 1.0952, + "step": 80 + }, + { + "epoch": 0.1296, + "grad_norm": 0.6002780664917489, + "learning_rate": 0.00019487887022684336, + "loss": 1.1375, + "step": 81 + }, + { + "epoch": 0.1312, + "grad_norm": 0.6568564993107725, + "learning_rate": 0.00019471382293110003, + "loss": 1.1356, + "step": 82 + }, + { + "epoch": 0.1328, + "grad_norm": 0.7141836382280967, + "learning_rate": 0.00019454623017175812, + "loss": 1.1531, + "step": 83 + }, + { + "epoch": 0.1344, + "grad_norm": 0.6419253513780898, + "learning_rate": 0.00019437609645292546, + "loss": 1.0616, + "step": 84 + }, + { + "epoch": 0.136, + "grad_norm": 0.6720406837651764, + "learning_rate": 0.0001942034263469989, + "loss": 1.1541, + "step": 85 + }, + { + "epoch": 0.1376, + "grad_norm": 0.6842157925259716, + "learning_rate": 0.00019402822449454153, + "loss": 1.0981, + "step": 86 + }, + { + "epoch": 0.1392, + "grad_norm": 0.6331702893953941, + "learning_rate": 0.00019385049560415794, + "loss": 1.1176, + "step": 87 + }, + { + "epoch": 0.1408, + "grad_norm": 0.6386889388907796, + "learning_rate": 0.00019367024445236754, + "loss": 1.0773, + "step": 88 + }, + { + "epoch": 0.1424, + "grad_norm": 0.6643129226720837, + "learning_rate": 0.00019348747588347637, + "loss": 1.1026, + "step": 89 + }, + { + "epoch": 0.144, + "grad_norm": 0.6344308328394509, + "learning_rate": 0.00019330219480944694, + "loss": 1.1277, + "step": 90 + }, + { + "epoch": 0.1456, + "grad_norm": 0.7128590119998348, + "learning_rate": 0.00019311440620976597, + "loss": 1.1146, + "step": 91 + }, + { + "epoch": 0.1472, + "grad_norm": 0.6603884903952707, + "learning_rate": 0.0001929241151313108, + "loss": 1.1863, + "step": 92 + }, + { + "epoch": 0.1488, + "grad_norm": 0.7718143043284744, + "learning_rate": 0.00019273132668821364, + "loss": 1.2007, + "step": 93 + }, + { + "epoch": 0.1504, + "grad_norm": 0.6785922138631665, + "learning_rate": 0.00019253604606172417, + "loss": 1.0879, + "step": 94 + }, + { + "epoch": 0.152, + "grad_norm": 0.6223669409606347, + "learning_rate": 0.00019233827850007027, + "loss": 1.1234, + "step": 95 + }, + { + "epoch": 0.1536, + "grad_norm": 0.6916554006299716, + "learning_rate": 0.00019213802931831696, + "loss": 1.143, + "step": 96 + }, + { + "epoch": 0.1552, + "grad_norm": 0.6657548722504808, + "learning_rate": 0.00019193530389822363, + "loss": 1.1172, + "step": 97 + }, + { + "epoch": 0.1568, + "grad_norm": 0.6576599676912572, + "learning_rate": 0.00019173010768809933, + "loss": 1.1701, + "step": 98 + }, + { + "epoch": 0.1584, + "grad_norm": 0.6797676267700059, + "learning_rate": 0.0001915224462026563, + "loss": 1.157, + "step": 99 + }, + { + "epoch": 0.16, + "grad_norm": 0.6298399264123691, + "learning_rate": 0.00019131232502286188, + "loss": 1.0768, + "step": 100 + }, + { + "epoch": 0.1616, + "grad_norm": 0.6724699973574635, + "learning_rate": 0.0001910997497957885, + "loss": 1.1816, + "step": 101 + }, + { + "epoch": 0.1632, + "grad_norm": 0.6453153869164902, + "learning_rate": 0.00019088472623446183, + "loss": 1.1291, + "step": 102 + }, + { + "epoch": 0.1648, + "grad_norm": 0.6535384061885604, + "learning_rate": 0.00019066726011770726, + "loss": 1.1013, + "step": 103 + }, + { + "epoch": 0.1664, + "grad_norm": 0.6911193392100621, + "learning_rate": 0.0001904473572899947, + "loss": 1.2058, + "step": 104 + }, + { + "epoch": 0.168, + "grad_norm": 0.6411169183948418, + "learning_rate": 0.00019022502366128135, + "loss": 1.1346, + "step": 105 + }, + { + "epoch": 0.1696, + "grad_norm": 0.6796989475899463, + "learning_rate": 0.00019000026520685302, + "loss": 1.1171, + "step": 106 + }, + { + "epoch": 0.1712, + "grad_norm": 0.6508768473789491, + "learning_rate": 0.0001897730879671634, + "loss": 1.0918, + "step": 107 + }, + { + "epoch": 0.1728, + "grad_norm": 0.6393050631138721, + "learning_rate": 0.00018954349804767184, + "loss": 1.0671, + "step": 108 + }, + { + "epoch": 0.1744, + "grad_norm": 0.6939986073523474, + "learning_rate": 0.00018931150161867916, + "loss": 1.2096, + "step": 109 + }, + { + "epoch": 0.176, + "grad_norm": 0.6267743426699197, + "learning_rate": 0.00018907710491516199, + "loss": 1.1888, + "step": 110 + }, + { + "epoch": 0.1776, + "grad_norm": 0.6641671795879526, + "learning_rate": 0.0001888403142366049, + "loss": 1.127, + "step": 111 + }, + { + "epoch": 0.1792, + "grad_norm": 0.660607208786974, + "learning_rate": 0.00018860113594683148, + "loss": 1.0923, + "step": 112 + }, + { + "epoch": 0.1808, + "grad_norm": 0.6625425209929494, + "learning_rate": 0.00018835957647383303, + "loss": 1.137, + "step": 113 + }, + { + "epoch": 0.1824, + "grad_norm": 0.6577219507067429, + "learning_rate": 0.00018811564230959588, + "loss": 1.1026, + "step": 114 + }, + { + "epoch": 0.184, + "grad_norm": 0.6657059122075459, + "learning_rate": 0.00018786934000992688, + "loss": 1.1722, + "step": 115 + }, + { + "epoch": 0.1856, + "grad_norm": 0.6502773168148378, + "learning_rate": 0.00018762067619427746, + "loss": 1.0775, + "step": 116 + }, + { + "epoch": 0.1872, + "grad_norm": 0.6723633523053792, + "learning_rate": 0.00018736965754556528, + "loss": 1.1284, + "step": 117 + }, + { + "epoch": 0.1888, + "grad_norm": 0.6507179596192971, + "learning_rate": 0.00018711629080999504, + "loss": 1.0528, + "step": 118 + }, + { + "epoch": 0.1904, + "grad_norm": 3.423947246709581, + "learning_rate": 0.00018686058279687698, + "loss": 1.0908, + "step": 119 + }, + { + "epoch": 0.192, + "grad_norm": 0.6136953582790347, + "learning_rate": 0.00018660254037844388, + "loss": 1.1664, + "step": 120 + }, + { + "epoch": 0.1936, + "grad_norm": 0.6626749371587253, + "learning_rate": 0.00018634217048966637, + "loss": 1.1671, + "step": 121 + }, + { + "epoch": 0.1952, + "grad_norm": 0.640247422131599, + "learning_rate": 0.0001860794801280666, + "loss": 1.0805, + "step": 122 + }, + { + "epoch": 0.1968, + "grad_norm": 0.6126873335303792, + "learning_rate": 0.0001858144763535302, + "loss": 1.1029, + "step": 123 + }, + { + "epoch": 0.1984, + "grad_norm": 0.6160471874719937, + "learning_rate": 0.0001855471662881164, + "loss": 1.1079, + "step": 124 + }, + { + "epoch": 0.2, + "grad_norm": 0.5883564595153882, + "learning_rate": 0.00018527755711586678, + "loss": 1.0703, + "step": 125 + }, + { + "epoch": 0.2016, + "grad_norm": 0.6598253644642209, + "learning_rate": 0.00018500565608261214, + "loss": 1.109, + "step": 126 + }, + { + "epoch": 0.2032, + "grad_norm": 0.6570694604926712, + "learning_rate": 0.00018473147049577774, + "loss": 1.1638, + "step": 127 + }, + { + "epoch": 0.2048, + "grad_norm": 0.6264482254749499, + "learning_rate": 0.00018445500772418697, + "loss": 1.0336, + "step": 128 + }, + { + "epoch": 0.2064, + "grad_norm": 0.717132874781554, + "learning_rate": 0.00018417627519786315, + "loss": 1.1726, + "step": 129 + }, + { + "epoch": 0.208, + "grad_norm": 0.6423661485850787, + "learning_rate": 0.00018389528040783012, + "loss": 1.1354, + "step": 130 + }, + { + "epoch": 0.2096, + "grad_norm": 0.6747010662689331, + "learning_rate": 0.00018361203090591071, + "loss": 1.122, + "step": 131 + }, + { + "epoch": 0.2112, + "grad_norm": 0.6166480705864257, + "learning_rate": 0.00018332653430452376, + "loss": 1.1815, + "step": 132 + }, + { + "epoch": 0.2128, + "grad_norm": 0.6566104502085495, + "learning_rate": 0.00018303879827647975, + "loss": 1.1142, + "step": 133 + }, + { + "epoch": 0.2144, + "grad_norm": 0.7193281722634296, + "learning_rate": 0.00018274883055477436, + "loss": 1.1166, + "step": 134 + }, + { + "epoch": 0.216, + "grad_norm": 0.6676076619134492, + "learning_rate": 0.00018245663893238075, + "loss": 1.1258, + "step": 135 + }, + { + "epoch": 0.2176, + "grad_norm": 0.6312704484126281, + "learning_rate": 0.00018216223126204007, + "loss": 1.0746, + "step": 136 + }, + { + "epoch": 0.2192, + "grad_norm": 0.65951286979931, + "learning_rate": 0.00018186561545605054, + "loss": 1.1657, + "step": 137 + }, + { + "epoch": 0.2208, + "grad_norm": 0.6687293279216957, + "learning_rate": 0.00018156679948605467, + "loss": 1.1179, + "step": 138 + }, + { + "epoch": 0.2224, + "grad_norm": 0.6893653003583647, + "learning_rate": 0.00018126579138282503, + "loss": 1.1553, + "step": 139 + }, + { + "epoch": 0.224, + "grad_norm": 0.6096273872213167, + "learning_rate": 0.0001809625992360485, + "loss": 1.1289, + "step": 140 + }, + { + "epoch": 0.2256, + "grad_norm": 0.6554943252484647, + "learning_rate": 0.00018065723119410884, + "loss": 1.1123, + "step": 141 + }, + { + "epoch": 0.2272, + "grad_norm": 0.667819283221279, + "learning_rate": 0.00018034969546386757, + "loss": 1.1444, + "step": 142 + }, + { + "epoch": 0.2288, + "grad_norm": 0.635370819590291, + "learning_rate": 0.0001800400003104436, + "loss": 1.071, + "step": 143 + }, + { + "epoch": 0.2304, + "grad_norm": 0.6398152170324534, + "learning_rate": 0.00017972815405699103, + "loss": 1.113, + "step": 144 + }, + { + "epoch": 0.232, + "grad_norm": 0.628346246018907, + "learning_rate": 0.00017941416508447536, + "loss": 1.0744, + "step": 145 + }, + { + "epoch": 0.2336, + "grad_norm": 0.6743562786957189, + "learning_rate": 0.0001790980418314484, + "loss": 1.0641, + "step": 146 + }, + { + "epoch": 0.2352, + "grad_norm": 0.6667833948751095, + "learning_rate": 0.00017877979279382135, + "loss": 1.1515, + "step": 147 + }, + { + "epoch": 0.2368, + "grad_norm": 0.6415340689159599, + "learning_rate": 0.0001784594265246366, + "loss": 1.1415, + "step": 148 + }, + { + "epoch": 0.2384, + "grad_norm": 0.6333879262471349, + "learning_rate": 0.0001781369516338378, + "loss": 1.2205, + "step": 149 + }, + { + "epoch": 0.24, + "grad_norm": 0.8344978784124626, + "learning_rate": 0.00017781237678803847, + "loss": 1.1973, + "step": 150 + }, + { + "epoch": 0.2416, + "grad_norm": 0.7222556893403116, + "learning_rate": 0.000177485710710289, + "loss": 1.1104, + "step": 151 + }, + { + "epoch": 0.2432, + "grad_norm": 0.6215683011406403, + "learning_rate": 0.00017715696217984235, + "loss": 1.1403, + "step": 152 + }, + { + "epoch": 0.2448, + "grad_norm": 0.6109772882001075, + "learning_rate": 0.00017682614003191807, + "loss": 1.1608, + "step": 153 + }, + { + "epoch": 0.2464, + "grad_norm": 0.678073310646549, + "learning_rate": 0.00017649325315746478, + "loss": 1.0891, + "step": 154 + }, + { + "epoch": 0.248, + "grad_norm": 0.6889100814206688, + "learning_rate": 0.0001761583105029213, + "loss": 1.1428, + "step": 155 + }, + { + "epoch": 0.2496, + "grad_norm": 0.6149078322374422, + "learning_rate": 0.00017582132106997616, + "loss": 1.0604, + "step": 156 + }, + { + "epoch": 0.2512, + "grad_norm": 0.6213440946326139, + "learning_rate": 0.00017548229391532572, + "loss": 1.0861, + "step": 157 + }, + { + "epoch": 0.2528, + "grad_norm": 0.6355751515769437, + "learning_rate": 0.00017514123815043074, + "loss": 1.1096, + "step": 158 + }, + { + "epoch": 0.2544, + "grad_norm": 0.6009966959973844, + "learning_rate": 0.00017479816294127152, + "loss": 1.1754, + "step": 159 + }, + { + "epoch": 0.256, + "grad_norm": 0.6007441577098858, + "learning_rate": 0.0001744530775081015, + "loss": 1.1204, + "step": 160 + }, + { + "epoch": 0.2576, + "grad_norm": 0.6701014376613954, + "learning_rate": 0.0001741059911251997, + "loss": 1.2303, + "step": 161 + }, + { + "epoch": 0.2592, + "grad_norm": 0.6035855457967132, + "learning_rate": 0.000173756913120621, + "loss": 1.14, + "step": 162 + }, + { + "epoch": 0.2608, + "grad_norm": 0.6709387489080831, + "learning_rate": 0.00017340585287594604, + "loss": 1.0787, + "step": 163 + }, + { + "epoch": 0.2624, + "grad_norm": 0.6836692135768181, + "learning_rate": 0.0001730528198260285, + "loss": 1.0743, + "step": 164 + }, + { + "epoch": 0.264, + "grad_norm": 0.6319277254795637, + "learning_rate": 0.00017269782345874203, + "loss": 1.1492, + "step": 165 + }, + { + "epoch": 0.2656, + "grad_norm": 0.6463831638298415, + "learning_rate": 0.00017234087331472497, + "loss": 1.192, + "step": 166 + }, + { + "epoch": 0.2672, + "grad_norm": 0.7527928715648954, + "learning_rate": 0.00017198197898712404, + "loss": 1.1113, + "step": 167 + }, + { + "epoch": 0.2688, + "grad_norm": 0.6272003062313035, + "learning_rate": 0.00017162115012133643, + "loss": 1.1244, + "step": 168 + }, + { + "epoch": 0.2704, + "grad_norm": 0.6544886461846363, + "learning_rate": 0.00017125839641475072, + "loss": 1.1129, + "step": 169 + }, + { + "epoch": 0.272, + "grad_norm": 0.615466409744761, + "learning_rate": 0.00017089372761648616, + "loss": 1.1855, + "step": 170 + }, + { + "epoch": 0.2736, + "grad_norm": 0.656712861470478, + "learning_rate": 0.00017052715352713075, + "loss": 1.0617, + "step": 171 + }, + { + "epoch": 0.2752, + "grad_norm": 0.646860440733178, + "learning_rate": 0.00017015868399847768, + "loss": 1.1599, + "step": 172 + }, + { + "epoch": 0.2768, + "grad_norm": 0.6211236590895497, + "learning_rate": 0.00016978832893326074, + "loss": 1.154, + "step": 173 + }, + { + "epoch": 0.2784, + "grad_norm": 0.5861204189688349, + "learning_rate": 0.00016941609828488807, + "loss": 1.0823, + "step": 174 + }, + { + "epoch": 0.28, + "grad_norm": 0.6332202970301454, + "learning_rate": 0.0001690420020571747, + "loss": 1.0757, + "step": 175 + }, + { + "epoch": 0.2816, + "grad_norm": 0.6111843241797855, + "learning_rate": 0.0001686660503040737, + "loss": 1.0858, + "step": 176 + }, + { + "epoch": 0.2832, + "grad_norm": 0.5896095181736208, + "learning_rate": 0.00016828825312940592, + "loss": 1.0593, + "step": 177 + }, + { + "epoch": 0.2848, + "grad_norm": 0.6466936019533218, + "learning_rate": 0.0001679086206865886, + "loss": 1.1672, + "step": 178 + }, + { + "epoch": 0.2864, + "grad_norm": 0.6341290170394617, + "learning_rate": 0.00016752716317836229, + "loss": 1.1156, + "step": 179 + }, + { + "epoch": 0.288, + "grad_norm": 0.658258953640711, + "learning_rate": 0.0001671438908565167, + "loss": 1.1042, + "step": 180 + }, + { + "epoch": 0.2896, + "grad_norm": 0.6715237401598323, + "learning_rate": 0.00016675881402161536, + "loss": 1.1207, + "step": 181 + }, + { + "epoch": 0.2912, + "grad_norm": 0.6574781590126316, + "learning_rate": 0.0001663719430227186, + "loss": 1.0561, + "step": 182 + }, + { + "epoch": 0.2928, + "grad_norm": 0.5504766238190208, + "learning_rate": 0.00016598328825710533, + "loss": 1.0991, + "step": 183 + }, + { + "epoch": 0.2944, + "grad_norm": 0.5729568943965282, + "learning_rate": 0.000165592860169994, + "loss": 1.1066, + "step": 184 + }, + { + "epoch": 0.296, + "grad_norm": 0.5807258491260627, + "learning_rate": 0.00016520066925426144, + "loss": 1.0919, + "step": 185 + }, + { + "epoch": 0.2976, + "grad_norm": 0.6583040186838937, + "learning_rate": 0.0001648067260501611, + "loss": 1.1763, + "step": 186 + }, + { + "epoch": 0.2992, + "grad_norm": 0.6214088302578543, + "learning_rate": 0.0001644110411450398, + "loss": 1.0897, + "step": 187 + }, + { + "epoch": 0.3008, + "grad_norm": 0.6118543760188059, + "learning_rate": 0.00016401362517305296, + "loss": 1.0737, + "step": 188 + }, + { + "epoch": 0.3024, + "grad_norm": 0.6450056173797082, + "learning_rate": 0.00016361448881487914, + "loss": 1.1265, + "step": 189 + }, + { + "epoch": 0.304, + "grad_norm": 0.6385339340821453, + "learning_rate": 0.00016321364279743266, + "loss": 1.1025, + "step": 190 + }, + { + "epoch": 0.3056, + "grad_norm": 0.6353393138176362, + "learning_rate": 0.0001628110978935756, + "loss": 1.1319, + "step": 191 + }, + { + "epoch": 0.3072, + "grad_norm": 0.5920780756404599, + "learning_rate": 0.00016240686492182804, + "loss": 1.0875, + "step": 192 + }, + { + "epoch": 0.3088, + "grad_norm": 0.6500357960815165, + "learning_rate": 0.00016200095474607753, + "loss": 1.0852, + "step": 193 + }, + { + "epoch": 0.3104, + "grad_norm": 0.5786362936048618, + "learning_rate": 0.00016159337827528685, + "loss": 1.1304, + "step": 194 + }, + { + "epoch": 0.312, + "grad_norm": 0.6209132265885975, + "learning_rate": 0.0001611841464632011, + "loss": 1.112, + "step": 195 + }, + { + "epoch": 0.3136, + "grad_norm": 0.6611614501557939, + "learning_rate": 0.0001607732703080532, + "loss": 1.1386, + "step": 196 + }, + { + "epoch": 0.3152, + "grad_norm": 0.6291152809195572, + "learning_rate": 0.00016036076085226814, + "loss": 1.0792, + "step": 197 + }, + { + "epoch": 0.3168, + "grad_norm": 0.6270903628216398, + "learning_rate": 0.0001599466291821666, + "loss": 1.0976, + "step": 198 + }, + { + "epoch": 0.3184, + "grad_norm": 0.6267030785334101, + "learning_rate": 0.0001595308864276666, + "loss": 1.0919, + "step": 199 + }, + { + "epoch": 0.32, + "grad_norm": 0.6097295463842708, + "learning_rate": 0.0001591135437619847, + "loss": 1.1311, + "step": 200 + }, + { + "epoch": 0.3216, + "grad_norm": 0.6437389287581425, + "learning_rate": 0.0001586946124013354, + "loss": 1.1136, + "step": 201 + }, + { + "epoch": 0.3232, + "grad_norm": 0.6357551055213047, + "learning_rate": 0.0001582741036046301, + "loss": 1.089, + "step": 202 + }, + { + "epoch": 0.3248, + "grad_norm": 0.5914555081464148, + "learning_rate": 0.00015785202867317407, + "loss": 1.1383, + "step": 203 + }, + { + "epoch": 0.3264, + "grad_norm": 0.5787205177550582, + "learning_rate": 0.00015742839895036305, + "loss": 1.1412, + "step": 204 + }, + { + "epoch": 0.328, + "grad_norm": 0.6841901209326238, + "learning_rate": 0.00015700322582137827, + "loss": 1.1336, + "step": 205 + }, + { + "epoch": 0.3296, + "grad_norm": 0.5917724338215758, + "learning_rate": 0.0001565765207128805, + "loss": 0.9994, + "step": 206 + }, + { + "epoch": 0.3312, + "grad_norm": 0.6292805341854978, + "learning_rate": 0.0001561482950927029, + "loss": 1.0956, + "step": 207 + }, + { + "epoch": 0.3328, + "grad_norm": 0.6219929453367142, + "learning_rate": 0.00015571856046954285, + "loss": 1.0919, + "step": 208 + }, + { + "epoch": 0.3344, + "grad_norm": 0.638377910535444, + "learning_rate": 0.00015528732839265272, + "loss": 1.1532, + "step": 209 + }, + { + "epoch": 0.336, + "grad_norm": 0.675790620281926, + "learning_rate": 0.0001548546104515294, + "loss": 1.2009, + "step": 210 + }, + { + "epoch": 0.3376, + "grad_norm": 0.6402219984777839, + "learning_rate": 0.00015442041827560274, + "loss": 1.1403, + "step": 211 + }, + { + "epoch": 0.3392, + "grad_norm": 0.6063356071315705, + "learning_rate": 0.00015398476353392323, + "loss": 1.137, + "step": 212 + }, + { + "epoch": 0.3408, + "grad_norm": 0.5993748119119959, + "learning_rate": 0.00015354765793484834, + "loss": 1.1048, + "step": 213 + }, + { + "epoch": 0.3424, + "grad_norm": 0.5336280918399235, + "learning_rate": 0.00015310911322572753, + "loss": 1.106, + "step": 214 + }, + { + "epoch": 0.344, + "grad_norm": 0.5683838539145512, + "learning_rate": 0.000152669141192587, + "loss": 1.115, + "step": 215 + }, + { + "epoch": 0.3456, + "grad_norm": 0.6201028777175007, + "learning_rate": 0.00015222775365981273, + "loss": 1.0647, + "step": 216 + }, + { + "epoch": 0.3472, + "grad_norm": 0.5765478124877643, + "learning_rate": 0.00015178496248983254, + "loss": 1.1217, + "step": 217 + }, + { + "epoch": 0.3488, + "grad_norm": 0.6346404807297925, + "learning_rate": 0.00015134077958279765, + "loss": 1.1293, + "step": 218 + }, + { + "epoch": 0.3504, + "grad_norm": 0.6298913981225904, + "learning_rate": 0.00015089521687626243, + "loss": 1.1343, + "step": 219 + }, + { + "epoch": 0.352, + "grad_norm": 0.6041165269569185, + "learning_rate": 0.000150448286344864, + "loss": 1.0756, + "step": 220 + }, + { + "epoch": 0.3536, + "grad_norm": 0.6174615585600135, + "learning_rate": 0.00015000000000000001, + "loss": 1.0934, + "step": 221 + }, + { + "epoch": 0.3552, + "grad_norm": 0.6205122835519024, + "learning_rate": 0.00014955036988950618, + "loss": 1.1435, + "step": 222 + }, + { + "epoch": 0.3568, + "grad_norm": 0.6117008270075854, + "learning_rate": 0.00014909940809733222, + "loss": 1.065, + "step": 223 + }, + { + "epoch": 0.3584, + "grad_norm": 0.6647522480570532, + "learning_rate": 0.00014864712674321734, + "loss": 1.1292, + "step": 224 + }, + { + "epoch": 0.36, + "grad_norm": 0.607403578016957, + "learning_rate": 0.00014819353798236427, + "loss": 1.1379, + "step": 225 + }, + { + "epoch": 0.3616, + "grad_norm": 0.6152761080534418, + "learning_rate": 0.00014773865400511272, + "loss": 1.1079, + "step": 226 + }, + { + "epoch": 0.3632, + "grad_norm": 0.6234030351325274, + "learning_rate": 0.00014728248703661182, + "loss": 0.9997, + "step": 227 + }, + { + "epoch": 0.3648, + "grad_norm": 0.6565232302932521, + "learning_rate": 0.00014682504933649144, + "loss": 1.1298, + "step": 228 + }, + { + "epoch": 0.3664, + "grad_norm": 0.6326307712501449, + "learning_rate": 0.00014636635319853275, + "loss": 1.1287, + "step": 229 + }, + { + "epoch": 0.368, + "grad_norm": 0.6164374251918011, + "learning_rate": 0.00014590641095033787, + "loss": 1.0487, + "step": 230 + }, + { + "epoch": 0.3696, + "grad_norm": 0.5918756231358342, + "learning_rate": 0.00014544523495299842, + "loss": 1.0911, + "step": 231 + }, + { + "epoch": 0.3712, + "grad_norm": 0.6096400457469056, + "learning_rate": 0.0001449828376007636, + "loss": 1.0646, + "step": 232 + }, + { + "epoch": 0.3728, + "grad_norm": 0.613651480583095, + "learning_rate": 0.0001445192313207067, + "loss": 1.0418, + "step": 233 + }, + { + "epoch": 0.3744, + "grad_norm": 0.659552082278267, + "learning_rate": 0.0001440544285723915, + "loss": 1.1091, + "step": 234 + }, + { + "epoch": 0.376, + "grad_norm": 0.5793594287738426, + "learning_rate": 0.00014358844184753712, + "loss": 1.0457, + "step": 235 + }, + { + "epoch": 0.3776, + "grad_norm": 0.6144392237777093, + "learning_rate": 0.00014312128366968243, + "loss": 1.1677, + "step": 236 + }, + { + "epoch": 0.3792, + "grad_norm": 0.592217760483564, + "learning_rate": 0.00014265296659384956, + "loss": 1.1067, + "step": 237 + }, + { + "epoch": 0.3808, + "grad_norm": 0.6025771269774809, + "learning_rate": 0.00014218350320620624, + "loss": 1.1178, + "step": 238 + }, + { + "epoch": 0.3824, + "grad_norm": 0.6215619258358068, + "learning_rate": 0.0001417129061237278, + "loss": 1.1086, + "step": 239 + }, + { + "epoch": 0.384, + "grad_norm": 0.6104099851538662, + "learning_rate": 0.00014124118799385796, + "loss": 1.1642, + "step": 240 + }, + { + "epoch": 0.3856, + "grad_norm": 0.6291772631491427, + "learning_rate": 0.00014076836149416887, + "loss": 1.1268, + "step": 241 + }, + { + "epoch": 0.3872, + "grad_norm": 0.5982744552621517, + "learning_rate": 0.0001402944393320206, + "loss": 1.1979, + "step": 242 + }, + { + "epoch": 0.3888, + "grad_norm": 0.5919791603379914, + "learning_rate": 0.00013981943424421932, + "loss": 1.1137, + "step": 243 + }, + { + "epoch": 0.3904, + "grad_norm": 0.641074296489744, + "learning_rate": 0.00013934335899667527, + "loss": 1.1725, + "step": 244 + }, + { + "epoch": 0.392, + "grad_norm": 0.6416429934908356, + "learning_rate": 0.00013886622638405952, + "loss": 1.1525, + "step": 245 + }, + { + "epoch": 0.3936, + "grad_norm": 0.5887938842762135, + "learning_rate": 0.00013838804922946027, + "loss": 1.135, + "step": 246 + }, + { + "epoch": 0.3952, + "grad_norm": 0.5810396164082346, + "learning_rate": 0.00013790884038403795, + "loss": 1.0408, + "step": 247 + }, + { + "epoch": 0.3968, + "grad_norm": 0.6963592619330519, + "learning_rate": 0.00013742861272668012, + "loss": 1.0093, + "step": 248 + }, + { + "epoch": 0.3984, + "grad_norm": 0.6379142365447, + "learning_rate": 0.00013694737916365517, + "loss": 1.1498, + "step": 249 + }, + { + "epoch": 0.4, + "grad_norm": 0.6362022994521447, + "learning_rate": 0.00013646515262826552, + "loss": 1.1476, + "step": 250 + }, + { + "epoch": 0.4016, + "grad_norm": 0.6277834775561327, + "learning_rate": 0.0001359819460805001, + "loss": 1.0732, + "step": 251 + }, + { + "epoch": 0.4032, + "grad_norm": 0.5846790082739997, + "learning_rate": 0.0001354977725066859, + "loss": 1.131, + "step": 252 + }, + { + "epoch": 0.4048, + "grad_norm": 0.5602623927305204, + "learning_rate": 0.00013501264491913906, + "loss": 1.0225, + "step": 253 + }, + { + "epoch": 0.4064, + "grad_norm": 0.7355716950696458, + "learning_rate": 0.0001345265763558152, + "loss": 1.0296, + "step": 254 + }, + { + "epoch": 0.408, + "grad_norm": 0.6184498365850783, + "learning_rate": 0.00013403957987995882, + "loss": 1.0972, + "step": 255 + }, + { + "epoch": 0.4096, + "grad_norm": 0.6040828388484982, + "learning_rate": 0.0001335516685797525, + "loss": 1.0091, + "step": 256 + }, + { + "epoch": 0.4112, + "grad_norm": 0.6231315760950605, + "learning_rate": 0.00013306285556796495, + "loss": 1.0924, + "step": 257 + }, + { + "epoch": 0.4128, + "grad_norm": 0.5728821740925255, + "learning_rate": 0.00013257315398159864, + "loss": 1.0333, + "step": 258 + }, + { + "epoch": 0.4144, + "grad_norm": 0.5910177934851119, + "learning_rate": 0.00013208257698153677, + "loss": 1.0574, + "step": 259 + }, + { + "epoch": 0.416, + "grad_norm": 0.621552668484609, + "learning_rate": 0.00013159113775218964, + "loss": 1.0344, + "step": 260 + }, + { + "epoch": 0.4176, + "grad_norm": 0.6019941844395278, + "learning_rate": 0.00013109884950114007, + "loss": 1.0063, + "step": 261 + }, + { + "epoch": 0.4192, + "grad_norm": 0.6032092128689924, + "learning_rate": 0.00013060572545878875, + "loss": 1.0428, + "step": 262 + }, + { + "epoch": 0.4208, + "grad_norm": 0.6319555566757853, + "learning_rate": 0.00013011177887799845, + "loss": 1.1705, + "step": 263 + }, + { + "epoch": 0.4224, + "grad_norm": 0.5931098644745313, + "learning_rate": 0.00012961702303373795, + "loss": 1.1681, + "step": 264 + }, + { + "epoch": 0.424, + "grad_norm": 0.608186810341348, + "learning_rate": 0.00012912147122272523, + "loss": 1.0613, + "step": 265 + }, + { + "epoch": 0.4256, + "grad_norm": 0.5925589347790949, + "learning_rate": 0.00012862513676307008, + "loss": 1.0844, + "step": 266 + }, + { + "epoch": 0.4272, + "grad_norm": 0.5772865894737961, + "learning_rate": 0.00012812803299391628, + "loss": 1.0661, + "step": 267 + }, + { + "epoch": 0.4288, + "grad_norm": 0.6921431549054023, + "learning_rate": 0.00012763017327508305, + "loss": 1.1091, + "step": 268 + }, + { + "epoch": 0.4304, + "grad_norm": 0.6123370162409066, + "learning_rate": 0.0001271315709867059, + "loss": 1.1155, + "step": 269 + }, + { + "epoch": 0.432, + "grad_norm": 0.5633139577669003, + "learning_rate": 0.00012663223952887723, + "loss": 1.1665, + "step": 270 + }, + { + "epoch": 0.4336, + "grad_norm": 0.6049090840872673, + "learning_rate": 0.00012613219232128608, + "loss": 1.1739, + "step": 271 + }, + { + "epoch": 0.4352, + "grad_norm": 0.594125413834283, + "learning_rate": 0.00012563144280285741, + "loss": 1.0667, + "step": 272 + }, + { + "epoch": 0.4368, + "grad_norm": 0.5852491344979651, + "learning_rate": 0.00012513000443139112, + "loss": 1.1152, + "step": 273 + }, + { + "epoch": 0.4384, + "grad_norm": 0.6020109661965061, + "learning_rate": 0.00012462789068320017, + "loss": 1.0995, + "step": 274 + }, + { + "epoch": 0.44, + "grad_norm": 0.6231518041147731, + "learning_rate": 0.00012412511505274844, + "loss": 1.001, + "step": 275 + }, + { + "epoch": 0.4416, + "grad_norm": 0.588252719051879, + "learning_rate": 0.00012362169105228826, + "loss": 1.0376, + "step": 276 + }, + { + "epoch": 0.4432, + "grad_norm": 0.6748959311027869, + "learning_rate": 0.000123117632211497, + "loss": 1.1397, + "step": 277 + }, + { + "epoch": 0.4448, + "grad_norm": 0.6253113800905918, + "learning_rate": 0.00012261295207711346, + "loss": 1.0453, + "step": 278 + }, + { + "epoch": 0.4464, + "grad_norm": 0.67648068654321, + "learning_rate": 0.0001221076642125742, + "loss": 0.987, + "step": 279 + }, + { + "epoch": 0.448, + "grad_norm": 0.5690792516136426, + "learning_rate": 0.00012160178219764837, + "loss": 1.0885, + "step": 280 + }, + { + "epoch": 0.4496, + "grad_norm": 0.6676421755003893, + "learning_rate": 0.00012109531962807332, + "loss": 1.0912, + "step": 281 + }, + { + "epoch": 0.4512, + "grad_norm": 0.6051116729347528, + "learning_rate": 0.00012058829011518896, + "loss": 1.0397, + "step": 282 + }, + { + "epoch": 0.4528, + "grad_norm": 0.6228985999554482, + "learning_rate": 0.00012008070728557186, + "loss": 1.131, + "step": 283 + }, + { + "epoch": 0.4544, + "grad_norm": 0.5806907674242645, + "learning_rate": 0.00011957258478066931, + "loss": 1.1041, + "step": 284 + }, + { + "epoch": 0.456, + "grad_norm": 0.6086409424125504, + "learning_rate": 0.00011906393625643244, + "loss": 1.0695, + "step": 285 + }, + { + "epoch": 0.4576, + "grad_norm": 0.5664293471536089, + "learning_rate": 0.00011855477538294935, + "loss": 1.0893, + "step": 286 + }, + { + "epoch": 0.4592, + "grad_norm": 0.5875219108972874, + "learning_rate": 0.00011804511584407763, + "loss": 1.1118, + "step": 287 + }, + { + "epoch": 0.4608, + "grad_norm": 0.5772945706026095, + "learning_rate": 0.00011753497133707679, + "loss": 1.1374, + "step": 288 + }, + { + "epoch": 0.4624, + "grad_norm": 0.6264590951903878, + "learning_rate": 0.00011702435557223987, + "loss": 1.1053, + "step": 289 + }, + { + "epoch": 0.464, + "grad_norm": 0.6172016436142939, + "learning_rate": 0.00011651328227252517, + "loss": 1.0733, + "step": 290 + }, + { + "epoch": 0.4656, + "grad_norm": 0.6019466045224809, + "learning_rate": 0.00011600176517318741, + "loss": 1.1048, + "step": 291 + }, + { + "epoch": 0.4672, + "grad_norm": 0.6114137676866246, + "learning_rate": 0.00011548981802140848, + "loss": 1.0182, + "step": 292 + }, + { + "epoch": 0.4688, + "grad_norm": 0.5807423674819735, + "learning_rate": 0.00011497745457592816, + "loss": 1.0819, + "step": 293 + }, + { + "epoch": 0.4704, + "grad_norm": 0.6156122435693614, + "learning_rate": 0.00011446468860667421, + "loss": 1.0373, + "step": 294 + }, + { + "epoch": 0.472, + "grad_norm": 0.6596210421275496, + "learning_rate": 0.00011395153389439233, + "loss": 1.1059, + "step": 295 + }, + { + "epoch": 0.4736, + "grad_norm": 0.6508873442075375, + "learning_rate": 0.00011343800423027582, + "loss": 1.0745, + "step": 296 + }, + { + "epoch": 0.4752, + "grad_norm": 0.5774941786455217, + "learning_rate": 0.0001129241134155949, + "loss": 1.0395, + "step": 297 + }, + { + "epoch": 0.4768, + "grad_norm": 0.6078892397371244, + "learning_rate": 0.00011240987526132594, + "loss": 1.1172, + "step": 298 + }, + { + "epoch": 0.4784, + "grad_norm": 0.6314929637393192, + "learning_rate": 0.00011189530358778005, + "loss": 1.0749, + "step": 299 + }, + { + "epoch": 0.48, + "grad_norm": 0.6198863199636196, + "learning_rate": 0.00011138041222423177, + "loss": 1.0738, + "step": 300 + }, + { + "epoch": 0.4816, + "grad_norm": 0.5776271114765191, + "learning_rate": 0.00011086521500854745, + "loss": 1.112, + "step": 301 + }, + { + "epoch": 0.4832, + "grad_norm": 0.5839794395278837, + "learning_rate": 0.00011034972578681338, + "loss": 1.0915, + "step": 302 + }, + { + "epoch": 0.4848, + "grad_norm": 0.5752826792204329, + "learning_rate": 0.00010983395841296348, + "loss": 1.0484, + "step": 303 + }, + { + "epoch": 0.4864, + "grad_norm": 0.5816716788845563, + "learning_rate": 0.00010931792674840718, + "loss": 0.9258, + "step": 304 + }, + { + "epoch": 0.488, + "grad_norm": 0.5762376254027969, + "learning_rate": 0.00010880164466165674, + "loss": 1.1193, + "step": 305 + }, + { + "epoch": 0.4896, + "grad_norm": 0.5745368903277284, + "learning_rate": 0.00010828512602795462, + "loss": 1.0653, + "step": 306 + }, + { + "epoch": 0.4912, + "grad_norm": 0.6042074974846938, + "learning_rate": 0.00010776838472890065, + "loss": 1.1138, + "step": 307 + }, + { + "epoch": 0.4928, + "grad_norm": 0.593217562570839, + "learning_rate": 0.00010725143465207867, + "loss": 1.0321, + "step": 308 + }, + { + "epoch": 0.4944, + "grad_norm": 0.5808461181038312, + "learning_rate": 0.00010673428969068364, + "loss": 1.1336, + "step": 309 + }, + { + "epoch": 0.496, + "grad_norm": 0.5887470860927632, + "learning_rate": 0.00010621696374314807, + "loss": 1.1099, + "step": 310 + }, + { + "epoch": 0.4976, + "grad_norm": 0.6498011175723066, + "learning_rate": 0.00010569947071276847, + "loss": 1.1213, + "step": 311 + }, + { + "epoch": 0.4992, + "grad_norm": 0.6128803546990382, + "learning_rate": 0.00010518182450733186, + "loss": 0.9728, + "step": 312 + }, + { + "epoch": 0.5008, + "grad_norm": 0.566857290904936, + "learning_rate": 0.00010466403903874176, + "loss": 1.0272, + "step": 313 + }, + { + "epoch": 0.5024, + "grad_norm": 0.5491573102634107, + "learning_rate": 0.00010414612822264455, + "loss": 1.0219, + "step": 314 + }, + { + "epoch": 0.504, + "grad_norm": 0.5914292628639273, + "learning_rate": 0.00010362810597805526, + "loss": 1.0494, + "step": 315 + }, + { + "epoch": 0.5056, + "grad_norm": 0.5757693535213086, + "learning_rate": 0.0001031099862269837, + "loss": 1.1529, + "step": 316 + }, + { + "epoch": 0.5072, + "grad_norm": 0.5917884927626396, + "learning_rate": 0.00010259178289406011, + "loss": 1.0808, + "step": 317 + }, + { + "epoch": 0.5088, + "grad_norm": 0.6172918542092238, + "learning_rate": 0.00010207350990616107, + "loss": 1.1486, + "step": 318 + }, + { + "epoch": 0.5104, + "grad_norm": 0.601600212336248, + "learning_rate": 0.0001015551811920351, + "loss": 1.0696, + "step": 319 + }, + { + "epoch": 0.512, + "grad_norm": 0.5933069876414877, + "learning_rate": 0.00010103681068192845, + "loss": 1.1544, + "step": 320 + }, + { + "epoch": 0.5136, + "grad_norm": 0.6283424647746162, + "learning_rate": 0.00010051841230721065, + "loss": 1.0795, + "step": 321 + }, + { + "epoch": 0.5152, + "grad_norm": 0.6081103271482113, + "learning_rate": 0.0001, + "loss": 1.0291, + "step": 322 + }, + { + "epoch": 0.5168, + "grad_norm": 0.6162262451659752, + "learning_rate": 9.948158769278939e-05, + "loss": 1.1334, + "step": 323 + }, + { + "epoch": 0.5184, + "grad_norm": 0.539529725843605, + "learning_rate": 9.896318931807155e-05, + "loss": 1.1309, + "step": 324 + }, + { + "epoch": 0.52, + "grad_norm": 0.5845019751186606, + "learning_rate": 9.844481880796491e-05, + "loss": 1.0668, + "step": 325 + }, + { + "epoch": 0.5216, + "grad_norm": 0.5326285077787556, + "learning_rate": 9.792649009383899e-05, + "loss": 1.1243, + "step": 326 + }, + { + "epoch": 0.5232, + "grad_norm": 0.6492335865189207, + "learning_rate": 9.740821710593989e-05, + "loss": 1.0785, + "step": 327 + }, + { + "epoch": 0.5248, + "grad_norm": 0.6196210544055624, + "learning_rate": 9.689001377301633e-05, + "loss": 1.0322, + "step": 328 + }, + { + "epoch": 0.5264, + "grad_norm": 0.5630604678872787, + "learning_rate": 9.637189402194476e-05, + "loss": 1.0002, + "step": 329 + }, + { + "epoch": 0.528, + "grad_norm": 0.5708437509002491, + "learning_rate": 9.585387177735547e-05, + "loss": 1.0903, + "step": 330 + }, + { + "epoch": 0.5296, + "grad_norm": 0.6740980273712052, + "learning_rate": 9.533596096125825e-05, + "loss": 1.1051, + "step": 331 + }, + { + "epoch": 0.5312, + "grad_norm": 0.5766319817458229, + "learning_rate": 9.481817549266817e-05, + "loss": 1.02, + "step": 332 + }, + { + "epoch": 0.5328, + "grad_norm": 0.6806069107667091, + "learning_rate": 9.430052928723153e-05, + "loss": 1.0738, + "step": 333 + }, + { + "epoch": 0.5344, + "grad_norm": 0.5837857855687947, + "learning_rate": 9.378303625685195e-05, + "loss": 1.059, + "step": 334 + }, + { + "epoch": 0.536, + "grad_norm": 0.5840116539912913, + "learning_rate": 9.326571030931637e-05, + "loss": 1.0302, + "step": 335 + }, + { + "epoch": 0.5376, + "grad_norm": 0.6173685917066777, + "learning_rate": 9.274856534792138e-05, + "loss": 1.1018, + "step": 336 + }, + { + "epoch": 0.5392, + "grad_norm": 0.5485781697871722, + "learning_rate": 9.223161527109937e-05, + "loss": 1.0331, + "step": 337 + }, + { + "epoch": 0.5408, + "grad_norm": 0.5414180048504806, + "learning_rate": 9.171487397204539e-05, + "loss": 1.0379, + "step": 338 + }, + { + "epoch": 0.5424, + "grad_norm": 0.58617739354263, + "learning_rate": 9.119835533834331e-05, + "loss": 1.1011, + "step": 339 + }, + { + "epoch": 0.544, + "grad_norm": 0.582820294272527, + "learning_rate": 9.068207325159284e-05, + "loss": 1.1239, + "step": 340 + }, + { + "epoch": 0.5456, + "grad_norm": 0.5495432917639895, + "learning_rate": 9.016604158703654e-05, + "loss": 1.0539, + "step": 341 + }, + { + "epoch": 0.5472, + "grad_norm": 0.5941607713298789, + "learning_rate": 8.965027421318665e-05, + "loss": 1.0581, + "step": 342 + }, + { + "epoch": 0.5488, + "grad_norm": 0.6704769304374639, + "learning_rate": 8.913478499145254e-05, + "loss": 1.0363, + "step": 343 + }, + { + "epoch": 0.5504, + "grad_norm": 0.575579872278294, + "learning_rate": 8.861958777576827e-05, + "loss": 1.0063, + "step": 344 + }, + { + "epoch": 0.552, + "grad_norm": 0.6174123619332044, + "learning_rate": 8.810469641222001e-05, + "loss": 1.0358, + "step": 345 + }, + { + "epoch": 0.5536, + "grad_norm": 0.7765259631194747, + "learning_rate": 8.759012473867407e-05, + "loss": 1.0777, + "step": 346 + }, + { + "epoch": 0.5552, + "grad_norm": 0.59539903150356, + "learning_rate": 8.707588658440511e-05, + "loss": 1.0031, + "step": 347 + }, + { + "epoch": 0.5568, + "grad_norm": 0.5838223468459752, + "learning_rate": 8.656199576972423e-05, + "loss": 1.0088, + "step": 348 + }, + { + "epoch": 0.5584, + "grad_norm": 0.5709288860740186, + "learning_rate": 8.604846610560771e-05, + "loss": 1.071, + "step": 349 + }, + { + "epoch": 0.56, + "grad_norm": 0.616227063456055, + "learning_rate": 8.553531139332582e-05, + "loss": 1.1457, + "step": 350 + }, + { + "epoch": 0.5616, + "grad_norm": 0.5891840856715459, + "learning_rate": 8.502254542407186e-05, + "loss": 1.0601, + "step": 351 + }, + { + "epoch": 0.5632, + "grad_norm": 0.5860896398908301, + "learning_rate": 8.451018197859153e-05, + "loss": 0.9879, + "step": 352 + }, + { + "epoch": 0.5648, + "grad_norm": 0.5706480687216251, + "learning_rate": 8.399823482681262e-05, + "loss": 1.01, + "step": 353 + }, + { + "epoch": 0.5664, + "grad_norm": 0.6075771275273073, + "learning_rate": 8.348671772747487e-05, + "loss": 0.9779, + "step": 354 + }, + { + "epoch": 0.568, + "grad_norm": 0.6116743292999375, + "learning_rate": 8.297564442776014e-05, + "loss": 1.0019, + "step": 355 + }, + { + "epoch": 0.5696, + "grad_norm": 0.9043649158939876, + "learning_rate": 8.246502866292324e-05, + "loss": 1.0332, + "step": 356 + }, + { + "epoch": 0.5712, + "grad_norm": 0.5940214479829944, + "learning_rate": 8.195488415592238e-05, + "loss": 1.074, + "step": 357 + }, + { + "epoch": 0.5728, + "grad_norm": 0.6363560987658411, + "learning_rate": 8.144522461705067e-05, + "loss": 1.0826, + "step": 358 + }, + { + "epoch": 0.5744, + "grad_norm": 0.572695528702038, + "learning_rate": 8.093606374356759e-05, + "loss": 1.0875, + "step": 359 + }, + { + "epoch": 0.576, + "grad_norm": 0.58278597153404, + "learning_rate": 8.042741521933071e-05, + "loss": 1.1093, + "step": 360 + }, + { + "epoch": 0.5776, + "grad_norm": 0.609417586744702, + "learning_rate": 7.991929271442817e-05, + "loss": 1.044, + "step": 361 + }, + { + "epoch": 0.5792, + "grad_norm": 0.5518628421421953, + "learning_rate": 7.941170988481108e-05, + "loss": 1.0395, + "step": 362 + }, + { + "epoch": 0.5808, + "grad_norm": 0.5602503519347858, + "learning_rate": 7.89046803719267e-05, + "loss": 1.0489, + "step": 363 + }, + { + "epoch": 0.5824, + "grad_norm": 0.5635320627455994, + "learning_rate": 7.839821780235168e-05, + "loss": 1.0318, + "step": 364 + }, + { + "epoch": 0.584, + "grad_norm": 0.6425108193841863, + "learning_rate": 7.789233578742582e-05, + "loss": 1.0997, + "step": 365 + }, + { + "epoch": 0.5856, + "grad_norm": 0.5511827643775233, + "learning_rate": 7.738704792288655e-05, + "loss": 1.1024, + "step": 366 + }, + { + "epoch": 0.5872, + "grad_norm": 0.7937945398354243, + "learning_rate": 7.688236778850306e-05, + "loss": 1.0329, + "step": 367 + }, + { + "epoch": 0.5888, + "grad_norm": 0.6158403969166507, + "learning_rate": 7.637830894771175e-05, + "loss": 1.1045, + "step": 368 + }, + { + "epoch": 0.5904, + "grad_norm": 0.5466499351488862, + "learning_rate": 7.587488494725157e-05, + "loss": 1.0115, + "step": 369 + }, + { + "epoch": 0.592, + "grad_norm": 0.5607938509766125, + "learning_rate": 7.537210931679987e-05, + "loss": 1.0598, + "step": 370 + }, + { + "epoch": 0.5936, + "grad_norm": 0.5573994161160054, + "learning_rate": 7.48699955686089e-05, + "loss": 1.0676, + "step": 371 + }, + { + "epoch": 0.5952, + "grad_norm": 0.5957854875114994, + "learning_rate": 7.43685571971426e-05, + "loss": 1.0617, + "step": 372 + }, + { + "epoch": 0.5968, + "grad_norm": 0.8082040631934779, + "learning_rate": 7.386780767871397e-05, + "loss": 1.0027, + "step": 373 + }, + { + "epoch": 0.5984, + "grad_norm": 0.5427800958181495, + "learning_rate": 7.336776047112276e-05, + "loss": 1.0493, + "step": 374 + }, + { + "epoch": 0.6, + "grad_norm": 0.680764698748384, + "learning_rate": 7.286842901329412e-05, + "loss": 1.0509, + "step": 375 + }, + { + "epoch": 0.6016, + "grad_norm": 0.6103536964718081, + "learning_rate": 7.236982672491698e-05, + "loss": 1.0078, + "step": 376 + }, + { + "epoch": 0.6032, + "grad_norm": 0.6156365333216228, + "learning_rate": 7.187196700608373e-05, + "loss": 1.1132, + "step": 377 + }, + { + "epoch": 0.6048, + "grad_norm": 0.571025133284332, + "learning_rate": 7.137486323692995e-05, + "loss": 0.9963, + "step": 378 + }, + { + "epoch": 0.6064, + "grad_norm": 0.5979433321264492, + "learning_rate": 7.087852877727481e-05, + "loss": 1.0295, + "step": 379 + }, + { + "epoch": 0.608, + "grad_norm": 0.5706299903062161, + "learning_rate": 7.038297696626206e-05, + "loss": 1.0061, + "step": 380 + }, + { + "epoch": 0.6096, + "grad_norm": 0.6090765543978603, + "learning_rate": 6.988822112200156e-05, + "loss": 1.0722, + "step": 381 + }, + { + "epoch": 0.6112, + "grad_norm": 0.6495559687818583, + "learning_rate": 6.939427454121128e-05, + "loss": 1.0937, + "step": 382 + }, + { + "epoch": 0.6128, + "grad_norm": 0.5392094274203981, + "learning_rate": 6.890115049885994e-05, + "loss": 1.0045, + "step": 383 + }, + { + "epoch": 0.6144, + "grad_norm": 0.6620081615223117, + "learning_rate": 6.84088622478104e-05, + "loss": 1.1003, + "step": 384 + }, + { + "epoch": 0.616, + "grad_norm": 0.606655196142612, + "learning_rate": 6.791742301846326e-05, + "loss": 1.0994, + "step": 385 + }, + { + "epoch": 0.6176, + "grad_norm": 0.5347660961501969, + "learning_rate": 6.742684601840141e-05, + "loss": 1.078, + "step": 386 + }, + { + "epoch": 0.6192, + "grad_norm": 0.5610082205027388, + "learning_rate": 6.693714443203507e-05, + "loss": 1.1287, + "step": 387 + }, + { + "epoch": 0.6208, + "grad_norm": 0.5929489801715777, + "learning_rate": 6.644833142024751e-05, + "loss": 1.0979, + "step": 388 + }, + { + "epoch": 0.6224, + "grad_norm": 0.5703922937495834, + "learning_rate": 6.59604201200412e-05, + "loss": 1.0169, + "step": 389 + }, + { + "epoch": 0.624, + "grad_norm": 0.6719211475065047, + "learning_rate": 6.547342364418481e-05, + "loss": 1.0311, + "step": 390 + }, + { + "epoch": 0.6256, + "grad_norm": 0.5837439088175624, + "learning_rate": 6.498735508086093e-05, + "loss": 1.0145, + "step": 391 + }, + { + "epoch": 0.6272, + "grad_norm": 0.5840596663526239, + "learning_rate": 6.450222749331414e-05, + "loss": 1.0347, + "step": 392 + }, + { + "epoch": 0.6288, + "grad_norm": 0.5479481912273149, + "learning_rate": 6.40180539194999e-05, + "loss": 1.0433, + "step": 393 + }, + { + "epoch": 0.6304, + "grad_norm": 0.5974273349562468, + "learning_rate": 6.35348473717345e-05, + "loss": 1.0157, + "step": 394 + }, + { + "epoch": 0.632, + "grad_norm": 0.5920150870666145, + "learning_rate": 6.305262083634488e-05, + "loss": 1.0862, + "step": 395 + }, + { + "epoch": 0.6336, + "grad_norm": 0.6040498515191982, + "learning_rate": 6.25713872733199e-05, + "loss": 1.0279, + "step": 396 + }, + { + "epoch": 0.6352, + "grad_norm": 0.6890034751174975, + "learning_rate": 6.209115961596208e-05, + "loss": 1.1037, + "step": 397 + }, + { + "epoch": 0.6368, + "grad_norm": 0.5822406941480194, + "learning_rate": 6.161195077053976e-05, + "loss": 0.9946, + "step": 398 + }, + { + "epoch": 0.6384, + "grad_norm": 0.5545509446925626, + "learning_rate": 6.113377361594049e-05, + "loss": 1.0459, + "step": 399 + }, + { + "epoch": 0.64, + "grad_norm": 0.6478650349952937, + "learning_rate": 6.065664100332478e-05, + "loss": 1.0448, + "step": 400 + }, + { + "epoch": 0.6416, + "grad_norm": 0.6232439056058718, + "learning_rate": 6.018056575578075e-05, + "loss": 1.0418, + "step": 401 + }, + { + "epoch": 0.6432, + "grad_norm": 0.5787774989799778, + "learning_rate": 5.970556066797941e-05, + "loss": 1.1346, + "step": 402 + }, + { + "epoch": 0.6448, + "grad_norm": 0.555604140461821, + "learning_rate": 5.923163850583113e-05, + "loss": 1.0548, + "step": 403 + }, + { + "epoch": 0.6464, + "grad_norm": 0.570207009965997, + "learning_rate": 5.875881200614207e-05, + "loss": 1.04, + "step": 404 + }, + { + "epoch": 0.648, + "grad_norm": 0.5830691553570289, + "learning_rate": 5.828709387627218e-05, + "loss": 1.0194, + "step": 405 + }, + { + "epoch": 0.6496, + "grad_norm": 0.5898401925651079, + "learning_rate": 5.781649679379378e-05, + "loss": 1.0353, + "step": 406 + }, + { + "epoch": 0.6512, + "grad_norm": 0.5281642759595274, + "learning_rate": 5.73470334061505e-05, + "loss": 1.0172, + "step": 407 + }, + { + "epoch": 0.6528, + "grad_norm": 0.6116846301810461, + "learning_rate": 5.687871633031754e-05, + "loss": 0.968, + "step": 408 + }, + { + "epoch": 0.6544, + "grad_norm": 0.5603052935619109, + "learning_rate": 5.6411558152462894e-05, + "loss": 1.0278, + "step": 409 + }, + { + "epoch": 0.656, + "grad_norm": 0.6038247528664656, + "learning_rate": 5.5945571427608526e-05, + "loss": 1.0326, + "step": 410 + }, + { + "epoch": 0.6576, + "grad_norm": 0.5547217826242783, + "learning_rate": 5.54807686792933e-05, + "loss": 0.9975, + "step": 411 + }, + { + "epoch": 0.6592, + "grad_norm": 0.5307670577336794, + "learning_rate": 5.501716239923642e-05, + "loss": 1.0589, + "step": 412 + }, + { + "epoch": 0.6608, + "grad_norm": 0.6247846642988907, + "learning_rate": 5.4554765047001613e-05, + "loss": 0.9786, + "step": 413 + }, + { + "epoch": 0.6624, + "grad_norm": 0.540419514977222, + "learning_rate": 5.4093589049662175e-05, + "loss": 0.9571, + "step": 414 + }, + { + "epoch": 0.664, + "grad_norm": 0.5436204368876351, + "learning_rate": 5.363364680146725e-05, + "loss": 0.9559, + "step": 415 + }, + { + "epoch": 0.6656, + "grad_norm": 0.5634354388890752, + "learning_rate": 5.31749506635086e-05, + "loss": 1.0899, + "step": 416 + }, + { + "epoch": 0.6672, + "grad_norm": 0.5373452984019687, + "learning_rate": 5.271751296338823e-05, + "loss": 1.0402, + "step": 417 + }, + { + "epoch": 0.6688, + "grad_norm": 0.5494376365525082, + "learning_rate": 5.226134599488728e-05, + "loss": 1.075, + "step": 418 + }, + { + "epoch": 0.6704, + "grad_norm": 0.5455359002472352, + "learning_rate": 5.180646201763577e-05, + "loss": 1.0315, + "step": 419 + }, + { + "epoch": 0.672, + "grad_norm": 0.5499899020824084, + "learning_rate": 5.135287325678271e-05, + "loss": 0.9862, + "step": 420 + }, + { + "epoch": 0.6736, + "grad_norm": 0.567553536918042, + "learning_rate": 5.090059190266779e-05, + "loss": 0.9562, + "step": 421 + }, + { + "epoch": 0.6752, + "grad_norm": 0.5762050281775415, + "learning_rate": 5.0449630110493836e-05, + "loss": 0.9938, + "step": 422 + }, + { + "epoch": 0.6768, + "grad_norm": 0.587322193307677, + "learning_rate": 5.000000000000002e-05, + "loss": 1.0145, + "step": 423 + }, + { + "epoch": 0.6784, + "grad_norm": 0.5642512856058488, + "learning_rate": 4.955171365513603e-05, + "loss": 1.1282, + "step": 424 + }, + { + "epoch": 0.68, + "grad_norm": 0.5827168848287619, + "learning_rate": 4.9104783123737566e-05, + "loss": 1.0657, + "step": 425 + }, + { + "epoch": 0.6816, + "grad_norm": 0.6157879835028639, + "learning_rate": 4.865922041720239e-05, + "loss": 1.1517, + "step": 426 + }, + { + "epoch": 0.6832, + "grad_norm": 0.5753811424331225, + "learning_rate": 4.821503751016746e-05, + "loss": 1.0513, + "step": 427 + }, + { + "epoch": 0.6848, + "grad_norm": 0.5788251938305143, + "learning_rate": 4.777224634018732e-05, + "loss": 1.0326, + "step": 428 + }, + { + "epoch": 0.6864, + "grad_norm": 0.6001375240542786, + "learning_rate": 4.733085880741301e-05, + "loss": 1.1317, + "step": 429 + }, + { + "epoch": 0.688, + "grad_norm": 0.5314931616369382, + "learning_rate": 4.689088677427249e-05, + "loss": 1.1151, + "step": 430 + }, + { + "epoch": 0.6896, + "grad_norm": 0.6037116023113848, + "learning_rate": 4.645234206515171e-05, + "loss": 1.0136, + "step": 431 + }, + { + "epoch": 0.6912, + "grad_norm": 0.5529077977630005, + "learning_rate": 4.6015236466076747e-05, + "loss": 1.0196, + "step": 432 + }, + { + "epoch": 0.6928, + "grad_norm": 0.5322658689804729, + "learning_rate": 4.5579581724397255e-05, + "loss": 0.986, + "step": 433 + }, + { + "epoch": 0.6944, + "grad_norm": 0.5832545717855004, + "learning_rate": 4.514538954847064e-05, + "loss": 1.0697, + "step": 434 + }, + { + "epoch": 0.696, + "grad_norm": 0.5459373413249836, + "learning_rate": 4.471267160734731e-05, + "loss": 1.0586, + "step": 435 + }, + { + "epoch": 0.6976, + "grad_norm": 0.612102133853446, + "learning_rate": 4.428143953045717e-05, + "loss": 0.8879, + "step": 436 + }, + { + "epoch": 0.6992, + "grad_norm": 0.5271492444349947, + "learning_rate": 4.385170490729712e-05, + "loss": 1.052, + "step": 437 + }, + { + "epoch": 0.7008, + "grad_norm": 0.5372955655329454, + "learning_rate": 4.342347928711953e-05, + "loss": 1.0402, + "step": 438 + }, + { + "epoch": 0.7024, + "grad_norm": 0.6969492636482697, + "learning_rate": 4.2996774178621736e-05, + "loss": 1.036, + "step": 439 + }, + { + "epoch": 0.704, + "grad_norm": 0.6084476972576491, + "learning_rate": 4.257160104963696e-05, + "loss": 1.0911, + "step": 440 + }, + { + "epoch": 0.7056, + "grad_norm": 0.5707168715124262, + "learning_rate": 4.2147971326825966e-05, + "loss": 1.123, + "step": 441 + }, + { + "epoch": 0.7072, + "grad_norm": 0.5308859435568433, + "learning_rate": 4.172589639536991e-05, + "loss": 0.9807, + "step": 442 + }, + { + "epoch": 0.7088, + "grad_norm": 0.5531547327344877, + "learning_rate": 4.130538759866457e-05, + "loss": 1.0088, + "step": 443 + }, + { + "epoch": 0.7104, + "grad_norm": 0.5617715008622371, + "learning_rate": 4.088645623801534e-05, + "loss": 1.0015, + "step": 444 + }, + { + "epoch": 0.712, + "grad_norm": 0.5831603063758195, + "learning_rate": 4.046911357233343e-05, + "loss": 1.0486, + "step": 445 + }, + { + "epoch": 0.7136, + "grad_norm": 0.5453015306649112, + "learning_rate": 4.00533708178334e-05, + "loss": 1.1414, + "step": 446 + }, + { + "epoch": 0.7152, + "grad_norm": 0.5467213506991419, + "learning_rate": 3.963923914773187e-05, + "loss": 1.0077, + "step": 447 + }, + { + "epoch": 0.7168, + "grad_norm": 0.5838670052403234, + "learning_rate": 3.922672969194686e-05, + "loss": 1.045, + "step": 448 + }, + { + "epoch": 0.7184, + "grad_norm": 0.5316702863240654, + "learning_rate": 3.8815853536798904e-05, + "loss": 1.009, + "step": 449 + }, + { + "epoch": 0.72, + "grad_norm": 0.5538127101661766, + "learning_rate": 3.840662172471315e-05, + "loss": 1.018, + "step": 450 + }, + { + "epoch": 0.7216, + "grad_norm": 0.8005565133092393, + "learning_rate": 3.79990452539225e-05, + "loss": 1.0019, + "step": 451 + }, + { + "epoch": 0.7232, + "grad_norm": 0.5776009430166992, + "learning_rate": 3.759313507817196e-05, + "loss": 1.0631, + "step": 452 + }, + { + "epoch": 0.7248, + "grad_norm": 0.5167486285150212, + "learning_rate": 3.7188902106424416e-05, + "loss": 1.0782, + "step": 453 + }, + { + "epoch": 0.7264, + "grad_norm": 0.578627424855094, + "learning_rate": 3.678635720256737e-05, + "loss": 1.0474, + "step": 454 + }, + { + "epoch": 0.728, + "grad_norm": 0.5435164656497815, + "learning_rate": 3.638551118512089e-05, + "loss": 0.9957, + "step": 455 + }, + { + "epoch": 0.7296, + "grad_norm": 0.5719424686541105, + "learning_rate": 3.5986374826947066e-05, + "loss": 1.0058, + "step": 456 + }, + { + "epoch": 0.7312, + "grad_norm": 0.5178410334921631, + "learning_rate": 3.558895885496023e-05, + "loss": 0.978, + "step": 457 + }, + { + "epoch": 0.7328, + "grad_norm": 0.8263260950862493, + "learning_rate": 3.519327394983888e-05, + "loss": 1.0467, + "step": 458 + }, + { + "epoch": 0.7344, + "grad_norm": 0.5362699121320619, + "learning_rate": 3.479933074573858e-05, + "loss": 0.9266, + "step": 459 + }, + { + "epoch": 0.736, + "grad_norm": 0.5296863145242839, + "learning_rate": 3.440713983000601e-05, + "loss": 1.0413, + "step": 460 + }, + { + "epoch": 0.7376, + "grad_norm": 0.6286430146632545, + "learning_rate": 3.401671174289469e-05, + "loss": 1.0208, + "step": 461 + }, + { + "epoch": 0.7392, + "grad_norm": 0.5947703566230248, + "learning_rate": 3.362805697728145e-05, + "loss": 1.0797, + "step": 462 + }, + { + "epoch": 0.7408, + "grad_norm": 0.6150942122576878, + "learning_rate": 3.324118597838464e-05, + "loss": 0.9078, + "step": 463 + }, + { + "epoch": 0.7424, + "grad_norm": 0.5763878268103857, + "learning_rate": 3.285610914348332e-05, + "loss": 1.0606, + "step": 464 + }, + { + "epoch": 0.744, + "grad_norm": 0.604256443268453, + "learning_rate": 3.2472836821637744e-05, + "loss": 1.0442, + "step": 465 + }, + { + "epoch": 0.7456, + "grad_norm": 0.6540688612386386, + "learning_rate": 3.209137931341143e-05, + "loss": 1.0547, + "step": 466 + }, + { + "epoch": 0.7472, + "grad_norm": 0.5888623442593799, + "learning_rate": 3.1711746870594086e-05, + "loss": 1.0155, + "step": 467 + }, + { + "epoch": 0.7488, + "grad_norm": 0.5693574593086785, + "learning_rate": 3.1333949695926324e-05, + "loss": 0.9869, + "step": 468 + }, + { + "epoch": 0.7504, + "grad_norm": 0.5754271865799155, + "learning_rate": 3.0957997942825336e-05, + "loss": 1.0866, + "step": 469 + }, + { + "epoch": 0.752, + "grad_norm": 0.6118450847137612, + "learning_rate": 3.058390171511196e-05, + "loss": 1.0135, + "step": 470 + }, + { + "epoch": 0.7536, + "grad_norm": 0.6019488545581383, + "learning_rate": 3.021167106673928e-05, + "loss": 0.9748, + "step": 471 + }, + { + "epoch": 0.7552, + "grad_norm": 0.6136203176659136, + "learning_rate": 2.9841316001522347e-05, + "loss": 0.9842, + "step": 472 + }, + { + "epoch": 0.7568, + "grad_norm": 0.4957658315868677, + "learning_rate": 2.9472846472869298e-05, + "loss": 1.0255, + "step": 473 + }, + { + "epoch": 0.7584, + "grad_norm": 0.6032121305642305, + "learning_rate": 2.9106272383513835e-05, + "loss": 1.0273, + "step": 474 + }, + { + "epoch": 0.76, + "grad_norm": 0.5613220836478147, + "learning_rate": 2.874160358524931e-05, + "loss": 1.0848, + "step": 475 + }, + { + "epoch": 0.7616, + "grad_norm": 0.5742266347353849, + "learning_rate": 2.8378849878663628e-05, + "loss": 1.0718, + "step": 476 + }, + { + "epoch": 0.7632, + "grad_norm": 0.5996512989753369, + "learning_rate": 2.8018021012875994e-05, + "loss": 1.0252, + "step": 477 + }, + { + "epoch": 0.7648, + "grad_norm": 0.5513763872864543, + "learning_rate": 2.7659126685275027e-05, + "loss": 1.0256, + "step": 478 + }, + { + "epoch": 0.7664, + "grad_norm": 0.616897426658438, + "learning_rate": 2.7302176541257986e-05, + "loss": 0.9867, + "step": 479 + }, + { + "epoch": 0.768, + "grad_norm": 0.5321083070094431, + "learning_rate": 2.6947180173971508e-05, + "loss": 1.0657, + "step": 480 + }, + { + "epoch": 0.7696, + "grad_norm": 0.5982846463615108, + "learning_rate": 2.659414712405398e-05, + "loss": 0.9823, + "step": 481 + }, + { + "epoch": 0.7712, + "grad_norm": 0.5510098723862623, + "learning_rate": 2.6243086879379e-05, + "loss": 1.0513, + "step": 482 + }, + { + "epoch": 0.7728, + "grad_norm": 0.5312914631911628, + "learning_rate": 2.5894008874800325e-05, + "loss": 1.0319, + "step": 483 + }, + { + "epoch": 0.7744, + "grad_norm": 0.6058974540337424, + "learning_rate": 2.5546922491898495e-05, + "loss": 0.9458, + "step": 484 + }, + { + "epoch": 0.776, + "grad_norm": 0.5603043779884208, + "learning_rate": 2.5201837058728505e-05, + "loss": 1.029, + "step": 485 + }, + { + "epoch": 0.7776, + "grad_norm": 0.5427880849051706, + "learning_rate": 2.485876184956928e-05, + "loss": 0.9807, + "step": 486 + }, + { + "epoch": 0.7792, + "grad_norm": 0.5437875716098907, + "learning_rate": 2.451770608467432e-05, + "loss": 1.0841, + "step": 487 + }, + { + "epoch": 0.7808, + "grad_norm": 0.7171952165375328, + "learning_rate": 2.417867893002387e-05, + "loss": 1.0336, + "step": 488 + }, + { + "epoch": 0.7824, + "grad_norm": 0.554435574357804, + "learning_rate": 2.3841689497078746e-05, + "loss": 0.9722, + "step": 489 + }, + { + "epoch": 0.784, + "grad_norm": 0.5593028467456796, + "learning_rate": 2.3506746842535242e-05, + "loss": 1.0349, + "step": 490 + }, + { + "epoch": 0.7856, + "grad_norm": 0.5593189261001621, + "learning_rate": 2.3173859968081944e-05, + "loss": 1.0308, + "step": 491 + }, + { + "epoch": 0.7872, + "grad_norm": 0.6010504002830267, + "learning_rate": 2.2843037820157675e-05, + "loss": 0.9928, + "step": 492 + }, + { + "epoch": 0.7888, + "grad_norm": 0.558405142401561, + "learning_rate": 2.251428928971102e-05, + "loss": 1.0552, + "step": 493 + }, + { + "epoch": 0.7904, + "grad_norm": 0.6038965238199007, + "learning_rate": 2.2187623211961562e-05, + "loss": 1.0493, + "step": 494 + }, + { + "epoch": 0.792, + "grad_norm": 0.5878232251405476, + "learning_rate": 2.1863048366162208e-05, + "loss": 1.0058, + "step": 495 + }, + { + "epoch": 0.7936, + "grad_norm": 0.5792843967762865, + "learning_rate": 2.1540573475363402e-05, + "loss": 1.0641, + "step": 496 + }, + { + "epoch": 0.7952, + "grad_norm": 0.6207354557243501, + "learning_rate": 2.1220207206178688e-05, + "loss": 1.0188, + "step": 497 + }, + { + "epoch": 0.7968, + "grad_norm": 0.5949739846778636, + "learning_rate": 2.0901958168551638e-05, + "loss": 0.9854, + "step": 498 + }, + { + "epoch": 0.7984, + "grad_norm": 0.610828974706882, + "learning_rate": 2.058583491552465e-05, + "loss": 1.0607, + "step": 499 + }, + { + "epoch": 0.8, + "grad_norm": 0.553429692755142, + "learning_rate": 2.027184594300898e-05, + "loss": 1.0354, + "step": 500 + }, + { + "epoch": 0.8016, + "grad_norm": 0.5957778158714045, + "learning_rate": 1.995999968955641e-05, + "loss": 0.9977, + "step": 501 + }, + { + "epoch": 0.8032, + "grad_norm": 0.6417881675394035, + "learning_rate": 1.9650304536132426e-05, + "loss": 0.9849, + "step": 502 + }, + { + "epoch": 0.8048, + "grad_norm": 0.5759396894351403, + "learning_rate": 1.9342768805891178e-05, + "loss": 0.9924, + "step": 503 + }, + { + "epoch": 0.8064, + "grad_norm": 0.7994891376666073, + "learning_rate": 1.903740076395151e-05, + "loss": 0.9704, + "step": 504 + }, + { + "epoch": 0.808, + "grad_norm": 0.5617450444953134, + "learning_rate": 1.8734208617174988e-05, + "loss": 1.0302, + "step": 505 + }, + { + "epoch": 0.8096, + "grad_norm": 0.5355895357665624, + "learning_rate": 1.8433200513945337e-05, + "loss": 1.0276, + "step": 506 + }, + { + "epoch": 0.8112, + "grad_norm": 0.5915038784491229, + "learning_rate": 1.8134384543949478e-05, + "loss": 1.0175, + "step": 507 + }, + { + "epoch": 0.8128, + "grad_norm": 0.5420413166609099, + "learning_rate": 1.783776873795994e-05, + "loss": 0.9283, + "step": 508 + }, + { + "epoch": 0.8144, + "grad_norm": 0.6141130519430947, + "learning_rate": 1.754336106761927e-05, + "loss": 0.9567, + "step": 509 + }, + { + "epoch": 0.816, + "grad_norm": 0.5313555292161568, + "learning_rate": 1.7251169445225657e-05, + "loss": 1.0618, + "step": 510 + }, + { + "epoch": 0.8176, + "grad_norm": 0.5622424268374584, + "learning_rate": 1.696120172352025e-05, + "loss": 1.049, + "step": 511 + }, + { + "epoch": 0.8192, + "grad_norm": 0.5729505863510094, + "learning_rate": 1.6673465695476232e-05, + "loss": 1.042, + "step": 512 + }, + { + "epoch": 0.8208, + "grad_norm": 0.6147367158289911, + "learning_rate": 1.6387969094089316e-05, + "loss": 1.0332, + "step": 513 + }, + { + "epoch": 0.8224, + "grad_norm": 0.5744026154737112, + "learning_rate": 1.6104719592169902e-05, + "loss": 0.9674, + "step": 514 + }, + { + "epoch": 0.824, + "grad_norm": 0.5282699306125607, + "learning_rate": 1.5823724802136865e-05, + "loss": 1.0449, + "step": 515 + }, + { + "epoch": 0.8256, + "grad_norm": 0.6052632616547621, + "learning_rate": 1.5544992275813053e-05, + "loss": 0.999, + "step": 516 + }, + { + "epoch": 0.8272, + "grad_norm": 0.5289547451299312, + "learning_rate": 1.526852950422226e-05, + "loss": 0.9661, + "step": 517 + }, + { + "epoch": 0.8288, + "grad_norm": 0.5300300048812662, + "learning_rate": 1.4994343917387854e-05, + "loss": 0.9651, + "step": 518 + }, + { + "epoch": 0.8304, + "grad_norm": 0.5263056000491023, + "learning_rate": 1.4722442884133214e-05, + "loss": 0.9892, + "step": 519 + }, + { + "epoch": 0.832, + "grad_norm": 0.5455862581344829, + "learning_rate": 1.4452833711883628e-05, + "loss": 0.9861, + "step": 520 + }, + { + "epoch": 0.8336, + "grad_norm": 0.5670216658855216, + "learning_rate": 1.4185523646469822e-05, + "loss": 1.0091, + "step": 521 + }, + { + "epoch": 0.8352, + "grad_norm": 0.5714826959991137, + "learning_rate": 1.3920519871933424e-05, + "loss": 1.0016, + "step": 522 + }, + { + "epoch": 0.8368, + "grad_norm": 0.6430918485169288, + "learning_rate": 1.3657829510333654e-05, + "loss": 0.9834, + "step": 523 + }, + { + "epoch": 0.8384, + "grad_norm": 0.6422037680407515, + "learning_rate": 1.339745962155613e-05, + "loss": 1.1132, + "step": 524 + }, + { + "epoch": 0.84, + "grad_norm": 0.5333323799870597, + "learning_rate": 1.3139417203123027e-05, + "loss": 0.9638, + "step": 525 + }, + { + "epoch": 0.8416, + "grad_norm": 0.610339329123425, + "learning_rate": 1.2883709190004955e-05, + "loss": 1.0494, + "step": 526 + }, + { + "epoch": 0.8432, + "grad_norm": 0.6651824534785188, + "learning_rate": 1.263034245443473e-05, + "loss": 1.0221, + "step": 527 + }, + { + "epoch": 0.8448, + "grad_norm": 0.6134178935957494, + "learning_rate": 1.2379323805722576e-05, + "loss": 1.0084, + "step": 528 + }, + { + "epoch": 0.8464, + "grad_norm": 0.5721566730523302, + "learning_rate": 1.2130659990073146e-05, + "loss": 0.971, + "step": 529 + }, + { + "epoch": 0.848, + "grad_norm": 0.5608339377937442, + "learning_rate": 1.1884357690404158e-05, + "loss": 0.9916, + "step": 530 + }, + { + "epoch": 0.8496, + "grad_norm": 0.56075757858118, + "learning_rate": 1.1640423526166988e-05, + "loss": 1.0652, + "step": 531 + }, + { + "epoch": 0.8512, + "grad_norm": 0.5747500010881719, + "learning_rate": 1.1398864053168534e-05, + "loss": 1.0922, + "step": 532 + }, + { + "epoch": 0.8528, + "grad_norm": 0.5501690957931828, + "learning_rate": 1.1159685763395111e-05, + "loss": 0.9825, + "step": 533 + }, + { + "epoch": 0.8544, + "grad_norm": 0.5809208426100437, + "learning_rate": 1.0922895084838037e-05, + "loss": 1.0119, + "step": 534 + }, + { + "epoch": 0.856, + "grad_norm": 0.5790357434096092, + "learning_rate": 1.0688498381320855e-05, + "loss": 0.9896, + "step": 535 + }, + { + "epoch": 0.8576, + "grad_norm": 0.5941430671614292, + "learning_rate": 1.045650195232819e-05, + "loss": 1.1058, + "step": 536 + }, + { + "epoch": 0.8592, + "grad_norm": 0.5963745119122649, + "learning_rate": 1.0226912032836611e-05, + "loss": 0.9594, + "step": 537 + }, + { + "epoch": 0.8608, + "grad_norm": 0.5942221402502171, + "learning_rate": 9.999734793146998e-06, + "loss": 1.0372, + "step": 538 + }, + { + "epoch": 0.8624, + "grad_norm": 0.6517007896733954, + "learning_rate": 9.774976338718677e-06, + "loss": 1.0051, + "step": 539 + }, + { + "epoch": 0.864, + "grad_norm": 0.5914375297469927, + "learning_rate": 9.552642710005299e-06, + "loss": 1.0353, + "step": 540 + }, + { + "epoch": 0.8656, + "grad_norm": 0.567730923141039, + "learning_rate": 9.332739882292752e-06, + "loss": 1.1078, + "step": 541 + }, + { + "epoch": 0.8672, + "grad_norm": 0.5806300782832331, + "learning_rate": 9.115273765538202e-06, + "loss": 1.0311, + "step": 542 + }, + { + "epoch": 0.8688, + "grad_norm": 0.5396734638468552, + "learning_rate": 8.900250204211514e-06, + "loss": 1.0042, + "step": 543 + }, + { + "epoch": 0.8704, + "grad_norm": 0.5735940043391957, + "learning_rate": 8.687674977138116e-06, + "loss": 1.0159, + "step": 544 + }, + { + "epoch": 0.872, + "grad_norm": 0.5737541974889085, + "learning_rate": 8.47755379734373e-06, + "loss": 1.003, + "step": 545 + }, + { + "epoch": 0.8736, + "grad_norm": 0.5547697845181571, + "learning_rate": 8.269892311900696e-06, + "loss": 1.0349, + "step": 546 + }, + { + "epoch": 0.8752, + "grad_norm": 0.5733392761602362, + "learning_rate": 8.064696101776358e-06, + "loss": 1.003, + "step": 547 + }, + { + "epoch": 0.8768, + "grad_norm": 0.5724011437733777, + "learning_rate": 7.861970681683051e-06, + "loss": 0.957, + "step": 548 + }, + { + "epoch": 0.8784, + "grad_norm": 0.5411210088167315, + "learning_rate": 7.661721499929753e-06, + "loss": 1.0382, + "step": 549 + }, + { + "epoch": 0.88, + "grad_norm": 0.6159552656415724, + "learning_rate": 7.463953938275858e-06, + "loss": 1.0812, + "step": 550 + }, + { + "epoch": 0.8816, + "grad_norm": 0.6425117458362681, + "learning_rate": 7.2686733117863784e-06, + "loss": 0.9973, + "step": 551 + }, + { + "epoch": 0.8832, + "grad_norm": 0.5763828913151351, + "learning_rate": 7.07588486868922e-06, + "loss": 1.0134, + "step": 552 + }, + { + "epoch": 0.8848, + "grad_norm": 0.5521851129054778, + "learning_rate": 6.8855937902340576e-06, + "loss": 1.0658, + "step": 553 + }, + { + "epoch": 0.8864, + "grad_norm": 0.5618114300494748, + "learning_rate": 6.6978051905530855e-06, + "loss": 1.1078, + "step": 554 + }, + { + "epoch": 0.888, + "grad_norm": 0.5415127242628184, + "learning_rate": 6.512524116523633e-06, + "loss": 1.0307, + "step": 555 + }, + { + "epoch": 0.8896, + "grad_norm": 0.9441276497123182, + "learning_rate": 6.329755547632499e-06, + "loss": 1.0195, + "step": 556 + }, + { + "epoch": 0.8912, + "grad_norm": 0.5565353021993263, + "learning_rate": 6.149504395842087e-06, + "loss": 0.9657, + "step": 557 + }, + { + "epoch": 0.8928, + "grad_norm": 0.6106580616838163, + "learning_rate": 5.971775505458444e-06, + "loss": 0.9943, + "step": 558 + }, + { + "epoch": 0.8944, + "grad_norm": 0.5634279784443351, + "learning_rate": 5.7965736530010916e-06, + "loss": 1.0729, + "step": 559 + }, + { + "epoch": 0.896, + "grad_norm": 0.5533138320006595, + "learning_rate": 5.623903547074549e-06, + "loss": 0.9734, + "step": 560 + }, + { + "epoch": 0.8976, + "grad_norm": 0.6347554035913163, + "learning_rate": 5.453769828241872e-06, + "loss": 0.9627, + "step": 561 + }, + { + "epoch": 0.8992, + "grad_norm": 0.559794125412613, + "learning_rate": 5.286177068899989e-06, + "loss": 1.0309, + "step": 562 + }, + { + "epoch": 0.9008, + "grad_norm": 0.556647745583717, + "learning_rate": 5.121129773156663e-06, + "loss": 1.0235, + "step": 563 + }, + { + "epoch": 0.9024, + "grad_norm": 0.6373655339621795, + "learning_rate": 4.95863237670956e-06, + "loss": 1.0544, + "step": 564 + }, + { + "epoch": 0.904, + "grad_norm": 0.5425862554932976, + "learning_rate": 4.798689246727006e-06, + "loss": 1.004, + "step": 565 + }, + { + "epoch": 0.9056, + "grad_norm": 0.5869350998193746, + "learning_rate": 4.641304681730641e-06, + "loss": 0.9371, + "step": 566 + }, + { + "epoch": 0.9072, + "grad_norm": 0.5480015247701243, + "learning_rate": 4.486482911479839e-06, + "loss": 1.0562, + "step": 567 + }, + { + "epoch": 0.9088, + "grad_norm": 0.5140315108418996, + "learning_rate": 4.3342280968580285e-06, + "loss": 0.9961, + "step": 568 + }, + { + "epoch": 0.9104, + "grad_norm": 0.6148450538420762, + "learning_rate": 4.184544329761009e-06, + "loss": 1.0191, + "step": 569 + }, + { + "epoch": 0.912, + "grad_norm": 0.5402212289023166, + "learning_rate": 4.037435632986786e-06, + "loss": 1.0148, + "step": 570 + }, + { + "epoch": 0.9136, + "grad_norm": 0.5709596661807356, + "learning_rate": 3.892905960127546e-06, + "loss": 1.0594, + "step": 571 + }, + { + "epoch": 0.9152, + "grad_norm": 0.5118843684398965, + "learning_rate": 3.750959195463466e-06, + "loss": 1.0317, + "step": 572 + }, + { + "epoch": 0.9168, + "grad_norm": 0.5527509874809367, + "learning_rate": 3.611599153858214e-06, + "loss": 1.0277, + "step": 573 + }, + { + "epoch": 0.9184, + "grad_norm": 0.5825398681234533, + "learning_rate": 3.4748295806564356e-06, + "loss": 1.0096, + "step": 574 + }, + { + "epoch": 0.92, + "grad_norm": 0.6143820030825954, + "learning_rate": 3.3406541515832003e-06, + "loss": 1.0815, + "step": 575 + }, + { + "epoch": 0.9216, + "grad_norm": 0.5732341075626708, + "learning_rate": 3.209076472645112e-06, + "loss": 1.0462, + "step": 576 + }, + { + "epoch": 0.9232, + "grad_norm": 0.6258640782593358, + "learning_rate": 3.0801000800333877e-06, + "loss": 0.9541, + "step": 577 + }, + { + "epoch": 0.9248, + "grad_norm": 0.5675367593447052, + "learning_rate": 2.9537284400289355e-06, + "loss": 1.0565, + "step": 578 + }, + { + "epoch": 0.9264, + "grad_norm": 0.6043933776964004, + "learning_rate": 2.8299649489090475e-06, + "loss": 1.0216, + "step": 579 + }, + { + "epoch": 0.928, + "grad_norm": 0.5444010836886899, + "learning_rate": 2.708812932856253e-06, + "loss": 1.0406, + "step": 580 + }, + { + "epoch": 0.9296, + "grad_norm": 0.5884589478752901, + "learning_rate": 2.590275647868867e-06, + "loss": 1.0578, + "step": 581 + }, + { + "epoch": 0.9312, + "grad_norm": 0.5796831340346383, + "learning_rate": 2.4743562796734622e-06, + "loss": 1.0931, + "step": 582 + }, + { + "epoch": 0.9328, + "grad_norm": 0.5503895574333929, + "learning_rate": 2.3610579436393e-06, + "loss": 1.0234, + "step": 583 + }, + { + "epoch": 0.9344, + "grad_norm": 0.6200970086722541, + "learning_rate": 2.250383684694579e-06, + "loss": 1.0565, + "step": 584 + }, + { + "epoch": 0.936, + "grad_norm": 0.5514254632340541, + "learning_rate": 2.1423364772445887e-06, + "loss": 0.9959, + "step": 585 + }, + { + "epoch": 0.9376, + "grad_norm": 0.6137737755711714, + "learning_rate": 2.036919225091827e-06, + "loss": 1.0731, + "step": 586 + }, + { + "epoch": 0.9392, + "grad_norm": 0.556963861842245, + "learning_rate": 1.9341347613579087e-06, + "loss": 0.9461, + "step": 587 + }, + { + "epoch": 0.9408, + "grad_norm": 0.5629250179170496, + "learning_rate": 1.8339858484073935e-06, + "loss": 0.97, + "step": 588 + }, + { + "epoch": 0.9424, + "grad_norm": 0.539571233600725, + "learning_rate": 1.7364751777736332e-06, + "loss": 1.0104, + "step": 589 + }, + { + "epoch": 0.944, + "grad_norm": 0.5362621634527788, + "learning_rate": 1.6416053700863964e-06, + "loss": 1.0284, + "step": 590 + }, + { + "epoch": 0.9456, + "grad_norm": 0.540625025680308, + "learning_rate": 1.5493789750014031e-06, + "loss": 0.9179, + "step": 591 + }, + { + "epoch": 0.9472, + "grad_norm": 0.6002488340998073, + "learning_rate": 1.459798471131868e-06, + "loss": 0.9858, + "step": 592 + }, + { + "epoch": 0.9488, + "grad_norm": 0.702654490983124, + "learning_rate": 1.3728662659818204e-06, + "loss": 1.0847, + "step": 593 + }, + { + "epoch": 0.9504, + "grad_norm": 0.5493589008977616, + "learning_rate": 1.2885846958814673e-06, + "loss": 0.99, + "step": 594 + }, + { + "epoch": 0.952, + "grad_norm": 0.5746085569735714, + "learning_rate": 1.2069560259243328e-06, + "loss": 1.0163, + "step": 595 + }, + { + "epoch": 0.9536, + "grad_norm": 0.5811417507334656, + "learning_rate": 1.1279824499064396e-06, + "loss": 1.0926, + "step": 596 + }, + { + "epoch": 0.9552, + "grad_norm": 0.5791459585782833, + "learning_rate": 1.0516660902673448e-06, + "loss": 1.0457, + "step": 597 + }, + { + "epoch": 0.9568, + "grad_norm": 0.5443944524006104, + "learning_rate": 9.780089980330642e-07, + "loss": 1.0076, + "step": 598 + }, + { + "epoch": 0.9584, + "grad_norm": 0.5697409783733853, + "learning_rate": 9.070131527609604e-07, + "loss": 0.9972, + "step": 599 + }, + { + "epoch": 0.96, + "grad_norm": 0.6029098796366928, + "learning_rate": 8.386804624865851e-07, + "loss": 0.9167, + "step": 600 + }, + { + "epoch": 0.9616, + "grad_norm": 0.5490495118546089, + "learning_rate": 7.730127636723539e-07, + "loss": 0.9517, + "step": 601 + }, + { + "epoch": 0.9632, + "grad_norm": 0.5745601087440052, + "learning_rate": 7.100118211581852e-07, + "loss": 1.0421, + "step": 602 + }, + { + "epoch": 0.9648, + "grad_norm": 0.5389130677020444, + "learning_rate": 6.496793281141056e-07, + "loss": 1.0575, + "step": 603 + }, + { + "epoch": 0.9664, + "grad_norm": 0.5862136968699087, + "learning_rate": 5.920169059947411e-07, + "loss": 0.9917, + "step": 604 + }, + { + "epoch": 0.968, + "grad_norm": 0.5449518186389779, + "learning_rate": 5.370261044956971e-07, + "loss": 1.0193, + "step": 605 + }, + { + "epoch": 0.9696, + "grad_norm": 0.5517944068434636, + "learning_rate": 4.847084015119574e-07, + "loss": 0.9529, + "step": 606 + }, + { + "epoch": 0.9712, + "grad_norm": 0.569226164983525, + "learning_rate": 4.3506520309813947e-07, + "loss": 1.0178, + "step": 607 + }, + { + "epoch": 0.9728, + "grad_norm": 0.666594798511301, + "learning_rate": 3.8809784343072366e-07, + "loss": 1.0474, + "step": 608 + }, + { + "epoch": 0.9744, + "grad_norm": 0.5463818709913154, + "learning_rate": 3.4380758477219333e-07, + "loss": 1.0943, + "step": 609 + }, + { + "epoch": 0.976, + "grad_norm": 0.582588118145432, + "learning_rate": 3.0219561743707326e-07, + "loss": 1.0894, + "step": 610 + }, + { + "epoch": 0.9776, + "grad_norm": 0.5784530517202845, + "learning_rate": 2.6326305976001055e-07, + "loss": 1.0296, + "step": 611 + }, + { + "epoch": 0.9792, + "grad_norm": 0.5355976778483454, + "learning_rate": 2.2701095806565432e-07, + "loss": 0.964, + "step": 612 + }, + { + "epoch": 0.9808, + "grad_norm": 1.6479292551179407, + "learning_rate": 1.9344028664056713e-07, + "loss": 0.9742, + "step": 613 + }, + { + "epoch": 0.9824, + "grad_norm": 0.7998087277041235, + "learning_rate": 1.6255194770704586e-07, + "loss": 1.0916, + "step": 614 + }, + { + "epoch": 0.984, + "grad_norm": 0.6483834592129597, + "learning_rate": 1.3434677139885222e-07, + "loss": 1.0472, + "step": 615 + }, + { + "epoch": 0.9856, + "grad_norm": 0.555368245947789, + "learning_rate": 1.0882551573891953e-07, + "loss": 1.0817, + "step": 616 + }, + { + "epoch": 0.9872, + "grad_norm": 0.5063818362833399, + "learning_rate": 8.598886661895788e-08, + "loss": 1.0555, + "step": 617 + }, + { + "epoch": 0.9888, + "grad_norm": 0.6122762285657034, + "learning_rate": 6.583743778106887e-08, + "loss": 1.0354, + "step": 618 + }, + { + "epoch": 0.9904, + "grad_norm": 0.5514939749153159, + "learning_rate": 4.837177080119215e-08, + "loss": 1.0028, + "step": 619 + }, + { + "epoch": 0.992, + "grad_norm": 0.5564621643004828, + "learning_rate": 3.359233507459481e-08, + "loss": 0.9777, + "step": 620 + }, + { + "epoch": 0.9936, + "grad_norm": 0.5797386135805992, + "learning_rate": 2.1499527803214846e-08, + "loss": 1.0269, + "step": 621 + }, + { + "epoch": 0.9952, + "grad_norm": 0.5755447237388454, + "learning_rate": 1.209367398504746e-08, + "loss": 1.0149, + "step": 622 + }, + { + "epoch": 0.9968, + "grad_norm": 0.5641293628448413, + "learning_rate": 5.375026405352035e-09, + "loss": 1.0056, + "step": 623 + }, + { + "epoch": 0.9984, + "grad_norm": 0.613195023494919, + "learning_rate": 1.3437656298687097e-09, + "loss": 1.0408, + "step": 624 + }, + { + "epoch": 1.0, + "grad_norm": 0.5384219802334457, + "learning_rate": 0.0, + "loss": 1.0434, + "step": 625 + }, + { + "epoch": 1.0, + "step": 625, + "total_flos": 329023610060800.0, + "train_loss": 1.077072388458252, + "train_runtime": 7633.3454, + "train_samples_per_second": 1.31, + "train_steps_per_second": 0.082 + } + ], + "logging_steps": 1.0, + "max_steps": 625, + "num_input_tokens_seen": 0, + "num_train_epochs": 1, + "save_steps": 500, + "stateful_callbacks": { + "TrainerControl": { + "args": { + "should_epoch_stop": false, + "should_evaluate": false, + "should_log": false, + "should_save": true, + "should_training_stop": false + }, + "attributes": {} + } + }, + "total_flos": 329023610060800.0, + "train_batch_size": 8, + "trial_name": null, + "trial_params": null +} diff --git a/single_dataset/gpt4o_conversations/VideoGameBunny_v1_1-Llama-3-8B-V-gpt4o_conversations_dataset_2000_epochs_1_lora/README.md b/single_dataset/gpt4o_conversations/VideoGameBunny_v1_1-Llama-3-8B-V-gpt4o_conversations_dataset_2000_epochs_1_lora/README.md new file mode 100644 index 0000000000000000000000000000000000000000..4f291d8d6fdb82fcff274f83bcc52b834347797c --- /dev/null +++ b/single_dataset/gpt4o_conversations/VideoGameBunny_v1_1-Llama-3-8B-V-gpt4o_conversations_dataset_2000_epochs_1_lora/README.md @@ -0,0 +1,202 @@ +--- +library_name: peft +base_model: ./weights/Bunny-v1_1-Llama-3-8B-V +--- + +# Model Card for Model ID + + + + + +## Model Details + +### Model Description + + + + + +- **Developed by:** [More Information Needed] +- **Funded by [optional]:** [More Information Needed] +- **Shared by [optional]:** [More Information Needed] +- **Model type:** [More Information Needed] +- **Language(s) (NLP):** [More Information Needed] +- **License:** [More Information Needed] +- **Finetuned from model [optional]:** [More Information Needed] + +### Model Sources [optional] + + + +- **Repository:** [More Information Needed] +- **Paper [optional]:** [More Information Needed] +- **Demo [optional]:** [More Information Needed] + +## Uses + + + +### Direct Use + + + +[More Information Needed] + +### Downstream Use [optional] + + + +[More Information Needed] + +### Out-of-Scope Use + + + +[More Information Needed] + +## Bias, Risks, and Limitations + + + +[More Information Needed] + +### Recommendations + + + +Users (both direct and downstream) should be made aware of the risks, biases and limitations of the model. More information needed for further recommendations. + +## How to Get Started with the Model + +Use the code below to get started with the model. + +[More Information Needed] + +## Training Details + +### Training Data + + + +[More Information Needed] + +### Training Procedure + + + +#### Preprocessing [optional] + +[More Information Needed] + + +#### Training Hyperparameters + +- **Training regime:** [More Information Needed] + +#### Speeds, Sizes, Times [optional] + + + +[More Information Needed] + +## Evaluation + + + +### Testing Data, Factors & Metrics + +#### Testing Data + + + +[More Information Needed] + +#### Factors + + + +[More Information Needed] + +#### Metrics + + + +[More Information Needed] + +### Results + +[More Information Needed] + +#### Summary + + + +## Model Examination [optional] + + + +[More Information Needed] + +## Environmental Impact + + + +Carbon emissions can be estimated using the [Machine Learning Impact calculator](https://mlco2.github.io/impact#compute) presented in [Lacoste et al. (2019)](https://arxiv.org/abs/1910.09700). + +- **Hardware Type:** [More Information Needed] +- **Hours used:** [More Information Needed] +- **Cloud Provider:** [More Information Needed] +- **Compute Region:** [More Information Needed] +- **Carbon Emitted:** [More Information Needed] + +## Technical Specifications [optional] + +### Model Architecture and Objective + +[More Information Needed] + +### Compute Infrastructure + +[More Information Needed] + +#### Hardware + +[More Information Needed] + +#### Software + +[More Information Needed] + +## Citation [optional] + + + +**BibTeX:** + +[More Information Needed] + +**APA:** + +[More Information Needed] + +## Glossary [optional] + + + +[More Information Needed] + +## More Information [optional] + +[More Information Needed] + +## Model Card Authors [optional] + +[More Information Needed] + +## Model Card Contact + +[More Information Needed] +### Framework versions + +- PEFT 0.11.1 \ No newline at end of file diff --git a/single_dataset/gpt4o_conversations/VideoGameBunny_v1_1-Llama-3-8B-V-gpt4o_conversations_dataset_2000_epochs_1_lora/adapter_config.json b/single_dataset/gpt4o_conversations/VideoGameBunny_v1_1-Llama-3-8B-V-gpt4o_conversations_dataset_2000_epochs_1_lora/adapter_config.json new file mode 100644 index 0000000000000000000000000000000000000000..9c82b6353cb3ae238b58f52414bc758819908b03 --- /dev/null +++ b/single_dataset/gpt4o_conversations/VideoGameBunny_v1_1-Llama-3-8B-V-gpt4o_conversations_dataset_2000_epochs_1_lora/adapter_config.json @@ -0,0 +1,34 @@ +{ + "alpha_pattern": {}, + "auto_mapping": null, + "base_model_name_or_path": "./weights/Bunny-v1_1-Llama-3-8B-V", + "bias": "none", + "fan_in_fan_out": false, + "inference_mode": true, + "init_lora_weights": true, + "layer_replication": null, + "layers_pattern": null, + "layers_to_transform": null, + "loftq_config": {}, + "lora_alpha": 256, + "lora_dropout": 0.1, + "megatron_config": null, + "megatron_core": "megatron.core", + "modules_to_save": null, + "peft_type": "LORA", + "r": 128, + "rank_pattern": {}, + "revision": null, + "target_modules": [ + "k_proj", + "q_proj", + "gate_proj", + "v_proj", + "up_proj", + "o_proj", + "down_proj" + ], + "task_type": "CAUSAL_LM", + "use_dora": false, + "use_rslora": false +} \ No newline at end of file diff --git a/single_dataset/gpt4o_conversations/VideoGameBunny_v1_1-Llama-3-8B-V-gpt4o_conversations_dataset_2000_epochs_1_lora/adapter_model.safetensors b/single_dataset/gpt4o_conversations/VideoGameBunny_v1_1-Llama-3-8B-V-gpt4o_conversations_dataset_2000_epochs_1_lora/adapter_model.safetensors new file mode 100644 index 0000000000000000000000000000000000000000..b68d69903caea5281d9892c841d6bff1ab14fae9 --- /dev/null +++ b/single_dataset/gpt4o_conversations/VideoGameBunny_v1_1-Llama-3-8B-V-gpt4o_conversations_dataset_2000_epochs_1_lora/adapter_model.safetensors @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:6eeb61ea8f79dbf15e1aae58c7b0170375847ec00a4f9c8ad346e2f75e5fef1c +size 671150064 diff --git a/single_dataset/gpt4o_conversations/VideoGameBunny_v1_1-Llama-3-8B-V-gpt4o_conversations_dataset_2000_epochs_1_lora/config.json b/single_dataset/gpt4o_conversations/VideoGameBunny_v1_1-Llama-3-8B-V-gpt4o_conversations_dataset_2000_epochs_1_lora/config.json new file mode 100644 index 0000000000000000000000000000000000000000..ee7ed86f4ff6f3381a6bb24981934be89aa376e7 --- /dev/null +++ b/single_dataset/gpt4o_conversations/VideoGameBunny_v1_1-Llama-3-8B-V-gpt4o_conversations_dataset_2000_epochs_1_lora/config.json @@ -0,0 +1,45 @@ +{ + "_name_or_path": "./weights/Bunny-v1_1-Llama-3-8B-V", + "architectures": [ + "BunnyLlamaForCausalLM" + ], + "attention_bias": false, + "attention_dropout": 0.0, + "auto_map": { + "AutoConfig": "configuration_bunny_llama.BunnyLlamaConfig", + "AutoModelForCausalLM": "modeling_bunny_llama.BunnyLlamaForCausalLM" + }, + "bos_token_id": 128000, + "continuous_training": false, + "eos_token_id": 128001, + "freeze_mm_mlp_adapter": false, + "hidden_act": "silu", + "hidden_size": 4096, + "image_aspect_ratio": "pad", + "initializer_range": 0.02, + "intermediate_size": 14336, + "max_position_embeddings": 8192, + "mm_hidden_size": 3456, + "mm_projector_lr": null, + "mm_projector_type": "mlp2x_gelu", + "mm_vision_tower": "./weights/siglip-so400m-patch14-384", + "model_type": "bunny-llama", + "num_attention_heads": 32, + "num_hidden_layers": 32, + "num_key_value_heads": 8, + "pretraining_tp": 1, + "rms_norm_eps": 1e-05, + "rope_scaling": null, + "rope_theta": 500000.0, + "tie_word_embeddings": false, + "tokenizer_model_max_length": 2048, + "tokenizer_padding_side": "right", + "torch_dtype": "float16", + "transformers_version": "4.41.2", + "tune_mm_mlp_adapter": false, + "unfreeze_vision_tower": true, + "use_cache": true, + "use_mm_proj": true, + "use_s2": true, + "vocab_size": 128256 +} diff --git a/single_dataset/gpt4o_conversations/VideoGameBunny_v1_1-Llama-3-8B-V-gpt4o_conversations_dataset_2000_epochs_1_lora/non_lora_trainables.bin b/single_dataset/gpt4o_conversations/VideoGameBunny_v1_1-Llama-3-8B-V-gpt4o_conversations_dataset_2000_epochs_1_lora/non_lora_trainables.bin new file mode 100644 index 0000000000000000000000000000000000000000..a3c91feb2111bcc3c609cb2082e1de825b66c65b --- /dev/null +++ b/single_dataset/gpt4o_conversations/VideoGameBunny_v1_1-Llama-3-8B-V-gpt4o_conversations_dataset_2000_epochs_1_lora/non_lora_trainables.bin @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:03b75b38d4fbde04610931840aba206073356e8d0d298a4aa817d786f79f0789 +size 918507402 diff --git a/single_dataset/gpt4o_conversations/VideoGameBunny_v1_1-Llama-3-8B-V-gpt4o_conversations_dataset_2000_epochs_1_lora/trainer_state.json b/single_dataset/gpt4o_conversations/VideoGameBunny_v1_1-Llama-3-8B-V-gpt4o_conversations_dataset_2000_epochs_1_lora/trainer_state.json new file mode 100644 index 0000000000000000000000000000000000000000..60867f89faca47db79b6fc4a4856ca5c474c0490 --- /dev/null +++ b/single_dataset/gpt4o_conversations/VideoGameBunny_v1_1-Llama-3-8B-V-gpt4o_conversations_dataset_2000_epochs_1_lora/trainer_state.json @@ -0,0 +1,917 @@ +{ + "best_metric": null, + "best_model_checkpoint": null, + "epoch": 1.0, + "eval_steps": 500, + "global_step": 125, + "is_hyper_param_search": false, + "is_local_process_zero": true, + "is_world_process_zero": true, + "log_history": [ + { + "epoch": 0.008, + "grad_norm": 1.0877886563071673, + "learning_rate": 5e-05, + "loss": 1.4204, + "step": 1 + }, + { + "epoch": 0.016, + "grad_norm": 0.9831963058763455, + "learning_rate": 0.0001, + "loss": 1.2175, + "step": 2 + }, + { + "epoch": 0.024, + "grad_norm": 0.7478913848154679, + "learning_rate": 0.00015000000000000001, + "loss": 1.2755, + "step": 3 + }, + { + "epoch": 0.032, + "grad_norm": 1.1851945547125557, + "learning_rate": 0.0002, + "loss": 1.294, + "step": 4 + }, + { + "epoch": 0.04, + "grad_norm": 0.9476134964793809, + "learning_rate": 0.00019996629653035126, + "loss": 1.2281, + "step": 5 + }, + { + "epoch": 0.048, + "grad_norm": 0.7133074153458693, + "learning_rate": 0.00019986520883988232, + "loss": 1.2197, + "step": 6 + }, + { + "epoch": 0.056, + "grad_norm": 0.7174771013040582, + "learning_rate": 0.00019969680506871137, + "loss": 1.2152, + "step": 7 + }, + { + "epoch": 0.064, + "grad_norm": 0.7669579161324069, + "learning_rate": 0.00019946119873266613, + "loss": 1.2647, + "step": 8 + }, + { + "epoch": 0.072, + "grad_norm": 1.1079413613918108, + "learning_rate": 0.00019915854864676664, + "loss": 1.2143, + "step": 9 + }, + { + "epoch": 0.08, + "grad_norm": 0.7583043070072526, + "learning_rate": 0.00019878905881817252, + "loss": 1.1807, + "step": 10 + }, + { + "epoch": 0.088, + "grad_norm": 0.8929329853128891, + "learning_rate": 0.00019835297830866826, + "loss": 1.1392, + "step": 11 + }, + { + "epoch": 0.096, + "grad_norm": 0.7522497944874978, + "learning_rate": 0.00019785060106677818, + "loss": 1.1502, + "step": 12 + }, + { + "epoch": 0.104, + "grad_norm": 0.6440337581737159, + "learning_rate": 0.00019728226572962473, + "loss": 1.1695, + "step": 13 + }, + { + "epoch": 0.112, + "grad_norm": 0.7692304262417425, + "learning_rate": 0.0001966483553946637, + "loss": 1.2483, + "step": 14 + }, + { + "epoch": 0.12, + "grad_norm": 0.6635724770585765, + "learning_rate": 0.00019594929736144976, + "loss": 1.0857, + "step": 15 + }, + { + "epoch": 0.128, + "grad_norm": 0.6339210471192188, + "learning_rate": 0.00019518556284360696, + "loss": 1.1672, + "step": 16 + }, + { + "epoch": 0.136, + "grad_norm": 0.6790889547946641, + "learning_rate": 0.0001943576666511982, + "loss": 1.088, + "step": 17 + }, + { + "epoch": 0.144, + "grad_norm": 0.6692175057561588, + "learning_rate": 0.0001934661668437073, + "loss": 1.2209, + "step": 18 + }, + { + "epoch": 0.152, + "grad_norm": 0.6437761998032046, + "learning_rate": 0.0001925116643538684, + "loss": 1.122, + "step": 19 + }, + { + "epoch": 0.16, + "grad_norm": 0.6525125502220214, + "learning_rate": 0.00019149480258259533, + "loss": 1.1809, + "step": 20 + }, + { + "epoch": 0.168, + "grad_norm": 0.7383920037847639, + "learning_rate": 0.00019041626696528503, + "loss": 1.2011, + "step": 21 + }, + { + "epoch": 0.176, + "grad_norm": 0.599229910744791, + "learning_rate": 0.0001892767845097864, + "loss": 1.1337, + "step": 22 + }, + { + "epoch": 0.184, + "grad_norm": 0.6368541820149072, + "learning_rate": 0.00018807712330634642, + "loss": 1.1691, + "step": 23 + }, + { + "epoch": 0.192, + "grad_norm": 0.6479968056136417, + "learning_rate": 0.0001868180920098644, + "loss": 1.1646, + "step": 24 + }, + { + "epoch": 0.2, + "grad_norm": 0.6492295290083023, + "learning_rate": 0.00018550053929480202, + "loss": 1.1822, + "step": 25 + }, + { + "epoch": 0.208, + "grad_norm": 0.6581163598715829, + "learning_rate": 0.00018412535328311814, + "loss": 1.0881, + "step": 26 + }, + { + "epoch": 0.216, + "grad_norm": 0.6528735853321923, + "learning_rate": 0.0001826934609456129, + "loss": 1.1763, + "step": 27 + }, + { + "epoch": 0.224, + "grad_norm": 0.6219951075238257, + "learning_rate": 0.00018120582747708502, + "loss": 1.0637, + "step": 28 + }, + { + "epoch": 0.232, + "grad_norm": 0.7509932918198108, + "learning_rate": 0.0001796634556457236, + "loss": 1.1323, + "step": 29 + }, + { + "epoch": 0.24, + "grad_norm": 0.6626945316885953, + "learning_rate": 0.0001780673851171728, + "loss": 1.1007, + "step": 30 + }, + { + "epoch": 0.248, + "grad_norm": 0.7185209593516516, + "learning_rate": 0.00017641869175372493, + "loss": 1.2184, + "step": 31 + }, + { + "epoch": 0.256, + "grad_norm": 0.639961093110283, + "learning_rate": 0.00017471848688911464, + "loss": 1.1013, + "step": 32 + }, + { + "epoch": 0.264, + "grad_norm": 0.6093800026585015, + "learning_rate": 0.000172967916579403, + "loss": 1.0853, + "step": 33 + }, + { + "epoch": 0.272, + "grad_norm": 0.5977982267642971, + "learning_rate": 0.00017116816083045602, + "loss": 1.0534, + "step": 34 + }, + { + "epoch": 0.28, + "grad_norm": 0.6585541370699558, + "learning_rate": 0.0001693204328025389, + "loss": 1.1287, + "step": 35 + }, + { + "epoch": 0.288, + "grad_norm": 0.6562897452410896, + "learning_rate": 0.00016742597799256182, + "loss": 1.1724, + "step": 36 + }, + { + "epoch": 0.296, + "grad_norm": 0.6419065162769554, + "learning_rate": 0.00016548607339452853, + "loss": 1.142, + "step": 37 + }, + { + "epoch": 0.304, + "grad_norm": 0.6683261695427306, + "learning_rate": 0.00016350202663875386, + "loss": 1.1943, + "step": 38 + }, + { + "epoch": 0.312, + "grad_norm": 0.6037906950042923, + "learning_rate": 0.0001614751751104301, + "loss": 1.2452, + "step": 39 + }, + { + "epoch": 0.32, + "grad_norm": 0.6722143393952856, + "learning_rate": 0.00015940688504813662, + "loss": 1.1435, + "step": 40 + }, + { + "epoch": 0.328, + "grad_norm": 0.5849511558342113, + "learning_rate": 0.00015729855062290022, + "loss": 1.1326, + "step": 41 + }, + { + "epoch": 0.336, + "grad_norm": 0.6496843215077648, + "learning_rate": 0.00015515159299842707, + "loss": 1.0871, + "step": 42 + }, + { + "epoch": 0.344, + "grad_norm": 0.622273267698997, + "learning_rate": 0.00015296745937313987, + "loss": 1.1245, + "step": 43 + }, + { + "epoch": 0.352, + "grad_norm": 0.6248801200159152, + "learning_rate": 0.00015074762200466556, + "loss": 1.0756, + "step": 44 + }, + { + "epoch": 0.36, + "grad_norm": 0.6350409040736777, + "learning_rate": 0.00014849357721743168, + "loss": 1.1282, + "step": 45 + }, + { + "epoch": 0.368, + "grad_norm": 0.6573934364037886, + "learning_rate": 0.00014620684439403962, + "loss": 1.2089, + "step": 46 + }, + { + "epoch": 0.376, + "grad_norm": 0.6322079151415101, + "learning_rate": 0.0001438889649510956, + "loss": 1.1702, + "step": 47 + }, + { + "epoch": 0.384, + "grad_norm": 0.5931281923571887, + "learning_rate": 0.00014154150130018866, + "loss": 1.1344, + "step": 48 + }, + { + "epoch": 0.392, + "grad_norm": 0.679086044926115, + "learning_rate": 0.00013916603579471705, + "loss": 1.0855, + "step": 49 + }, + { + "epoch": 0.4, + "grad_norm": 0.6250561524286053, + "learning_rate": 0.000136764169663272, + "loss": 1.1218, + "step": 50 + }, + { + "epoch": 0.408, + "grad_norm": 0.5455053641893781, + "learning_rate": 0.00013433752193029886, + "loss": 1.069, + "step": 51 + }, + { + "epoch": 0.416, + "grad_norm": 0.6315994964575306, + "learning_rate": 0.00013188772832476188, + "loss": 1.1472, + "step": 52 + }, + { + "epoch": 0.424, + "grad_norm": 0.6058785708917978, + "learning_rate": 0.00012941644017754964, + "loss": 1.0467, + "step": 53 + }, + { + "epoch": 0.432, + "grad_norm": 0.6163207183209253, + "learning_rate": 0.00012692532330836346, + "loss": 1.0946, + "step": 54 + }, + { + "epoch": 0.44, + "grad_norm": 0.6262642109854599, + "learning_rate": 0.00012441605690283915, + "loss": 1.1487, + "step": 55 + }, + { + "epoch": 0.448, + "grad_norm": 0.6959618230470047, + "learning_rate": 0.0001218903323806595, + "loss": 1.1448, + "step": 56 + }, + { + "epoch": 0.456, + "grad_norm": 0.5894157323220334, + "learning_rate": 0.00011934985225541998, + "loss": 1.0709, + "step": 57 + }, + { + "epoch": 0.464, + "grad_norm": 0.7981960617785065, + "learning_rate": 0.00011679632898701649, + "loss": 1.1515, + "step": 58 + }, + { + "epoch": 0.472, + "grad_norm": 0.5765858891790445, + "learning_rate": 0.00011423148382732853, + "loss": 1.0964, + "step": 59 + }, + { + "epoch": 0.48, + "grad_norm": 0.6046649871496205, + "learning_rate": 0.00011165704565997593, + "loss": 1.1076, + "step": 60 + }, + { + "epoch": 0.488, + "grad_norm": 0.5638841047377291, + "learning_rate": 0.00010907474983493144, + "loss": 1.1158, + "step": 61 + }, + { + "epoch": 0.496, + "grad_norm": 0.6248619311895685, + "learning_rate": 0.0001064863369987743, + "loss": 1.158, + "step": 62 + }, + { + "epoch": 0.504, + "grad_norm": 0.6194657217776681, + "learning_rate": 0.00010389355192137377, + "loss": 1.1525, + "step": 63 + }, + { + "epoch": 0.512, + "grad_norm": 0.6067740187850118, + "learning_rate": 0.0001012981423197931, + "loss": 1.0888, + "step": 64 + }, + { + "epoch": 0.52, + "grad_norm": 0.5867746714643287, + "learning_rate": 9.870185768020693e-05, + "loss": 1.2144, + "step": 65 + }, + { + "epoch": 0.528, + "grad_norm": 0.6162475621164234, + "learning_rate": 9.610644807862625e-05, + "loss": 1.187, + "step": 66 + }, + { + "epoch": 0.536, + "grad_norm": 0.5816260885153179, + "learning_rate": 9.35136630012257e-05, + "loss": 1.1378, + "step": 67 + }, + { + "epoch": 0.544, + "grad_norm": 0.6092121742013019, + "learning_rate": 9.092525016506858e-05, + "loss": 1.0935, + "step": 68 + }, + { + "epoch": 0.552, + "grad_norm": 0.6129047955044695, + "learning_rate": 8.83429543400241e-05, + "loss": 1.0815, + "step": 69 + }, + { + "epoch": 0.56, + "grad_norm": 0.5748567355209082, + "learning_rate": 8.57685161726715e-05, + "loss": 1.1137, + "step": 70 + }, + { + "epoch": 0.568, + "grad_norm": 0.5699053032110166, + "learning_rate": 8.320367101298351e-05, + "loss": 1.125, + "step": 71 + }, + { + "epoch": 0.576, + "grad_norm": 0.5943499018117111, + "learning_rate": 8.065014774458003e-05, + "loss": 1.1111, + "step": 72 + }, + { + "epoch": 0.584, + "grad_norm": 0.5697756101544271, + "learning_rate": 7.810966761934053e-05, + "loss": 1.0353, + "step": 73 + }, + { + "epoch": 0.592, + "grad_norm": 0.5987366993178018, + "learning_rate": 7.558394309716088e-05, + "loss": 1.1787, + "step": 74 + }, + { + "epoch": 0.6, + "grad_norm": 0.599965475322347, + "learning_rate": 7.307467669163655e-05, + "loss": 1.0606, + "step": 75 + }, + { + "epoch": 0.608, + "grad_norm": 0.6231622935530702, + "learning_rate": 7.058355982245037e-05, + "loss": 1.1237, + "step": 76 + }, + { + "epoch": 0.616, + "grad_norm": 0.5778865258296028, + "learning_rate": 6.811227167523815e-05, + "loss": 1.07, + "step": 77 + }, + { + "epoch": 0.624, + "grad_norm": 0.9006399353206177, + "learning_rate": 6.566247806970119e-05, + "loss": 1.0794, + "step": 78 + }, + { + "epoch": 0.632, + "grad_norm": 0.6080347772853175, + "learning_rate": 6.323583033672799e-05, + "loss": 1.1286, + "step": 79 + }, + { + "epoch": 0.64, + "grad_norm": 0.6215600441401722, + "learning_rate": 6.083396420528298e-05, + "loss": 1.0333, + "step": 80 + }, + { + "epoch": 0.648, + "grad_norm": 0.5532899039833619, + "learning_rate": 5.845849869981137e-05, + "loss": 1.1528, + "step": 81 + }, + { + "epoch": 0.656, + "grad_norm": 0.5702002946777001, + "learning_rate": 5.611103504890444e-05, + "loss": 1.1547, + "step": 82 + }, + { + "epoch": 0.664, + "grad_norm": 0.6414308810709265, + "learning_rate": 5.379315560596038e-05, + "loss": 1.176, + "step": 83 + }, + { + "epoch": 0.672, + "grad_norm": 0.6023323139281526, + "learning_rate": 5.1506422782568345e-05, + "loss": 1.1687, + "step": 84 + }, + { + "epoch": 0.68, + "grad_norm": 0.5892600207157096, + "learning_rate": 4.9252377995334444e-05, + "loss": 1.0763, + "step": 85 + }, + { + "epoch": 0.688, + "grad_norm": 0.5339341330615615, + "learning_rate": 4.703254062686017e-05, + "loss": 1.0444, + "step": 86 + }, + { + "epoch": 0.696, + "grad_norm": 0.6279565355094363, + "learning_rate": 4.484840700157295e-05, + "loss": 1.0997, + "step": 87 + }, + { + "epoch": 0.704, + "grad_norm": 0.6104692154812912, + "learning_rate": 4.270144937709981e-05, + "loss": 1.1856, + "step": 88 + }, + { + "epoch": 0.712, + "grad_norm": 0.6114748193401518, + "learning_rate": 4.059311495186338e-05, + "loss": 1.1813, + "step": 89 + }, + { + "epoch": 0.72, + "grad_norm": 0.5517958084848442, + "learning_rate": 3.852482488956992e-05, + "loss": 1.0973, + "step": 90 + }, + { + "epoch": 0.728, + "grad_norm": 0.5616033058541691, + "learning_rate": 3.649797336124615e-05, + "loss": 1.1052, + "step": 91 + }, + { + "epoch": 0.736, + "grad_norm": 0.5734895525736495, + "learning_rate": 3.45139266054715e-05, + "loss": 1.1017, + "step": 92 + }, + { + "epoch": 0.744, + "grad_norm": 0.5849100376601271, + "learning_rate": 3.257402200743821e-05, + "loss": 1.0507, + "step": 93 + }, + { + "epoch": 0.752, + "grad_norm": 0.5625815255394412, + "learning_rate": 3.0679567197461134e-05, + "loss": 1.1098, + "step": 94 + }, + { + "epoch": 0.76, + "grad_norm": 0.6367740796942681, + "learning_rate": 2.8831839169543996e-05, + "loss": 1.1416, + "step": 95 + }, + { + "epoch": 0.768, + "grad_norm": 0.5795379896457181, + "learning_rate": 2.7032083420597e-05, + "loss": 1.1132, + "step": 96 + }, + { + "epoch": 0.776, + "grad_norm": 0.6143227738602632, + "learning_rate": 2.528151311088537e-05, + "loss": 1.0109, + "step": 97 + }, + { + "epoch": 0.784, + "grad_norm": 0.6361204646970395, + "learning_rate": 2.3581308246275103e-05, + "loss": 1.0794, + "step": 98 + }, + { + "epoch": 0.792, + "grad_norm": 0.5930462513939105, + "learning_rate": 2.1932614882827197e-05, + "loss": 1.1248, + "step": 99 + }, + { + "epoch": 0.8, + "grad_norm": 0.5733572454846041, + "learning_rate": 2.03365443542764e-05, + "loss": 1.1523, + "step": 100 + }, + { + "epoch": 0.808, + "grad_norm": 0.6495463046060299, + "learning_rate": 1.879417252291502e-05, + "loss": 1.0592, + "step": 101 + }, + { + "epoch": 0.816, + "grad_norm": 0.5553303058115924, + "learning_rate": 1.730653905438714e-05, + "loss": 1.0753, + "step": 102 + }, + { + "epoch": 0.824, + "grad_norm": 0.5828567133941143, + "learning_rate": 1.587464671688187e-05, + "loss": 1.1418, + "step": 103 + }, + { + "epoch": 0.832, + "grad_norm": 0.5731686378110846, + "learning_rate": 1.4499460705197998e-05, + "loss": 1.1059, + "step": 104 + }, + { + "epoch": 0.84, + "grad_norm": 0.5654646756423977, + "learning_rate": 1.3181907990135622e-05, + "loss": 1.1122, + "step": 105 + }, + { + "epoch": 0.848, + "grad_norm": 0.6216056944469172, + "learning_rate": 1.1922876693653585e-05, + "loss": 1.0539, + "step": 106 + }, + { + "epoch": 0.856, + "grad_norm": 0.6030188752416026, + "learning_rate": 1.0723215490213634e-05, + "loss": 1.1155, + "step": 107 + }, + { + "epoch": 0.864, + "grad_norm": 0.6116320512749103, + "learning_rate": 9.583733034714981e-06, + "loss": 1.0367, + "step": 108 + }, + { + "epoch": 0.872, + "grad_norm": 0.6241071081816516, + "learning_rate": 8.505197417404687e-06, + "loss": 1.0317, + "step": 109 + }, + { + "epoch": 0.88, + "grad_norm": 0.6014326320967628, + "learning_rate": 7.488335646131628e-06, + "loss": 1.0507, + "step": 110 + }, + { + "epoch": 0.888, + "grad_norm": 0.6058356600728491, + "learning_rate": 6.533833156292679e-06, + "loss": 1.0841, + "step": 111 + }, + { + "epoch": 0.896, + "grad_norm": 0.6327026531424363, + "learning_rate": 5.6423333488018095e-06, + "loss": 1.1028, + "step": 112 + }, + { + "epoch": 0.904, + "grad_norm": 0.5560188144128159, + "learning_rate": 4.8144371563930476e-06, + "loss": 1.0278, + "step": 113 + }, + { + "epoch": 0.912, + "grad_norm": 0.683157124015512, + "learning_rate": 4.050702638550275e-06, + "loss": 1.0732, + "step": 114 + }, + { + "epoch": 0.92, + "grad_norm": 0.5661517981730091, + "learning_rate": 3.3516446053363015e-06, + "loss": 1.0709, + "step": 115 + }, + { + "epoch": 0.928, + "grad_norm": 0.5465688334036988, + "learning_rate": 2.717734270375272e-06, + "loss": 1.0601, + "step": 116 + }, + { + "epoch": 0.936, + "grad_norm": 0.6549818213185664, + "learning_rate": 2.1493989332218468e-06, + "loss": 1.1295, + "step": 117 + }, + { + "epoch": 0.944, + "grad_norm": 0.6120862014550815, + "learning_rate": 1.6470216913317626e-06, + "loss": 1.0448, + "step": 118 + }, + { + "epoch": 0.952, + "grad_norm": 0.5638666360794156, + "learning_rate": 1.2109411818274852e-06, + "loss": 1.0509, + "step": 119 + }, + { + "epoch": 0.96, + "grad_norm": 0.6363582988990896, + "learning_rate": 8.41451353233369e-07, + "loss": 1.082, + "step": 120 + }, + { + "epoch": 0.968, + "grad_norm": 0.5875639830126131, + "learning_rate": 5.388012673338661e-07, + "loss": 1.1082, + "step": 121 + }, + { + "epoch": 0.976, + "grad_norm": 0.5494186467411913, + "learning_rate": 3.0319493128866396e-07, + "loss": 1.0868, + "step": 122 + }, + { + "epoch": 0.984, + "grad_norm": 0.64150816142503, + "learning_rate": 1.3479116011769767e-07, + "loss": 1.0792, + "step": 123 + }, + { + "epoch": 0.992, + "grad_norm": 0.6834676443584711, + "learning_rate": 3.370346964876036e-08, + "loss": 1.1639, + "step": 124 + }, + { + "epoch": 1.0, + "grad_norm": 0.5706584527747506, + "learning_rate": 0.0, + "loss": 1.0897, + "step": 125 + }, + { + "epoch": 1.0, + "step": 125, + "total_flos": 65356609552384.0, + "train_loss": 1.127972110748291, + "train_runtime": 1526.5357, + "train_samples_per_second": 1.31, + "train_steps_per_second": 0.082 + } + ], + "logging_steps": 1.0, + "max_steps": 125, + "num_input_tokens_seen": 0, + "num_train_epochs": 1, + "save_steps": 500, + "stateful_callbacks": { + "TrainerControl": { + "args": { + "should_epoch_stop": false, + "should_evaluate": false, + "should_log": false, + "should_save": false, + "should_training_stop": false + }, + "attributes": {} + } + }, + "total_flos": 65356609552384.0, + "train_batch_size": 8, + "trial_name": null, + "trial_params": null +} diff --git a/single_dataset/gpt4o_conversations/VideoGameBunny_v1_1-Llama-3-8B-V-gpt4o_conversations_dataset_2000_epochs_2_lora/README.md b/single_dataset/gpt4o_conversations/VideoGameBunny_v1_1-Llama-3-8B-V-gpt4o_conversations_dataset_2000_epochs_2_lora/README.md new file mode 100644 index 0000000000000000000000000000000000000000..4f291d8d6fdb82fcff274f83bcc52b834347797c --- /dev/null +++ b/single_dataset/gpt4o_conversations/VideoGameBunny_v1_1-Llama-3-8B-V-gpt4o_conversations_dataset_2000_epochs_2_lora/README.md @@ -0,0 +1,202 @@ +--- +library_name: peft +base_model: ./weights/Bunny-v1_1-Llama-3-8B-V +--- + +# Model Card for Model ID + + + + + +## Model Details + +### Model Description + + + + + +- **Developed by:** [More Information Needed] +- **Funded by [optional]:** [More Information Needed] +- **Shared by [optional]:** [More Information Needed] +- **Model type:** [More Information Needed] +- **Language(s) (NLP):** [More Information Needed] +- **License:** [More Information Needed] +- **Finetuned from model [optional]:** [More Information Needed] + +### Model Sources [optional] + + + +- **Repository:** [More Information Needed] +- **Paper [optional]:** [More Information Needed] +- **Demo [optional]:** [More Information Needed] + +## Uses + + + +### Direct Use + + + +[More Information Needed] + +### Downstream Use [optional] + + + +[More Information Needed] + +### Out-of-Scope Use + + + +[More Information Needed] + +## Bias, Risks, and Limitations + + + +[More Information Needed] + +### Recommendations + + + +Users (both direct and downstream) should be made aware of the risks, biases and limitations of the model. More information needed for further recommendations. + +## How to Get Started with the Model + +Use the code below to get started with the model. + +[More Information Needed] + +## Training Details + +### Training Data + + + +[More Information Needed] + +### Training Procedure + + + +#### Preprocessing [optional] + +[More Information Needed] + + +#### Training Hyperparameters + +- **Training regime:** [More Information Needed] + +#### Speeds, Sizes, Times [optional] + + + +[More Information Needed] + +## Evaluation + + + +### Testing Data, Factors & Metrics + +#### Testing Data + + + +[More Information Needed] + +#### Factors + + + +[More Information Needed] + +#### Metrics + + + +[More Information Needed] + +### Results + +[More Information Needed] + +#### Summary + + + +## Model Examination [optional] + + + +[More Information Needed] + +## Environmental Impact + + + +Carbon emissions can be estimated using the [Machine Learning Impact calculator](https://mlco2.github.io/impact#compute) presented in [Lacoste et al. (2019)](https://arxiv.org/abs/1910.09700). + +- **Hardware Type:** [More Information Needed] +- **Hours used:** [More Information Needed] +- **Cloud Provider:** [More Information Needed] +- **Compute Region:** [More Information Needed] +- **Carbon Emitted:** [More Information Needed] + +## Technical Specifications [optional] + +### Model Architecture and Objective + +[More Information Needed] + +### Compute Infrastructure + +[More Information Needed] + +#### Hardware + +[More Information Needed] + +#### Software + +[More Information Needed] + +## Citation [optional] + + + +**BibTeX:** + +[More Information Needed] + +**APA:** + +[More Information Needed] + +## Glossary [optional] + + + +[More Information Needed] + +## More Information [optional] + +[More Information Needed] + +## Model Card Authors [optional] + +[More Information Needed] + +## Model Card Contact + +[More Information Needed] +### Framework versions + +- PEFT 0.11.1 \ No newline at end of file diff --git a/single_dataset/gpt4o_conversations/VideoGameBunny_v1_1-Llama-3-8B-V-gpt4o_conversations_dataset_2000_epochs_2_lora/adapter_config.json b/single_dataset/gpt4o_conversations/VideoGameBunny_v1_1-Llama-3-8B-V-gpt4o_conversations_dataset_2000_epochs_2_lora/adapter_config.json new file mode 100644 index 0000000000000000000000000000000000000000..1c5c0067886b078acc06f8b16e5e4456e15c87f4 --- /dev/null +++ b/single_dataset/gpt4o_conversations/VideoGameBunny_v1_1-Llama-3-8B-V-gpt4o_conversations_dataset_2000_epochs_2_lora/adapter_config.json @@ -0,0 +1,34 @@ +{ + "alpha_pattern": {}, + "auto_mapping": null, + "base_model_name_or_path": "./weights/Bunny-v1_1-Llama-3-8B-V", + "bias": "none", + "fan_in_fan_out": false, + "inference_mode": true, + "init_lora_weights": true, + "layer_replication": null, + "layers_pattern": null, + "layers_to_transform": null, + "loftq_config": {}, + "lora_alpha": 256, + "lora_dropout": 0.1, + "megatron_config": null, + "megatron_core": "megatron.core", + "modules_to_save": null, + "peft_type": "LORA", + "r": 128, + "rank_pattern": {}, + "revision": null, + "target_modules": [ + "o_proj", + "down_proj", + "k_proj", + "gate_proj", + "q_proj", + "up_proj", + "v_proj" + ], + "task_type": "CAUSAL_LM", + "use_dora": false, + "use_rslora": false +} \ No newline at end of file diff --git a/single_dataset/gpt4o_conversations/VideoGameBunny_v1_1-Llama-3-8B-V-gpt4o_conversations_dataset_2000_epochs_2_lora/adapter_model.safetensors b/single_dataset/gpt4o_conversations/VideoGameBunny_v1_1-Llama-3-8B-V-gpt4o_conversations_dataset_2000_epochs_2_lora/adapter_model.safetensors new file mode 100644 index 0000000000000000000000000000000000000000..df081a3bca8324aeb9f125e1dafdeeecd168a316 --- /dev/null +++ b/single_dataset/gpt4o_conversations/VideoGameBunny_v1_1-Llama-3-8B-V-gpt4o_conversations_dataset_2000_epochs_2_lora/adapter_model.safetensors @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:c30251737ad375f44282c19e038d1ce7a52bfcfb7c9c4e7799a2e069687d5784 +size 671150064 diff --git a/single_dataset/gpt4o_conversations/VideoGameBunny_v1_1-Llama-3-8B-V-gpt4o_conversations_dataset_2000_epochs_2_lora/config.json b/single_dataset/gpt4o_conversations/VideoGameBunny_v1_1-Llama-3-8B-V-gpt4o_conversations_dataset_2000_epochs_2_lora/config.json new file mode 100644 index 0000000000000000000000000000000000000000..ee7ed86f4ff6f3381a6bb24981934be89aa376e7 --- /dev/null +++ b/single_dataset/gpt4o_conversations/VideoGameBunny_v1_1-Llama-3-8B-V-gpt4o_conversations_dataset_2000_epochs_2_lora/config.json @@ -0,0 +1,45 @@ +{ + "_name_or_path": "./weights/Bunny-v1_1-Llama-3-8B-V", + "architectures": [ + "BunnyLlamaForCausalLM" + ], + "attention_bias": false, + "attention_dropout": 0.0, + "auto_map": { + "AutoConfig": "configuration_bunny_llama.BunnyLlamaConfig", + "AutoModelForCausalLM": "modeling_bunny_llama.BunnyLlamaForCausalLM" + }, + "bos_token_id": 128000, + "continuous_training": false, + "eos_token_id": 128001, + "freeze_mm_mlp_adapter": false, + "hidden_act": "silu", + "hidden_size": 4096, + "image_aspect_ratio": "pad", + "initializer_range": 0.02, + "intermediate_size": 14336, + "max_position_embeddings": 8192, + "mm_hidden_size": 3456, + "mm_projector_lr": null, + "mm_projector_type": "mlp2x_gelu", + "mm_vision_tower": "./weights/siglip-so400m-patch14-384", + "model_type": "bunny-llama", + "num_attention_heads": 32, + "num_hidden_layers": 32, + "num_key_value_heads": 8, + "pretraining_tp": 1, + "rms_norm_eps": 1e-05, + "rope_scaling": null, + "rope_theta": 500000.0, + "tie_word_embeddings": false, + "tokenizer_model_max_length": 2048, + "tokenizer_padding_side": "right", + "torch_dtype": "float16", + "transformers_version": "4.41.2", + "tune_mm_mlp_adapter": false, + "unfreeze_vision_tower": true, + "use_cache": true, + "use_mm_proj": true, + "use_s2": true, + "vocab_size": 128256 +} diff --git a/single_dataset/gpt4o_conversations/VideoGameBunny_v1_1-Llama-3-8B-V-gpt4o_conversations_dataset_2000_epochs_2_lora/non_lora_trainables.bin b/single_dataset/gpt4o_conversations/VideoGameBunny_v1_1-Llama-3-8B-V-gpt4o_conversations_dataset_2000_epochs_2_lora/non_lora_trainables.bin new file mode 100644 index 0000000000000000000000000000000000000000..e31953eef7860edb89ad3d64bcc8e2ae32a3dcfe --- /dev/null +++ b/single_dataset/gpt4o_conversations/VideoGameBunny_v1_1-Llama-3-8B-V-gpt4o_conversations_dataset_2000_epochs_2_lora/non_lora_trainables.bin @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:e626b7ff4f509eb7086f12ddce72968709b200a65e1ae2f8112e00a05200d71e +size 918507402 diff --git a/single_dataset/gpt4o_conversations/VideoGameBunny_v1_1-Llama-3-8B-V-gpt4o_conversations_dataset_2000_epochs_2_lora/trainer_state.json b/single_dataset/gpt4o_conversations/VideoGameBunny_v1_1-Llama-3-8B-V-gpt4o_conversations_dataset_2000_epochs_2_lora/trainer_state.json new file mode 100644 index 0000000000000000000000000000000000000000..f24d41ccbb2832582567f1369d7a0956c5dfc996 --- /dev/null +++ b/single_dataset/gpt4o_conversations/VideoGameBunny_v1_1-Llama-3-8B-V-gpt4o_conversations_dataset_2000_epochs_2_lora/trainer_state.json @@ -0,0 +1,1792 @@ +{ + "best_metric": null, + "best_model_checkpoint": null, + "epoch": 2.0, + "eval_steps": 500, + "global_step": 250, + "is_hyper_param_search": false, + "is_local_process_zero": true, + "is_world_process_zero": true, + "log_history": [ + { + "epoch": 0.008, + "grad_norm": 1.0906556698514653, + "learning_rate": 2.5e-05, + "loss": 1.4204, + "step": 1 + }, + { + "epoch": 0.016, + "grad_norm": 0.9857349066149346, + "learning_rate": 5e-05, + "loss": 1.2175, + "step": 2 + }, + { + "epoch": 0.024, + "grad_norm": 0.8615599929924261, + "learning_rate": 7.500000000000001e-05, + "loss": 1.3035, + "step": 3 + }, + { + "epoch": 0.032, + "grad_norm": 0.7690914837028592, + "learning_rate": 0.0001, + "loss": 1.2571, + "step": 4 + }, + { + "epoch": 0.04, + "grad_norm": 1.0401337718522603, + "learning_rate": 0.000125, + "loss": 1.2383, + "step": 5 + }, + { + "epoch": 0.048, + "grad_norm": 0.9277157219721284, + "learning_rate": 0.00015000000000000001, + "loss": 1.253, + "step": 6 + }, + { + "epoch": 0.056, + "grad_norm": 0.7611097693761704, + "learning_rate": 0.000175, + "loss": 1.2298, + "step": 7 + }, + { + "epoch": 0.064, + "grad_norm": 0.7467727205084197, + "learning_rate": 0.0002, + "loss": 1.2594, + "step": 8 + }, + { + "epoch": 0.072, + "grad_norm": 0.6443610988856757, + "learning_rate": 0.0001999915737775817, + "loss": 1.2015, + "step": 9 + }, + { + "epoch": 0.08, + "grad_norm": 0.7890743435032062, + "learning_rate": 0.00019996629653035126, + "loss": 1.1815, + "step": 10 + }, + { + "epoch": 0.088, + "grad_norm": 0.7765333577639731, + "learning_rate": 0.00019992417251814282, + "loss": 1.1301, + "step": 11 + }, + { + "epoch": 0.096, + "grad_norm": 0.7965071182224438, + "learning_rate": 0.00019986520883988232, + "loss": 1.1537, + "step": 12 + }, + { + "epoch": 0.104, + "grad_norm": 0.7348769990385856, + "learning_rate": 0.0001997894154323911, + "loss": 1.1746, + "step": 13 + }, + { + "epoch": 0.112, + "grad_norm": 0.7643746584209018, + "learning_rate": 0.00019969680506871137, + "loss": 1.2597, + "step": 14 + }, + { + "epoch": 0.12, + "grad_norm": 0.8509186289214853, + "learning_rate": 0.0001995873933559535, + "loss": 1.0828, + "step": 15 + }, + { + "epoch": 0.128, + "grad_norm": 0.6590660613580283, + "learning_rate": 0.00019946119873266613, + "loss": 1.1688, + "step": 16 + }, + { + "epoch": 0.136, + "grad_norm": 0.6908530470091816, + "learning_rate": 0.0001993182424657285, + "loss": 1.0842, + "step": 17 + }, + { + "epoch": 0.144, + "grad_norm": 0.777278529092561, + "learning_rate": 0.00019915854864676664, + "loss": 1.2296, + "step": 18 + }, + { + "epoch": 0.152, + "grad_norm": 0.6297881039801307, + "learning_rate": 0.0001989821441880933, + "loss": 1.1193, + "step": 19 + }, + { + "epoch": 0.16, + "grad_norm": 0.646049560969531, + "learning_rate": 0.00019878905881817252, + "loss": 1.1809, + "step": 20 + }, + { + "epoch": 0.168, + "grad_norm": 0.70453563966353, + "learning_rate": 0.0001985793250766098, + "loss": 1.2028, + "step": 21 + }, + { + "epoch": 0.176, + "grad_norm": 0.6216762423800736, + "learning_rate": 0.00019835297830866826, + "loss": 1.1419, + "step": 22 + }, + { + "epoch": 0.184, + "grad_norm": 0.6748615041776291, + "learning_rate": 0.00019811005665931205, + "loss": 1.1702, + "step": 23 + }, + { + "epoch": 0.192, + "grad_norm": 0.6462797823043912, + "learning_rate": 0.00019785060106677818, + "loss": 1.1681, + "step": 24 + }, + { + "epoch": 0.2, + "grad_norm": 0.6639668928998315, + "learning_rate": 0.0001975746552556772, + "loss": 1.1807, + "step": 25 + }, + { + "epoch": 0.208, + "grad_norm": 0.6201202500087949, + "learning_rate": 0.00019728226572962473, + "loss": 1.0909, + "step": 26 + }, + { + "epoch": 0.216, + "grad_norm": 0.6179247844982341, + "learning_rate": 0.0001969734817634044, + "loss": 1.1761, + "step": 27 + }, + { + "epoch": 0.224, + "grad_norm": 0.6490118886653817, + "learning_rate": 0.0001966483553946637, + "loss": 1.0643, + "step": 28 + }, + { + "epoch": 0.232, + "grad_norm": 0.6590708173604165, + "learning_rate": 0.00019630694141514464, + "loss": 1.1333, + "step": 29 + }, + { + "epoch": 0.24, + "grad_norm": 0.6815264154798509, + "learning_rate": 0.00019594929736144976, + "loss": 1.1031, + "step": 30 + }, + { + "epoch": 0.248, + "grad_norm": 0.7390307701064562, + "learning_rate": 0.0001955754835053459, + "loss": 1.2164, + "step": 31 + }, + { + "epoch": 0.256, + "grad_norm": 0.6598152894051182, + "learning_rate": 0.00019518556284360696, + "loss": 1.1077, + "step": 32 + }, + { + "epoch": 0.264, + "grad_norm": 0.6217047883076768, + "learning_rate": 0.0001947796010873974, + "loss": 1.0874, + "step": 33 + }, + { + "epoch": 0.272, + "grad_norm": 0.6260867883739055, + "learning_rate": 0.0001943576666511982, + "loss": 1.0549, + "step": 34 + }, + { + "epoch": 0.28, + "grad_norm": 0.6505612935212982, + "learning_rate": 0.0001939198306412775, + "loss": 1.1254, + "step": 35 + }, + { + "epoch": 0.288, + "grad_norm": 0.6638582395118996, + "learning_rate": 0.0001934661668437073, + "loss": 1.1729, + "step": 36 + }, + { + "epoch": 0.296, + "grad_norm": 0.6760355472513383, + "learning_rate": 0.0001929967517119289, + "loss": 1.1392, + "step": 37 + }, + { + "epoch": 0.304, + "grad_norm": 0.689419394536968, + "learning_rate": 0.0001925116643538684, + "loss": 1.1996, + "step": 38 + }, + { + "epoch": 0.312, + "grad_norm": 0.643046392793243, + "learning_rate": 0.0001920109865186052, + "loss": 1.2519, + "step": 39 + }, + { + "epoch": 0.32, + "grad_norm": 0.6737071109245297, + "learning_rate": 0.00019149480258259533, + "loss": 1.1533, + "step": 40 + }, + { + "epoch": 0.328, + "grad_norm": 0.6041984234865607, + "learning_rate": 0.00019096319953545185, + "loss": 1.1374, + "step": 41 + }, + { + "epoch": 0.336, + "grad_norm": 0.6340061670489825, + "learning_rate": 0.00019041626696528503, + "loss": 1.0871, + "step": 42 + }, + { + "epoch": 0.344, + "grad_norm": 0.6792970281555901, + "learning_rate": 0.00018985409704360456, + "loss": 1.138, + "step": 43 + }, + { + "epoch": 0.352, + "grad_norm": 0.6675168072446115, + "learning_rate": 0.0001892767845097864, + "loss": 1.0906, + "step": 44 + }, + { + "epoch": 0.36, + "grad_norm": 0.6459838379601136, + "learning_rate": 0.00018868442665510678, + "loss": 1.1354, + "step": 45 + }, + { + "epoch": 0.368, + "grad_norm": 0.6654945017142322, + "learning_rate": 0.00018807712330634642, + "loss": 1.2151, + "step": 46 + }, + { + "epoch": 0.376, + "grad_norm": 0.6504567057295806, + "learning_rate": 0.00018745497680896722, + "loss": 1.1807, + "step": 47 + }, + { + "epoch": 0.384, + "grad_norm": 0.6707619740477763, + "learning_rate": 0.0001868180920098644, + "loss": 1.1407, + "step": 48 + }, + { + "epoch": 0.392, + "grad_norm": 0.6838401347017035, + "learning_rate": 0.0001861665762396974, + "loss": 1.0861, + "step": 49 + }, + { + "epoch": 0.4, + "grad_norm": 0.6408245265616914, + "learning_rate": 0.00018550053929480202, + "loss": 1.1278, + "step": 50 + }, + { + "epoch": 0.408, + "grad_norm": 0.5920380892978044, + "learning_rate": 0.00018482009341868697, + "loss": 1.0847, + "step": 51 + }, + { + "epoch": 0.416, + "grad_norm": 0.6843282752275007, + "learning_rate": 0.00018412535328311814, + "loss": 1.1665, + "step": 52 + }, + { + "epoch": 0.424, + "grad_norm": 0.6412113867201495, + "learning_rate": 0.00018341643596879367, + "loss": 1.0581, + "step": 53 + }, + { + "epoch": 0.432, + "grad_norm": 0.616003151361979, + "learning_rate": 0.0001826934609456129, + "loss": 1.1037, + "step": 54 + }, + { + "epoch": 0.44, + "grad_norm": 0.6951255358669098, + "learning_rate": 0.00018195655005254273, + "loss": 1.1601, + "step": 55 + }, + { + "epoch": 0.448, + "grad_norm": 0.6429783633007734, + "learning_rate": 0.00018120582747708502, + "loss": 1.1597, + "step": 56 + }, + { + "epoch": 0.456, + "grad_norm": 0.6067825702732784, + "learning_rate": 0.00018044141973434758, + "loss": 1.0717, + "step": 57 + }, + { + "epoch": 0.464, + "grad_norm": 0.6698768363154343, + "learning_rate": 0.0001796634556457236, + "loss": 1.1689, + "step": 58 + }, + { + "epoch": 0.472, + "grad_norm": 0.6217186674790761, + "learning_rate": 0.00017887206631718203, + "loss": 1.1173, + "step": 59 + }, + { + "epoch": 0.48, + "grad_norm": 0.6508142702807356, + "learning_rate": 0.0001780673851171728, + "loss": 1.1203, + "step": 60 + }, + { + "epoch": 0.488, + "grad_norm": 0.588948225712465, + "learning_rate": 0.00017724954765415137, + "loss": 1.1332, + "step": 61 + }, + { + "epoch": 0.496, + "grad_norm": 0.6632354263972068, + "learning_rate": 0.00017641869175372493, + "loss": 1.176, + "step": 62 + }, + { + "epoch": 0.504, + "grad_norm": 0.7029142534588849, + "learning_rate": 0.00017557495743542585, + "loss": 1.1699, + "step": 63 + }, + { + "epoch": 0.512, + "grad_norm": 0.622116683049225, + "learning_rate": 0.00017471848688911464, + "loss": 1.1055, + "step": 64 + }, + { + "epoch": 0.52, + "grad_norm": 0.6151441614463127, + "learning_rate": 0.00017384942445101772, + "loss": 1.2301, + "step": 65 + }, + { + "epoch": 0.528, + "grad_norm": 0.6717386555525499, + "learning_rate": 0.000172967916579403, + "loss": 1.2032, + "step": 66 + }, + { + "epoch": 0.536, + "grad_norm": 0.6081467264082907, + "learning_rate": 0.00017207411182989832, + "loss": 1.1567, + "step": 67 + }, + { + "epoch": 0.544, + "grad_norm": 0.6204888422023926, + "learning_rate": 0.00017116816083045602, + "loss": 1.1106, + "step": 68 + }, + { + "epoch": 0.552, + "grad_norm": 0.6404949562625138, + "learning_rate": 0.00017025021625596853, + "loss": 1.0956, + "step": 69 + }, + { + "epoch": 0.56, + "grad_norm": 0.6417814085898819, + "learning_rate": 0.0001693204328025389, + "loss": 1.1356, + "step": 70 + }, + { + "epoch": 0.568, + "grad_norm": 0.635016572212586, + "learning_rate": 0.0001683789671614107, + "loss": 1.1492, + "step": 71 + }, + { + "epoch": 0.576, + "grad_norm": 0.6308134251683191, + "learning_rate": 0.00016742597799256182, + "loss": 1.128, + "step": 72 + }, + { + "epoch": 0.584, + "grad_norm": 1.0838315900856792, + "learning_rate": 0.00016646162589796615, + "loss": 1.0663, + "step": 73 + }, + { + "epoch": 0.592, + "grad_norm": 0.6529443760516622, + "learning_rate": 0.00016548607339452853, + "loss": 1.1941, + "step": 74 + }, + { + "epoch": 0.6, + "grad_norm": 0.6110384281678047, + "learning_rate": 0.00016449948488669639, + "loss": 1.0774, + "step": 75 + }, + { + "epoch": 0.608, + "grad_norm": 0.6384046652101805, + "learning_rate": 0.00016350202663875386, + "loss": 1.1373, + "step": 76 + }, + { + "epoch": 0.616, + "grad_norm": 0.5880440159147562, + "learning_rate": 0.00016249386674680184, + "loss": 1.0892, + "step": 77 + }, + { + "epoch": 0.624, + "grad_norm": 0.6052022720463669, + "learning_rate": 0.0001614751751104301, + "loss": 1.0997, + "step": 78 + }, + { + "epoch": 0.632, + "grad_norm": 0.8360588585364545, + "learning_rate": 0.00016044612340408466, + "loss": 1.1554, + "step": 79 + }, + { + "epoch": 0.64, + "grad_norm": 0.6064397572432122, + "learning_rate": 0.00015940688504813662, + "loss": 1.0517, + "step": 80 + }, + { + "epoch": 0.648, + "grad_norm": 0.5937622017131812, + "learning_rate": 0.00015835763517965673, + "loss": 1.1775, + "step": 81 + }, + { + "epoch": 0.656, + "grad_norm": 0.5858555438053201, + "learning_rate": 0.00015729855062290022, + "loss": 1.1757, + "step": 82 + }, + { + "epoch": 0.664, + "grad_norm": 0.6384214991232625, + "learning_rate": 0.0001562298098595078, + "loss": 1.202, + "step": 83 + }, + { + "epoch": 0.672, + "grad_norm": 0.6373560727609033, + "learning_rate": 0.00015515159299842707, + "loss": 1.1889, + "step": 84 + }, + { + "epoch": 0.68, + "grad_norm": 0.6221210303020743, + "learning_rate": 0.00015406408174555976, + "loss": 1.0921, + "step": 85 + }, + { + "epoch": 0.688, + "grad_norm": 0.5810311421752556, + "learning_rate": 0.00015296745937313987, + "loss": 1.0685, + "step": 86 + }, + { + "epoch": 0.696, + "grad_norm": 0.6574941541121385, + "learning_rate": 0.00015186191068884775, + "loss": 1.1256, + "step": 87 + }, + { + "epoch": 0.704, + "grad_norm": 0.6135880637902962, + "learning_rate": 0.00015074762200466556, + "loss": 1.2139, + "step": 88 + }, + { + "epoch": 0.712, + "grad_norm": 0.5887150663213712, + "learning_rate": 0.00014962478110547918, + "loss": 1.2012, + "step": 89 + }, + { + "epoch": 0.72, + "grad_norm": 0.5720216657559922, + "learning_rate": 0.00014849357721743168, + "loss": 1.1177, + "step": 90 + }, + { + "epoch": 0.728, + "grad_norm": 0.579631383034893, + "learning_rate": 0.0001473542009760343, + "loss": 1.1278, + "step": 91 + }, + { + "epoch": 0.736, + "grad_norm": 0.6048021015601645, + "learning_rate": 0.00014620684439403962, + "loss": 1.1421, + "step": 92 + }, + { + "epoch": 0.744, + "grad_norm": 0.5855942323086678, + "learning_rate": 0.0001450517008290827, + "loss": 1.0698, + "step": 93 + }, + { + "epoch": 0.752, + "grad_norm": 0.5992340797396072, + "learning_rate": 0.0001438889649510956, + "loss": 1.1308, + "step": 94 + }, + { + "epoch": 0.76, + "grad_norm": 0.657035857991531, + "learning_rate": 0.00014271883270950073, + "loss": 1.1656, + "step": 95 + }, + { + "epoch": 0.768, + "grad_norm": 0.5993609083935862, + "learning_rate": 0.00014154150130018866, + "loss": 1.1342, + "step": 96 + }, + { + "epoch": 0.776, + "grad_norm": 0.594031128592101, + "learning_rate": 0.00014035716913228568, + "loss": 1.0296, + "step": 97 + }, + { + "epoch": 0.784, + "grad_norm": 0.6343533146102689, + "learning_rate": 0.00013916603579471705, + "loss": 1.1021, + "step": 98 + }, + { + "epoch": 0.792, + "grad_norm": 0.6370282993472896, + "learning_rate": 0.0001379683020225714, + "loss": 1.1423, + "step": 99 + }, + { + "epoch": 0.8, + "grad_norm": 0.5999959123873777, + "learning_rate": 0.000136764169663272, + "loss": 1.1809, + "step": 100 + }, + { + "epoch": 0.808, + "grad_norm": 0.6052322798739889, + "learning_rate": 0.00013555384164256048, + "loss": 1.0853, + "step": 101 + }, + { + "epoch": 0.816, + "grad_norm": 0.627014952912481, + "learning_rate": 0.00013433752193029886, + "loss": 1.1041, + "step": 102 + }, + { + "epoch": 0.824, + "grad_norm": 0.6272955192024172, + "learning_rate": 0.00013311541550609565, + "loss": 1.1716, + "step": 103 + }, + { + "epoch": 0.832, + "grad_norm": 0.5935607523672076, + "learning_rate": 0.00013188772832476188, + "loss": 1.1208, + "step": 104 + }, + { + "epoch": 0.84, + "grad_norm": 0.650271337449353, + "learning_rate": 0.00013065466728160252, + "loss": 1.1299, + "step": 105 + }, + { + "epoch": 0.848, + "grad_norm": 0.6508898010705562, + "learning_rate": 0.00012941644017754964, + "loss": 1.0757, + "step": 106 + }, + { + "epoch": 0.856, + "grad_norm": 0.6307374194277771, + "learning_rate": 0.00012817325568414297, + "loss": 1.1246, + "step": 107 + }, + { + "epoch": 0.864, + "grad_norm": 0.5874661015571683, + "learning_rate": 0.00012692532330836346, + "loss": 1.0583, + "step": 108 + }, + { + "epoch": 0.872, + "grad_norm": 0.5880110073044159, + "learning_rate": 0.00012567285335732633, + "loss": 1.059, + "step": 109 + }, + { + "epoch": 0.88, + "grad_norm": 0.5927953736082483, + "learning_rate": 0.00012441605690283915, + "loss": 1.0698, + "step": 110 + }, + { + "epoch": 0.888, + "grad_norm": 0.6174223850149873, + "learning_rate": 0.00012315514574583113, + "loss": 1.1004, + "step": 111 + }, + { + "epoch": 0.896, + "grad_norm": 0.6401625589443253, + "learning_rate": 0.0001218903323806595, + "loss": 1.1198, + "step": 112 + }, + { + "epoch": 0.904, + "grad_norm": 0.6144835895939935, + "learning_rate": 0.00012062182995929882, + "loss": 1.0495, + "step": 113 + }, + { + "epoch": 0.912, + "grad_norm": 0.6160080819037824, + "learning_rate": 0.00011934985225541998, + "loss": 1.099, + "step": 114 + }, + { + "epoch": 0.92, + "grad_norm": 0.588716972446015, + "learning_rate": 0.0001180746136283638, + "loss": 1.0885, + "step": 115 + }, + { + "epoch": 0.928, + "grad_norm": 0.5878309914107679, + "learning_rate": 0.00011679632898701649, + "loss": 1.0793, + "step": 116 + }, + { + "epoch": 0.936, + "grad_norm": 0.6828359076557663, + "learning_rate": 0.00011551521375359206, + "loss": 1.1377, + "step": 117 + }, + { + "epoch": 0.944, + "grad_norm": 0.6583578970824269, + "learning_rate": 0.00011423148382732853, + "loss": 1.0516, + "step": 118 + }, + { + "epoch": 0.952, + "grad_norm": 0.6166318556513858, + "learning_rate": 0.00011294535554810354, + "loss": 1.0817, + "step": 119 + }, + { + "epoch": 0.96, + "grad_norm": 0.6893920168054586, + "learning_rate": 0.00011165704565997593, + "loss": 1.1034, + "step": 120 + }, + { + "epoch": 0.968, + "grad_norm": 0.7653561538457616, + "learning_rate": 0.00011036677127465889, + "loss": 1.1266, + "step": 121 + }, + { + "epoch": 0.976, + "grad_norm": 0.5801285599368534, + "learning_rate": 0.00010907474983493144, + "loss": 1.0964, + "step": 122 + }, + { + "epoch": 0.984, + "grad_norm": 0.6170823725640626, + "learning_rate": 0.00010778119907799398, + "loss": 1.0843, + "step": 123 + }, + { + "epoch": 0.992, + "grad_norm": 0.7582907445192895, + "learning_rate": 0.0001064863369987743, + "loss": 1.1703, + "step": 124 + }, + { + "epoch": 1.0, + "grad_norm": 0.6260013577906284, + "learning_rate": 0.00010519038181318999, + "loss": 1.0945, + "step": 125 + }, + { + "epoch": 1.008, + "grad_norm": 0.5494314613037083, + "learning_rate": 0.00010389355192137377, + "loss": 0.7531, + "step": 126 + }, + { + "epoch": 1.016, + "grad_norm": 0.5425739308066683, + "learning_rate": 0.00010259606587086783, + "loss": 0.7805, + "step": 127 + }, + { + "epoch": 1.024, + "grad_norm": 0.529115377582859, + "learning_rate": 0.0001012981423197931, + "loss": 0.7759, + "step": 128 + }, + { + "epoch": 1.032, + "grad_norm": 0.5692792123945668, + "learning_rate": 0.0001, + "loss": 0.7704, + "step": 129 + }, + { + "epoch": 1.04, + "grad_norm": 0.6186406508253389, + "learning_rate": 9.870185768020693e-05, + "loss": 0.7314, + "step": 130 + }, + { + "epoch": 1.048, + "grad_norm": 0.6721041733464441, + "learning_rate": 9.740393412913219e-05, + "loss": 0.8125, + "step": 131 + }, + { + "epoch": 1.056, + "grad_norm": 0.6948983333121643, + "learning_rate": 9.610644807862625e-05, + "loss": 0.7738, + "step": 132 + }, + { + "epoch": 1.064, + "grad_norm": 0.6916170510549642, + "learning_rate": 9.480961818681004e-05, + "loss": 0.782, + "step": 133 + }, + { + "epoch": 1.072, + "grad_norm": 0.6835237897842905, + "learning_rate": 9.35136630012257e-05, + "loss": 0.7564, + "step": 134 + }, + { + "epoch": 1.08, + "grad_norm": 0.6864061018600497, + "learning_rate": 9.221880092200601e-05, + "loss": 0.721, + "step": 135 + }, + { + "epoch": 1.088, + "grad_norm": 0.6676838293816894, + "learning_rate": 9.092525016506858e-05, + "loss": 0.7256, + "step": 136 + }, + { + "epoch": 1.096, + "grad_norm": 0.6127394586373243, + "learning_rate": 8.963322872534114e-05, + "loss": 0.6988, + "step": 137 + }, + { + "epoch": 1.104, + "grad_norm": 0.6589588798885051, + "learning_rate": 8.83429543400241e-05, + "loss": 0.8101, + "step": 138 + }, + { + "epoch": 1.112, + "grad_norm": 0.6338289492844081, + "learning_rate": 8.705464445189647e-05, + "loss": 0.7592, + "step": 139 + }, + { + "epoch": 1.12, + "grad_norm": 0.6681908245671879, + "learning_rate": 8.57685161726715e-05, + "loss": 0.7407, + "step": 140 + }, + { + "epoch": 1.1280000000000001, + "grad_norm": 0.6220519972190037, + "learning_rate": 8.448478624640797e-05, + "loss": 0.6765, + "step": 141 + }, + { + "epoch": 1.1360000000000001, + "grad_norm": 0.671093719017714, + "learning_rate": 8.320367101298351e-05, + "loss": 0.7479, + "step": 142 + }, + { + "epoch": 1.144, + "grad_norm": 0.6252540781942465, + "learning_rate": 8.192538637163621e-05, + "loss": 0.69, + "step": 143 + }, + { + "epoch": 1.152, + "grad_norm": 0.7025608874796816, + "learning_rate": 8.065014774458003e-05, + "loss": 0.7372, + "step": 144 + }, + { + "epoch": 1.16, + "grad_norm": 0.8710246282526877, + "learning_rate": 7.93781700407012e-05, + "loss": 0.7009, + "step": 145 + }, + { + "epoch": 1.168, + "grad_norm": 0.6957075564392686, + "learning_rate": 7.810966761934053e-05, + "loss": 0.7039, + "step": 146 + }, + { + "epoch": 1.176, + "grad_norm": 0.7236979642981014, + "learning_rate": 7.684485425416888e-05, + "loss": 0.7541, + "step": 147 + }, + { + "epoch": 1.184, + "grad_norm": 0.6570340780461766, + "learning_rate": 7.558394309716088e-05, + "loss": 0.6962, + "step": 148 + }, + { + "epoch": 1.192, + "grad_norm": 0.608597796982595, + "learning_rate": 7.432714664267373e-05, + "loss": 0.6648, + "step": 149 + }, + { + "epoch": 1.2, + "grad_norm": 0.6274556968757717, + "learning_rate": 7.307467669163655e-05, + "loss": 0.8017, + "step": 150 + }, + { + "epoch": 1.208, + "grad_norm": 0.646045678305703, + "learning_rate": 7.182674431585704e-05, + "loss": 0.687, + "step": 151 + }, + { + "epoch": 1.216, + "grad_norm": 0.615025745148123, + "learning_rate": 7.058355982245037e-05, + "loss": 0.7028, + "step": 152 + }, + { + "epoch": 1.224, + "grad_norm": 0.644127006643345, + "learning_rate": 6.934533271839752e-05, + "loss": 0.6849, + "step": 153 + }, + { + "epoch": 1.232, + "grad_norm": 0.6322032158257218, + "learning_rate": 6.811227167523815e-05, + "loss": 0.6858, + "step": 154 + }, + { + "epoch": 1.24, + "grad_norm": 0.7040220453985434, + "learning_rate": 6.688458449390437e-05, + "loss": 0.7036, + "step": 155 + }, + { + "epoch": 1.248, + "grad_norm": 0.7080898961868384, + "learning_rate": 6.566247806970119e-05, + "loss": 0.6814, + "step": 156 + }, + { + "epoch": 1.256, + "grad_norm": 0.6732544034749526, + "learning_rate": 6.444615835743955e-05, + "loss": 0.6868, + "step": 157 + }, + { + "epoch": 1.264, + "grad_norm": 0.6197892347959479, + "learning_rate": 6.323583033672799e-05, + "loss": 0.6852, + "step": 158 + }, + { + "epoch": 1.272, + "grad_norm": 0.665156386544487, + "learning_rate": 6.203169797742861e-05, + "loss": 0.6963, + "step": 159 + }, + { + "epoch": 1.28, + "grad_norm": 0.6814028088410238, + "learning_rate": 6.083396420528298e-05, + "loss": 0.7747, + "step": 160 + }, + { + "epoch": 1.288, + "grad_norm": 0.6852077562936856, + "learning_rate": 5.964283086771435e-05, + "loss": 0.6945, + "step": 161 + }, + { + "epoch": 1.296, + "grad_norm": 0.7147333175738334, + "learning_rate": 5.845849869981137e-05, + "loss": 0.7479, + "step": 162 + }, + { + "epoch": 1.304, + "grad_norm": 0.6688014606429251, + "learning_rate": 5.728116729049928e-05, + "loss": 0.6931, + "step": 163 + }, + { + "epoch": 1.312, + "grad_norm": 0.6029399268866422, + "learning_rate": 5.611103504890444e-05, + "loss": 0.7594, + "step": 164 + }, + { + "epoch": 1.32, + "grad_norm": 0.6405186403636638, + "learning_rate": 5.4948299170917325e-05, + "loss": 0.6861, + "step": 165 + }, + { + "epoch": 1.328, + "grad_norm": 0.6153553215285271, + "learning_rate": 5.379315560596038e-05, + "loss": 0.7219, + "step": 166 + }, + { + "epoch": 1.336, + "grad_norm": 0.6138403231780742, + "learning_rate": 5.26457990239657e-05, + "loss": 0.7178, + "step": 167 + }, + { + "epoch": 1.3439999999999999, + "grad_norm": 0.594607451880698, + "learning_rate": 5.1506422782568345e-05, + "loss": 0.6499, + "step": 168 + }, + { + "epoch": 1.3519999999999999, + "grad_norm": 0.6274961325954719, + "learning_rate": 5.0375218894520834e-05, + "loss": 0.7387, + "step": 169 + }, + { + "epoch": 1.3599999999999999, + "grad_norm": 0.6428236849437015, + "learning_rate": 4.9252377995334444e-05, + "loss": 0.6673, + "step": 170 + }, + { + "epoch": 1.3679999999999999, + "grad_norm": 0.7424922487103246, + "learning_rate": 4.813808931115228e-05, + "loss": 0.7083, + "step": 171 + }, + { + "epoch": 1.376, + "grad_norm": 0.6413719943869135, + "learning_rate": 4.703254062686017e-05, + "loss": 0.7688, + "step": 172 + }, + { + "epoch": 1.384, + "grad_norm": 0.6447719371076412, + "learning_rate": 4.593591825444028e-05, + "loss": 0.7062, + "step": 173 + }, + { + "epoch": 1.392, + "grad_norm": 0.6850728968609752, + "learning_rate": 4.484840700157295e-05, + "loss": 0.7522, + "step": 174 + }, + { + "epoch": 1.4, + "grad_norm": 0.655257297593665, + "learning_rate": 4.377019014049223e-05, + "loss": 0.7415, + "step": 175 + }, + { + "epoch": 1.408, + "grad_norm": 0.6649918375536736, + "learning_rate": 4.270144937709981e-05, + "loss": 0.7079, + "step": 176 + }, + { + "epoch": 1.416, + "grad_norm": 0.640170590584303, + "learning_rate": 4.164236482034327e-05, + "loss": 0.7246, + "step": 177 + }, + { + "epoch": 1.424, + "grad_norm": 0.6797620642919888, + "learning_rate": 4.059311495186338e-05, + "loss": 0.6517, + "step": 178 + }, + { + "epoch": 1.432, + "grad_norm": 0.680554997205006, + "learning_rate": 3.9553876595915375e-05, + "loss": 0.6751, + "step": 179 + }, + { + "epoch": 1.44, + "grad_norm": 0.6510525823177645, + "learning_rate": 3.852482488956992e-05, + "loss": 0.7289, + "step": 180 + }, + { + "epoch": 1.448, + "grad_norm": 0.6681387989836439, + "learning_rate": 3.750613325319817e-05, + "loss": 0.6989, + "step": 181 + }, + { + "epoch": 1.456, + "grad_norm": 0.682820922478662, + "learning_rate": 3.649797336124615e-05, + "loss": 0.6846, + "step": 182 + }, + { + "epoch": 1.464, + "grad_norm": 0.6669747221606931, + "learning_rate": 3.550051511330361e-05, + "loss": 0.7159, + "step": 183 + }, + { + "epoch": 1.472, + "grad_norm": 0.6346245539679889, + "learning_rate": 3.45139266054715e-05, + "loss": 0.722, + "step": 184 + }, + { + "epoch": 1.48, + "grad_norm": 0.6860528744206814, + "learning_rate": 3.3538374102033866e-05, + "loss": 0.7387, + "step": 185 + }, + { + "epoch": 1.488, + "grad_norm": 0.6466559324864782, + "learning_rate": 3.257402200743821e-05, + "loss": 0.6739, + "step": 186 + }, + { + "epoch": 1.496, + "grad_norm": 0.6023284293992224, + "learning_rate": 3.1621032838589305e-05, + "loss": 0.6686, + "step": 187 + }, + { + "epoch": 1.504, + "grad_norm": 0.6413527159496256, + "learning_rate": 3.0679567197461134e-05, + "loss": 0.748, + "step": 188 + }, + { + "epoch": 1.512, + "grad_norm": 0.6551109522736579, + "learning_rate": 2.974978374403147e-05, + "loss": 0.6868, + "step": 189 + }, + { + "epoch": 1.52, + "grad_norm": 0.6326824311431275, + "learning_rate": 2.8831839169543996e-05, + "loss": 0.7515, + "step": 190 + }, + { + "epoch": 1.528, + "grad_norm": 0.6385233894295355, + "learning_rate": 2.7925888170101665e-05, + "loss": 0.7054, + "step": 191 + }, + { + "epoch": 1.536, + "grad_norm": 0.6662115965676328, + "learning_rate": 2.7032083420597e-05, + "loss": 0.7029, + "step": 192 + }, + { + "epoch": 1.544, + "grad_norm": 0.6591688636277143, + "learning_rate": 2.6150575548982292e-05, + "loss": 0.6458, + "step": 193 + }, + { + "epoch": 1.552, + "grad_norm": 0.6936966434138461, + "learning_rate": 2.528151311088537e-05, + "loss": 0.7114, + "step": 194 + }, + { + "epoch": 1.56, + "grad_norm": 0.660940431444361, + "learning_rate": 2.4425042564574184e-05, + "loss": 0.7331, + "step": 195 + }, + { + "epoch": 1.568, + "grad_norm": 0.6364962057855832, + "learning_rate": 2.3581308246275103e-05, + "loss": 0.7192, + "step": 196 + }, + { + "epoch": 1.576, + "grad_norm": 0.6560039547156613, + "learning_rate": 2.2750452345848682e-05, + "loss": 0.7001, + "step": 197 + }, + { + "epoch": 1.584, + "grad_norm": 0.6585253364363486, + "learning_rate": 2.1932614882827197e-05, + "loss": 0.7239, + "step": 198 + }, + { + "epoch": 1.592, + "grad_norm": 0.624825451719301, + "learning_rate": 2.112793368281799e-05, + "loss": 0.6901, + "step": 199 + }, + { + "epoch": 1.6, + "grad_norm": 0.6649961244159159, + "learning_rate": 2.03365443542764e-05, + "loss": 0.699, + "step": 200 + }, + { + "epoch": 1.608, + "grad_norm": 0.6615988192210411, + "learning_rate": 1.9558580265652448e-05, + "loss": 0.731, + "step": 201 + }, + { + "epoch": 1.616, + "grad_norm": 0.669000346973526, + "learning_rate": 1.879417252291502e-05, + "loss": 0.7515, + "step": 202 + }, + { + "epoch": 1.624, + "grad_norm": 0.6625424275293328, + "learning_rate": 1.804344994745727e-05, + "loss": 0.7279, + "step": 203 + }, + { + "epoch": 1.6320000000000001, + "grad_norm": 0.6172627479461968, + "learning_rate": 1.730653905438714e-05, + "loss": 0.6609, + "step": 204 + }, + { + "epoch": 1.6400000000000001, + "grad_norm": 0.6968471322847645, + "learning_rate": 1.6583564031206357e-05, + "loss": 0.6419, + "step": 205 + }, + { + "epoch": 1.6480000000000001, + "grad_norm": 0.6646091376538787, + "learning_rate": 1.587464671688187e-05, + "loss": 0.7074, + "step": 206 + }, + { + "epoch": 1.6560000000000001, + "grad_norm": 0.7119786340374479, + "learning_rate": 1.5179906581313064e-05, + "loss": 0.7104, + "step": 207 + }, + { + "epoch": 1.6640000000000001, + "grad_norm": 0.6182072495204601, + "learning_rate": 1.4499460705197998e-05, + "loss": 0.6686, + "step": 208 + }, + { + "epoch": 1.6720000000000002, + "grad_norm": 0.6571653192714902, + "learning_rate": 1.3833423760302611e-05, + "loss": 0.6878, + "step": 209 + }, + { + "epoch": 1.6800000000000002, + "grad_norm": 0.6618007770830334, + "learning_rate": 1.3181907990135622e-05, + "loss": 0.6461, + "step": 210 + }, + { + "epoch": 1.688, + "grad_norm": 0.6817305234712389, + "learning_rate": 1.2545023191032801e-05, + "loss": 0.6321, + "step": 211 + }, + { + "epoch": 1.696, + "grad_norm": 0.6704808086973233, + "learning_rate": 1.1922876693653585e-05, + "loss": 0.7268, + "step": 212 + }, + { + "epoch": 1.704, + "grad_norm": 0.6185324770527085, + "learning_rate": 1.131557334489326e-05, + "loss": 0.7113, + "step": 213 + }, + { + "epoch": 1.712, + "grad_norm": 0.6764723338003922, + "learning_rate": 1.0723215490213634e-05, + "loss": 0.6954, + "step": 214 + }, + { + "epoch": 1.72, + "grad_norm": 0.6529991387340686, + "learning_rate": 1.0145902956395447e-05, + "loss": 0.6599, + "step": 215 + }, + { + "epoch": 1.728, + "grad_norm": 0.6470487796656074, + "learning_rate": 9.583733034714981e-06, + "loss": 0.6545, + "step": 216 + }, + { + "epoch": 1.736, + "grad_norm": 0.7156374080738279, + "learning_rate": 9.036800464548157e-06, + "loss": 0.7424, + "step": 217 + }, + { + "epoch": 1.744, + "grad_norm": 0.6628825567627252, + "learning_rate": 8.505197417404687e-06, + "loss": 0.64, + "step": 218 + }, + { + "epoch": 1.752, + "grad_norm": 0.6679173882701385, + "learning_rate": 7.989013481394814e-06, + "loss": 0.7379, + "step": 219 + }, + { + "epoch": 1.76, + "grad_norm": 0.6590707065664086, + "learning_rate": 7.488335646131628e-06, + "loss": 0.5963, + "step": 220 + }, + { + "epoch": 1.768, + "grad_norm": 0.6444345838715957, + "learning_rate": 7.003248288071118e-06, + "loss": 0.7151, + "step": 221 + }, + { + "epoch": 1.776, + "grad_norm": 0.675515451229952, + "learning_rate": 6.533833156292679e-06, + "loss": 0.6899, + "step": 222 + }, + { + "epoch": 1.784, + "grad_norm": 0.7208271256227047, + "learning_rate": 6.08016935872251e-06, + "loss": 0.592, + "step": 223 + }, + { + "epoch": 1.792, + "grad_norm": 0.6543039778126106, + "learning_rate": 5.6423333488018095e-06, + "loss": 0.725, + "step": 224 + }, + { + "epoch": 1.8, + "grad_norm": 0.6545834113447834, + "learning_rate": 5.22039891260262e-06, + "loss": 0.5796, + "step": 225 + }, + { + "epoch": 1.808, + "grad_norm": 0.6413464222219671, + "learning_rate": 4.8144371563930476e-06, + "loss": 0.6239, + "step": 226 + }, + { + "epoch": 1.8159999999999998, + "grad_norm": 0.6409389432940975, + "learning_rate": 4.424516494654118e-06, + "loss": 0.6732, + "step": 227 + }, + { + "epoch": 1.8239999999999998, + "grad_norm": 0.6130780753603059, + "learning_rate": 4.050702638550275e-06, + "loss": 0.7106, + "step": 228 + }, + { + "epoch": 1.8319999999999999, + "grad_norm": 0.6436868508715743, + "learning_rate": 3.693058584855369e-06, + "loss": 0.713, + "step": 229 + }, + { + "epoch": 1.8399999999999999, + "grad_norm": 0.644610773822548, + "learning_rate": 3.3516446053363015e-06, + "loss": 0.6644, + "step": 230 + }, + { + "epoch": 1.8479999999999999, + "grad_norm": 0.6742463101315687, + "learning_rate": 3.026518236595621e-06, + "loss": 0.7132, + "step": 231 + }, + { + "epoch": 1.8559999999999999, + "grad_norm": 0.623708408040731, + "learning_rate": 2.717734270375272e-06, + "loss": 0.708, + "step": 232 + }, + { + "epoch": 1.8639999999999999, + "grad_norm": 0.6353495695076489, + "learning_rate": 2.4253447443228106e-06, + "loss": 0.6728, + "step": 233 + }, + { + "epoch": 1.8719999999999999, + "grad_norm": 0.6470934000333828, + "learning_rate": 2.1493989332218468e-06, + "loss": 0.6684, + "step": 234 + }, + { + "epoch": 1.88, + "grad_norm": 0.6339743329664256, + "learning_rate": 1.8899433406879608e-06, + "loss": 0.7162, + "step": 235 + }, + { + "epoch": 1.888, + "grad_norm": 0.6991547215955025, + "learning_rate": 1.6470216913317626e-06, + "loss": 0.6655, + "step": 236 + }, + { + "epoch": 1.896, + "grad_norm": 0.6573698643261111, + "learning_rate": 1.4206749233902084e-06, + "loss": 0.6264, + "step": 237 + }, + { + "epoch": 1.904, + "grad_norm": 0.674748855909925, + "learning_rate": 1.2109411818274852e-06, + "loss": 0.6998, + "step": 238 + }, + { + "epoch": 1.912, + "grad_norm": 0.6717822723070102, + "learning_rate": 1.0178558119067315e-06, + "loss": 0.6694, + "step": 239 + }, + { + "epoch": 1.92, + "grad_norm": 0.6005919528342801, + "learning_rate": 8.41451353233369e-07, + "loss": 0.6788, + "step": 240 + }, + { + "epoch": 1.928, + "grad_norm": 0.6588728502179835, + "learning_rate": 6.817575342714988e-07, + "loss": 0.6907, + "step": 241 + }, + { + "epoch": 1.936, + "grad_norm": 0.6399991839714124, + "learning_rate": 5.388012673338661e-07, + "loss": 0.7308, + "step": 242 + }, + { + "epoch": 1.944, + "grad_norm": 0.6727098433398869, + "learning_rate": 4.126066440464982e-07, + "loss": 0.7206, + "step": 243 + }, + { + "epoch": 1.952, + "grad_norm": 0.6453747666541074, + "learning_rate": 3.0319493128866396e-07, + "loss": 0.6922, + "step": 244 + }, + { + "epoch": 1.96, + "grad_norm": 0.6826025782964477, + "learning_rate": 2.1058456760891798e-07, + "loss": 0.6359, + "step": 245 + }, + { + "epoch": 1.968, + "grad_norm": 0.6759076178003499, + "learning_rate": 1.3479116011769767e-07, + "loss": 0.7169, + "step": 246 + }, + { + "epoch": 1.976, + "grad_norm": 0.6557247806930547, + "learning_rate": 7.582748185719358e-08, + "loss": 0.662, + "step": 247 + }, + { + "epoch": 1.984, + "grad_norm": 0.6905158594933963, + "learning_rate": 3.370346964876036e-08, + "loss": 0.639, + "step": 248 + }, + { + "epoch": 1.992, + "grad_norm": 0.6703768445324131, + "learning_rate": 8.426222418311814e-09, + "loss": 0.7734, + "step": 249 + }, + { + "epoch": 2.0, + "grad_norm": 0.6528927891706773, + "learning_rate": 0.0, + "loss": 0.6719, + "step": 250 + }, + { + "epoch": 2.0, + "step": 250, + "total_flos": 130603858001920.0, + "train_loss": 0.9226167418956757, + "train_runtime": 3038.6804, + "train_samples_per_second": 1.316, + "train_steps_per_second": 0.082 + } + ], + "logging_steps": 1.0, + "max_steps": 250, + "num_input_tokens_seen": 0, + "num_train_epochs": 2, + "save_steps": 500, + "stateful_callbacks": { + "TrainerControl": { + "args": { + "should_epoch_stop": false, + "should_evaluate": false, + "should_log": false, + "should_save": false, + "should_training_stop": false + }, + "attributes": {} + } + }, + "total_flos": 130603858001920.0, + "train_batch_size": 8, + "trial_name": null, + "trial_params": null +} diff --git a/single_dataset/gpt4o_conversations/VideoGameBunny_v1_1-Llama-3-8B-V-gpt4o_conversations_dataset_5000_epochs_1_lora/README.md b/single_dataset/gpt4o_conversations/VideoGameBunny_v1_1-Llama-3-8B-V-gpt4o_conversations_dataset_5000_epochs_1_lora/README.md new file mode 100644 index 0000000000000000000000000000000000000000..4f291d8d6fdb82fcff274f83bcc52b834347797c --- /dev/null +++ b/single_dataset/gpt4o_conversations/VideoGameBunny_v1_1-Llama-3-8B-V-gpt4o_conversations_dataset_5000_epochs_1_lora/README.md @@ -0,0 +1,202 @@ +--- +library_name: peft +base_model: ./weights/Bunny-v1_1-Llama-3-8B-V +--- + +# Model Card for Model ID + + + + + +## Model Details + +### Model Description + + + + + +- **Developed by:** [More Information Needed] +- **Funded by [optional]:** [More Information Needed] +- **Shared by [optional]:** [More Information Needed] +- **Model type:** [More Information Needed] +- **Language(s) (NLP):** [More Information Needed] +- **License:** [More Information Needed] +- **Finetuned from model [optional]:** [More Information Needed] + +### Model Sources [optional] + + + +- **Repository:** [More Information Needed] +- **Paper [optional]:** [More Information Needed] +- **Demo [optional]:** [More Information Needed] + +## Uses + + + +### Direct Use + + + +[More Information Needed] + +### Downstream Use [optional] + + + +[More Information Needed] + +### Out-of-Scope Use + + + +[More Information Needed] + +## Bias, Risks, and Limitations + + + +[More Information Needed] + +### Recommendations + + + +Users (both direct and downstream) should be made aware of the risks, biases and limitations of the model. More information needed for further recommendations. + +## How to Get Started with the Model + +Use the code below to get started with the model. + +[More Information Needed] + +## Training Details + +### Training Data + + + +[More Information Needed] + +### Training Procedure + + + +#### Preprocessing [optional] + +[More Information Needed] + + +#### Training Hyperparameters + +- **Training regime:** [More Information Needed] + +#### Speeds, Sizes, Times [optional] + + + +[More Information Needed] + +## Evaluation + + + +### Testing Data, Factors & Metrics + +#### Testing Data + + + +[More Information Needed] + +#### Factors + + + +[More Information Needed] + +#### Metrics + + + +[More Information Needed] + +### Results + +[More Information Needed] + +#### Summary + + + +## Model Examination [optional] + + + +[More Information Needed] + +## Environmental Impact + + + +Carbon emissions can be estimated using the [Machine Learning Impact calculator](https://mlco2.github.io/impact#compute) presented in [Lacoste et al. (2019)](https://arxiv.org/abs/1910.09700). + +- **Hardware Type:** [More Information Needed] +- **Hours used:** [More Information Needed] +- **Cloud Provider:** [More Information Needed] +- **Compute Region:** [More Information Needed] +- **Carbon Emitted:** [More Information Needed] + +## Technical Specifications [optional] + +### Model Architecture and Objective + +[More Information Needed] + +### Compute Infrastructure + +[More Information Needed] + +#### Hardware + +[More Information Needed] + +#### Software + +[More Information Needed] + +## Citation [optional] + + + +**BibTeX:** + +[More Information Needed] + +**APA:** + +[More Information Needed] + +## Glossary [optional] + + + +[More Information Needed] + +## More Information [optional] + +[More Information Needed] + +## Model Card Authors [optional] + +[More Information Needed] + +## Model Card Contact + +[More Information Needed] +### Framework versions + +- PEFT 0.11.1 \ No newline at end of file diff --git a/single_dataset/gpt4o_conversations/VideoGameBunny_v1_1-Llama-3-8B-V-gpt4o_conversations_dataset_5000_epochs_1_lora/adapter_config.json b/single_dataset/gpt4o_conversations/VideoGameBunny_v1_1-Llama-3-8B-V-gpt4o_conversations_dataset_5000_epochs_1_lora/adapter_config.json new file mode 100644 index 0000000000000000000000000000000000000000..ec7b01dfe7159d02d5d955d904fbc05f76c827ed --- /dev/null +++ b/single_dataset/gpt4o_conversations/VideoGameBunny_v1_1-Llama-3-8B-V-gpt4o_conversations_dataset_5000_epochs_1_lora/adapter_config.json @@ -0,0 +1,34 @@ +{ + "alpha_pattern": {}, + "auto_mapping": null, + "base_model_name_or_path": "./weights/Bunny-v1_1-Llama-3-8B-V", + "bias": "none", + "fan_in_fan_out": false, + "inference_mode": true, + "init_lora_weights": true, + "layer_replication": null, + "layers_pattern": null, + "layers_to_transform": null, + "loftq_config": {}, + "lora_alpha": 256, + "lora_dropout": 0.1, + "megatron_config": null, + "megatron_core": "megatron.core", + "modules_to_save": null, + "peft_type": "LORA", + "r": 128, + "rank_pattern": {}, + "revision": null, + "target_modules": [ + "q_proj", + "up_proj", + "o_proj", + "v_proj", + "k_proj", + "gate_proj", + "down_proj" + ], + "task_type": "CAUSAL_LM", + "use_dora": false, + "use_rslora": false +} \ No newline at end of file diff --git a/single_dataset/gpt4o_conversations/VideoGameBunny_v1_1-Llama-3-8B-V-gpt4o_conversations_dataset_5000_epochs_1_lora/adapter_model.safetensors b/single_dataset/gpt4o_conversations/VideoGameBunny_v1_1-Llama-3-8B-V-gpt4o_conversations_dataset_5000_epochs_1_lora/adapter_model.safetensors new file mode 100644 index 0000000000000000000000000000000000000000..23df97ed34c2c1ca2c7159df525227430495f255 --- /dev/null +++ b/single_dataset/gpt4o_conversations/VideoGameBunny_v1_1-Llama-3-8B-V-gpt4o_conversations_dataset_5000_epochs_1_lora/adapter_model.safetensors @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:5e94296acb549b80024e300c9a323b71ee3077e46a71b5a7f17b0f989114a6e5 +size 671150064 diff --git a/single_dataset/gpt4o_conversations/VideoGameBunny_v1_1-Llama-3-8B-V-gpt4o_conversations_dataset_5000_epochs_1_lora/config.json b/single_dataset/gpt4o_conversations/VideoGameBunny_v1_1-Llama-3-8B-V-gpt4o_conversations_dataset_5000_epochs_1_lora/config.json new file mode 100644 index 0000000000000000000000000000000000000000..ee7ed86f4ff6f3381a6bb24981934be89aa376e7 --- /dev/null +++ b/single_dataset/gpt4o_conversations/VideoGameBunny_v1_1-Llama-3-8B-V-gpt4o_conversations_dataset_5000_epochs_1_lora/config.json @@ -0,0 +1,45 @@ +{ + "_name_or_path": "./weights/Bunny-v1_1-Llama-3-8B-V", + "architectures": [ + "BunnyLlamaForCausalLM" + ], + "attention_bias": false, + "attention_dropout": 0.0, + "auto_map": { + "AutoConfig": "configuration_bunny_llama.BunnyLlamaConfig", + "AutoModelForCausalLM": "modeling_bunny_llama.BunnyLlamaForCausalLM" + }, + "bos_token_id": 128000, + "continuous_training": false, + "eos_token_id": 128001, + "freeze_mm_mlp_adapter": false, + "hidden_act": "silu", + "hidden_size": 4096, + "image_aspect_ratio": "pad", + "initializer_range": 0.02, + "intermediate_size": 14336, + "max_position_embeddings": 8192, + "mm_hidden_size": 3456, + "mm_projector_lr": null, + "mm_projector_type": "mlp2x_gelu", + "mm_vision_tower": "./weights/siglip-so400m-patch14-384", + "model_type": "bunny-llama", + "num_attention_heads": 32, + "num_hidden_layers": 32, + "num_key_value_heads": 8, + "pretraining_tp": 1, + "rms_norm_eps": 1e-05, + "rope_scaling": null, + "rope_theta": 500000.0, + "tie_word_embeddings": false, + "tokenizer_model_max_length": 2048, + "tokenizer_padding_side": "right", + "torch_dtype": "float16", + "transformers_version": "4.41.2", + "tune_mm_mlp_adapter": false, + "unfreeze_vision_tower": true, + "use_cache": true, + "use_mm_proj": true, + "use_s2": true, + "vocab_size": 128256 +} diff --git a/single_dataset/gpt4o_conversations/VideoGameBunny_v1_1-Llama-3-8B-V-gpt4o_conversations_dataset_5000_epochs_1_lora/non_lora_trainables.bin b/single_dataset/gpt4o_conversations/VideoGameBunny_v1_1-Llama-3-8B-V-gpt4o_conversations_dataset_5000_epochs_1_lora/non_lora_trainables.bin new file mode 100644 index 0000000000000000000000000000000000000000..1b3922dea144ee16eb1ca5d1074aff938ef4e53d --- /dev/null +++ b/single_dataset/gpt4o_conversations/VideoGameBunny_v1_1-Llama-3-8B-V-gpt4o_conversations_dataset_5000_epochs_1_lora/non_lora_trainables.bin @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:fd0a8733639c356cd85e403d0c948297f862e3bc719bcb1be355d5aa0fb94d57 +size 918507402 diff --git a/single_dataset/gpt4o_conversations/VideoGameBunny_v1_1-Llama-3-8B-V-gpt4o_conversations_dataset_5000_epochs_1_lora/trainer_state.json b/single_dataset/gpt4o_conversations/VideoGameBunny_v1_1-Llama-3-8B-V-gpt4o_conversations_dataset_5000_epochs_1_lora/trainer_state.json new file mode 100644 index 0000000000000000000000000000000000000000..5f13aa3a9346a36954edbf3382959979c913e948 --- /dev/null +++ b/single_dataset/gpt4o_conversations/VideoGameBunny_v1_1-Llama-3-8B-V-gpt4o_conversations_dataset_5000_epochs_1_lora/trainer_state.json @@ -0,0 +1,2226 @@ +{ + "best_metric": null, + "best_model_checkpoint": null, + "epoch": 0.9984, + "eval_steps": 500, + "global_step": 312, + "is_hyper_param_search": false, + "is_local_process_zero": true, + "is_world_process_zero": true, + "log_history": [ + { + "epoch": 0.0032, + "grad_norm": 0.8934607401720872, + "learning_rate": 2e-05, + "loss": 1.3726, + "step": 1 + }, + { + "epoch": 0.0064, + "grad_norm": 0.9702764908035759, + "learning_rate": 4e-05, + "loss": 1.3514, + "step": 2 + }, + { + "epoch": 0.0096, + "grad_norm": 0.8928671122403777, + "learning_rate": 6e-05, + "loss": 1.3464, + "step": 3 + }, + { + "epoch": 0.0128, + "grad_norm": 0.733085596985154, + "learning_rate": 8e-05, + "loss": 1.2532, + "step": 4 + }, + { + "epoch": 0.016, + "grad_norm": 0.8570494872495142, + "learning_rate": 0.0001, + "loss": 1.1028, + "step": 5 + }, + { + "epoch": 0.0192, + "grad_norm": 1.0400527883080488, + "learning_rate": 0.00012, + "loss": 1.2828, + "step": 6 + }, + { + "epoch": 0.0224, + "grad_norm": 0.8428812825744051, + "learning_rate": 0.00014, + "loss": 1.2222, + "step": 7 + }, + { + "epoch": 0.0256, + "grad_norm": 0.7939623442508198, + "learning_rate": 0.00016, + "loss": 1.2832, + "step": 8 + }, + { + "epoch": 0.0288, + "grad_norm": 0.6664596962036828, + "learning_rate": 0.00018, + "loss": 1.2148, + "step": 9 + }, + { + "epoch": 0.032, + "grad_norm": 0.7655774280979413, + "learning_rate": 0.0002, + "loss": 1.2593, + "step": 10 + }, + { + "epoch": 0.0352, + "grad_norm": 0.9002584907773491, + "learning_rate": 0.00019999458931878073, + "loss": 1.2274, + "step": 11 + }, + { + "epoch": 0.0384, + "grad_norm": 0.6973696058498797, + "learning_rate": 0.0001999783578606323, + "loss": 1.1777, + "step": 12 + }, + { + "epoch": 0.0416, + "grad_norm": 0.7961500621219396, + "learning_rate": 0.00019995130738201966, + "loss": 1.2384, + "step": 13 + }, + { + "epoch": 0.0448, + "grad_norm": 0.6934442208162789, + "learning_rate": 0.0001999134408101731, + "loss": 1.1529, + "step": 14 + }, + { + "epoch": 0.048, + "grad_norm": 0.7760808007197953, + "learning_rate": 0.00019986476224277165, + "loss": 1.1961, + "step": 15 + }, + { + "epoch": 0.0512, + "grad_norm": 0.7662958744276087, + "learning_rate": 0.00019980527694749952, + "loss": 1.2431, + "step": 16 + }, + { + "epoch": 0.0544, + "grad_norm": 0.7492875679562371, + "learning_rate": 0.00019973499136147606, + "loss": 1.1853, + "step": 17 + }, + { + "epoch": 0.0576, + "grad_norm": 0.6644857894549516, + "learning_rate": 0.0001996539130905593, + "loss": 1.0896, + "step": 18 + }, + { + "epoch": 0.0608, + "grad_norm": 0.7028291596766432, + "learning_rate": 0.0001995620509085228, + "loss": 1.1517, + "step": 19 + }, + { + "epoch": 0.064, + "grad_norm": 0.7151346253989167, + "learning_rate": 0.00019945941475610623, + "loss": 1.2471, + "step": 20 + }, + { + "epoch": 0.0672, + "grad_norm": 0.6597569217376018, + "learning_rate": 0.0001993460157399396, + "loss": 1.1051, + "step": 21 + }, + { + "epoch": 0.0704, + "grad_norm": 0.7876128410548371, + "learning_rate": 0.0001992218661313415, + "loss": 1.1344, + "step": 22 + }, + { + "epoch": 0.0736, + "grad_norm": 0.6788178829714944, + "learning_rate": 0.00019908697936499103, + "loss": 1.1579, + "step": 23 + }, + { + "epoch": 0.0768, + "grad_norm": 0.6959667855216832, + "learning_rate": 0.00019894137003747403, + "loss": 1.0935, + "step": 24 + }, + { + "epoch": 0.08, + "grad_norm": 0.6360283110594259, + "learning_rate": 0.00019878505390570362, + "loss": 1.0861, + "step": 25 + }, + { + "epoch": 0.0832, + "grad_norm": 0.7149481186935083, + "learning_rate": 0.00019861804788521493, + "loss": 1.2315, + "step": 26 + }, + { + "epoch": 0.0864, + "grad_norm": 0.7290306737510642, + "learning_rate": 0.00019844037004833473, + "loss": 1.1464, + "step": 27 + }, + { + "epoch": 0.0896, + "grad_norm": 0.6106941074968636, + "learning_rate": 0.00019825203962222572, + "loss": 1.165, + "step": 28 + }, + { + "epoch": 0.0928, + "grad_norm": 0.6171080416329583, + "learning_rate": 0.0001980530769868059, + "loss": 1.1248, + "step": 29 + }, + { + "epoch": 0.096, + "grad_norm": 0.6951443407479725, + "learning_rate": 0.00019784350367254322, + "loss": 1.1997, + "step": 30 + }, + { + "epoch": 0.0992, + "grad_norm": 0.7110530520599219, + "learning_rate": 0.0001976233423581255, + "loss": 1.0452, + "step": 31 + }, + { + "epoch": 0.1024, + "grad_norm": 0.6720320155074625, + "learning_rate": 0.0001973926168680066, + "loss": 1.1767, + "step": 32 + }, + { + "epoch": 0.1056, + "grad_norm": 0.6247290543001629, + "learning_rate": 0.00019715135216982798, + "loss": 1.1555, + "step": 33 + }, + { + "epoch": 0.1088, + "grad_norm": 0.6768489593434474, + "learning_rate": 0.0001968995743717171, + "loss": 1.2408, + "step": 34 + }, + { + "epoch": 0.112, + "grad_norm": 0.5850076189172989, + "learning_rate": 0.00019663731071946206, + "loss": 1.1552, + "step": 35 + }, + { + "epoch": 0.1152, + "grad_norm": 0.6272578992853955, + "learning_rate": 0.00019636458959356316, + "loss": 1.1843, + "step": 36 + }, + { + "epoch": 0.1184, + "grad_norm": 0.6041727331165989, + "learning_rate": 0.0001960814405061619, + "loss": 1.111, + "step": 37 + }, + { + "epoch": 0.1216, + "grad_norm": 0.5853996034242491, + "learning_rate": 0.00019578789409784727, + "loss": 1.2139, + "step": 38 + }, + { + "epoch": 0.1248, + "grad_norm": 0.6044406014017014, + "learning_rate": 0.00019548398213434007, + "loss": 1.1251, + "step": 39 + }, + { + "epoch": 0.128, + "grad_norm": 0.599638159597199, + "learning_rate": 0.00019516973750305532, + "loss": 1.1915, + "step": 40 + }, + { + "epoch": 0.1312, + "grad_norm": 0.6230215019985648, + "learning_rate": 0.00019484519420954354, + "loss": 1.1465, + "step": 41 + }, + { + "epoch": 0.1344, + "grad_norm": 0.6685570571892, + "learning_rate": 0.00019451038737381077, + "loss": 1.1318, + "step": 42 + }, + { + "epoch": 0.1376, + "grad_norm": 0.6591831305025555, + "learning_rate": 0.00019416535322651818, + "loss": 1.1549, + "step": 43 + }, + { + "epoch": 0.1408, + "grad_norm": 0.6302659422826664, + "learning_rate": 0.00019381012910506146, + "loss": 1.1936, + "step": 44 + }, + { + "epoch": 0.144, + "grad_norm": 0.6551385711236554, + "learning_rate": 0.00019344475344953012, + "loss": 1.1421, + "step": 45 + }, + { + "epoch": 0.1472, + "grad_norm": 0.6846744761544914, + "learning_rate": 0.00019306926579854821, + "loss": 1.1853, + "step": 46 + }, + { + "epoch": 0.1504, + "grad_norm": 0.5995584235958642, + "learning_rate": 0.00019268370678499533, + "loss": 1.1172, + "step": 47 + }, + { + "epoch": 0.1536, + "grad_norm": 0.6693072383155553, + "learning_rate": 0.0001922881181316097, + "loss": 1.0899, + "step": 48 + }, + { + "epoch": 0.1568, + "grad_norm": 0.5823860976342665, + "learning_rate": 0.00019188254264647337, + "loss": 1.0778, + "step": 49 + }, + { + "epoch": 0.16, + "grad_norm": 0.6548607686911667, + "learning_rate": 0.0001914670242183795, + "loss": 1.1292, + "step": 50 + }, + { + "epoch": 0.1632, + "grad_norm": 0.6455229915546238, + "learning_rate": 0.0001910416078120832, + "loss": 1.2137, + "step": 51 + }, + { + "epoch": 0.1664, + "grad_norm": 0.6797448562922372, + "learning_rate": 0.0001906063394634356, + "loss": 1.1747, + "step": 52 + }, + { + "epoch": 0.1696, + "grad_norm": 0.6169579402663269, + "learning_rate": 0.00019016126627440237, + "loss": 1.1282, + "step": 53 + }, + { + "epoch": 0.1728, + "grad_norm": 0.5752462685930675, + "learning_rate": 0.00018970643640796642, + "loss": 1.2357, + "step": 54 + }, + { + "epoch": 0.176, + "grad_norm": 0.6524878494461898, + "learning_rate": 0.000189241899082916, + "loss": 1.1178, + "step": 55 + }, + { + "epoch": 0.1792, + "grad_norm": 0.5803880758055884, + "learning_rate": 0.00018876770456851877, + "loss": 1.0582, + "step": 56 + }, + { + "epoch": 0.1824, + "grad_norm": 0.6284049891526231, + "learning_rate": 0.0001882839041790818, + "loss": 1.1503, + "step": 57 + }, + { + "epoch": 0.1856, + "grad_norm": 0.6429933521925236, + "learning_rate": 0.00018779055026839868, + "loss": 1.1227, + "step": 58 + }, + { + "epoch": 0.1888, + "grad_norm": 0.6642315948614433, + "learning_rate": 0.00018728769622408423, + "loss": 1.1491, + "step": 59 + }, + { + "epoch": 0.192, + "grad_norm": 0.6318270344993646, + "learning_rate": 0.00018677539646179707, + "loss": 1.0924, + "step": 60 + }, + { + "epoch": 0.1952, + "grad_norm": 0.6736013814160521, + "learning_rate": 0.00018625370641935129, + "loss": 1.1376, + "step": 61 + }, + { + "epoch": 0.1984, + "grad_norm": 0.6290487314099203, + "learning_rate": 0.00018572268255071718, + "loss": 1.1099, + "step": 62 + }, + { + "epoch": 0.2016, + "grad_norm": 0.5868150325072425, + "learning_rate": 0.00018518238231991218, + "loss": 1.0572, + "step": 63 + }, + { + "epoch": 0.2048, + "grad_norm": 0.6287361571659641, + "learning_rate": 0.00018463286419478255, + "loss": 1.1199, + "step": 64 + }, + { + "epoch": 0.208, + "grad_norm": 0.635478784306169, + "learning_rate": 0.00018407418764067627, + "loss": 1.1375, + "step": 65 + }, + { + "epoch": 0.2112, + "grad_norm": 0.6934699927576387, + "learning_rate": 0.00018350641311400812, + "loss": 1.12, + "step": 66 + }, + { + "epoch": 0.2144, + "grad_norm": 0.6300333641647747, + "learning_rate": 0.0001829296020557174, + "loss": 1.1237, + "step": 67 + }, + { + "epoch": 0.2176, + "grad_norm": 0.603806858704276, + "learning_rate": 0.00018234381688461942, + "loss": 1.1252, + "step": 68 + }, + { + "epoch": 0.2208, + "grad_norm": 0.6328663939650221, + "learning_rate": 0.0001817491209906506, + "loss": 1.1643, + "step": 69 + }, + { + "epoch": 0.224, + "grad_norm": 0.6587939462250295, + "learning_rate": 0.00018114557872800905, + "loss": 1.159, + "step": 70 + }, + { + "epoch": 0.2272, + "grad_norm": 0.6554856961720463, + "learning_rate": 0.00018053325540819045, + "loss": 1.0889, + "step": 71 + }, + { + "epoch": 0.2304, + "grad_norm": 0.7008545104931168, + "learning_rate": 0.0001799122172929206, + "loss": 1.1307, + "step": 72 + }, + { + "epoch": 0.2336, + "grad_norm": 0.6109951811360471, + "learning_rate": 0.00017928253158698473, + "loss": 1.1416, + "step": 73 + }, + { + "epoch": 0.2368, + "grad_norm": 0.5813378104951092, + "learning_rate": 0.0001786442664309554, + "loss": 1.0637, + "step": 74 + }, + { + "epoch": 0.24, + "grad_norm": 0.5909823532691317, + "learning_rate": 0.0001779974908938184, + "loss": 1.1533, + "step": 75 + }, + { + "epoch": 0.2432, + "grad_norm": 0.5928182395354943, + "learning_rate": 0.0001773422749654988, + "loss": 1.1322, + "step": 76 + }, + { + "epoch": 0.2464, + "grad_norm": 0.6569884577681215, + "learning_rate": 0.00017667868954928694, + "loss": 1.1574, + "step": 77 + }, + { + "epoch": 0.2496, + "grad_norm": 0.6278707831640676, + "learning_rate": 0.00017600680645416583, + "loss": 1.087, + "step": 78 + }, + { + "epoch": 0.2528, + "grad_norm": 0.640992980278925, + "learning_rate": 0.00017532669838704035, + "loss": 1.1207, + "step": 79 + }, + { + "epoch": 0.256, + "grad_norm": 0.668359710460776, + "learning_rate": 0.00017463843894486937, + "loss": 1.143, + "step": 80 + }, + { + "epoch": 0.2592, + "grad_norm": 0.6285654698383347, + "learning_rate": 0.0001739421026067017, + "loss": 1.1844, + "step": 81 + }, + { + "epoch": 0.2624, + "grad_norm": 0.641766909272507, + "learning_rate": 0.00017323776472561627, + "loss": 1.1131, + "step": 82 + }, + { + "epoch": 0.2656, + "grad_norm": 0.6393224242059528, + "learning_rate": 0.00017252550152056795, + "loss": 1.1277, + "step": 83 + }, + { + "epoch": 0.2688, + "grad_norm": 0.6079741134716099, + "learning_rate": 0.0001718053900681397, + "loss": 1.139, + "step": 84 + }, + { + "epoch": 0.272, + "grad_norm": 0.5889968838496492, + "learning_rate": 0.00017107750829420176, + "loss": 1.1896, + "step": 85 + }, + { + "epoch": 0.2752, + "grad_norm": 0.635708382670474, + "learning_rate": 0.00017034193496547902, + "loss": 1.1767, + "step": 86 + }, + { + "epoch": 0.2784, + "grad_norm": 0.6347494253926971, + "learning_rate": 0.00016959874968102735, + "loss": 1.0919, + "step": 87 + }, + { + "epoch": 0.2816, + "grad_norm": 0.6081714242856907, + "learning_rate": 0.00016884803286362, + "loss": 1.002, + "step": 88 + }, + { + "epoch": 0.2848, + "grad_norm": 0.6687524144949784, + "learning_rate": 0.00016808986575104465, + "loss": 1.1217, + "step": 89 + }, + { + "epoch": 0.288, + "grad_norm": 0.6367497368719929, + "learning_rate": 0.00016732433038731242, + "loss": 1.1734, + "step": 90 + }, + { + "epoch": 0.2912, + "grad_norm": 0.6924335068178338, + "learning_rate": 0.0001665515096137797, + "loss": 1.1005, + "step": 91 + }, + { + "epoch": 0.2944, + "grad_norm": 0.6407334042780027, + "learning_rate": 0.00016577148706018328, + "loss": 1.1806, + "step": 92 + }, + { + "epoch": 0.2976, + "grad_norm": 0.6831640002730881, + "learning_rate": 0.00016498434713559088, + "loss": 1.1643, + "step": 93 + }, + { + "epoch": 0.3008, + "grad_norm": 0.6818497163627971, + "learning_rate": 0.00016419017501926656, + "loss": 1.1479, + "step": 94 + }, + { + "epoch": 0.304, + "grad_norm": 0.6023164784286884, + "learning_rate": 0.0001633890566514535, + "loss": 1.0199, + "step": 95 + }, + { + "epoch": 0.3072, + "grad_norm": 0.6071162038895305, + "learning_rate": 0.00016258107872407375, + "loss": 1.1268, + "step": 96 + }, + { + "epoch": 0.3104, + "grad_norm": 0.6621975947969588, + "learning_rate": 0.0001617663286713474, + "loss": 1.1054, + "step": 97 + }, + { + "epoch": 0.3136, + "grad_norm": 0.6430349596453502, + "learning_rate": 0.00016094489466033043, + "loss": 1.185, + "step": 98 + }, + { + "epoch": 0.3168, + "grad_norm": 0.596716232396584, + "learning_rate": 0.00016011686558137448, + "loss": 1.1167, + "step": 99 + }, + { + "epoch": 0.32, + "grad_norm": 0.8030431208767496, + "learning_rate": 0.0001592823310385073, + "loss": 1.0043, + "step": 100 + }, + { + "epoch": 0.3232, + "grad_norm": 0.6322736824639741, + "learning_rate": 0.0001584413813397364, + "loss": 1.1156, + "step": 101 + }, + { + "epoch": 0.3264, + "grad_norm": 0.9683388132336448, + "learning_rate": 0.00015759410748727662, + "loss": 1.0974, + "step": 102 + }, + { + "epoch": 0.3296, + "grad_norm": 0.5821355886938816, + "learning_rate": 0.00015674060116770236, + "loss": 1.1224, + "step": 103 + }, + { + "epoch": 0.3328, + "grad_norm": 0.6319431789381217, + "learning_rate": 0.00015588095474202595, + "loss": 1.163, + "step": 104 + }, + { + "epoch": 0.336, + "grad_norm": 0.6020869145562134, + "learning_rate": 0.00015501526123570277, + "loss": 1.1574, + "step": 105 + }, + { + "epoch": 0.3392, + "grad_norm": 0.581552295038906, + "learning_rate": 0.00015414361432856475, + "loss": 1.0393, + "step": 106 + }, + { + "epoch": 0.3424, + "grad_norm": 0.6434562480401083, + "learning_rate": 0.0001532661083446829, + "loss": 1.1169, + "step": 107 + }, + { + "epoch": 0.3456, + "grad_norm": 0.6505539891186463, + "learning_rate": 0.00015238283824216015, + "loss": 1.1125, + "step": 108 + }, + { + "epoch": 0.3488, + "grad_norm": 0.6143461053823742, + "learning_rate": 0.00015149389960285558, + "loss": 1.1756, + "step": 109 + }, + { + "epoch": 0.352, + "grad_norm": 0.5678824592957399, + "learning_rate": 0.00015059938862204127, + "loss": 1.1602, + "step": 110 + }, + { + "epoch": 0.3552, + "grad_norm": 0.7196907483605643, + "learning_rate": 0.00014969940209799248, + "loss": 1.1461, + "step": 111 + }, + { + "epoch": 0.3584, + "grad_norm": 0.60872627866381, + "learning_rate": 0.00014879403742151283, + "loss": 1.0779, + "step": 112 + }, + { + "epoch": 0.3616, + "grad_norm": 0.6088948095827803, + "learning_rate": 0.00014788339256539544, + "loss": 1.1465, + "step": 113 + }, + { + "epoch": 0.3648, + "grad_norm": 0.6001452511808347, + "learning_rate": 0.0001469675660738206, + "loss": 1.0806, + "step": 114 + }, + { + "epoch": 0.368, + "grad_norm": 0.7237162644452426, + "learning_rate": 0.00014604665705169237, + "loss": 1.1681, + "step": 115 + }, + { + "epoch": 0.3712, + "grad_norm": 0.6361290323952742, + "learning_rate": 0.00014512076515391375, + "loss": 1.1262, + "step": 116 + }, + { + "epoch": 0.3744, + "grad_norm": 0.6042846124009682, + "learning_rate": 0.00014418999057460276, + "loss": 1.094, + "step": 117 + }, + { + "epoch": 0.3776, + "grad_norm": 0.6026735908383196, + "learning_rate": 0.0001432544340362501, + "loss": 1.031, + "step": 118 + }, + { + "epoch": 0.3808, + "grad_norm": 0.5813635120693075, + "learning_rate": 0.00014231419677881966, + "loss": 1.0934, + "step": 119 + }, + { + "epoch": 0.384, + "grad_norm": 0.6607505994262415, + "learning_rate": 0.00014136938054879283, + "loss": 1.1583, + "step": 120 + }, + { + "epoch": 0.3872, + "grad_norm": 0.5797413168500228, + "learning_rate": 0.00014042008758815818, + "loss": 1.119, + "step": 121 + }, + { + "epoch": 0.3904, + "grad_norm": 0.6212414913681146, + "learning_rate": 0.00013946642062334766, + "loss": 1.1696, + "step": 122 + }, + { + "epoch": 0.3936, + "grad_norm": 0.5874185714602815, + "learning_rate": 0.00013850848285411994, + "loss": 1.098, + "step": 123 + }, + { + "epoch": 0.3968, + "grad_norm": 0.6030500726178164, + "learning_rate": 0.000137546377942393, + "loss": 1.0709, + "step": 124 + }, + { + "epoch": 0.4, + "grad_norm": 0.5515883025449129, + "learning_rate": 0.00013658021000102636, + "loss": 1.0873, + "step": 125 + }, + { + "epoch": 0.4032, + "grad_norm": 0.6617058119593494, + "learning_rate": 0.00013561008358255468, + "loss": 1.0748, + "step": 126 + }, + { + "epoch": 0.4064, + "grad_norm": 0.6044340009807571, + "learning_rate": 0.00013463610366787392, + "loss": 1.1109, + "step": 127 + }, + { + "epoch": 0.4096, + "grad_norm": 0.6280824205414822, + "learning_rate": 0.00013365837565488064, + "loss": 1.1219, + "step": 128 + }, + { + "epoch": 0.4128, + "grad_norm": 0.6023589276763569, + "learning_rate": 0.0001326770053470668, + "loss": 1.0416, + "step": 129 + }, + { + "epoch": 0.416, + "grad_norm": 0.601503782550989, + "learning_rate": 0.0001316920989420703, + "loss": 1.1202, + "step": 130 + }, + { + "epoch": 0.4192, + "grad_norm": 0.5947810021728166, + "learning_rate": 0.00013070376302018287, + "loss": 1.0489, + "step": 131 + }, + { + "epoch": 0.4224, + "grad_norm": 0.5960541379516832, + "learning_rate": 0.00012971210453281674, + "loss": 1.0828, + "step": 132 + }, + { + "epoch": 0.4256, + "grad_norm": 0.6301040459235665, + "learning_rate": 0.000128717230790931, + "loss": 1.1456, + "step": 133 + }, + { + "epoch": 0.4288, + "grad_norm": 0.6218790112688984, + "learning_rate": 0.00012771924945341906, + "loss": 1.1443, + "step": 134 + }, + { + "epoch": 0.432, + "grad_norm": 0.6331790582677411, + "learning_rate": 0.00012671826851545851, + "loss": 1.1512, + "step": 135 + }, + { + "epoch": 0.4352, + "grad_norm": 0.6920164094640866, + "learning_rate": 0.0001257143962968246, + "loss": 1.0705, + "step": 136 + }, + { + "epoch": 0.4384, + "grad_norm": 0.6457611049577008, + "learning_rate": 0.00012470774143016853, + "loss": 1.108, + "step": 137 + }, + { + "epoch": 0.4416, + "grad_norm": 0.5851148684445352, + "learning_rate": 0.00012369841284926188, + "loss": 1.1249, + "step": 138 + }, + { + "epoch": 0.4448, + "grad_norm": 0.5616442882431073, + "learning_rate": 0.00012268651977720866, + "loss": 1.173, + "step": 139 + }, + { + "epoch": 0.448, + "grad_norm": 0.6029147306217549, + "learning_rate": 0.00012167217171462566, + "loss": 1.0922, + "step": 140 + }, + { + "epoch": 0.4512, + "grad_norm": 0.5983185759853441, + "learning_rate": 0.0001206554784277931, + "loss": 1.0569, + "step": 141 + }, + { + "epoch": 0.4544, + "grad_norm": 0.6459305861705162, + "learning_rate": 0.00011963654993677645, + "loss": 1.1201, + "step": 142 + }, + { + "epoch": 0.4576, + "grad_norm": 0.6413251599078064, + "learning_rate": 0.00011861549650352069, + "loss": 1.0949, + "step": 143 + }, + { + "epoch": 0.4608, + "grad_norm": 0.5567459691512033, + "learning_rate": 0.00011759242861991855, + "loss": 1.1077, + "step": 144 + }, + { + "epoch": 0.464, + "grad_norm": 0.6050063276858223, + "learning_rate": 0.00011656745699585371, + "loss": 1.0871, + "step": 145 + }, + { + "epoch": 0.4672, + "grad_norm": 0.5768000953188527, + "learning_rate": 0.00011554069254722051, + "loss": 1.0336, + "step": 146 + }, + { + "epoch": 0.4704, + "grad_norm": 0.58354353504979, + "learning_rate": 0.00011451224638392129, + "loss": 1.109, + "step": 147 + }, + { + "epoch": 0.4736, + "grad_norm": 0.6239680842671043, + "learning_rate": 0.00011348222979784289, + "loss": 1.1406, + "step": 148 + }, + { + "epoch": 0.4768, + "grad_norm": 0.5639619023664048, + "learning_rate": 0.00011245075425081328, + "loss": 1.1355, + "step": 149 + }, + { + "epoch": 0.48, + "grad_norm": 0.5994597322729686, + "learning_rate": 0.00011141793136253986, + "loss": 0.9949, + "step": 150 + }, + { + "epoch": 0.4832, + "grad_norm": 0.6187766287679105, + "learning_rate": 0.0001103838728985307, + "loss": 1.123, + "step": 151 + }, + { + "epoch": 0.4864, + "grad_norm": 0.5880933746187075, + "learning_rate": 0.000109348690758, + "loss": 1.0811, + "step": 152 + }, + { + "epoch": 0.4896, + "grad_norm": 0.5214334484876151, + "learning_rate": 0.00010831249696175918, + "loss": 1.04, + "step": 153 + }, + { + "epoch": 0.4928, + "grad_norm": 0.6045346113069798, + "learning_rate": 0.0001072754036400944, + "loss": 1.1112, + "step": 154 + }, + { + "epoch": 0.496, + "grad_norm": 0.5972215634171124, + "learning_rate": 0.00010623752302063283, + "loss": 1.1067, + "step": 155 + }, + { + "epoch": 0.4992, + "grad_norm": 0.5862603509263139, + "learning_rate": 0.00010519896741619803, + "loss": 1.0297, + "step": 156 + }, + { + "epoch": 0.5024, + "grad_norm": 0.589645718660567, + "learning_rate": 0.00010415984921265609, + "loss": 1.0336, + "step": 157 + }, + { + "epoch": 0.5056, + "grad_norm": 0.5966492770502331, + "learning_rate": 0.00010312028085675391, + "loss": 1.0866, + "step": 158 + }, + { + "epoch": 0.5088, + "grad_norm": 0.638493351015436, + "learning_rate": 0.00010208037484395114, + "loss": 1.0765, + "step": 159 + }, + { + "epoch": 0.512, + "grad_norm": 0.6208601412709848, + "learning_rate": 0.00010104024370624644, + "loss": 1.1046, + "step": 160 + }, + { + "epoch": 0.5152, + "grad_norm": 0.7456812775232595, + "learning_rate": 0.0001, + "loss": 0.9999, + "step": 161 + }, + { + "epoch": 0.5184, + "grad_norm": 0.6175846317153577, + "learning_rate": 9.895975629375359e-05, + "loss": 1.0644, + "step": 162 + }, + { + "epoch": 0.5216, + "grad_norm": 0.6714016144368657, + "learning_rate": 9.791962515604887e-05, + "loss": 1.0637, + "step": 163 + }, + { + "epoch": 0.5248, + "grad_norm": 0.5837880916678994, + "learning_rate": 9.687971914324607e-05, + "loss": 1.1408, + "step": 164 + }, + { + "epoch": 0.528, + "grad_norm": 0.5530213168075825, + "learning_rate": 9.584015078734395e-05, + "loss": 1.1138, + "step": 165 + }, + { + "epoch": 0.5312, + "grad_norm": 0.6270056310312441, + "learning_rate": 9.480103258380198e-05, + "loss": 1.2315, + "step": 166 + }, + { + "epoch": 0.5344, + "grad_norm": 0.5561192764658761, + "learning_rate": 9.376247697936719e-05, + "loss": 1.0632, + "step": 167 + }, + { + "epoch": 0.5376, + "grad_norm": 0.6073371369069578, + "learning_rate": 9.272459635990562e-05, + "loss": 1.11, + "step": 168 + }, + { + "epoch": 0.5408, + "grad_norm": 0.5944081760932085, + "learning_rate": 9.168750303824084e-05, + "loss": 1.048, + "step": 169 + }, + { + "epoch": 0.544, + "grad_norm": 0.5848610084940267, + "learning_rate": 9.065130924199998e-05, + "loss": 1.1179, + "step": 170 + }, + { + "epoch": 0.5472, + "grad_norm": 0.5971797062090918, + "learning_rate": 8.961612710146934e-05, + "loss": 1.0489, + "step": 171 + }, + { + "epoch": 0.5504, + "grad_norm": 0.6027497233577513, + "learning_rate": 8.858206863746018e-05, + "loss": 1.1686, + "step": 172 + }, + { + "epoch": 0.5536, + "grad_norm": 0.5842512985502563, + "learning_rate": 8.754924574918675e-05, + "loss": 1.1134, + "step": 173 + }, + { + "epoch": 0.5568, + "grad_norm": 0.5790768255854168, + "learning_rate": 8.651777020215712e-05, + "loss": 1.0993, + "step": 174 + }, + { + "epoch": 0.56, + "grad_norm": 0.6244985783771545, + "learning_rate": 8.548775361607872e-05, + "loss": 1.1102, + "step": 175 + }, + { + "epoch": 0.5632, + "grad_norm": 0.6363066079679679, + "learning_rate": 8.445930745277953e-05, + "loss": 1.1054, + "step": 176 + }, + { + "epoch": 0.5664, + "grad_norm": 0.5758171908775908, + "learning_rate": 8.343254300414628e-05, + "loss": 1.1669, + "step": 177 + }, + { + "epoch": 0.5696, + "grad_norm": 0.5340947294270786, + "learning_rate": 8.240757138008149e-05, + "loss": 1.1309, + "step": 178 + }, + { + "epoch": 0.5728, + "grad_norm": 0.5831665436537086, + "learning_rate": 8.138450349647936e-05, + "loss": 1.1248, + "step": 179 + }, + { + "epoch": 0.576, + "grad_norm": 0.5921556048685298, + "learning_rate": 8.036345006322359e-05, + "loss": 1.112, + "step": 180 + }, + { + "epoch": 0.5792, + "grad_norm": 0.5744251424245944, + "learning_rate": 7.934452157220694e-05, + "loss": 1.022, + "step": 181 + }, + { + "epoch": 0.5824, + "grad_norm": 0.5554185323336767, + "learning_rate": 7.832782828537437e-05, + "loss": 1.0337, + "step": 182 + }, + { + "epoch": 0.5856, + "grad_norm": 0.5743450858728842, + "learning_rate": 7.731348022279134e-05, + "loss": 1.0235, + "step": 183 + }, + { + "epoch": 0.5888, + "grad_norm": 0.5602361322464704, + "learning_rate": 7.630158715073813e-05, + "loss": 1.1312, + "step": 184 + }, + { + "epoch": 0.592, + "grad_norm": 0.5954682352280739, + "learning_rate": 7.52922585698315e-05, + "loss": 1.1366, + "step": 185 + }, + { + "epoch": 0.5952, + "grad_norm": 0.6109690367869244, + "learning_rate": 7.428560370317542e-05, + "loss": 1.0795, + "step": 186 + }, + { + "epoch": 0.5984, + "grad_norm": 0.564705324519998, + "learning_rate": 7.328173148454151e-05, + "loss": 1.0559, + "step": 187 + }, + { + "epoch": 0.6016, + "grad_norm": 0.5998310026704221, + "learning_rate": 7.228075054658096e-05, + "loss": 1.0471, + "step": 188 + }, + { + "epoch": 0.6048, + "grad_norm": 0.5923598358823705, + "learning_rate": 7.1282769209069e-05, + "loss": 1.0805, + "step": 189 + }, + { + "epoch": 0.608, + "grad_norm": 0.5837508570039001, + "learning_rate": 7.028789546718326e-05, + "loss": 1.0486, + "step": 190 + }, + { + "epoch": 0.6112, + "grad_norm": 0.5993515803964782, + "learning_rate": 6.929623697981718e-05, + "loss": 1.0758, + "step": 191 + }, + { + "epoch": 0.6144, + "grad_norm": 0.5640431095353253, + "learning_rate": 6.830790105792973e-05, + "loss": 1.1195, + "step": 192 + }, + { + "epoch": 0.6176, + "grad_norm": 0.6597258202025034, + "learning_rate": 6.732299465293322e-05, + "loss": 1.1206, + "step": 193 + }, + { + "epoch": 0.6208, + "grad_norm": 0.5700867646836549, + "learning_rate": 6.63416243451194e-05, + "loss": 1.028, + "step": 194 + }, + { + "epoch": 0.624, + "grad_norm": 0.5438950021648811, + "learning_rate": 6.536389633212609e-05, + "loss": 1.0901, + "step": 195 + }, + { + "epoch": 0.6272, + "grad_norm": 0.5895697124383904, + "learning_rate": 6.43899164174453e-05, + "loss": 1.003, + "step": 196 + }, + { + "epoch": 0.6304, + "grad_norm": 0.5531669590622527, + "learning_rate": 6.341978999897365e-05, + "loss": 0.9802, + "step": 197 + }, + { + "epoch": 0.6336, + "grad_norm": 0.5634663273360929, + "learning_rate": 6.245362205760704e-05, + "loss": 1.0221, + "step": 198 + }, + { + "epoch": 0.6368, + "grad_norm": 0.5852062381389797, + "learning_rate": 6.149151714588009e-05, + "loss": 1.16, + "step": 199 + }, + { + "epoch": 0.64, + "grad_norm": 0.5937242795441198, + "learning_rate": 6.053357937665237e-05, + "loss": 1.0475, + "step": 200 + }, + { + "epoch": 0.6432, + "grad_norm": 0.5555566589467784, + "learning_rate": 5.957991241184184e-05, + "loss": 1.0439, + "step": 201 + }, + { + "epoch": 0.6464, + "grad_norm": 0.585204914236333, + "learning_rate": 5.863061945120719e-05, + "loss": 1.0635, + "step": 202 + }, + { + "epoch": 0.6496, + "grad_norm": 0.574624666649916, + "learning_rate": 5.768580322118034e-05, + "loss": 1.1461, + "step": 203 + }, + { + "epoch": 0.6528, + "grad_norm": 0.5895005838586477, + "learning_rate": 5.6745565963749925e-05, + "loss": 1.0533, + "step": 204 + }, + { + "epoch": 0.656, + "grad_norm": 0.568531621396431, + "learning_rate": 5.5810009425397294e-05, + "loss": 1.0117, + "step": 205 + }, + { + "epoch": 0.6592, + "grad_norm": 0.5713855868879502, + "learning_rate": 5.487923484608629e-05, + "loss": 1.0869, + "step": 206 + }, + { + "epoch": 0.6624, + "grad_norm": 0.533463283372751, + "learning_rate": 5.395334294830765e-05, + "loss": 1.0669, + "step": 207 + }, + { + "epoch": 0.6656, + "grad_norm": 0.596191800876185, + "learning_rate": 5.3032433926179395e-05, + "loss": 1.0889, + "step": 208 + }, + { + "epoch": 0.6688, + "grad_norm": 0.6033094935134793, + "learning_rate": 5.211660743460458e-05, + "loss": 1.0037, + "step": 209 + }, + { + "epoch": 0.672, + "grad_norm": 0.547758452785834, + "learning_rate": 5.1205962578487155e-05, + "loss": 1.0326, + "step": 210 + }, + { + "epoch": 0.6752, + "grad_norm": 0.5674015427230111, + "learning_rate": 5.030059790200756e-05, + "loss": 1.0642, + "step": 211 + }, + { + "epoch": 0.6784, + "grad_norm": 0.5564959453988821, + "learning_rate": 4.940061137795876e-05, + "loss": 1.1004, + "step": 212 + }, + { + "epoch": 0.6816, + "grad_norm": 0.5510907797932991, + "learning_rate": 4.850610039714444e-05, + "loss": 1.0383, + "step": 213 + }, + { + "epoch": 0.6848, + "grad_norm": 0.5364931725607687, + "learning_rate": 4.761716175783989e-05, + "loss": 1.0421, + "step": 214 + }, + { + "epoch": 0.688, + "grad_norm": 0.5494121878521853, + "learning_rate": 4.673389165531714e-05, + "loss": 1.0529, + "step": 215 + }, + { + "epoch": 0.6912, + "grad_norm": 0.5363726515242464, + "learning_rate": 4.585638567143529e-05, + "loss": 1.0332, + "step": 216 + }, + { + "epoch": 0.6944, + "grad_norm": 0.5465442810140333, + "learning_rate": 4.498473876429726e-05, + "loss": 1.0314, + "step": 217 + }, + { + "epoch": 0.6976, + "grad_norm": 0.5739488758444642, + "learning_rate": 4.411904525797408e-05, + "loss": 1.0601, + "step": 218 + }, + { + "epoch": 0.7008, + "grad_norm": 0.5879132221028294, + "learning_rate": 4.325939883229766e-05, + "loss": 1.0442, + "step": 219 + }, + { + "epoch": 0.704, + "grad_norm": 0.6045995523657305, + "learning_rate": 4.240589251272342e-05, + "loss": 1.0801, + "step": 220 + }, + { + "epoch": 0.7072, + "grad_norm": 0.5617927988946498, + "learning_rate": 4.155861866026364e-05, + "loss": 1.0536, + "step": 221 + }, + { + "epoch": 0.7104, + "grad_norm": 0.7006136765663606, + "learning_rate": 4.071766896149273e-05, + "loss": 1.1709, + "step": 222 + }, + { + "epoch": 0.7136, + "grad_norm": 0.5519226190976758, + "learning_rate": 3.988313441862553e-05, + "loss": 1.0035, + "step": 223 + }, + { + "epoch": 0.7168, + "grad_norm": 0.5722678337036814, + "learning_rate": 3.9055105339669595e-05, + "loss": 1.0321, + "step": 224 + }, + { + "epoch": 0.72, + "grad_norm": 0.546356073899012, + "learning_rate": 3.823367132865265e-05, + "loss": 1.063, + "step": 225 + }, + { + "epoch": 0.7232, + "grad_norm": 0.5572189973946577, + "learning_rate": 3.741892127592625e-05, + "loss": 1.0493, + "step": 226 + }, + { + "epoch": 0.7264, + "grad_norm": 0.5609326202853951, + "learning_rate": 3.6610943348546526e-05, + "loss": 1.0401, + "step": 227 + }, + { + "epoch": 0.7296, + "grad_norm": 0.5573693239077621, + "learning_rate": 3.580982498073344e-05, + "loss": 1.0633, + "step": 228 + }, + { + "epoch": 0.7328, + "grad_norm": 0.6120271145151069, + "learning_rate": 3.501565286440914e-05, + "loss": 1.0247, + "step": 229 + }, + { + "epoch": 0.736, + "grad_norm": 0.5556112139031892, + "learning_rate": 3.422851293981676e-05, + "loss": 1.0004, + "step": 230 + }, + { + "epoch": 0.7392, + "grad_norm": 0.5442180530300604, + "learning_rate": 3.3448490386220355e-05, + "loss": 1.0231, + "step": 231 + }, + { + "epoch": 0.7424, + "grad_norm": 0.5642744910174119, + "learning_rate": 3.2675669612687565e-05, + "loss": 1.0598, + "step": 232 + }, + { + "epoch": 0.7456, + "grad_norm": 0.5301873588771624, + "learning_rate": 3.191013424895536e-05, + "loss": 1.0609, + "step": 233 + }, + { + "epoch": 0.7488, + "grad_norm": 0.5270404145127983, + "learning_rate": 3.115196713638e-05, + "loss": 1.1029, + "step": 234 + }, + { + "epoch": 0.752, + "grad_norm": 0.5792631918257901, + "learning_rate": 3.040125031897264e-05, + "loss": 1.0148, + "step": 235 + }, + { + "epoch": 0.7552, + "grad_norm": 0.5870979627317701, + "learning_rate": 2.9658065034520978e-05, + "loss": 1.0717, + "step": 236 + }, + { + "epoch": 0.7584, + "grad_norm": 0.5785722600525429, + "learning_rate": 2.892249170579826e-05, + "loss": 1.0672, + "step": 237 + }, + { + "epoch": 0.7616, + "grad_norm": 0.5709981460824844, + "learning_rate": 2.8194609931860316e-05, + "loss": 1.1185, + "step": 238 + }, + { + "epoch": 0.7648, + "grad_norm": 0.6217871274009109, + "learning_rate": 2.7474498479432087e-05, + "loss": 1.0541, + "step": 239 + }, + { + "epoch": 0.768, + "grad_norm": 0.5502163111083991, + "learning_rate": 2.6762235274383772e-05, + "loss": 1.037, + "step": 240 + }, + { + "epoch": 0.7712, + "grad_norm": 0.6939068470167971, + "learning_rate": 2.6057897393298324e-05, + "loss": 0.9026, + "step": 241 + }, + { + "epoch": 0.7744, + "grad_norm": 0.5348805571120492, + "learning_rate": 2.536156105513062e-05, + "loss": 1.0644, + "step": 242 + }, + { + "epoch": 0.7776, + "grad_norm": 0.5851088907445847, + "learning_rate": 2.4673301612959654e-05, + "loss": 1.0135, + "step": 243 + }, + { + "epoch": 0.7808, + "grad_norm": 0.5593559807902603, + "learning_rate": 2.399319354583418e-05, + "loss": 1.0758, + "step": 244 + }, + { + "epoch": 0.784, + "grad_norm": 0.5203188462905168, + "learning_rate": 2.3321310450713062e-05, + "loss": 1.0303, + "step": 245 + }, + { + "epoch": 0.7872, + "grad_norm": 0.5605154105634976, + "learning_rate": 2.265772503450122e-05, + "loss": 1.0211, + "step": 246 + }, + { + "epoch": 0.7904, + "grad_norm": 0.5664827723584672, + "learning_rate": 2.2002509106181624e-05, + "loss": 1.0348, + "step": 247 + }, + { + "epoch": 0.7936, + "grad_norm": 0.5900057315119912, + "learning_rate": 2.1355733569044635e-05, + "loss": 1.0552, + "step": 248 + }, + { + "epoch": 0.7968, + "grad_norm": 0.5381277090688744, + "learning_rate": 2.0717468413015283e-05, + "loss": 1.0548, + "step": 249 + }, + { + "epoch": 0.8, + "grad_norm": 0.5710250024280842, + "learning_rate": 2.008778270707944e-05, + "loss": 1.0525, + "step": 250 + }, + { + "epoch": 0.8032, + "grad_norm": 0.5247596985036673, + "learning_rate": 1.946674459180955e-05, + "loss": 1.0817, + "step": 251 + }, + { + "epoch": 0.8064, + "grad_norm": 0.5154307520162806, + "learning_rate": 1.8854421271990964e-05, + "loss": 1.0125, + "step": 252 + }, + { + "epoch": 0.8096, + "grad_norm": 0.6337614103760363, + "learning_rate": 1.8250879009349398e-05, + "loss": 1.0949, + "step": 253 + }, + { + "epoch": 0.8128, + "grad_norm": 0.4957665878310061, + "learning_rate": 1.7656183115380577e-05, + "loss": 1.0451, + "step": 254 + }, + { + "epoch": 0.816, + "grad_norm": 0.5308197423642265, + "learning_rate": 1.707039794428259e-05, + "loss": 1.0212, + "step": 255 + }, + { + "epoch": 0.8192, + "grad_norm": 0.6283452572668032, + "learning_rate": 1.649358688599191e-05, + "loss": 1.0286, + "step": 256 + }, + { + "epoch": 0.8224, + "grad_norm": 0.5277570848309738, + "learning_rate": 1.5925812359323745e-05, + "loss": 1.0517, + "step": 257 + }, + { + "epoch": 0.8256, + "grad_norm": 0.572380991818581, + "learning_rate": 1.5367135805217458e-05, + "loss": 1.052, + "step": 258 + }, + { + "epoch": 0.8288, + "grad_norm": 0.5459522732962143, + "learning_rate": 1.4817617680087825e-05, + "loss": 1.0256, + "step": 259 + }, + { + "epoch": 0.832, + "grad_norm": 0.5818244377960141, + "learning_rate": 1.4277317449282834e-05, + "loss": 1.0306, + "step": 260 + }, + { + "epoch": 0.8352, + "grad_norm": 0.5757150906330832, + "learning_rate": 1.3746293580648717e-05, + "loss": 1.0161, + "step": 261 + }, + { + "epoch": 0.8384, + "grad_norm": 0.5612123216943989, + "learning_rate": 1.3224603538202929e-05, + "loss": 0.9974, + "step": 262 + }, + { + "epoch": 0.8416, + "grad_norm": 0.5361229678609986, + "learning_rate": 1.2712303775915802e-05, + "loss": 1.054, + "step": 263 + }, + { + "epoch": 0.8448, + "grad_norm": 0.5826837681269577, + "learning_rate": 1.220944973160133e-05, + "loss": 1.0603, + "step": 264 + }, + { + "epoch": 0.848, + "grad_norm": 0.5402287917678977, + "learning_rate": 1.1716095820918216e-05, + "loss": 1.0676, + "step": 265 + }, + { + "epoch": 0.8512, + "grad_norm": 0.5155809923846763, + "learning_rate": 1.1232295431481222e-05, + "loss": 1.1004, + "step": 266 + }, + { + "epoch": 0.8544, + "grad_norm": 0.6178311815438159, + "learning_rate": 1.0758100917083991e-05, + "loss": 1.0745, + "step": 267 + }, + { + "epoch": 0.8576, + "grad_norm": 0.5804058648418592, + "learning_rate": 1.0293563592033595e-05, + "loss": 1.0794, + "step": 268 + }, + { + "epoch": 0.8608, + "grad_norm": 0.6096364520049162, + "learning_rate": 9.838733725597615e-06, + "loss": 1.008, + "step": 269 + }, + { + "epoch": 0.864, + "grad_norm": 0.5638671810187824, + "learning_rate": 9.393660536564408e-06, + "loss": 1.059, + "step": 270 + }, + { + "epoch": 0.8672, + "grad_norm": 0.551134037205924, + "learning_rate": 8.958392187916841e-06, + "loss": 1.0661, + "step": 271 + }, + { + "epoch": 0.8704, + "grad_norm": 0.5476576939761192, + "learning_rate": 8.532975781620512e-06, + "loss": 1.0599, + "step": 272 + }, + { + "epoch": 0.8736, + "grad_norm": 0.5618097858382183, + "learning_rate": 8.117457353526625e-06, + "loss": 1.128, + "step": 273 + }, + { + "epoch": 0.8768, + "grad_norm": 0.5545956623834618, + "learning_rate": 7.711881868390291e-06, + "loss": 1.0403, + "step": 274 + }, + { + "epoch": 0.88, + "grad_norm": 0.5585106863409853, + "learning_rate": 7.3162932150046885e-06, + "loss": 1.0839, + "step": 275 + }, + { + "epoch": 0.8832, + "grad_norm": 0.5777158619246178, + "learning_rate": 6.930734201451816e-06, + "loss": 1.1582, + "step": 276 + }, + { + "epoch": 0.8864, + "grad_norm": 0.5312338291560674, + "learning_rate": 6.555246550469907e-06, + "loss": 1.1125, + "step": 277 + }, + { + "epoch": 0.8896, + "grad_norm": 0.6019089227782868, + "learning_rate": 6.189870894938587e-06, + "loss": 1.0226, + "step": 278 + }, + { + "epoch": 0.8928, + "grad_norm": 0.5385907521451545, + "learning_rate": 5.834646773481811e-06, + "loss": 1.0428, + "step": 279 + }, + { + "epoch": 0.896, + "grad_norm": 0.5929711954637248, + "learning_rate": 5.489612626189245e-06, + "loss": 1.0584, + "step": 280 + }, + { + "epoch": 0.8992, + "grad_norm": 0.5352372798399714, + "learning_rate": 5.154805790456485e-06, + "loss": 1.0398, + "step": 281 + }, + { + "epoch": 0.9024, + "grad_norm": 0.533249413457343, + "learning_rate": 4.830262496944693e-06, + "loss": 0.9853, + "step": 282 + }, + { + "epoch": 0.9056, + "grad_norm": 0.5325098845257755, + "learning_rate": 4.516017865659949e-06, + "loss": 1.1217, + "step": 283 + }, + { + "epoch": 0.9088, + "grad_norm": 0.544616418598338, + "learning_rate": 4.21210590215273e-06, + "loss": 1.0499, + "step": 284 + }, + { + "epoch": 0.912, + "grad_norm": 0.5389805914927016, + "learning_rate": 3.918559493838114e-06, + "loss": 1.0955, + "step": 285 + }, + { + "epoch": 0.9152, + "grad_norm": 0.5992479750486914, + "learning_rate": 3.6354104064368566e-06, + "loss": 1.0344, + "step": 286 + }, + { + "epoch": 0.9184, + "grad_norm": 0.5563032838951335, + "learning_rate": 3.3626892805379562e-06, + "loss": 1.1259, + "step": 287 + }, + { + "epoch": 0.9216, + "grad_norm": 0.5344650297317594, + "learning_rate": 3.100425628282899e-06, + "loss": 1.1222, + "step": 288 + }, + { + "epoch": 0.9248, + "grad_norm": 0.6026066762936244, + "learning_rate": 2.848647830172024e-06, + "loss": 1.048, + "step": 289 + }, + { + "epoch": 0.928, + "grad_norm": 0.5380734238182887, + "learning_rate": 2.607383131993424e-06, + "loss": 1.0735, + "step": 290 + }, + { + "epoch": 0.9312, + "grad_norm": 0.539269851195273, + "learning_rate": 2.3766576418745022e-06, + "loss": 1.0271, + "step": 291 + }, + { + "epoch": 0.9344, + "grad_norm": 0.5101331087182588, + "learning_rate": 2.1564963274568027e-06, + "loss": 1.0877, + "step": 292 + }, + { + "epoch": 0.9376, + "grad_norm": 0.5260284887574873, + "learning_rate": 1.9469230131940907e-06, + "loss": 1.1117, + "step": 293 + }, + { + "epoch": 0.9408, + "grad_norm": 0.6165360335006401, + "learning_rate": 1.7479603777742938e-06, + "loss": 1.1343, + "step": 294 + }, + { + "epoch": 0.944, + "grad_norm": 0.518362085552097, + "learning_rate": 1.559629951665298e-06, + "loss": 1.0402, + "step": 295 + }, + { + "epoch": 0.9472, + "grad_norm": 0.569371845816852, + "learning_rate": 1.3819521147851123e-06, + "loss": 1.0454, + "step": 296 + }, + { + "epoch": 0.9504, + "grad_norm": 0.5281615447976709, + "learning_rate": 1.2149460942964098e-06, + "loss": 1.1065, + "step": 297 + }, + { + "epoch": 0.9536, + "grad_norm": 0.6341864106370607, + "learning_rate": 1.05862996252597e-06, + "loss": 1.0237, + "step": 298 + }, + { + "epoch": 0.9568, + "grad_norm": 0.5471729782472513, + "learning_rate": 9.130206350089765e-07, + "loss": 1.1344, + "step": 299 + }, + { + "epoch": 0.96, + "grad_norm": 0.521375171737115, + "learning_rate": 7.781338686584927e-07, + "loss": 1.0338, + "step": 300 + }, + { + "epoch": 0.9632, + "grad_norm": 0.565578997311788, + "learning_rate": 6.539842600603918e-07, + "loss": 1.0667, + "step": 301 + }, + { + "epoch": 0.9664, + "grad_norm": 0.5650391103593894, + "learning_rate": 5.405852438937764e-07, + "loss": 1.0766, + "step": 302 + }, + { + "epoch": 0.9696, + "grad_norm": 0.5820265479098684, + "learning_rate": 4.3794909147720773e-07, + "loss": 1.0305, + "step": 303 + }, + { + "epoch": 0.9728, + "grad_norm": 0.5926416552467474, + "learning_rate": 3.4608690944071263e-07, + "loss": 0.9986, + "step": 304 + }, + { + "epoch": 0.976, + "grad_norm": 0.5397571968572117, + "learning_rate": 2.6500863852395584e-07, + "loss": 0.986, + "step": 305 + }, + { + "epoch": 0.9792, + "grad_norm": 0.5508600370370091, + "learning_rate": 1.947230525005006e-07, + "loss": 1.1161, + "step": 306 + }, + { + "epoch": 0.9824, + "grad_norm": 0.5460513088356049, + "learning_rate": 1.3523775722834587e-07, + "loss": 1.0266, + "step": 307 + }, + { + "epoch": 0.9856, + "grad_norm": 0.5604567327919951, + "learning_rate": 8.655918982689581e-08, + "loss": 0.9579, + "step": 308 + }, + { + "epoch": 0.9888, + "grad_norm": 0.5383144642509582, + "learning_rate": 4.8692617980350406e-08, + "loss": 1.0491, + "step": 309 + }, + { + "epoch": 0.992, + "grad_norm": 0.5254302419934068, + "learning_rate": 2.164213936770576e-08, + "loss": 1.0394, + "step": 310 + }, + { + "epoch": 0.9952, + "grad_norm": 0.5885982889343618, + "learning_rate": 5.410681219286673e-09, + "loss": 1.0553, + "step": 311 + }, + { + "epoch": 0.9984, + "grad_norm": 0.6364946352079829, + "learning_rate": 0.0, + "loss": 1.0507, + "step": 312 + }, + { + "epoch": 0.9984, + "step": 312, + "total_flos": 163148489719808.0, + "train_loss": 1.1001117495008004, + "train_runtime": 3775.7928, + "train_samples_per_second": 1.324, + "train_steps_per_second": 0.083 + } + ], + "logging_steps": 1.0, + "max_steps": 312, + "num_input_tokens_seen": 0, + "num_train_epochs": 1, + "save_steps": 500, + "stateful_callbacks": { + "TrainerControl": { + "args": { + "should_epoch_stop": false, + "should_evaluate": false, + "should_log": false, + "should_save": false, + "should_training_stop": false + }, + "attributes": {} + } + }, + "total_flos": 163148489719808.0, + "train_batch_size": 8, + "trial_name": null, + "trial_params": null +} diff --git a/single_dataset/img2json/VideoGameBunny_v1_1-Llama-3-8B-V-img2json_dataset_10000_epochs_1_lora/README.md b/single_dataset/img2json/VideoGameBunny_v1_1-Llama-3-8B-V-img2json_dataset_10000_epochs_1_lora/README.md new file mode 100644 index 0000000000000000000000000000000000000000..4f291d8d6fdb82fcff274f83bcc52b834347797c --- /dev/null +++ b/single_dataset/img2json/VideoGameBunny_v1_1-Llama-3-8B-V-img2json_dataset_10000_epochs_1_lora/README.md @@ -0,0 +1,202 @@ +--- +library_name: peft +base_model: ./weights/Bunny-v1_1-Llama-3-8B-V +--- + +# Model Card for Model ID + + + + + +## Model Details + +### Model Description + + + + + +- **Developed by:** [More Information Needed] +- **Funded by [optional]:** [More Information Needed] +- **Shared by [optional]:** [More Information Needed] +- **Model type:** [More Information Needed] +- **Language(s) (NLP):** [More Information Needed] +- **License:** [More Information Needed] +- **Finetuned from model [optional]:** [More Information Needed] + +### Model Sources [optional] + + + +- **Repository:** [More Information Needed] +- **Paper [optional]:** [More Information Needed] +- **Demo [optional]:** [More Information Needed] + +## Uses + + + +### Direct Use + + + +[More Information Needed] + +### Downstream Use [optional] + + + +[More Information Needed] + +### Out-of-Scope Use + + + +[More Information Needed] + +## Bias, Risks, and Limitations + + + +[More Information Needed] + +### Recommendations + + + +Users (both direct and downstream) should be made aware of the risks, biases and limitations of the model. More information needed for further recommendations. + +## How to Get Started with the Model + +Use the code below to get started with the model. + +[More Information Needed] + +## Training Details + +### Training Data + + + +[More Information Needed] + +### Training Procedure + + + +#### Preprocessing [optional] + +[More Information Needed] + + +#### Training Hyperparameters + +- **Training regime:** [More Information Needed] + +#### Speeds, Sizes, Times [optional] + + + +[More Information Needed] + +## Evaluation + + + +### Testing Data, Factors & Metrics + +#### Testing Data + + + +[More Information Needed] + +#### Factors + + + +[More Information Needed] + +#### Metrics + + + +[More Information Needed] + +### Results + +[More Information Needed] + +#### Summary + + + +## Model Examination [optional] + + + +[More Information Needed] + +## Environmental Impact + + + +Carbon emissions can be estimated using the [Machine Learning Impact calculator](https://mlco2.github.io/impact#compute) presented in [Lacoste et al. (2019)](https://arxiv.org/abs/1910.09700). + +- **Hardware Type:** [More Information Needed] +- **Hours used:** [More Information Needed] +- **Cloud Provider:** [More Information Needed] +- **Compute Region:** [More Information Needed] +- **Carbon Emitted:** [More Information Needed] + +## Technical Specifications [optional] + +### Model Architecture and Objective + +[More Information Needed] + +### Compute Infrastructure + +[More Information Needed] + +#### Hardware + +[More Information Needed] + +#### Software + +[More Information Needed] + +## Citation [optional] + + + +**BibTeX:** + +[More Information Needed] + +**APA:** + +[More Information Needed] + +## Glossary [optional] + + + +[More Information Needed] + +## More Information [optional] + +[More Information Needed] + +## Model Card Authors [optional] + +[More Information Needed] + +## Model Card Contact + +[More Information Needed] +### Framework versions + +- PEFT 0.11.1 \ No newline at end of file diff --git a/single_dataset/img2json/VideoGameBunny_v1_1-Llama-3-8B-V-img2json_dataset_10000_epochs_1_lora/adapter_config.json b/single_dataset/img2json/VideoGameBunny_v1_1-Llama-3-8B-V-img2json_dataset_10000_epochs_1_lora/adapter_config.json new file mode 100644 index 0000000000000000000000000000000000000000..9547b55d0b677450ba90e79425a03c5e60fa7cde --- /dev/null +++ b/single_dataset/img2json/VideoGameBunny_v1_1-Llama-3-8B-V-img2json_dataset_10000_epochs_1_lora/adapter_config.json @@ -0,0 +1,34 @@ +{ + "alpha_pattern": {}, + "auto_mapping": null, + "base_model_name_or_path": "./weights/Bunny-v1_1-Llama-3-8B-V", + "bias": "none", + "fan_in_fan_out": false, + "inference_mode": true, + "init_lora_weights": true, + "layer_replication": null, + "layers_pattern": null, + "layers_to_transform": null, + "loftq_config": {}, + "lora_alpha": 256, + "lora_dropout": 0.1, + "megatron_config": null, + "megatron_core": "megatron.core", + "modules_to_save": null, + "peft_type": "LORA", + "r": 128, + "rank_pattern": {}, + "revision": null, + "target_modules": [ + "v_proj", + "gate_proj", + "up_proj", + "o_proj", + "down_proj", + "k_proj", + "q_proj" + ], + "task_type": "CAUSAL_LM", + "use_dora": false, + "use_rslora": false +} \ No newline at end of file diff --git a/single_dataset/img2json/VideoGameBunny_v1_1-Llama-3-8B-V-img2json_dataset_10000_epochs_1_lora/adapter_model.safetensors b/single_dataset/img2json/VideoGameBunny_v1_1-Llama-3-8B-V-img2json_dataset_10000_epochs_1_lora/adapter_model.safetensors new file mode 100644 index 0000000000000000000000000000000000000000..41f4ce71f4849e3338f2f68dc7ef2e76fcf7fce3 --- /dev/null +++ b/single_dataset/img2json/VideoGameBunny_v1_1-Llama-3-8B-V-img2json_dataset_10000_epochs_1_lora/adapter_model.safetensors @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:8c1ca96a72a93c29ee37109b9231a208a153b02842d8988648783c09ec6b80cf +size 671150064 diff --git a/single_dataset/img2json/VideoGameBunny_v1_1-Llama-3-8B-V-img2json_dataset_10000_epochs_1_lora/config.json b/single_dataset/img2json/VideoGameBunny_v1_1-Llama-3-8B-V-img2json_dataset_10000_epochs_1_lora/config.json new file mode 100644 index 0000000000000000000000000000000000000000..ee7ed86f4ff6f3381a6bb24981934be89aa376e7 --- /dev/null +++ b/single_dataset/img2json/VideoGameBunny_v1_1-Llama-3-8B-V-img2json_dataset_10000_epochs_1_lora/config.json @@ -0,0 +1,45 @@ +{ + "_name_or_path": "./weights/Bunny-v1_1-Llama-3-8B-V", + "architectures": [ + "BunnyLlamaForCausalLM" + ], + "attention_bias": false, + "attention_dropout": 0.0, + "auto_map": { + "AutoConfig": "configuration_bunny_llama.BunnyLlamaConfig", + "AutoModelForCausalLM": "modeling_bunny_llama.BunnyLlamaForCausalLM" + }, + "bos_token_id": 128000, + "continuous_training": false, + "eos_token_id": 128001, + "freeze_mm_mlp_adapter": false, + "hidden_act": "silu", + "hidden_size": 4096, + "image_aspect_ratio": "pad", + "initializer_range": 0.02, + "intermediate_size": 14336, + "max_position_embeddings": 8192, + "mm_hidden_size": 3456, + "mm_projector_lr": null, + "mm_projector_type": "mlp2x_gelu", + "mm_vision_tower": "./weights/siglip-so400m-patch14-384", + "model_type": "bunny-llama", + "num_attention_heads": 32, + "num_hidden_layers": 32, + "num_key_value_heads": 8, + "pretraining_tp": 1, + "rms_norm_eps": 1e-05, + "rope_scaling": null, + "rope_theta": 500000.0, + "tie_word_embeddings": false, + "tokenizer_model_max_length": 2048, + "tokenizer_padding_side": "right", + "torch_dtype": "float16", + "transformers_version": "4.41.2", + "tune_mm_mlp_adapter": false, + "unfreeze_vision_tower": true, + "use_cache": true, + "use_mm_proj": true, + "use_s2": true, + "vocab_size": 128256 +} diff --git a/single_dataset/img2json/VideoGameBunny_v1_1-Llama-3-8B-V-img2json_dataset_10000_epochs_1_lora/non_lora_trainables.bin b/single_dataset/img2json/VideoGameBunny_v1_1-Llama-3-8B-V-img2json_dataset_10000_epochs_1_lora/non_lora_trainables.bin new file mode 100644 index 0000000000000000000000000000000000000000..116dc51c700b5ebf12e85de8035d6d578066c895 --- /dev/null +++ b/single_dataset/img2json/VideoGameBunny_v1_1-Llama-3-8B-V-img2json_dataset_10000_epochs_1_lora/non_lora_trainables.bin @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:7942b3ff9fdd57de7917cc18505f63252b815f5b36cbf3fe2a9df7d46706f464 +size 918507402 diff --git a/single_dataset/img2json/VideoGameBunny_v1_1-Llama-3-8B-V-img2json_dataset_10000_epochs_1_lora/trainer_state.json b/single_dataset/img2json/VideoGameBunny_v1_1-Llama-3-8B-V-img2json_dataset_10000_epochs_1_lora/trainer_state.json new file mode 100644 index 0000000000000000000000000000000000000000..781b8168ef9f50ae8debcd64dd418c2d0a4c94ea --- /dev/null +++ b/single_dataset/img2json/VideoGameBunny_v1_1-Llama-3-8B-V-img2json_dataset_10000_epochs_1_lora/trainer_state.json @@ -0,0 +1,4417 @@ +{ + "best_metric": null, + "best_model_checkpoint": null, + "epoch": 1.0, + "eval_steps": 500, + "global_step": 625, + "is_hyper_param_search": false, + "is_local_process_zero": true, + "is_world_process_zero": true, + "log_history": [ + { + "epoch": 0.0016, + "grad_norm": 1.229373798278603, + "learning_rate": 1.0526315789473684e-05, + "loss": 1.5995, + "step": 1 + }, + { + "epoch": 0.0032, + "grad_norm": 1.3255378585572952, + "learning_rate": 2.105263157894737e-05, + "loss": 1.613, + "step": 2 + }, + { + "epoch": 0.0048, + "grad_norm": 1.1225274609868587, + "learning_rate": 3.157894736842105e-05, + "loss": 1.5345, + "step": 3 + }, + { + "epoch": 0.0064, + "grad_norm": 0.9601904040026625, + "learning_rate": 4.210526315789474e-05, + "loss": 1.4265, + "step": 4 + }, + { + "epoch": 0.008, + "grad_norm": 0.9802642339973807, + "learning_rate": 5.2631578947368424e-05, + "loss": 1.2977, + "step": 5 + }, + { + "epoch": 0.0096, + "grad_norm": 1.068848268073717, + "learning_rate": 6.31578947368421e-05, + "loss": 1.2638, + "step": 6 + }, + { + "epoch": 0.0112, + "grad_norm": 1.0871580440341346, + "learning_rate": 7.368421052631579e-05, + "loss": 1.1175, + "step": 7 + }, + { + "epoch": 0.0128, + "grad_norm": 0.9068610495921913, + "learning_rate": 8.421052631578948e-05, + "loss": 1.0853, + "step": 8 + }, + { + "epoch": 0.0144, + "grad_norm": 0.9811047071816428, + "learning_rate": 9.473684210526316e-05, + "loss": 0.9156, + "step": 9 + }, + { + "epoch": 0.016, + "grad_norm": 0.8447891281203188, + "learning_rate": 0.00010526315789473685, + "loss": 0.9022, + "step": 10 + }, + { + "epoch": 0.0176, + "grad_norm": 0.7461951662582579, + "learning_rate": 0.00011578947368421053, + "loss": 0.9529, + "step": 11 + }, + { + "epoch": 0.0192, + "grad_norm": 0.6177951220259487, + "learning_rate": 0.0001263157894736842, + "loss": 0.8863, + "step": 12 + }, + { + "epoch": 0.0208, + "grad_norm": 0.5348190521442691, + "learning_rate": 0.0001368421052631579, + "loss": 0.8834, + "step": 13 + }, + { + "epoch": 0.0224, + "grad_norm": 0.5413077228307152, + "learning_rate": 0.00014736842105263158, + "loss": 0.8928, + "step": 14 + }, + { + "epoch": 0.024, + "grad_norm": 0.537046673330067, + "learning_rate": 0.00015789473684210527, + "loss": 0.8687, + "step": 15 + }, + { + "epoch": 0.0256, + "grad_norm": 0.5002112492477047, + "learning_rate": 0.00016842105263157895, + "loss": 0.811, + "step": 16 + }, + { + "epoch": 0.0272, + "grad_norm": 0.5366964322274403, + "learning_rate": 0.00017894736842105264, + "loss": 0.8333, + "step": 17 + }, + { + "epoch": 0.0288, + "grad_norm": 0.5014078956773536, + "learning_rate": 0.00018947368421052632, + "loss": 0.8687, + "step": 18 + }, + { + "epoch": 0.0304, + "grad_norm": 0.5252222393318335, + "learning_rate": 0.0002, + "loss": 0.8638, + "step": 19 + }, + { + "epoch": 0.032, + "grad_norm": 0.5373932752091392, + "learning_rate": 0.00019999865623437013, + "loss": 0.9097, + "step": 20 + }, + { + "epoch": 0.0336, + "grad_norm": 0.4762690764333146, + "learning_rate": 0.00019999462497359466, + "loss": 0.8766, + "step": 21 + }, + { + "epoch": 0.0352, + "grad_norm": 0.4568017154233179, + "learning_rate": 0.00019998790632601496, + "loss": 0.7836, + "step": 22 + }, + { + "epoch": 0.0368, + "grad_norm": 0.5001790554030369, + "learning_rate": 0.0001999785004721968, + "loss": 0.8819, + "step": 23 + }, + { + "epoch": 0.0384, + "grad_norm": 0.4686071769810681, + "learning_rate": 0.00019996640766492543, + "loss": 0.8376, + "step": 24 + }, + { + "epoch": 0.04, + "grad_norm": 0.3951476828707915, + "learning_rate": 0.00019995162822919883, + "loss": 0.7475, + "step": 25 + }, + { + "epoch": 0.0416, + "grad_norm": 0.4204532563793381, + "learning_rate": 0.00019993416256221895, + "loss": 0.8023, + "step": 26 + }, + { + "epoch": 0.0432, + "grad_norm": 0.43605750796653514, + "learning_rate": 0.00019991401113338104, + "loss": 0.7777, + "step": 27 + }, + { + "epoch": 0.0448, + "grad_norm": 0.4312291066927483, + "learning_rate": 0.00019989117448426108, + "loss": 0.8084, + "step": 28 + }, + { + "epoch": 0.0464, + "grad_norm": 0.4422682479152776, + "learning_rate": 0.00019986565322860115, + "loss": 0.8133, + "step": 29 + }, + { + "epoch": 0.048, + "grad_norm": 0.4877596727466243, + "learning_rate": 0.00019983744805229296, + "loss": 0.7581, + "step": 30 + }, + { + "epoch": 0.0496, + "grad_norm": 0.42662048000259273, + "learning_rate": 0.00019980655971335945, + "loss": 0.7395, + "step": 31 + }, + { + "epoch": 0.0512, + "grad_norm": 0.47553941708746517, + "learning_rate": 0.00019977298904193437, + "loss": 0.8115, + "step": 32 + }, + { + "epoch": 0.0528, + "grad_norm": 0.4373444242596017, + "learning_rate": 0.00019973673694024, + "loss": 0.7637, + "step": 33 + }, + { + "epoch": 0.0544, + "grad_norm": 0.46828037568677117, + "learning_rate": 0.00019969780438256293, + "loss": 0.8175, + "step": 34 + }, + { + "epoch": 0.056, + "grad_norm": 0.43057672078604997, + "learning_rate": 0.0001996561924152278, + "loss": 0.7972, + "step": 35 + }, + { + "epoch": 0.0576, + "grad_norm": 0.4162655730684327, + "learning_rate": 0.0001996119021565693, + "loss": 0.7981, + "step": 36 + }, + { + "epoch": 0.0592, + "grad_norm": 0.40732518436942894, + "learning_rate": 0.0001995649347969019, + "loss": 0.7856, + "step": 37 + }, + { + "epoch": 0.0608, + "grad_norm": 0.40951222619704847, + "learning_rate": 0.00019951529159848805, + "loss": 0.7895, + "step": 38 + }, + { + "epoch": 0.0624, + "grad_norm": 0.4291197421264731, + "learning_rate": 0.00019946297389550433, + "loss": 0.7863, + "step": 39 + }, + { + "epoch": 0.064, + "grad_norm": 0.4401239448447159, + "learning_rate": 0.00019940798309400526, + "loss": 0.7659, + "step": 40 + }, + { + "epoch": 0.0656, + "grad_norm": 0.39828178872140096, + "learning_rate": 0.0001993503206718859, + "loss": 0.7714, + "step": 41 + }, + { + "epoch": 0.0672, + "grad_norm": 0.4107568482589156, + "learning_rate": 0.00019928998817884182, + "loss": 0.7668, + "step": 42 + }, + { + "epoch": 0.0688, + "grad_norm": 0.4113529823060761, + "learning_rate": 0.00019922698723632767, + "loss": 0.7933, + "step": 43 + }, + { + "epoch": 0.0704, + "grad_norm": 0.42928009327160627, + "learning_rate": 0.00019916131953751342, + "loss": 0.7457, + "step": 44 + }, + { + "epoch": 0.072, + "grad_norm": 0.44243052895313983, + "learning_rate": 0.00019909298684723904, + "loss": 0.7841, + "step": 45 + }, + { + "epoch": 0.0736, + "grad_norm": 0.4063400800613472, + "learning_rate": 0.00019902199100196697, + "loss": 0.7806, + "step": 46 + }, + { + "epoch": 0.0752, + "grad_norm": 0.4205339635927461, + "learning_rate": 0.00019894833390973266, + "loss": 0.7937, + "step": 47 + }, + { + "epoch": 0.0768, + "grad_norm": 0.3994926519851495, + "learning_rate": 0.00019887201755009357, + "loss": 0.7852, + "step": 48 + }, + { + "epoch": 0.0784, + "grad_norm": 0.42462911990010227, + "learning_rate": 0.0001987930439740757, + "loss": 0.8006, + "step": 49 + }, + { + "epoch": 0.08, + "grad_norm": 0.4000315027275353, + "learning_rate": 0.00019871141530411853, + "loss": 0.7605, + "step": 50 + }, + { + "epoch": 0.0816, + "grad_norm": 0.39076661860395445, + "learning_rate": 0.0001986271337340182, + "loss": 0.7824, + "step": 51 + }, + { + "epoch": 0.0832, + "grad_norm": 0.41280970200617884, + "learning_rate": 0.00019854020152886814, + "loss": 0.7927, + "step": 52 + }, + { + "epoch": 0.0848, + "grad_norm": 0.436430398186534, + "learning_rate": 0.0001984506210249986, + "loss": 0.7576, + "step": 53 + }, + { + "epoch": 0.0864, + "grad_norm": 0.46076369353240937, + "learning_rate": 0.00019835839462991361, + "loss": 0.7212, + "step": 54 + }, + { + "epoch": 0.088, + "grad_norm": 0.4040674327666037, + "learning_rate": 0.00019826352482222638, + "loss": 0.7347, + "step": 55 + }, + { + "epoch": 0.0896, + "grad_norm": 0.4101966638057085, + "learning_rate": 0.00019816601415159263, + "loss": 0.7761, + "step": 56 + }, + { + "epoch": 0.0912, + "grad_norm": 0.42413278233874757, + "learning_rate": 0.0001980658652386421, + "loss": 0.7647, + "step": 57 + }, + { + "epoch": 0.0928, + "grad_norm": 0.42302978595377466, + "learning_rate": 0.00019796308077490817, + "loss": 0.7519, + "step": 58 + }, + { + "epoch": 0.0944, + "grad_norm": 0.40006395047526183, + "learning_rate": 0.00019785766352275542, + "loss": 0.7763, + "step": 59 + }, + { + "epoch": 0.096, + "grad_norm": 0.41306878173353634, + "learning_rate": 0.00019774961631530545, + "loss": 0.786, + "step": 60 + }, + { + "epoch": 0.0976, + "grad_norm": 0.4032602892912269, + "learning_rate": 0.00019763894205636072, + "loss": 0.7688, + "step": 61 + }, + { + "epoch": 0.0992, + "grad_norm": 0.3892433009854969, + "learning_rate": 0.00019752564372032657, + "loss": 0.7553, + "step": 62 + }, + { + "epoch": 0.1008, + "grad_norm": 0.4004143313998048, + "learning_rate": 0.00019740972435213115, + "loss": 0.7814, + "step": 63 + }, + { + "epoch": 0.1024, + "grad_norm": 0.4183210940841329, + "learning_rate": 0.00019729118706714375, + "loss": 0.7779, + "step": 64 + }, + { + "epoch": 0.104, + "grad_norm": 0.42247226472149213, + "learning_rate": 0.00019717003505109095, + "loss": 0.8057, + "step": 65 + }, + { + "epoch": 0.1056, + "grad_norm": 0.4195024410862326, + "learning_rate": 0.00019704627155997108, + "loss": 0.7776, + "step": 66 + }, + { + "epoch": 0.1072, + "grad_norm": 0.40403657134063525, + "learning_rate": 0.00019691989991996663, + "loss": 0.7967, + "step": 67 + }, + { + "epoch": 0.1088, + "grad_norm": 0.38704775081201337, + "learning_rate": 0.0001967909235273549, + "loss": 0.7602, + "step": 68 + }, + { + "epoch": 0.1104, + "grad_norm": 0.4023423479962683, + "learning_rate": 0.00019665934584841682, + "loss": 0.7905, + "step": 69 + }, + { + "epoch": 0.112, + "grad_norm": 0.38501121315792136, + "learning_rate": 0.00019652517041934356, + "loss": 0.7468, + "step": 70 + }, + { + "epoch": 0.1136, + "grad_norm": 0.4091701530186358, + "learning_rate": 0.00019638840084614182, + "loss": 0.7996, + "step": 71 + }, + { + "epoch": 0.1152, + "grad_norm": 0.36355727696105633, + "learning_rate": 0.00019624904080453655, + "loss": 0.7519, + "step": 72 + }, + { + "epoch": 0.1168, + "grad_norm": 0.47848552592213617, + "learning_rate": 0.00019610709403987246, + "loss": 0.7495, + "step": 73 + }, + { + "epoch": 0.1184, + "grad_norm": 0.39357553915086346, + "learning_rate": 0.00019596256436701324, + "loss": 0.7767, + "step": 74 + }, + { + "epoch": 0.12, + "grad_norm": 0.3957171032861638, + "learning_rate": 0.000195815455670239, + "loss": 0.7751, + "step": 75 + }, + { + "epoch": 0.1216, + "grad_norm": 0.3891948568622778, + "learning_rate": 0.00019566577190314197, + "loss": 0.6961, + "step": 76 + }, + { + "epoch": 0.1232, + "grad_norm": 0.3920291011646516, + "learning_rate": 0.0001955135170885202, + "loss": 0.8146, + "step": 77 + }, + { + "epoch": 0.1248, + "grad_norm": 0.36170578672528475, + "learning_rate": 0.00019535869531826937, + "loss": 0.7712, + "step": 78 + }, + { + "epoch": 0.1264, + "grad_norm": 0.3848155321538925, + "learning_rate": 0.00019520131075327298, + "loss": 0.7489, + "step": 79 + }, + { + "epoch": 0.128, + "grad_norm": 0.39347713045374183, + "learning_rate": 0.00019504136762329047, + "loss": 0.7946, + "step": 80 + }, + { + "epoch": 0.1296, + "grad_norm": 0.3821383752069123, + "learning_rate": 0.00019487887022684336, + "loss": 0.7531, + "step": 81 + }, + { + "epoch": 0.1312, + "grad_norm": 0.3667234454853835, + "learning_rate": 0.00019471382293110003, + "loss": 0.8051, + "step": 82 + }, + { + "epoch": 0.1328, + "grad_norm": 0.3932102128704723, + "learning_rate": 0.00019454623017175812, + "loss": 0.7757, + "step": 83 + }, + { + "epoch": 0.1344, + "grad_norm": 0.38616189039631826, + "learning_rate": 0.00019437609645292546, + "loss": 0.7179, + "step": 84 + }, + { + "epoch": 0.136, + "grad_norm": 0.4001982588795188, + "learning_rate": 0.0001942034263469989, + "loss": 0.7891, + "step": 85 + }, + { + "epoch": 0.1376, + "grad_norm": 0.3891513932660926, + "learning_rate": 0.00019402822449454153, + "loss": 0.769, + "step": 86 + }, + { + "epoch": 0.1392, + "grad_norm": 0.42116323271475387, + "learning_rate": 0.00019385049560415794, + "loss": 0.8356, + "step": 87 + }, + { + "epoch": 0.1408, + "grad_norm": 0.37430938000545133, + "learning_rate": 0.00019367024445236754, + "loss": 0.7141, + "step": 88 + }, + { + "epoch": 0.1424, + "grad_norm": 0.3986562727168721, + "learning_rate": 0.00019348747588347637, + "loss": 0.7183, + "step": 89 + }, + { + "epoch": 0.144, + "grad_norm": 0.3968462925532793, + "learning_rate": 0.00019330219480944694, + "loss": 0.7722, + "step": 90 + }, + { + "epoch": 0.1456, + "grad_norm": 0.3936868026019858, + "learning_rate": 0.00019311440620976597, + "loss": 0.7625, + "step": 91 + }, + { + "epoch": 0.1472, + "grad_norm": 0.3953996953277331, + "learning_rate": 0.0001929241151313108, + "loss": 0.736, + "step": 92 + }, + { + "epoch": 0.1488, + "grad_norm": 0.40725778287562764, + "learning_rate": 0.00019273132668821364, + "loss": 0.7957, + "step": 93 + }, + { + "epoch": 0.1504, + "grad_norm": 0.3956730097996287, + "learning_rate": 0.00019253604606172417, + "loss": 0.7145, + "step": 94 + }, + { + "epoch": 0.152, + "grad_norm": 0.3856174409454427, + "learning_rate": 0.00019233827850007027, + "loss": 0.7535, + "step": 95 + }, + { + "epoch": 0.1536, + "grad_norm": 0.397956094443797, + "learning_rate": 0.00019213802931831696, + "loss": 0.7729, + "step": 96 + }, + { + "epoch": 0.1552, + "grad_norm": 0.36657234780092485, + "learning_rate": 0.00019193530389822363, + "loss": 0.7173, + "step": 97 + }, + { + "epoch": 0.1568, + "grad_norm": 0.40634460482101714, + "learning_rate": 0.00019173010768809933, + "loss": 0.7646, + "step": 98 + }, + { + "epoch": 0.1584, + "grad_norm": 0.366949505480132, + "learning_rate": 0.0001915224462026563, + "loss": 0.7345, + "step": 99 + }, + { + "epoch": 0.16, + "grad_norm": 0.3809635372006216, + "learning_rate": 0.00019131232502286188, + "loss": 0.7884, + "step": 100 + }, + { + "epoch": 0.1616, + "grad_norm": 0.43363867156407665, + "learning_rate": 0.0001910997497957885, + "loss": 0.8288, + "step": 101 + }, + { + "epoch": 0.1632, + "grad_norm": 0.3720900596896555, + "learning_rate": 0.00019088472623446183, + "loss": 0.6943, + "step": 102 + }, + { + "epoch": 0.1648, + "grad_norm": 0.7634072094055883, + "learning_rate": 0.00019066726011770726, + "loss": 0.6921, + "step": 103 + }, + { + "epoch": 0.1664, + "grad_norm": 0.3816278455124801, + "learning_rate": 0.0001904473572899947, + "loss": 0.7996, + "step": 104 + }, + { + "epoch": 0.168, + "grad_norm": 0.4097928192635754, + "learning_rate": 0.00019022502366128135, + "loss": 0.7669, + "step": 105 + }, + { + "epoch": 0.1696, + "grad_norm": 0.41740583671897846, + "learning_rate": 0.00019000026520685302, + "loss": 0.7738, + "step": 106 + }, + { + "epoch": 0.1712, + "grad_norm": 0.39027372813301175, + "learning_rate": 0.0001897730879671634, + "loss": 0.7115, + "step": 107 + }, + { + "epoch": 0.1728, + "grad_norm": 0.3964970065919041, + "learning_rate": 0.00018954349804767184, + "loss": 0.7536, + "step": 108 + }, + { + "epoch": 0.1744, + "grad_norm": 0.4118659445973478, + "learning_rate": 0.00018931150161867916, + "loss": 0.7963, + "step": 109 + }, + { + "epoch": 0.176, + "grad_norm": 0.37631494230013696, + "learning_rate": 0.00018907710491516199, + "loss": 0.7276, + "step": 110 + }, + { + "epoch": 0.1776, + "grad_norm": 0.38303900808802116, + "learning_rate": 0.0001888403142366049, + "loss": 0.7686, + "step": 111 + }, + { + "epoch": 0.1792, + "grad_norm": 0.4617088675955906, + "learning_rate": 0.00018860113594683148, + "loss": 0.7056, + "step": 112 + }, + { + "epoch": 0.1808, + "grad_norm": 0.4007475923943109, + "learning_rate": 0.00018835957647383303, + "loss": 0.7646, + "step": 113 + }, + { + "epoch": 0.1824, + "grad_norm": 0.42668599104118365, + "learning_rate": 0.00018811564230959588, + "loss": 0.7578, + "step": 114 + }, + { + "epoch": 0.184, + "grad_norm": 0.48137065952465374, + "learning_rate": 0.00018786934000992688, + "loss": 0.6984, + "step": 115 + }, + { + "epoch": 0.1856, + "grad_norm": 0.37822476605881156, + "learning_rate": 0.00018762067619427746, + "loss": 0.6791, + "step": 116 + }, + { + "epoch": 0.1872, + "grad_norm": 0.3901709397441698, + "learning_rate": 0.00018736965754556528, + "loss": 0.7179, + "step": 117 + }, + { + "epoch": 0.1888, + "grad_norm": 0.37130511642749203, + "learning_rate": 0.00018711629080999504, + "loss": 0.6907, + "step": 118 + }, + { + "epoch": 0.1904, + "grad_norm": 0.402217274699389, + "learning_rate": 0.00018686058279687698, + "loss": 0.7473, + "step": 119 + }, + { + "epoch": 0.192, + "grad_norm": 0.4188277958180444, + "learning_rate": 0.00018660254037844388, + "loss": 0.7372, + "step": 120 + }, + { + "epoch": 0.1936, + "grad_norm": 0.38853389068801125, + "learning_rate": 0.00018634217048966637, + "loss": 0.6853, + "step": 121 + }, + { + "epoch": 0.1952, + "grad_norm": 0.4169792734028954, + "learning_rate": 0.0001860794801280666, + "loss": 0.7119, + "step": 122 + }, + { + "epoch": 0.1968, + "grad_norm": 0.38595621654016343, + "learning_rate": 0.0001858144763535302, + "loss": 0.6993, + "step": 123 + }, + { + "epoch": 0.1984, + "grad_norm": 0.3900268823974102, + "learning_rate": 0.0001855471662881164, + "loss": 0.7333, + "step": 124 + }, + { + "epoch": 0.2, + "grad_norm": 0.36289349941433785, + "learning_rate": 0.00018527755711586678, + "loss": 0.7228, + "step": 125 + }, + { + "epoch": 0.2016, + "grad_norm": 0.3704974542399799, + "learning_rate": 0.00018500565608261214, + "loss": 0.7049, + "step": 126 + }, + { + "epoch": 0.2032, + "grad_norm": 0.38860747392520834, + "learning_rate": 0.00018473147049577774, + "loss": 0.8067, + "step": 127 + }, + { + "epoch": 0.2048, + "grad_norm": 0.35710911077277013, + "learning_rate": 0.00018445500772418697, + "loss": 0.7114, + "step": 128 + }, + { + "epoch": 0.2064, + "grad_norm": 0.3593712961722801, + "learning_rate": 0.00018417627519786315, + "loss": 0.7564, + "step": 129 + }, + { + "epoch": 0.208, + "grad_norm": 0.39059526056982474, + "learning_rate": 0.00018389528040783012, + "loss": 0.6863, + "step": 130 + }, + { + "epoch": 0.2096, + "grad_norm": 0.3649404379752295, + "learning_rate": 0.00018361203090591071, + "loss": 0.7655, + "step": 131 + }, + { + "epoch": 0.2112, + "grad_norm": 0.41483604598870905, + "learning_rate": 0.00018332653430452376, + "loss": 0.7418, + "step": 132 + }, + { + "epoch": 0.2128, + "grad_norm": 0.40122957922475827, + "learning_rate": 0.00018303879827647975, + "loss": 0.7252, + "step": 133 + }, + { + "epoch": 0.2144, + "grad_norm": 0.38125659969112374, + "learning_rate": 0.00018274883055477436, + "loss": 0.7394, + "step": 134 + }, + { + "epoch": 0.216, + "grad_norm": 0.39193173095160444, + "learning_rate": 0.00018245663893238075, + "loss": 0.7334, + "step": 135 + }, + { + "epoch": 0.2176, + "grad_norm": 0.3852015284724536, + "learning_rate": 0.00018216223126204007, + "loss": 0.7274, + "step": 136 + }, + { + "epoch": 0.2192, + "grad_norm": 0.3853228321924374, + "learning_rate": 0.00018186561545605054, + "loss": 0.7309, + "step": 137 + }, + { + "epoch": 0.2208, + "grad_norm": 0.3859003592587561, + "learning_rate": 0.00018156679948605467, + "loss": 0.7459, + "step": 138 + }, + { + "epoch": 0.2224, + "grad_norm": 0.3955167140395012, + "learning_rate": 0.00018126579138282503, + "loss": 0.7377, + "step": 139 + }, + { + "epoch": 0.224, + "grad_norm": 0.3849858979613698, + "learning_rate": 0.0001809625992360485, + "loss": 0.7148, + "step": 140 + }, + { + "epoch": 0.2256, + "grad_norm": 0.3827161665380854, + "learning_rate": 0.00018065723119410884, + "loss": 0.7162, + "step": 141 + }, + { + "epoch": 0.2272, + "grad_norm": 0.38274975285166835, + "learning_rate": 0.00018034969546386757, + "loss": 0.7363, + "step": 142 + }, + { + "epoch": 0.2288, + "grad_norm": 0.4511367752683735, + "learning_rate": 0.0001800400003104436, + "loss": 0.7714, + "step": 143 + }, + { + "epoch": 0.2304, + "grad_norm": 0.3929031484028961, + "learning_rate": 0.00017972815405699103, + "loss": 0.7105, + "step": 144 + }, + { + "epoch": 0.232, + "grad_norm": 0.3909829530272507, + "learning_rate": 0.00017941416508447536, + "loss": 0.76, + "step": 145 + }, + { + "epoch": 0.2336, + "grad_norm": 0.4005882700079233, + "learning_rate": 0.0001790980418314484, + "loss": 0.7263, + "step": 146 + }, + { + "epoch": 0.2352, + "grad_norm": 0.4242225472595463, + "learning_rate": 0.00017877979279382135, + "loss": 0.7437, + "step": 147 + }, + { + "epoch": 0.2368, + "grad_norm": 0.38146844799520946, + "learning_rate": 0.0001784594265246366, + "loss": 0.7018, + "step": 148 + }, + { + "epoch": 0.2384, + "grad_norm": 0.41123976073579627, + "learning_rate": 0.0001781369516338378, + "loss": 0.7277, + "step": 149 + }, + { + "epoch": 0.24, + "grad_norm": 0.3645878348641131, + "learning_rate": 0.00017781237678803847, + "loss": 0.6988, + "step": 150 + }, + { + "epoch": 0.2416, + "grad_norm": 0.3716361961263643, + "learning_rate": 0.000177485710710289, + "loss": 0.7289, + "step": 151 + }, + { + "epoch": 0.2432, + "grad_norm": 0.3863591358843344, + "learning_rate": 0.00017715696217984235, + "loss": 0.6796, + "step": 152 + }, + { + "epoch": 0.2448, + "grad_norm": 0.3755277313037591, + "learning_rate": 0.00017682614003191807, + "loss": 0.7121, + "step": 153 + }, + { + "epoch": 0.2464, + "grad_norm": 0.3772428656770563, + "learning_rate": 0.00017649325315746478, + "loss": 0.7327, + "step": 154 + }, + { + "epoch": 0.248, + "grad_norm": 0.3863558598033965, + "learning_rate": 0.0001761583105029213, + "loss": 0.7667, + "step": 155 + }, + { + "epoch": 0.2496, + "grad_norm": 0.3700798736664158, + "learning_rate": 0.00017582132106997616, + "loss": 0.7297, + "step": 156 + }, + { + "epoch": 0.2512, + "grad_norm": 0.3625692215921915, + "learning_rate": 0.00017548229391532572, + "loss": 0.6769, + "step": 157 + }, + { + "epoch": 0.2528, + "grad_norm": 0.39900910764233755, + "learning_rate": 0.00017514123815043074, + "loss": 0.73, + "step": 158 + }, + { + "epoch": 0.2544, + "grad_norm": 0.35824773081382993, + "learning_rate": 0.00017479816294127152, + "loss": 0.6944, + "step": 159 + }, + { + "epoch": 0.256, + "grad_norm": 0.36142203894300484, + "learning_rate": 0.0001744530775081015, + "loss": 0.678, + "step": 160 + }, + { + "epoch": 0.2576, + "grad_norm": 0.3975821452535094, + "learning_rate": 0.0001741059911251997, + "loss": 0.7197, + "step": 161 + }, + { + "epoch": 0.2592, + "grad_norm": 0.3936341912600741, + "learning_rate": 0.000173756913120621, + "loss": 0.7609, + "step": 162 + }, + { + "epoch": 0.2608, + "grad_norm": 0.364201706851904, + "learning_rate": 0.00017340585287594604, + "loss": 0.7399, + "step": 163 + }, + { + "epoch": 0.2624, + "grad_norm": 0.37875567954037376, + "learning_rate": 0.0001730528198260285, + "loss": 0.7793, + "step": 164 + }, + { + "epoch": 0.264, + "grad_norm": 0.360667351795953, + "learning_rate": 0.00017269782345874203, + "loss": 0.6545, + "step": 165 + }, + { + "epoch": 0.2656, + "grad_norm": 0.3902385173718295, + "learning_rate": 0.00017234087331472497, + "loss": 0.7456, + "step": 166 + }, + { + "epoch": 0.2672, + "grad_norm": 0.39398866659150356, + "learning_rate": 0.00017198197898712404, + "loss": 0.735, + "step": 167 + }, + { + "epoch": 0.2688, + "grad_norm": 0.3743528690845882, + "learning_rate": 0.00017162115012133643, + "loss": 0.778, + "step": 168 + }, + { + "epoch": 0.2704, + "grad_norm": 0.39915123353703374, + "learning_rate": 0.00017125839641475072, + "loss": 0.6924, + "step": 169 + }, + { + "epoch": 0.272, + "grad_norm": 0.38414665513613727, + "learning_rate": 0.00017089372761648616, + "loss": 0.7476, + "step": 170 + }, + { + "epoch": 0.2736, + "grad_norm": 0.3812020107036411, + "learning_rate": 0.00017052715352713075, + "loss": 0.7083, + "step": 171 + }, + { + "epoch": 0.2752, + "grad_norm": 0.38019590830017097, + "learning_rate": 0.00017015868399847768, + "loss": 0.7413, + "step": 172 + }, + { + "epoch": 0.2768, + "grad_norm": 0.3916000729224414, + "learning_rate": 0.00016978832893326074, + "loss": 0.7144, + "step": 173 + }, + { + "epoch": 0.2784, + "grad_norm": 0.39835029355072454, + "learning_rate": 0.00016941609828488807, + "loss": 0.7183, + "step": 174 + }, + { + "epoch": 0.28, + "grad_norm": 0.39104548258367283, + "learning_rate": 0.0001690420020571747, + "loss": 0.6871, + "step": 175 + }, + { + "epoch": 0.2816, + "grad_norm": 0.38502788895335127, + "learning_rate": 0.0001686660503040737, + "loss": 0.7586, + "step": 176 + }, + { + "epoch": 0.2832, + "grad_norm": 0.42484728564668284, + "learning_rate": 0.00016828825312940592, + "loss": 0.7384, + "step": 177 + }, + { + "epoch": 0.2848, + "grad_norm": 0.39800478681057694, + "learning_rate": 0.0001679086206865886, + "loss": 0.6594, + "step": 178 + }, + { + "epoch": 0.2864, + "grad_norm": 0.3972473284811008, + "learning_rate": 0.00016752716317836229, + "loss": 0.7076, + "step": 179 + }, + { + "epoch": 0.288, + "grad_norm": 0.3836320462207601, + "learning_rate": 0.0001671438908565167, + "loss": 0.7248, + "step": 180 + }, + { + "epoch": 0.2896, + "grad_norm": 0.3773777910389116, + "learning_rate": 0.00016675881402161536, + "loss": 0.6984, + "step": 181 + }, + { + "epoch": 0.2912, + "grad_norm": 0.4507746764915864, + "learning_rate": 0.0001663719430227186, + "loss": 0.772, + "step": 182 + }, + { + "epoch": 0.2928, + "grad_norm": 0.46866078541687584, + "learning_rate": 0.00016598328825710533, + "loss": 0.7244, + "step": 183 + }, + { + "epoch": 0.2944, + "grad_norm": 0.4422461097872862, + "learning_rate": 0.000165592860169994, + "loss": 0.697, + "step": 184 + }, + { + "epoch": 0.296, + "grad_norm": 0.4121388436172653, + "learning_rate": 0.00016520066925426144, + "loss": 0.7225, + "step": 185 + }, + { + "epoch": 0.2976, + "grad_norm": 0.43545976521929347, + "learning_rate": 0.0001648067260501611, + "loss": 0.7725, + "step": 186 + }, + { + "epoch": 0.2992, + "grad_norm": 0.3804683222868557, + "learning_rate": 0.0001644110411450398, + "loss": 0.6659, + "step": 187 + }, + { + "epoch": 0.3008, + "grad_norm": 0.41487982913028726, + "learning_rate": 0.00016401362517305296, + "loss": 0.7619, + "step": 188 + }, + { + "epoch": 0.3024, + "grad_norm": 0.38962666625247283, + "learning_rate": 0.00016361448881487914, + "loss": 0.6597, + "step": 189 + }, + { + "epoch": 0.304, + "grad_norm": 0.4228344639001, + "learning_rate": 0.00016321364279743266, + "loss": 0.707, + "step": 190 + }, + { + "epoch": 0.3056, + "grad_norm": 0.42015839182108944, + "learning_rate": 0.0001628110978935756, + "loss": 0.7653, + "step": 191 + }, + { + "epoch": 0.3072, + "grad_norm": 0.37683110692755134, + "learning_rate": 0.00016240686492182804, + "loss": 0.7252, + "step": 192 + }, + { + "epoch": 0.3088, + "grad_norm": 0.3772024054374649, + "learning_rate": 0.00016200095474607753, + "loss": 0.6891, + "step": 193 + }, + { + "epoch": 0.3104, + "grad_norm": 0.4195457931474166, + "learning_rate": 0.00016159337827528685, + "loss": 0.7618, + "step": 194 + }, + { + "epoch": 0.312, + "grad_norm": 0.3720664397945769, + "learning_rate": 0.0001611841464632011, + "loss": 0.6919, + "step": 195 + }, + { + "epoch": 0.3136, + "grad_norm": 0.3740925748103595, + "learning_rate": 0.0001607732703080532, + "loss": 0.6873, + "step": 196 + }, + { + "epoch": 0.3152, + "grad_norm": 0.37428951513062436, + "learning_rate": 0.00016036076085226814, + "loss": 0.6701, + "step": 197 + }, + { + "epoch": 0.3168, + "grad_norm": 0.38121318450586267, + "learning_rate": 0.0001599466291821666, + "loss": 0.6679, + "step": 198 + }, + { + "epoch": 0.3184, + "grad_norm": 0.38597591689559313, + "learning_rate": 0.0001595308864276666, + "loss": 0.6978, + "step": 199 + }, + { + "epoch": 0.32, + "grad_norm": 0.39397945622113667, + "learning_rate": 0.0001591135437619847, + "loss": 0.7049, + "step": 200 + }, + { + "epoch": 0.3216, + "grad_norm": 0.38176167402393113, + "learning_rate": 0.0001586946124013354, + "loss": 0.7353, + "step": 201 + }, + { + "epoch": 0.3232, + "grad_norm": 0.3670506697529049, + "learning_rate": 0.0001582741036046301, + "loss": 0.7342, + "step": 202 + }, + { + "epoch": 0.3248, + "grad_norm": 0.3892830499788043, + "learning_rate": 0.00015785202867317407, + "loss": 0.7281, + "step": 203 + }, + { + "epoch": 0.3264, + "grad_norm": 0.36666548505760843, + "learning_rate": 0.00015742839895036305, + "loss": 0.7122, + "step": 204 + }, + { + "epoch": 0.328, + "grad_norm": 0.3824410040805799, + "learning_rate": 0.00015700322582137827, + "loss": 0.7042, + "step": 205 + }, + { + "epoch": 0.3296, + "grad_norm": 0.36673613420740625, + "learning_rate": 0.0001565765207128805, + "loss": 0.6759, + "step": 206 + }, + { + "epoch": 0.3312, + "grad_norm": 0.3757664044110948, + "learning_rate": 0.0001561482950927029, + "loss": 0.7035, + "step": 207 + }, + { + "epoch": 0.3328, + "grad_norm": 0.39532812560975455, + "learning_rate": 0.00015571856046954285, + "loss": 0.7232, + "step": 208 + }, + { + "epoch": 0.3344, + "grad_norm": 0.40438099505972785, + "learning_rate": 0.00015528732839265272, + "loss": 0.7594, + "step": 209 + }, + { + "epoch": 0.336, + "grad_norm": 0.3419577876429092, + "learning_rate": 0.0001548546104515294, + "loss": 0.6602, + "step": 210 + }, + { + "epoch": 0.3376, + "grad_norm": 0.36246576871660047, + "learning_rate": 0.00015442041827560274, + "loss": 0.7293, + "step": 211 + }, + { + "epoch": 0.3392, + "grad_norm": 0.3588255354877175, + "learning_rate": 0.00015398476353392323, + "loss": 0.7594, + "step": 212 + }, + { + "epoch": 0.3408, + "grad_norm": 0.3901441159572394, + "learning_rate": 0.00015354765793484834, + "loss": 0.7491, + "step": 213 + }, + { + "epoch": 0.3424, + "grad_norm": 0.35719437308085233, + "learning_rate": 0.00015310911322572753, + "loss": 0.6476, + "step": 214 + }, + { + "epoch": 0.344, + "grad_norm": 0.364700995492638, + "learning_rate": 0.000152669141192587, + "loss": 0.7183, + "step": 215 + }, + { + "epoch": 0.3456, + "grad_norm": 0.3784135273383183, + "learning_rate": 0.00015222775365981273, + "loss": 0.7033, + "step": 216 + }, + { + "epoch": 0.3472, + "grad_norm": 0.3749000315840051, + "learning_rate": 0.00015178496248983254, + "loss": 0.7007, + "step": 217 + }, + { + "epoch": 0.3488, + "grad_norm": 0.38800053532610096, + "learning_rate": 0.00015134077958279765, + "loss": 0.7138, + "step": 218 + }, + { + "epoch": 0.3504, + "grad_norm": 0.3418098446663238, + "learning_rate": 0.00015089521687626243, + "loss": 0.6944, + "step": 219 + }, + { + "epoch": 0.352, + "grad_norm": 0.38111268593685843, + "learning_rate": 0.000150448286344864, + "loss": 0.6762, + "step": 220 + }, + { + "epoch": 0.3536, + "grad_norm": 0.3710500845514271, + "learning_rate": 0.00015000000000000001, + "loss": 0.6591, + "step": 221 + }, + { + "epoch": 0.3552, + "grad_norm": 0.37441467488393976, + "learning_rate": 0.00014955036988950618, + "loss": 0.7269, + "step": 222 + }, + { + "epoch": 0.3568, + "grad_norm": 0.39654706299414744, + "learning_rate": 0.00014909940809733222, + "loss": 0.7242, + "step": 223 + }, + { + "epoch": 0.3584, + "grad_norm": 0.3568180401154739, + "learning_rate": 0.00014864712674321734, + "loss": 0.6911, + "step": 224 + }, + { + "epoch": 0.36, + "grad_norm": 0.39034288279985846, + "learning_rate": 0.00014819353798236427, + "loss": 0.7557, + "step": 225 + }, + { + "epoch": 0.3616, + "grad_norm": 0.34557279768441856, + "learning_rate": 0.00014773865400511272, + "loss": 0.7082, + "step": 226 + }, + { + "epoch": 0.3632, + "grad_norm": 0.35705484794926023, + "learning_rate": 0.00014728248703661182, + "loss": 0.7241, + "step": 227 + }, + { + "epoch": 0.3648, + "grad_norm": 0.35798628411050953, + "learning_rate": 0.00014682504933649144, + "loss": 0.6318, + "step": 228 + }, + { + "epoch": 0.3664, + "grad_norm": 0.36136379085752796, + "learning_rate": 0.00014636635319853275, + "loss": 0.6821, + "step": 229 + }, + { + "epoch": 0.368, + "grad_norm": 0.37618932987528225, + "learning_rate": 0.00014590641095033787, + "loss": 0.7047, + "step": 230 + }, + { + "epoch": 0.3696, + "grad_norm": 0.38816240044304784, + "learning_rate": 0.00014544523495299842, + "loss": 0.6785, + "step": 231 + }, + { + "epoch": 0.3712, + "grad_norm": 0.38762381736859697, + "learning_rate": 0.0001449828376007636, + "loss": 0.7498, + "step": 232 + }, + { + "epoch": 0.3728, + "grad_norm": 0.37991677053794676, + "learning_rate": 0.0001445192313207067, + "loss": 0.7418, + "step": 233 + }, + { + "epoch": 0.3744, + "grad_norm": 0.35899697440611383, + "learning_rate": 0.0001440544285723915, + "loss": 0.7145, + "step": 234 + }, + { + "epoch": 0.376, + "grad_norm": 0.3855068948533226, + "learning_rate": 0.00014358844184753712, + "loss": 0.8137, + "step": 235 + }, + { + "epoch": 0.3776, + "grad_norm": 0.3640340840565266, + "learning_rate": 0.00014312128366968243, + "loss": 0.687, + "step": 236 + }, + { + "epoch": 0.3792, + "grad_norm": 0.3813513523794655, + "learning_rate": 0.00014265296659384956, + "loss": 0.7193, + "step": 237 + }, + { + "epoch": 0.3808, + "grad_norm": 0.3884503295790077, + "learning_rate": 0.00014218350320620624, + "loss": 0.7276, + "step": 238 + }, + { + "epoch": 0.3824, + "grad_norm": 0.3591527170010043, + "learning_rate": 0.0001417129061237278, + "loss": 0.6579, + "step": 239 + }, + { + "epoch": 0.384, + "grad_norm": 0.34340461207851425, + "learning_rate": 0.00014124118799385796, + "loss": 0.6988, + "step": 240 + }, + { + "epoch": 0.3856, + "grad_norm": 0.3716142230619113, + "learning_rate": 0.00014076836149416887, + "loss": 0.6837, + "step": 241 + }, + { + "epoch": 0.3872, + "grad_norm": 0.3746943540983614, + "learning_rate": 0.0001402944393320206, + "loss": 0.7138, + "step": 242 + }, + { + "epoch": 0.3888, + "grad_norm": 0.6149858253919029, + "learning_rate": 0.00013981943424421932, + "loss": 0.6912, + "step": 243 + }, + { + "epoch": 0.3904, + "grad_norm": 0.3694845102751698, + "learning_rate": 0.00013934335899667527, + "loss": 0.6709, + "step": 244 + }, + { + "epoch": 0.392, + "grad_norm": 0.36790318218261714, + "learning_rate": 0.00013886622638405952, + "loss": 0.6941, + "step": 245 + }, + { + "epoch": 0.3936, + "grad_norm": 0.38696101358451285, + "learning_rate": 0.00013838804922946027, + "loss": 0.7065, + "step": 246 + }, + { + "epoch": 0.3952, + "grad_norm": 0.3626054966593918, + "learning_rate": 0.00013790884038403795, + "loss": 0.6974, + "step": 247 + }, + { + "epoch": 0.3968, + "grad_norm": 0.36810130417217946, + "learning_rate": 0.00013742861272668012, + "loss": 0.7059, + "step": 248 + }, + { + "epoch": 0.3984, + "grad_norm": 0.3624097418077036, + "learning_rate": 0.00013694737916365517, + "loss": 0.7229, + "step": 249 + }, + { + "epoch": 0.4, + "grad_norm": 0.34599108827247227, + "learning_rate": 0.00013646515262826552, + "loss": 0.7048, + "step": 250 + }, + { + "epoch": 0.4016, + "grad_norm": 0.3879058306480498, + "learning_rate": 0.0001359819460805001, + "loss": 0.7924, + "step": 251 + }, + { + "epoch": 0.4032, + "grad_norm": 0.34023836681670694, + "learning_rate": 0.0001354977725066859, + "loss": 0.7157, + "step": 252 + }, + { + "epoch": 0.4048, + "grad_norm": 0.3711063338350221, + "learning_rate": 0.00013501264491913906, + "loss": 0.7521, + "step": 253 + }, + { + "epoch": 0.4064, + "grad_norm": 0.36472198648465, + "learning_rate": 0.0001345265763558152, + "loss": 0.6572, + "step": 254 + }, + { + "epoch": 0.408, + "grad_norm": 0.3576711587325934, + "learning_rate": 0.00013403957987995882, + "loss": 0.6568, + "step": 255 + }, + { + "epoch": 0.4096, + "grad_norm": 0.37710702613417896, + "learning_rate": 0.0001335516685797525, + "loss": 0.7111, + "step": 256 + }, + { + "epoch": 0.4112, + "grad_norm": 0.3793009735708133, + "learning_rate": 0.00013306285556796495, + "loss": 0.7324, + "step": 257 + }, + { + "epoch": 0.4128, + "grad_norm": 0.3728235770012968, + "learning_rate": 0.00013257315398159864, + "loss": 0.7258, + "step": 258 + }, + { + "epoch": 0.4144, + "grad_norm": 0.41141037106287126, + "learning_rate": 0.00013208257698153677, + "loss": 0.7106, + "step": 259 + }, + { + "epoch": 0.416, + "grad_norm": 0.3686623259505453, + "learning_rate": 0.00013159113775218964, + "loss": 0.6813, + "step": 260 + }, + { + "epoch": 0.4176, + "grad_norm": 0.3520653468975216, + "learning_rate": 0.00013109884950114007, + "loss": 0.6652, + "step": 261 + }, + { + "epoch": 0.4192, + "grad_norm": 0.37518104968705895, + "learning_rate": 0.00013060572545878875, + "loss": 0.7043, + "step": 262 + }, + { + "epoch": 0.4208, + "grad_norm": 0.3663522013676943, + "learning_rate": 0.00013011177887799845, + "loss": 0.695, + "step": 263 + }, + { + "epoch": 0.4224, + "grad_norm": 0.350137374488117, + "learning_rate": 0.00012961702303373795, + "loss": 0.7243, + "step": 264 + }, + { + "epoch": 0.424, + "grad_norm": 0.3600436082512767, + "learning_rate": 0.00012912147122272523, + "loss": 0.7378, + "step": 265 + }, + { + "epoch": 0.4256, + "grad_norm": 0.368533068906102, + "learning_rate": 0.00012862513676307008, + "loss": 0.7028, + "step": 266 + }, + { + "epoch": 0.4272, + "grad_norm": 0.36198729400227725, + "learning_rate": 0.00012812803299391628, + "loss": 0.6707, + "step": 267 + }, + { + "epoch": 0.4288, + "grad_norm": 0.357610238818022, + "learning_rate": 0.00012763017327508305, + "loss": 0.6861, + "step": 268 + }, + { + "epoch": 0.4304, + "grad_norm": 0.35665899603087337, + "learning_rate": 0.0001271315709867059, + "loss": 0.6905, + "step": 269 + }, + { + "epoch": 0.432, + "grad_norm": 0.3595715382711654, + "learning_rate": 0.00012663223952887723, + "loss": 0.7023, + "step": 270 + }, + { + "epoch": 0.4336, + "grad_norm": 0.3339131480094967, + "learning_rate": 0.00012613219232128608, + "loss": 0.6807, + "step": 271 + }, + { + "epoch": 0.4352, + "grad_norm": 0.35554457901474196, + "learning_rate": 0.00012563144280285741, + "loss": 0.6922, + "step": 272 + }, + { + "epoch": 0.4368, + "grad_norm": 0.3461905884208761, + "learning_rate": 0.00012513000443139112, + "loss": 0.7075, + "step": 273 + }, + { + "epoch": 0.4384, + "grad_norm": 0.35885451438850346, + "learning_rate": 0.00012462789068320017, + "loss": 0.7204, + "step": 274 + }, + { + "epoch": 0.44, + "grad_norm": 0.3787323699405888, + "learning_rate": 0.00012412511505274844, + "loss": 0.6887, + "step": 275 + }, + { + "epoch": 0.4416, + "grad_norm": 0.34410493352577043, + "learning_rate": 0.00012362169105228826, + "loss": 0.6943, + "step": 276 + }, + { + "epoch": 0.4432, + "grad_norm": 0.35672201920563706, + "learning_rate": 0.000123117632211497, + "loss": 0.6677, + "step": 277 + }, + { + "epoch": 0.4448, + "grad_norm": 0.35647172367608165, + "learning_rate": 0.00012261295207711346, + "loss": 0.6617, + "step": 278 + }, + { + "epoch": 0.4464, + "grad_norm": 0.34872758802754317, + "learning_rate": 0.0001221076642125742, + "loss": 0.6522, + "step": 279 + }, + { + "epoch": 0.448, + "grad_norm": 0.3488974339691185, + "learning_rate": 0.00012160178219764837, + "loss": 0.7242, + "step": 280 + }, + { + "epoch": 0.4496, + "grad_norm": 0.35533726973414664, + "learning_rate": 0.00012109531962807332, + "loss": 0.7126, + "step": 281 + }, + { + "epoch": 0.4512, + "grad_norm": 0.3665197508720917, + "learning_rate": 0.00012058829011518896, + "loss": 0.7104, + "step": 282 + }, + { + "epoch": 0.4528, + "grad_norm": 0.3901613865642856, + "learning_rate": 0.00012008070728557186, + "loss": 0.7022, + "step": 283 + }, + { + "epoch": 0.4544, + "grad_norm": 0.3680636194883015, + "learning_rate": 0.00011957258478066931, + "loss": 0.6206, + "step": 284 + }, + { + "epoch": 0.456, + "grad_norm": 0.36205230764978635, + "learning_rate": 0.00011906393625643244, + "loss": 0.6846, + "step": 285 + }, + { + "epoch": 0.4576, + "grad_norm": 0.3700102223107961, + "learning_rate": 0.00011855477538294935, + "loss": 0.6834, + "step": 286 + }, + { + "epoch": 0.4592, + "grad_norm": 0.3707852248575377, + "learning_rate": 0.00011804511584407763, + "loss": 0.73, + "step": 287 + }, + { + "epoch": 0.4608, + "grad_norm": 0.3328396952074698, + "learning_rate": 0.00011753497133707679, + "loss": 0.6326, + "step": 288 + }, + { + "epoch": 0.4624, + "grad_norm": 0.3647275618298019, + "learning_rate": 0.00011702435557223987, + "loss": 0.6903, + "step": 289 + }, + { + "epoch": 0.464, + "grad_norm": 0.37522903402817953, + "learning_rate": 0.00011651328227252517, + "loss": 0.7148, + "step": 290 + }, + { + "epoch": 0.4656, + "grad_norm": 0.3459570022247, + "learning_rate": 0.00011600176517318741, + "loss": 0.6901, + "step": 291 + }, + { + "epoch": 0.4672, + "grad_norm": 0.3453472634165083, + "learning_rate": 0.00011548981802140848, + "loss": 0.6541, + "step": 292 + }, + { + "epoch": 0.4688, + "grad_norm": 0.46858167069260703, + "learning_rate": 0.00011497745457592816, + "loss": 0.6474, + "step": 293 + }, + { + "epoch": 0.4704, + "grad_norm": 0.3425009900081892, + "learning_rate": 0.00011446468860667421, + "loss": 0.6787, + "step": 294 + }, + { + "epoch": 0.472, + "grad_norm": 0.3408371432957332, + "learning_rate": 0.00011395153389439233, + "loss": 0.6421, + "step": 295 + }, + { + "epoch": 0.4736, + "grad_norm": 0.34309203175466485, + "learning_rate": 0.00011343800423027582, + "loss": 0.6605, + "step": 296 + }, + { + "epoch": 0.4752, + "grad_norm": 0.3366472364937924, + "learning_rate": 0.0001129241134155949, + "loss": 0.6766, + "step": 297 + }, + { + "epoch": 0.4768, + "grad_norm": 0.3675673823200964, + "learning_rate": 0.00011240987526132594, + "loss": 0.716, + "step": 298 + }, + { + "epoch": 0.4784, + "grad_norm": 0.3624885760394247, + "learning_rate": 0.00011189530358778005, + "loss": 0.6831, + "step": 299 + }, + { + "epoch": 0.48, + "grad_norm": 0.34485269464743384, + "learning_rate": 0.00011138041222423177, + "loss": 0.6894, + "step": 300 + }, + { + "epoch": 0.4816, + "grad_norm": 0.36363641721437323, + "learning_rate": 0.00011086521500854745, + "loss": 0.7037, + "step": 301 + }, + { + "epoch": 0.4832, + "grad_norm": 0.3804302820151281, + "learning_rate": 0.00011034972578681338, + "loss": 0.7362, + "step": 302 + }, + { + "epoch": 0.4848, + "grad_norm": 0.34617907037291995, + "learning_rate": 0.00010983395841296348, + "loss": 0.6751, + "step": 303 + }, + { + "epoch": 0.4864, + "grad_norm": 0.38976971548297357, + "learning_rate": 0.00010931792674840718, + "loss": 0.6566, + "step": 304 + }, + { + "epoch": 0.488, + "grad_norm": 0.35106778892349916, + "learning_rate": 0.00010880164466165674, + "loss": 0.6725, + "step": 305 + }, + { + "epoch": 0.4896, + "grad_norm": 0.3514557744429845, + "learning_rate": 0.00010828512602795462, + "loss": 0.656, + "step": 306 + }, + { + "epoch": 0.4912, + "grad_norm": 0.3546450466677947, + "learning_rate": 0.00010776838472890065, + "loss": 0.6563, + "step": 307 + }, + { + "epoch": 0.4928, + "grad_norm": 0.34723925989473264, + "learning_rate": 0.00010725143465207867, + "loss": 0.6996, + "step": 308 + }, + { + "epoch": 0.4944, + "grad_norm": 0.3285706810054756, + "learning_rate": 0.00010673428969068364, + "loss": 0.6848, + "step": 309 + }, + { + "epoch": 0.496, + "grad_norm": 0.339283492277106, + "learning_rate": 0.00010621696374314807, + "loss": 0.6901, + "step": 310 + }, + { + "epoch": 0.4976, + "grad_norm": 0.33349436629549434, + "learning_rate": 0.00010569947071276847, + "loss": 0.6477, + "step": 311 + }, + { + "epoch": 0.4992, + "grad_norm": 0.366636642059819, + "learning_rate": 0.00010518182450733186, + "loss": 0.6929, + "step": 312 + }, + { + "epoch": 0.5008, + "grad_norm": 0.34361329665130813, + "learning_rate": 0.00010466403903874176, + "loss": 0.6612, + "step": 313 + }, + { + "epoch": 0.5024, + "grad_norm": 0.33892463526483174, + "learning_rate": 0.00010414612822264455, + "loss": 0.6741, + "step": 314 + }, + { + "epoch": 0.504, + "grad_norm": 0.34973460117012567, + "learning_rate": 0.00010362810597805526, + "loss": 0.601, + "step": 315 + }, + { + "epoch": 0.5056, + "grad_norm": 0.3565085399782329, + "learning_rate": 0.0001031099862269837, + "loss": 0.6666, + "step": 316 + }, + { + "epoch": 0.5072, + "grad_norm": 0.3418140611458426, + "learning_rate": 0.00010259178289406011, + "loss": 0.6594, + "step": 317 + }, + { + "epoch": 0.5088, + "grad_norm": 0.37195336389240935, + "learning_rate": 0.00010207350990616107, + "loss": 0.6845, + "step": 318 + }, + { + "epoch": 0.5104, + "grad_norm": 0.35110530974078485, + "learning_rate": 0.0001015551811920351, + "loss": 0.6592, + "step": 319 + }, + { + "epoch": 0.512, + "grad_norm": 0.36489174363833043, + "learning_rate": 0.00010103681068192845, + "loss": 0.6779, + "step": 320 + }, + { + "epoch": 0.5136, + "grad_norm": 0.4171223297688789, + "learning_rate": 0.00010051841230721065, + "loss": 0.6283, + "step": 321 + }, + { + "epoch": 0.5152, + "grad_norm": 0.37413927100291494, + "learning_rate": 0.0001, + "loss": 0.6831, + "step": 322 + }, + { + "epoch": 0.5168, + "grad_norm": 0.3528735973381612, + "learning_rate": 9.948158769278939e-05, + "loss": 0.6644, + "step": 323 + }, + { + "epoch": 0.5184, + "grad_norm": 0.3745336489378991, + "learning_rate": 9.896318931807155e-05, + "loss": 0.6503, + "step": 324 + }, + { + "epoch": 0.52, + "grad_norm": 0.37275093455288594, + "learning_rate": 9.844481880796491e-05, + "loss": 0.6984, + "step": 325 + }, + { + "epoch": 0.5216, + "grad_norm": 0.3366978224997515, + "learning_rate": 9.792649009383899e-05, + "loss": 0.667, + "step": 326 + }, + { + "epoch": 0.5232, + "grad_norm": 0.3564685656104209, + "learning_rate": 9.740821710593989e-05, + "loss": 0.6582, + "step": 327 + }, + { + "epoch": 0.5248, + "grad_norm": 0.35941780232957743, + "learning_rate": 9.689001377301633e-05, + "loss": 0.6493, + "step": 328 + }, + { + "epoch": 0.5264, + "grad_norm": 0.36411996751195624, + "learning_rate": 9.637189402194476e-05, + "loss": 0.6512, + "step": 329 + }, + { + "epoch": 0.528, + "grad_norm": 0.35889993632067707, + "learning_rate": 9.585387177735547e-05, + "loss": 0.6817, + "step": 330 + }, + { + "epoch": 0.5296, + "grad_norm": 0.3413373464186135, + "learning_rate": 9.533596096125825e-05, + "loss": 0.6057, + "step": 331 + }, + { + "epoch": 0.5312, + "grad_norm": 0.3371052874072895, + "learning_rate": 9.481817549266817e-05, + "loss": 0.6546, + "step": 332 + }, + { + "epoch": 0.5328, + "grad_norm": 0.34015871442448126, + "learning_rate": 9.430052928723153e-05, + "loss": 0.7548, + "step": 333 + }, + { + "epoch": 0.5344, + "grad_norm": 0.34768381176226404, + "learning_rate": 9.378303625685195e-05, + "loss": 0.6249, + "step": 334 + }, + { + "epoch": 0.536, + "grad_norm": 0.35055208678570654, + "learning_rate": 9.326571030931637e-05, + "loss": 0.6609, + "step": 335 + }, + { + "epoch": 0.5376, + "grad_norm": 0.3696407680595258, + "learning_rate": 9.274856534792138e-05, + "loss": 0.7256, + "step": 336 + }, + { + "epoch": 0.5392, + "grad_norm": 0.3443399204481053, + "learning_rate": 9.223161527109937e-05, + "loss": 0.6692, + "step": 337 + }, + { + "epoch": 0.5408, + "grad_norm": 0.36476030873795856, + "learning_rate": 9.171487397204539e-05, + "loss": 0.6694, + "step": 338 + }, + { + "epoch": 0.5424, + "grad_norm": 0.3554648289200159, + "learning_rate": 9.119835533834331e-05, + "loss": 0.6894, + "step": 339 + }, + { + "epoch": 0.544, + "grad_norm": 0.3434710885105751, + "learning_rate": 9.068207325159284e-05, + "loss": 0.7328, + "step": 340 + }, + { + "epoch": 0.5456, + "grad_norm": 0.3503197379585682, + "learning_rate": 9.016604158703654e-05, + "loss": 0.6687, + "step": 341 + }, + { + "epoch": 0.5472, + "grad_norm": 0.34374188690838525, + "learning_rate": 8.965027421318665e-05, + "loss": 0.6637, + "step": 342 + }, + { + "epoch": 0.5488, + "grad_norm": 0.34634204748549424, + "learning_rate": 8.913478499145254e-05, + "loss": 0.6231, + "step": 343 + }, + { + "epoch": 0.5504, + "grad_norm": 0.3448828690040321, + "learning_rate": 8.861958777576827e-05, + "loss": 0.6761, + "step": 344 + }, + { + "epoch": 0.552, + "grad_norm": 0.32290933438854713, + "learning_rate": 8.810469641222001e-05, + "loss": 0.619, + "step": 345 + }, + { + "epoch": 0.5536, + "grad_norm": 0.34523451534574523, + "learning_rate": 8.759012473867407e-05, + "loss": 0.6732, + "step": 346 + }, + { + "epoch": 0.5552, + "grad_norm": 0.3396007317968865, + "learning_rate": 8.707588658440511e-05, + "loss": 0.6939, + "step": 347 + }, + { + "epoch": 0.5568, + "grad_norm": 0.3400300609137072, + "learning_rate": 8.656199576972423e-05, + "loss": 0.6334, + "step": 348 + }, + { + "epoch": 0.5584, + "grad_norm": 0.38617639543364063, + "learning_rate": 8.604846610560771e-05, + "loss": 0.71, + "step": 349 + }, + { + "epoch": 0.56, + "grad_norm": 0.3449820516474646, + "learning_rate": 8.553531139332582e-05, + "loss": 0.6525, + "step": 350 + }, + { + "epoch": 0.5616, + "grad_norm": 0.3736688125800989, + "learning_rate": 8.502254542407186e-05, + "loss": 0.6632, + "step": 351 + }, + { + "epoch": 0.5632, + "grad_norm": 0.33245275428669596, + "learning_rate": 8.451018197859153e-05, + "loss": 0.6842, + "step": 352 + }, + { + "epoch": 0.5648, + "grad_norm": 0.350824463531681, + "learning_rate": 8.399823482681262e-05, + "loss": 0.6891, + "step": 353 + }, + { + "epoch": 0.5664, + "grad_norm": 0.3769679328839366, + "learning_rate": 8.348671772747487e-05, + "loss": 0.6448, + "step": 354 + }, + { + "epoch": 0.568, + "grad_norm": 0.3950956861477466, + "learning_rate": 8.297564442776014e-05, + "loss": 0.7028, + "step": 355 + }, + { + "epoch": 0.5696, + "grad_norm": 0.3769715248178555, + "learning_rate": 8.246502866292324e-05, + "loss": 0.7524, + "step": 356 + }, + { + "epoch": 0.5712, + "grad_norm": 0.3550864279693727, + "learning_rate": 8.195488415592238e-05, + "loss": 0.6956, + "step": 357 + }, + { + "epoch": 0.5728, + "grad_norm": 0.45579727504565914, + "learning_rate": 8.144522461705067e-05, + "loss": 0.6863, + "step": 358 + }, + { + "epoch": 0.5744, + "grad_norm": 0.33662953960341585, + "learning_rate": 8.093606374356759e-05, + "loss": 0.7046, + "step": 359 + }, + { + "epoch": 0.576, + "grad_norm": 0.34022215506401343, + "learning_rate": 8.042741521933071e-05, + "loss": 0.6133, + "step": 360 + }, + { + "epoch": 0.5776, + "grad_norm": 0.32991494219957185, + "learning_rate": 7.991929271442817e-05, + "loss": 0.6001, + "step": 361 + }, + { + "epoch": 0.5792, + "grad_norm": 0.37097329614451063, + "learning_rate": 7.941170988481108e-05, + "loss": 0.6345, + "step": 362 + }, + { + "epoch": 0.5808, + "grad_norm": 0.3515701273892897, + "learning_rate": 7.89046803719267e-05, + "loss": 0.6209, + "step": 363 + }, + { + "epoch": 0.5824, + "grad_norm": 0.3390859701302974, + "learning_rate": 7.839821780235168e-05, + "loss": 0.6725, + "step": 364 + }, + { + "epoch": 0.584, + "grad_norm": 0.3261614696438565, + "learning_rate": 7.789233578742582e-05, + "loss": 0.6474, + "step": 365 + }, + { + "epoch": 0.5856, + "grad_norm": 0.35231998430218436, + "learning_rate": 7.738704792288655e-05, + "loss": 0.6356, + "step": 366 + }, + { + "epoch": 0.5872, + "grad_norm": 0.34487852889553167, + "learning_rate": 7.688236778850306e-05, + "loss": 0.6755, + "step": 367 + }, + { + "epoch": 0.5888, + "grad_norm": 0.3597076481941233, + "learning_rate": 7.637830894771175e-05, + "loss": 0.7348, + "step": 368 + }, + { + "epoch": 0.5904, + "grad_norm": 0.3535979886913036, + "learning_rate": 7.587488494725157e-05, + "loss": 0.6586, + "step": 369 + }, + { + "epoch": 0.592, + "grad_norm": 0.34056561461249657, + "learning_rate": 7.537210931679987e-05, + "loss": 0.6566, + "step": 370 + }, + { + "epoch": 0.5936, + "grad_norm": 0.3313840825869037, + "learning_rate": 7.48699955686089e-05, + "loss": 0.6265, + "step": 371 + }, + { + "epoch": 0.5952, + "grad_norm": 0.356858938714993, + "learning_rate": 7.43685571971426e-05, + "loss": 0.6765, + "step": 372 + }, + { + "epoch": 0.5968, + "grad_norm": 0.36345182693995554, + "learning_rate": 7.386780767871397e-05, + "loss": 0.6972, + "step": 373 + }, + { + "epoch": 0.5984, + "grad_norm": 0.3600146037302109, + "learning_rate": 7.336776047112276e-05, + "loss": 0.6791, + "step": 374 + }, + { + "epoch": 0.6, + "grad_norm": 0.34190431713398445, + "learning_rate": 7.286842901329412e-05, + "loss": 0.6731, + "step": 375 + }, + { + "epoch": 0.6016, + "grad_norm": 0.3970175003007231, + "learning_rate": 7.236982672491698e-05, + "loss": 0.5951, + "step": 376 + }, + { + "epoch": 0.6032, + "grad_norm": 0.34878533658916844, + "learning_rate": 7.187196700608373e-05, + "loss": 0.6761, + "step": 377 + }, + { + "epoch": 0.6048, + "grad_norm": 0.35184500997088136, + "learning_rate": 7.137486323692995e-05, + "loss": 0.6261, + "step": 378 + }, + { + "epoch": 0.6064, + "grad_norm": 0.3524872693988646, + "learning_rate": 7.087852877727481e-05, + "loss": 0.6783, + "step": 379 + }, + { + "epoch": 0.608, + "grad_norm": 0.3349970649879647, + "learning_rate": 7.038297696626206e-05, + "loss": 0.64, + "step": 380 + }, + { + "epoch": 0.6096, + "grad_norm": 0.34561197102116836, + "learning_rate": 6.988822112200156e-05, + "loss": 0.6634, + "step": 381 + }, + { + "epoch": 0.6112, + "grad_norm": 0.34109755923127466, + "learning_rate": 6.939427454121128e-05, + "loss": 0.6392, + "step": 382 + }, + { + "epoch": 0.6128, + "grad_norm": 0.33306188510296875, + "learning_rate": 6.890115049885994e-05, + "loss": 0.6143, + "step": 383 + }, + { + "epoch": 0.6144, + "grad_norm": 0.3236879938329974, + "learning_rate": 6.84088622478104e-05, + "loss": 0.6788, + "step": 384 + }, + { + "epoch": 0.616, + "grad_norm": 0.34056717019759014, + "learning_rate": 6.791742301846326e-05, + "loss": 0.6898, + "step": 385 + }, + { + "epoch": 0.6176, + "grad_norm": 0.4159978517020688, + "learning_rate": 6.742684601840141e-05, + "loss": 0.696, + "step": 386 + }, + { + "epoch": 0.6192, + "grad_norm": 0.3364273904078727, + "learning_rate": 6.693714443203507e-05, + "loss": 0.6492, + "step": 387 + }, + { + "epoch": 0.6208, + "grad_norm": 0.34813484667820843, + "learning_rate": 6.644833142024751e-05, + "loss": 0.6728, + "step": 388 + }, + { + "epoch": 0.6224, + "grad_norm": 0.3573723704774501, + "learning_rate": 6.59604201200412e-05, + "loss": 0.6136, + "step": 389 + }, + { + "epoch": 0.624, + "grad_norm": 0.34399138714064675, + "learning_rate": 6.547342364418481e-05, + "loss": 0.6621, + "step": 390 + }, + { + "epoch": 0.6256, + "grad_norm": 0.3847135119237165, + "learning_rate": 6.498735508086093e-05, + "loss": 0.6525, + "step": 391 + }, + { + "epoch": 0.6272, + "grad_norm": 0.32113437688191543, + "learning_rate": 6.450222749331414e-05, + "loss": 0.6312, + "step": 392 + }, + { + "epoch": 0.6288, + "grad_norm": 0.3241685066678636, + "learning_rate": 6.40180539194999e-05, + "loss": 0.6244, + "step": 393 + }, + { + "epoch": 0.6304, + "grad_norm": 0.3311499374935771, + "learning_rate": 6.35348473717345e-05, + "loss": 0.6085, + "step": 394 + }, + { + "epoch": 0.632, + "grad_norm": 0.33061122626387485, + "learning_rate": 6.305262083634488e-05, + "loss": 0.65, + "step": 395 + }, + { + "epoch": 0.6336, + "grad_norm": 0.3446348528234487, + "learning_rate": 6.25713872733199e-05, + "loss": 0.643, + "step": 396 + }, + { + "epoch": 0.6352, + "grad_norm": 0.3573472270681029, + "learning_rate": 6.209115961596208e-05, + "loss": 0.703, + "step": 397 + }, + { + "epoch": 0.6368, + "grad_norm": 0.35255691408742473, + "learning_rate": 6.161195077053976e-05, + "loss": 0.679, + "step": 398 + }, + { + "epoch": 0.6384, + "grad_norm": 0.3603893140466124, + "learning_rate": 6.113377361594049e-05, + "loss": 0.6553, + "step": 399 + }, + { + "epoch": 0.64, + "grad_norm": 0.38622469068407206, + "learning_rate": 6.065664100332478e-05, + "loss": 0.6372, + "step": 400 + }, + { + "epoch": 0.6416, + "grad_norm": 0.36843820195201377, + "learning_rate": 6.018056575578075e-05, + "loss": 0.6775, + "step": 401 + }, + { + "epoch": 0.6432, + "grad_norm": 0.33775042010116374, + "learning_rate": 5.970556066797941e-05, + "loss": 0.6271, + "step": 402 + }, + { + "epoch": 0.6448, + "grad_norm": 0.3503067124724184, + "learning_rate": 5.923163850583113e-05, + "loss": 0.6354, + "step": 403 + }, + { + "epoch": 0.6464, + "grad_norm": 0.36004712918213044, + "learning_rate": 5.875881200614207e-05, + "loss": 0.6682, + "step": 404 + }, + { + "epoch": 0.648, + "grad_norm": 0.35939821897442975, + "learning_rate": 5.828709387627218e-05, + "loss": 0.6332, + "step": 405 + }, + { + "epoch": 0.6496, + "grad_norm": 0.3618911935669541, + "learning_rate": 5.781649679379378e-05, + "loss": 0.6593, + "step": 406 + }, + { + "epoch": 0.6512, + "grad_norm": 0.33854288006772076, + "learning_rate": 5.73470334061505e-05, + "loss": 0.6748, + "step": 407 + }, + { + "epoch": 0.6528, + "grad_norm": 0.35168033639419904, + "learning_rate": 5.687871633031754e-05, + "loss": 0.6682, + "step": 408 + }, + { + "epoch": 0.6544, + "grad_norm": 0.3299805820327915, + "learning_rate": 5.6411558152462894e-05, + "loss": 0.6552, + "step": 409 + }, + { + "epoch": 0.656, + "grad_norm": 0.33869510493371086, + "learning_rate": 5.5945571427608526e-05, + "loss": 0.69, + "step": 410 + }, + { + "epoch": 0.6576, + "grad_norm": 0.34602955410892206, + "learning_rate": 5.54807686792933e-05, + "loss": 0.6222, + "step": 411 + }, + { + "epoch": 0.6592, + "grad_norm": 0.3516653306140801, + "learning_rate": 5.501716239923642e-05, + "loss": 0.6362, + "step": 412 + }, + { + "epoch": 0.6608, + "grad_norm": 0.3477412232753374, + "learning_rate": 5.4554765047001613e-05, + "loss": 0.7016, + "step": 413 + }, + { + "epoch": 0.6624, + "grad_norm": 0.3436682577532841, + "learning_rate": 5.4093589049662175e-05, + "loss": 0.6582, + "step": 414 + }, + { + "epoch": 0.664, + "grad_norm": 0.39319811292136175, + "learning_rate": 5.363364680146725e-05, + "loss": 0.6778, + "step": 415 + }, + { + "epoch": 0.6656, + "grad_norm": 0.3209059243373577, + "learning_rate": 5.31749506635086e-05, + "loss": 0.5812, + "step": 416 + }, + { + "epoch": 0.6672, + "grad_norm": 0.3306336586741872, + "learning_rate": 5.271751296338823e-05, + "loss": 0.6614, + "step": 417 + }, + { + "epoch": 0.6688, + "grad_norm": 0.33213736652811277, + "learning_rate": 5.226134599488728e-05, + "loss": 0.6687, + "step": 418 + }, + { + "epoch": 0.6704, + "grad_norm": 0.42351662500126236, + "learning_rate": 5.180646201763577e-05, + "loss": 0.6479, + "step": 419 + }, + { + "epoch": 0.672, + "grad_norm": 0.3459766680283071, + "learning_rate": 5.135287325678271e-05, + "loss": 0.7437, + "step": 420 + }, + { + "epoch": 0.6736, + "grad_norm": 0.3389725535499289, + "learning_rate": 5.090059190266779e-05, + "loss": 0.6837, + "step": 421 + }, + { + "epoch": 0.6752, + "grad_norm": 0.3325393445610534, + "learning_rate": 5.0449630110493836e-05, + "loss": 0.6647, + "step": 422 + }, + { + "epoch": 0.6768, + "grad_norm": 0.3386095660421953, + "learning_rate": 5.000000000000002e-05, + "loss": 0.6233, + "step": 423 + }, + { + "epoch": 0.6784, + "grad_norm": 0.3370241727957124, + "learning_rate": 4.955171365513603e-05, + "loss": 0.6342, + "step": 424 + }, + { + "epoch": 0.68, + "grad_norm": 0.3535099702320059, + "learning_rate": 4.9104783123737566e-05, + "loss": 0.6553, + "step": 425 + }, + { + "epoch": 0.6816, + "grad_norm": 0.3740969238615925, + "learning_rate": 4.865922041720239e-05, + "loss": 0.6769, + "step": 426 + }, + { + "epoch": 0.6832, + "grad_norm": 0.33691852823168594, + "learning_rate": 4.821503751016746e-05, + "loss": 0.6591, + "step": 427 + }, + { + "epoch": 0.6848, + "grad_norm": 0.3625396658837189, + "learning_rate": 4.777224634018732e-05, + "loss": 0.6234, + "step": 428 + }, + { + "epoch": 0.6864, + "grad_norm": 0.3625049876543691, + "learning_rate": 4.733085880741301e-05, + "loss": 0.7037, + "step": 429 + }, + { + "epoch": 0.688, + "grad_norm": 0.35212752090111005, + "learning_rate": 4.689088677427249e-05, + "loss": 0.6319, + "step": 430 + }, + { + "epoch": 0.6896, + "grad_norm": 0.3625382805725208, + "learning_rate": 4.645234206515171e-05, + "loss": 0.6591, + "step": 431 + }, + { + "epoch": 0.6912, + "grad_norm": 0.3456657296675217, + "learning_rate": 4.6015236466076747e-05, + "loss": 0.6479, + "step": 432 + }, + { + "epoch": 0.6928, + "grad_norm": 0.36142618618576305, + "learning_rate": 4.5579581724397255e-05, + "loss": 0.672, + "step": 433 + }, + { + "epoch": 0.6944, + "grad_norm": 0.36438136146189665, + "learning_rate": 4.514538954847064e-05, + "loss": 0.7145, + "step": 434 + }, + { + "epoch": 0.696, + "grad_norm": 0.3476979241199332, + "learning_rate": 4.471267160734731e-05, + "loss": 0.6801, + "step": 435 + }, + { + "epoch": 0.6976, + "grad_norm": 0.3490731264105313, + "learning_rate": 4.428143953045717e-05, + "loss": 0.6975, + "step": 436 + }, + { + "epoch": 0.6992, + "grad_norm": 0.35502946241086925, + "learning_rate": 4.385170490729712e-05, + "loss": 0.6424, + "step": 437 + }, + { + "epoch": 0.7008, + "grad_norm": 0.333526847372875, + "learning_rate": 4.342347928711953e-05, + "loss": 0.6618, + "step": 438 + }, + { + "epoch": 0.7024, + "grad_norm": 0.33168275158650823, + "learning_rate": 4.2996774178621736e-05, + "loss": 0.6287, + "step": 439 + }, + { + "epoch": 0.704, + "grad_norm": 0.3612765677677876, + "learning_rate": 4.257160104963696e-05, + "loss": 0.7079, + "step": 440 + }, + { + "epoch": 0.7056, + "grad_norm": 0.3524554525972209, + "learning_rate": 4.2147971326825966e-05, + "loss": 0.6378, + "step": 441 + }, + { + "epoch": 0.7072, + "grad_norm": 0.369226952481241, + "learning_rate": 4.172589639536991e-05, + "loss": 0.6805, + "step": 442 + }, + { + "epoch": 0.7088, + "grad_norm": 0.3452349944201978, + "learning_rate": 4.130538759866457e-05, + "loss": 0.6882, + "step": 443 + }, + { + "epoch": 0.7104, + "grad_norm": 0.34405536883787835, + "learning_rate": 4.088645623801534e-05, + "loss": 0.6434, + "step": 444 + }, + { + "epoch": 0.712, + "grad_norm": 0.33682862695340515, + "learning_rate": 4.046911357233343e-05, + "loss": 0.6393, + "step": 445 + }, + { + "epoch": 0.7136, + "grad_norm": 0.32084200913186556, + "learning_rate": 4.00533708178334e-05, + "loss": 0.6707, + "step": 446 + }, + { + "epoch": 0.7152, + "grad_norm": 0.3230728754454013, + "learning_rate": 3.963923914773187e-05, + "loss": 0.57, + "step": 447 + }, + { + "epoch": 0.7168, + "grad_norm": 0.33415857941196947, + "learning_rate": 3.922672969194686e-05, + "loss": 0.6381, + "step": 448 + }, + { + "epoch": 0.7184, + "grad_norm": 0.3325144894494768, + "learning_rate": 3.8815853536798904e-05, + "loss": 0.5948, + "step": 449 + }, + { + "epoch": 0.72, + "grad_norm": 0.3384324416507604, + "learning_rate": 3.840662172471315e-05, + "loss": 0.6118, + "step": 450 + }, + { + "epoch": 0.7216, + "grad_norm": 0.35864162638588704, + "learning_rate": 3.79990452539225e-05, + "loss": 0.6658, + "step": 451 + }, + { + "epoch": 0.7232, + "grad_norm": 0.35109161421023133, + "learning_rate": 3.759313507817196e-05, + "loss": 0.6789, + "step": 452 + }, + { + "epoch": 0.7248, + "grad_norm": 0.3503657334157009, + "learning_rate": 3.7188902106424416e-05, + "loss": 0.6615, + "step": 453 + }, + { + "epoch": 0.7264, + "grad_norm": 0.34176881292945643, + "learning_rate": 3.678635720256737e-05, + "loss": 0.559, + "step": 454 + }, + { + "epoch": 0.728, + "grad_norm": 0.3695571222094076, + "learning_rate": 3.638551118512089e-05, + "loss": 0.6665, + "step": 455 + }, + { + "epoch": 0.7296, + "grad_norm": 0.35129314234943027, + "learning_rate": 3.5986374826947066e-05, + "loss": 0.6937, + "step": 456 + }, + { + "epoch": 0.7312, + "grad_norm": 0.3232157480221569, + "learning_rate": 3.558895885496023e-05, + "loss": 0.6354, + "step": 457 + }, + { + "epoch": 0.7328, + "grad_norm": 0.4373189259156228, + "learning_rate": 3.519327394983888e-05, + "loss": 0.6597, + "step": 458 + }, + { + "epoch": 0.7344, + "grad_norm": 0.3536064139293133, + "learning_rate": 3.479933074573858e-05, + "loss": 0.7012, + "step": 459 + }, + { + "epoch": 0.736, + "grad_norm": 0.35182109246364857, + "learning_rate": 3.440713983000601e-05, + "loss": 0.7255, + "step": 460 + }, + { + "epoch": 0.7376, + "grad_norm": 0.3357727040183112, + "learning_rate": 3.401671174289469e-05, + "loss": 0.6427, + "step": 461 + }, + { + "epoch": 0.7392, + "grad_norm": 0.31551559137728413, + "learning_rate": 3.362805697728145e-05, + "loss": 0.5961, + "step": 462 + }, + { + "epoch": 0.7408, + "grad_norm": 0.3431897137939103, + "learning_rate": 3.324118597838464e-05, + "loss": 0.632, + "step": 463 + }, + { + "epoch": 0.7424, + "grad_norm": 0.35622036759042697, + "learning_rate": 3.285610914348332e-05, + "loss": 0.6459, + "step": 464 + }, + { + "epoch": 0.744, + "grad_norm": 0.423500818863888, + "learning_rate": 3.2472836821637744e-05, + "loss": 0.6537, + "step": 465 + }, + { + "epoch": 0.7456, + "grad_norm": 0.32263410267189285, + "learning_rate": 3.209137931341143e-05, + "loss": 0.6263, + "step": 466 + }, + { + "epoch": 0.7472, + "grad_norm": 0.3493717459234465, + "learning_rate": 3.1711746870594086e-05, + "loss": 0.6189, + "step": 467 + }, + { + "epoch": 0.7488, + "grad_norm": 0.36040957182808014, + "learning_rate": 3.1333949695926324e-05, + "loss": 0.6126, + "step": 468 + }, + { + "epoch": 0.7504, + "grad_norm": 0.34005175160167944, + "learning_rate": 3.0957997942825336e-05, + "loss": 0.6298, + "step": 469 + }, + { + "epoch": 0.752, + "grad_norm": 0.3353710359851425, + "learning_rate": 3.058390171511196e-05, + "loss": 0.6603, + "step": 470 + }, + { + "epoch": 0.7536, + "grad_norm": 0.35480473559934433, + "learning_rate": 3.021167106673928e-05, + "loss": 0.6568, + "step": 471 + }, + { + "epoch": 0.7552, + "grad_norm": 0.3249120021096706, + "learning_rate": 2.9841316001522347e-05, + "loss": 0.5979, + "step": 472 + }, + { + "epoch": 0.7568, + "grad_norm": 0.35427464454731056, + "learning_rate": 2.9472846472869298e-05, + "loss": 0.5887, + "step": 473 + }, + { + "epoch": 0.7584, + "grad_norm": 0.3410934853706257, + "learning_rate": 2.9106272383513835e-05, + "loss": 0.6249, + "step": 474 + }, + { + "epoch": 0.76, + "grad_norm": 0.3441258786282159, + "learning_rate": 2.874160358524931e-05, + "loss": 0.6573, + "step": 475 + }, + { + "epoch": 0.7616, + "grad_norm": 0.3403111049153056, + "learning_rate": 2.8378849878663628e-05, + "loss": 0.6313, + "step": 476 + }, + { + "epoch": 0.7632, + "grad_norm": 0.3118307375313091, + "learning_rate": 2.8018021012875994e-05, + "loss": 0.5768, + "step": 477 + }, + { + "epoch": 0.7648, + "grad_norm": 0.37470127605134, + "learning_rate": 2.7659126685275027e-05, + "loss": 0.686, + "step": 478 + }, + { + "epoch": 0.7664, + "grad_norm": 0.3392666553842881, + "learning_rate": 2.7302176541257986e-05, + "loss": 0.6376, + "step": 479 + }, + { + "epoch": 0.768, + "grad_norm": 0.33244221956218917, + "learning_rate": 2.6947180173971508e-05, + "loss": 0.6558, + "step": 480 + }, + { + "epoch": 0.7696, + "grad_norm": 0.3381098234243685, + "learning_rate": 2.659414712405398e-05, + "loss": 0.6015, + "step": 481 + }, + { + "epoch": 0.7712, + "grad_norm": 0.3414803363545892, + "learning_rate": 2.6243086879379e-05, + "loss": 0.6172, + "step": 482 + }, + { + "epoch": 0.7728, + "grad_norm": 0.3497723124625655, + "learning_rate": 2.5894008874800325e-05, + "loss": 0.6539, + "step": 483 + }, + { + "epoch": 0.7744, + "grad_norm": 0.35103151983517905, + "learning_rate": 2.5546922491898495e-05, + "loss": 0.6257, + "step": 484 + }, + { + "epoch": 0.776, + "grad_norm": 0.34521209582923357, + "learning_rate": 2.5201837058728505e-05, + "loss": 0.6416, + "step": 485 + }, + { + "epoch": 0.7776, + "grad_norm": 0.338129214111799, + "learning_rate": 2.485876184956928e-05, + "loss": 0.6554, + "step": 486 + }, + { + "epoch": 0.7792, + "grad_norm": 0.3586447069828409, + "learning_rate": 2.451770608467432e-05, + "loss": 0.6875, + "step": 487 + }, + { + "epoch": 0.7808, + "grad_norm": 0.3799063058634966, + "learning_rate": 2.417867893002387e-05, + "loss": 0.657, + "step": 488 + }, + { + "epoch": 0.7824, + "grad_norm": 0.3525580555427139, + "learning_rate": 2.3841689497078746e-05, + "loss": 0.7051, + "step": 489 + }, + { + "epoch": 0.784, + "grad_norm": 0.34351459499403025, + "learning_rate": 2.3506746842535242e-05, + "loss": 0.67, + "step": 490 + }, + { + "epoch": 0.7856, + "grad_norm": 0.3505246349587142, + "learning_rate": 2.3173859968081944e-05, + "loss": 0.617, + "step": 491 + }, + { + "epoch": 0.7872, + "grad_norm": 0.3166680583654929, + "learning_rate": 2.2843037820157675e-05, + "loss": 0.633, + "step": 492 + }, + { + "epoch": 0.7888, + "grad_norm": 0.34214341151187166, + "learning_rate": 2.251428928971102e-05, + "loss": 0.6526, + "step": 493 + }, + { + "epoch": 0.7904, + "grad_norm": 0.35997181508279635, + "learning_rate": 2.2187623211961562e-05, + "loss": 0.6656, + "step": 494 + }, + { + "epoch": 0.792, + "grad_norm": 0.3401845314118498, + "learning_rate": 2.1863048366162208e-05, + "loss": 0.6755, + "step": 495 + }, + { + "epoch": 0.7936, + "grad_norm": 0.342946579399248, + "learning_rate": 2.1540573475363402e-05, + "loss": 0.6585, + "step": 496 + }, + { + "epoch": 0.7952, + "grad_norm": 0.3250252010083524, + "learning_rate": 2.1220207206178688e-05, + "loss": 0.6016, + "step": 497 + }, + { + "epoch": 0.7968, + "grad_norm": 0.3310191288755577, + "learning_rate": 2.0901958168551638e-05, + "loss": 0.6444, + "step": 498 + }, + { + "epoch": 0.7984, + "grad_norm": 0.35459213436813997, + "learning_rate": 2.058583491552465e-05, + "loss": 0.6499, + "step": 499 + }, + { + "epoch": 0.8, + "grad_norm": 0.35371328706023036, + "learning_rate": 2.027184594300898e-05, + "loss": 0.6763, + "step": 500 + }, + { + "epoch": 0.8016, + "grad_norm": 0.36401419583261213, + "learning_rate": 1.995999968955641e-05, + "loss": 0.6153, + "step": 501 + }, + { + "epoch": 0.8032, + "grad_norm": 0.3486439267338421, + "learning_rate": 1.9650304536132426e-05, + "loss": 0.6552, + "step": 502 + }, + { + "epoch": 0.8048, + "grad_norm": 0.3188377292964103, + "learning_rate": 1.9342768805891178e-05, + "loss": 0.631, + "step": 503 + }, + { + "epoch": 0.8064, + "grad_norm": 0.3422508848615651, + "learning_rate": 1.903740076395151e-05, + "loss": 0.6525, + "step": 504 + }, + { + "epoch": 0.808, + "grad_norm": 0.35044866925244333, + "learning_rate": 1.8734208617174988e-05, + "loss": 0.6661, + "step": 505 + }, + { + "epoch": 0.8096, + "grad_norm": 0.3281478236957656, + "learning_rate": 1.8433200513945337e-05, + "loss": 0.6337, + "step": 506 + }, + { + "epoch": 0.8112, + "grad_norm": 0.3465012025306824, + "learning_rate": 1.8134384543949478e-05, + "loss": 0.6288, + "step": 507 + }, + { + "epoch": 0.8128, + "grad_norm": 0.35684858536917496, + "learning_rate": 1.783776873795994e-05, + "loss": 0.651, + "step": 508 + }, + { + "epoch": 0.8144, + "grad_norm": 0.3329223140475106, + "learning_rate": 1.754336106761927e-05, + "loss": 0.6393, + "step": 509 + }, + { + "epoch": 0.816, + "grad_norm": 0.34814608355260684, + "learning_rate": 1.7251169445225657e-05, + "loss": 0.6419, + "step": 510 + }, + { + "epoch": 0.8176, + "grad_norm": 0.3684288530902952, + "learning_rate": 1.696120172352025e-05, + "loss": 0.6602, + "step": 511 + }, + { + "epoch": 0.8192, + "grad_norm": 0.3234120709849068, + "learning_rate": 1.6673465695476232e-05, + "loss": 0.6599, + "step": 512 + }, + { + "epoch": 0.8208, + "grad_norm": 0.33327145252980284, + "learning_rate": 1.6387969094089316e-05, + "loss": 0.6328, + "step": 513 + }, + { + "epoch": 0.8224, + "grad_norm": 0.34282110962358386, + "learning_rate": 1.6104719592169902e-05, + "loss": 0.6125, + "step": 514 + }, + { + "epoch": 0.824, + "grad_norm": 0.3520446592520151, + "learning_rate": 1.5823724802136865e-05, + "loss": 0.6465, + "step": 515 + }, + { + "epoch": 0.8256, + "grad_norm": 0.3377156357740252, + "learning_rate": 1.5544992275813053e-05, + "loss": 0.6605, + "step": 516 + }, + { + "epoch": 0.8272, + "grad_norm": 0.3293023248874468, + "learning_rate": 1.526852950422226e-05, + "loss": 0.5864, + "step": 517 + }, + { + "epoch": 0.8288, + "grad_norm": 0.33559194831506434, + "learning_rate": 1.4994343917387854e-05, + "loss": 0.6268, + "step": 518 + }, + { + "epoch": 0.8304, + "grad_norm": 0.3481052104875377, + "learning_rate": 1.4722442884133214e-05, + "loss": 0.6174, + "step": 519 + }, + { + "epoch": 0.832, + "grad_norm": 0.34302263073518136, + "learning_rate": 1.4452833711883628e-05, + "loss": 0.686, + "step": 520 + }, + { + "epoch": 0.8336, + "grad_norm": 0.3546926275970501, + "learning_rate": 1.4185523646469822e-05, + "loss": 0.6928, + "step": 521 + }, + { + "epoch": 0.8352, + "grad_norm": 0.40019803032777135, + "learning_rate": 1.3920519871933424e-05, + "loss": 0.7419, + "step": 522 + }, + { + "epoch": 0.8368, + "grad_norm": 0.3797909265050745, + "learning_rate": 1.3657829510333654e-05, + "loss": 0.7091, + "step": 523 + }, + { + "epoch": 0.8384, + "grad_norm": 0.33675971345197864, + "learning_rate": 1.339745962155613e-05, + "loss": 0.6188, + "step": 524 + }, + { + "epoch": 0.84, + "grad_norm": 0.3308856148788113, + "learning_rate": 1.3139417203123027e-05, + "loss": 0.6564, + "step": 525 + }, + { + "epoch": 0.8416, + "grad_norm": 0.3297676712198887, + "learning_rate": 1.2883709190004955e-05, + "loss": 0.6339, + "step": 526 + }, + { + "epoch": 0.8432, + "grad_norm": 0.3297539594308802, + "learning_rate": 1.263034245443473e-05, + "loss": 0.6047, + "step": 527 + }, + { + "epoch": 0.8448, + "grad_norm": 0.32656069639037466, + "learning_rate": 1.2379323805722576e-05, + "loss": 0.65, + "step": 528 + }, + { + "epoch": 0.8464, + "grad_norm": 0.3357612955625864, + "learning_rate": 1.2130659990073146e-05, + "loss": 0.6497, + "step": 529 + }, + { + "epoch": 0.848, + "grad_norm": 0.3134749601763479, + "learning_rate": 1.1884357690404158e-05, + "loss": 0.6415, + "step": 530 + }, + { + "epoch": 0.8496, + "grad_norm": 0.33255251237148076, + "learning_rate": 1.1640423526166988e-05, + "loss": 0.6157, + "step": 531 + }, + { + "epoch": 0.8512, + "grad_norm": 0.35401141781585654, + "learning_rate": 1.1398864053168534e-05, + "loss": 0.6918, + "step": 532 + }, + { + "epoch": 0.8528, + "grad_norm": 0.339156042969735, + "learning_rate": 1.1159685763395111e-05, + "loss": 0.6424, + "step": 533 + }, + { + "epoch": 0.8544, + "grad_norm": 0.3597947315285747, + "learning_rate": 1.0922895084838037e-05, + "loss": 0.683, + "step": 534 + }, + { + "epoch": 0.856, + "grad_norm": 0.35056167421065826, + "learning_rate": 1.0688498381320855e-05, + "loss": 0.6145, + "step": 535 + }, + { + "epoch": 0.8576, + "grad_norm": 0.3204723751052671, + "learning_rate": 1.045650195232819e-05, + "loss": 0.6259, + "step": 536 + }, + { + "epoch": 0.8592, + "grad_norm": 0.32690588734103426, + "learning_rate": 1.0226912032836611e-05, + "loss": 0.6326, + "step": 537 + }, + { + "epoch": 0.8608, + "grad_norm": 0.37394346783213966, + "learning_rate": 9.999734793146998e-06, + "loss": 0.6205, + "step": 538 + }, + { + "epoch": 0.8624, + "grad_norm": 0.3851867117156391, + "learning_rate": 9.774976338718677e-06, + "loss": 0.6652, + "step": 539 + }, + { + "epoch": 0.864, + "grad_norm": 0.3570088341709913, + "learning_rate": 9.552642710005299e-06, + "loss": 0.652, + "step": 540 + }, + { + "epoch": 0.8656, + "grad_norm": 0.3395319081531777, + "learning_rate": 9.332739882292752e-06, + "loss": 0.6831, + "step": 541 + }, + { + "epoch": 0.8672, + "grad_norm": 0.3624681070079515, + "learning_rate": 9.115273765538202e-06, + "loss": 0.6538, + "step": 542 + }, + { + "epoch": 0.8688, + "grad_norm": 0.33344608217878446, + "learning_rate": 8.900250204211514e-06, + "loss": 0.6288, + "step": 543 + }, + { + "epoch": 0.8704, + "grad_norm": 0.35815663037601725, + "learning_rate": 8.687674977138116e-06, + "loss": 0.6464, + "step": 544 + }, + { + "epoch": 0.872, + "grad_norm": 0.36000721164107724, + "learning_rate": 8.47755379734373e-06, + "loss": 0.6759, + "step": 545 + }, + { + "epoch": 0.8736, + "grad_norm": 0.335487015706093, + "learning_rate": 8.269892311900696e-06, + "loss": 0.6464, + "step": 546 + }, + { + "epoch": 0.8752, + "grad_norm": 0.3187332361299941, + "learning_rate": 8.064696101776358e-06, + "loss": 0.6336, + "step": 547 + }, + { + "epoch": 0.8768, + "grad_norm": 0.34460908171875926, + "learning_rate": 7.861970681683051e-06, + "loss": 0.6424, + "step": 548 + }, + { + "epoch": 0.8784, + "grad_norm": 0.3423571997875007, + "learning_rate": 7.661721499929753e-06, + "loss": 0.657, + "step": 549 + }, + { + "epoch": 0.88, + "grad_norm": 0.3473702898460535, + "learning_rate": 7.463953938275858e-06, + "loss": 0.6475, + "step": 550 + }, + { + "epoch": 0.8816, + "grad_norm": 0.3392579613996751, + "learning_rate": 7.2686733117863784e-06, + "loss": 0.6181, + "step": 551 + }, + { + "epoch": 0.8832, + "grad_norm": 0.3339561694301029, + "learning_rate": 7.07588486868922e-06, + "loss": 0.6335, + "step": 552 + }, + { + "epoch": 0.8848, + "grad_norm": 0.35297172484859773, + "learning_rate": 6.8855937902340576e-06, + "loss": 0.6123, + "step": 553 + }, + { + "epoch": 0.8864, + "grad_norm": 0.3743646934730517, + "learning_rate": 6.6978051905530855e-06, + "loss": 0.6763, + "step": 554 + }, + { + "epoch": 0.888, + "grad_norm": 0.36032553764175485, + "learning_rate": 6.512524116523633e-06, + "loss": 0.5929, + "step": 555 + }, + { + "epoch": 0.8896, + "grad_norm": 0.36040494901162906, + "learning_rate": 6.329755547632499e-06, + "loss": 0.6309, + "step": 556 + }, + { + "epoch": 0.8912, + "grad_norm": 0.33090877985851547, + "learning_rate": 6.149504395842087e-06, + "loss": 0.6208, + "step": 557 + }, + { + "epoch": 0.8928, + "grad_norm": 0.3374944266272634, + "learning_rate": 5.971775505458444e-06, + "loss": 0.6329, + "step": 558 + }, + { + "epoch": 0.8944, + "grad_norm": 0.32948701734625285, + "learning_rate": 5.7965736530010916e-06, + "loss": 0.6219, + "step": 559 + }, + { + "epoch": 0.896, + "grad_norm": 0.35525609402854275, + "learning_rate": 5.623903547074549e-06, + "loss": 0.6802, + "step": 560 + }, + { + "epoch": 0.8976, + "grad_norm": 0.31854471346311747, + "learning_rate": 5.453769828241872e-06, + "loss": 0.6242, + "step": 561 + }, + { + "epoch": 0.8992, + "grad_norm": 0.3483920540606934, + "learning_rate": 5.286177068899989e-06, + "loss": 0.6565, + "step": 562 + }, + { + "epoch": 0.9008, + "grad_norm": 0.3428486661429979, + "learning_rate": 5.121129773156663e-06, + "loss": 0.6697, + "step": 563 + }, + { + "epoch": 0.9024, + "grad_norm": 0.3460855723552184, + "learning_rate": 4.95863237670956e-06, + "loss": 0.6557, + "step": 564 + }, + { + "epoch": 0.904, + "grad_norm": 0.35322794593241996, + "learning_rate": 4.798689246727006e-06, + "loss": 0.6526, + "step": 565 + }, + { + "epoch": 0.9056, + "grad_norm": 0.3332322804597539, + "learning_rate": 4.641304681730641e-06, + "loss": 0.6287, + "step": 566 + }, + { + "epoch": 0.9072, + "grad_norm": 0.33293602440066133, + "learning_rate": 4.486482911479839e-06, + "loss": 0.6188, + "step": 567 + }, + { + "epoch": 0.9088, + "grad_norm": 0.3559051583590942, + "learning_rate": 4.3342280968580285e-06, + "loss": 0.6721, + "step": 568 + }, + { + "epoch": 0.9104, + "grad_norm": 0.36086522563672596, + "learning_rate": 4.184544329761009e-06, + "loss": 0.6489, + "step": 569 + }, + { + "epoch": 0.912, + "grad_norm": 0.3741741572241904, + "learning_rate": 4.037435632986786e-06, + "loss": 0.7021, + "step": 570 + }, + { + "epoch": 0.9136, + "grad_norm": 0.3605126758683723, + "learning_rate": 3.892905960127546e-06, + "loss": 0.6041, + "step": 571 + }, + { + "epoch": 0.9152, + "grad_norm": 0.33587874752703967, + "learning_rate": 3.750959195463466e-06, + "loss": 0.631, + "step": 572 + }, + { + "epoch": 0.9168, + "grad_norm": 0.34686099816372346, + "learning_rate": 3.611599153858214e-06, + "loss": 0.6715, + "step": 573 + }, + { + "epoch": 0.9184, + "grad_norm": 0.3516087889717228, + "learning_rate": 3.4748295806564356e-06, + "loss": 0.6498, + "step": 574 + }, + { + "epoch": 0.92, + "grad_norm": 0.335259263859088, + "learning_rate": 3.3406541515832003e-06, + "loss": 0.6494, + "step": 575 + }, + { + "epoch": 0.9216, + "grad_norm": 0.33264363464980434, + "learning_rate": 3.209076472645112e-06, + "loss": 0.6259, + "step": 576 + }, + { + "epoch": 0.9232, + "grad_norm": 0.3698507544225705, + "learning_rate": 3.0801000800333877e-06, + "loss": 0.6454, + "step": 577 + }, + { + "epoch": 0.9248, + "grad_norm": 0.3359232567315633, + "learning_rate": 2.9537284400289355e-06, + "loss": 0.5946, + "step": 578 + }, + { + "epoch": 0.9264, + "grad_norm": 0.3443174796511564, + "learning_rate": 2.8299649489090475e-06, + "loss": 0.6722, + "step": 579 + }, + { + "epoch": 0.928, + "grad_norm": 0.3344774015741628, + "learning_rate": 2.708812932856253e-06, + "loss": 0.6335, + "step": 580 + }, + { + "epoch": 0.9296, + "grad_norm": 0.33067467572081993, + "learning_rate": 2.590275647868867e-06, + "loss": 0.6316, + "step": 581 + }, + { + "epoch": 0.9312, + "grad_norm": 0.3445448332902342, + "learning_rate": 2.4743562796734622e-06, + "loss": 0.6502, + "step": 582 + }, + { + "epoch": 0.9328, + "grad_norm": 0.3401402738516373, + "learning_rate": 2.3610579436393e-06, + "loss": 0.6342, + "step": 583 + }, + { + "epoch": 0.9344, + "grad_norm": 0.33063080662094113, + "learning_rate": 2.250383684694579e-06, + "loss": 0.6423, + "step": 584 + }, + { + "epoch": 0.936, + "grad_norm": 0.3299746965273354, + "learning_rate": 2.1423364772445887e-06, + "loss": 0.6291, + "step": 585 + }, + { + "epoch": 0.9376, + "grad_norm": 0.34690133907895937, + "learning_rate": 2.036919225091827e-06, + "loss": 0.6474, + "step": 586 + }, + { + "epoch": 0.9392, + "grad_norm": 0.34108334966002213, + "learning_rate": 1.9341347613579087e-06, + "loss": 0.6407, + "step": 587 + }, + { + "epoch": 0.9408, + "grad_norm": 0.34903671948512743, + "learning_rate": 1.8339858484073935e-06, + "loss": 0.6188, + "step": 588 + }, + { + "epoch": 0.9424, + "grad_norm": 0.36401320838784496, + "learning_rate": 1.7364751777736332e-06, + "loss": 0.6513, + "step": 589 + }, + { + "epoch": 0.944, + "grad_norm": 0.33777563351469914, + "learning_rate": 1.6416053700863964e-06, + "loss": 0.6524, + "step": 590 + }, + { + "epoch": 0.9456, + "grad_norm": 0.3330583430716245, + "learning_rate": 1.5493789750014031e-06, + "loss": 0.5809, + "step": 591 + }, + { + "epoch": 0.9472, + "grad_norm": 0.3498181757801496, + "learning_rate": 1.459798471131868e-06, + "loss": 0.6417, + "step": 592 + }, + { + "epoch": 0.9488, + "grad_norm": 0.34818940049890484, + "learning_rate": 1.3728662659818204e-06, + "loss": 0.6884, + "step": 593 + }, + { + "epoch": 0.9504, + "grad_norm": 0.3558170390104319, + "learning_rate": 1.2885846958814673e-06, + "loss": 0.6722, + "step": 594 + }, + { + "epoch": 0.952, + "grad_norm": 0.35004208109334956, + "learning_rate": 1.2069560259243328e-06, + "loss": 0.6559, + "step": 595 + }, + { + "epoch": 0.9536, + "grad_norm": 0.599282402226537, + "learning_rate": 1.1279824499064396e-06, + "loss": 0.6081, + "step": 596 + }, + { + "epoch": 0.9552, + "grad_norm": 0.34569801459946947, + "learning_rate": 1.0516660902673448e-06, + "loss": 0.6661, + "step": 597 + }, + { + "epoch": 0.9568, + "grad_norm": 0.3498002867783941, + "learning_rate": 9.780089980330642e-07, + "loss": 0.6973, + "step": 598 + }, + { + "epoch": 0.9584, + "grad_norm": 0.3291910091032132, + "learning_rate": 9.070131527609604e-07, + "loss": 0.6124, + "step": 599 + }, + { + "epoch": 0.96, + "grad_norm": 0.34044113093668404, + "learning_rate": 8.386804624865851e-07, + "loss": 0.6075, + "step": 600 + }, + { + "epoch": 0.9616, + "grad_norm": 0.34323574405735774, + "learning_rate": 7.730127636723539e-07, + "loss": 0.6366, + "step": 601 + }, + { + "epoch": 0.9632, + "grad_norm": 0.3502311037032876, + "learning_rate": 7.100118211581852e-07, + "loss": 0.6547, + "step": 602 + }, + { + "epoch": 0.9648, + "grad_norm": 0.3730856320600065, + "learning_rate": 6.496793281141056e-07, + "loss": 0.6321, + "step": 603 + }, + { + "epoch": 0.9664, + "grad_norm": 0.36199246020923154, + "learning_rate": 5.920169059947411e-07, + "loss": 0.5857, + "step": 604 + }, + { + "epoch": 0.968, + "grad_norm": 0.353380416231102, + "learning_rate": 5.370261044956971e-07, + "loss": 0.6817, + "step": 605 + }, + { + "epoch": 0.9696, + "grad_norm": 0.326954512921091, + "learning_rate": 4.847084015119574e-07, + "loss": 0.6183, + "step": 606 + }, + { + "epoch": 0.9712, + "grad_norm": 0.35207403639398227, + "learning_rate": 4.3506520309813947e-07, + "loss": 0.6973, + "step": 607 + }, + { + "epoch": 0.9728, + "grad_norm": 0.3509588481398452, + "learning_rate": 3.8809784343072366e-07, + "loss": 0.6838, + "step": 608 + }, + { + "epoch": 0.9744, + "grad_norm": 0.3351419559207814, + "learning_rate": 3.4380758477219333e-07, + "loss": 0.6273, + "step": 609 + }, + { + "epoch": 0.976, + "grad_norm": 0.3463415954355351, + "learning_rate": 3.0219561743707326e-07, + "loss": 0.625, + "step": 610 + }, + { + "epoch": 0.9776, + "grad_norm": 0.43454340967775285, + "learning_rate": 2.6326305976001055e-07, + "loss": 0.5852, + "step": 611 + }, + { + "epoch": 0.9792, + "grad_norm": 0.33830860580191335, + "learning_rate": 2.2701095806565432e-07, + "loss": 0.6344, + "step": 612 + }, + { + "epoch": 0.9808, + "grad_norm": 0.3330808465283843, + "learning_rate": 1.9344028664056713e-07, + "loss": 0.6068, + "step": 613 + }, + { + "epoch": 0.9824, + "grad_norm": 0.3399037356615535, + "learning_rate": 1.6255194770704586e-07, + "loss": 0.6457, + "step": 614 + }, + { + "epoch": 0.984, + "grad_norm": 0.3733580315038423, + "learning_rate": 1.3434677139885222e-07, + "loss": 0.6802, + "step": 615 + }, + { + "epoch": 0.9856, + "grad_norm": 0.34045622189058755, + "learning_rate": 1.0882551573891953e-07, + "loss": 0.6386, + "step": 616 + }, + { + "epoch": 0.9872, + "grad_norm": 0.36430301422773936, + "learning_rate": 8.598886661895788e-08, + "loss": 0.6331, + "step": 617 + }, + { + "epoch": 0.9888, + "grad_norm": 0.37114813941906755, + "learning_rate": 6.583743778106887e-08, + "loss": 0.6566, + "step": 618 + }, + { + "epoch": 0.9904, + "grad_norm": 0.34414233428813773, + "learning_rate": 4.837177080119215e-08, + "loss": 0.6393, + "step": 619 + }, + { + "epoch": 0.992, + "grad_norm": 0.35187345171016765, + "learning_rate": 3.359233507459481e-08, + "loss": 0.631, + "step": 620 + }, + { + "epoch": 0.9936, + "grad_norm": 0.3274001090589902, + "learning_rate": 2.1499527803214846e-08, + "loss": 0.586, + "step": 621 + }, + { + "epoch": 0.9952, + "grad_norm": 0.34181628859730867, + "learning_rate": 1.209367398504746e-08, + "loss": 0.6695, + "step": 622 + }, + { + "epoch": 0.9968, + "grad_norm": 0.32708688446053996, + "learning_rate": 5.375026405352035e-09, + "loss": 0.5799, + "step": 623 + }, + { + "epoch": 0.9984, + "grad_norm": 0.3473133775182582, + "learning_rate": 1.3437656298687097e-09, + "loss": 0.6162, + "step": 624 + }, + { + "epoch": 1.0, + "grad_norm": 0.34794400742509624, + "learning_rate": 0.0, + "loss": 0.6736, + "step": 625 + }, + { + "epoch": 1.0, + "step": 625, + "total_flos": 553635297230848.0, + "train_loss": 0.7008882174491883, + "train_runtime": 9922.821, + "train_samples_per_second": 1.008, + "train_steps_per_second": 0.063 + } + ], + "logging_steps": 1.0, + "max_steps": 625, + "num_input_tokens_seen": 0, + "num_train_epochs": 1, + "save_steps": 500, + "stateful_callbacks": { + "TrainerControl": { + "args": { + "should_epoch_stop": false, + "should_evaluate": false, + "should_log": false, + "should_save": true, + "should_training_stop": false + }, + "attributes": {} + } + }, + "total_flos": 553635297230848.0, + "train_batch_size": 8, + "trial_name": null, + "trial_params": null +} diff --git a/single_dataset/img2json/VideoGameBunny_v1_1-Llama-3-8B-V-img2json_dataset_20000_epochs_1_lora/README.md b/single_dataset/img2json/VideoGameBunny_v1_1-Llama-3-8B-V-img2json_dataset_20000_epochs_1_lora/README.md new file mode 100644 index 0000000000000000000000000000000000000000..4f291d8d6fdb82fcff274f83bcc52b834347797c --- /dev/null +++ b/single_dataset/img2json/VideoGameBunny_v1_1-Llama-3-8B-V-img2json_dataset_20000_epochs_1_lora/README.md @@ -0,0 +1,202 @@ +--- +library_name: peft +base_model: ./weights/Bunny-v1_1-Llama-3-8B-V +--- + +# Model Card for Model ID + + + + + +## Model Details + +### Model Description + + + + + +- **Developed by:** [More Information Needed] +- **Funded by [optional]:** [More Information Needed] +- **Shared by [optional]:** [More Information Needed] +- **Model type:** [More Information Needed] +- **Language(s) (NLP):** [More Information Needed] +- **License:** [More Information Needed] +- **Finetuned from model [optional]:** [More Information Needed] + +### Model Sources [optional] + + + +- **Repository:** [More Information Needed] +- **Paper [optional]:** [More Information Needed] +- **Demo [optional]:** [More Information Needed] + +## Uses + + + +### Direct Use + + + +[More Information Needed] + +### Downstream Use [optional] + + + +[More Information Needed] + +### Out-of-Scope Use + + + +[More Information Needed] + +## Bias, Risks, and Limitations + + + +[More Information Needed] + +### Recommendations + + + +Users (both direct and downstream) should be made aware of the risks, biases and limitations of the model. More information needed for further recommendations. + +## How to Get Started with the Model + +Use the code below to get started with the model. + +[More Information Needed] + +## Training Details + +### Training Data + + + +[More Information Needed] + +### Training Procedure + + + +#### Preprocessing [optional] + +[More Information Needed] + + +#### Training Hyperparameters + +- **Training regime:** [More Information Needed] + +#### Speeds, Sizes, Times [optional] + + + +[More Information Needed] + +## Evaluation + + + +### Testing Data, Factors & Metrics + +#### Testing Data + + + +[More Information Needed] + +#### Factors + + + +[More Information Needed] + +#### Metrics + + + +[More Information Needed] + +### Results + +[More Information Needed] + +#### Summary + + + +## Model Examination [optional] + + + +[More Information Needed] + +## Environmental Impact + + + +Carbon emissions can be estimated using the [Machine Learning Impact calculator](https://mlco2.github.io/impact#compute) presented in [Lacoste et al. (2019)](https://arxiv.org/abs/1910.09700). + +- **Hardware Type:** [More Information Needed] +- **Hours used:** [More Information Needed] +- **Cloud Provider:** [More Information Needed] +- **Compute Region:** [More Information Needed] +- **Carbon Emitted:** [More Information Needed] + +## Technical Specifications [optional] + +### Model Architecture and Objective + +[More Information Needed] + +### Compute Infrastructure + +[More Information Needed] + +#### Hardware + +[More Information Needed] + +#### Software + +[More Information Needed] + +## Citation [optional] + + + +**BibTeX:** + +[More Information Needed] + +**APA:** + +[More Information Needed] + +## Glossary [optional] + + + +[More Information Needed] + +## More Information [optional] + +[More Information Needed] + +## Model Card Authors [optional] + +[More Information Needed] + +## Model Card Contact + +[More Information Needed] +### Framework versions + +- PEFT 0.11.1 \ No newline at end of file diff --git a/single_dataset/img2json/VideoGameBunny_v1_1-Llama-3-8B-V-img2json_dataset_20000_epochs_1_lora/adapter_config.json b/single_dataset/img2json/VideoGameBunny_v1_1-Llama-3-8B-V-img2json_dataset_20000_epochs_1_lora/adapter_config.json new file mode 100644 index 0000000000000000000000000000000000000000..7967aa0ef727f20681a3395643e02bdd696cb8dc --- /dev/null +++ b/single_dataset/img2json/VideoGameBunny_v1_1-Llama-3-8B-V-img2json_dataset_20000_epochs_1_lora/adapter_config.json @@ -0,0 +1,34 @@ +{ + "alpha_pattern": {}, + "auto_mapping": null, + "base_model_name_or_path": "./weights/Bunny-v1_1-Llama-3-8B-V", + "bias": "none", + "fan_in_fan_out": false, + "inference_mode": true, + "init_lora_weights": true, + "layer_replication": null, + "layers_pattern": null, + "layers_to_transform": null, + "loftq_config": {}, + "lora_alpha": 256, + "lora_dropout": 0.1, + "megatron_config": null, + "megatron_core": "megatron.core", + "modules_to_save": null, + "peft_type": "LORA", + "r": 128, + "rank_pattern": {}, + "revision": null, + "target_modules": [ + "q_proj", + "up_proj", + "down_proj", + "k_proj", + "gate_proj", + "o_proj", + "v_proj" + ], + "task_type": "CAUSAL_LM", + "use_dora": false, + "use_rslora": false +} \ No newline at end of file diff --git a/single_dataset/img2json/VideoGameBunny_v1_1-Llama-3-8B-V-img2json_dataset_20000_epochs_1_lora/adapter_model.safetensors b/single_dataset/img2json/VideoGameBunny_v1_1-Llama-3-8B-V-img2json_dataset_20000_epochs_1_lora/adapter_model.safetensors new file mode 100644 index 0000000000000000000000000000000000000000..d35db844c598e91ddf65fb51ff585e542df8fea9 --- /dev/null +++ b/single_dataset/img2json/VideoGameBunny_v1_1-Llama-3-8B-V-img2json_dataset_20000_epochs_1_lora/adapter_model.safetensors @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:26afb8ecb361aa1573c588e61fa343d711716fbcb5c105e69dd7c57f09063e32 +size 671150064 diff --git a/single_dataset/img2json/VideoGameBunny_v1_1-Llama-3-8B-V-img2json_dataset_20000_epochs_1_lora/config.json b/single_dataset/img2json/VideoGameBunny_v1_1-Llama-3-8B-V-img2json_dataset_20000_epochs_1_lora/config.json new file mode 100644 index 0000000000000000000000000000000000000000..ee7ed86f4ff6f3381a6bb24981934be89aa376e7 --- /dev/null +++ b/single_dataset/img2json/VideoGameBunny_v1_1-Llama-3-8B-V-img2json_dataset_20000_epochs_1_lora/config.json @@ -0,0 +1,45 @@ +{ + "_name_or_path": "./weights/Bunny-v1_1-Llama-3-8B-V", + "architectures": [ + "BunnyLlamaForCausalLM" + ], + "attention_bias": false, + "attention_dropout": 0.0, + "auto_map": { + "AutoConfig": "configuration_bunny_llama.BunnyLlamaConfig", + "AutoModelForCausalLM": "modeling_bunny_llama.BunnyLlamaForCausalLM" + }, + "bos_token_id": 128000, + "continuous_training": false, + "eos_token_id": 128001, + "freeze_mm_mlp_adapter": false, + "hidden_act": "silu", + "hidden_size": 4096, + "image_aspect_ratio": "pad", + "initializer_range": 0.02, + "intermediate_size": 14336, + "max_position_embeddings": 8192, + "mm_hidden_size": 3456, + "mm_projector_lr": null, + "mm_projector_type": "mlp2x_gelu", + "mm_vision_tower": "./weights/siglip-so400m-patch14-384", + "model_type": "bunny-llama", + "num_attention_heads": 32, + "num_hidden_layers": 32, + "num_key_value_heads": 8, + "pretraining_tp": 1, + "rms_norm_eps": 1e-05, + "rope_scaling": null, + "rope_theta": 500000.0, + "tie_word_embeddings": false, + "tokenizer_model_max_length": 2048, + "tokenizer_padding_side": "right", + "torch_dtype": "float16", + "transformers_version": "4.41.2", + "tune_mm_mlp_adapter": false, + "unfreeze_vision_tower": true, + "use_cache": true, + "use_mm_proj": true, + "use_s2": true, + "vocab_size": 128256 +} diff --git a/single_dataset/img2json/VideoGameBunny_v1_1-Llama-3-8B-V-img2json_dataset_20000_epochs_1_lora/non_lora_trainables.bin b/single_dataset/img2json/VideoGameBunny_v1_1-Llama-3-8B-V-img2json_dataset_20000_epochs_1_lora/non_lora_trainables.bin new file mode 100644 index 0000000000000000000000000000000000000000..ed1f31cc0342017c763b31b33f447ff58af33c54 --- /dev/null +++ b/single_dataset/img2json/VideoGameBunny_v1_1-Llama-3-8B-V-img2json_dataset_20000_epochs_1_lora/non_lora_trainables.bin @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:aff4131c4b93cdf643c1be464f218e02ad4cf80233999ed855e74ca85ce81f13 +size 918507402 diff --git a/single_dataset/img2json/VideoGameBunny_v1_1-Llama-3-8B-V-img2json_dataset_20000_epochs_1_lora/trainer_state.json b/single_dataset/img2json/VideoGameBunny_v1_1-Llama-3-8B-V-img2json_dataset_20000_epochs_1_lora/trainer_state.json new file mode 100644 index 0000000000000000000000000000000000000000..36d3856ae417315c93e3b6ead96fce1bf8a77ede --- /dev/null +++ b/single_dataset/img2json/VideoGameBunny_v1_1-Llama-3-8B-V-img2json_dataset_20000_epochs_1_lora/trainer_state.json @@ -0,0 +1,8792 @@ +{ + "best_metric": null, + "best_model_checkpoint": null, + "epoch": 1.0, + "eval_steps": 500, + "global_step": 1250, + "is_hyper_param_search": false, + "is_local_process_zero": true, + "is_world_process_zero": true, + "log_history": [ + { + "epoch": 0.0008, + "grad_norm": 1.174852192286667, + "learning_rate": 5.263157894736842e-06, + "loss": 1.5412, + "step": 1 + }, + { + "epoch": 0.0016, + "grad_norm": 1.2419123477662346, + "learning_rate": 1.0526315789473684e-05, + "loss": 1.5452, + "step": 2 + }, + { + "epoch": 0.0024, + "grad_norm": 1.3059811183984338, + "learning_rate": 1.5789473684210526e-05, + "loss": 1.5984, + "step": 3 + }, + { + "epoch": 0.0032, + "grad_norm": 1.095206797753618, + "learning_rate": 2.105263157894737e-05, + "loss": 1.51, + "step": 4 + }, + { + "epoch": 0.004, + "grad_norm": 0.975387169235589, + "learning_rate": 2.6315789473684212e-05, + "loss": 1.4603, + "step": 5 + }, + { + "epoch": 0.0048, + "grad_norm": 0.8843993721042885, + "learning_rate": 3.157894736842105e-05, + "loss": 1.3699, + "step": 6 + }, + { + "epoch": 0.0056, + "grad_norm": 1.0351394073465081, + "learning_rate": 3.6842105263157895e-05, + "loss": 1.2938, + "step": 7 + }, + { + "epoch": 0.0064, + "grad_norm": 1.0020984757485565, + "learning_rate": 4.210526315789474e-05, + "loss": 1.1656, + "step": 8 + }, + { + "epoch": 0.0072, + "grad_norm": 1.1739296273942252, + "learning_rate": 4.736842105263158e-05, + "loss": 1.0653, + "step": 9 + }, + { + "epoch": 0.008, + "grad_norm": 0.8903900013638182, + "learning_rate": 5.2631578947368424e-05, + "loss": 0.9677, + "step": 10 + }, + { + "epoch": 0.0088, + "grad_norm": 0.9449069404528453, + "learning_rate": 5.789473684210527e-05, + "loss": 1.0258, + "step": 11 + }, + { + "epoch": 0.0096, + "grad_norm": 0.8932747533831232, + "learning_rate": 6.31578947368421e-05, + "loss": 0.9059, + "step": 12 + }, + { + "epoch": 0.0104, + "grad_norm": 0.8591901163930206, + "learning_rate": 6.842105263157895e-05, + "loss": 0.916, + "step": 13 + }, + { + "epoch": 0.0112, + "grad_norm": 0.760837120573045, + "learning_rate": 7.368421052631579e-05, + "loss": 0.9499, + "step": 14 + }, + { + "epoch": 0.012, + "grad_norm": 0.6333378357605206, + "learning_rate": 7.894736842105263e-05, + "loss": 0.8212, + "step": 15 + }, + { + "epoch": 0.0128, + "grad_norm": 0.5351536277597265, + "learning_rate": 8.421052631578948e-05, + "loss": 0.9195, + "step": 16 + }, + { + "epoch": 0.0136, + "grad_norm": 0.5815984102342769, + "learning_rate": 8.947368421052632e-05, + "loss": 0.9489, + "step": 17 + }, + { + "epoch": 0.0144, + "grad_norm": 0.5323214203271259, + "learning_rate": 9.473684210526316e-05, + "loss": 0.8855, + "step": 18 + }, + { + "epoch": 0.0152, + "grad_norm": 0.5271911019078436, + "learning_rate": 0.0001, + "loss": 0.8142, + "step": 19 + }, + { + "epoch": 0.016, + "grad_norm": 0.5064885518974941, + "learning_rate": 0.00010526315789473685, + "loss": 0.9364, + "step": 20 + }, + { + "epoch": 0.0168, + "grad_norm": 0.5045995751188714, + "learning_rate": 0.0001105263157894737, + "loss": 0.8528, + "step": 21 + }, + { + "epoch": 0.0176, + "grad_norm": 0.4838134325203139, + "learning_rate": 0.00011578947368421053, + "loss": 0.8881, + "step": 22 + }, + { + "epoch": 0.0184, + "grad_norm": 0.4842193651138658, + "learning_rate": 0.00012105263157894738, + "loss": 0.872, + "step": 23 + }, + { + "epoch": 0.0192, + "grad_norm": 0.5193281225894892, + "learning_rate": 0.0001263157894736842, + "loss": 0.8481, + "step": 24 + }, + { + "epoch": 0.02, + "grad_norm": 0.5067542173200873, + "learning_rate": 0.00013157894736842108, + "loss": 0.781, + "step": 25 + }, + { + "epoch": 0.0208, + "grad_norm": 0.47295936190501514, + "learning_rate": 0.0001368421052631579, + "loss": 0.8296, + "step": 26 + }, + { + "epoch": 0.0216, + "grad_norm": 0.5492365639189041, + "learning_rate": 0.00014210526315789474, + "loss": 0.8508, + "step": 27 + }, + { + "epoch": 0.0224, + "grad_norm": 0.48167656965140465, + "learning_rate": 0.00014736842105263158, + "loss": 0.8196, + "step": 28 + }, + { + "epoch": 0.0232, + "grad_norm": 0.4534346171238379, + "learning_rate": 0.00015263157894736845, + "loss": 0.7851, + "step": 29 + }, + { + "epoch": 0.024, + "grad_norm": 0.4561415536028462, + "learning_rate": 0.00015789473684210527, + "loss": 0.8299, + "step": 30 + }, + { + "epoch": 0.0248, + "grad_norm": 0.466156430217683, + "learning_rate": 0.0001631578947368421, + "loss": 0.8252, + "step": 31 + }, + { + "epoch": 0.0256, + "grad_norm": 0.4511530570176387, + "learning_rate": 0.00016842105263157895, + "loss": 0.8315, + "step": 32 + }, + { + "epoch": 0.0264, + "grad_norm": 0.4670234354410688, + "learning_rate": 0.0001736842105263158, + "loss": 0.8015, + "step": 33 + }, + { + "epoch": 0.0272, + "grad_norm": 0.42188075569209216, + "learning_rate": 0.00017894736842105264, + "loss": 0.8158, + "step": 34 + }, + { + "epoch": 0.028, + "grad_norm": 0.43018547127689205, + "learning_rate": 0.00018421052631578948, + "loss": 0.8703, + "step": 35 + }, + { + "epoch": 0.0288, + "grad_norm": 0.4167689351885316, + "learning_rate": 0.00018947368421052632, + "loss": 0.8155, + "step": 36 + }, + { + "epoch": 0.0296, + "grad_norm": 0.4392867612695143, + "learning_rate": 0.00019473684210526317, + "loss": 0.779, + "step": 37 + }, + { + "epoch": 0.0304, + "grad_norm": 0.42312975726573043, + "learning_rate": 0.0002, + "loss": 0.8139, + "step": 38 + }, + { + "epoch": 0.0312, + "grad_norm": 0.41603337108686156, + "learning_rate": 0.00019999966405802826, + "loss": 0.8569, + "step": 39 + }, + { + "epoch": 0.032, + "grad_norm": 0.4701428337634891, + "learning_rate": 0.00019999865623437013, + "loss": 0.7531, + "step": 40 + }, + { + "epoch": 0.0328, + "grad_norm": 0.4169677817719275, + "learning_rate": 0.00019999697653579705, + "loss": 0.7875, + "step": 41 + }, + { + "epoch": 0.0336, + "grad_norm": 0.423915091323005, + "learning_rate": 0.00019999462497359466, + "loss": 0.8048, + "step": 42 + }, + { + "epoch": 0.0344, + "grad_norm": 0.41304493098960177, + "learning_rate": 0.0001999916015635627, + "loss": 0.7742, + "step": 43 + }, + { + "epoch": 0.0352, + "grad_norm": 0.4414232950779636, + "learning_rate": 0.00019998790632601496, + "loss": 0.7888, + "step": 44 + }, + { + "epoch": 0.036, + "grad_norm": 0.4394849114380294, + "learning_rate": 0.00019998353928577919, + "loss": 0.7749, + "step": 45 + }, + { + "epoch": 0.0368, + "grad_norm": 0.43304201360654093, + "learning_rate": 0.0001999785004721968, + "loss": 0.8211, + "step": 46 + }, + { + "epoch": 0.0376, + "grad_norm": 0.4147928959811065, + "learning_rate": 0.0001999727899191228, + "loss": 0.7894, + "step": 47 + }, + { + "epoch": 0.0384, + "grad_norm": 0.5265522462227644, + "learning_rate": 0.00019996640766492543, + "loss": 0.8382, + "step": 48 + }, + { + "epoch": 0.0392, + "grad_norm": 0.4484829312606615, + "learning_rate": 0.00019995935375248606, + "loss": 0.7796, + "step": 49 + }, + { + "epoch": 0.04, + "grad_norm": 0.42783348522362324, + "learning_rate": 0.00019995162822919883, + "loss": 0.7772, + "step": 50 + }, + { + "epoch": 0.0408, + "grad_norm": 0.42289209845467834, + "learning_rate": 0.00019994323114697022, + "loss": 0.7503, + "step": 51 + }, + { + "epoch": 0.0416, + "grad_norm": 0.45852588594005317, + "learning_rate": 0.00019993416256221895, + "loss": 0.847, + "step": 52 + }, + { + "epoch": 0.0424, + "grad_norm": 0.4404745544017252, + "learning_rate": 0.0001999244225358753, + "loss": 0.7578, + "step": 53 + }, + { + "epoch": 0.0432, + "grad_norm": 0.4163731261657778, + "learning_rate": 0.00019991401113338104, + "loss": 0.7804, + "step": 54 + }, + { + "epoch": 0.044, + "grad_norm": 0.419024020190218, + "learning_rate": 0.00019990292842468868, + "loss": 0.7907, + "step": 55 + }, + { + "epoch": 0.0448, + "grad_norm": 0.41029265779192, + "learning_rate": 0.00019989117448426108, + "loss": 0.7869, + "step": 56 + }, + { + "epoch": 0.0456, + "grad_norm": 0.4148072984657268, + "learning_rate": 0.0001998787493910712, + "loss": 0.7516, + "step": 57 + }, + { + "epoch": 0.0464, + "grad_norm": 0.4419314618936028, + "learning_rate": 0.00019986565322860115, + "loss": 0.7754, + "step": 58 + }, + { + "epoch": 0.0472, + "grad_norm": 0.4245687861270358, + "learning_rate": 0.000199851886084842, + "loss": 0.8123, + "step": 59 + }, + { + "epoch": 0.048, + "grad_norm": 0.413959613692514, + "learning_rate": 0.00019983744805229296, + "loss": 0.8094, + "step": 60 + }, + { + "epoch": 0.0488, + "grad_norm": 0.4107368970313985, + "learning_rate": 0.00019982233922796085, + "loss": 0.7805, + "step": 61 + }, + { + "epoch": 0.0496, + "grad_norm": 0.40851121731308915, + "learning_rate": 0.00019980655971335945, + "loss": 0.7457, + "step": 62 + }, + { + "epoch": 0.0504, + "grad_norm": 0.4230745779297569, + "learning_rate": 0.00019979010961450878, + "loss": 0.8077, + "step": 63 + }, + { + "epoch": 0.0512, + "grad_norm": 0.3986868445295381, + "learning_rate": 0.00019977298904193437, + "loss": 0.7979, + "step": 64 + }, + { + "epoch": 0.052, + "grad_norm": 0.41312211726046255, + "learning_rate": 0.00019975519811066663, + "loss": 0.8196, + "step": 65 + }, + { + "epoch": 0.0528, + "grad_norm": 0.41224771104768804, + "learning_rate": 0.00019973673694024, + "loss": 0.8, + "step": 66 + }, + { + "epoch": 0.0536, + "grad_norm": 0.4235560132752872, + "learning_rate": 0.0001997176056546921, + "loss": 0.734, + "step": 67 + }, + { + "epoch": 0.0544, + "grad_norm": 0.4220942153235023, + "learning_rate": 0.00019969780438256293, + "loss": 0.7894, + "step": 68 + }, + { + "epoch": 0.0552, + "grad_norm": 0.42418197692776344, + "learning_rate": 0.0001996773332568941, + "loss": 0.7513, + "step": 69 + }, + { + "epoch": 0.056, + "grad_norm": 0.4353473129092401, + "learning_rate": 0.0001996561924152278, + "loss": 0.7628, + "step": 70 + }, + { + "epoch": 0.0568, + "grad_norm": 0.4070142671565011, + "learning_rate": 0.00019963438199960599, + "loss": 0.7845, + "step": 71 + }, + { + "epoch": 0.0576, + "grad_norm": 0.374428315728352, + "learning_rate": 0.0001996119021565693, + "loss": 0.6753, + "step": 72 + }, + { + "epoch": 0.0584, + "grad_norm": 0.41260021209545916, + "learning_rate": 0.00019958875303715615, + "loss": 0.8008, + "step": 73 + }, + { + "epoch": 0.0592, + "grad_norm": 0.4240938972210413, + "learning_rate": 0.0001995649347969019, + "loss": 0.8001, + "step": 74 + }, + { + "epoch": 0.06, + "grad_norm": 0.4739076786256837, + "learning_rate": 0.0001995404475958373, + "loss": 0.8006, + "step": 75 + }, + { + "epoch": 0.0608, + "grad_norm": 0.4274506824751054, + "learning_rate": 0.00019951529159848805, + "loss": 0.8105, + "step": 76 + }, + { + "epoch": 0.0616, + "grad_norm": 0.4330605147869637, + "learning_rate": 0.0001994894669738732, + "loss": 0.757, + "step": 77 + }, + { + "epoch": 0.0624, + "grad_norm": 0.41833833467992704, + "learning_rate": 0.00019946297389550433, + "loss": 0.7513, + "step": 78 + }, + { + "epoch": 0.0632, + "grad_norm": 0.43108864976438627, + "learning_rate": 0.0001994358125413841, + "loss": 0.7463, + "step": 79 + }, + { + "epoch": 0.064, + "grad_norm": 0.3970898529137441, + "learning_rate": 0.00019940798309400526, + "loss": 0.7933, + "step": 80 + }, + { + "epoch": 0.0648, + "grad_norm": 0.40354930148793755, + "learning_rate": 0.0001993794857403495, + "loss": 0.7691, + "step": 81 + }, + { + "epoch": 0.0656, + "grad_norm": 0.3996396037171106, + "learning_rate": 0.0001993503206718859, + "loss": 0.7541, + "step": 82 + }, + { + "epoch": 0.0664, + "grad_norm": 0.416669802914507, + "learning_rate": 0.0001993204880845699, + "loss": 0.7207, + "step": 83 + }, + { + "epoch": 0.0672, + "grad_norm": 0.4360323522039162, + "learning_rate": 0.00019928998817884182, + "loss": 0.7964, + "step": 84 + }, + { + "epoch": 0.068, + "grad_norm": 0.4258524108241521, + "learning_rate": 0.00019925882115962568, + "loss": 0.8468, + "step": 85 + }, + { + "epoch": 0.0688, + "grad_norm": 0.46468479366071264, + "learning_rate": 0.00019922698723632767, + "loss": 0.7766, + "step": 86 + }, + { + "epoch": 0.0696, + "grad_norm": 0.3926015424233146, + "learning_rate": 0.00019919448662283478, + "loss": 0.7032, + "step": 87 + }, + { + "epoch": 0.0704, + "grad_norm": 0.44477391483356604, + "learning_rate": 0.00019916131953751342, + "loss": 0.766, + "step": 88 + }, + { + "epoch": 0.0712, + "grad_norm": 0.422480524852823, + "learning_rate": 0.00019912748620320794, + "loss": 0.7825, + "step": 89 + }, + { + "epoch": 0.072, + "grad_norm": 0.45110875067089323, + "learning_rate": 0.00019909298684723904, + "loss": 0.7853, + "step": 90 + }, + { + "epoch": 0.0728, + "grad_norm": 0.4060988959276219, + "learning_rate": 0.00019905782170140238, + "loss": 0.7873, + "step": 91 + }, + { + "epoch": 0.0736, + "grad_norm": 0.41718451058971323, + "learning_rate": 0.00019902199100196697, + "loss": 0.8195, + "step": 92 + }, + { + "epoch": 0.0744, + "grad_norm": 0.3893544036303535, + "learning_rate": 0.00019898549498967343, + "loss": 0.7577, + "step": 93 + }, + { + "epoch": 0.0752, + "grad_norm": 0.3977928617341849, + "learning_rate": 0.00019894833390973266, + "loss": 0.7278, + "step": 94 + }, + { + "epoch": 0.076, + "grad_norm": 0.4009331041880101, + "learning_rate": 0.000198910508011824, + "loss": 0.8254, + "step": 95 + }, + { + "epoch": 0.0768, + "grad_norm": 0.4483565727456726, + "learning_rate": 0.00019887201755009357, + "loss": 0.7527, + "step": 96 + }, + { + "epoch": 0.0776, + "grad_norm": 0.40841215504099687, + "learning_rate": 0.00019883286278315262, + "loss": 0.7083, + "step": 97 + }, + { + "epoch": 0.0784, + "grad_norm": 0.39252603106546685, + "learning_rate": 0.0001987930439740757, + "loss": 0.732, + "step": 98 + }, + { + "epoch": 0.0792, + "grad_norm": 0.4025898246464496, + "learning_rate": 0.00019875256139039902, + "loss": 0.7381, + "step": 99 + }, + { + "epoch": 0.08, + "grad_norm": 0.38995013220861396, + "learning_rate": 0.00019871141530411853, + "loss": 0.7239, + "step": 100 + }, + { + "epoch": 0.0808, + "grad_norm": 0.36744758611250783, + "learning_rate": 0.00019866960599168826, + "loss": 0.6995, + "step": 101 + }, + { + "epoch": 0.0816, + "grad_norm": 0.3993807376912834, + "learning_rate": 0.0001986271337340182, + "loss": 0.7204, + "step": 102 + }, + { + "epoch": 0.0824, + "grad_norm": 0.3852929627024007, + "learning_rate": 0.0001985839988164726, + "loss": 0.7149, + "step": 103 + }, + { + "epoch": 0.0832, + "grad_norm": 0.37795734304575523, + "learning_rate": 0.00019854020152886814, + "loss": 0.6888, + "step": 104 + }, + { + "epoch": 0.084, + "grad_norm": 0.4056272767803145, + "learning_rate": 0.00019849574216547171, + "loss": 0.7184, + "step": 105 + }, + { + "epoch": 0.0848, + "grad_norm": 0.43095451759319586, + "learning_rate": 0.0001984506210249986, + "loss": 0.7887, + "step": 106 + }, + { + "epoch": 0.0856, + "grad_norm": 0.3990843159715983, + "learning_rate": 0.00019840483841061058, + "loss": 0.6993, + "step": 107 + }, + { + "epoch": 0.0864, + "grad_norm": 0.5488510128773152, + "learning_rate": 0.00019835839462991361, + "loss": 0.7816, + "step": 108 + }, + { + "epoch": 0.0872, + "grad_norm": 0.4061949940285652, + "learning_rate": 0.00019831128999495606, + "loss": 0.6864, + "step": 109 + }, + { + "epoch": 0.088, + "grad_norm": 0.4108354732903804, + "learning_rate": 0.00019826352482222638, + "loss": 0.7521, + "step": 110 + }, + { + "epoch": 0.0888, + "grad_norm": 0.4105779612703396, + "learning_rate": 0.0001982150994326511, + "loss": 0.7676, + "step": 111 + }, + { + "epoch": 0.0896, + "grad_norm": 0.4090672628646142, + "learning_rate": 0.00019816601415159263, + "loss": 0.7462, + "step": 112 + }, + { + "epoch": 0.0904, + "grad_norm": 0.3889721654833078, + "learning_rate": 0.0001981162693088471, + "loss": 0.7223, + "step": 113 + }, + { + "epoch": 0.0912, + "grad_norm": 0.40864142586658264, + "learning_rate": 0.0001980658652386421, + "loss": 0.7601, + "step": 114 + }, + { + "epoch": 0.092, + "grad_norm": 0.3861095237897938, + "learning_rate": 0.0001980148022796345, + "loss": 0.7642, + "step": 115 + }, + { + "epoch": 0.0928, + "grad_norm": 0.40820672730260193, + "learning_rate": 0.00019796308077490817, + "loss": 0.7227, + "step": 116 + }, + { + "epoch": 0.0936, + "grad_norm": 0.48574711337691706, + "learning_rate": 0.00019791070107197153, + "loss": 0.7971, + "step": 117 + }, + { + "epoch": 0.0944, + "grad_norm": 0.4116425695467719, + "learning_rate": 0.00019785766352275542, + "loss": 0.7327, + "step": 118 + }, + { + "epoch": 0.0952, + "grad_norm": 0.4603067786671372, + "learning_rate": 0.0001978039684836106, + "loss": 0.7477, + "step": 119 + }, + { + "epoch": 0.096, + "grad_norm": 0.40760458191649707, + "learning_rate": 0.00019774961631530545, + "loss": 0.7227, + "step": 120 + }, + { + "epoch": 0.0968, + "grad_norm": 0.38622296406754275, + "learning_rate": 0.0001976946073830234, + "loss": 0.7838, + "step": 121 + }, + { + "epoch": 0.0976, + "grad_norm": 0.4311793566422138, + "learning_rate": 0.00019763894205636072, + "loss": 0.7686, + "step": 122 + }, + { + "epoch": 0.0984, + "grad_norm": 0.4001412773975259, + "learning_rate": 0.00019758262070932375, + "loss": 0.7193, + "step": 123 + }, + { + "epoch": 0.0992, + "grad_norm": 0.40234722016688085, + "learning_rate": 0.00019752564372032657, + "loss": 0.7245, + "step": 124 + }, + { + "epoch": 0.1, + "grad_norm": 0.40717966576780684, + "learning_rate": 0.00019746801147218842, + "loss": 0.7707, + "step": 125 + }, + { + "epoch": 0.1008, + "grad_norm": 0.41601902466245766, + "learning_rate": 0.00019740972435213115, + "loss": 0.7693, + "step": 126 + }, + { + "epoch": 0.1016, + "grad_norm": 0.4618987997508002, + "learning_rate": 0.00019735078275177654, + "loss": 0.7965, + "step": 127 + }, + { + "epoch": 0.1024, + "grad_norm": 0.4003184333259143, + "learning_rate": 0.00019729118706714375, + "loss": 0.6668, + "step": 128 + }, + { + "epoch": 0.1032, + "grad_norm": 0.544704002941789, + "learning_rate": 0.00019723093769864663, + "loss": 0.7099, + "step": 129 + }, + { + "epoch": 0.104, + "grad_norm": 0.40018693660363797, + "learning_rate": 0.00019717003505109095, + "loss": 0.7251, + "step": 130 + }, + { + "epoch": 0.1048, + "grad_norm": 0.373424152652994, + "learning_rate": 0.0001971084795336719, + "loss": 0.6499, + "step": 131 + }, + { + "epoch": 0.1056, + "grad_norm": 0.37934430650594364, + "learning_rate": 0.00019704627155997108, + "loss": 0.667, + "step": 132 + }, + { + "epoch": 0.1064, + "grad_norm": 0.42005136783693797, + "learning_rate": 0.00019698341154795389, + "loss": 0.7462, + "step": 133 + }, + { + "epoch": 0.1072, + "grad_norm": 0.388614133162938, + "learning_rate": 0.00019691989991996663, + "loss": 0.7261, + "step": 134 + }, + { + "epoch": 0.108, + "grad_norm": 0.39517853074521264, + "learning_rate": 0.00019685573710273376, + "loss": 0.7397, + "step": 135 + }, + { + "epoch": 0.1088, + "grad_norm": 0.41084214200497754, + "learning_rate": 0.0001967909235273549, + "loss": 0.7633, + "step": 136 + }, + { + "epoch": 0.1096, + "grad_norm": 0.37553037057766164, + "learning_rate": 0.00019672545962930215, + "loss": 0.7362, + "step": 137 + }, + { + "epoch": 0.1104, + "grad_norm": 0.36072262854433307, + "learning_rate": 0.00019665934584841682, + "loss": 0.6731, + "step": 138 + }, + { + "epoch": 0.1112, + "grad_norm": 0.39675875599537863, + "learning_rate": 0.00019659258262890683, + "loss": 0.7552, + "step": 139 + }, + { + "epoch": 0.112, + "grad_norm": 0.38275996674752444, + "learning_rate": 0.00019652517041934356, + "loss": 0.7351, + "step": 140 + }, + { + "epoch": 0.1128, + "grad_norm": 0.40281565207472164, + "learning_rate": 0.00019645710967265882, + "loss": 0.6884, + "step": 141 + }, + { + "epoch": 0.1136, + "grad_norm": 0.3800398937956588, + "learning_rate": 0.00019638840084614182, + "loss": 0.7649, + "step": 142 + }, + { + "epoch": 0.1144, + "grad_norm": 0.37818955959756634, + "learning_rate": 0.00019631904440143612, + "loss": 0.7439, + "step": 143 + }, + { + "epoch": 0.1152, + "grad_norm": 0.38940134231115464, + "learning_rate": 0.00019624904080453655, + "loss": 0.7644, + "step": 144 + }, + { + "epoch": 0.116, + "grad_norm": 0.37094580145774664, + "learning_rate": 0.00019617839052578603, + "loss": 0.6911, + "step": 145 + }, + { + "epoch": 0.1168, + "grad_norm": 0.3843275373661643, + "learning_rate": 0.00019610709403987246, + "loss": 0.7342, + "step": 146 + }, + { + "epoch": 0.1176, + "grad_norm": 0.37820878684303777, + "learning_rate": 0.0001960351518258255, + "loss": 0.7212, + "step": 147 + }, + { + "epoch": 0.1184, + "grad_norm": 0.3762651664537602, + "learning_rate": 0.00019596256436701324, + "loss": 0.7271, + "step": 148 + }, + { + "epoch": 0.1192, + "grad_norm": 0.3906758138124606, + "learning_rate": 0.00019588933215113926, + "loss": 0.7539, + "step": 149 + }, + { + "epoch": 0.12, + "grad_norm": 0.3950841107355177, + "learning_rate": 0.000195815455670239, + "loss": 0.7658, + "step": 150 + }, + { + "epoch": 0.1208, + "grad_norm": 0.41386345666241126, + "learning_rate": 0.00019574093542067673, + "loss": 0.7738, + "step": 151 + }, + { + "epoch": 0.1216, + "grad_norm": 0.3944209165179543, + "learning_rate": 0.00019566577190314197, + "loss": 0.6852, + "step": 152 + }, + { + "epoch": 0.1224, + "grad_norm": 0.40443701216227507, + "learning_rate": 0.0001955899656226464, + "loss": 0.764, + "step": 153 + }, + { + "epoch": 0.1232, + "grad_norm": 0.4053836691434719, + "learning_rate": 0.0001955135170885202, + "loss": 0.7958, + "step": 154 + }, + { + "epoch": 0.124, + "grad_norm": 0.38968536544727367, + "learning_rate": 0.0001954364268144088, + "loss": 0.6706, + "step": 155 + }, + { + "epoch": 0.1248, + "grad_norm": 0.4221878091531115, + "learning_rate": 0.00019535869531826937, + "loss": 0.7524, + "step": 156 + }, + { + "epoch": 0.1256, + "grad_norm": 0.37726503599066874, + "learning_rate": 0.00019528032312236736, + "loss": 0.7492, + "step": 157 + }, + { + "epoch": 0.1264, + "grad_norm": 0.37329107131334766, + "learning_rate": 0.00019520131075327298, + "loss": 0.7132, + "step": 158 + }, + { + "epoch": 0.1272, + "grad_norm": 0.40330351890304966, + "learning_rate": 0.00019512165874185767, + "loss": 0.7367, + "step": 159 + }, + { + "epoch": 0.128, + "grad_norm": 0.38379928388751416, + "learning_rate": 0.00019504136762329047, + "loss": 0.7165, + "step": 160 + }, + { + "epoch": 0.1288, + "grad_norm": 0.3754901811558422, + "learning_rate": 0.0001949604379370345, + "loss": 0.763, + "step": 161 + }, + { + "epoch": 0.1296, + "grad_norm": 0.4102122892411287, + "learning_rate": 0.00019487887022684336, + "loss": 0.7964, + "step": 162 + }, + { + "epoch": 0.1304, + "grad_norm": 0.40045070158740387, + "learning_rate": 0.00019479666504075736, + "loss": 0.7444, + "step": 163 + }, + { + "epoch": 0.1312, + "grad_norm": 0.3904879379908833, + "learning_rate": 0.00019471382293110003, + "loss": 0.7361, + "step": 164 + }, + { + "epoch": 0.132, + "grad_norm": 0.3901576587422018, + "learning_rate": 0.0001946303444544741, + "loss": 0.7447, + "step": 165 + }, + { + "epoch": 0.1328, + "grad_norm": 0.42698701509255066, + "learning_rate": 0.00019454623017175812, + "loss": 0.7129, + "step": 166 + }, + { + "epoch": 0.1336, + "grad_norm": 0.40973290680547114, + "learning_rate": 0.00019446148064810242, + "loss": 0.7629, + "step": 167 + }, + { + "epoch": 0.1344, + "grad_norm": 0.42713400703510906, + "learning_rate": 0.00019437609645292546, + "loss": 0.7244, + "step": 168 + }, + { + "epoch": 0.1352, + "grad_norm": 0.40000493907484663, + "learning_rate": 0.00019429007815990993, + "loss": 0.7246, + "step": 169 + }, + { + "epoch": 0.136, + "grad_norm": 0.407066195754339, + "learning_rate": 0.0001942034263469989, + "loss": 0.7265, + "step": 170 + }, + { + "epoch": 0.1368, + "grad_norm": 0.41535699810035354, + "learning_rate": 0.00019411614159639204, + "loss": 0.7627, + "step": 171 + }, + { + "epoch": 0.1376, + "grad_norm": 0.3624646836729746, + "learning_rate": 0.00019402822449454153, + "loss": 0.7513, + "step": 172 + }, + { + "epoch": 0.1384, + "grad_norm": 0.39284941439854026, + "learning_rate": 0.00019393967563214833, + "loss": 0.7494, + "step": 173 + }, + { + "epoch": 0.1392, + "grad_norm": 0.3884196538662921, + "learning_rate": 0.00019385049560415794, + "loss": 0.678, + "step": 174 + }, + { + "epoch": 0.14, + "grad_norm": 0.37265254992120467, + "learning_rate": 0.00019376068500975667, + "loss": 0.6806, + "step": 175 + }, + { + "epoch": 0.1408, + "grad_norm": 0.4275943898238247, + "learning_rate": 0.00019367024445236754, + "loss": 0.7078, + "step": 176 + }, + { + "epoch": 0.1416, + "grad_norm": 0.4065266567259306, + "learning_rate": 0.000193579174539646, + "loss": 0.7465, + "step": 177 + }, + { + "epoch": 0.1424, + "grad_norm": 0.3990288724103298, + "learning_rate": 0.00019348747588347637, + "loss": 0.6653, + "step": 178 + }, + { + "epoch": 0.1432, + "grad_norm": 0.3940172504770751, + "learning_rate": 0.00019339514909996706, + "loss": 0.7101, + "step": 179 + }, + { + "epoch": 0.144, + "grad_norm": 0.3788687571876394, + "learning_rate": 0.00019330219480944694, + "loss": 0.7772, + "step": 180 + }, + { + "epoch": 0.1448, + "grad_norm": 0.4312036206527422, + "learning_rate": 0.00019320861363646095, + "loss": 0.7452, + "step": 181 + }, + { + "epoch": 0.1456, + "grad_norm": 0.43294716581488774, + "learning_rate": 0.00019311440620976597, + "loss": 0.7133, + "step": 182 + }, + { + "epoch": 0.1464, + "grad_norm": 0.3759136819153871, + "learning_rate": 0.00019301957316232658, + "loss": 0.6569, + "step": 183 + }, + { + "epoch": 0.1472, + "grad_norm": 0.43297690007372713, + "learning_rate": 0.0001929241151313108, + "loss": 0.7342, + "step": 184 + }, + { + "epoch": 0.148, + "grad_norm": 0.40682691868312687, + "learning_rate": 0.0001928280327580858, + "loss": 0.7374, + "step": 185 + }, + { + "epoch": 0.1488, + "grad_norm": 0.39341849343313623, + "learning_rate": 0.00019273132668821364, + "loss": 0.7191, + "step": 186 + }, + { + "epoch": 0.1496, + "grad_norm": 0.4009073956965113, + "learning_rate": 0.00019263399757144683, + "loss": 0.7521, + "step": 187 + }, + { + "epoch": 0.1504, + "grad_norm": 0.3909729045474597, + "learning_rate": 0.00019253604606172417, + "loss": 0.7695, + "step": 188 + }, + { + "epoch": 0.1512, + "grad_norm": 0.4761266766104909, + "learning_rate": 0.000192437472817166, + "loss": 0.7076, + "step": 189 + }, + { + "epoch": 0.152, + "grad_norm": 0.36601341885699573, + "learning_rate": 0.00019233827850007027, + "loss": 0.7155, + "step": 190 + }, + { + "epoch": 0.1528, + "grad_norm": 0.3737945505162673, + "learning_rate": 0.00019223846377690754, + "loss": 0.6854, + "step": 191 + }, + { + "epoch": 0.1536, + "grad_norm": 0.40564425939183085, + "learning_rate": 0.00019213802931831696, + "loss": 0.6718, + "step": 192 + }, + { + "epoch": 0.1544, + "grad_norm": 0.41992512451788194, + "learning_rate": 0.00019203697579910154, + "loss": 0.7427, + "step": 193 + }, + { + "epoch": 0.1552, + "grad_norm": 0.3797669077452734, + "learning_rate": 0.00019193530389822363, + "loss": 0.7074, + "step": 194 + }, + { + "epoch": 0.156, + "grad_norm": 0.3821815634843998, + "learning_rate": 0.00019183301429880043, + "loss": 0.7322, + "step": 195 + }, + { + "epoch": 0.1568, + "grad_norm": 0.39209848345860876, + "learning_rate": 0.00019173010768809933, + "loss": 0.7087, + "step": 196 + }, + { + "epoch": 0.1576, + "grad_norm": 0.3673075795060958, + "learning_rate": 0.00019162658475753327, + "loss": 0.6942, + "step": 197 + }, + { + "epoch": 0.1584, + "grad_norm": 0.40762391950517585, + "learning_rate": 0.0001915224462026563, + "loss": 0.7172, + "step": 198 + }, + { + "epoch": 0.1592, + "grad_norm": 0.47168046809017583, + "learning_rate": 0.00019141769272315858, + "loss": 0.7269, + "step": 199 + }, + { + "epoch": 0.16, + "grad_norm": 0.3673248032142168, + "learning_rate": 0.00019131232502286188, + "loss": 0.6921, + "step": 200 + }, + { + "epoch": 0.1608, + "grad_norm": 0.40793413927631295, + "learning_rate": 0.00019120634380971496, + "loss": 0.7065, + "step": 201 + }, + { + "epoch": 0.1616, + "grad_norm": 0.39571471829755545, + "learning_rate": 0.0001910997497957885, + "loss": 0.7011, + "step": 202 + }, + { + "epoch": 0.1624, + "grad_norm": 0.47734958206069844, + "learning_rate": 0.0001909925436972706, + "loss": 0.7222, + "step": 203 + }, + { + "epoch": 0.1632, + "grad_norm": 0.3959188333021286, + "learning_rate": 0.00019088472623446183, + "loss": 0.728, + "step": 204 + }, + { + "epoch": 0.164, + "grad_norm": 0.41699901529985056, + "learning_rate": 0.00019077629813177036, + "loss": 0.6943, + "step": 205 + }, + { + "epoch": 0.1648, + "grad_norm": 0.3735553986044611, + "learning_rate": 0.00019066726011770726, + "loss": 0.7062, + "step": 206 + }, + { + "epoch": 0.1656, + "grad_norm": 0.41784251839692316, + "learning_rate": 0.00019055761292488142, + "loss": 0.7848, + "step": 207 + }, + { + "epoch": 0.1664, + "grad_norm": 0.38059219219418583, + "learning_rate": 0.0001904473572899947, + "loss": 0.7059, + "step": 208 + }, + { + "epoch": 0.1672, + "grad_norm": 0.3624543403043051, + "learning_rate": 0.00019033649395383702, + "loss": 0.735, + "step": 209 + }, + { + "epoch": 0.168, + "grad_norm": 0.8872087505840575, + "learning_rate": 0.00019022502366128135, + "loss": 0.7237, + "step": 210 + }, + { + "epoch": 0.1688, + "grad_norm": 0.3781844792195068, + "learning_rate": 0.00019011294716127867, + "loss": 0.781, + "step": 211 + }, + { + "epoch": 0.1696, + "grad_norm": 0.3618561686453649, + "learning_rate": 0.00019000026520685302, + "loss": 0.6432, + "step": 212 + }, + { + "epoch": 0.1704, + "grad_norm": 0.3771379156643068, + "learning_rate": 0.0001898869785550963, + "loss": 0.7163, + "step": 213 + }, + { + "epoch": 0.1712, + "grad_norm": 0.3784585866949077, + "learning_rate": 0.0001897730879671634, + "loss": 0.6731, + "step": 214 + }, + { + "epoch": 0.172, + "grad_norm": 0.3662423072781949, + "learning_rate": 0.00018965859420826684, + "loss": 0.6866, + "step": 215 + }, + { + "epoch": 0.1728, + "grad_norm": 0.38826740426565903, + "learning_rate": 0.00018954349804767184, + "loss": 0.7774, + "step": 216 + }, + { + "epoch": 0.1736, + "grad_norm": 0.3932066536646525, + "learning_rate": 0.00018942780025869098, + "loss": 0.7286, + "step": 217 + }, + { + "epoch": 0.1744, + "grad_norm": 0.36997603460335343, + "learning_rate": 0.00018931150161867916, + "loss": 0.6523, + "step": 218 + }, + { + "epoch": 0.1752, + "grad_norm": 0.3918279402399443, + "learning_rate": 0.00018919460290902826, + "loss": 0.7047, + "step": 219 + }, + { + "epoch": 0.176, + "grad_norm": 0.39326362429181955, + "learning_rate": 0.00018907710491516199, + "loss": 0.75, + "step": 220 + }, + { + "epoch": 0.1768, + "grad_norm": 0.43585572564626895, + "learning_rate": 0.0001889590084265304, + "loss": 0.7153, + "step": 221 + }, + { + "epoch": 0.1776, + "grad_norm": 0.393579939630381, + "learning_rate": 0.0001888403142366049, + "loss": 0.7493, + "step": 222 + }, + { + "epoch": 0.1784, + "grad_norm": 0.37447374558504265, + "learning_rate": 0.0001887210231428727, + "loss": 0.6896, + "step": 223 + }, + { + "epoch": 0.1792, + "grad_norm": 0.3915094461951344, + "learning_rate": 0.00018860113594683148, + "loss": 0.6808, + "step": 224 + }, + { + "epoch": 0.18, + "grad_norm": 0.3773715484147197, + "learning_rate": 0.0001884806534539841, + "loss": 0.7599, + "step": 225 + }, + { + "epoch": 0.1808, + "grad_norm": 0.3994812555017749, + "learning_rate": 0.00018835957647383303, + "loss": 0.7287, + "step": 226 + }, + { + "epoch": 0.1816, + "grad_norm": 0.36481961860846446, + "learning_rate": 0.0001882379058198751, + "loss": 0.6859, + "step": 227 + }, + { + "epoch": 0.1824, + "grad_norm": 0.3670019666608475, + "learning_rate": 0.00018811564230959588, + "loss": 0.6722, + "step": 228 + }, + { + "epoch": 0.1832, + "grad_norm": 0.3903432748130376, + "learning_rate": 0.00018799278676446423, + "loss": 0.7106, + "step": 229 + }, + { + "epoch": 0.184, + "grad_norm": 0.39401899871180873, + "learning_rate": 0.00018786934000992688, + "loss": 0.701, + "step": 230 + }, + { + "epoch": 0.1848, + "grad_norm": 0.3753278029930899, + "learning_rate": 0.00018774530287540278, + "loss": 0.6691, + "step": 231 + }, + { + "epoch": 0.1856, + "grad_norm": 0.40961898544625003, + "learning_rate": 0.00018762067619427746, + "loss": 0.7085, + "step": 232 + }, + { + "epoch": 0.1864, + "grad_norm": 0.4002125511109632, + "learning_rate": 0.00018749546080389757, + "loss": 0.7739, + "step": 233 + }, + { + "epoch": 0.1872, + "grad_norm": 0.37394434731730075, + "learning_rate": 0.00018736965754556528, + "loss": 0.7413, + "step": 234 + }, + { + "epoch": 0.188, + "grad_norm": 0.3675891639914627, + "learning_rate": 0.00018724326726453244, + "loss": 0.711, + "step": 235 + }, + { + "epoch": 0.1888, + "grad_norm": 0.3721632182237883, + "learning_rate": 0.00018711629080999504, + "loss": 0.7038, + "step": 236 + }, + { + "epoch": 0.1896, + "grad_norm": 0.3842121632076663, + "learning_rate": 0.00018698872903508755, + "loss": 0.7093, + "step": 237 + }, + { + "epoch": 0.1904, + "grad_norm": 0.39132717878114504, + "learning_rate": 0.00018686058279687698, + "loss": 0.7617, + "step": 238 + }, + { + "epoch": 0.1912, + "grad_norm": 0.40541838904954997, + "learning_rate": 0.0001867318529563574, + "loss": 0.7113, + "step": 239 + }, + { + "epoch": 0.192, + "grad_norm": 0.40580848314374224, + "learning_rate": 0.00018660254037844388, + "loss": 0.7234, + "step": 240 + }, + { + "epoch": 0.1928, + "grad_norm": 0.3788393820883769, + "learning_rate": 0.00018647264593196688, + "loss": 0.7089, + "step": 241 + }, + { + "epoch": 0.1936, + "grad_norm": 0.40589206564731684, + "learning_rate": 0.00018634217048966637, + "loss": 0.7425, + "step": 242 + }, + { + "epoch": 0.1944, + "grad_norm": 0.38617912332436866, + "learning_rate": 0.00018621111492818585, + "loss": 0.7176, + "step": 243 + }, + { + "epoch": 0.1952, + "grad_norm": 0.3895654787912917, + "learning_rate": 0.0001860794801280666, + "loss": 0.7395, + "step": 244 + }, + { + "epoch": 0.196, + "grad_norm": 0.3775653943994879, + "learning_rate": 0.00018594726697374175, + "loss": 0.7397, + "step": 245 + }, + { + "epoch": 0.1968, + "grad_norm": 0.3969607038613283, + "learning_rate": 0.0001858144763535302, + "loss": 0.7164, + "step": 246 + }, + { + "epoch": 0.1976, + "grad_norm": 0.37801038885076754, + "learning_rate": 0.0001856811091596308, + "loss": 0.6721, + "step": 247 + }, + { + "epoch": 0.1984, + "grad_norm": 0.409233055247135, + "learning_rate": 0.0001855471662881164, + "loss": 0.7397, + "step": 248 + }, + { + "epoch": 0.1992, + "grad_norm": 0.3846321370323925, + "learning_rate": 0.00018541264863892754, + "loss": 0.6769, + "step": 249 + }, + { + "epoch": 0.2, + "grad_norm": 0.3836594402578531, + "learning_rate": 0.00018527755711586678, + "loss": 0.73, + "step": 250 + }, + { + "epoch": 0.2008, + "grad_norm": 0.3698135875805448, + "learning_rate": 0.00018514189262659235, + "loss": 0.7178, + "step": 251 + }, + { + "epoch": 0.2016, + "grad_norm": 0.3800110014298568, + "learning_rate": 0.00018500565608261214, + "loss": 0.729, + "step": 252 + }, + { + "epoch": 0.2024, + "grad_norm": 0.3824003452606007, + "learning_rate": 0.00018486884839927768, + "loss": 0.6834, + "step": 253 + }, + { + "epoch": 0.2032, + "grad_norm": 0.3819552032844194, + "learning_rate": 0.00018473147049577774, + "loss": 0.7049, + "step": 254 + }, + { + "epoch": 0.204, + "grad_norm": 0.38025311113168603, + "learning_rate": 0.0001845935232951325, + "loss": 0.6839, + "step": 255 + }, + { + "epoch": 0.2048, + "grad_norm": 0.4074890859906348, + "learning_rate": 0.00018445500772418697, + "loss": 0.7, + "step": 256 + }, + { + "epoch": 0.2056, + "grad_norm": 0.3869044169593943, + "learning_rate": 0.00018431592471360503, + "loss": 0.719, + "step": 257 + }, + { + "epoch": 0.2064, + "grad_norm": 0.3997831414626095, + "learning_rate": 0.00018417627519786315, + "loss": 0.7497, + "step": 258 + }, + { + "epoch": 0.2072, + "grad_norm": 0.3798744672159815, + "learning_rate": 0.000184036060115244, + "loss": 0.7397, + "step": 259 + }, + { + "epoch": 0.208, + "grad_norm": 0.4315916270774472, + "learning_rate": 0.00018389528040783012, + "loss": 0.7412, + "step": 260 + }, + { + "epoch": 0.2088, + "grad_norm": 0.3481358218167205, + "learning_rate": 0.00018375393702149787, + "loss": 0.6752, + "step": 261 + }, + { + "epoch": 0.2096, + "grad_norm": 0.36423243940453054, + "learning_rate": 0.00018361203090591071, + "loss": 0.6756, + "step": 262 + }, + { + "epoch": 0.2104, + "grad_norm": 0.3932085351565385, + "learning_rate": 0.00018346956301451304, + "loss": 0.7467, + "step": 263 + }, + { + "epoch": 0.2112, + "grad_norm": 0.3669609029970178, + "learning_rate": 0.00018332653430452376, + "loss": 0.7016, + "step": 264 + }, + { + "epoch": 0.212, + "grad_norm": 0.38172853932274353, + "learning_rate": 0.00018318294573692985, + "loss": 0.687, + "step": 265 + }, + { + "epoch": 0.2128, + "grad_norm": 0.391962458393, + "learning_rate": 0.00018303879827647975, + "loss": 0.6936, + "step": 266 + }, + { + "epoch": 0.2136, + "grad_norm": 0.38628359351249747, + "learning_rate": 0.0001828940928916772, + "loss": 0.7466, + "step": 267 + }, + { + "epoch": 0.2144, + "grad_norm": 0.3919425169241573, + "learning_rate": 0.00018274883055477436, + "loss": 0.6973, + "step": 268 + }, + { + "epoch": 0.2152, + "grad_norm": 0.37985313895545575, + "learning_rate": 0.00018260301224176558, + "loss": 0.6682, + "step": 269 + }, + { + "epoch": 0.216, + "grad_norm": 0.41308411411507867, + "learning_rate": 0.00018245663893238075, + "loss": 0.7446, + "step": 270 + }, + { + "epoch": 0.2168, + "grad_norm": 0.36923000783816107, + "learning_rate": 0.00018230971161007853, + "loss": 0.6944, + "step": 271 + }, + { + "epoch": 0.2176, + "grad_norm": 0.38912255752294606, + "learning_rate": 0.00018216223126204007, + "loss": 0.6824, + "step": 272 + }, + { + "epoch": 0.2184, + "grad_norm": 0.37516157859785415, + "learning_rate": 0.00018201419887916214, + "loss": 0.6835, + "step": 273 + }, + { + "epoch": 0.2192, + "grad_norm": 0.36077449717955606, + "learning_rate": 0.00018186561545605054, + "loss": 0.6689, + "step": 274 + }, + { + "epoch": 0.22, + "grad_norm": 0.36762261756990633, + "learning_rate": 0.00018171648199101346, + "loss": 0.7054, + "step": 275 + }, + { + "epoch": 0.2208, + "grad_norm": 0.3661342722722111, + "learning_rate": 0.00018156679948605467, + "loss": 0.715, + "step": 276 + }, + { + "epoch": 0.2216, + "grad_norm": 0.38269955506386627, + "learning_rate": 0.00018141656894686689, + "loss": 0.7714, + "step": 277 + }, + { + "epoch": 0.2224, + "grad_norm": 0.3688490506054277, + "learning_rate": 0.00018126579138282503, + "loss": 0.7426, + "step": 278 + }, + { + "epoch": 0.2232, + "grad_norm": 0.3873509578960858, + "learning_rate": 0.00018111446780697929, + "loss": 0.7516, + "step": 279 + }, + { + "epoch": 0.224, + "grad_norm": 0.3761794380044784, + "learning_rate": 0.0001809625992360485, + "loss": 0.7079, + "step": 280 + }, + { + "epoch": 0.2248, + "grad_norm": 0.40056581928299334, + "learning_rate": 0.00018081018669041324, + "loss": 0.695, + "step": 281 + }, + { + "epoch": 0.2256, + "grad_norm": 0.396306174344962, + "learning_rate": 0.00018065723119410884, + "loss": 0.7512, + "step": 282 + }, + { + "epoch": 0.2264, + "grad_norm": 0.36911161649464735, + "learning_rate": 0.00018050373377481878, + "loss": 0.7422, + "step": 283 + }, + { + "epoch": 0.2272, + "grad_norm": 0.37875624220379006, + "learning_rate": 0.00018034969546386757, + "loss": 0.7027, + "step": 284 + }, + { + "epoch": 0.228, + "grad_norm": 0.38206572357858626, + "learning_rate": 0.0001801951172962139, + "loss": 0.7182, + "step": 285 + }, + { + "epoch": 0.2288, + "grad_norm": 0.44087069157672043, + "learning_rate": 0.0001800400003104436, + "loss": 0.712, + "step": 286 + }, + { + "epoch": 0.2296, + "grad_norm": 0.39564240353912855, + "learning_rate": 0.0001798843455487629, + "loss": 0.7011, + "step": 287 + }, + { + "epoch": 0.2304, + "grad_norm": 0.37477423593554254, + "learning_rate": 0.00017972815405699103, + "loss": 0.7007, + "step": 288 + }, + { + "epoch": 0.2312, + "grad_norm": 0.38532593342694543, + "learning_rate": 0.00017957142688455362, + "loss": 0.6883, + "step": 289 + }, + { + "epoch": 0.232, + "grad_norm": 0.36152163663724846, + "learning_rate": 0.00017941416508447536, + "loss": 0.6788, + "step": 290 + }, + { + "epoch": 0.2328, + "grad_norm": 0.4099338561938528, + "learning_rate": 0.00017925636971337304, + "loss": 0.7382, + "step": 291 + }, + { + "epoch": 0.2336, + "grad_norm": 0.36593122698903263, + "learning_rate": 0.0001790980418314484, + "loss": 0.6662, + "step": 292 + }, + { + "epoch": 0.2344, + "grad_norm": 0.3718410704563449, + "learning_rate": 0.00017893918250248104, + "loss": 0.6723, + "step": 293 + }, + { + "epoch": 0.2352, + "grad_norm": 0.38015563138293135, + "learning_rate": 0.00017877979279382135, + "loss": 0.6681, + "step": 294 + }, + { + "epoch": 0.236, + "grad_norm": 0.370824964904487, + "learning_rate": 0.00017861987377638312, + "loss": 0.6981, + "step": 295 + }, + { + "epoch": 0.2368, + "grad_norm": 0.37190425522333226, + "learning_rate": 0.0001784594265246366, + "loss": 0.6915, + "step": 296 + }, + { + "epoch": 0.2376, + "grad_norm": 0.4134502690104631, + "learning_rate": 0.0001782984521166011, + "loss": 0.6933, + "step": 297 + }, + { + "epoch": 0.2384, + "grad_norm": 0.3958332078622072, + "learning_rate": 0.0001781369516338378, + "loss": 0.7158, + "step": 298 + }, + { + "epoch": 0.2392, + "grad_norm": 0.3877104741157981, + "learning_rate": 0.00017797492616144256, + "loss": 0.6979, + "step": 299 + }, + { + "epoch": 0.24, + "grad_norm": 0.3801598621787254, + "learning_rate": 0.00017781237678803847, + "loss": 0.6658, + "step": 300 + }, + { + "epoch": 0.2408, + "grad_norm": 0.37172870557646764, + "learning_rate": 0.00017764930460576866, + "loss": 0.7234, + "step": 301 + }, + { + "epoch": 0.2416, + "grad_norm": 0.3869693539717847, + "learning_rate": 0.000177485710710289, + "loss": 0.7424, + "step": 302 + }, + { + "epoch": 0.2424, + "grad_norm": 0.39380608987801785, + "learning_rate": 0.00017732159620076053, + "loss": 0.7212, + "step": 303 + }, + { + "epoch": 0.2432, + "grad_norm": 0.3836380194679643, + "learning_rate": 0.00017715696217984235, + "loss": 0.7015, + "step": 304 + }, + { + "epoch": 0.244, + "grad_norm": 0.38294391568201475, + "learning_rate": 0.00017699180975368396, + "loss": 0.6728, + "step": 305 + }, + { + "epoch": 0.2448, + "grad_norm": 0.3684407971034021, + "learning_rate": 0.00017682614003191807, + "loss": 0.7276, + "step": 306 + }, + { + "epoch": 0.2456, + "grad_norm": 0.36782914865474786, + "learning_rate": 0.00017665995412765285, + "loss": 0.6935, + "step": 307 + }, + { + "epoch": 0.2464, + "grad_norm": 0.3547005180706965, + "learning_rate": 0.00017649325315746478, + "loss": 0.6801, + "step": 308 + }, + { + "epoch": 0.2472, + "grad_norm": 0.37358925011534083, + "learning_rate": 0.00017632603824139085, + "loss": 0.7104, + "step": 309 + }, + { + "epoch": 0.248, + "grad_norm": 0.3691174540695535, + "learning_rate": 0.0001761583105029213, + "loss": 0.6772, + "step": 310 + }, + { + "epoch": 0.2488, + "grad_norm": 0.3810745954314981, + "learning_rate": 0.0001759900710689918, + "loss": 0.7494, + "step": 311 + }, + { + "epoch": 0.2496, + "grad_norm": 0.37714165675070244, + "learning_rate": 0.00017582132106997616, + "loss": 0.7016, + "step": 312 + }, + { + "epoch": 0.2504, + "grad_norm": 0.3566572512605275, + "learning_rate": 0.00017565206163967846, + "loss": 0.664, + "step": 313 + }, + { + "epoch": 0.2512, + "grad_norm": 0.37756734385677954, + "learning_rate": 0.00017548229391532572, + "loss": 0.7196, + "step": 314 + }, + { + "epoch": 0.252, + "grad_norm": 0.38629987639451213, + "learning_rate": 0.00017531201903755994, + "loss": 0.669, + "step": 315 + }, + { + "epoch": 0.2528, + "grad_norm": 0.36129876085632506, + "learning_rate": 0.00017514123815043074, + "loss": 0.6862, + "step": 316 + }, + { + "epoch": 0.2536, + "grad_norm": 0.36290271691213977, + "learning_rate": 0.00017496995240138744, + "loss": 0.6881, + "step": 317 + }, + { + "epoch": 0.2544, + "grad_norm": 0.36828837884499055, + "learning_rate": 0.00017479816294127152, + "loss": 0.6885, + "step": 318 + }, + { + "epoch": 0.2552, + "grad_norm": 0.3947123381892617, + "learning_rate": 0.00017462587092430875, + "loss": 0.6739, + "step": 319 + }, + { + "epoch": 0.256, + "grad_norm": 0.3710342070647339, + "learning_rate": 0.0001744530775081015, + "loss": 0.6682, + "step": 320 + }, + { + "epoch": 0.2568, + "grad_norm": 0.3568277558353811, + "learning_rate": 0.00017427978385362112, + "loss": 0.6023, + "step": 321 + }, + { + "epoch": 0.2576, + "grad_norm": 0.40096883734102273, + "learning_rate": 0.0001741059911251997, + "loss": 0.6938, + "step": 322 + }, + { + "epoch": 0.2584, + "grad_norm": 0.39366913430025, + "learning_rate": 0.0001739317004905227, + "loss": 0.7283, + "step": 323 + }, + { + "epoch": 0.2592, + "grad_norm": 0.3899719501341565, + "learning_rate": 0.000173756913120621, + "loss": 0.6953, + "step": 324 + }, + { + "epoch": 0.26, + "grad_norm": 0.3811060149312132, + "learning_rate": 0.00017358163018986282, + "loss": 0.6923, + "step": 325 + }, + { + "epoch": 0.2608, + "grad_norm": 0.3877184901266767, + "learning_rate": 0.00017340585287594604, + "loss": 0.7181, + "step": 326 + }, + { + "epoch": 0.2616, + "grad_norm": 0.37570487164894645, + "learning_rate": 0.00017322958235989016, + "loss": 0.6903, + "step": 327 + }, + { + "epoch": 0.2624, + "grad_norm": 0.3990768334325875, + "learning_rate": 0.0001730528198260285, + "loss": 0.7045, + "step": 328 + }, + { + "epoch": 0.2632, + "grad_norm": 0.36468017035100203, + "learning_rate": 0.00017287556646200018, + "loss": 0.7439, + "step": 329 + }, + { + "epoch": 0.264, + "grad_norm": 0.3709488792010069, + "learning_rate": 0.00017269782345874203, + "loss": 0.6447, + "step": 330 + }, + { + "epoch": 0.2648, + "grad_norm": 0.3855312196505767, + "learning_rate": 0.00017251959201048083, + "loss": 0.6968, + "step": 331 + }, + { + "epoch": 0.2656, + "grad_norm": 0.37081730330401114, + "learning_rate": 0.00017234087331472497, + "loss": 0.7141, + "step": 332 + }, + { + "epoch": 0.2664, + "grad_norm": 0.39710496855717275, + "learning_rate": 0.00017216166857225674, + "loss": 0.7059, + "step": 333 + }, + { + "epoch": 0.2672, + "grad_norm": 0.4175724196477424, + "learning_rate": 0.00017198197898712404, + "loss": 0.6963, + "step": 334 + }, + { + "epoch": 0.268, + "grad_norm": 0.3842215052177171, + "learning_rate": 0.00017180180576663228, + "loss": 0.7422, + "step": 335 + }, + { + "epoch": 0.2688, + "grad_norm": 0.34944285190348157, + "learning_rate": 0.00017162115012133643, + "loss": 0.7044, + "step": 336 + }, + { + "epoch": 0.2696, + "grad_norm": 0.36351526740228046, + "learning_rate": 0.00017144001326503273, + "loss": 0.699, + "step": 337 + }, + { + "epoch": 0.2704, + "grad_norm": 0.36884647531457104, + "learning_rate": 0.00017125839641475072, + "loss": 0.6755, + "step": 338 + }, + { + "epoch": 0.2712, + "grad_norm": 0.35857535515859895, + "learning_rate": 0.00017107630079074478, + "loss": 0.6806, + "step": 339 + }, + { + "epoch": 0.272, + "grad_norm": 0.3923666857697795, + "learning_rate": 0.00017089372761648616, + "loss": 0.6936, + "step": 340 + }, + { + "epoch": 0.2728, + "grad_norm": 0.38849877071620453, + "learning_rate": 0.00017071067811865476, + "loss": 0.7046, + "step": 341 + }, + { + "epoch": 0.2736, + "grad_norm": 0.3584263264067099, + "learning_rate": 0.00017052715352713075, + "loss": 0.6597, + "step": 342 + }, + { + "epoch": 0.2744, + "grad_norm": 0.36138589740234983, + "learning_rate": 0.00017034315507498635, + "loss": 0.6663, + "step": 343 + }, + { + "epoch": 0.2752, + "grad_norm": 0.38527180068415556, + "learning_rate": 0.00017015868399847768, + "loss": 0.6825, + "step": 344 + }, + { + "epoch": 0.276, + "grad_norm": 0.40412844856312724, + "learning_rate": 0.00016997374153703625, + "loss": 0.6563, + "step": 345 + }, + { + "epoch": 0.2768, + "grad_norm": 0.37401398327720353, + "learning_rate": 0.00016978832893326074, + "loss": 0.6698, + "step": 346 + }, + { + "epoch": 0.2776, + "grad_norm": 0.40515318586757254, + "learning_rate": 0.00016960244743290868, + "loss": 0.7499, + "step": 347 + }, + { + "epoch": 0.2784, + "grad_norm": 0.35146950749093353, + "learning_rate": 0.00016941609828488807, + "loss": 0.647, + "step": 348 + }, + { + "epoch": 0.2792, + "grad_norm": 0.4063808336460343, + "learning_rate": 0.00016922928274124886, + "loss": 0.7205, + "step": 349 + }, + { + "epoch": 0.28, + "grad_norm": 0.38644229466527624, + "learning_rate": 0.0001690420020571747, + "loss": 0.7077, + "step": 350 + }, + { + "epoch": 0.2808, + "grad_norm": 0.3420681264350092, + "learning_rate": 0.00016885425749097444, + "loss": 0.6289, + "step": 351 + }, + { + "epoch": 0.2816, + "grad_norm": 0.38308061028907253, + "learning_rate": 0.0001686660503040737, + "loss": 0.6857, + "step": 352 + }, + { + "epoch": 0.2824, + "grad_norm": 0.3674045490160201, + "learning_rate": 0.00016847738176100632, + "loss": 0.7407, + "step": 353 + }, + { + "epoch": 0.2832, + "grad_norm": 0.36874718203802986, + "learning_rate": 0.00016828825312940592, + "loss": 0.7133, + "step": 354 + }, + { + "epoch": 0.284, + "grad_norm": 0.373818222715427, + "learning_rate": 0.0001680986656799975, + "loss": 0.6957, + "step": 355 + }, + { + "epoch": 0.2848, + "grad_norm": 0.3762865907231924, + "learning_rate": 0.0001679086206865886, + "loss": 0.7396, + "step": 356 + }, + { + "epoch": 0.2856, + "grad_norm": 0.36300277570336514, + "learning_rate": 0.00016771811942606108, + "loss": 0.6652, + "step": 357 + }, + { + "epoch": 0.2864, + "grad_norm": 0.3904917705539183, + "learning_rate": 0.00016752716317836229, + "loss": 0.684, + "step": 358 + }, + { + "epoch": 0.2872, + "grad_norm": 0.37199677823710336, + "learning_rate": 0.00016733575322649657, + "loss": 0.6951, + "step": 359 + }, + { + "epoch": 0.288, + "grad_norm": 0.3985541089788225, + "learning_rate": 0.0001671438908565167, + "loss": 0.747, + "step": 360 + }, + { + "epoch": 0.2888, + "grad_norm": 0.3747357313142051, + "learning_rate": 0.00016695157735751513, + "loss": 0.6941, + "step": 361 + }, + { + "epoch": 0.2896, + "grad_norm": 0.37858108431847404, + "learning_rate": 0.00016675881402161536, + "loss": 0.7125, + "step": 362 + }, + { + "epoch": 0.2904, + "grad_norm": 0.35862584195531766, + "learning_rate": 0.0001665656021439633, + "loss": 0.6779, + "step": 363 + }, + { + "epoch": 0.2912, + "grad_norm": 0.3429005769754215, + "learning_rate": 0.0001663719430227186, + "loss": 0.6994, + "step": 364 + }, + { + "epoch": 0.292, + "grad_norm": 0.37876750251492375, + "learning_rate": 0.00016617783795904565, + "loss": 0.6804, + "step": 365 + }, + { + "epoch": 0.2928, + "grad_norm": 0.3468960179430786, + "learning_rate": 0.00016598328825710533, + "loss": 0.6676, + "step": 366 + }, + { + "epoch": 0.2936, + "grad_norm": 0.3831747796081473, + "learning_rate": 0.00016578829522404583, + "loss": 0.6808, + "step": 367 + }, + { + "epoch": 0.2944, + "grad_norm": 0.37841376785544, + "learning_rate": 0.000165592860169994, + "loss": 0.6752, + "step": 368 + }, + { + "epoch": 0.2952, + "grad_norm": 0.4279316248084326, + "learning_rate": 0.00016539698440804661, + "loss": 0.7185, + "step": 369 + }, + { + "epoch": 0.296, + "grad_norm": 0.37601604528298943, + "learning_rate": 0.00016520066925426144, + "loss": 0.6702, + "step": 370 + }, + { + "epoch": 0.2968, + "grad_norm": 0.3505774673496094, + "learning_rate": 0.0001650039160276485, + "loss": 0.6946, + "step": 371 + }, + { + "epoch": 0.2976, + "grad_norm": 0.3664877863818731, + "learning_rate": 0.0001648067260501611, + "loss": 0.6836, + "step": 372 + }, + { + "epoch": 0.2984, + "grad_norm": 0.38420617090256015, + "learning_rate": 0.0001646091006466871, + "loss": 0.6758, + "step": 373 + }, + { + "epoch": 0.2992, + "grad_norm": 0.36007075289624113, + "learning_rate": 0.0001644110411450398, + "loss": 0.6639, + "step": 374 + }, + { + "epoch": 0.3, + "grad_norm": 0.35828089436110433, + "learning_rate": 0.00016421254887594917, + "loss": 0.6958, + "step": 375 + }, + { + "epoch": 0.3008, + "grad_norm": 0.3771595610691394, + "learning_rate": 0.00016401362517305296, + "loss": 0.7302, + "step": 376 + }, + { + "epoch": 0.3016, + "grad_norm": 0.35494018975711594, + "learning_rate": 0.00016381427137288754, + "loss": 0.6509, + "step": 377 + }, + { + "epoch": 0.3024, + "grad_norm": 0.3557291271239453, + "learning_rate": 0.00016361448881487914, + "loss": 0.6866, + "step": 378 + }, + { + "epoch": 0.3032, + "grad_norm": 0.36194769111631925, + "learning_rate": 0.0001634142788413346, + "loss": 0.6794, + "step": 379 + }, + { + "epoch": 0.304, + "grad_norm": 0.36763967338423365, + "learning_rate": 0.00016321364279743266, + "loss": 0.6702, + "step": 380 + }, + { + "epoch": 0.3048, + "grad_norm": 0.34966585380789356, + "learning_rate": 0.00016301258203121462, + "loss": 0.7094, + "step": 381 + }, + { + "epoch": 0.3056, + "grad_norm": 0.3514193489828989, + "learning_rate": 0.0001628110978935756, + "loss": 0.6858, + "step": 382 + }, + { + "epoch": 0.3064, + "grad_norm": 0.39152681449741006, + "learning_rate": 0.00016260919173825508, + "loss": 0.7274, + "step": 383 + }, + { + "epoch": 0.3072, + "grad_norm": 0.35755785523271066, + "learning_rate": 0.00016240686492182804, + "loss": 0.6865, + "step": 384 + }, + { + "epoch": 0.308, + "grad_norm": 0.3614439044420442, + "learning_rate": 0.00016220411880369601, + "loss": 0.7082, + "step": 385 + }, + { + "epoch": 0.3088, + "grad_norm": 0.358590942979874, + "learning_rate": 0.00016200095474607753, + "loss": 0.7045, + "step": 386 + }, + { + "epoch": 0.3096, + "grad_norm": 0.39159248919640544, + "learning_rate": 0.00016179737411399926, + "loss": 0.7603, + "step": 387 + }, + { + "epoch": 0.3104, + "grad_norm": 0.3712266427733067, + "learning_rate": 0.00016159337827528685, + "loss": 0.6877, + "step": 388 + }, + { + "epoch": 0.3112, + "grad_norm": 0.3864394812212985, + "learning_rate": 0.00016138896860055555, + "loss": 0.7502, + "step": 389 + }, + { + "epoch": 0.312, + "grad_norm": 0.37697745012714656, + "learning_rate": 0.0001611841464632011, + "loss": 0.6384, + "step": 390 + }, + { + "epoch": 0.3128, + "grad_norm": 0.38798423687896516, + "learning_rate": 0.00016097891323939062, + "loss": 0.7353, + "step": 391 + }, + { + "epoch": 0.3136, + "grad_norm": 0.3595948778744978, + "learning_rate": 0.0001607732703080532, + "loss": 0.7087, + "step": 392 + }, + { + "epoch": 0.3144, + "grad_norm": 0.37010313959677527, + "learning_rate": 0.00016056721905087056, + "loss": 0.7016, + "step": 393 + }, + { + "epoch": 0.3152, + "grad_norm": 0.35207553195728936, + "learning_rate": 0.00016036076085226814, + "loss": 0.6507, + "step": 394 + }, + { + "epoch": 0.316, + "grad_norm": 0.3415757077970016, + "learning_rate": 0.00016015389709940538, + "loss": 0.7226, + "step": 395 + }, + { + "epoch": 0.3168, + "grad_norm": 0.36246577422721343, + "learning_rate": 0.0001599466291821666, + "loss": 0.6889, + "step": 396 + }, + { + "epoch": 0.3176, + "grad_norm": 0.3874584927068638, + "learning_rate": 0.0001597389584931517, + "loss": 0.692, + "step": 397 + }, + { + "epoch": 0.3184, + "grad_norm": 0.35050336327947873, + "learning_rate": 0.0001595308864276666, + "loss": 0.6844, + "step": 398 + }, + { + "epoch": 0.3192, + "grad_norm": 0.379138324147785, + "learning_rate": 0.0001593224143837142, + "loss": 0.6694, + "step": 399 + }, + { + "epoch": 0.32, + "grad_norm": 0.35029682023932, + "learning_rate": 0.0001591135437619847, + "loss": 0.6303, + "step": 400 + }, + { + "epoch": 0.3208, + "grad_norm": 0.36710781427868844, + "learning_rate": 0.00015890427596584617, + "loss": 0.6583, + "step": 401 + }, + { + "epoch": 0.3216, + "grad_norm": 0.3869140830738328, + "learning_rate": 0.0001586946124013354, + "loss": 0.6907, + "step": 402 + }, + { + "epoch": 0.3224, + "grad_norm": 0.375016132656621, + "learning_rate": 0.00015848455447714822, + "loss": 0.752, + "step": 403 + }, + { + "epoch": 0.3232, + "grad_norm": 0.34846735999551737, + "learning_rate": 0.0001582741036046301, + "loss": 0.6605, + "step": 404 + }, + { + "epoch": 0.324, + "grad_norm": 0.36763862476695747, + "learning_rate": 0.00015806326119776663, + "loss": 0.6912, + "step": 405 + }, + { + "epoch": 0.3248, + "grad_norm": 0.36866573941258934, + "learning_rate": 0.00015785202867317407, + "loss": 0.6729, + "step": 406 + }, + { + "epoch": 0.3256, + "grad_norm": 0.35520848274255595, + "learning_rate": 0.00015764040745008988, + "loss": 0.6989, + "step": 407 + }, + { + "epoch": 0.3264, + "grad_norm": 0.3465772132574022, + "learning_rate": 0.00015742839895036305, + "loss": 0.6339, + "step": 408 + }, + { + "epoch": 0.3272, + "grad_norm": 0.3725783884243975, + "learning_rate": 0.00015721600459844468, + "loss": 0.7124, + "step": 409 + }, + { + "epoch": 0.328, + "grad_norm": 0.37677205954801835, + "learning_rate": 0.00015700322582137827, + "loss": 0.7082, + "step": 410 + }, + { + "epoch": 0.3288, + "grad_norm": 0.3781493059820502, + "learning_rate": 0.00015679006404879033, + "loss": 0.7119, + "step": 411 + }, + { + "epoch": 0.3296, + "grad_norm": 0.3758004699076751, + "learning_rate": 0.0001565765207128805, + "loss": 0.6938, + "step": 412 + }, + { + "epoch": 0.3304, + "grad_norm": 0.3614628036787356, + "learning_rate": 0.00015636259724841222, + "loss": 0.6454, + "step": 413 + }, + { + "epoch": 0.3312, + "grad_norm": 0.3803283009276044, + "learning_rate": 0.0001561482950927029, + "loss": 0.7137, + "step": 414 + }, + { + "epoch": 0.332, + "grad_norm": 0.356995580870094, + "learning_rate": 0.00015593361568561428, + "loss": 0.6862, + "step": 415 + }, + { + "epoch": 0.3328, + "grad_norm": 0.37156355892036974, + "learning_rate": 0.00015571856046954285, + "loss": 0.7002, + "step": 416 + }, + { + "epoch": 0.3336, + "grad_norm": 0.39892124229473647, + "learning_rate": 0.0001555031308894101, + "loss": 0.6629, + "step": 417 + }, + { + "epoch": 0.3344, + "grad_norm": 0.35627794947594865, + "learning_rate": 0.00015528732839265272, + "loss": 0.6471, + "step": 418 + }, + { + "epoch": 0.3352, + "grad_norm": 0.3729559884121915, + "learning_rate": 0.0001550711544292131, + "loss": 0.6675, + "step": 419 + }, + { + "epoch": 0.336, + "grad_norm": 0.3898626072425355, + "learning_rate": 0.0001548546104515294, + "loss": 0.7193, + "step": 420 + }, + { + "epoch": 0.3368, + "grad_norm": 0.3873226189997151, + "learning_rate": 0.00015463769791452574, + "loss": 0.7032, + "step": 421 + }, + { + "epoch": 0.3376, + "grad_norm": 0.3626940426569876, + "learning_rate": 0.00015442041827560274, + "loss": 0.6807, + "step": 422 + }, + { + "epoch": 0.3384, + "grad_norm": 0.6811790858918634, + "learning_rate": 0.00015420277299462736, + "loss": 0.6627, + "step": 423 + }, + { + "epoch": 0.3392, + "grad_norm": 0.36326657900745496, + "learning_rate": 0.00015398476353392323, + "loss": 0.7167, + "step": 424 + }, + { + "epoch": 0.34, + "grad_norm": 0.3437354521778118, + "learning_rate": 0.00015376639135826107, + "loss": 0.6352, + "step": 425 + }, + { + "epoch": 0.3408, + "grad_norm": 0.3528095739021646, + "learning_rate": 0.00015354765793484834, + "loss": 0.6894, + "step": 426 + }, + { + "epoch": 0.3416, + "grad_norm": 0.33823450009582723, + "learning_rate": 0.00015332856473331978, + "loss": 0.664, + "step": 427 + }, + { + "epoch": 0.3424, + "grad_norm": 0.33825292026949455, + "learning_rate": 0.00015310911322572753, + "loss": 0.6398, + "step": 428 + }, + { + "epoch": 0.3432, + "grad_norm": 0.35227441495536316, + "learning_rate": 0.00015288930488653094, + "loss": 0.6893, + "step": 429 + }, + { + "epoch": 0.344, + "grad_norm": 0.3491554261726527, + "learning_rate": 0.000152669141192587, + "loss": 0.6672, + "step": 430 + }, + { + "epoch": 0.3448, + "grad_norm": 0.3557268832514482, + "learning_rate": 0.0001524486236231402, + "loss": 0.6671, + "step": 431 + }, + { + "epoch": 0.3456, + "grad_norm": 0.3606105281950049, + "learning_rate": 0.00015222775365981273, + "loss": 0.6855, + "step": 432 + }, + { + "epoch": 0.3464, + "grad_norm": 0.3592704005306545, + "learning_rate": 0.00015200653278659432, + "loss": 0.6681, + "step": 433 + }, + { + "epoch": 0.3472, + "grad_norm": 0.3526739493236099, + "learning_rate": 0.00015178496248983254, + "loss": 0.6997, + "step": 434 + }, + { + "epoch": 0.348, + "grad_norm": 0.3392310064970902, + "learning_rate": 0.00015156304425822267, + "loss": 0.6656, + "step": 435 + }, + { + "epoch": 0.3488, + "grad_norm": 0.33948021180080706, + "learning_rate": 0.00015134077958279765, + "loss": 0.681, + "step": 436 + }, + { + "epoch": 0.3496, + "grad_norm": 0.3567394688296086, + "learning_rate": 0.00015111816995691809, + "loss": 0.65, + "step": 437 + }, + { + "epoch": 0.3504, + "grad_norm": 0.3446383475635426, + "learning_rate": 0.00015089521687626243, + "loss": 0.6621, + "step": 438 + }, + { + "epoch": 0.3512, + "grad_norm": 0.3652029111338738, + "learning_rate": 0.00015067192183881658, + "loss": 0.6742, + "step": 439 + }, + { + "epoch": 0.352, + "grad_norm": 0.3453361682114829, + "learning_rate": 0.000150448286344864, + "loss": 0.6584, + "step": 440 + }, + { + "epoch": 0.3528, + "grad_norm": 0.3470509010770301, + "learning_rate": 0.00015022431189697568, + "loss": 0.6401, + "step": 441 + }, + { + "epoch": 0.3536, + "grad_norm": 0.39355894619844295, + "learning_rate": 0.00015000000000000001, + "loss": 0.709, + "step": 442 + }, + { + "epoch": 0.3544, + "grad_norm": 0.3505865283735694, + "learning_rate": 0.0001497753521610526, + "loss": 0.6681, + "step": 443 + }, + { + "epoch": 0.3552, + "grad_norm": 0.36355352325231827, + "learning_rate": 0.00014955036988950618, + "loss": 0.6804, + "step": 444 + }, + { + "epoch": 0.356, + "grad_norm": 0.3706176316524146, + "learning_rate": 0.00014932505469698052, + "loss": 0.6431, + "step": 445 + }, + { + "epoch": 0.3568, + "grad_norm": 0.3572688796259224, + "learning_rate": 0.00014909940809733222, + "loss": 0.6244, + "step": 446 + }, + { + "epoch": 0.3576, + "grad_norm": 0.36607041853928607, + "learning_rate": 0.0001488734316066446, + "loss": 0.725, + "step": 447 + }, + { + "epoch": 0.3584, + "grad_norm": 0.36028601904932567, + "learning_rate": 0.00014864712674321734, + "loss": 0.6983, + "step": 448 + }, + { + "epoch": 0.3592, + "grad_norm": 0.3521866298600251, + "learning_rate": 0.0001484204950275565, + "loss": 0.731, + "step": 449 + }, + { + "epoch": 0.36, + "grad_norm": 0.3525632875009428, + "learning_rate": 0.00014819353798236427, + "loss": 0.6906, + "step": 450 + }, + { + "epoch": 0.3608, + "grad_norm": 0.349145508761633, + "learning_rate": 0.00014796625713252848, + "loss": 0.6661, + "step": 451 + }, + { + "epoch": 0.3616, + "grad_norm": 0.3692327518479248, + "learning_rate": 0.00014773865400511272, + "loss": 0.6706, + "step": 452 + }, + { + "epoch": 0.3624, + "grad_norm": 0.3394289327028457, + "learning_rate": 0.00014751073012934587, + "loss": 0.6582, + "step": 453 + }, + { + "epoch": 0.3632, + "grad_norm": 0.38785992118789714, + "learning_rate": 0.00014728248703661182, + "loss": 0.7117, + "step": 454 + }, + { + "epoch": 0.364, + "grad_norm": 0.35986869847962144, + "learning_rate": 0.0001470539262604393, + "loss": 0.7225, + "step": 455 + }, + { + "epoch": 0.3648, + "grad_norm": 0.3548413026743914, + "learning_rate": 0.00014682504933649144, + "loss": 0.6911, + "step": 456 + }, + { + "epoch": 0.3656, + "grad_norm": 0.3879131571635838, + "learning_rate": 0.00014659585780255556, + "loss": 0.726, + "step": 457 + }, + { + "epoch": 0.3664, + "grad_norm": 0.3836466458319732, + "learning_rate": 0.00014636635319853275, + "loss": 0.668, + "step": 458 + }, + { + "epoch": 0.3672, + "grad_norm": 0.35093440345017185, + "learning_rate": 0.0001461365370664276, + "loss": 0.6224, + "step": 459 + }, + { + "epoch": 0.368, + "grad_norm": 0.37107833192612777, + "learning_rate": 0.00014590641095033787, + "loss": 0.6344, + "step": 460 + }, + { + "epoch": 0.3688, + "grad_norm": 0.35404640899323836, + "learning_rate": 0.00014567597639644387, + "loss": 0.6652, + "step": 461 + }, + { + "epoch": 0.3696, + "grad_norm": 0.3617031028209322, + "learning_rate": 0.00014544523495299842, + "loss": 0.6717, + "step": 462 + }, + { + "epoch": 0.3704, + "grad_norm": 0.35520372468151784, + "learning_rate": 0.00014521418817031628, + "loss": 0.7378, + "step": 463 + }, + { + "epoch": 0.3712, + "grad_norm": 0.3593723764750375, + "learning_rate": 0.0001449828376007636, + "loss": 0.674, + "step": 464 + }, + { + "epoch": 0.372, + "grad_norm": 0.36271685238720835, + "learning_rate": 0.00014475118479874774, + "loss": 0.7051, + "step": 465 + }, + { + "epoch": 0.3728, + "grad_norm": 0.38835589406468907, + "learning_rate": 0.0001445192313207067, + "loss": 0.6903, + "step": 466 + }, + { + "epoch": 0.3736, + "grad_norm": 0.3450666933540303, + "learning_rate": 0.0001442869787250987, + "loss": 0.6484, + "step": 467 + }, + { + "epoch": 0.3744, + "grad_norm": 0.37833508715281694, + "learning_rate": 0.0001440544285723915, + "loss": 0.6871, + "step": 468 + }, + { + "epoch": 0.3752, + "grad_norm": 0.3664373247024666, + "learning_rate": 0.00014382158242505234, + "loss": 0.6563, + "step": 469 + }, + { + "epoch": 0.376, + "grad_norm": 0.3756177451926007, + "learning_rate": 0.00014358844184753712, + "loss": 0.7033, + "step": 470 + }, + { + "epoch": 0.3768, + "grad_norm": 0.3590059343277386, + "learning_rate": 0.00014335500840627986, + "loss": 0.6728, + "step": 471 + }, + { + "epoch": 0.3776, + "grad_norm": 0.3670433190964126, + "learning_rate": 0.00014312128366968243, + "loss": 0.724, + "step": 472 + }, + { + "epoch": 0.3784, + "grad_norm": 0.3787190599554793, + "learning_rate": 0.0001428872692081038, + "loss": 0.6205, + "step": 473 + }, + { + "epoch": 0.3792, + "grad_norm": 0.3454497682512466, + "learning_rate": 0.00014265296659384956, + "loss": 0.6565, + "step": 474 + }, + { + "epoch": 0.38, + "grad_norm": 0.34704461844006534, + "learning_rate": 0.00014241837740116132, + "loss": 0.6776, + "step": 475 + }, + { + "epoch": 0.3808, + "grad_norm": 0.3803751866284716, + "learning_rate": 0.00014218350320620624, + "loss": 0.6574, + "step": 476 + }, + { + "epoch": 0.3816, + "grad_norm": 0.3576266744878013, + "learning_rate": 0.00014194834558706632, + "loss": 0.6592, + "step": 477 + }, + { + "epoch": 0.3824, + "grad_norm": 0.3522343869759249, + "learning_rate": 0.0001417129061237278, + "loss": 0.6707, + "step": 478 + }, + { + "epoch": 0.3832, + "grad_norm": 0.35211080690880675, + "learning_rate": 0.0001414771863980707, + "loss": 0.6496, + "step": 479 + }, + { + "epoch": 0.384, + "grad_norm": 0.3947310834775007, + "learning_rate": 0.00014124118799385796, + "loss": 0.7246, + "step": 480 + }, + { + "epoch": 0.3848, + "grad_norm": 0.40152009995972676, + "learning_rate": 0.00014100491249672498, + "loss": 0.659, + "step": 481 + }, + { + "epoch": 0.3856, + "grad_norm": 0.3749105234224021, + "learning_rate": 0.00014076836149416887, + "loss": 0.7107, + "step": 482 + }, + { + "epoch": 0.3864, + "grad_norm": 0.35963198305871635, + "learning_rate": 0.0001405315365755379, + "loss": 0.6456, + "step": 483 + }, + { + "epoch": 0.3872, + "grad_norm": 0.36004407590129617, + "learning_rate": 0.0001402944393320206, + "loss": 0.6849, + "step": 484 + }, + { + "epoch": 0.388, + "grad_norm": 0.34256206446359866, + "learning_rate": 0.00014005707135663527, + "loss": 0.6883, + "step": 485 + }, + { + "epoch": 0.3888, + "grad_norm": 0.3553528334119219, + "learning_rate": 0.00013981943424421932, + "loss": 0.6635, + "step": 486 + }, + { + "epoch": 0.3896, + "grad_norm": 0.3697495673554272, + "learning_rate": 0.00013958152959141825, + "loss": 0.7297, + "step": 487 + }, + { + "epoch": 0.3904, + "grad_norm": 0.36124751312813597, + "learning_rate": 0.00013934335899667527, + "loss": 0.6949, + "step": 488 + }, + { + "epoch": 0.3912, + "grad_norm": 0.3938585615178389, + "learning_rate": 0.00013910492406022033, + "loss": 0.6753, + "step": 489 + }, + { + "epoch": 0.392, + "grad_norm": 0.34933636975948074, + "learning_rate": 0.00013886622638405952, + "loss": 0.6783, + "step": 490 + }, + { + "epoch": 0.3928, + "grad_norm": 0.3401386033764282, + "learning_rate": 0.0001386272675719642, + "loss": 0.646, + "step": 491 + }, + { + "epoch": 0.3936, + "grad_norm": 0.3276439157499567, + "learning_rate": 0.00013838804922946027, + "loss": 0.5926, + "step": 492 + }, + { + "epoch": 0.3944, + "grad_norm": 0.3648292475835564, + "learning_rate": 0.00013814857296381728, + "loss": 0.6481, + "step": 493 + }, + { + "epoch": 0.3952, + "grad_norm": 0.35712190056947224, + "learning_rate": 0.00013790884038403795, + "loss": 0.6694, + "step": 494 + }, + { + "epoch": 0.396, + "grad_norm": 0.337986123079369, + "learning_rate": 0.00013766885310084688, + "loss": 0.7206, + "step": 495 + }, + { + "epoch": 0.3968, + "grad_norm": 0.3636824004852608, + "learning_rate": 0.00013742861272668012, + "loss": 0.6833, + "step": 496 + }, + { + "epoch": 0.3976, + "grad_norm": 0.34534566926240323, + "learning_rate": 0.00013718812087567414, + "loss": 0.6617, + "step": 497 + }, + { + "epoch": 0.3984, + "grad_norm": 0.3683676689863593, + "learning_rate": 0.00013694737916365517, + "loss": 0.6952, + "step": 498 + }, + { + "epoch": 0.3992, + "grad_norm": 0.3754604071614481, + "learning_rate": 0.000136706389208128, + "loss": 0.6904, + "step": 499 + }, + { + "epoch": 0.4, + "grad_norm": 0.3502870151094157, + "learning_rate": 0.00013646515262826552, + "loss": 0.7077, + "step": 500 + }, + { + "epoch": 0.4008, + "grad_norm": 0.3532161453414018, + "learning_rate": 0.00013622367104489756, + "loss": 0.6867, + "step": 501 + }, + { + "epoch": 0.4016, + "grad_norm": 0.34216842169703193, + "learning_rate": 0.0001359819460805001, + "loss": 0.6958, + "step": 502 + }, + { + "epoch": 0.4024, + "grad_norm": 0.3452175475894138, + "learning_rate": 0.0001357399793591844, + "loss": 0.6905, + "step": 503 + }, + { + "epoch": 0.4032, + "grad_norm": 0.3960344690681105, + "learning_rate": 0.0001354977725066859, + "loss": 0.6733, + "step": 504 + }, + { + "epoch": 0.404, + "grad_norm": 0.3521931246522434, + "learning_rate": 0.00013525532715035366, + "loss": 0.6811, + "step": 505 + }, + { + "epoch": 0.4048, + "grad_norm": 0.4052583162933025, + "learning_rate": 0.00013501264491913906, + "loss": 0.6317, + "step": 506 + }, + { + "epoch": 0.4056, + "grad_norm": 0.3560063804754956, + "learning_rate": 0.00013476972744358507, + "loss": 0.6463, + "step": 507 + }, + { + "epoch": 0.4064, + "grad_norm": 0.3667689282680826, + "learning_rate": 0.0001345265763558152, + "loss": 0.6769, + "step": 508 + }, + { + "epoch": 0.4072, + "grad_norm": 0.38219062338204474, + "learning_rate": 0.00013428319328952253, + "loss": 0.6563, + "step": 509 + }, + { + "epoch": 0.408, + "grad_norm": 0.351086490437197, + "learning_rate": 0.00013403957987995882, + "loss": 0.649, + "step": 510 + }, + { + "epoch": 0.4088, + "grad_norm": 0.3626946330009153, + "learning_rate": 0.0001337957377639235, + "loss": 0.6553, + "step": 511 + }, + { + "epoch": 0.4096, + "grad_norm": 0.3555684128972453, + "learning_rate": 0.0001335516685797525, + "loss": 0.6511, + "step": 512 + }, + { + "epoch": 0.4104, + "grad_norm": 0.35713740915573505, + "learning_rate": 0.0001333073739673076, + "loss": 0.6488, + "step": 513 + }, + { + "epoch": 0.4112, + "grad_norm": 0.36969172915692416, + "learning_rate": 0.00013306285556796495, + "loss": 0.6076, + "step": 514 + }, + { + "epoch": 0.412, + "grad_norm": 0.35557894666896, + "learning_rate": 0.0001328181150246045, + "loss": 0.6782, + "step": 515 + }, + { + "epoch": 0.4128, + "grad_norm": 0.3770997160342275, + "learning_rate": 0.00013257315398159864, + "loss": 0.6463, + "step": 516 + }, + { + "epoch": 0.4136, + "grad_norm": 0.34799230460154973, + "learning_rate": 0.00013232797408480127, + "loss": 0.6537, + "step": 517 + }, + { + "epoch": 0.4144, + "grad_norm": 0.34417676553483495, + "learning_rate": 0.00013208257698153677, + "loss": 0.6101, + "step": 518 + }, + { + "epoch": 0.4152, + "grad_norm": 0.36437771839881833, + "learning_rate": 0.00013183696432058888, + "loss": 0.6535, + "step": 519 + }, + { + "epoch": 0.416, + "grad_norm": 0.35862352295980277, + "learning_rate": 0.00013159113775218964, + "loss": 0.6756, + "step": 520 + }, + { + "epoch": 0.4168, + "grad_norm": 0.3699165408543771, + "learning_rate": 0.00013134509892800822, + "loss": 0.7167, + "step": 521 + }, + { + "epoch": 0.4176, + "grad_norm": 0.34655546549114363, + "learning_rate": 0.00013109884950114007, + "loss": 0.6545, + "step": 522 + }, + { + "epoch": 0.4184, + "grad_norm": 0.37727583792959807, + "learning_rate": 0.00013085239112609547, + "loss": 0.6446, + "step": 523 + }, + { + "epoch": 0.4192, + "grad_norm": 0.3546180541237899, + "learning_rate": 0.00013060572545878875, + "loss": 0.6821, + "step": 524 + }, + { + "epoch": 0.42, + "grad_norm": 0.3471439526432889, + "learning_rate": 0.00013035885415652685, + "loss": 0.7216, + "step": 525 + }, + { + "epoch": 0.4208, + "grad_norm": 0.3421929753393047, + "learning_rate": 0.00013011177887799845, + "loss": 0.634, + "step": 526 + }, + { + "epoch": 0.4216, + "grad_norm": 0.3568553020749221, + "learning_rate": 0.00012986450128326266, + "loss": 0.6224, + "step": 527 + }, + { + "epoch": 0.4224, + "grad_norm": 0.3469058900844622, + "learning_rate": 0.00012961702303373795, + "loss": 0.6822, + "step": 528 + }, + { + "epoch": 0.4232, + "grad_norm": 0.365114132455481, + "learning_rate": 0.00012936934579219094, + "loss": 0.6733, + "step": 529 + }, + { + "epoch": 0.424, + "grad_norm": 0.36117504088003655, + "learning_rate": 0.00012912147122272523, + "loss": 0.6676, + "step": 530 + }, + { + "epoch": 0.4248, + "grad_norm": 0.35183717817106164, + "learning_rate": 0.00012887340099077024, + "loss": 0.6772, + "step": 531 + }, + { + "epoch": 0.4256, + "grad_norm": 0.33967313753393985, + "learning_rate": 0.00012862513676307008, + "loss": 0.7049, + "step": 532 + }, + { + "epoch": 0.4264, + "grad_norm": 0.3756110009159534, + "learning_rate": 0.0001283766802076722, + "loss": 0.6906, + "step": 533 + }, + { + "epoch": 0.4272, + "grad_norm": 0.3567481356987659, + "learning_rate": 0.00012812803299391628, + "loss": 0.6477, + "step": 534 + }, + { + "epoch": 0.428, + "grad_norm": 0.34587374165715784, + "learning_rate": 0.00012787919679242306, + "loss": 0.7119, + "step": 535 + }, + { + "epoch": 0.4288, + "grad_norm": 0.3547315822290638, + "learning_rate": 0.00012763017327508305, + "loss": 0.6496, + "step": 536 + }, + { + "epoch": 0.4296, + "grad_norm": 0.35068723175307986, + "learning_rate": 0.00012738096411504522, + "loss": 0.6594, + "step": 537 + }, + { + "epoch": 0.4304, + "grad_norm": 0.34204285531613576, + "learning_rate": 0.0001271315709867059, + "loss": 0.6592, + "step": 538 + }, + { + "epoch": 0.4312, + "grad_norm": 0.3472665172059884, + "learning_rate": 0.00012688199556569753, + "loss": 0.676, + "step": 539 + }, + { + "epoch": 0.432, + "grad_norm": 0.33023294700263917, + "learning_rate": 0.00012663223952887723, + "loss": 0.5958, + "step": 540 + }, + { + "epoch": 0.4328, + "grad_norm": 0.3549980428582043, + "learning_rate": 0.0001263823045543158, + "loss": 0.6441, + "step": 541 + }, + { + "epoch": 0.4336, + "grad_norm": 0.37089522030188665, + "learning_rate": 0.00012613219232128608, + "loss": 0.7005, + "step": 542 + }, + { + "epoch": 0.4344, + "grad_norm": 0.3835797613220501, + "learning_rate": 0.00012588190451025207, + "loss": 0.6497, + "step": 543 + }, + { + "epoch": 0.4352, + "grad_norm": 0.3560284521440167, + "learning_rate": 0.00012563144280285741, + "loss": 0.6315, + "step": 544 + }, + { + "epoch": 0.436, + "grad_norm": 0.36053243792495826, + "learning_rate": 0.00012538080888191408, + "loss": 0.6169, + "step": 545 + }, + { + "epoch": 0.4368, + "grad_norm": 0.36357053520621047, + "learning_rate": 0.00012513000443139112, + "loss": 0.6751, + "step": 546 + }, + { + "epoch": 0.4376, + "grad_norm": 0.383585475808702, + "learning_rate": 0.00012487903113640337, + "loss": 0.6828, + "step": 547 + }, + { + "epoch": 0.4384, + "grad_norm": 0.3763923557862577, + "learning_rate": 0.00012462789068320017, + "loss": 0.692, + "step": 548 + }, + { + "epoch": 0.4392, + "grad_norm": 0.3583680537989247, + "learning_rate": 0.00012437658475915377, + "loss": 0.6529, + "step": 549 + }, + { + "epoch": 0.44, + "grad_norm": 0.357971945414798, + "learning_rate": 0.00012412511505274844, + "loss": 0.6242, + "step": 550 + }, + { + "epoch": 0.4408, + "grad_norm": 0.3675974232417326, + "learning_rate": 0.00012387348325356874, + "loss": 0.6667, + "step": 551 + }, + { + "epoch": 0.4416, + "grad_norm": 0.3753348705839958, + "learning_rate": 0.00012362169105228826, + "loss": 0.6567, + "step": 552 + }, + { + "epoch": 0.4424, + "grad_norm": 0.35296107724075343, + "learning_rate": 0.00012336974014065844, + "loss": 0.6744, + "step": 553 + }, + { + "epoch": 0.4432, + "grad_norm": 0.35737068044442893, + "learning_rate": 0.000123117632211497, + "loss": 0.7119, + "step": 554 + }, + { + "epoch": 0.444, + "grad_norm": 0.3742031428545619, + "learning_rate": 0.00012286536895867654, + "loss": 0.6701, + "step": 555 + }, + { + "epoch": 0.4448, + "grad_norm": 0.35912721431704026, + "learning_rate": 0.00012261295207711346, + "loss": 0.6314, + "step": 556 + }, + { + "epoch": 0.4456, + "grad_norm": 0.3546955760845967, + "learning_rate": 0.00012236038326275626, + "loss": 0.6564, + "step": 557 + }, + { + "epoch": 0.4464, + "grad_norm": 0.34330325859647054, + "learning_rate": 0.0001221076642125742, + "loss": 0.7046, + "step": 558 + }, + { + "epoch": 0.4472, + "grad_norm": 0.3842046843569431, + "learning_rate": 0.00012185479662454595, + "loss": 0.7085, + "step": 559 + }, + { + "epoch": 0.448, + "grad_norm": 0.3550618456638281, + "learning_rate": 0.00012160178219764837, + "loss": 0.6484, + "step": 560 + }, + { + "epoch": 0.4488, + "grad_norm": 0.3508035800123553, + "learning_rate": 0.00012134862263184467, + "loss": 0.6555, + "step": 561 + }, + { + "epoch": 0.4496, + "grad_norm": 0.3424921420125426, + "learning_rate": 0.00012109531962807332, + "loss": 0.6466, + "step": 562 + }, + { + "epoch": 0.4504, + "grad_norm": 0.32771932502111106, + "learning_rate": 0.00012084187488823657, + "loss": 0.6371, + "step": 563 + }, + { + "epoch": 0.4512, + "grad_norm": 0.36329217258802543, + "learning_rate": 0.00012058829011518896, + "loss": 0.6793, + "step": 564 + }, + { + "epoch": 0.452, + "grad_norm": 0.33510791033413506, + "learning_rate": 0.00012033456701272576, + "loss": 0.6321, + "step": 565 + }, + { + "epoch": 0.4528, + "grad_norm": 0.3511162120492735, + "learning_rate": 0.00012008070728557186, + "loss": 0.686, + "step": 566 + }, + { + "epoch": 0.4536, + "grad_norm": 0.3747552930644442, + "learning_rate": 0.00011982671263936995, + "loss": 0.6156, + "step": 567 + }, + { + "epoch": 0.4544, + "grad_norm": 0.36780445309988385, + "learning_rate": 0.00011957258478066931, + "loss": 0.6229, + "step": 568 + }, + { + "epoch": 0.4552, + "grad_norm": 0.40200107534286134, + "learning_rate": 0.00011931832541691418, + "loss": 0.6955, + "step": 569 + }, + { + "epoch": 0.456, + "grad_norm": 0.37411746459882256, + "learning_rate": 0.00011906393625643244, + "loss": 0.6482, + "step": 570 + }, + { + "epoch": 0.4568, + "grad_norm": 0.34333223943737057, + "learning_rate": 0.00011880941900842397, + "loss": 0.6667, + "step": 571 + }, + { + "epoch": 0.4576, + "grad_norm": 0.3605601968276506, + "learning_rate": 0.00011855477538294935, + "loss": 0.6582, + "step": 572 + }, + { + "epoch": 0.4584, + "grad_norm": 0.3705587684406122, + "learning_rate": 0.00011830000709091815, + "loss": 0.716, + "step": 573 + }, + { + "epoch": 0.4592, + "grad_norm": 0.34145327558000876, + "learning_rate": 0.00011804511584407763, + "loss": 0.6518, + "step": 574 + }, + { + "epoch": 0.46, + "grad_norm": 0.3428159808012652, + "learning_rate": 0.0001177901033550012, + "loss": 0.6287, + "step": 575 + }, + { + "epoch": 0.4608, + "grad_norm": 0.3735476142133528, + "learning_rate": 0.00011753497133707679, + "loss": 0.6166, + "step": 576 + }, + { + "epoch": 0.4616, + "grad_norm": 0.36088630739120264, + "learning_rate": 0.00011727972150449544, + "loss": 0.6409, + "step": 577 + }, + { + "epoch": 0.4624, + "grad_norm": 0.336805561573356, + "learning_rate": 0.00011702435557223987, + "loss": 0.6051, + "step": 578 + }, + { + "epoch": 0.4632, + "grad_norm": 0.3564740780513103, + "learning_rate": 0.00011676887525607271, + "loss": 0.693, + "step": 579 + }, + { + "epoch": 0.464, + "grad_norm": 0.36842908620913045, + "learning_rate": 0.00011651328227252517, + "loss": 0.6431, + "step": 580 + }, + { + "epoch": 0.4648, + "grad_norm": 0.3452755158126447, + "learning_rate": 0.00011625757833888551, + "loss": 0.6389, + "step": 581 + }, + { + "epoch": 0.4656, + "grad_norm": 0.357204753971374, + "learning_rate": 0.00011600176517318741, + "loss": 0.6614, + "step": 582 + }, + { + "epoch": 0.4664, + "grad_norm": 0.3555927630992336, + "learning_rate": 0.0001157458444941984, + "loss": 0.6506, + "step": 583 + }, + { + "epoch": 0.4672, + "grad_norm": 0.35071779715989215, + "learning_rate": 0.00011548981802140848, + "loss": 0.6773, + "step": 584 + }, + { + "epoch": 0.468, + "grad_norm": 0.3468587325939666, + "learning_rate": 0.00011523368747501839, + "loss": 0.6899, + "step": 585 + }, + { + "epoch": 0.4688, + "grad_norm": 0.36870305727725394, + "learning_rate": 0.00011497745457592816, + "loss": 0.6622, + "step": 586 + }, + { + "epoch": 0.4696, + "grad_norm": 0.3563200011548181, + "learning_rate": 0.00011472112104572547, + "loss": 0.7039, + "step": 587 + }, + { + "epoch": 0.4704, + "grad_norm": 0.35243606935919075, + "learning_rate": 0.00011446468860667421, + "loss": 0.601, + "step": 588 + }, + { + "epoch": 0.4712, + "grad_norm": 0.3889294072436819, + "learning_rate": 0.0001142081589817027, + "loss": 0.673, + "step": 589 + }, + { + "epoch": 0.472, + "grad_norm": 0.33968815595714036, + "learning_rate": 0.00011395153389439233, + "loss": 0.6692, + "step": 590 + }, + { + "epoch": 0.4728, + "grad_norm": 0.3442811253045003, + "learning_rate": 0.00011369481506896582, + "loss": 0.6491, + "step": 591 + }, + { + "epoch": 0.4736, + "grad_norm": 0.35931484952336434, + "learning_rate": 0.00011343800423027582, + "loss": 0.6255, + "step": 592 + }, + { + "epoch": 0.4744, + "grad_norm": 0.33267277569085585, + "learning_rate": 0.00011318110310379301, + "loss": 0.6222, + "step": 593 + }, + { + "epoch": 0.4752, + "grad_norm": 0.3795423975740203, + "learning_rate": 0.0001129241134155949, + "loss": 0.6546, + "step": 594 + }, + { + "epoch": 0.476, + "grad_norm": 0.36130273787387834, + "learning_rate": 0.00011266703689235394, + "loss": 0.6542, + "step": 595 + }, + { + "epoch": 0.4768, + "grad_norm": 0.33368631687639777, + "learning_rate": 0.00011240987526132594, + "loss": 0.663, + "step": 596 + }, + { + "epoch": 0.4776, + "grad_norm": 0.356016044191631, + "learning_rate": 0.00011215263025033869, + "loss": 0.6029, + "step": 597 + }, + { + "epoch": 0.4784, + "grad_norm": 0.33437157623080216, + "learning_rate": 0.00011189530358778005, + "loss": 0.6009, + "step": 598 + }, + { + "epoch": 0.4792, + "grad_norm": 0.36929028277298886, + "learning_rate": 0.00011163789700258655, + "loss": 0.6923, + "step": 599 + }, + { + "epoch": 0.48, + "grad_norm": 0.48648035436777626, + "learning_rate": 0.00011138041222423177, + "loss": 0.6236, + "step": 600 + }, + { + "epoch": 0.4808, + "grad_norm": 0.37050028429307086, + "learning_rate": 0.00011112285098271451, + "loss": 0.7212, + "step": 601 + }, + { + "epoch": 0.4816, + "grad_norm": 0.3717500208488645, + "learning_rate": 0.00011086521500854745, + "loss": 0.6674, + "step": 602 + }, + { + "epoch": 0.4824, + "grad_norm": 0.37039844873150024, + "learning_rate": 0.00011060750603274535, + "loss": 0.7207, + "step": 603 + }, + { + "epoch": 0.4832, + "grad_norm": 0.3211782696393604, + "learning_rate": 0.00011034972578681338, + "loss": 0.6373, + "step": 604 + }, + { + "epoch": 0.484, + "grad_norm": 0.3259793164203849, + "learning_rate": 0.00011009187600273566, + "loss": 0.618, + "step": 605 + }, + { + "epoch": 0.4848, + "grad_norm": 0.3360116751173385, + "learning_rate": 0.00010983395841296348, + "loss": 0.6427, + "step": 606 + }, + { + "epoch": 0.4856, + "grad_norm": 0.3334069590638098, + "learning_rate": 0.00010957597475040373, + "loss": 0.6427, + "step": 607 + }, + { + "epoch": 0.4864, + "grad_norm": 0.35653982289760483, + "learning_rate": 0.00010931792674840718, + "loss": 0.6511, + "step": 608 + }, + { + "epoch": 0.4872, + "grad_norm": 0.3356635843682699, + "learning_rate": 0.00010905981614075693, + "loss": 0.6108, + "step": 609 + }, + { + "epoch": 0.488, + "grad_norm": 0.33475695126644883, + "learning_rate": 0.00010880164466165674, + "loss": 0.6468, + "step": 610 + }, + { + "epoch": 0.4888, + "grad_norm": 0.364022566933514, + "learning_rate": 0.00010854341404571928, + "loss": 0.644, + "step": 611 + }, + { + "epoch": 0.4896, + "grad_norm": 0.3411126482433266, + "learning_rate": 0.00010828512602795462, + "loss": 0.6793, + "step": 612 + }, + { + "epoch": 0.4904, + "grad_norm": 0.34598234849443915, + "learning_rate": 0.00010802678234375851, + "loss": 0.679, + "step": 613 + }, + { + "epoch": 0.4912, + "grad_norm": 0.33115184965176603, + "learning_rate": 0.00010776838472890065, + "loss": 0.6081, + "step": 614 + }, + { + "epoch": 0.492, + "grad_norm": 0.34689895068402693, + "learning_rate": 0.0001075099349195131, + "loss": 0.6728, + "step": 615 + }, + { + "epoch": 0.4928, + "grad_norm": 0.33426322053595414, + "learning_rate": 0.00010725143465207867, + "loss": 0.6263, + "step": 616 + }, + { + "epoch": 0.4936, + "grad_norm": 0.3908589045319285, + "learning_rate": 0.00010699288566341914, + "loss": 0.6811, + "step": 617 + }, + { + "epoch": 0.4944, + "grad_norm": 0.37086890844371845, + "learning_rate": 0.00010673428969068364, + "loss": 0.6791, + "step": 618 + }, + { + "epoch": 0.4952, + "grad_norm": 0.352481399418626, + "learning_rate": 0.000106475648471337, + "loss": 0.6035, + "step": 619 + }, + { + "epoch": 0.496, + "grad_norm": 0.3556905909409883, + "learning_rate": 0.00010621696374314807, + "loss": 0.655, + "step": 620 + }, + { + "epoch": 0.4968, + "grad_norm": 0.323278647915081, + "learning_rate": 0.00010595823724417795, + "loss": 0.6104, + "step": 621 + }, + { + "epoch": 0.4976, + "grad_norm": 0.3543930190777003, + "learning_rate": 0.00010569947071276847, + "loss": 0.6571, + "step": 622 + }, + { + "epoch": 0.4984, + "grad_norm": 0.3768691858208267, + "learning_rate": 0.00010544066588753044, + "loss": 0.6995, + "step": 623 + }, + { + "epoch": 0.4992, + "grad_norm": 0.35385766440035066, + "learning_rate": 0.00010518182450733186, + "loss": 0.6742, + "step": 624 + }, + { + "epoch": 0.5, + "grad_norm": 0.3371564161334034, + "learning_rate": 0.00010492294831128641, + "loss": 0.6972, + "step": 625 + }, + { + "epoch": 0.5008, + "grad_norm": 0.33872615023532215, + "learning_rate": 0.00010466403903874176, + "loss": 0.6888, + "step": 626 + }, + { + "epoch": 0.5016, + "grad_norm": 0.3619087379833558, + "learning_rate": 0.00010440509842926767, + "loss": 0.6515, + "step": 627 + }, + { + "epoch": 0.5024, + "grad_norm": 0.3354498087205006, + "learning_rate": 0.00010414612822264455, + "loss": 0.6546, + "step": 628 + }, + { + "epoch": 0.5032, + "grad_norm": 0.36092006804188176, + "learning_rate": 0.00010388713015885161, + "loss": 0.6529, + "step": 629 + }, + { + "epoch": 0.504, + "grad_norm": 0.37397378593728353, + "learning_rate": 0.00010362810597805526, + "loss": 0.6434, + "step": 630 + }, + { + "epoch": 0.5048, + "grad_norm": 0.34307293730008304, + "learning_rate": 0.00010336905742059742, + "loss": 0.6401, + "step": 631 + }, + { + "epoch": 0.5056, + "grad_norm": 0.3418056541944195, + "learning_rate": 0.0001031099862269837, + "loss": 0.633, + "step": 632 + }, + { + "epoch": 0.5064, + "grad_norm": 0.3856471386919663, + "learning_rate": 0.0001028508941378719, + "loss": 0.6639, + "step": 633 + }, + { + "epoch": 0.5072, + "grad_norm": 0.3622342192250039, + "learning_rate": 0.00010259178289406011, + "loss": 0.6282, + "step": 634 + }, + { + "epoch": 0.508, + "grad_norm": 0.33339053534505564, + "learning_rate": 0.00010233265423647523, + "loss": 0.6412, + "step": 635 + }, + { + "epoch": 0.5088, + "grad_norm": 0.3381423744397256, + "learning_rate": 0.00010207350990616107, + "loss": 0.6223, + "step": 636 + }, + { + "epoch": 0.5096, + "grad_norm": 0.34388080644268326, + "learning_rate": 0.00010181435164426676, + "loss": 0.6431, + "step": 637 + }, + { + "epoch": 0.5104, + "grad_norm": 0.3417840426086896, + "learning_rate": 0.0001015551811920351, + "loss": 0.6437, + "step": 638 + }, + { + "epoch": 0.5112, + "grad_norm": 0.35310851098361445, + "learning_rate": 0.00010129600029079072, + "loss": 0.6559, + "step": 639 + }, + { + "epoch": 0.512, + "grad_norm": 0.3434450564374458, + "learning_rate": 0.00010103681068192845, + "loss": 0.6134, + "step": 640 + }, + { + "epoch": 0.5128, + "grad_norm": 0.3718283430838848, + "learning_rate": 0.00010077761410690172, + "loss": 0.6588, + "step": 641 + }, + { + "epoch": 0.5136, + "grad_norm": 0.32338874943023094, + "learning_rate": 0.00010051841230721065, + "loss": 0.6285, + "step": 642 + }, + { + "epoch": 0.5144, + "grad_norm": 0.33231452773322584, + "learning_rate": 0.00010025920702439051, + "loss": 0.5941, + "step": 643 + }, + { + "epoch": 0.5152, + "grad_norm": 0.3513172583604685, + "learning_rate": 0.0001, + "loss": 0.6744, + "step": 644 + }, + { + "epoch": 0.516, + "grad_norm": 0.3460857789690904, + "learning_rate": 9.97407929756095e-05, + "loss": 0.6523, + "step": 645 + }, + { + "epoch": 0.5168, + "grad_norm": 0.3602769991596556, + "learning_rate": 9.948158769278939e-05, + "loss": 0.6092, + "step": 646 + }, + { + "epoch": 0.5176, + "grad_norm": 0.37105715522506105, + "learning_rate": 9.92223858930983e-05, + "loss": 0.7126, + "step": 647 + }, + { + "epoch": 0.5184, + "grad_norm": 0.36634203067739235, + "learning_rate": 9.896318931807155e-05, + "loss": 0.6457, + "step": 648 + }, + { + "epoch": 0.5192, + "grad_norm": 0.3488670432393145, + "learning_rate": 9.870399970920932e-05, + "loss": 0.6647, + "step": 649 + }, + { + "epoch": 0.52, + "grad_norm": 0.3680897825669399, + "learning_rate": 9.844481880796491e-05, + "loss": 0.6412, + "step": 650 + }, + { + "epoch": 0.5208, + "grad_norm": 0.5043876017186315, + "learning_rate": 9.818564835573323e-05, + "loss": 0.6728, + "step": 651 + }, + { + "epoch": 0.5216, + "grad_norm": 0.3610104478993515, + "learning_rate": 9.792649009383899e-05, + "loss": 0.6492, + "step": 652 + }, + { + "epoch": 0.5224, + "grad_norm": 0.34717150933690816, + "learning_rate": 9.766734576352478e-05, + "loss": 0.6456, + "step": 653 + }, + { + "epoch": 0.5232, + "grad_norm": 0.332565817334399, + "learning_rate": 9.740821710593989e-05, + "loss": 0.634, + "step": 654 + }, + { + "epoch": 0.524, + "grad_norm": 0.35876030069682097, + "learning_rate": 9.714910586212816e-05, + "loss": 0.6696, + "step": 655 + }, + { + "epoch": 0.5248, + "grad_norm": 0.3448074874893681, + "learning_rate": 9.689001377301633e-05, + "loss": 0.6233, + "step": 656 + }, + { + "epoch": 0.5256, + "grad_norm": 0.33930716718963655, + "learning_rate": 9.663094257940258e-05, + "loss": 0.6798, + "step": 657 + }, + { + "epoch": 0.5264, + "grad_norm": 0.35969300240814994, + "learning_rate": 9.637189402194476e-05, + "loss": 0.7062, + "step": 658 + }, + { + "epoch": 0.5272, + "grad_norm": 0.3640424890367901, + "learning_rate": 9.611286984114841e-05, + "loss": 0.6479, + "step": 659 + }, + { + "epoch": 0.528, + "grad_norm": 0.3465541176904211, + "learning_rate": 9.585387177735547e-05, + "loss": 0.6768, + "step": 660 + }, + { + "epoch": 0.5288, + "grad_norm": 0.336377209847485, + "learning_rate": 9.559490157073236e-05, + "loss": 0.597, + "step": 661 + }, + { + "epoch": 0.5296, + "grad_norm": 0.35720032678174196, + "learning_rate": 9.533596096125825e-05, + "loss": 0.6569, + "step": 662 + }, + { + "epoch": 0.5304, + "grad_norm": 0.36147187421948807, + "learning_rate": 9.507705168871358e-05, + "loss": 0.6572, + "step": 663 + }, + { + "epoch": 0.5312, + "grad_norm": 0.36456812813376643, + "learning_rate": 9.481817549266817e-05, + "loss": 0.6269, + "step": 664 + }, + { + "epoch": 0.532, + "grad_norm": 0.34238113344853044, + "learning_rate": 9.455933411246958e-05, + "loss": 0.6486, + "step": 665 + }, + { + "epoch": 0.5328, + "grad_norm": 0.3538460544846853, + "learning_rate": 9.430052928723153e-05, + "loss": 0.5996, + "step": 666 + }, + { + "epoch": 0.5336, + "grad_norm": 0.3381284231312074, + "learning_rate": 9.404176275582208e-05, + "loss": 0.6357, + "step": 667 + }, + { + "epoch": 0.5344, + "grad_norm": 0.35986582214349305, + "learning_rate": 9.378303625685195e-05, + "loss": 0.6764, + "step": 668 + }, + { + "epoch": 0.5352, + "grad_norm": 0.323563052308323, + "learning_rate": 9.352435152866298e-05, + "loss": 0.5963, + "step": 669 + }, + { + "epoch": 0.536, + "grad_norm": 0.3956312370915418, + "learning_rate": 9.326571030931637e-05, + "loss": 0.6905, + "step": 670 + }, + { + "epoch": 0.5368, + "grad_norm": 0.3397007639140885, + "learning_rate": 9.300711433658087e-05, + "loss": 0.6332, + "step": 671 + }, + { + "epoch": 0.5376, + "grad_norm": 0.32247734926631433, + "learning_rate": 9.274856534792138e-05, + "loss": 0.6166, + "step": 672 + }, + { + "epoch": 0.5384, + "grad_norm": 0.33475964650166706, + "learning_rate": 9.249006508048694e-05, + "loss": 0.6024, + "step": 673 + }, + { + "epoch": 0.5392, + "grad_norm": 0.35529200114794895, + "learning_rate": 9.223161527109937e-05, + "loss": 0.6809, + "step": 674 + }, + { + "epoch": 0.54, + "grad_norm": 0.40786172554139966, + "learning_rate": 9.197321765624152e-05, + "loss": 0.7048, + "step": 675 + }, + { + "epoch": 0.5408, + "grad_norm": 0.3468015636323706, + "learning_rate": 9.171487397204539e-05, + "loss": 0.6035, + "step": 676 + }, + { + "epoch": 0.5416, + "grad_norm": 0.35249258081975404, + "learning_rate": 9.145658595428074e-05, + "loss": 0.6213, + "step": 677 + }, + { + "epoch": 0.5424, + "grad_norm": 0.4021936975423688, + "learning_rate": 9.119835533834331e-05, + "loss": 0.6314, + "step": 678 + }, + { + "epoch": 0.5432, + "grad_norm": 0.4553860959058501, + "learning_rate": 9.09401838592431e-05, + "loss": 0.6215, + "step": 679 + }, + { + "epoch": 0.544, + "grad_norm": 0.3790640863147156, + "learning_rate": 9.068207325159284e-05, + "loss": 0.6616, + "step": 680 + }, + { + "epoch": 0.5448, + "grad_norm": 0.3708524896435191, + "learning_rate": 9.04240252495963e-05, + "loss": 0.6457, + "step": 681 + }, + { + "epoch": 0.5456, + "grad_norm": 0.3475291898433243, + "learning_rate": 9.016604158703654e-05, + "loss": 0.6591, + "step": 682 + }, + { + "epoch": 0.5464, + "grad_norm": 0.3797371340902428, + "learning_rate": 8.990812399726435e-05, + "loss": 0.7092, + "step": 683 + }, + { + "epoch": 0.5472, + "grad_norm": 0.36574583109343267, + "learning_rate": 8.965027421318665e-05, + "loss": 0.6362, + "step": 684 + }, + { + "epoch": 0.548, + "grad_norm": 0.34289876837011185, + "learning_rate": 8.939249396725467e-05, + "loss": 0.6396, + "step": 685 + }, + { + "epoch": 0.5488, + "grad_norm": 0.338568685704138, + "learning_rate": 8.913478499145254e-05, + "loss": 0.6506, + "step": 686 + }, + { + "epoch": 0.5496, + "grad_norm": 0.3235712908871543, + "learning_rate": 8.887714901728551e-05, + "loss": 0.6439, + "step": 687 + }, + { + "epoch": 0.5504, + "grad_norm": 0.33715700433179935, + "learning_rate": 8.861958777576827e-05, + "loss": 0.6221, + "step": 688 + }, + { + "epoch": 0.5512, + "grad_norm": 0.33400788687652666, + "learning_rate": 8.836210299741346e-05, + "loss": 0.615, + "step": 689 + }, + { + "epoch": 0.552, + "grad_norm": 0.4266829104465876, + "learning_rate": 8.810469641222001e-05, + "loss": 0.6346, + "step": 690 + }, + { + "epoch": 0.5528, + "grad_norm": 0.3430245374866473, + "learning_rate": 8.784736974966135e-05, + "loss": 0.6141, + "step": 691 + }, + { + "epoch": 0.5536, + "grad_norm": 0.36185558574035265, + "learning_rate": 8.759012473867407e-05, + "loss": 0.6729, + "step": 692 + }, + { + "epoch": 0.5544, + "grad_norm": 0.37420051018977957, + "learning_rate": 8.733296310764611e-05, + "loss": 0.635, + "step": 693 + }, + { + "epoch": 0.5552, + "grad_norm": 0.33276192449027164, + "learning_rate": 8.707588658440511e-05, + "loss": 0.6213, + "step": 694 + }, + { + "epoch": 0.556, + "grad_norm": 0.3759493424663521, + "learning_rate": 8.6818896896207e-05, + "loss": 0.6793, + "step": 695 + }, + { + "epoch": 0.5568, + "grad_norm": 0.35066054482886627, + "learning_rate": 8.656199576972423e-05, + "loss": 0.6812, + "step": 696 + }, + { + "epoch": 0.5576, + "grad_norm": 0.8357492304681667, + "learning_rate": 8.63051849310342e-05, + "loss": 0.6549, + "step": 697 + }, + { + "epoch": 0.5584, + "grad_norm": 0.35288247489819874, + "learning_rate": 8.604846610560771e-05, + "loss": 0.6189, + "step": 698 + }, + { + "epoch": 0.5592, + "grad_norm": 0.3328609388255515, + "learning_rate": 8.579184101829734e-05, + "loss": 0.632, + "step": 699 + }, + { + "epoch": 0.56, + "grad_norm": 0.3410948239423367, + "learning_rate": 8.553531139332582e-05, + "loss": 0.6334, + "step": 700 + }, + { + "epoch": 0.5608, + "grad_norm": 0.3753989457758862, + "learning_rate": 8.527887895427454e-05, + "loss": 0.637, + "step": 701 + }, + { + "epoch": 0.5616, + "grad_norm": 0.359743509245254, + "learning_rate": 8.502254542407186e-05, + "loss": 0.7027, + "step": 702 + }, + { + "epoch": 0.5624, + "grad_norm": 0.32967241090714533, + "learning_rate": 8.476631252498162e-05, + "loss": 0.6337, + "step": 703 + }, + { + "epoch": 0.5632, + "grad_norm": 0.3148349462575111, + "learning_rate": 8.451018197859153e-05, + "loss": 0.5859, + "step": 704 + }, + { + "epoch": 0.564, + "grad_norm": 0.34726818294504275, + "learning_rate": 8.425415550580162e-05, + "loss": 0.6949, + "step": 705 + }, + { + "epoch": 0.5648, + "grad_norm": 0.3502795284328535, + "learning_rate": 8.399823482681262e-05, + "loss": 0.6685, + "step": 706 + }, + { + "epoch": 0.5656, + "grad_norm": 0.3573854083478653, + "learning_rate": 8.374242166111448e-05, + "loss": 0.6171, + "step": 707 + }, + { + "epoch": 0.5664, + "grad_norm": 0.3251675669449455, + "learning_rate": 8.348671772747487e-05, + "loss": 0.5611, + "step": 708 + }, + { + "epoch": 0.5672, + "grad_norm": 0.34313457651696666, + "learning_rate": 8.323112474392731e-05, + "loss": 0.5764, + "step": 709 + }, + { + "epoch": 0.568, + "grad_norm": 0.34556748889259625, + "learning_rate": 8.297564442776014e-05, + "loss": 0.6561, + "step": 710 + }, + { + "epoch": 0.5688, + "grad_norm": 0.3275366659802456, + "learning_rate": 8.272027849550457e-05, + "loss": 0.6266, + "step": 711 + }, + { + "epoch": 0.5696, + "grad_norm": 0.3357722741486548, + "learning_rate": 8.246502866292324e-05, + "loss": 0.6924, + "step": 712 + }, + { + "epoch": 0.5704, + "grad_norm": 0.33507644767103867, + "learning_rate": 8.220989664499878e-05, + "loss": 0.6281, + "step": 713 + }, + { + "epoch": 0.5712, + "grad_norm": 0.3696365488761149, + "learning_rate": 8.195488415592238e-05, + "loss": 0.6706, + "step": 714 + }, + { + "epoch": 0.572, + "grad_norm": 0.34590966873605616, + "learning_rate": 8.169999290908188e-05, + "loss": 0.6089, + "step": 715 + }, + { + "epoch": 0.5728, + "grad_norm": 0.3472035646814677, + "learning_rate": 8.144522461705067e-05, + "loss": 0.705, + "step": 716 + }, + { + "epoch": 0.5736, + "grad_norm": 0.350747720690762, + "learning_rate": 8.119058099157604e-05, + "loss": 0.6231, + "step": 717 + }, + { + "epoch": 0.5744, + "grad_norm": 0.33590050967476426, + "learning_rate": 8.093606374356759e-05, + "loss": 0.6588, + "step": 718 + }, + { + "epoch": 0.5752, + "grad_norm": 0.349347905388431, + "learning_rate": 8.068167458308582e-05, + "loss": 0.6549, + "step": 719 + }, + { + "epoch": 0.576, + "grad_norm": 0.33807568727676013, + "learning_rate": 8.042741521933071e-05, + "loss": 0.6686, + "step": 720 + }, + { + "epoch": 0.5768, + "grad_norm": 0.3370565633589211, + "learning_rate": 8.017328736063006e-05, + "loss": 0.5958, + "step": 721 + }, + { + "epoch": 0.5776, + "grad_norm": 0.4735862669632568, + "learning_rate": 7.991929271442817e-05, + "loss": 0.668, + "step": 722 + }, + { + "epoch": 0.5784, + "grad_norm": 0.33837996550694993, + "learning_rate": 7.966543298727425e-05, + "loss": 0.6308, + "step": 723 + }, + { + "epoch": 0.5792, + "grad_norm": 0.3392732986146249, + "learning_rate": 7.941170988481108e-05, + "loss": 0.6761, + "step": 724 + }, + { + "epoch": 0.58, + "grad_norm": 0.3358765639072988, + "learning_rate": 7.915812511176347e-05, + "loss": 0.6178, + "step": 725 + }, + { + "epoch": 0.5808, + "grad_norm": 0.33173102492725864, + "learning_rate": 7.89046803719267e-05, + "loss": 0.6019, + "step": 726 + }, + { + "epoch": 0.5816, + "grad_norm": 0.35658256031807223, + "learning_rate": 7.865137736815535e-05, + "loss": 0.6705, + "step": 727 + }, + { + "epoch": 0.5824, + "grad_norm": 0.3748455544598971, + "learning_rate": 7.839821780235168e-05, + "loss": 0.615, + "step": 728 + }, + { + "epoch": 0.5832, + "grad_norm": 0.32165526070762285, + "learning_rate": 7.814520337545406e-05, + "loss": 0.6348, + "step": 729 + }, + { + "epoch": 0.584, + "grad_norm": 0.3368828998645065, + "learning_rate": 7.789233578742582e-05, + "loss": 0.6455, + "step": 730 + }, + { + "epoch": 0.5848, + "grad_norm": 0.3397827712490149, + "learning_rate": 7.763961673724379e-05, + "loss": 0.6607, + "step": 731 + }, + { + "epoch": 0.5856, + "grad_norm": 0.36276971696566507, + "learning_rate": 7.738704792288655e-05, + "loss": 0.6662, + "step": 732 + }, + { + "epoch": 0.5864, + "grad_norm": 0.3435139781227591, + "learning_rate": 7.713463104132345e-05, + "loss": 0.6271, + "step": 733 + }, + { + "epoch": 0.5872, + "grad_norm": 0.35148211886992414, + "learning_rate": 7.688236778850306e-05, + "loss": 0.6304, + "step": 734 + }, + { + "epoch": 0.588, + "grad_norm": 0.34153053729256644, + "learning_rate": 7.663025985934158e-05, + "loss": 0.6428, + "step": 735 + }, + { + "epoch": 0.5888, + "grad_norm": 0.33771166105394973, + "learning_rate": 7.637830894771175e-05, + "loss": 0.6694, + "step": 736 + }, + { + "epoch": 0.5896, + "grad_norm": 0.3381109664449864, + "learning_rate": 7.61265167464313e-05, + "loss": 0.6126, + "step": 737 + }, + { + "epoch": 0.5904, + "grad_norm": 0.3322619750857264, + "learning_rate": 7.587488494725157e-05, + "loss": 0.63, + "step": 738 + }, + { + "epoch": 0.5912, + "grad_norm": 0.3411227947303166, + "learning_rate": 7.562341524084623e-05, + "loss": 0.6276, + "step": 739 + }, + { + "epoch": 0.592, + "grad_norm": 0.3311599693102653, + "learning_rate": 7.537210931679987e-05, + "loss": 0.6116, + "step": 740 + }, + { + "epoch": 0.5928, + "grad_norm": 0.3618162118125905, + "learning_rate": 7.512096886359664e-05, + "loss": 0.675, + "step": 741 + }, + { + "epoch": 0.5936, + "grad_norm": 0.3448886900635197, + "learning_rate": 7.48699955686089e-05, + "loss": 0.5876, + "step": 742 + }, + { + "epoch": 0.5944, + "grad_norm": 0.3815086820828657, + "learning_rate": 7.461919111808595e-05, + "loss": 0.7153, + "step": 743 + }, + { + "epoch": 0.5952, + "grad_norm": 0.3424972965832173, + "learning_rate": 7.43685571971426e-05, + "loss": 0.6105, + "step": 744 + }, + { + "epoch": 0.596, + "grad_norm": 0.3438098859462676, + "learning_rate": 7.411809548974792e-05, + "loss": 0.6447, + "step": 745 + }, + { + "epoch": 0.5968, + "grad_norm": 0.33902666734712233, + "learning_rate": 7.386780767871397e-05, + "loss": 0.6163, + "step": 746 + }, + { + "epoch": 0.5976, + "grad_norm": 0.3314464176008185, + "learning_rate": 7.361769544568425e-05, + "loss": 0.6111, + "step": 747 + }, + { + "epoch": 0.5984, + "grad_norm": 0.3492886708213938, + "learning_rate": 7.336776047112276e-05, + "loss": 0.6544, + "step": 748 + }, + { + "epoch": 0.5992, + "grad_norm": 0.3431716850924754, + "learning_rate": 7.311800443430251e-05, + "loss": 0.611, + "step": 749 + }, + { + "epoch": 0.6, + "grad_norm": 0.3662298831016498, + "learning_rate": 7.286842901329412e-05, + "loss": 0.6386, + "step": 750 + }, + { + "epoch": 0.6008, + "grad_norm": 0.34431760904769776, + "learning_rate": 7.26190358849548e-05, + "loss": 0.5973, + "step": 751 + }, + { + "epoch": 0.6016, + "grad_norm": 0.3330871036206359, + "learning_rate": 7.236982672491698e-05, + "loss": 0.6317, + "step": 752 + }, + { + "epoch": 0.6024, + "grad_norm": 0.33922286635680166, + "learning_rate": 7.212080320757695e-05, + "loss": 0.6581, + "step": 753 + }, + { + "epoch": 0.6032, + "grad_norm": 0.3714215187715549, + "learning_rate": 7.187196700608373e-05, + "loss": 0.6471, + "step": 754 + }, + { + "epoch": 0.604, + "grad_norm": 0.3315286563306679, + "learning_rate": 7.162331979232783e-05, + "loss": 0.5955, + "step": 755 + }, + { + "epoch": 0.6048, + "grad_norm": 0.35981870840392294, + "learning_rate": 7.137486323692995e-05, + "loss": 0.6558, + "step": 756 + }, + { + "epoch": 0.6056, + "grad_norm": 0.34551284889004336, + "learning_rate": 7.112659900922976e-05, + "loss": 0.6483, + "step": 757 + }, + { + "epoch": 0.6064, + "grad_norm": 0.3423659385547448, + "learning_rate": 7.087852877727481e-05, + "loss": 0.6457, + "step": 758 + }, + { + "epoch": 0.6072, + "grad_norm": 0.3664967859517354, + "learning_rate": 7.06306542078091e-05, + "loss": 0.6665, + "step": 759 + }, + { + "epoch": 0.608, + "grad_norm": 0.3335290747742488, + "learning_rate": 7.038297696626206e-05, + "loss": 0.5645, + "step": 760 + }, + { + "epoch": 0.6088, + "grad_norm": 0.3494881138806042, + "learning_rate": 7.013549871673736e-05, + "loss": 0.6391, + "step": 761 + }, + { + "epoch": 0.6096, + "grad_norm": 0.33656974163543923, + "learning_rate": 6.988822112200156e-05, + "loss": 0.6303, + "step": 762 + }, + { + "epoch": 0.6104, + "grad_norm": 0.34539266418049264, + "learning_rate": 6.964114584347316e-05, + "loss": 0.6368, + "step": 763 + }, + { + "epoch": 0.6112, + "grad_norm": 0.34374647515567053, + "learning_rate": 6.939427454121128e-05, + "loss": 0.623, + "step": 764 + }, + { + "epoch": 0.612, + "grad_norm": 0.36021041560458406, + "learning_rate": 6.914760887390452e-05, + "loss": 0.6643, + "step": 765 + }, + { + "epoch": 0.6128, + "grad_norm": 0.3476822052034766, + "learning_rate": 6.890115049885994e-05, + "loss": 0.6675, + "step": 766 + }, + { + "epoch": 0.6136, + "grad_norm": 0.3396315149281206, + "learning_rate": 6.865490107199181e-05, + "loss": 0.5738, + "step": 767 + }, + { + "epoch": 0.6144, + "grad_norm": 0.3372436009028793, + "learning_rate": 6.84088622478104e-05, + "loss": 0.6379, + "step": 768 + }, + { + "epoch": 0.6152, + "grad_norm": 0.3328355003941503, + "learning_rate": 6.816303567941112e-05, + "loss": 0.6326, + "step": 769 + }, + { + "epoch": 0.616, + "grad_norm": 0.35791987496306515, + "learning_rate": 6.791742301846326e-05, + "loss": 0.654, + "step": 770 + }, + { + "epoch": 0.6168, + "grad_norm": 0.34334518968094335, + "learning_rate": 6.767202591519875e-05, + "loss": 0.5783, + "step": 771 + }, + { + "epoch": 0.6176, + "grad_norm": 0.3368095032667133, + "learning_rate": 6.742684601840141e-05, + "loss": 0.673, + "step": 772 + }, + { + "epoch": 0.6184, + "grad_norm": 0.3537145192474511, + "learning_rate": 6.718188497539554e-05, + "loss": 0.6625, + "step": 773 + }, + { + "epoch": 0.6192, + "grad_norm": 0.3531996916980002, + "learning_rate": 6.693714443203507e-05, + "loss": 0.5923, + "step": 774 + }, + { + "epoch": 0.62, + "grad_norm": 0.3612731220254524, + "learning_rate": 6.669262603269246e-05, + "loss": 0.6194, + "step": 775 + }, + { + "epoch": 0.6208, + "grad_norm": 0.3886557385929825, + "learning_rate": 6.644833142024751e-05, + "loss": 0.6137, + "step": 776 + }, + { + "epoch": 0.6216, + "grad_norm": 0.3544485173162249, + "learning_rate": 6.620426223607654e-05, + "loss": 0.6692, + "step": 777 + }, + { + "epoch": 0.6224, + "grad_norm": 0.5096621614352511, + "learning_rate": 6.59604201200412e-05, + "loss": 0.6197, + "step": 778 + }, + { + "epoch": 0.6232, + "grad_norm": 0.3509486989832222, + "learning_rate": 6.571680671047749e-05, + "loss": 0.6686, + "step": 779 + }, + { + "epoch": 0.624, + "grad_norm": 0.3572483650291914, + "learning_rate": 6.547342364418481e-05, + "loss": 0.6273, + "step": 780 + }, + { + "epoch": 0.6248, + "grad_norm": 0.34366028857416675, + "learning_rate": 6.523027255641493e-05, + "loss": 0.6459, + "step": 781 + }, + { + "epoch": 0.6256, + "grad_norm": 0.3510230058828406, + "learning_rate": 6.498735508086093e-05, + "loss": 0.6559, + "step": 782 + }, + { + "epoch": 0.6264, + "grad_norm": 0.3495066634770411, + "learning_rate": 6.474467284964634e-05, + "loss": 0.6165, + "step": 783 + }, + { + "epoch": 0.6272, + "grad_norm": 0.33332936474310854, + "learning_rate": 6.450222749331414e-05, + "loss": 0.5952, + "step": 784 + }, + { + "epoch": 0.628, + "grad_norm": 0.328832402223803, + "learning_rate": 6.426002064081565e-05, + "loss": 0.661, + "step": 785 + }, + { + "epoch": 0.6288, + "grad_norm": 0.33607601799098924, + "learning_rate": 6.40180539194999e-05, + "loss": 0.6495, + "step": 786 + }, + { + "epoch": 0.6296, + "grad_norm": 0.3423937706972676, + "learning_rate": 6.377632895510248e-05, + "loss": 0.6274, + "step": 787 + }, + { + "epoch": 0.6304, + "grad_norm": 0.38289286460997496, + "learning_rate": 6.35348473717345e-05, + "loss": 0.6738, + "step": 788 + }, + { + "epoch": 0.6312, + "grad_norm": 0.3338479271655618, + "learning_rate": 6.329361079187199e-05, + "loss": 0.6613, + "step": 789 + }, + { + "epoch": 0.632, + "grad_norm": 0.328643411646018, + "learning_rate": 6.305262083634488e-05, + "loss": 0.6375, + "step": 790 + }, + { + "epoch": 0.6328, + "grad_norm": 0.35086440716031086, + "learning_rate": 6.281187912432587e-05, + "loss": 0.6083, + "step": 791 + }, + { + "epoch": 0.6336, + "grad_norm": 0.336518880389029, + "learning_rate": 6.25713872733199e-05, + "loss": 0.6488, + "step": 792 + }, + { + "epoch": 0.6344, + "grad_norm": 0.3426882198313159, + "learning_rate": 6.233114689915316e-05, + "loss": 0.621, + "step": 793 + }, + { + "epoch": 0.6352, + "grad_norm": 0.345716170858981, + "learning_rate": 6.209115961596208e-05, + "loss": 0.6358, + "step": 794 + }, + { + "epoch": 0.636, + "grad_norm": 0.36706291182679185, + "learning_rate": 6.18514270361827e-05, + "loss": 0.6447, + "step": 795 + }, + { + "epoch": 0.6368, + "grad_norm": 0.3387527465960079, + "learning_rate": 6.161195077053976e-05, + "loss": 0.6243, + "step": 796 + }, + { + "epoch": 0.6376, + "grad_norm": 0.342881537155457, + "learning_rate": 6.13727324280358e-05, + "loss": 0.6221, + "step": 797 + }, + { + "epoch": 0.6384, + "grad_norm": 0.338695743727742, + "learning_rate": 6.113377361594049e-05, + "loss": 0.5956, + "step": 798 + }, + { + "epoch": 0.6392, + "grad_norm": 0.3601693441226773, + "learning_rate": 6.08950759397797e-05, + "loss": 0.6193, + "step": 799 + }, + { + "epoch": 0.64, + "grad_norm": 0.33716618376042284, + "learning_rate": 6.065664100332478e-05, + "loss": 0.5791, + "step": 800 + }, + { + "epoch": 0.6408, + "grad_norm": 0.32128132942983445, + "learning_rate": 6.0418470408581774e-05, + "loss": 0.5719, + "step": 801 + }, + { + "epoch": 0.6416, + "grad_norm": 0.3749884944695895, + "learning_rate": 6.018056575578075e-05, + "loss": 0.5817, + "step": 802 + }, + { + "epoch": 0.6424, + "grad_norm": 0.33564233650998176, + "learning_rate": 5.9942928643364724e-05, + "loss": 0.6281, + "step": 803 + }, + { + "epoch": 0.6432, + "grad_norm": 0.3424352082166732, + "learning_rate": 5.970556066797941e-05, + "loss": 0.6543, + "step": 804 + }, + { + "epoch": 0.644, + "grad_norm": 0.3509018285412802, + "learning_rate": 5.946846342446214e-05, + "loss": 0.619, + "step": 805 + }, + { + "epoch": 0.6448, + "grad_norm": 0.35934568187673543, + "learning_rate": 5.923163850583113e-05, + "loss": 0.5966, + "step": 806 + }, + { + "epoch": 0.6456, + "grad_norm": 0.3431163978046936, + "learning_rate": 5.899508750327501e-05, + "loss": 0.6364, + "step": 807 + }, + { + "epoch": 0.6464, + "grad_norm": 0.34625798672579355, + "learning_rate": 5.875881200614207e-05, + "loss": 0.6361, + "step": 808 + }, + { + "epoch": 0.6472, + "grad_norm": 0.32443600753641594, + "learning_rate": 5.8522813601929324e-05, + "loss": 0.6356, + "step": 809 + }, + { + "epoch": 0.648, + "grad_norm": 0.34459251472351493, + "learning_rate": 5.828709387627218e-05, + "loss": 0.6048, + "step": 810 + }, + { + "epoch": 0.6488, + "grad_norm": 0.3345412259745322, + "learning_rate": 5.80516544129337e-05, + "loss": 0.6438, + "step": 811 + }, + { + "epoch": 0.6496, + "grad_norm": 0.3260165546056915, + "learning_rate": 5.781649679379378e-05, + "loss": 0.6431, + "step": 812 + }, + { + "epoch": 0.6504, + "grad_norm": 0.3330174029964389, + "learning_rate": 5.758162259883867e-05, + "loss": 0.5977, + "step": 813 + }, + { + "epoch": 0.6512, + "grad_norm": 0.33501579672416454, + "learning_rate": 5.73470334061505e-05, + "loss": 0.5632, + "step": 814 + }, + { + "epoch": 0.652, + "grad_norm": 0.35228382275397585, + "learning_rate": 5.7112730791896207e-05, + "loss": 0.6751, + "step": 815 + }, + { + "epoch": 0.6528, + "grad_norm": 0.35943790003209114, + "learning_rate": 5.687871633031754e-05, + "loss": 0.6123, + "step": 816 + }, + { + "epoch": 0.6536, + "grad_norm": 0.33798390186355776, + "learning_rate": 5.664499159372017e-05, + "loss": 0.6397, + "step": 817 + }, + { + "epoch": 0.6544, + "grad_norm": 0.33046298698438736, + "learning_rate": 5.6411558152462894e-05, + "loss": 0.6365, + "step": 818 + }, + { + "epoch": 0.6552, + "grad_norm": 0.32574852800773374, + "learning_rate": 5.617841757494762e-05, + "loss": 0.6033, + "step": 819 + }, + { + "epoch": 0.656, + "grad_norm": 0.3216753676493232, + "learning_rate": 5.5945571427608526e-05, + "loss": 0.6385, + "step": 820 + }, + { + "epoch": 0.6568, + "grad_norm": 0.3299670892424923, + "learning_rate": 5.5713021274901335e-05, + "loss": 0.6213, + "step": 821 + }, + { + "epoch": 0.6576, + "grad_norm": 0.3125119039605451, + "learning_rate": 5.54807686792933e-05, + "loss": 0.5912, + "step": 822 + }, + { + "epoch": 0.6584, + "grad_norm": 0.3797141126460133, + "learning_rate": 5.524881520125229e-05, + "loss": 0.6551, + "step": 823 + }, + { + "epoch": 0.6592, + "grad_norm": 0.3370210754898945, + "learning_rate": 5.501716239923642e-05, + "loss": 0.6043, + "step": 824 + }, + { + "epoch": 0.66, + "grad_norm": 0.3478736493741818, + "learning_rate": 5.4785811829683764e-05, + "loss": 0.6374, + "step": 825 + }, + { + "epoch": 0.6608, + "grad_norm": 0.3595153093595184, + "learning_rate": 5.4554765047001613e-05, + "loss": 0.6186, + "step": 826 + }, + { + "epoch": 0.6616, + "grad_norm": 0.36047353803967713, + "learning_rate": 5.432402360355615e-05, + "loss": 0.6221, + "step": 827 + }, + { + "epoch": 0.6624, + "grad_norm": 0.33675550469388493, + "learning_rate": 5.4093589049662175e-05, + "loss": 0.6202, + "step": 828 + }, + { + "epoch": 0.6632, + "grad_norm": 0.34797239104232763, + "learning_rate": 5.386346293357242e-05, + "loss": 0.6529, + "step": 829 + }, + { + "epoch": 0.664, + "grad_norm": 0.33924394710023, + "learning_rate": 5.363364680146725e-05, + "loss": 0.6134, + "step": 830 + }, + { + "epoch": 0.6648, + "grad_norm": 0.3828287262102965, + "learning_rate": 5.3404142197444506e-05, + "loss": 0.6054, + "step": 831 + }, + { + "epoch": 0.6656, + "grad_norm": 0.38173112009274013, + "learning_rate": 5.31749506635086e-05, + "loss": 0.6432, + "step": 832 + }, + { + "epoch": 0.6664, + "grad_norm": 0.3331270379892713, + "learning_rate": 5.2946073739560706e-05, + "loss": 0.6187, + "step": 833 + }, + { + "epoch": 0.6672, + "grad_norm": 0.3386203032989208, + "learning_rate": 5.271751296338823e-05, + "loss": 0.6411, + "step": 834 + }, + { + "epoch": 0.668, + "grad_norm": 0.3240975883763448, + "learning_rate": 5.248926987065417e-05, + "loss": 0.5982, + "step": 835 + }, + { + "epoch": 0.6688, + "grad_norm": 0.3199483145072002, + "learning_rate": 5.226134599488728e-05, + "loss": 0.6465, + "step": 836 + }, + { + "epoch": 0.6696, + "grad_norm": 0.3459437390242868, + "learning_rate": 5.203374286747158e-05, + "loss": 0.6086, + "step": 837 + }, + { + "epoch": 0.6704, + "grad_norm": 0.34244123635720497, + "learning_rate": 5.180646201763577e-05, + "loss": 0.6422, + "step": 838 + }, + { + "epoch": 0.6712, + "grad_norm": 0.34609204729722776, + "learning_rate": 5.15795049724435e-05, + "loss": 0.6048, + "step": 839 + }, + { + "epoch": 0.672, + "grad_norm": 0.36228965918385303, + "learning_rate": 5.135287325678271e-05, + "loss": 0.6071, + "step": 840 + }, + { + "epoch": 0.6728, + "grad_norm": 0.331923451965058, + "learning_rate": 5.112656839335543e-05, + "loss": 0.5907, + "step": 841 + }, + { + "epoch": 0.6736, + "grad_norm": 0.37386608339506866, + "learning_rate": 5.090059190266779e-05, + "loss": 0.647, + "step": 842 + }, + { + "epoch": 0.6744, + "grad_norm": 0.3290981600797574, + "learning_rate": 5.0674945303019526e-05, + "loss": 0.5752, + "step": 843 + }, + { + "epoch": 0.6752, + "grad_norm": 0.356801104340884, + "learning_rate": 5.0449630110493836e-05, + "loss": 0.6042, + "step": 844 + }, + { + "epoch": 0.676, + "grad_norm": 0.32682734025038185, + "learning_rate": 5.022464783894744e-05, + "loss": 0.6136, + "step": 845 + }, + { + "epoch": 0.6768, + "grad_norm": 0.34255695950760445, + "learning_rate": 5.000000000000002e-05, + "loss": 0.6837, + "step": 846 + }, + { + "epoch": 0.6776, + "grad_norm": 0.356159265019211, + "learning_rate": 4.977568810302432e-05, + "loss": 0.5955, + "step": 847 + }, + { + "epoch": 0.6784, + "grad_norm": 0.35671005269805617, + "learning_rate": 4.955171365513603e-05, + "loss": 0.5766, + "step": 848 + }, + { + "epoch": 0.6792, + "grad_norm": 0.35753987551620403, + "learning_rate": 4.9328078161183464e-05, + "loss": 0.6295, + "step": 849 + }, + { + "epoch": 0.68, + "grad_norm": 0.34154223402161743, + "learning_rate": 4.9104783123737566e-05, + "loss": 0.599, + "step": 850 + }, + { + "epoch": 0.6808, + "grad_norm": 0.34068669433515647, + "learning_rate": 4.88818300430819e-05, + "loss": 0.6481, + "step": 851 + }, + { + "epoch": 0.6816, + "grad_norm": 0.3163105027104073, + "learning_rate": 4.865922041720239e-05, + "loss": 0.5894, + "step": 852 + }, + { + "epoch": 0.6824, + "grad_norm": 0.3728940924971478, + "learning_rate": 4.843695574177737e-05, + "loss": 0.6313, + "step": 853 + }, + { + "epoch": 0.6832, + "grad_norm": 0.3323470866292376, + "learning_rate": 4.821503751016746e-05, + "loss": 0.6051, + "step": 854 + }, + { + "epoch": 0.684, + "grad_norm": 0.3422294361915459, + "learning_rate": 4.7993467213405706e-05, + "loss": 0.6409, + "step": 855 + }, + { + "epoch": 0.6848, + "grad_norm": 0.3581679930388663, + "learning_rate": 4.777224634018732e-05, + "loss": 0.6063, + "step": 856 + }, + { + "epoch": 0.6856, + "grad_norm": 0.33177641577322436, + "learning_rate": 4.755137637685979e-05, + "loss": 0.5722, + "step": 857 + }, + { + "epoch": 0.6864, + "grad_norm": 0.3341678200125972, + "learning_rate": 4.733085880741301e-05, + "loss": 0.6048, + "step": 858 + }, + { + "epoch": 0.6872, + "grad_norm": 0.34913269384188267, + "learning_rate": 4.7110695113469085e-05, + "loss": 0.6038, + "step": 859 + }, + { + "epoch": 0.688, + "grad_norm": 0.34210468444628106, + "learning_rate": 4.689088677427249e-05, + "loss": 0.6287, + "step": 860 + }, + { + "epoch": 0.6888, + "grad_norm": 0.31210895546892026, + "learning_rate": 4.6671435266680216e-05, + "loss": 0.5884, + "step": 861 + }, + { + "epoch": 0.6896, + "grad_norm": 0.35219449590912993, + "learning_rate": 4.645234206515171e-05, + "loss": 0.5874, + "step": 862 + }, + { + "epoch": 0.6904, + "grad_norm": 0.3349621825934785, + "learning_rate": 4.623360864173893e-05, + "loss": 0.6063, + "step": 863 + }, + { + "epoch": 0.6912, + "grad_norm": 0.3252780110735412, + "learning_rate": 4.6015236466076747e-05, + "loss": 0.6325, + "step": 864 + }, + { + "epoch": 0.692, + "grad_norm": 0.33810475867529366, + "learning_rate": 4.579722700537268e-05, + "loss": 0.6134, + "step": 865 + }, + { + "epoch": 0.6928, + "grad_norm": 0.35824268924366476, + "learning_rate": 4.5579581724397255e-05, + "loss": 0.6086, + "step": 866 + }, + { + "epoch": 0.6936, + "grad_norm": 0.35718567477435437, + "learning_rate": 4.5362302085474254e-05, + "loss": 0.6394, + "step": 867 + }, + { + "epoch": 0.6944, + "grad_norm": 0.3506010415131454, + "learning_rate": 4.514538954847064e-05, + "loss": 0.624, + "step": 868 + }, + { + "epoch": 0.6952, + "grad_norm": 0.35778490743392094, + "learning_rate": 4.492884557078688e-05, + "loss": 0.6845, + "step": 869 + }, + { + "epoch": 0.696, + "grad_norm": 0.33433788575913326, + "learning_rate": 4.471267160734731e-05, + "loss": 0.5847, + "step": 870 + }, + { + "epoch": 0.6968, + "grad_norm": 0.3466744954167082, + "learning_rate": 4.449686911058992e-05, + "loss": 0.6432, + "step": 871 + }, + { + "epoch": 0.6976, + "grad_norm": 0.3489421862068939, + "learning_rate": 4.428143953045717e-05, + "loss": 0.5872, + "step": 872 + }, + { + "epoch": 0.6984, + "grad_norm": 0.33117493924801256, + "learning_rate": 4.406638431438576e-05, + "loss": 0.6442, + "step": 873 + }, + { + "epoch": 0.6992, + "grad_norm": 0.3348837449633641, + "learning_rate": 4.385170490729712e-05, + "loss": 0.5787, + "step": 874 + }, + { + "epoch": 0.7, + "grad_norm": 0.3339803491928183, + "learning_rate": 4.36374027515878e-05, + "loss": 0.6456, + "step": 875 + }, + { + "epoch": 0.7008, + "grad_norm": 0.326373852545579, + "learning_rate": 4.342347928711953e-05, + "loss": 0.5956, + "step": 876 + }, + { + "epoch": 0.7016, + "grad_norm": 0.386365828267358, + "learning_rate": 4.320993595120969e-05, + "loss": 0.648, + "step": 877 + }, + { + "epoch": 0.7024, + "grad_norm": 0.3399141661816139, + "learning_rate": 4.2996774178621736e-05, + "loss": 0.5771, + "step": 878 + }, + { + "epoch": 0.7032, + "grad_norm": 0.3599625251464077, + "learning_rate": 4.278399540155536e-05, + "loss": 0.5987, + "step": 879 + }, + { + "epoch": 0.704, + "grad_norm": 0.3289470636524019, + "learning_rate": 4.257160104963696e-05, + "loss": 0.5911, + "step": 880 + }, + { + "epoch": 0.7048, + "grad_norm": 0.3229463772774697, + "learning_rate": 4.2359592549910145e-05, + "loss": 0.5997, + "step": 881 + }, + { + "epoch": 0.7056, + "grad_norm": 0.3829602596620701, + "learning_rate": 4.2147971326825966e-05, + "loss": 0.6462, + "step": 882 + }, + { + "epoch": 0.7064, + "grad_norm": 0.33426423483585777, + "learning_rate": 4.193673880223339e-05, + "loss": 0.6504, + "step": 883 + }, + { + "epoch": 0.7072, + "grad_norm": 0.3536087449960244, + "learning_rate": 4.172589639536991e-05, + "loss": 0.6745, + "step": 884 + }, + { + "epoch": 0.708, + "grad_norm": 0.329713415930323, + "learning_rate": 4.1515445522851784e-05, + "loss": 0.6567, + "step": 885 + }, + { + "epoch": 0.7088, + "grad_norm": 0.339402553173935, + "learning_rate": 4.130538759866457e-05, + "loss": 0.6348, + "step": 886 + }, + { + "epoch": 0.7096, + "grad_norm": 0.3537464292423974, + "learning_rate": 4.109572403415386e-05, + "loss": 0.609, + "step": 887 + }, + { + "epoch": 0.7104, + "grad_norm": 0.34196305631334595, + "learning_rate": 4.088645623801534e-05, + "loss": 0.6142, + "step": 888 + }, + { + "epoch": 0.7112, + "grad_norm": 0.3440748200033215, + "learning_rate": 4.0677585616285774e-05, + "loss": 0.6214, + "step": 889 + }, + { + "epoch": 0.712, + "grad_norm": 0.3384041540200196, + "learning_rate": 4.046911357233343e-05, + "loss": 0.6364, + "step": 890 + }, + { + "epoch": 0.7128, + "grad_norm": 0.3532800876899815, + "learning_rate": 4.026104150684835e-05, + "loss": 0.5858, + "step": 891 + }, + { + "epoch": 0.7136, + "grad_norm": 0.33383750428349107, + "learning_rate": 4.00533708178334e-05, + "loss": 0.6363, + "step": 892 + }, + { + "epoch": 0.7144, + "grad_norm": 0.326654564926912, + "learning_rate": 3.984610290059467e-05, + "loss": 0.6071, + "step": 893 + }, + { + "epoch": 0.7152, + "grad_norm": 0.37644944556356774, + "learning_rate": 3.963923914773187e-05, + "loss": 0.691, + "step": 894 + }, + { + "epoch": 0.716, + "grad_norm": 0.3505956435013439, + "learning_rate": 3.943278094912946e-05, + "loss": 0.5953, + "step": 895 + }, + { + "epoch": 0.7168, + "grad_norm": 0.33658269804132707, + "learning_rate": 3.922672969194686e-05, + "loss": 0.5956, + "step": 896 + }, + { + "epoch": 0.7176, + "grad_norm": 0.3366854732590486, + "learning_rate": 3.902108676060937e-05, + "loss": 0.5764, + "step": 897 + }, + { + "epoch": 0.7184, + "grad_norm": 0.3466033210511593, + "learning_rate": 3.8815853536798904e-05, + "loss": 0.6589, + "step": 898 + }, + { + "epoch": 0.7192, + "grad_norm": 0.38021202329505166, + "learning_rate": 3.861103139944449e-05, + "loss": 0.6101, + "step": 899 + }, + { + "epoch": 0.72, + "grad_norm": 0.3347747636682198, + "learning_rate": 3.840662172471315e-05, + "loss": 0.596, + "step": 900 + }, + { + "epoch": 0.7208, + "grad_norm": 0.34764008560357174, + "learning_rate": 3.820262588600074e-05, + "loss": 0.6236, + "step": 901 + }, + { + "epoch": 0.7216, + "grad_norm": 0.34635126520329873, + "learning_rate": 3.79990452539225e-05, + "loss": 0.5961, + "step": 902 + }, + { + "epoch": 0.7224, + "grad_norm": 0.45852499489645104, + "learning_rate": 3.7795881196303995e-05, + "loss": 0.6135, + "step": 903 + }, + { + "epoch": 0.7232, + "grad_norm": 0.47300754187876, + "learning_rate": 3.759313507817196e-05, + "loss": 0.6055, + "step": 904 + }, + { + "epoch": 0.724, + "grad_norm": 0.3336196301153384, + "learning_rate": 3.739080826174498e-05, + "loss": 0.5842, + "step": 905 + }, + { + "epoch": 0.7248, + "grad_norm": 0.3856310871637318, + "learning_rate": 3.7188902106424416e-05, + "loss": 0.6631, + "step": 906 + }, + { + "epoch": 0.7256, + "grad_norm": 0.346311090865471, + "learning_rate": 3.6987417968785366e-05, + "loss": 0.6542, + "step": 907 + }, + { + "epoch": 0.7264, + "grad_norm": 0.3347814958414131, + "learning_rate": 3.678635720256737e-05, + "loss": 0.6127, + "step": 908 + }, + { + "epoch": 0.7272, + "grad_norm": 0.3327824426233396, + "learning_rate": 3.658572115866541e-05, + "loss": 0.6036, + "step": 909 + }, + { + "epoch": 0.728, + "grad_norm": 0.3284600973420504, + "learning_rate": 3.638551118512089e-05, + "loss": 0.6202, + "step": 910 + }, + { + "epoch": 0.7288, + "grad_norm": 0.36923160165027574, + "learning_rate": 3.618572862711247e-05, + "loss": 0.6601, + "step": 911 + }, + { + "epoch": 0.7296, + "grad_norm": 0.3527301872492391, + "learning_rate": 3.5986374826947066e-05, + "loss": 0.6607, + "step": 912 + }, + { + "epoch": 0.7304, + "grad_norm": 0.3425200973681726, + "learning_rate": 3.578745112405083e-05, + "loss": 0.6242, + "step": 913 + }, + { + "epoch": 0.7312, + "grad_norm": 0.3568470629966751, + "learning_rate": 3.558895885496023e-05, + "loss": 0.6436, + "step": 914 + }, + { + "epoch": 0.732, + "grad_norm": 0.34147814071727495, + "learning_rate": 3.539089935331294e-05, + "loss": 0.6151, + "step": 915 + }, + { + "epoch": 0.7328, + "grad_norm": 0.328333809265017, + "learning_rate": 3.519327394983888e-05, + "loss": 0.5981, + "step": 916 + }, + { + "epoch": 0.7336, + "grad_norm": 0.3382395567060543, + "learning_rate": 3.4996083972351515e-05, + "loss": 0.645, + "step": 917 + }, + { + "epoch": 0.7344, + "grad_norm": 0.33331968460229827, + "learning_rate": 3.479933074573858e-05, + "loss": 0.6053, + "step": 918 + }, + { + "epoch": 0.7352, + "grad_norm": 0.34057869855137035, + "learning_rate": 3.4603015591953395e-05, + "loss": 0.5948, + "step": 919 + }, + { + "epoch": 0.736, + "grad_norm": 0.3290546163013816, + "learning_rate": 3.440713983000601e-05, + "loss": 0.5952, + "step": 920 + }, + { + "epoch": 0.7368, + "grad_norm": 0.3359249985513373, + "learning_rate": 3.421170477595419e-05, + "loss": 0.6254, + "step": 921 + }, + { + "epoch": 0.7376, + "grad_norm": 0.3251069976287337, + "learning_rate": 3.401671174289469e-05, + "loss": 0.5989, + "step": 922 + }, + { + "epoch": 0.7384, + "grad_norm": 0.3514172715827471, + "learning_rate": 3.3822162040954354e-05, + "loss": 0.6454, + "step": 923 + }, + { + "epoch": 0.7392, + "grad_norm": 0.3243574493629817, + "learning_rate": 3.362805697728145e-05, + "loss": 0.5848, + "step": 924 + }, + { + "epoch": 0.74, + "grad_norm": 0.32423564691983164, + "learning_rate": 3.34343978560367e-05, + "loss": 0.5984, + "step": 925 + }, + { + "epoch": 0.7408, + "grad_norm": 0.3349320085842291, + "learning_rate": 3.324118597838464e-05, + "loss": 0.5493, + "step": 926 + }, + { + "epoch": 0.7416, + "grad_norm": 0.35507378645526905, + "learning_rate": 3.3048422642484886e-05, + "loss": 0.5782, + "step": 927 + }, + { + "epoch": 0.7424, + "grad_norm": 0.31346565845057295, + "learning_rate": 3.285610914348332e-05, + "loss": 0.5792, + "step": 928 + }, + { + "epoch": 0.7432, + "grad_norm": 0.339190510820282, + "learning_rate": 3.266424677350346e-05, + "loss": 0.6312, + "step": 929 + }, + { + "epoch": 0.744, + "grad_norm": 0.341907055391667, + "learning_rate": 3.2472836821637744e-05, + "loss": 0.6016, + "step": 930 + }, + { + "epoch": 0.7448, + "grad_norm": 0.3255898643152782, + "learning_rate": 3.228188057393895e-05, + "loss": 0.5567, + "step": 931 + }, + { + "epoch": 0.7456, + "grad_norm": 0.34848022717459426, + "learning_rate": 3.209137931341143e-05, + "loss": 0.6055, + "step": 932 + }, + { + "epoch": 0.7464, + "grad_norm": 0.34249415602960825, + "learning_rate": 3.190133432000252e-05, + "loss": 0.626, + "step": 933 + }, + { + "epoch": 0.7472, + "grad_norm": 0.31351750195913475, + "learning_rate": 3.1711746870594086e-05, + "loss": 0.5773, + "step": 934 + }, + { + "epoch": 0.748, + "grad_norm": 0.3261853813612115, + "learning_rate": 3.1522618238993725e-05, + "loss": 0.5937, + "step": 935 + }, + { + "epoch": 0.7488, + "grad_norm": 0.3550362507161625, + "learning_rate": 3.1333949695926324e-05, + "loss": 0.6176, + "step": 936 + }, + { + "epoch": 0.7496, + "grad_norm": 0.3347554401991842, + "learning_rate": 3.114574250902558e-05, + "loss": 0.6118, + "step": 937 + }, + { + "epoch": 0.7504, + "grad_norm": 0.35567840871262907, + "learning_rate": 3.0957997942825336e-05, + "loss": 0.6469, + "step": 938 + }, + { + "epoch": 0.7512, + "grad_norm": 0.3559769207425711, + "learning_rate": 3.077071725875116e-05, + "loss": 0.6279, + "step": 939 + }, + { + "epoch": 0.752, + "grad_norm": 0.3622260889023372, + "learning_rate": 3.058390171511196e-05, + "loss": 0.5957, + "step": 940 + }, + { + "epoch": 0.7528, + "grad_norm": 0.34546558486966733, + "learning_rate": 3.0397552567091337e-05, + "loss": 0.6383, + "step": 941 + }, + { + "epoch": 0.7536, + "grad_norm": 0.3414376189445506, + "learning_rate": 3.021167106673928e-05, + "loss": 0.6204, + "step": 942 + }, + { + "epoch": 0.7544, + "grad_norm": 0.34334137334535764, + "learning_rate": 3.0026258462963787e-05, + "loss": 0.5722, + "step": 943 + }, + { + "epoch": 0.7552, + "grad_norm": 0.3357756940991449, + "learning_rate": 2.9841316001522347e-05, + "loss": 0.6246, + "step": 944 + }, + { + "epoch": 0.756, + "grad_norm": 0.33452702182387545, + "learning_rate": 2.9656844925013637e-05, + "loss": 0.622, + "step": 945 + }, + { + "epoch": 0.7568, + "grad_norm": 0.35678047137193136, + "learning_rate": 2.9472846472869298e-05, + "loss": 0.5947, + "step": 946 + }, + { + "epoch": 0.7576, + "grad_norm": 0.34522705825177974, + "learning_rate": 2.9289321881345254e-05, + "loss": 0.6169, + "step": 947 + }, + { + "epoch": 0.7584, + "grad_norm": 0.328482844180868, + "learning_rate": 2.9106272383513835e-05, + "loss": 0.6065, + "step": 948 + }, + { + "epoch": 0.7592, + "grad_norm": 0.352773092897859, + "learning_rate": 2.8923699209255284e-05, + "loss": 0.5957, + "step": 949 + }, + { + "epoch": 0.76, + "grad_norm": 0.3430681903679473, + "learning_rate": 2.874160358524931e-05, + "loss": 0.6013, + "step": 950 + }, + { + "epoch": 0.7608, + "grad_norm": 0.3509634075522109, + "learning_rate": 2.8559986734967282e-05, + "loss": 0.6274, + "step": 951 + }, + { + "epoch": 0.7616, + "grad_norm": 0.3643192826914098, + "learning_rate": 2.8378849878663628e-05, + "loss": 0.666, + "step": 952 + }, + { + "epoch": 0.7624, + "grad_norm": 0.34819866786226594, + "learning_rate": 2.819819423336775e-05, + "loss": 0.6293, + "step": 953 + }, + { + "epoch": 0.7632, + "grad_norm": 0.33836509096864215, + "learning_rate": 2.8018021012875994e-05, + "loss": 0.6602, + "step": 954 + }, + { + "epoch": 0.764, + "grad_norm": 0.3377560200675623, + "learning_rate": 2.7838331427743282e-05, + "loss": 0.6469, + "step": 955 + }, + { + "epoch": 0.7648, + "grad_norm": 0.34701822295393986, + "learning_rate": 2.7659126685275027e-05, + "loss": 0.6237, + "step": 956 + }, + { + "epoch": 0.7656, + "grad_norm": 0.3496507449517045, + "learning_rate": 2.7480407989519198e-05, + "loss": 0.5974, + "step": 957 + }, + { + "epoch": 0.7664, + "grad_norm": 0.3518870482920689, + "learning_rate": 2.7302176541257986e-05, + "loss": 0.5863, + "step": 958 + }, + { + "epoch": 0.7672, + "grad_norm": 0.3386060344373831, + "learning_rate": 2.712443353799984e-05, + "loss": 0.6339, + "step": 959 + }, + { + "epoch": 0.768, + "grad_norm": 0.33056597036296287, + "learning_rate": 2.6947180173971508e-05, + "loss": 0.5947, + "step": 960 + }, + { + "epoch": 0.7688, + "grad_norm": 0.35315143510157854, + "learning_rate": 2.677041764010988e-05, + "loss": 0.669, + "step": 961 + }, + { + "epoch": 0.7696, + "grad_norm": 0.36678405493630256, + "learning_rate": 2.659414712405398e-05, + "loss": 0.6602, + "step": 962 + }, + { + "epoch": 0.7704, + "grad_norm": 0.35496756095058585, + "learning_rate": 2.6418369810137188e-05, + "loss": 0.6109, + "step": 963 + }, + { + "epoch": 0.7712, + "grad_norm": 0.32484178473000924, + "learning_rate": 2.6243086879379e-05, + "loss": 0.5899, + "step": 964 + }, + { + "epoch": 0.772, + "grad_norm": 0.33623176711375374, + "learning_rate": 2.6068299509477266e-05, + "loss": 0.5498, + "step": 965 + }, + { + "epoch": 0.7728, + "grad_norm": 0.33047697672464993, + "learning_rate": 2.5894008874800325e-05, + "loss": 0.5651, + "step": 966 + }, + { + "epoch": 0.7736, + "grad_norm": 0.3379805406898622, + "learning_rate": 2.5720216146378917e-05, + "loss": 0.6536, + "step": 967 + }, + { + "epoch": 0.7744, + "grad_norm": 0.34348726637901394, + "learning_rate": 2.5546922491898495e-05, + "loss": 0.5979, + "step": 968 + }, + { + "epoch": 0.7752, + "grad_norm": 0.3369936237622732, + "learning_rate": 2.5374129075691265e-05, + "loss": 0.6209, + "step": 969 + }, + { + "epoch": 0.776, + "grad_norm": 0.3620126243026674, + "learning_rate": 2.5201837058728505e-05, + "loss": 0.5937, + "step": 970 + }, + { + "epoch": 0.7768, + "grad_norm": 0.3478287872629664, + "learning_rate": 2.503004759861258e-05, + "loss": 0.6635, + "step": 971 + }, + { + "epoch": 0.7776, + "grad_norm": 0.3610022987451473, + "learning_rate": 2.485876184956928e-05, + "loss": 0.6256, + "step": 972 + }, + { + "epoch": 0.7784, + "grad_norm": 0.3510684858259938, + "learning_rate": 2.4687980962440072e-05, + "loss": 0.6034, + "step": 973 + }, + { + "epoch": 0.7792, + "grad_norm": 0.33229601293270916, + "learning_rate": 2.451770608467432e-05, + "loss": 0.5716, + "step": 974 + }, + { + "epoch": 0.78, + "grad_norm": 0.3407544768084185, + "learning_rate": 2.4347938360321566e-05, + "loss": 0.5545, + "step": 975 + }, + { + "epoch": 0.7808, + "grad_norm": 0.36019711156371026, + "learning_rate": 2.417867893002387e-05, + "loss": 0.6634, + "step": 976 + }, + { + "epoch": 0.7816, + "grad_norm": 0.3496451511939708, + "learning_rate": 2.400992893100822e-05, + "loss": 0.6065, + "step": 977 + }, + { + "epoch": 0.7824, + "grad_norm": 0.3651858232496374, + "learning_rate": 2.3841689497078746e-05, + "loss": 0.6387, + "step": 978 + }, + { + "epoch": 0.7832, + "grad_norm": 0.35783748090888917, + "learning_rate": 2.3673961758609152e-05, + "loss": 0.5921, + "step": 979 + }, + { + "epoch": 0.784, + "grad_norm": 0.32933507445357985, + "learning_rate": 2.3506746842535242e-05, + "loss": 0.6366, + "step": 980 + }, + { + "epoch": 0.7848, + "grad_norm": 0.34552065441434493, + "learning_rate": 2.334004587234717e-05, + "loss": 0.5765, + "step": 981 + }, + { + "epoch": 0.7856, + "grad_norm": 0.3414784912136309, + "learning_rate": 2.3173859968081944e-05, + "loss": 0.5805, + "step": 982 + }, + { + "epoch": 0.7864, + "grad_norm": 0.32676818861295753, + "learning_rate": 2.300819024631603e-05, + "loss": 0.6157, + "step": 983 + }, + { + "epoch": 0.7872, + "grad_norm": 0.311466870667826, + "learning_rate": 2.2843037820157675e-05, + "loss": 0.5735, + "step": 984 + }, + { + "epoch": 0.788, + "grad_norm": 0.3288806224466473, + "learning_rate": 2.26784037992395e-05, + "loss": 0.5358, + "step": 985 + }, + { + "epoch": 0.7888, + "grad_norm": 0.33842663500282605, + "learning_rate": 2.251428928971102e-05, + "loss": 0.5862, + "step": 986 + }, + { + "epoch": 0.7896, + "grad_norm": 0.33076301508735606, + "learning_rate": 2.2350695394231345e-05, + "loss": 0.6268, + "step": 987 + }, + { + "epoch": 0.7904, + "grad_norm": 0.34191118796049813, + "learning_rate": 2.2187623211961562e-05, + "loss": 0.6238, + "step": 988 + }, + { + "epoch": 0.7912, + "grad_norm": 0.30470855878489794, + "learning_rate": 2.2025073838557454e-05, + "loss": 0.5607, + "step": 989 + }, + { + "epoch": 0.792, + "grad_norm": 0.3394770076542117, + "learning_rate": 2.1863048366162208e-05, + "loss": 0.6047, + "step": 990 + }, + { + "epoch": 0.7928, + "grad_norm": 0.34752911910840706, + "learning_rate": 2.1701547883398922e-05, + "loss": 0.5989, + "step": 991 + }, + { + "epoch": 0.7936, + "grad_norm": 0.3171861041373292, + "learning_rate": 2.1540573475363402e-05, + "loss": 0.5986, + "step": 992 + }, + { + "epoch": 0.7944, + "grad_norm": 0.32370046708611994, + "learning_rate": 2.138012622361689e-05, + "loss": 0.6322, + "step": 993 + }, + { + "epoch": 0.7952, + "grad_norm": 0.3403886458850514, + "learning_rate": 2.1220207206178688e-05, + "loss": 0.5712, + "step": 994 + }, + { + "epoch": 0.796, + "grad_norm": 0.33532744097004147, + "learning_rate": 2.106081749751897e-05, + "loss": 0.6644, + "step": 995 + }, + { + "epoch": 0.7968, + "grad_norm": 0.33314721264059993, + "learning_rate": 2.0901958168551638e-05, + "loss": 0.6385, + "step": 996 + }, + { + "epoch": 0.7976, + "grad_norm": 0.33888798280900234, + "learning_rate": 2.0743630286627002e-05, + "loss": 0.6358, + "step": 997 + }, + { + "epoch": 0.7984, + "grad_norm": 0.32184629951311494, + "learning_rate": 2.058583491552465e-05, + "loss": 0.5888, + "step": 998 + }, + { + "epoch": 0.7992, + "grad_norm": 0.402043844910486, + "learning_rate": 2.0428573115446392e-05, + "loss": 0.5472, + "step": 999 + }, + { + "epoch": 0.8, + "grad_norm": 0.3268309670747305, + "learning_rate": 2.027184594300898e-05, + "loss": 0.6136, + "step": 1000 + }, + { + "epoch": 0.8008, + "grad_norm": 0.3147358042653876, + "learning_rate": 2.011565445123711e-05, + "loss": 0.6145, + "step": 1001 + }, + { + "epoch": 0.8016, + "grad_norm": 0.33905985365123564, + "learning_rate": 1.995999968955641e-05, + "loss": 0.5749, + "step": 1002 + }, + { + "epoch": 0.8024, + "grad_norm": 0.32554821385971133, + "learning_rate": 1.980488270378612e-05, + "loss": 0.5442, + "step": 1003 + }, + { + "epoch": 0.8032, + "grad_norm": 0.3705835264290631, + "learning_rate": 1.9650304536132426e-05, + "loss": 0.6363, + "step": 1004 + }, + { + "epoch": 0.804, + "grad_norm": 0.34687464446586974, + "learning_rate": 1.9496266225181248e-05, + "loss": 0.6419, + "step": 1005 + }, + { + "epoch": 0.8048, + "grad_norm": 0.37149323515615806, + "learning_rate": 1.9342768805891178e-05, + "loss": 0.6455, + "step": 1006 + }, + { + "epoch": 0.8056, + "grad_norm": 0.37767561793664783, + "learning_rate": 1.918981330958678e-05, + "loss": 0.6109, + "step": 1007 + }, + { + "epoch": 0.8064, + "grad_norm": 0.3325337496275416, + "learning_rate": 1.903740076395151e-05, + "loss": 0.6137, + "step": 1008 + }, + { + "epoch": 0.8072, + "grad_norm": 0.32366227573157247, + "learning_rate": 1.8885532193020704e-05, + "loss": 0.6142, + "step": 1009 + }, + { + "epoch": 0.808, + "grad_norm": 0.3281643585543578, + "learning_rate": 1.8734208617174988e-05, + "loss": 0.6345, + "step": 1010 + }, + { + "epoch": 0.8088, + "grad_norm": 0.35630193957247364, + "learning_rate": 1.8583431053133127e-05, + "loss": 0.6526, + "step": 1011 + }, + { + "epoch": 0.8096, + "grad_norm": 0.3680226392043987, + "learning_rate": 1.8433200513945337e-05, + "loss": 0.6138, + "step": 1012 + }, + { + "epoch": 0.8104, + "grad_norm": 0.33454525351931946, + "learning_rate": 1.8283518008986567e-05, + "loss": 0.5325, + "step": 1013 + }, + { + "epoch": 0.8112, + "grad_norm": 0.3541194241867334, + "learning_rate": 1.8134384543949478e-05, + "loss": 0.6184, + "step": 1014 + }, + { + "epoch": 0.812, + "grad_norm": 0.33032402475086314, + "learning_rate": 1.7985801120837865e-05, + "loss": 0.5966, + "step": 1015 + }, + { + "epoch": 0.8128, + "grad_norm": 0.33889136649801577, + "learning_rate": 1.783776873795994e-05, + "loss": 0.5409, + "step": 1016 + }, + { + "epoch": 0.8136, + "grad_norm": 0.32944729310677895, + "learning_rate": 1.7690288389921493e-05, + "loss": 0.6122, + "step": 1017 + }, + { + "epoch": 0.8144, + "grad_norm": 0.343369601719398, + "learning_rate": 1.754336106761927e-05, + "loss": 0.6667, + "step": 1018 + }, + { + "epoch": 0.8152, + "grad_norm": 0.33471215942591, + "learning_rate": 1.739698775823442e-05, + "loss": 0.5985, + "step": 1019 + }, + { + "epoch": 0.816, + "grad_norm": 0.34766327231907157, + "learning_rate": 1.7251169445225657e-05, + "loss": 0.5905, + "step": 1020 + }, + { + "epoch": 0.8168, + "grad_norm": 0.34794447852113786, + "learning_rate": 1.7105907108322816e-05, + "loss": 0.6679, + "step": 1021 + }, + { + "epoch": 0.8176, + "grad_norm": 0.35352688204869276, + "learning_rate": 1.696120172352025e-05, + "loss": 0.5774, + "step": 1022 + }, + { + "epoch": 0.8184, + "grad_norm": 0.3503894409025353, + "learning_rate": 1.6817054263070174e-05, + "loss": 0.6049, + "step": 1023 + }, + { + "epoch": 0.8192, + "grad_norm": 0.3419316201221051, + "learning_rate": 1.6673465695476232e-05, + "loss": 0.5757, + "step": 1024 + }, + { + "epoch": 0.82, + "grad_norm": 0.3474883270272006, + "learning_rate": 1.6530436985486996e-05, + "loss": 0.6044, + "step": 1025 + }, + { + "epoch": 0.8208, + "grad_norm": 0.3400637312711317, + "learning_rate": 1.6387969094089316e-05, + "loss": 0.6258, + "step": 1026 + }, + { + "epoch": 0.8216, + "grad_norm": 0.3590234930612263, + "learning_rate": 1.6246062978502164e-05, + "loss": 0.6039, + "step": 1027 + }, + { + "epoch": 0.8224, + "grad_norm": 0.3347921945667021, + "learning_rate": 1.6104719592169902e-05, + "loss": 0.5836, + "step": 1028 + }, + { + "epoch": 0.8232, + "grad_norm": 0.35418037934495206, + "learning_rate": 1.5963939884756042e-05, + "loss": 0.6275, + "step": 1029 + }, + { + "epoch": 0.824, + "grad_norm": 0.35907806810817905, + "learning_rate": 1.5823724802136865e-05, + "loss": 0.6032, + "step": 1030 + }, + { + "epoch": 0.8248, + "grad_norm": 0.3416727500520991, + "learning_rate": 1.5684075286394985e-05, + "loss": 0.5893, + "step": 1031 + }, + { + "epoch": 0.8256, + "grad_norm": 0.34937663601271246, + "learning_rate": 1.5544992275813053e-05, + "loss": 0.6262, + "step": 1032 + }, + { + "epoch": 0.8264, + "grad_norm": 0.3468347223251416, + "learning_rate": 1.5406476704867524e-05, + "loss": 0.596, + "step": 1033 + }, + { + "epoch": 0.8272, + "grad_norm": 0.4663420462566947, + "learning_rate": 1.526852950422226e-05, + "loss": 0.5598, + "step": 1034 + }, + { + "epoch": 0.828, + "grad_norm": 0.3478299882572493, + "learning_rate": 1.5131151600722337e-05, + "loss": 0.6037, + "step": 1035 + }, + { + "epoch": 0.8288, + "grad_norm": 0.3491371976190531, + "learning_rate": 1.4994343917387854e-05, + "loss": 0.578, + "step": 1036 + }, + { + "epoch": 0.8296, + "grad_norm": 0.3224168497203568, + "learning_rate": 1.485810737340767e-05, + "loss": 0.5707, + "step": 1037 + }, + { + "epoch": 0.8304, + "grad_norm": 0.3497842183284705, + "learning_rate": 1.4722442884133214e-05, + "loss": 0.628, + "step": 1038 + }, + { + "epoch": 0.8312, + "grad_norm": 0.3637221018117844, + "learning_rate": 1.4587351361072454e-05, + "loss": 0.6261, + "step": 1039 + }, + { + "epoch": 0.832, + "grad_norm": 0.3497090138868019, + "learning_rate": 1.4452833711883628e-05, + "loss": 0.5972, + "step": 1040 + }, + { + "epoch": 0.8328, + "grad_norm": 0.3424617829976763, + "learning_rate": 1.4318890840369182e-05, + "loss": 0.6237, + "step": 1041 + }, + { + "epoch": 0.8336, + "grad_norm": 0.3472679049611877, + "learning_rate": 1.4185523646469822e-05, + "loss": 0.5898, + "step": 1042 + }, + { + "epoch": 0.8344, + "grad_norm": 0.38382084399980854, + "learning_rate": 1.4052733026258281e-05, + "loss": 0.6509, + "step": 1043 + }, + { + "epoch": 0.8352, + "grad_norm": 0.35818320452547475, + "learning_rate": 1.3920519871933424e-05, + "loss": 0.6183, + "step": 1044 + }, + { + "epoch": 0.836, + "grad_norm": 0.3280855359615322, + "learning_rate": 1.3788885071814172e-05, + "loss": 0.5884, + "step": 1045 + }, + { + "epoch": 0.8368, + "grad_norm": 0.34167181642304734, + "learning_rate": 1.3657829510333654e-05, + "loss": 0.6063, + "step": 1046 + }, + { + "epoch": 0.8376, + "grad_norm": 0.3581565635204731, + "learning_rate": 1.3527354068033139e-05, + "loss": 0.5884, + "step": 1047 + }, + { + "epoch": 0.8384, + "grad_norm": 0.40457865558246575, + "learning_rate": 1.339745962155613e-05, + "loss": 0.573, + "step": 1048 + }, + { + "epoch": 0.8392, + "grad_norm": 0.3396694616042361, + "learning_rate": 1.326814704364262e-05, + "loss": 0.6134, + "step": 1049 + }, + { + "epoch": 0.84, + "grad_norm": 0.3288631131124086, + "learning_rate": 1.3139417203123027e-05, + "loss": 0.565, + "step": 1050 + }, + { + "epoch": 0.8408, + "grad_norm": 0.3522850767680027, + "learning_rate": 1.3011270964912459e-05, + "loss": 0.6208, + "step": 1051 + }, + { + "epoch": 0.8416, + "grad_norm": 0.3496630890644889, + "learning_rate": 1.2883709190004955e-05, + "loss": 0.6478, + "step": 1052 + }, + { + "epoch": 0.8424, + "grad_norm": 0.34521354334546384, + "learning_rate": 1.275673273546758e-05, + "loss": 0.5936, + "step": 1053 + }, + { + "epoch": 0.8432, + "grad_norm": 0.32916214591040255, + "learning_rate": 1.263034245443473e-05, + "loss": 0.5804, + "step": 1054 + }, + { + "epoch": 0.844, + "grad_norm": 0.32492396630451986, + "learning_rate": 1.2504539196102439e-05, + "loss": 0.5927, + "step": 1055 + }, + { + "epoch": 0.8448, + "grad_norm": 0.3275153847995364, + "learning_rate": 1.2379323805722576e-05, + "loss": 0.6251, + "step": 1056 + }, + { + "epoch": 0.8456, + "grad_norm": 0.3528759656094755, + "learning_rate": 1.2254697124597237e-05, + "loss": 0.5801, + "step": 1057 + }, + { + "epoch": 0.8464, + "grad_norm": 0.367972580363633, + "learning_rate": 1.2130659990073146e-05, + "loss": 0.6147, + "step": 1058 + }, + { + "epoch": 0.8472, + "grad_norm": 0.3587343173142828, + "learning_rate": 1.2007213235535786e-05, + "loss": 0.5958, + "step": 1059 + }, + { + "epoch": 0.848, + "grad_norm": 0.3614376480967368, + "learning_rate": 1.1884357690404158e-05, + "loss": 0.5897, + "step": 1060 + }, + { + "epoch": 0.8488, + "grad_norm": 0.3558733082244146, + "learning_rate": 1.176209418012495e-05, + "loss": 0.6067, + "step": 1061 + }, + { + "epoch": 0.8496, + "grad_norm": 0.3576499918732122, + "learning_rate": 1.1640423526166988e-05, + "loss": 0.617, + "step": 1062 + }, + { + "epoch": 0.8504, + "grad_norm": 0.32348481797534356, + "learning_rate": 1.1519346546015907e-05, + "loss": 0.5924, + "step": 1063 + }, + { + "epoch": 0.8512, + "grad_norm": 0.35976954140226275, + "learning_rate": 1.1398864053168534e-05, + "loss": 0.6137, + "step": 1064 + }, + { + "epoch": 0.852, + "grad_norm": 0.3367133292322927, + "learning_rate": 1.1278976857127311e-05, + "loss": 0.585, + "step": 1065 + }, + { + "epoch": 0.8528, + "grad_norm": 0.3666196371390034, + "learning_rate": 1.1159685763395111e-05, + "loss": 0.6458, + "step": 1066 + }, + { + "epoch": 0.8536, + "grad_norm": 0.35296555685607484, + "learning_rate": 1.1040991573469629e-05, + "loss": 0.6428, + "step": 1067 + }, + { + "epoch": 0.8544, + "grad_norm": 0.36291384377744296, + "learning_rate": 1.0922895084838037e-05, + "loss": 0.6174, + "step": 1068 + }, + { + "epoch": 0.8552, + "grad_norm": 0.32944881339937504, + "learning_rate": 1.0805397090971737e-05, + "loss": 0.5806, + "step": 1069 + }, + { + "epoch": 0.856, + "grad_norm": 0.3252347324995862, + "learning_rate": 1.0688498381320855e-05, + "loss": 0.5723, + "step": 1070 + }, + { + "epoch": 0.8568, + "grad_norm": 0.3759298251455837, + "learning_rate": 1.057219974130903e-05, + "loss": 0.614, + "step": 1071 + }, + { + "epoch": 0.8576, + "grad_norm": 0.34350352854627375, + "learning_rate": 1.045650195232819e-05, + "loss": 0.6355, + "step": 1072 + }, + { + "epoch": 0.8584, + "grad_norm": 0.3255554235034715, + "learning_rate": 1.0341405791733183e-05, + "loss": 0.5923, + "step": 1073 + }, + { + "epoch": 0.8592, + "grad_norm": 0.33935822693399353, + "learning_rate": 1.0226912032836611e-05, + "loss": 0.5757, + "step": 1074 + }, + { + "epoch": 0.86, + "grad_norm": 0.35956652809566675, + "learning_rate": 1.0113021444903726e-05, + "loss": 0.5827, + "step": 1075 + }, + { + "epoch": 0.8608, + "grad_norm": 0.3464718277540499, + "learning_rate": 9.999734793146998e-06, + "loss": 0.6438, + "step": 1076 + }, + { + "epoch": 0.8616, + "grad_norm": 0.33711044965199344, + "learning_rate": 9.887052838721322e-06, + "loss": 0.5891, + "step": 1077 + }, + { + "epoch": 0.8624, + "grad_norm": 0.3307753227099956, + "learning_rate": 9.774976338718677e-06, + "loss": 0.606, + "step": 1078 + }, + { + "epoch": 0.8632, + "grad_norm": 0.32072821421317327, + "learning_rate": 9.663506046162985e-06, + "loss": 0.5908, + "step": 1079 + }, + { + "epoch": 0.864, + "grad_norm": 0.3679805432721958, + "learning_rate": 9.552642710005299e-06, + "loss": 0.6317, + "step": 1080 + }, + { + "epoch": 0.8648, + "grad_norm": 0.34951059101065973, + "learning_rate": 9.44238707511862e-06, + "loss": 0.5946, + "step": 1081 + }, + { + "epoch": 0.8656, + "grad_norm": 0.35348265534072415, + "learning_rate": 9.332739882292752e-06, + "loss": 0.6028, + "step": 1082 + }, + { + "epoch": 0.8664, + "grad_norm": 0.34262473025738155, + "learning_rate": 9.22370186822965e-06, + "loss": 0.6453, + "step": 1083 + }, + { + "epoch": 0.8672, + "grad_norm": 0.339645286968352, + "learning_rate": 9.115273765538202e-06, + "loss": 0.6071, + "step": 1084 + }, + { + "epoch": 0.868, + "grad_norm": 0.33276544388234147, + "learning_rate": 9.0074563027294e-06, + "loss": 0.6231, + "step": 1085 + }, + { + "epoch": 0.8688, + "grad_norm": 0.3381217622159822, + "learning_rate": 8.900250204211514e-06, + "loss": 0.569, + "step": 1086 + }, + { + "epoch": 0.8696, + "grad_norm": 0.32745384801860544, + "learning_rate": 8.79365619028507e-06, + "loss": 0.626, + "step": 1087 + }, + { + "epoch": 0.8704, + "grad_norm": 0.35174659838743455, + "learning_rate": 8.687674977138116e-06, + "loss": 0.6391, + "step": 1088 + }, + { + "epoch": 0.8712, + "grad_norm": 0.3518656769447974, + "learning_rate": 8.582307276841462e-06, + "loss": 0.6335, + "step": 1089 + }, + { + "epoch": 0.872, + "grad_norm": 0.3705221607331985, + "learning_rate": 8.47755379734373e-06, + "loss": 0.546, + "step": 1090 + }, + { + "epoch": 0.8728, + "grad_norm": 0.33276061750399677, + "learning_rate": 8.37341524246672e-06, + "loss": 0.5994, + "step": 1091 + }, + { + "epoch": 0.8736, + "grad_norm": 0.35280552080454464, + "learning_rate": 8.269892311900696e-06, + "loss": 0.6496, + "step": 1092 + }, + { + "epoch": 0.8744, + "grad_norm": 0.3204166724597419, + "learning_rate": 8.166985701199582e-06, + "loss": 0.5826, + "step": 1093 + }, + { + "epoch": 0.8752, + "grad_norm": 0.32988206814279836, + "learning_rate": 8.064696101776358e-06, + "loss": 0.6048, + "step": 1094 + }, + { + "epoch": 0.876, + "grad_norm": 0.3246960203505506, + "learning_rate": 7.963024200898462e-06, + "loss": 0.5601, + "step": 1095 + }, + { + "epoch": 0.8768, + "grad_norm": 0.339714136580042, + "learning_rate": 7.861970681683051e-06, + "loss": 0.6614, + "step": 1096 + }, + { + "epoch": 0.8776, + "grad_norm": 0.3552916599335909, + "learning_rate": 7.761536223092458e-06, + "loss": 0.6236, + "step": 1097 + }, + { + "epoch": 0.8784, + "grad_norm": 0.3404478280633831, + "learning_rate": 7.661721499929753e-06, + "loss": 0.6362, + "step": 1098 + }, + { + "epoch": 0.8792, + "grad_norm": 0.32239417587058217, + "learning_rate": 7.562527182833978e-06, + "loss": 0.6004, + "step": 1099 + }, + { + "epoch": 0.88, + "grad_norm": 0.32041107412168346, + "learning_rate": 7.463953938275858e-06, + "loss": 0.5549, + "step": 1100 + }, + { + "epoch": 0.8808, + "grad_norm": 0.3346809999458499, + "learning_rate": 7.366002428553153e-06, + "loss": 0.5619, + "step": 1101 + }, + { + "epoch": 0.8816, + "grad_norm": 0.364466231133146, + "learning_rate": 7.2686733117863784e-06, + "loss": 0.6141, + "step": 1102 + }, + { + "epoch": 0.8824, + "grad_norm": 0.3317189002719038, + "learning_rate": 7.171967241914224e-06, + "loss": 0.6238, + "step": 1103 + }, + { + "epoch": 0.8832, + "grad_norm": 0.3256609411268322, + "learning_rate": 7.07588486868922e-06, + "loss": 0.5651, + "step": 1104 + }, + { + "epoch": 0.884, + "grad_norm": 0.31734296267635187, + "learning_rate": 6.980426837673437e-06, + "loss": 0.6016, + "step": 1105 + }, + { + "epoch": 0.8848, + "grad_norm": 0.3406610875494774, + "learning_rate": 6.8855937902340576e-06, + "loss": 0.5982, + "step": 1106 + }, + { + "epoch": 0.8856, + "grad_norm": 0.36246008387194356, + "learning_rate": 6.791386363539065e-06, + "loss": 0.6351, + "step": 1107 + }, + { + "epoch": 0.8864, + "grad_norm": 0.33984162849695077, + "learning_rate": 6.6978051905530855e-06, + "loss": 0.6314, + "step": 1108 + }, + { + "epoch": 0.8872, + "grad_norm": 0.3275584466815877, + "learning_rate": 6.604850900032955e-06, + "loss": 0.5832, + "step": 1109 + }, + { + "epoch": 0.888, + "grad_norm": 0.3395780237628323, + "learning_rate": 6.512524116523633e-06, + "loss": 0.6099, + "step": 1110 + }, + { + "epoch": 0.8888, + "grad_norm": 0.3443027185143187, + "learning_rate": 6.420825460353974e-06, + "loss": 0.6134, + "step": 1111 + }, + { + "epoch": 0.8896, + "grad_norm": 0.3221773032461762, + "learning_rate": 6.329755547632499e-06, + "loss": 0.5805, + "step": 1112 + }, + { + "epoch": 0.8904, + "grad_norm": 0.35206646392275887, + "learning_rate": 6.239314990243339e-06, + "loss": 0.6299, + "step": 1113 + }, + { + "epoch": 0.8912, + "grad_norm": 0.32833005369432117, + "learning_rate": 6.149504395842087e-06, + "loss": 0.5983, + "step": 1114 + }, + { + "epoch": 0.892, + "grad_norm": 0.332205742625648, + "learning_rate": 6.0603243678516995e-06, + "loss": 0.6267, + "step": 1115 + }, + { + "epoch": 0.8928, + "grad_norm": 0.34898125982747574, + "learning_rate": 5.971775505458444e-06, + "loss": 0.5582, + "step": 1116 + }, + { + "epoch": 0.8936, + "grad_norm": 0.31796847705272, + "learning_rate": 5.883858403607967e-06, + "loss": 0.5206, + "step": 1117 + }, + { + "epoch": 0.8944, + "grad_norm": 0.3641126830546926, + "learning_rate": 5.7965736530010916e-06, + "loss": 0.5835, + "step": 1118 + }, + { + "epoch": 0.8952, + "grad_norm": 0.3270521049353336, + "learning_rate": 5.7099218400900716e-06, + "loss": 0.569, + "step": 1119 + }, + { + "epoch": 0.896, + "grad_norm": 0.35222406275247237, + "learning_rate": 5.623903547074549e-06, + "loss": 0.6243, + "step": 1120 + }, + { + "epoch": 0.8968, + "grad_norm": 0.3289485047089139, + "learning_rate": 5.538519351897575e-06, + "loss": 0.5896, + "step": 1121 + }, + { + "epoch": 0.8976, + "grad_norm": 0.33274687560591176, + "learning_rate": 5.453769828241872e-06, + "loss": 0.6124, + "step": 1122 + }, + { + "epoch": 0.8984, + "grad_norm": 0.3493330572049464, + "learning_rate": 5.369655545525909e-06, + "loss": 0.6317, + "step": 1123 + }, + { + "epoch": 0.8992, + "grad_norm": 0.3362064958501068, + "learning_rate": 5.286177068899989e-06, + "loss": 0.5753, + "step": 1124 + }, + { + "epoch": 0.9, + "grad_norm": 0.3576621743241454, + "learning_rate": 5.2033349592426335e-06, + "loss": 0.5804, + "step": 1125 + }, + { + "epoch": 0.9008, + "grad_norm": 0.33012366472913696, + "learning_rate": 5.121129773156663e-06, + "loss": 0.5788, + "step": 1126 + }, + { + "epoch": 0.9016, + "grad_norm": 0.347653986542375, + "learning_rate": 5.039562062965508e-06, + "loss": 0.5854, + "step": 1127 + }, + { + "epoch": 0.9024, + "grad_norm": 0.3323912180374366, + "learning_rate": 4.95863237670956e-06, + "loss": 0.5885, + "step": 1128 + }, + { + "epoch": 0.9032, + "grad_norm": 0.38083319979832697, + "learning_rate": 4.87834125814235e-06, + "loss": 0.6529, + "step": 1129 + }, + { + "epoch": 0.904, + "grad_norm": 0.3426288486745291, + "learning_rate": 4.798689246727006e-06, + "loss": 0.652, + "step": 1130 + }, + { + "epoch": 0.9048, + "grad_norm": 0.3258227992254228, + "learning_rate": 4.719676877632639e-06, + "loss": 0.5823, + "step": 1131 + }, + { + "epoch": 0.9056, + "grad_norm": 0.34552782393746645, + "learning_rate": 4.641304681730641e-06, + "loss": 0.6429, + "step": 1132 + }, + { + "epoch": 0.9064, + "grad_norm": 0.34585912369353516, + "learning_rate": 4.563573185591219e-06, + "loss": 0.5873, + "step": 1133 + }, + { + "epoch": 0.9072, + "grad_norm": 0.32788379221951186, + "learning_rate": 4.486482911479839e-06, + "loss": 0.626, + "step": 1134 + }, + { + "epoch": 0.908, + "grad_norm": 0.3343646080672475, + "learning_rate": 4.4100343773536225e-06, + "loss": 0.6098, + "step": 1135 + }, + { + "epoch": 0.9088, + "grad_norm": 0.355768636496455, + "learning_rate": 4.3342280968580285e-06, + "loss": 0.6236, + "step": 1136 + }, + { + "epoch": 0.9096, + "grad_norm": 0.3730126642330909, + "learning_rate": 4.259064579323302e-06, + "loss": 0.5936, + "step": 1137 + }, + { + "epoch": 0.9104, + "grad_norm": 0.31953139488473575, + "learning_rate": 4.184544329761009e-06, + "loss": 0.5555, + "step": 1138 + }, + { + "epoch": 0.9112, + "grad_norm": 0.3523701477101136, + "learning_rate": 4.1106678488607495e-06, + "loss": 0.6754, + "step": 1139 + }, + { + "epoch": 0.912, + "grad_norm": 0.36742759253540924, + "learning_rate": 4.037435632986786e-06, + "loss": 0.6482, + "step": 1140 + }, + { + "epoch": 0.9128, + "grad_norm": 0.33968792210204607, + "learning_rate": 3.964848174174541e-06, + "loss": 0.6029, + "step": 1141 + }, + { + "epoch": 0.9136, + "grad_norm": 0.36371297347883375, + "learning_rate": 3.892905960127546e-06, + "loss": 0.6594, + "step": 1142 + }, + { + "epoch": 0.9144, + "grad_norm": 0.3660127629756041, + "learning_rate": 3.821609474213983e-06, + "loss": 0.5838, + "step": 1143 + }, + { + "epoch": 0.9152, + "grad_norm": 0.35511370698314615, + "learning_rate": 3.750959195463466e-06, + "loss": 0.6452, + "step": 1144 + }, + { + "epoch": 0.916, + "grad_norm": 0.3363613444987827, + "learning_rate": 3.6809555985639068e-06, + "loss": 0.5655, + "step": 1145 + }, + { + "epoch": 0.9168, + "grad_norm": 0.36322908286669636, + "learning_rate": 3.611599153858214e-06, + "loss": 0.6346, + "step": 1146 + }, + { + "epoch": 0.9176, + "grad_norm": 0.33539501918083525, + "learning_rate": 3.5428903273411863e-06, + "loss": 0.5852, + "step": 1147 + }, + { + "epoch": 0.9184, + "grad_norm": 0.37291280962010165, + "learning_rate": 3.4748295806564356e-06, + "loss": 0.6272, + "step": 1148 + }, + { + "epoch": 0.9192, + "grad_norm": 0.3361674696613345, + "learning_rate": 3.40741737109318e-06, + "loss": 0.5811, + "step": 1149 + }, + { + "epoch": 0.92, + "grad_norm": 0.34984812754410244, + "learning_rate": 3.3406541515832003e-06, + "loss": 0.615, + "step": 1150 + }, + { + "epoch": 0.9208, + "grad_norm": 0.46656006049123666, + "learning_rate": 3.2745403706978872e-06, + "loss": 0.6002, + "step": 1151 + }, + { + "epoch": 0.9216, + "grad_norm": 0.3370570438284523, + "learning_rate": 3.209076472645112e-06, + "loss": 0.5785, + "step": 1152 + }, + { + "epoch": 0.9224, + "grad_norm": 0.35279039394029527, + "learning_rate": 3.1442628972662704e-06, + "loss": 0.6398, + "step": 1153 + }, + { + "epoch": 0.9232, + "grad_norm": 0.33664616276466686, + "learning_rate": 3.0801000800333877e-06, + "loss": 0.6285, + "step": 1154 + }, + { + "epoch": 0.924, + "grad_norm": 0.3624270205714784, + "learning_rate": 3.0165884520461316e-06, + "loss": 0.5839, + "step": 1155 + }, + { + "epoch": 0.9248, + "grad_norm": 0.3457343293859614, + "learning_rate": 2.9537284400289355e-06, + "loss": 0.6187, + "step": 1156 + }, + { + "epoch": 0.9256, + "grad_norm": 0.3290788278898686, + "learning_rate": 2.8915204663281013e-06, + "loss": 0.5759, + "step": 1157 + }, + { + "epoch": 0.9264, + "grad_norm": 0.35531106545655416, + "learning_rate": 2.8299649489090475e-06, + "loss": 0.6329, + "step": 1158 + }, + { + "epoch": 0.9272, + "grad_norm": 0.3492304272841695, + "learning_rate": 2.7690623013533976e-06, + "loss": 0.6074, + "step": 1159 + }, + { + "epoch": 0.928, + "grad_norm": 0.33751463361052897, + "learning_rate": 2.708812932856253e-06, + "loss": 0.5959, + "step": 1160 + }, + { + "epoch": 0.9288, + "grad_norm": 0.34414667982486763, + "learning_rate": 2.649217248223468e-06, + "loss": 0.6766, + "step": 1161 + }, + { + "epoch": 0.9296, + "grad_norm": 0.34208711480428583, + "learning_rate": 2.590275647868867e-06, + "loss": 0.5846, + "step": 1162 + }, + { + "epoch": 0.9304, + "grad_norm": 0.3274222219532599, + "learning_rate": 2.5319885278115906e-06, + "loss": 0.564, + "step": 1163 + }, + { + "epoch": 0.9312, + "grad_norm": 0.32352946950501243, + "learning_rate": 2.4743562796734622e-06, + "loss": 0.5646, + "step": 1164 + }, + { + "epoch": 0.932, + "grad_norm": 0.32646724699080115, + "learning_rate": 2.4173792906762804e-06, + "loss": 0.6262, + "step": 1165 + }, + { + "epoch": 0.9328, + "grad_norm": 0.34292711834350403, + "learning_rate": 2.3610579436393e-06, + "loss": 0.6225, + "step": 1166 + }, + { + "epoch": 0.9336, + "grad_norm": 0.3396645859144178, + "learning_rate": 2.3053926169765984e-06, + "loss": 0.6387, + "step": 1167 + }, + { + "epoch": 0.9344, + "grad_norm": 0.35753988403286646, + "learning_rate": 2.250383684694579e-06, + "loss": 0.678, + "step": 1168 + }, + { + "epoch": 0.9352, + "grad_norm": 0.38303375416849994, + "learning_rate": 2.1960315163894075e-06, + "loss": 0.6386, + "step": 1169 + }, + { + "epoch": 0.936, + "grad_norm": 0.32437564943451874, + "learning_rate": 2.1423364772445887e-06, + "loss": 0.6297, + "step": 1170 + }, + { + "epoch": 0.9368, + "grad_norm": 0.33455678611417283, + "learning_rate": 2.0892989280284823e-06, + "loss": 0.5864, + "step": 1171 + }, + { + "epoch": 0.9376, + "grad_norm": 0.35633372512862366, + "learning_rate": 2.036919225091827e-06, + "loss": 0.5792, + "step": 1172 + }, + { + "epoch": 0.9384, + "grad_norm": 0.3270029314567895, + "learning_rate": 1.9851977203654835e-06, + "loss": 0.5935, + "step": 1173 + }, + { + "epoch": 0.9392, + "grad_norm": 0.424905218617759, + "learning_rate": 1.9341347613579087e-06, + "loss": 0.6181, + "step": 1174 + }, + { + "epoch": 0.94, + "grad_norm": 0.3438103926353635, + "learning_rate": 1.8837306911529184e-06, + "loss": 0.6379, + "step": 1175 + }, + { + "epoch": 0.9408, + "grad_norm": 0.3484985442300672, + "learning_rate": 1.8339858484073935e-06, + "loss": 0.5823, + "step": 1176 + }, + { + "epoch": 0.9416, + "grad_norm": 0.37070144803293215, + "learning_rate": 1.7849005673489127e-06, + "loss": 0.7229, + "step": 1177 + }, + { + "epoch": 0.9424, + "grad_norm": 0.3435688728484316, + "learning_rate": 1.7364751777736332e-06, + "loss": 0.59, + "step": 1178 + }, + { + "epoch": 0.9432, + "grad_norm": 0.3392515140501043, + "learning_rate": 1.6887100050439587e-06, + "loss": 0.6108, + "step": 1179 + }, + { + "epoch": 0.944, + "grad_norm": 0.3380362142098612, + "learning_rate": 1.6416053700863964e-06, + "loss": 0.6424, + "step": 1180 + }, + { + "epoch": 0.9448, + "grad_norm": 0.3390725949829156, + "learning_rate": 1.595161589389449e-06, + "loss": 0.6334, + "step": 1181 + }, + { + "epoch": 0.9456, + "grad_norm": 0.34696472062215294, + "learning_rate": 1.5493789750014031e-06, + "loss": 0.5874, + "step": 1182 + }, + { + "epoch": 0.9464, + "grad_norm": 0.3319619683863788, + "learning_rate": 1.5042578345283108e-06, + "loss": 0.569, + "step": 1183 + }, + { + "epoch": 0.9472, + "grad_norm": 0.34664240273570046, + "learning_rate": 1.459798471131868e-06, + "loss": 0.5728, + "step": 1184 + }, + { + "epoch": 0.948, + "grad_norm": 0.32850816834155266, + "learning_rate": 1.4160011835273934e-06, + "loss": 0.5932, + "step": 1185 + }, + { + "epoch": 0.9488, + "grad_norm": 0.3637393330704097, + "learning_rate": 1.3728662659818204e-06, + "loss": 0.6328, + "step": 1186 + }, + { + "epoch": 0.9496, + "grad_norm": 0.32476356201295953, + "learning_rate": 1.3303940083117527e-06, + "loss": 0.6157, + "step": 1187 + }, + { + "epoch": 0.9504, + "grad_norm": 0.36321274221856314, + "learning_rate": 1.2885846958814673e-06, + "loss": 0.6552, + "step": 1188 + }, + { + "epoch": 0.9512, + "grad_norm": 0.3454751537662377, + "learning_rate": 1.2474386096010039e-06, + "loss": 0.6139, + "step": 1189 + }, + { + "epoch": 0.952, + "grad_norm": 0.32388775087044114, + "learning_rate": 1.2069560259243328e-06, + "loss": 0.562, + "step": 1190 + }, + { + "epoch": 0.9528, + "grad_norm": 0.3297851417597346, + "learning_rate": 1.1671372168474138e-06, + "loss": 0.5993, + "step": 1191 + }, + { + "epoch": 0.9536, + "grad_norm": 0.32841215556161885, + "learning_rate": 1.1279824499064396e-06, + "loss": 0.5903, + "step": 1192 + }, + { + "epoch": 0.9544, + "grad_norm": 0.36272395447568967, + "learning_rate": 1.089491988176017e-06, + "loss": 0.6059, + "step": 1193 + }, + { + "epoch": 0.9552, + "grad_norm": 0.8762168378974585, + "learning_rate": 1.0516660902673448e-06, + "loss": 0.6509, + "step": 1194 + }, + { + "epoch": 0.956, + "grad_norm": 0.3184011726641648, + "learning_rate": 1.014505010326583e-06, + "loss": 0.594, + "step": 1195 + }, + { + "epoch": 0.9568, + "grad_norm": 0.3473583567384825, + "learning_rate": 9.780089980330642e-07, + "loss": 0.6342, + "step": 1196 + }, + { + "epoch": 0.9576, + "grad_norm": 0.3584881362293531, + "learning_rate": 9.421782985976068e-07, + "loss": 0.6197, + "step": 1197 + }, + { + "epoch": 0.9584, + "grad_norm": 0.32737124624942704, + "learning_rate": 9.070131527609604e-07, + "loss": 0.6299, + "step": 1198 + }, + { + "epoch": 0.9592, + "grad_norm": 0.37869537163973865, + "learning_rate": 8.725137967920738e-07, + "loss": 0.6598, + "step": 1199 + }, + { + "epoch": 0.96, + "grad_norm": 0.3321138787804279, + "learning_rate": 8.386804624865851e-07, + "loss": 0.5605, + "step": 1200 + }, + { + "epoch": 0.9608, + "grad_norm": 0.3408301098945604, + "learning_rate": 8.055133771652345e-07, + "loss": 0.6054, + "step": 1201 + }, + { + "epoch": 0.9616, + "grad_norm": 0.3615435875111645, + "learning_rate": 7.730127636723539e-07, + "loss": 0.6177, + "step": 1202 + }, + { + "epoch": 0.9624, + "grad_norm": 0.34767742571856813, + "learning_rate": 7.411788403743237e-07, + "loss": 0.5915, + "step": 1203 + }, + { + "epoch": 0.9632, + "grad_norm": 0.327953339120735, + "learning_rate": 7.100118211581852e-07, + "loss": 0.5491, + "step": 1204 + }, + { + "epoch": 0.964, + "grad_norm": 0.3300646854341275, + "learning_rate": 6.7951191543012e-07, + "loss": 0.6129, + "step": 1205 + }, + { + "epoch": 0.9648, + "grad_norm": 0.339220283137721, + "learning_rate": 6.496793281141056e-07, + "loss": 0.6061, + "step": 1206 + }, + { + "epoch": 0.9656, + "grad_norm": 0.32327635395524157, + "learning_rate": 6.205142596505176e-07, + "loss": 0.6273, + "step": 1207 + }, + { + "epoch": 0.9664, + "grad_norm": 0.3464923503762519, + "learning_rate": 5.920169059947411e-07, + "loss": 0.5959, + "step": 1208 + }, + { + "epoch": 0.9672, + "grad_norm": 0.33387843346355767, + "learning_rate": 5.64187458615939e-07, + "loss": 0.6092, + "step": 1209 + }, + { + "epoch": 0.968, + "grad_norm": 0.3366781877057486, + "learning_rate": 5.370261044956971e-07, + "loss": 0.6642, + "step": 1210 + }, + { + "epoch": 0.9688, + "grad_norm": 0.36947007399686926, + "learning_rate": 5.105330261267916e-07, + "loss": 0.6075, + "step": 1211 + }, + { + "epoch": 0.9696, + "grad_norm": 0.3534595267305509, + "learning_rate": 4.847084015119574e-07, + "loss": 0.5763, + "step": 1212 + }, + { + "epoch": 0.9704, + "grad_norm": 0.32910943744054405, + "learning_rate": 4.5955240416271084e-07, + "loss": 0.5804, + "step": 1213 + }, + { + "epoch": 0.9712, + "grad_norm": 0.3113027212367898, + "learning_rate": 4.3506520309813947e-07, + "loss": 0.5952, + "step": 1214 + }, + { + "epoch": 0.972, + "grad_norm": 0.31503140780736955, + "learning_rate": 4.112469628438365e-07, + "loss": 0.565, + "step": 1215 + }, + { + "epoch": 0.9728, + "grad_norm": 0.3199722417834563, + "learning_rate": 3.8809784343072366e-07, + "loss": 0.5863, + "step": 1216 + }, + { + "epoch": 0.9736, + "grad_norm": 0.36333310408385744, + "learning_rate": 3.6561800039403016e-07, + "loss": 0.5642, + "step": 1217 + }, + { + "epoch": 0.9744, + "grad_norm": 0.34357775418269904, + "learning_rate": 3.4380758477219333e-07, + "loss": 0.562, + "step": 1218 + }, + { + "epoch": 0.9752, + "grad_norm": 0.3374720421675638, + "learning_rate": 3.2266674310589273e-07, + "loss": 0.5757, + "step": 1219 + }, + { + "epoch": 0.976, + "grad_norm": 0.3240559607434544, + "learning_rate": 3.0219561743707326e-07, + "loss": 0.586, + "step": 1220 + }, + { + "epoch": 0.9768, + "grad_norm": 0.3235001197601677, + "learning_rate": 2.8239434530792365e-07, + "loss": 0.5964, + "step": 1221 + }, + { + "epoch": 0.9776, + "grad_norm": 0.32646149016401926, + "learning_rate": 2.6326305976001055e-07, + "loss": 0.6076, + "step": 1222 + }, + { + "epoch": 0.9784, + "grad_norm": 0.3433792228141088, + "learning_rate": 2.448018893333681e-07, + "loss": 0.6089, + "step": 1223 + }, + { + "epoch": 0.9792, + "grad_norm": 0.3406389457841003, + "learning_rate": 2.2701095806565432e-07, + "loss": 0.5476, + "step": 1224 + }, + { + "epoch": 0.98, + "grad_norm": 0.3666160110041597, + "learning_rate": 2.098903854912515e-07, + "loss": 0.6208, + "step": 1225 + }, + { + "epoch": 0.9808, + "grad_norm": 0.3339058118069488, + "learning_rate": 1.9344028664056713e-07, + "loss": 0.605, + "step": 1226 + }, + { + "epoch": 0.9816, + "grad_norm": 0.31960892698928794, + "learning_rate": 1.7766077203915655e-07, + "loss": 0.5785, + "step": 1227 + }, + { + "epoch": 0.9824, + "grad_norm": 0.35726561286788344, + "learning_rate": 1.6255194770704586e-07, + "loss": 0.5882, + "step": 1228 + }, + { + "epoch": 0.9832, + "grad_norm": 0.34382423390343725, + "learning_rate": 1.481139151579991e-07, + "loss": 0.6186, + "step": 1229 + }, + { + "epoch": 0.984, + "grad_norm": 0.33950063476087333, + "learning_rate": 1.3434677139885222e-07, + "loss": 0.6131, + "step": 1230 + }, + { + "epoch": 0.9848, + "grad_norm": 0.3374258237510089, + "learning_rate": 1.2125060892881346e-07, + "loss": 0.601, + "step": 1231 + }, + { + "epoch": 0.9856, + "grad_norm": 0.32243906417604673, + "learning_rate": 1.0882551573891953e-07, + "loss": 0.5592, + "step": 1232 + }, + { + "epoch": 0.9864, + "grad_norm": 0.33738749681016067, + "learning_rate": 9.707157531134713e-08, + "loss": 0.6265, + "step": 1233 + }, + { + "epoch": 0.9872, + "grad_norm": 0.36964025638056597, + "learning_rate": 8.598886661895788e-08, + "loss": 0.6118, + "step": 1234 + }, + { + "epoch": 0.988, + "grad_norm": 0.33757112059065936, + "learning_rate": 7.557746412468758e-08, + "loss": 0.6032, + "step": 1235 + }, + { + "epoch": 0.9888, + "grad_norm": 0.31982456897221395, + "learning_rate": 6.583743778106887e-08, + "loss": 0.5737, + "step": 1236 + }, + { + "epoch": 0.9896, + "grad_norm": 0.3369736515602363, + "learning_rate": 5.6768853029787184e-08, + "loss": 0.5822, + "step": 1237 + }, + { + "epoch": 0.9904, + "grad_norm": 0.32918194943225737, + "learning_rate": 4.837177080119215e-08, + "loss": 0.578, + "step": 1238 + }, + { + "epoch": 0.9912, + "grad_norm": 0.3409591484904072, + "learning_rate": 4.064624751394242e-08, + "loss": 0.6282, + "step": 1239 + }, + { + "epoch": 0.992, + "grad_norm": 0.3775219527797019, + "learning_rate": 3.359233507459481e-08, + "loss": 0.605, + "step": 1240 + }, + { + "epoch": 0.9928, + "grad_norm": 0.33899188697787547, + "learning_rate": 2.7210080877237976e-08, + "loss": 0.5586, + "step": 1241 + }, + { + "epoch": 0.9936, + "grad_norm": 0.35085086397272225, + "learning_rate": 2.1499527803214846e-08, + "loss": 0.6007, + "step": 1242 + }, + { + "epoch": 0.9944, + "grad_norm": 0.38394310885156824, + "learning_rate": 1.646071422083395e-08, + "loss": 0.5863, + "step": 1243 + }, + { + "epoch": 0.9952, + "grad_norm": 0.3307196575296069, + "learning_rate": 1.209367398504746e-08, + "loss": 0.6356, + "step": 1244 + }, + { + "epoch": 0.996, + "grad_norm": 0.33602057099272453, + "learning_rate": 8.398436437317969e-09, + "loss": 0.5993, + "step": 1245 + }, + { + "epoch": 0.9968, + "grad_norm": 0.3508656340887223, + "learning_rate": 5.375026405352035e-09, + "loss": 0.6385, + "step": 1246 + }, + { + "epoch": 0.9976, + "grad_norm": 0.32674316969959266, + "learning_rate": 3.023464202944748e-09, + "loss": 0.5244, + "step": 1247 + }, + { + "epoch": 0.9984, + "grad_norm": 0.3917784573175883, + "learning_rate": 1.3437656298687097e-09, + "loss": 0.5978, + "step": 1248 + }, + { + "epoch": 0.9992, + "grad_norm": 0.3415814546985931, + "learning_rate": 3.3594197175190745e-10, + "loss": 0.5808, + "step": 1249 + }, + { + "epoch": 1.0, + "grad_norm": 0.33900386574972013, + "learning_rate": 0.0, + "loss": 0.5779, + "step": 1250 + }, + { + "epoch": 1.0, + "step": 1250, + "total_flos": 1113230063894528.0, + "train_loss": 0.6684961423873902, + "train_runtime": 19805.2254, + "train_samples_per_second": 1.01, + "train_steps_per_second": 0.063 + } + ], + "logging_steps": 1.0, + "max_steps": 1250, + "num_input_tokens_seen": 0, + "num_train_epochs": 1, + "save_steps": 500, + "stateful_callbacks": { + "TrainerControl": { + "args": { + "should_epoch_stop": false, + "should_evaluate": false, + "should_log": false, + "should_save": true, + "should_training_stop": false + }, + "attributes": {} + } + }, + "total_flos": 1113230063894528.0, + "train_batch_size": 8, + "trial_name": null, + "trial_params": null +} diff --git a/single_dataset/img2json/VideoGameBunny_v1_1-Llama-3-8B-V-img2json_dataset_2000_epochs_1_lora/README.md b/single_dataset/img2json/VideoGameBunny_v1_1-Llama-3-8B-V-img2json_dataset_2000_epochs_1_lora/README.md new file mode 100644 index 0000000000000000000000000000000000000000..4f291d8d6fdb82fcff274f83bcc52b834347797c --- /dev/null +++ b/single_dataset/img2json/VideoGameBunny_v1_1-Llama-3-8B-V-img2json_dataset_2000_epochs_1_lora/README.md @@ -0,0 +1,202 @@ +--- +library_name: peft +base_model: ./weights/Bunny-v1_1-Llama-3-8B-V +--- + +# Model Card for Model ID + + + + + +## Model Details + +### Model Description + + + + + +- **Developed by:** [More Information Needed] +- **Funded by [optional]:** [More Information Needed] +- **Shared by [optional]:** [More Information Needed] +- **Model type:** [More Information Needed] +- **Language(s) (NLP):** [More Information Needed] +- **License:** [More Information Needed] +- **Finetuned from model [optional]:** [More Information Needed] + +### Model Sources [optional] + + + +- **Repository:** [More Information Needed] +- **Paper [optional]:** [More Information Needed] +- **Demo [optional]:** [More Information Needed] + +## Uses + + + +### Direct Use + + + +[More Information Needed] + +### Downstream Use [optional] + + + +[More Information Needed] + +### Out-of-Scope Use + + + +[More Information Needed] + +## Bias, Risks, and Limitations + + + +[More Information Needed] + +### Recommendations + + + +Users (both direct and downstream) should be made aware of the risks, biases and limitations of the model. More information needed for further recommendations. + +## How to Get Started with the Model + +Use the code below to get started with the model. + +[More Information Needed] + +## Training Details + +### Training Data + + + +[More Information Needed] + +### Training Procedure + + + +#### Preprocessing [optional] + +[More Information Needed] + + +#### Training Hyperparameters + +- **Training regime:** [More Information Needed] + +#### Speeds, Sizes, Times [optional] + + + +[More Information Needed] + +## Evaluation + + + +### Testing Data, Factors & Metrics + +#### Testing Data + + + +[More Information Needed] + +#### Factors + + + +[More Information Needed] + +#### Metrics + + + +[More Information Needed] + +### Results + +[More Information Needed] + +#### Summary + + + +## Model Examination [optional] + + + +[More Information Needed] + +## Environmental Impact + + + +Carbon emissions can be estimated using the [Machine Learning Impact calculator](https://mlco2.github.io/impact#compute) presented in [Lacoste et al. (2019)](https://arxiv.org/abs/1910.09700). + +- **Hardware Type:** [More Information Needed] +- **Hours used:** [More Information Needed] +- **Cloud Provider:** [More Information Needed] +- **Compute Region:** [More Information Needed] +- **Carbon Emitted:** [More Information Needed] + +## Technical Specifications [optional] + +### Model Architecture and Objective + +[More Information Needed] + +### Compute Infrastructure + +[More Information Needed] + +#### Hardware + +[More Information Needed] + +#### Software + +[More Information Needed] + +## Citation [optional] + + + +**BibTeX:** + +[More Information Needed] + +**APA:** + +[More Information Needed] + +## Glossary [optional] + + + +[More Information Needed] + +## More Information [optional] + +[More Information Needed] + +## Model Card Authors [optional] + +[More Information Needed] + +## Model Card Contact + +[More Information Needed] +### Framework versions + +- PEFT 0.11.1 \ No newline at end of file diff --git a/single_dataset/img2json/VideoGameBunny_v1_1-Llama-3-8B-V-img2json_dataset_2000_epochs_1_lora/adapter_config.json b/single_dataset/img2json/VideoGameBunny_v1_1-Llama-3-8B-V-img2json_dataset_2000_epochs_1_lora/adapter_config.json new file mode 100644 index 0000000000000000000000000000000000000000..ddab6017a1a1fb6922a0b0971ea863eb587ac2fc --- /dev/null +++ b/single_dataset/img2json/VideoGameBunny_v1_1-Llama-3-8B-V-img2json_dataset_2000_epochs_1_lora/adapter_config.json @@ -0,0 +1,34 @@ +{ + "alpha_pattern": {}, + "auto_mapping": null, + "base_model_name_or_path": "./weights/Bunny-v1_1-Llama-3-8B-V", + "bias": "none", + "fan_in_fan_out": false, + "inference_mode": true, + "init_lora_weights": true, + "layer_replication": null, + "layers_pattern": null, + "layers_to_transform": null, + "loftq_config": {}, + "lora_alpha": 256, + "lora_dropout": 0.1, + "megatron_config": null, + "megatron_core": "megatron.core", + "modules_to_save": null, + "peft_type": "LORA", + "r": 128, + "rank_pattern": {}, + "revision": null, + "target_modules": [ + "q_proj", + "gate_proj", + "v_proj", + "up_proj", + "o_proj", + "k_proj", + "down_proj" + ], + "task_type": "CAUSAL_LM", + "use_dora": false, + "use_rslora": false +} \ No newline at end of file diff --git a/single_dataset/img2json/VideoGameBunny_v1_1-Llama-3-8B-V-img2json_dataset_2000_epochs_1_lora/adapter_model.safetensors b/single_dataset/img2json/VideoGameBunny_v1_1-Llama-3-8B-V-img2json_dataset_2000_epochs_1_lora/adapter_model.safetensors new file mode 100644 index 0000000000000000000000000000000000000000..5676b622f362e2aca7ec5e323372c92c042dd285 --- /dev/null +++ b/single_dataset/img2json/VideoGameBunny_v1_1-Llama-3-8B-V-img2json_dataset_2000_epochs_1_lora/adapter_model.safetensors @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:c8262ee6a0c00b8ce095b990a942055d72e79c6072afadd6521a5f42aef69fad +size 671150064 diff --git a/single_dataset/img2json/VideoGameBunny_v1_1-Llama-3-8B-V-img2json_dataset_2000_epochs_1_lora/config.json b/single_dataset/img2json/VideoGameBunny_v1_1-Llama-3-8B-V-img2json_dataset_2000_epochs_1_lora/config.json new file mode 100644 index 0000000000000000000000000000000000000000..ee7ed86f4ff6f3381a6bb24981934be89aa376e7 --- /dev/null +++ b/single_dataset/img2json/VideoGameBunny_v1_1-Llama-3-8B-V-img2json_dataset_2000_epochs_1_lora/config.json @@ -0,0 +1,45 @@ +{ + "_name_or_path": "./weights/Bunny-v1_1-Llama-3-8B-V", + "architectures": [ + "BunnyLlamaForCausalLM" + ], + "attention_bias": false, + "attention_dropout": 0.0, + "auto_map": { + "AutoConfig": "configuration_bunny_llama.BunnyLlamaConfig", + "AutoModelForCausalLM": "modeling_bunny_llama.BunnyLlamaForCausalLM" + }, + "bos_token_id": 128000, + "continuous_training": false, + "eos_token_id": 128001, + "freeze_mm_mlp_adapter": false, + "hidden_act": "silu", + "hidden_size": 4096, + "image_aspect_ratio": "pad", + "initializer_range": 0.02, + "intermediate_size": 14336, + "max_position_embeddings": 8192, + "mm_hidden_size": 3456, + "mm_projector_lr": null, + "mm_projector_type": "mlp2x_gelu", + "mm_vision_tower": "./weights/siglip-so400m-patch14-384", + "model_type": "bunny-llama", + "num_attention_heads": 32, + "num_hidden_layers": 32, + "num_key_value_heads": 8, + "pretraining_tp": 1, + "rms_norm_eps": 1e-05, + "rope_scaling": null, + "rope_theta": 500000.0, + "tie_word_embeddings": false, + "tokenizer_model_max_length": 2048, + "tokenizer_padding_side": "right", + "torch_dtype": "float16", + "transformers_version": "4.41.2", + "tune_mm_mlp_adapter": false, + "unfreeze_vision_tower": true, + "use_cache": true, + "use_mm_proj": true, + "use_s2": true, + "vocab_size": 128256 +} diff --git a/single_dataset/img2json/VideoGameBunny_v1_1-Llama-3-8B-V-img2json_dataset_2000_epochs_1_lora/non_lora_trainables.bin b/single_dataset/img2json/VideoGameBunny_v1_1-Llama-3-8B-V-img2json_dataset_2000_epochs_1_lora/non_lora_trainables.bin new file mode 100644 index 0000000000000000000000000000000000000000..e8fb1a17d6759b87a3b29b357dbabd8927e1935c --- /dev/null +++ b/single_dataset/img2json/VideoGameBunny_v1_1-Llama-3-8B-V-img2json_dataset_2000_epochs_1_lora/non_lora_trainables.bin @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:dc232555e409ab19e56350ef2c9b0c80c0b12c30a0291e5aaa3b3d52cdb8089e +size 918507402 diff --git a/single_dataset/img2json/VideoGameBunny_v1_1-Llama-3-8B-V-img2json_dataset_2000_epochs_1_lora/trainer_state.json b/single_dataset/img2json/VideoGameBunny_v1_1-Llama-3-8B-V-img2json_dataset_2000_epochs_1_lora/trainer_state.json new file mode 100644 index 0000000000000000000000000000000000000000..56daf9503a7c4fdf85b35199d042c2bd2782ce6b --- /dev/null +++ b/single_dataset/img2json/VideoGameBunny_v1_1-Llama-3-8B-V-img2json_dataset_2000_epochs_1_lora/trainer_state.json @@ -0,0 +1,917 @@ +{ + "best_metric": null, + "best_model_checkpoint": null, + "epoch": 1.0, + "eval_steps": 500, + "global_step": 125, + "is_hyper_param_search": false, + "is_local_process_zero": true, + "is_world_process_zero": true, + "log_history": [ + { + "epoch": 0.008, + "grad_norm": 1.1618730196957123, + "learning_rate": 5e-05, + "loss": 1.5515, + "step": 1 + }, + { + "epoch": 0.016, + "grad_norm": 1.15537504646185, + "learning_rate": 0.0001, + "loss": 1.5311, + "step": 2 + }, + { + "epoch": 0.024, + "grad_norm": 0.9635490214235893, + "learning_rate": 0.00015000000000000001, + "loss": 1.4312, + "step": 3 + }, + { + "epoch": 0.032, + "grad_norm": 1.3893228481848556, + "learning_rate": 0.0002, + "loss": 1.167, + "step": 4 + }, + { + "epoch": 0.04, + "grad_norm": 1.3289968323340917, + "learning_rate": 0.00019996629653035126, + "loss": 1.0447, + "step": 5 + }, + { + "epoch": 0.048, + "grad_norm": 0.8072806305690539, + "learning_rate": 0.00019986520883988232, + "loss": 1.0105, + "step": 6 + }, + { + "epoch": 0.056, + "grad_norm": 0.6177964607456953, + "learning_rate": 0.00019969680506871137, + "loss": 0.8789, + "step": 7 + }, + { + "epoch": 0.064, + "grad_norm": 0.5524814937302823, + "learning_rate": 0.00019946119873266613, + "loss": 0.8635, + "step": 8 + }, + { + "epoch": 0.072, + "grad_norm": 0.5319431137217482, + "learning_rate": 0.00019915854864676664, + "loss": 0.9212, + "step": 9 + }, + { + "epoch": 0.08, + "grad_norm": 0.5995041018941503, + "learning_rate": 0.00019878905881817252, + "loss": 0.9419, + "step": 10 + }, + { + "epoch": 0.088, + "grad_norm": 0.5241617758084439, + "learning_rate": 0.00019835297830866826, + "loss": 0.8659, + "step": 11 + }, + { + "epoch": 0.096, + "grad_norm": 0.5203540014161351, + "learning_rate": 0.00019785060106677818, + "loss": 0.8887, + "step": 12 + }, + { + "epoch": 0.104, + "grad_norm": 0.5144968235440169, + "learning_rate": 0.00019728226572962473, + "loss": 0.9298, + "step": 13 + }, + { + "epoch": 0.112, + "grad_norm": 0.44732252083116936, + "learning_rate": 0.0001966483553946637, + "loss": 0.8116, + "step": 14 + }, + { + "epoch": 0.12, + "grad_norm": 0.48923527048555243, + "learning_rate": 0.00019594929736144976, + "loss": 0.9035, + "step": 15 + }, + { + "epoch": 0.128, + "grad_norm": 0.4202366241305497, + "learning_rate": 0.00019518556284360696, + "loss": 0.863, + "step": 16 + }, + { + "epoch": 0.136, + "grad_norm": 0.4537252189692721, + "learning_rate": 0.0001943576666511982, + "loss": 0.8725, + "step": 17 + }, + { + "epoch": 0.144, + "grad_norm": 0.4306220050888526, + "learning_rate": 0.0001934661668437073, + "loss": 0.8376, + "step": 18 + }, + { + "epoch": 0.152, + "grad_norm": 0.3894710263894399, + "learning_rate": 0.0001925116643538684, + "loss": 0.8287, + "step": 19 + }, + { + "epoch": 0.16, + "grad_norm": 0.43976119402396047, + "learning_rate": 0.00019149480258259533, + "loss": 0.823, + "step": 20 + }, + { + "epoch": 0.168, + "grad_norm": 0.40179621147791533, + "learning_rate": 0.00019041626696528503, + "loss": 0.7952, + "step": 21 + }, + { + "epoch": 0.176, + "grad_norm": 0.4345187904250498, + "learning_rate": 0.0001892767845097864, + "loss": 0.8617, + "step": 22 + }, + { + "epoch": 0.184, + "grad_norm": 0.4131232137462944, + "learning_rate": 0.00018807712330634642, + "loss": 0.8384, + "step": 23 + }, + { + "epoch": 0.192, + "grad_norm": 0.40603865564196473, + "learning_rate": 0.0001868180920098644, + "loss": 0.8331, + "step": 24 + }, + { + "epoch": 0.2, + "grad_norm": 0.3951360345025102, + "learning_rate": 0.00018550053929480202, + "loss": 0.7836, + "step": 25 + }, + { + "epoch": 0.208, + "grad_norm": 0.40477643152693216, + "learning_rate": 0.00018412535328311814, + "loss": 0.7748, + "step": 26 + }, + { + "epoch": 0.216, + "grad_norm": 0.4251533568833813, + "learning_rate": 0.0001826934609456129, + "loss": 0.8444, + "step": 27 + }, + { + "epoch": 0.224, + "grad_norm": 0.4071344808654217, + "learning_rate": 0.00018120582747708502, + "loss": 0.8003, + "step": 28 + }, + { + "epoch": 0.232, + "grad_norm": 0.408159311901172, + "learning_rate": 0.0001796634556457236, + "loss": 0.8071, + "step": 29 + }, + { + "epoch": 0.24, + "grad_norm": 0.39865492737418323, + "learning_rate": 0.0001780673851171728, + "loss": 0.7849, + "step": 30 + }, + { + "epoch": 0.248, + "grad_norm": 0.4276310275653838, + "learning_rate": 0.00017641869175372493, + "loss": 0.8444, + "step": 31 + }, + { + "epoch": 0.256, + "grad_norm": 0.434386133606189, + "learning_rate": 0.00017471848688911464, + "loss": 0.8215, + "step": 32 + }, + { + "epoch": 0.264, + "grad_norm": 0.3822536210786355, + "learning_rate": 0.000172967916579403, + "loss": 0.8132, + "step": 33 + }, + { + "epoch": 0.272, + "grad_norm": 0.3913709039902287, + "learning_rate": 0.00017116816083045602, + "loss": 0.8282, + "step": 34 + }, + { + "epoch": 0.28, + "grad_norm": 0.4171383919051642, + "learning_rate": 0.0001693204328025389, + "loss": 0.8489, + "step": 35 + }, + { + "epoch": 0.288, + "grad_norm": 0.38777521576935947, + "learning_rate": 0.00016742597799256182, + "loss": 0.7557, + "step": 36 + }, + { + "epoch": 0.296, + "grad_norm": 0.38572156231558025, + "learning_rate": 0.00016548607339452853, + "loss": 0.7754, + "step": 37 + }, + { + "epoch": 0.304, + "grad_norm": 0.4104203019193881, + "learning_rate": 0.00016350202663875386, + "loss": 0.7698, + "step": 38 + }, + { + "epoch": 0.312, + "grad_norm": 0.5677828834671339, + "learning_rate": 0.0001614751751104301, + "loss": 0.7886, + "step": 39 + }, + { + "epoch": 0.32, + "grad_norm": 0.4077418306551607, + "learning_rate": 0.00015940688504813662, + "loss": 0.8048, + "step": 40 + }, + { + "epoch": 0.328, + "grad_norm": 0.41489722264421747, + "learning_rate": 0.00015729855062290022, + "loss": 0.7622, + "step": 41 + }, + { + "epoch": 0.336, + "grad_norm": 0.39615414916810177, + "learning_rate": 0.00015515159299842707, + "loss": 0.7987, + "step": 42 + }, + { + "epoch": 0.344, + "grad_norm": 0.40912637858839207, + "learning_rate": 0.00015296745937313987, + "loss": 0.7637, + "step": 43 + }, + { + "epoch": 0.352, + "grad_norm": 0.38793742176167395, + "learning_rate": 0.00015074762200466556, + "loss": 0.8198, + "step": 44 + }, + { + "epoch": 0.36, + "grad_norm": 0.3979430102052743, + "learning_rate": 0.00014849357721743168, + "loss": 0.8026, + "step": 45 + }, + { + "epoch": 0.368, + "grad_norm": 0.3788509235665202, + "learning_rate": 0.00014620684439403962, + "loss": 0.8023, + "step": 46 + }, + { + "epoch": 0.376, + "grad_norm": 0.39278185642418834, + "learning_rate": 0.0001438889649510956, + "loss": 0.7481, + "step": 47 + }, + { + "epoch": 0.384, + "grad_norm": 0.3696223706213371, + "learning_rate": 0.00014154150130018866, + "loss": 0.7609, + "step": 48 + }, + { + "epoch": 0.392, + "grad_norm": 0.39313607939291717, + "learning_rate": 0.00013916603579471705, + "loss": 0.8157, + "step": 49 + }, + { + "epoch": 0.4, + "grad_norm": 0.3855055319878002, + "learning_rate": 0.000136764169663272, + "loss": 0.7638, + "step": 50 + }, + { + "epoch": 0.408, + "grad_norm": 0.37066834001446136, + "learning_rate": 0.00013433752193029886, + "loss": 0.758, + "step": 51 + }, + { + "epoch": 0.416, + "grad_norm": 0.4170209578873234, + "learning_rate": 0.00013188772832476188, + "loss": 0.834, + "step": 52 + }, + { + "epoch": 0.424, + "grad_norm": 0.37700762349814604, + "learning_rate": 0.00012941644017754964, + "loss": 0.7533, + "step": 53 + }, + { + "epoch": 0.432, + "grad_norm": 0.3694537319289278, + "learning_rate": 0.00012692532330836346, + "loss": 0.7159, + "step": 54 + }, + { + "epoch": 0.44, + "grad_norm": 0.3764585488672252, + "learning_rate": 0.00012441605690283915, + "loss": 0.7762, + "step": 55 + }, + { + "epoch": 0.448, + "grad_norm": 0.36568785296633305, + "learning_rate": 0.0001218903323806595, + "loss": 0.7848, + "step": 56 + }, + { + "epoch": 0.456, + "grad_norm": 0.3642739327906848, + "learning_rate": 0.00011934985225541998, + "loss": 0.8302, + "step": 57 + }, + { + "epoch": 0.464, + "grad_norm": 0.38223649800753023, + "learning_rate": 0.00011679632898701649, + "loss": 0.7719, + "step": 58 + }, + { + "epoch": 0.472, + "grad_norm": 0.41018205417563863, + "learning_rate": 0.00011423148382732853, + "loss": 0.7676, + "step": 59 + }, + { + "epoch": 0.48, + "grad_norm": 0.37477718222402323, + "learning_rate": 0.00011165704565997593, + "loss": 0.7395, + "step": 60 + }, + { + "epoch": 0.488, + "grad_norm": 0.373178036681771, + "learning_rate": 0.00010907474983493144, + "loss": 0.7662, + "step": 61 + }, + { + "epoch": 0.496, + "grad_norm": 0.3869027594520346, + "learning_rate": 0.0001064863369987743, + "loss": 0.801, + "step": 62 + }, + { + "epoch": 0.504, + "grad_norm": 0.3525699091995798, + "learning_rate": 0.00010389355192137377, + "loss": 0.7239, + "step": 63 + }, + { + "epoch": 0.512, + "grad_norm": 0.37067545461002194, + "learning_rate": 0.0001012981423197931, + "loss": 0.7011, + "step": 64 + }, + { + "epoch": 0.52, + "grad_norm": 0.37290965245827035, + "learning_rate": 9.870185768020693e-05, + "loss": 0.7643, + "step": 65 + }, + { + "epoch": 0.528, + "grad_norm": 0.37698723891357505, + "learning_rate": 9.610644807862625e-05, + "loss": 0.7229, + "step": 66 + }, + { + "epoch": 0.536, + "grad_norm": 0.37310662528724425, + "learning_rate": 9.35136630012257e-05, + "loss": 0.7814, + "step": 67 + }, + { + "epoch": 0.544, + "grad_norm": 0.36161343594825424, + "learning_rate": 9.092525016506858e-05, + "loss": 0.7469, + "step": 68 + }, + { + "epoch": 0.552, + "grad_norm": 0.38820107691296035, + "learning_rate": 8.83429543400241e-05, + "loss": 0.8049, + "step": 69 + }, + { + "epoch": 0.56, + "grad_norm": 0.380257947651291, + "learning_rate": 8.57685161726715e-05, + "loss": 0.7446, + "step": 70 + }, + { + "epoch": 0.568, + "grad_norm": 0.3532116216773163, + "learning_rate": 8.320367101298351e-05, + "loss": 0.7377, + "step": 71 + }, + { + "epoch": 0.576, + "grad_norm": 0.3856516769452259, + "learning_rate": 8.065014774458003e-05, + "loss": 0.7489, + "step": 72 + }, + { + "epoch": 0.584, + "grad_norm": 0.37631098630814735, + "learning_rate": 7.810966761934053e-05, + "loss": 0.7719, + "step": 73 + }, + { + "epoch": 0.592, + "grad_norm": 0.38056900801863475, + "learning_rate": 7.558394309716088e-05, + "loss": 0.713, + "step": 74 + }, + { + "epoch": 0.6, + "grad_norm": 0.3475939820542006, + "learning_rate": 7.307467669163655e-05, + "loss": 0.7242, + "step": 75 + }, + { + "epoch": 0.608, + "grad_norm": 0.35273458966475485, + "learning_rate": 7.058355982245037e-05, + "loss": 0.748, + "step": 76 + }, + { + "epoch": 0.616, + "grad_norm": 0.3745939242868226, + "learning_rate": 6.811227167523815e-05, + "loss": 0.7519, + "step": 77 + }, + { + "epoch": 0.624, + "grad_norm": 0.3749958310986676, + "learning_rate": 6.566247806970119e-05, + "loss": 0.6751, + "step": 78 + }, + { + "epoch": 0.632, + "grad_norm": 0.346667180530484, + "learning_rate": 6.323583033672799e-05, + "loss": 0.6966, + "step": 79 + }, + { + "epoch": 0.64, + "grad_norm": 0.3747288183174455, + "learning_rate": 6.083396420528298e-05, + "loss": 0.8034, + "step": 80 + }, + { + "epoch": 0.648, + "grad_norm": 0.3533701171483392, + "learning_rate": 5.845849869981137e-05, + "loss": 0.7251, + "step": 81 + }, + { + "epoch": 0.656, + "grad_norm": 0.3591629687481598, + "learning_rate": 5.611103504890444e-05, + "loss": 0.7635, + "step": 82 + }, + { + "epoch": 0.664, + "grad_norm": 0.4084692648623678, + "learning_rate": 5.379315560596038e-05, + "loss": 0.6802, + "step": 83 + }, + { + "epoch": 0.672, + "grad_norm": 0.38688968930898393, + "learning_rate": 5.1506422782568345e-05, + "loss": 0.764, + "step": 84 + }, + { + "epoch": 0.68, + "grad_norm": 0.39312529151121633, + "learning_rate": 4.9252377995334444e-05, + "loss": 0.8013, + "step": 85 + }, + { + "epoch": 0.688, + "grad_norm": 0.3863181564669427, + "learning_rate": 4.703254062686017e-05, + "loss": 0.7498, + "step": 86 + }, + { + "epoch": 0.696, + "grad_norm": 0.3799061879135913, + "learning_rate": 4.484840700157295e-05, + "loss": 0.7561, + "step": 87 + }, + { + "epoch": 0.704, + "grad_norm": 0.3838739899745123, + "learning_rate": 4.270144937709981e-05, + "loss": 0.7573, + "step": 88 + }, + { + "epoch": 0.712, + "grad_norm": 0.3530265455361142, + "learning_rate": 4.059311495186338e-05, + "loss": 0.7013, + "step": 89 + }, + { + "epoch": 0.72, + "grad_norm": 0.33230437953255176, + "learning_rate": 3.852482488956992e-05, + "loss": 0.7152, + "step": 90 + }, + { + "epoch": 0.728, + "grad_norm": 0.3833248244274872, + "learning_rate": 3.649797336124615e-05, + "loss": 0.7238, + "step": 91 + }, + { + "epoch": 0.736, + "grad_norm": 0.39464827998307495, + "learning_rate": 3.45139266054715e-05, + "loss": 0.7773, + "step": 92 + }, + { + "epoch": 0.744, + "grad_norm": 0.36445590177391896, + "learning_rate": 3.257402200743821e-05, + "loss": 0.7877, + "step": 93 + }, + { + "epoch": 0.752, + "grad_norm": 0.357480514684254, + "learning_rate": 3.0679567197461134e-05, + "loss": 0.6932, + "step": 94 + }, + { + "epoch": 0.76, + "grad_norm": 0.37544630400071793, + "learning_rate": 2.8831839169543996e-05, + "loss": 0.7565, + "step": 95 + }, + { + "epoch": 0.768, + "grad_norm": 0.36203062045464296, + "learning_rate": 2.7032083420597e-05, + "loss": 0.6979, + "step": 96 + }, + { + "epoch": 0.776, + "grad_norm": 0.3729928884427148, + "learning_rate": 2.528151311088537e-05, + "loss": 0.6918, + "step": 97 + }, + { + "epoch": 0.784, + "grad_norm": 0.3649485582808177, + "learning_rate": 2.3581308246275103e-05, + "loss": 0.7278, + "step": 98 + }, + { + "epoch": 0.792, + "grad_norm": 0.3908594146852727, + "learning_rate": 2.1932614882827197e-05, + "loss": 0.7761, + "step": 99 + }, + { + "epoch": 0.8, + "grad_norm": 0.3642904141026686, + "learning_rate": 2.03365443542764e-05, + "loss": 0.6851, + "step": 100 + }, + { + "epoch": 0.808, + "grad_norm": 0.44262214768350755, + "learning_rate": 1.879417252291502e-05, + "loss": 0.6799, + "step": 101 + }, + { + "epoch": 0.816, + "grad_norm": 0.35993727801165104, + "learning_rate": 1.730653905438714e-05, + "loss": 0.7673, + "step": 102 + }, + { + "epoch": 0.824, + "grad_norm": 0.40347095599001326, + "learning_rate": 1.587464671688187e-05, + "loss": 0.716, + "step": 103 + }, + { + "epoch": 0.832, + "grad_norm": 0.3845207910750551, + "learning_rate": 1.4499460705197998e-05, + "loss": 0.7552, + "step": 104 + }, + { + "epoch": 0.84, + "grad_norm": 0.394463216691188, + "learning_rate": 1.3181907990135622e-05, + "loss": 0.7964, + "step": 105 + }, + { + "epoch": 0.848, + "grad_norm": 0.348375773555577, + "learning_rate": 1.1922876693653585e-05, + "loss": 0.7627, + "step": 106 + }, + { + "epoch": 0.856, + "grad_norm": 0.38679798921064223, + "learning_rate": 1.0723215490213634e-05, + "loss": 0.7596, + "step": 107 + }, + { + "epoch": 0.864, + "grad_norm": 0.34654567263972674, + "learning_rate": 9.583733034714981e-06, + "loss": 0.7239, + "step": 108 + }, + { + "epoch": 0.872, + "grad_norm": 0.35543614359220455, + "learning_rate": 8.505197417404687e-06, + "loss": 0.751, + "step": 109 + }, + { + "epoch": 0.88, + "grad_norm": 0.38143658550121157, + "learning_rate": 7.488335646131628e-06, + "loss": 0.7369, + "step": 110 + }, + { + "epoch": 0.888, + "grad_norm": 0.37067691582012285, + "learning_rate": 6.533833156292679e-06, + "loss": 0.7399, + "step": 111 + }, + { + "epoch": 0.896, + "grad_norm": 0.3488961142915352, + "learning_rate": 5.6423333488018095e-06, + "loss": 0.7192, + "step": 112 + }, + { + "epoch": 0.904, + "grad_norm": 0.33944638409930467, + "learning_rate": 4.8144371563930476e-06, + "loss": 0.6978, + "step": 113 + }, + { + "epoch": 0.912, + "grad_norm": 0.3568746231972527, + "learning_rate": 4.050702638550275e-06, + "loss": 0.7851, + "step": 114 + }, + { + "epoch": 0.92, + "grad_norm": 0.41401609220830243, + "learning_rate": 3.3516446053363015e-06, + "loss": 0.7084, + "step": 115 + }, + { + "epoch": 0.928, + "grad_norm": 0.35282787153112793, + "learning_rate": 2.717734270375272e-06, + "loss": 0.7244, + "step": 116 + }, + { + "epoch": 0.936, + "grad_norm": 0.3527796028448424, + "learning_rate": 2.1493989332218468e-06, + "loss": 0.6861, + "step": 117 + }, + { + "epoch": 0.944, + "grad_norm": 0.5300463886375664, + "learning_rate": 1.6470216913317626e-06, + "loss": 0.7752, + "step": 118 + }, + { + "epoch": 0.952, + "grad_norm": 0.3994773326174539, + "learning_rate": 1.2109411818274852e-06, + "loss": 0.6969, + "step": 119 + }, + { + "epoch": 0.96, + "grad_norm": 0.3651699090120398, + "learning_rate": 8.41451353233369e-07, + "loss": 0.7376, + "step": 120 + }, + { + "epoch": 0.968, + "grad_norm": 0.34676126862325934, + "learning_rate": 5.388012673338661e-07, + "loss": 0.7347, + "step": 121 + }, + { + "epoch": 0.976, + "grad_norm": 0.37764716508598595, + "learning_rate": 3.0319493128866396e-07, + "loss": 0.7745, + "step": 122 + }, + { + "epoch": 0.984, + "grad_norm": 0.3564807247913989, + "learning_rate": 1.3479116011769767e-07, + "loss": 0.7641, + "step": 123 + }, + { + "epoch": 0.992, + "grad_norm": 0.3742103364582621, + "learning_rate": 3.370346964876036e-08, + "loss": 0.7504, + "step": 124 + }, + { + "epoch": 1.0, + "grad_norm": 0.3985429773500903, + "learning_rate": 0.0, + "loss": 0.7313, + "step": 125 + }, + { + "epoch": 1.0, + "step": 125, + "total_flos": 114429774790656.0, + "train_loss": 0.7987350339889526, + "train_runtime": 1999.0989, + "train_samples_per_second": 1.0, + "train_steps_per_second": 0.063 + } + ], + "logging_steps": 1.0, + "max_steps": 125, + "num_input_tokens_seen": 0, + "num_train_epochs": 1, + "save_steps": 500, + "stateful_callbacks": { + "TrainerControl": { + "args": { + "should_epoch_stop": false, + "should_evaluate": false, + "should_log": false, + "should_save": false, + "should_training_stop": false + }, + "attributes": {} + } + }, + "total_flos": 114429774790656.0, + "train_batch_size": 8, + "trial_name": null, + "trial_params": null +} diff --git a/single_dataset/img2json/VideoGameBunny_v1_1-Llama-3-8B-V-img2json_dataset_40000_epochs_1_lora/README.md b/single_dataset/img2json/VideoGameBunny_v1_1-Llama-3-8B-V-img2json_dataset_40000_epochs_1_lora/README.md new file mode 100644 index 0000000000000000000000000000000000000000..4f291d8d6fdb82fcff274f83bcc52b834347797c --- /dev/null +++ b/single_dataset/img2json/VideoGameBunny_v1_1-Llama-3-8B-V-img2json_dataset_40000_epochs_1_lora/README.md @@ -0,0 +1,202 @@ +--- +library_name: peft +base_model: ./weights/Bunny-v1_1-Llama-3-8B-V +--- + +# Model Card for Model ID + + + + + +## Model Details + +### Model Description + + + + + +- **Developed by:** [More Information Needed] +- **Funded by [optional]:** [More Information Needed] +- **Shared by [optional]:** [More Information Needed] +- **Model type:** [More Information Needed] +- **Language(s) (NLP):** [More Information Needed] +- **License:** [More Information Needed] +- **Finetuned from model [optional]:** [More Information Needed] + +### Model Sources [optional] + + + +- **Repository:** [More Information Needed] +- **Paper [optional]:** [More Information Needed] +- **Demo [optional]:** [More Information Needed] + +## Uses + + + +### Direct Use + + + +[More Information Needed] + +### Downstream Use [optional] + + + +[More Information Needed] + +### Out-of-Scope Use + + + +[More Information Needed] + +## Bias, Risks, and Limitations + + + +[More Information Needed] + +### Recommendations + + + +Users (both direct and downstream) should be made aware of the risks, biases and limitations of the model. More information needed for further recommendations. + +## How to Get Started with the Model + +Use the code below to get started with the model. + +[More Information Needed] + +## Training Details + +### Training Data + + + +[More Information Needed] + +### Training Procedure + + + +#### Preprocessing [optional] + +[More Information Needed] + + +#### Training Hyperparameters + +- **Training regime:** [More Information Needed] + +#### Speeds, Sizes, Times [optional] + + + +[More Information Needed] + +## Evaluation + + + +### Testing Data, Factors & Metrics + +#### Testing Data + + + +[More Information Needed] + +#### Factors + + + +[More Information Needed] + +#### Metrics + + + +[More Information Needed] + +### Results + +[More Information Needed] + +#### Summary + + + +## Model Examination [optional] + + + +[More Information Needed] + +## Environmental Impact + + + +Carbon emissions can be estimated using the [Machine Learning Impact calculator](https://mlco2.github.io/impact#compute) presented in [Lacoste et al. (2019)](https://arxiv.org/abs/1910.09700). + +- **Hardware Type:** [More Information Needed] +- **Hours used:** [More Information Needed] +- **Cloud Provider:** [More Information Needed] +- **Compute Region:** [More Information Needed] +- **Carbon Emitted:** [More Information Needed] + +## Technical Specifications [optional] + +### Model Architecture and Objective + +[More Information Needed] + +### Compute Infrastructure + +[More Information Needed] + +#### Hardware + +[More Information Needed] + +#### Software + +[More Information Needed] + +## Citation [optional] + + + +**BibTeX:** + +[More Information Needed] + +**APA:** + +[More Information Needed] + +## Glossary [optional] + + + +[More Information Needed] + +## More Information [optional] + +[More Information Needed] + +## Model Card Authors [optional] + +[More Information Needed] + +## Model Card Contact + +[More Information Needed] +### Framework versions + +- PEFT 0.11.1 \ No newline at end of file diff --git a/single_dataset/img2json/VideoGameBunny_v1_1-Llama-3-8B-V-img2json_dataset_40000_epochs_1_lora/adapter_config.json b/single_dataset/img2json/VideoGameBunny_v1_1-Llama-3-8B-V-img2json_dataset_40000_epochs_1_lora/adapter_config.json new file mode 100644 index 0000000000000000000000000000000000000000..c7327bc9e0975f5675f0b1cad99f0ca78114c285 --- /dev/null +++ b/single_dataset/img2json/VideoGameBunny_v1_1-Llama-3-8B-V-img2json_dataset_40000_epochs_1_lora/adapter_config.json @@ -0,0 +1,34 @@ +{ + "alpha_pattern": {}, + "auto_mapping": null, + "base_model_name_or_path": "./weights/Bunny-v1_1-Llama-3-8B-V", + "bias": "none", + "fan_in_fan_out": false, + "inference_mode": true, + "init_lora_weights": true, + "layer_replication": null, + "layers_pattern": null, + "layers_to_transform": null, + "loftq_config": {}, + "lora_alpha": 256, + "lora_dropout": 0.1, + "megatron_config": null, + "megatron_core": "megatron.core", + "modules_to_save": null, + "peft_type": "LORA", + "r": 128, + "rank_pattern": {}, + "revision": null, + "target_modules": [ + "k_proj", + "o_proj", + "q_proj", + "v_proj", + "up_proj", + "gate_proj", + "down_proj" + ], + "task_type": "CAUSAL_LM", + "use_dora": false, + "use_rslora": false +} \ No newline at end of file diff --git a/single_dataset/img2json/VideoGameBunny_v1_1-Llama-3-8B-V-img2json_dataset_40000_epochs_1_lora/adapter_model.safetensors b/single_dataset/img2json/VideoGameBunny_v1_1-Llama-3-8B-V-img2json_dataset_40000_epochs_1_lora/adapter_model.safetensors new file mode 100644 index 0000000000000000000000000000000000000000..3c6191004ca47f0d06e57ebee60afa0c4a9679b9 --- /dev/null +++ b/single_dataset/img2json/VideoGameBunny_v1_1-Llama-3-8B-V-img2json_dataset_40000_epochs_1_lora/adapter_model.safetensors @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:b9ef59283510c9e7b49e3932426b46eabb879faff3c8887c30de65c24077d5fd +size 671150064 diff --git a/single_dataset/img2json/VideoGameBunny_v1_1-Llama-3-8B-V-img2json_dataset_40000_epochs_1_lora/config.json b/single_dataset/img2json/VideoGameBunny_v1_1-Llama-3-8B-V-img2json_dataset_40000_epochs_1_lora/config.json new file mode 100644 index 0000000000000000000000000000000000000000..ee7ed86f4ff6f3381a6bb24981934be89aa376e7 --- /dev/null +++ b/single_dataset/img2json/VideoGameBunny_v1_1-Llama-3-8B-V-img2json_dataset_40000_epochs_1_lora/config.json @@ -0,0 +1,45 @@ +{ + "_name_or_path": "./weights/Bunny-v1_1-Llama-3-8B-V", + "architectures": [ + "BunnyLlamaForCausalLM" + ], + "attention_bias": false, + "attention_dropout": 0.0, + "auto_map": { + "AutoConfig": "configuration_bunny_llama.BunnyLlamaConfig", + "AutoModelForCausalLM": "modeling_bunny_llama.BunnyLlamaForCausalLM" + }, + "bos_token_id": 128000, + "continuous_training": false, + "eos_token_id": 128001, + "freeze_mm_mlp_adapter": false, + "hidden_act": "silu", + "hidden_size": 4096, + "image_aspect_ratio": "pad", + "initializer_range": 0.02, + "intermediate_size": 14336, + "max_position_embeddings": 8192, + "mm_hidden_size": 3456, + "mm_projector_lr": null, + "mm_projector_type": "mlp2x_gelu", + "mm_vision_tower": "./weights/siglip-so400m-patch14-384", + "model_type": "bunny-llama", + "num_attention_heads": 32, + "num_hidden_layers": 32, + "num_key_value_heads": 8, + "pretraining_tp": 1, + "rms_norm_eps": 1e-05, + "rope_scaling": null, + "rope_theta": 500000.0, + "tie_word_embeddings": false, + "tokenizer_model_max_length": 2048, + "tokenizer_padding_side": "right", + "torch_dtype": "float16", + "transformers_version": "4.41.2", + "tune_mm_mlp_adapter": false, + "unfreeze_vision_tower": true, + "use_cache": true, + "use_mm_proj": true, + "use_s2": true, + "vocab_size": 128256 +} diff --git a/single_dataset/img2json/VideoGameBunny_v1_1-Llama-3-8B-V-img2json_dataset_40000_epochs_1_lora/non_lora_trainables.bin b/single_dataset/img2json/VideoGameBunny_v1_1-Llama-3-8B-V-img2json_dataset_40000_epochs_1_lora/non_lora_trainables.bin new file mode 100644 index 0000000000000000000000000000000000000000..d70a1447480e247ef0e5b0c3486d0b95f8fa6f2e --- /dev/null +++ b/single_dataset/img2json/VideoGameBunny_v1_1-Llama-3-8B-V-img2json_dataset_40000_epochs_1_lora/non_lora_trainables.bin @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:d3a09b73edd598b333e4c155070f11cbe643f9b38281bac20c220a35ec1c5993 +size 918507402 diff --git a/single_dataset/img2json/VideoGameBunny_v1_1-Llama-3-8B-V-img2json_dataset_40000_epochs_1_lora/trainer_state.json b/single_dataset/img2json/VideoGameBunny_v1_1-Llama-3-8B-V-img2json_dataset_40000_epochs_1_lora/trainer_state.json new file mode 100644 index 0000000000000000000000000000000000000000..770135592cc16f102bef0cff872dcf4ad8777ffd --- /dev/null +++ b/single_dataset/img2json/VideoGameBunny_v1_1-Llama-3-8B-V-img2json_dataset_40000_epochs_1_lora/trainer_state.json @@ -0,0 +1,17542 @@ +{ + "best_metric": null, + "best_model_checkpoint": null, + "epoch": 1.0, + "eval_steps": 500, + "global_step": 2500, + "is_hyper_param_search": false, + "is_local_process_zero": true, + "is_world_process_zero": true, + "log_history": [ + { + "epoch": 0.0004, + "grad_norm": 1.233553174704933, + "learning_rate": 2.666666666666667e-06, + "loss": 1.5715, + "step": 1 + }, + { + "epoch": 0.0008, + "grad_norm": 1.3337982820383152, + "learning_rate": 5.333333333333334e-06, + "loss": 1.5787, + "step": 2 + }, + { + "epoch": 0.0012, + "grad_norm": 1.311564134607298, + "learning_rate": 8.000000000000001e-06, + "loss": 1.5487, + "step": 3 + }, + { + "epoch": 0.0016, + "grad_norm": 1.061096529351063, + "learning_rate": 1.0666666666666667e-05, + "loss": 1.5362, + "step": 4 + }, + { + "epoch": 0.002, + "grad_norm": 1.2336840237271833, + "learning_rate": 1.3333333333333333e-05, + "loss": 1.5033, + "step": 5 + }, + { + "epoch": 0.0024, + "grad_norm": 0.9783991334698707, + "learning_rate": 1.6000000000000003e-05, + "loss": 1.4035, + "step": 6 + }, + { + "epoch": 0.0028, + "grad_norm": 0.9119634003290528, + "learning_rate": 1.866666666666667e-05, + "loss": 1.4177, + "step": 7 + }, + { + "epoch": 0.0032, + "grad_norm": 0.9977006256744981, + "learning_rate": 2.1333333333333335e-05, + "loss": 1.321, + "step": 8 + }, + { + "epoch": 0.0036, + "grad_norm": 1.1542983904843214, + "learning_rate": 2.4e-05, + "loss": 1.239, + "step": 9 + }, + { + "epoch": 0.004, + "grad_norm": 0.9682399854813285, + "learning_rate": 2.6666666666666667e-05, + "loss": 1.2082, + "step": 10 + }, + { + "epoch": 0.0044, + "grad_norm": 0.9957686985180999, + "learning_rate": 2.9333333333333336e-05, + "loss": 1.1093, + "step": 11 + }, + { + "epoch": 0.0048, + "grad_norm": 1.7801228600481995, + "learning_rate": 3.2000000000000005e-05, + "loss": 1.0706, + "step": 12 + }, + { + "epoch": 0.0052, + "grad_norm": 0.8252284965178849, + "learning_rate": 3.466666666666667e-05, + "loss": 1.0651, + "step": 13 + }, + { + "epoch": 0.0056, + "grad_norm": 0.8172977694645526, + "learning_rate": 3.733333333333334e-05, + "loss": 1.0237, + "step": 14 + }, + { + "epoch": 0.006, + "grad_norm": 0.8846448752697776, + "learning_rate": 4e-05, + "loss": 0.9963, + "step": 15 + }, + { + "epoch": 0.0064, + "grad_norm": 0.7617761740155385, + "learning_rate": 4.266666666666667e-05, + "loss": 0.9759, + "step": 16 + }, + { + "epoch": 0.0068, + "grad_norm": 0.683601200285156, + "learning_rate": 4.5333333333333335e-05, + "loss": 0.8564, + "step": 17 + }, + { + "epoch": 0.0072, + "grad_norm": 0.7155019049565624, + "learning_rate": 4.8e-05, + "loss": 0.9246, + "step": 18 + }, + { + "epoch": 0.0076, + "grad_norm": 0.6216097253918912, + "learning_rate": 5.0666666666666674e-05, + "loss": 0.9475, + "step": 19 + }, + { + "epoch": 0.008, + "grad_norm": 0.6398971674842517, + "learning_rate": 5.333333333333333e-05, + "loss": 0.8955, + "step": 20 + }, + { + "epoch": 0.0084, + "grad_norm": 0.6220173239834369, + "learning_rate": 5.6000000000000006e-05, + "loss": 0.9069, + "step": 21 + }, + { + "epoch": 0.0088, + "grad_norm": 0.5676109594805903, + "learning_rate": 5.866666666666667e-05, + "loss": 0.8886, + "step": 22 + }, + { + "epoch": 0.0092, + "grad_norm": 0.5562866799273941, + "learning_rate": 6.133333333333334e-05, + "loss": 0.9011, + "step": 23 + }, + { + "epoch": 0.0096, + "grad_norm": 0.6399877528249651, + "learning_rate": 6.400000000000001e-05, + "loss": 0.8165, + "step": 24 + }, + { + "epoch": 0.01, + "grad_norm": 0.523177200801189, + "learning_rate": 6.666666666666667e-05, + "loss": 0.8198, + "step": 25 + }, + { + "epoch": 0.0104, + "grad_norm": 0.49637418009567086, + "learning_rate": 6.933333333333334e-05, + "loss": 0.8107, + "step": 26 + }, + { + "epoch": 0.0108, + "grad_norm": 0.5471005048916006, + "learning_rate": 7.2e-05, + "loss": 0.86, + "step": 27 + }, + { + "epoch": 0.0112, + "grad_norm": 0.5322454492355576, + "learning_rate": 7.466666666666667e-05, + "loss": 0.841, + "step": 28 + }, + { + "epoch": 0.0116, + "grad_norm": 0.48738685686042843, + "learning_rate": 7.733333333333333e-05, + "loss": 0.8626, + "step": 29 + }, + { + "epoch": 0.012, + "grad_norm": 0.5305190850010086, + "learning_rate": 8e-05, + "loss": 0.9097, + "step": 30 + }, + { + "epoch": 0.0124, + "grad_norm": 0.47861290923048533, + "learning_rate": 8.266666666666667e-05, + "loss": 0.8681, + "step": 31 + }, + { + "epoch": 0.0128, + "grad_norm": 0.4740060284166036, + "learning_rate": 8.533333333333334e-05, + "loss": 0.8481, + "step": 32 + }, + { + "epoch": 0.0132, + "grad_norm": 0.4723004681291369, + "learning_rate": 8.800000000000001e-05, + "loss": 0.8406, + "step": 33 + }, + { + "epoch": 0.0136, + "grad_norm": 0.5102009004568578, + "learning_rate": 9.066666666666667e-05, + "loss": 0.82, + "step": 34 + }, + { + "epoch": 0.014, + "grad_norm": 0.4534745152781649, + "learning_rate": 9.333333333333334e-05, + "loss": 0.7999, + "step": 35 + }, + { + "epoch": 0.0144, + "grad_norm": 0.44852670349819795, + "learning_rate": 9.6e-05, + "loss": 0.8555, + "step": 36 + }, + { + "epoch": 0.0148, + "grad_norm": 0.4832952532334212, + "learning_rate": 9.866666666666668e-05, + "loss": 0.8648, + "step": 37 + }, + { + "epoch": 0.0152, + "grad_norm": 0.47705929417929455, + "learning_rate": 0.00010133333333333335, + "loss": 0.757, + "step": 38 + }, + { + "epoch": 0.0156, + "grad_norm": 0.4607455254160932, + "learning_rate": 0.00010400000000000001, + "loss": 0.8459, + "step": 39 + }, + { + "epoch": 0.016, + "grad_norm": 0.4626227212847935, + "learning_rate": 0.00010666666666666667, + "loss": 0.8004, + "step": 40 + }, + { + "epoch": 0.0164, + "grad_norm": 0.4886210667054391, + "learning_rate": 0.00010933333333333333, + "loss": 0.796, + "step": 41 + }, + { + "epoch": 0.0168, + "grad_norm": 0.45953131550049225, + "learning_rate": 0.00011200000000000001, + "loss": 0.7777, + "step": 42 + }, + { + "epoch": 0.0172, + "grad_norm": 0.4988337626973592, + "learning_rate": 0.00011466666666666667, + "loss": 0.7889, + "step": 43 + }, + { + "epoch": 0.0176, + "grad_norm": 0.46646005480591024, + "learning_rate": 0.00011733333333333334, + "loss": 0.8255, + "step": 44 + }, + { + "epoch": 0.018, + "grad_norm": 0.45796241326093257, + "learning_rate": 0.00012, + "loss": 0.8138, + "step": 45 + }, + { + "epoch": 0.0184, + "grad_norm": 0.5007827146329478, + "learning_rate": 0.00012266666666666668, + "loss": 0.8442, + "step": 46 + }, + { + "epoch": 0.0188, + "grad_norm": 0.4353271252466811, + "learning_rate": 0.00012533333333333334, + "loss": 0.801, + "step": 47 + }, + { + "epoch": 0.0192, + "grad_norm": 0.45968526845933466, + "learning_rate": 0.00012800000000000002, + "loss": 0.7364, + "step": 48 + }, + { + "epoch": 0.0196, + "grad_norm": 0.49745369949682966, + "learning_rate": 0.00013066666666666668, + "loss": 0.8031, + "step": 49 + }, + { + "epoch": 0.02, + "grad_norm": 0.44668329837307647, + "learning_rate": 0.00013333333333333334, + "loss": 0.7674, + "step": 50 + }, + { + "epoch": 0.0204, + "grad_norm": 0.43655839767863414, + "learning_rate": 0.00013600000000000003, + "loss": 0.8486, + "step": 51 + }, + { + "epoch": 0.0208, + "grad_norm": 0.472683113679215, + "learning_rate": 0.00013866666666666669, + "loss": 0.787, + "step": 52 + }, + { + "epoch": 0.0212, + "grad_norm": 0.450526582200461, + "learning_rate": 0.00014133333333333334, + "loss": 0.8041, + "step": 53 + }, + { + "epoch": 0.0216, + "grad_norm": 0.5768027325528573, + "learning_rate": 0.000144, + "loss": 0.7016, + "step": 54 + }, + { + "epoch": 0.022, + "grad_norm": 0.481586914103551, + "learning_rate": 0.00014666666666666666, + "loss": 0.7489, + "step": 55 + }, + { + "epoch": 0.0224, + "grad_norm": 0.4505400211123096, + "learning_rate": 0.00014933333333333335, + "loss": 0.7228, + "step": 56 + }, + { + "epoch": 0.0228, + "grad_norm": 0.4542790336161104, + "learning_rate": 0.000152, + "loss": 0.7701, + "step": 57 + }, + { + "epoch": 0.0232, + "grad_norm": 0.44949587528522483, + "learning_rate": 0.00015466666666666667, + "loss": 0.7668, + "step": 58 + }, + { + "epoch": 0.0236, + "grad_norm": 0.4351243243946112, + "learning_rate": 0.00015733333333333333, + "loss": 0.826, + "step": 59 + }, + { + "epoch": 0.024, + "grad_norm": 0.4636031767973824, + "learning_rate": 0.00016, + "loss": 0.8101, + "step": 60 + }, + { + "epoch": 0.0244, + "grad_norm": 0.4523286961925652, + "learning_rate": 0.00016266666666666667, + "loss": 0.7196, + "step": 61 + }, + { + "epoch": 0.0248, + "grad_norm": 0.4411015337759786, + "learning_rate": 0.00016533333333333333, + "loss": 0.8573, + "step": 62 + }, + { + "epoch": 0.0252, + "grad_norm": 0.44374281606353655, + "learning_rate": 0.000168, + "loss": 0.8284, + "step": 63 + }, + { + "epoch": 0.0256, + "grad_norm": 0.43205787306575816, + "learning_rate": 0.00017066666666666668, + "loss": 0.7832, + "step": 64 + }, + { + "epoch": 0.026, + "grad_norm": 0.4500334614959462, + "learning_rate": 0.00017333333333333334, + "loss": 0.8236, + "step": 65 + }, + { + "epoch": 0.0264, + "grad_norm": 0.49176342841903986, + "learning_rate": 0.00017600000000000002, + "loss": 0.8431, + "step": 66 + }, + { + "epoch": 0.0268, + "grad_norm": 0.4171536469667789, + "learning_rate": 0.00017866666666666668, + "loss": 0.7936, + "step": 67 + }, + { + "epoch": 0.0272, + "grad_norm": 0.4441975548154934, + "learning_rate": 0.00018133333333333334, + "loss": 0.7874, + "step": 68 + }, + { + "epoch": 0.0276, + "grad_norm": 0.42173891568743316, + "learning_rate": 0.00018400000000000003, + "loss": 0.7767, + "step": 69 + }, + { + "epoch": 0.028, + "grad_norm": 0.43152248333642174, + "learning_rate": 0.0001866666666666667, + "loss": 0.8205, + "step": 70 + }, + { + "epoch": 0.0284, + "grad_norm": 0.43754948754338047, + "learning_rate": 0.00018933333333333335, + "loss": 0.7883, + "step": 71 + }, + { + "epoch": 0.0288, + "grad_norm": 0.4510713409723826, + "learning_rate": 0.000192, + "loss": 0.7889, + "step": 72 + }, + { + "epoch": 0.0292, + "grad_norm": 0.43305456062703274, + "learning_rate": 0.0001946666666666667, + "loss": 0.7761, + "step": 73 + }, + { + "epoch": 0.0296, + "grad_norm": 0.44421606021947135, + "learning_rate": 0.00019733333333333335, + "loss": 0.8069, + "step": 74 + }, + { + "epoch": 0.03, + "grad_norm": 0.4374806718919075, + "learning_rate": 0.0002, + "loss": 0.7374, + "step": 75 + }, + { + "epoch": 0.0304, + "grad_norm": 0.43669138824870574, + "learning_rate": 0.00019999991608372393, + "loss": 0.7494, + "step": 76 + }, + { + "epoch": 0.0308, + "grad_norm": 0.47021568540914727, + "learning_rate": 0.00019999966433503652, + "loss": 0.7765, + "step": 77 + }, + { + "epoch": 0.0312, + "grad_norm": 0.42215672127392045, + "learning_rate": 0.0001999992447543603, + "loss": 0.7376, + "step": 78 + }, + { + "epoch": 0.0316, + "grad_norm": 0.4550168133398646, + "learning_rate": 0.00019999865734239946, + "loss": 0.8398, + "step": 79 + }, + { + "epoch": 0.032, + "grad_norm": 0.4257398400765521, + "learning_rate": 0.00019999790210013988, + "loss": 0.7967, + "step": 80 + }, + { + "epoch": 0.0324, + "grad_norm": 0.46493621948454705, + "learning_rate": 0.0001999969790288491, + "loss": 0.7052, + "step": 81 + }, + { + "epoch": 0.0328, + "grad_norm": 0.41987088411626927, + "learning_rate": 0.00019999588813007633, + "loss": 0.7497, + "step": 82 + }, + { + "epoch": 0.0332, + "grad_norm": 0.4604213663969049, + "learning_rate": 0.00019999462940565243, + "loss": 0.7872, + "step": 83 + }, + { + "epoch": 0.0336, + "grad_norm": 0.4256343198719839, + "learning_rate": 0.00019999320285769, + "loss": 0.8006, + "step": 84 + }, + { + "epoch": 0.034, + "grad_norm": 0.44084009183711087, + "learning_rate": 0.0001999916084885832, + "loss": 0.8113, + "step": 85 + }, + { + "epoch": 0.0344, + "grad_norm": 0.4485125928528638, + "learning_rate": 0.00019998984630100792, + "loss": 0.7894, + "step": 86 + }, + { + "epoch": 0.0348, + "grad_norm": 0.43001354176382595, + "learning_rate": 0.0001999879162979217, + "loss": 0.7267, + "step": 87 + }, + { + "epoch": 0.0352, + "grad_norm": 0.42469727606498964, + "learning_rate": 0.0001999858184825637, + "loss": 0.7284, + "step": 88 + }, + { + "epoch": 0.0356, + "grad_norm": 0.42522972576982493, + "learning_rate": 0.00019998355285845475, + "loss": 0.7954, + "step": 89 + }, + { + "epoch": 0.036, + "grad_norm": 0.42623386533074803, + "learning_rate": 0.0001999811194293973, + "loss": 0.7646, + "step": 90 + }, + { + "epoch": 0.0364, + "grad_norm": 0.4419344880281263, + "learning_rate": 0.00019997851819947537, + "loss": 0.8297, + "step": 91 + }, + { + "epoch": 0.0368, + "grad_norm": 0.4152362718412211, + "learning_rate": 0.00019997574917305478, + "loss": 0.7784, + "step": 92 + }, + { + "epoch": 0.0372, + "grad_norm": 0.4232230671800955, + "learning_rate": 0.00019997281235478278, + "loss": 0.7855, + "step": 93 + }, + { + "epoch": 0.0376, + "grad_norm": 0.41852191170507774, + "learning_rate": 0.00019996970774958836, + "loss": 0.7304, + "step": 94 + }, + { + "epoch": 0.038, + "grad_norm": 0.44716788452141526, + "learning_rate": 0.00019996643536268204, + "loss": 0.7658, + "step": 95 + }, + { + "epoch": 0.0384, + "grad_norm": 0.4255123756712993, + "learning_rate": 0.0001999629951995559, + "loss": 0.7895, + "step": 96 + }, + { + "epoch": 0.0388, + "grad_norm": 0.4442132358144533, + "learning_rate": 0.00019995938726598373, + "loss": 0.7595, + "step": 97 + }, + { + "epoch": 0.0392, + "grad_norm": 0.41829577238427884, + "learning_rate": 0.00019995561156802079, + "loss": 0.6954, + "step": 98 + }, + { + "epoch": 0.0396, + "grad_norm": 0.4158609423759713, + "learning_rate": 0.0001999516681120039, + "loss": 0.7325, + "step": 99 + }, + { + "epoch": 0.04, + "grad_norm": 0.4316103491720492, + "learning_rate": 0.00019994755690455152, + "loss": 0.7705, + "step": 100 + }, + { + "epoch": 0.0404, + "grad_norm": 0.40851905497651775, + "learning_rate": 0.0001999432779525635, + "loss": 0.7624, + "step": 101 + }, + { + "epoch": 0.0408, + "grad_norm": 0.4319653076801545, + "learning_rate": 0.0001999388312632214, + "loss": 0.7191, + "step": 102 + }, + { + "epoch": 0.0412, + "grad_norm": 0.6966142087903838, + "learning_rate": 0.00019993421684398824, + "loss": 0.7768, + "step": 103 + }, + { + "epoch": 0.0416, + "grad_norm": 0.41710643293646915, + "learning_rate": 0.00019992943470260844, + "loss": 0.7996, + "step": 104 + }, + { + "epoch": 0.042, + "grad_norm": 0.4110556419161805, + "learning_rate": 0.00019992448484710797, + "loss": 0.7359, + "step": 105 + }, + { + "epoch": 0.0424, + "grad_norm": 0.5156907227676272, + "learning_rate": 0.00019991936728579437, + "loss": 0.7905, + "step": 106 + }, + { + "epoch": 0.0428, + "grad_norm": 0.42197055785073717, + "learning_rate": 0.00019991408202725655, + "loss": 0.7386, + "step": 107 + }, + { + "epoch": 0.0432, + "grad_norm": 0.43155124246288074, + "learning_rate": 0.0001999086290803649, + "loss": 0.7079, + "step": 108 + }, + { + "epoch": 0.0436, + "grad_norm": 0.41500836404692715, + "learning_rate": 0.00019990300845427125, + "loss": 0.7573, + "step": 109 + }, + { + "epoch": 0.044, + "grad_norm": 0.4277723496639902, + "learning_rate": 0.0001998972201584088, + "loss": 0.7001, + "step": 110 + }, + { + "epoch": 0.0444, + "grad_norm": 0.4203089960191111, + "learning_rate": 0.00019989126420249221, + "loss": 0.747, + "step": 111 + }, + { + "epoch": 0.0448, + "grad_norm": 0.40083853943140735, + "learning_rate": 0.00019988514059651752, + "loss": 0.7518, + "step": 112 + }, + { + "epoch": 0.0452, + "grad_norm": 0.4271473061997102, + "learning_rate": 0.00019987884935076213, + "loss": 0.7423, + "step": 113 + }, + { + "epoch": 0.0456, + "grad_norm": 0.4447134275311075, + "learning_rate": 0.00019987239047578482, + "loss": 0.7603, + "step": 114 + }, + { + "epoch": 0.046, + "grad_norm": 0.4141406035356386, + "learning_rate": 0.00019986576398242566, + "loss": 0.7072, + "step": 115 + }, + { + "epoch": 0.0464, + "grad_norm": 0.40561074396013835, + "learning_rate": 0.00019985896988180605, + "loss": 0.7718, + "step": 116 + }, + { + "epoch": 0.0468, + "grad_norm": 0.40220017292785776, + "learning_rate": 0.00019985200818532875, + "loss": 0.7502, + "step": 117 + }, + { + "epoch": 0.0472, + "grad_norm": 0.4233888412820622, + "learning_rate": 0.0001998448789046777, + "loss": 0.7652, + "step": 118 + }, + { + "epoch": 0.0476, + "grad_norm": 0.41629114657404, + "learning_rate": 0.00019983758205181822, + "loss": 0.763, + "step": 119 + }, + { + "epoch": 0.048, + "grad_norm": 0.4334543642986752, + "learning_rate": 0.00019983011763899673, + "loss": 0.7573, + "step": 120 + }, + { + "epoch": 0.0484, + "grad_norm": 0.44993886253918425, + "learning_rate": 0.00019982248567874098, + "loss": 0.7478, + "step": 121 + }, + { + "epoch": 0.0488, + "grad_norm": 0.4055243046386066, + "learning_rate": 0.00019981468618385988, + "loss": 0.7378, + "step": 122 + }, + { + "epoch": 0.0492, + "grad_norm": 0.4413650244469547, + "learning_rate": 0.00019980671916744352, + "loss": 0.6988, + "step": 123 + }, + { + "epoch": 0.0496, + "grad_norm": 0.44282863942194783, + "learning_rate": 0.00019979858464286317, + "loss": 0.7739, + "step": 124 + }, + { + "epoch": 0.05, + "grad_norm": 0.4003454200231535, + "learning_rate": 0.00019979028262377118, + "loss": 0.7124, + "step": 125 + }, + { + "epoch": 0.0504, + "grad_norm": 0.42277199793975706, + "learning_rate": 0.00019978181312410104, + "loss": 0.8094, + "step": 126 + }, + { + "epoch": 0.0508, + "grad_norm": 0.42727006528696454, + "learning_rate": 0.00019977317615806737, + "loss": 0.7637, + "step": 127 + }, + { + "epoch": 0.0512, + "grad_norm": 0.4194902391741022, + "learning_rate": 0.00019976437174016573, + "loss": 0.7225, + "step": 128 + }, + { + "epoch": 0.0516, + "grad_norm": 0.4083313507877906, + "learning_rate": 0.00019975539988517288, + "loss": 0.6898, + "step": 129 + }, + { + "epoch": 0.052, + "grad_norm": 0.4038095107764052, + "learning_rate": 0.00019974626060814647, + "loss": 0.7255, + "step": 130 + }, + { + "epoch": 0.0524, + "grad_norm": 0.40295378634918333, + "learning_rate": 0.0001997369539244252, + "loss": 0.7044, + "step": 131 + }, + { + "epoch": 0.0528, + "grad_norm": 0.42946977035313305, + "learning_rate": 0.0001997274798496287, + "loss": 0.8355, + "step": 132 + }, + { + "epoch": 0.0532, + "grad_norm": 0.45280745176400994, + "learning_rate": 0.00019971783839965756, + "loss": 0.8345, + "step": 133 + }, + { + "epoch": 0.0536, + "grad_norm": 0.40812858974254734, + "learning_rate": 0.00019970802959069328, + "loss": 0.7534, + "step": 134 + }, + { + "epoch": 0.054, + "grad_norm": 0.4005018025383052, + "learning_rate": 0.00019969805343919821, + "loss": 0.7328, + "step": 135 + }, + { + "epoch": 0.0544, + "grad_norm": 0.41213665680910816, + "learning_rate": 0.0001996879099619156, + "loss": 0.708, + "step": 136 + }, + { + "epoch": 0.0548, + "grad_norm": 0.3831675511800143, + "learning_rate": 0.00019967759917586953, + "loss": 0.7062, + "step": 137 + }, + { + "epoch": 0.0552, + "grad_norm": 0.4095960002451829, + "learning_rate": 0.00019966712109836476, + "loss": 0.7307, + "step": 138 + }, + { + "epoch": 0.0556, + "grad_norm": 0.4132300546181534, + "learning_rate": 0.000199656475746987, + "loss": 0.7887, + "step": 139 + }, + { + "epoch": 0.056, + "grad_norm": 0.397649037372497, + "learning_rate": 0.00019964566313960264, + "loss": 0.7265, + "step": 140 + }, + { + "epoch": 0.0564, + "grad_norm": 0.42120826285768476, + "learning_rate": 0.0001996346832943587, + "loss": 0.7836, + "step": 141 + }, + { + "epoch": 0.0568, + "grad_norm": 0.4342001860001955, + "learning_rate": 0.00019962353622968295, + "loss": 0.709, + "step": 142 + }, + { + "epoch": 0.0572, + "grad_norm": 0.4123432548955716, + "learning_rate": 0.00019961222196428378, + "loss": 0.767, + "step": 143 + }, + { + "epoch": 0.0576, + "grad_norm": 0.4145218974145123, + "learning_rate": 0.0001996007405171502, + "loss": 0.7571, + "step": 144 + }, + { + "epoch": 0.058, + "grad_norm": 0.4195605987194582, + "learning_rate": 0.00019958909190755187, + "loss": 0.7503, + "step": 145 + }, + { + "epoch": 0.0584, + "grad_norm": 0.408252075709149, + "learning_rate": 0.00019957727615503888, + "loss": 0.7169, + "step": 146 + }, + { + "epoch": 0.0588, + "grad_norm": 0.37858547837823253, + "learning_rate": 0.00019956529327944198, + "loss": 0.7304, + "step": 147 + }, + { + "epoch": 0.0592, + "grad_norm": 0.41152287217885664, + "learning_rate": 0.00019955314330087225, + "loss": 0.7074, + "step": 148 + }, + { + "epoch": 0.0596, + "grad_norm": 0.39588672202028524, + "learning_rate": 0.00019954082623972142, + "loss": 0.7444, + "step": 149 + }, + { + "epoch": 0.06, + "grad_norm": 0.423265875933927, + "learning_rate": 0.0001995283421166614, + "loss": 0.7311, + "step": 150 + }, + { + "epoch": 0.0604, + "grad_norm": 0.4117989229058825, + "learning_rate": 0.00019951569095264473, + "loss": 0.7431, + "step": 151 + }, + { + "epoch": 0.0608, + "grad_norm": 0.4097653391835411, + "learning_rate": 0.0001995028727689041, + "loss": 0.7139, + "step": 152 + }, + { + "epoch": 0.0612, + "grad_norm": 0.4080502432875463, + "learning_rate": 0.00019948988758695263, + "loss": 0.7546, + "step": 153 + }, + { + "epoch": 0.0616, + "grad_norm": 0.4289198634795377, + "learning_rate": 0.00019947673542858367, + "loss": 0.7418, + "step": 154 + }, + { + "epoch": 0.062, + "grad_norm": 0.4053067174061218, + "learning_rate": 0.00019946341631587087, + "loss": 0.7417, + "step": 155 + }, + { + "epoch": 0.0624, + "grad_norm": 0.4149451768425185, + "learning_rate": 0.00019944993027116797, + "loss": 0.7538, + "step": 156 + }, + { + "epoch": 0.0628, + "grad_norm": 0.3902632133129172, + "learning_rate": 0.00019943627731710897, + "loss": 0.7075, + "step": 157 + }, + { + "epoch": 0.0632, + "grad_norm": 0.40528543888587293, + "learning_rate": 0.00019942245747660796, + "loss": 0.7732, + "step": 158 + }, + { + "epoch": 0.0636, + "grad_norm": 0.39075970939435, + "learning_rate": 0.00019940847077285916, + "loss": 0.7051, + "step": 159 + }, + { + "epoch": 0.064, + "grad_norm": 0.39548018332760576, + "learning_rate": 0.0001993943172293368, + "loss": 0.7164, + "step": 160 + }, + { + "epoch": 0.0644, + "grad_norm": 0.39318742210695906, + "learning_rate": 0.0001993799968697951, + "loss": 0.6935, + "step": 161 + }, + { + "epoch": 0.0648, + "grad_norm": 0.39855227114585756, + "learning_rate": 0.00019936550971826834, + "loss": 0.7697, + "step": 162 + }, + { + "epoch": 0.0652, + "grad_norm": 0.38947866213700505, + "learning_rate": 0.00019935085579907063, + "loss": 0.7061, + "step": 163 + }, + { + "epoch": 0.0656, + "grad_norm": 0.42041126259328787, + "learning_rate": 0.00019933603513679605, + "loss": 0.7051, + "step": 164 + }, + { + "epoch": 0.066, + "grad_norm": 0.38933284000032525, + "learning_rate": 0.00019932104775631846, + "loss": 0.7598, + "step": 165 + }, + { + "epoch": 0.0664, + "grad_norm": 0.40483887852504585, + "learning_rate": 0.0001993058936827916, + "loss": 0.7265, + "step": 166 + }, + { + "epoch": 0.0668, + "grad_norm": 0.39830833640345015, + "learning_rate": 0.00019929057294164893, + "loss": 0.7151, + "step": 167 + }, + { + "epoch": 0.0672, + "grad_norm": 0.4115135101352071, + "learning_rate": 0.0001992750855586036, + "loss": 0.7502, + "step": 168 + }, + { + "epoch": 0.0676, + "grad_norm": 0.38172900244707286, + "learning_rate": 0.00019925943155964856, + "loss": 0.7247, + "step": 169 + }, + { + "epoch": 0.068, + "grad_norm": 0.379583113248406, + "learning_rate": 0.00019924361097105623, + "loss": 0.7005, + "step": 170 + }, + { + "epoch": 0.0684, + "grad_norm": 0.4500629854402789, + "learning_rate": 0.00019922762381937878, + "loss": 0.791, + "step": 171 + }, + { + "epoch": 0.0688, + "grad_norm": 0.4119696775639509, + "learning_rate": 0.0001992114701314478, + "loss": 0.6956, + "step": 172 + }, + { + "epoch": 0.0692, + "grad_norm": 0.42835121955460526, + "learning_rate": 0.00019919514993437445, + "loss": 0.6814, + "step": 173 + }, + { + "epoch": 0.0696, + "grad_norm": 0.4078948690763593, + "learning_rate": 0.00019917866325554938, + "loss": 0.7347, + "step": 174 + }, + { + "epoch": 0.07, + "grad_norm": 0.4110342906921341, + "learning_rate": 0.00019916201012264254, + "loss": 0.7335, + "step": 175 + }, + { + "epoch": 0.0704, + "grad_norm": 0.41830987692511135, + "learning_rate": 0.0001991451905636033, + "loss": 0.7822, + "step": 176 + }, + { + "epoch": 0.0708, + "grad_norm": 0.39501674747254456, + "learning_rate": 0.00019912820460666044, + "loss": 0.7925, + "step": 177 + }, + { + "epoch": 0.0712, + "grad_norm": 0.42899841467332, + "learning_rate": 0.00019911105228032186, + "loss": 0.7735, + "step": 178 + }, + { + "epoch": 0.0716, + "grad_norm": 0.41271689167378905, + "learning_rate": 0.00019909373361337476, + "loss": 0.725, + "step": 179 + }, + { + "epoch": 0.072, + "grad_norm": 0.3834594071228448, + "learning_rate": 0.0001990762486348855, + "loss": 0.7488, + "step": 180 + }, + { + "epoch": 0.0724, + "grad_norm": 0.40870320889373035, + "learning_rate": 0.00019905859737419956, + "loss": 0.742, + "step": 181 + }, + { + "epoch": 0.0728, + "grad_norm": 0.39509791257797366, + "learning_rate": 0.00019904077986094152, + "loss": 0.7336, + "step": 182 + }, + { + "epoch": 0.0732, + "grad_norm": 0.43168076295692787, + "learning_rate": 0.00019902279612501493, + "loss": 0.7687, + "step": 183 + }, + { + "epoch": 0.0736, + "grad_norm": 0.40758738331074995, + "learning_rate": 0.0001990046461966024, + "loss": 0.7744, + "step": 184 + }, + { + "epoch": 0.074, + "grad_norm": 0.4046682465439113, + "learning_rate": 0.00019898633010616542, + "loss": 0.7161, + "step": 185 + }, + { + "epoch": 0.0744, + "grad_norm": 0.3984141736259908, + "learning_rate": 0.0001989678478844443, + "loss": 0.7172, + "step": 186 + }, + { + "epoch": 0.0748, + "grad_norm": 0.4101332832422778, + "learning_rate": 0.00019894919956245824, + "loss": 0.7378, + "step": 187 + }, + { + "epoch": 0.0752, + "grad_norm": 0.4041894850957024, + "learning_rate": 0.00019893038517150525, + "loss": 0.7698, + "step": 188 + }, + { + "epoch": 0.0756, + "grad_norm": 0.3756720407067433, + "learning_rate": 0.00019891140474316194, + "loss": 0.7126, + "step": 189 + }, + { + "epoch": 0.076, + "grad_norm": 0.393690580210608, + "learning_rate": 0.00019889225830928365, + "loss": 0.7162, + "step": 190 + }, + { + "epoch": 0.0764, + "grad_norm": 0.409859652021738, + "learning_rate": 0.00019887294590200435, + "loss": 0.7246, + "step": 191 + }, + { + "epoch": 0.0768, + "grad_norm": 0.39567494207743276, + "learning_rate": 0.00019885346755373656, + "loss": 0.7754, + "step": 192 + }, + { + "epoch": 0.0772, + "grad_norm": 0.40662939057511427, + "learning_rate": 0.00019883382329717128, + "loss": 0.7528, + "step": 193 + }, + { + "epoch": 0.0776, + "grad_norm": 0.4008250299398194, + "learning_rate": 0.00019881401316527793, + "loss": 0.7124, + "step": 194 + }, + { + "epoch": 0.078, + "grad_norm": 0.4032947178650355, + "learning_rate": 0.0001987940371913044, + "loss": 0.7153, + "step": 195 + }, + { + "epoch": 0.0784, + "grad_norm": 0.3903992167133848, + "learning_rate": 0.00019877389540877687, + "loss": 0.7283, + "step": 196 + }, + { + "epoch": 0.0788, + "grad_norm": 0.41517569778134544, + "learning_rate": 0.0001987535878514998, + "loss": 0.7359, + "step": 197 + }, + { + "epoch": 0.0792, + "grad_norm": 0.41301040465889627, + "learning_rate": 0.0001987331145535559, + "loss": 0.707, + "step": 198 + }, + { + "epoch": 0.0796, + "grad_norm": 0.4135534682003512, + "learning_rate": 0.000198712475549306, + "loss": 0.686, + "step": 199 + }, + { + "epoch": 0.08, + "grad_norm": 0.42377098811209635, + "learning_rate": 0.00019869167087338907, + "loss": 0.7537, + "step": 200 + }, + { + "epoch": 0.0804, + "grad_norm": 0.40895896445777713, + "learning_rate": 0.00019867070056072214, + "loss": 0.7311, + "step": 201 + }, + { + "epoch": 0.0808, + "grad_norm": 0.40915037278205396, + "learning_rate": 0.00019864956464650025, + "loss": 0.7449, + "step": 202 + }, + { + "epoch": 0.0812, + "grad_norm": 0.40445847337509294, + "learning_rate": 0.00019862826316619628, + "loss": 0.7158, + "step": 203 + }, + { + "epoch": 0.0816, + "grad_norm": 0.46618903941937756, + "learning_rate": 0.0001986067961555611, + "loss": 0.718, + "step": 204 + }, + { + "epoch": 0.082, + "grad_norm": 0.4046909719778866, + "learning_rate": 0.00019858516365062334, + "loss": 0.7036, + "step": 205 + }, + { + "epoch": 0.0824, + "grad_norm": 0.41548282817470816, + "learning_rate": 0.00019856336568768935, + "loss": 0.7487, + "step": 206 + }, + { + "epoch": 0.0828, + "grad_norm": 0.4170338736156831, + "learning_rate": 0.00019854140230334322, + "loss": 0.7642, + "step": 207 + }, + { + "epoch": 0.0832, + "grad_norm": 0.3703809957686201, + "learning_rate": 0.0001985192735344467, + "loss": 0.7279, + "step": 208 + }, + { + "epoch": 0.0836, + "grad_norm": 0.4313534925981791, + "learning_rate": 0.00019849697941813898, + "loss": 0.7603, + "step": 209 + }, + { + "epoch": 0.084, + "grad_norm": 0.3967111981874945, + "learning_rate": 0.00019847451999183694, + "loss": 0.6939, + "step": 210 + }, + { + "epoch": 0.0844, + "grad_norm": 0.39530819623251945, + "learning_rate": 0.00019845189529323475, + "loss": 0.754, + "step": 211 + }, + { + "epoch": 0.0848, + "grad_norm": 0.40409630943783886, + "learning_rate": 0.00019842910536030403, + "loss": 0.7026, + "step": 212 + }, + { + "epoch": 0.0852, + "grad_norm": 0.40861421773390033, + "learning_rate": 0.00019840615023129372, + "loss": 0.7411, + "step": 213 + }, + { + "epoch": 0.0856, + "grad_norm": 0.40743970735535945, + "learning_rate": 0.00019838302994472997, + "loss": 0.7416, + "step": 214 + }, + { + "epoch": 0.086, + "grad_norm": 0.4149156974194457, + "learning_rate": 0.0001983597445394162, + "loss": 0.7416, + "step": 215 + }, + { + "epoch": 0.0864, + "grad_norm": 0.40827125586244023, + "learning_rate": 0.00019833629405443284, + "loss": 0.6797, + "step": 216 + }, + { + "epoch": 0.0868, + "grad_norm": 0.3991174587727073, + "learning_rate": 0.0001983126785291375, + "loss": 0.7051, + "step": 217 + }, + { + "epoch": 0.0872, + "grad_norm": 0.3995736742692105, + "learning_rate": 0.00019828889800316466, + "loss": 0.6868, + "step": 218 + }, + { + "epoch": 0.0876, + "grad_norm": 0.4096471682734239, + "learning_rate": 0.00019826495251642578, + "loss": 0.7455, + "step": 219 + }, + { + "epoch": 0.088, + "grad_norm": 0.4143782757620809, + "learning_rate": 0.00019824084210910925, + "loss": 0.7631, + "step": 220 + }, + { + "epoch": 0.0884, + "grad_norm": 0.3849725243870913, + "learning_rate": 0.00019821656682168012, + "loss": 0.7172, + "step": 221 + }, + { + "epoch": 0.0888, + "grad_norm": 0.4145335757065706, + "learning_rate": 0.00019819212669488026, + "loss": 0.7241, + "step": 222 + }, + { + "epoch": 0.0892, + "grad_norm": 0.42576198570162194, + "learning_rate": 0.00019816752176972813, + "loss": 0.7725, + "step": 223 + }, + { + "epoch": 0.0896, + "grad_norm": 0.39758444622346295, + "learning_rate": 0.0001981427520875188, + "loss": 0.7016, + "step": 224 + }, + { + "epoch": 0.09, + "grad_norm": 0.39097805447968603, + "learning_rate": 0.0001981178176898239, + "loss": 0.7032, + "step": 225 + }, + { + "epoch": 0.0904, + "grad_norm": 0.38693500061254904, + "learning_rate": 0.00019809271861849145, + "loss": 0.6888, + "step": 226 + }, + { + "epoch": 0.0908, + "grad_norm": 0.405762505678354, + "learning_rate": 0.00019806745491564586, + "loss": 0.6475, + "step": 227 + }, + { + "epoch": 0.0912, + "grad_norm": 0.4041890904067067, + "learning_rate": 0.0001980420266236878, + "loss": 0.6854, + "step": 228 + }, + { + "epoch": 0.0916, + "grad_norm": 0.4070188903338878, + "learning_rate": 0.0001980164337852943, + "loss": 0.7296, + "step": 229 + }, + { + "epoch": 0.092, + "grad_norm": 0.3943738247800487, + "learning_rate": 0.00019799067644341844, + "loss": 0.6877, + "step": 230 + }, + { + "epoch": 0.0924, + "grad_norm": 0.3879611737937716, + "learning_rate": 0.00019796475464128942, + "loss": 0.7096, + "step": 231 + }, + { + "epoch": 0.0928, + "grad_norm": 0.41252676247575876, + "learning_rate": 0.00019793866842241243, + "loss": 0.7955, + "step": 232 + }, + { + "epoch": 0.0932, + "grad_norm": 0.3731003151411376, + "learning_rate": 0.00019791241783056874, + "loss": 0.6998, + "step": 233 + }, + { + "epoch": 0.0936, + "grad_norm": 0.41307635587280533, + "learning_rate": 0.00019788600290981525, + "loss": 0.7288, + "step": 234 + }, + { + "epoch": 0.094, + "grad_norm": 0.39841410187280835, + "learning_rate": 0.0001978594237044849, + "loss": 0.731, + "step": 235 + }, + { + "epoch": 0.0944, + "grad_norm": 0.39306819371338675, + "learning_rate": 0.0001978326802591862, + "loss": 0.7462, + "step": 236 + }, + { + "epoch": 0.0948, + "grad_norm": 0.4173972279803122, + "learning_rate": 0.00019780577261880336, + "loss": 0.7252, + "step": 237 + }, + { + "epoch": 0.0952, + "grad_norm": 0.3734715331431929, + "learning_rate": 0.0001977787008284962, + "loss": 0.7047, + "step": 238 + }, + { + "epoch": 0.0956, + "grad_norm": 0.40155801467397784, + "learning_rate": 0.00019775146493369994, + "loss": 0.7478, + "step": 239 + }, + { + "epoch": 0.096, + "grad_norm": 0.4262333677432803, + "learning_rate": 0.0001977240649801253, + "loss": 0.7816, + "step": 240 + }, + { + "epoch": 0.0964, + "grad_norm": 0.391277356302728, + "learning_rate": 0.00019769650101375837, + "loss": 0.6988, + "step": 241 + }, + { + "epoch": 0.0968, + "grad_norm": 0.40394930428709697, + "learning_rate": 0.00019766877308086036, + "loss": 0.7343, + "step": 242 + }, + { + "epoch": 0.0972, + "grad_norm": 0.4730053935130541, + "learning_rate": 0.00019764088122796783, + "loss": 0.7479, + "step": 243 + }, + { + "epoch": 0.0976, + "grad_norm": 0.38553115270003163, + "learning_rate": 0.0001976128255018924, + "loss": 0.7224, + "step": 244 + }, + { + "epoch": 0.098, + "grad_norm": 0.3942639089554025, + "learning_rate": 0.00019758460594972068, + "loss": 0.6645, + "step": 245 + }, + { + "epoch": 0.0984, + "grad_norm": 0.4086576183632301, + "learning_rate": 0.00019755622261881427, + "loss": 0.7888, + "step": 246 + }, + { + "epoch": 0.0988, + "grad_norm": 0.402016469304673, + "learning_rate": 0.00019752767555680968, + "loss": 0.7368, + "step": 247 + }, + { + "epoch": 0.0992, + "grad_norm": 0.3881004687395709, + "learning_rate": 0.00019749896481161808, + "loss": 0.7491, + "step": 248 + }, + { + "epoch": 0.0996, + "grad_norm": 0.4019891978415487, + "learning_rate": 0.00019747009043142555, + "loss": 0.7185, + "step": 249 + }, + { + "epoch": 0.1, + "grad_norm": 0.378323402032098, + "learning_rate": 0.00019744105246469263, + "loss": 0.7158, + "step": 250 + }, + { + "epoch": 0.1004, + "grad_norm": 0.3906032109598485, + "learning_rate": 0.00019741185096015448, + "loss": 0.731, + "step": 251 + }, + { + "epoch": 0.1008, + "grad_norm": 0.38837749622029527, + "learning_rate": 0.00019738248596682078, + "loss": 0.6796, + "step": 252 + }, + { + "epoch": 0.1012, + "grad_norm": 0.37230188694518274, + "learning_rate": 0.0001973529575339755, + "loss": 0.6871, + "step": 253 + }, + { + "epoch": 0.1016, + "grad_norm": 0.41085346510849274, + "learning_rate": 0.00019732326571117703, + "loss": 0.7151, + "step": 254 + }, + { + "epoch": 0.102, + "grad_norm": 0.39845782119378576, + "learning_rate": 0.00019729341054825782, + "loss": 0.7581, + "step": 255 + }, + { + "epoch": 0.1024, + "grad_norm": 0.40171006946231647, + "learning_rate": 0.00019726339209532462, + "loss": 0.7216, + "step": 256 + }, + { + "epoch": 0.1028, + "grad_norm": 0.41249135755171695, + "learning_rate": 0.00019723321040275815, + "loss": 0.7617, + "step": 257 + }, + { + "epoch": 0.1032, + "grad_norm": 0.38194977115100603, + "learning_rate": 0.0001972028655212131, + "loss": 0.7661, + "step": 258 + }, + { + "epoch": 0.1036, + "grad_norm": 0.3927113415894503, + "learning_rate": 0.00019717235750161806, + "loss": 0.6845, + "step": 259 + }, + { + "epoch": 0.104, + "grad_norm": 0.4184396270944774, + "learning_rate": 0.00019714168639517544, + "loss": 0.6976, + "step": 260 + }, + { + "epoch": 0.1044, + "grad_norm": 0.3929516613812163, + "learning_rate": 0.00019711085225336132, + "loss": 0.6956, + "step": 261 + }, + { + "epoch": 0.1048, + "grad_norm": 0.3910654862881255, + "learning_rate": 0.00019707985512792543, + "loss": 0.6977, + "step": 262 + }, + { + "epoch": 0.1052, + "grad_norm": 0.3876378264776691, + "learning_rate": 0.00019704869507089105, + "loss": 0.7256, + "step": 263 + }, + { + "epoch": 0.1056, + "grad_norm": 0.3697637416688594, + "learning_rate": 0.0001970173721345549, + "loss": 0.6674, + "step": 264 + }, + { + "epoch": 0.106, + "grad_norm": 0.4147680271429278, + "learning_rate": 0.00019698588637148703, + "loss": 0.7912, + "step": 265 + }, + { + "epoch": 0.1064, + "grad_norm": 0.40245644770019046, + "learning_rate": 0.00019695423783453088, + "loss": 0.7434, + "step": 266 + }, + { + "epoch": 0.1068, + "grad_norm": 0.39585106634137457, + "learning_rate": 0.00019692242657680286, + "loss": 0.696, + "step": 267 + }, + { + "epoch": 0.1072, + "grad_norm": 0.38206639064738385, + "learning_rate": 0.00019689045265169273, + "loss": 0.6787, + "step": 268 + }, + { + "epoch": 0.1076, + "grad_norm": 0.38659658884436554, + "learning_rate": 0.0001968583161128631, + "loss": 0.7411, + "step": 269 + }, + { + "epoch": 0.108, + "grad_norm": 0.37684016450723856, + "learning_rate": 0.0001968260170142496, + "loss": 0.7033, + "step": 270 + }, + { + "epoch": 0.1084, + "grad_norm": 0.3929047202882145, + "learning_rate": 0.00019679355541006054, + "loss": 0.6944, + "step": 271 + }, + { + "epoch": 0.1088, + "grad_norm": 0.38317417460333625, + "learning_rate": 0.00019676093135477713, + "loss": 0.735, + "step": 272 + }, + { + "epoch": 0.1092, + "grad_norm": 0.3837467243669443, + "learning_rate": 0.0001967281449031531, + "loss": 0.6802, + "step": 273 + }, + { + "epoch": 0.1096, + "grad_norm": 0.3929277164168596, + "learning_rate": 0.00019669519611021486, + "loss": 0.7336, + "step": 274 + }, + { + "epoch": 0.11, + "grad_norm": 0.39661746176816876, + "learning_rate": 0.00019666208503126112, + "loss": 0.7604, + "step": 275 + }, + { + "epoch": 0.1104, + "grad_norm": 0.39575484759133867, + "learning_rate": 0.00019662881172186313, + "loss": 0.7149, + "step": 276 + }, + { + "epoch": 0.1108, + "grad_norm": 0.380670072192301, + "learning_rate": 0.00019659537623786428, + "loss": 0.7044, + "step": 277 + }, + { + "epoch": 0.1112, + "grad_norm": 0.3664185447465465, + "learning_rate": 0.00019656177863538026, + "loss": 0.6484, + "step": 278 + }, + { + "epoch": 0.1116, + "grad_norm": 0.39537126352935253, + "learning_rate": 0.00019652801897079869, + "loss": 0.705, + "step": 279 + }, + { + "epoch": 0.112, + "grad_norm": 0.4000097402559313, + "learning_rate": 0.00019649409730077935, + "loss": 0.6854, + "step": 280 + }, + { + "epoch": 0.1124, + "grad_norm": 0.3926356257874463, + "learning_rate": 0.00019646001368225382, + "loss": 0.7619, + "step": 281 + }, + { + "epoch": 0.1128, + "grad_norm": 0.3904258346252303, + "learning_rate": 0.0001964257681724255, + "loss": 0.6844, + "step": 282 + }, + { + "epoch": 0.1132, + "grad_norm": 0.3868867524178238, + "learning_rate": 0.00019639136082876953, + "loss": 0.6828, + "step": 283 + }, + { + "epoch": 0.1136, + "grad_norm": 0.37141624945766505, + "learning_rate": 0.00019635679170903258, + "loss": 0.7094, + "step": 284 + }, + { + "epoch": 0.114, + "grad_norm": 0.36599494714533054, + "learning_rate": 0.00019632206087123296, + "loss": 0.6958, + "step": 285 + }, + { + "epoch": 0.1144, + "grad_norm": 0.4065700930147299, + "learning_rate": 0.00019628716837366027, + "loss": 0.7096, + "step": 286 + }, + { + "epoch": 0.1148, + "grad_norm": 0.40651433946028726, + "learning_rate": 0.00019625211427487548, + "loss": 0.6958, + "step": 287 + }, + { + "epoch": 0.1152, + "grad_norm": 0.4232769181127487, + "learning_rate": 0.00019621689863371083, + "loss": 0.7201, + "step": 288 + }, + { + "epoch": 0.1156, + "grad_norm": 0.4127296931497552, + "learning_rate": 0.00019618152150926955, + "loss": 0.6296, + "step": 289 + }, + { + "epoch": 0.116, + "grad_norm": 0.3882774288253915, + "learning_rate": 0.000196145982960926, + "loss": 0.7274, + "step": 290 + }, + { + "epoch": 0.1164, + "grad_norm": 0.4409139953301101, + "learning_rate": 0.00019611028304832546, + "loss": 0.812, + "step": 291 + }, + { + "epoch": 0.1168, + "grad_norm": 0.3878309341285608, + "learning_rate": 0.000196074421831384, + "loss": 0.6975, + "step": 292 + }, + { + "epoch": 0.1172, + "grad_norm": 0.386519880007628, + "learning_rate": 0.00019603839937028838, + "loss": 0.6707, + "step": 293 + }, + { + "epoch": 0.1176, + "grad_norm": 0.4067072595460183, + "learning_rate": 0.00019600221572549606, + "loss": 0.7579, + "step": 294 + }, + { + "epoch": 0.118, + "grad_norm": 0.38200579454885775, + "learning_rate": 0.00019596587095773495, + "loss": 0.7511, + "step": 295 + }, + { + "epoch": 0.1184, + "grad_norm": 0.39000018516209073, + "learning_rate": 0.00019592936512800342, + "loss": 0.7013, + "step": 296 + }, + { + "epoch": 0.1188, + "grad_norm": 0.3855048239013933, + "learning_rate": 0.00019589269829757008, + "loss": 0.6928, + "step": 297 + }, + { + "epoch": 0.1192, + "grad_norm": 0.4144883803849691, + "learning_rate": 0.00019585587052797389, + "loss": 0.6998, + "step": 298 + }, + { + "epoch": 0.1196, + "grad_norm": 0.4187340811173014, + "learning_rate": 0.00019581888188102375, + "loss": 0.7296, + "step": 299 + }, + { + "epoch": 0.12, + "grad_norm": 0.43580089453193305, + "learning_rate": 0.00019578173241879872, + "loss": 0.773, + "step": 300 + }, + { + "epoch": 0.1204, + "grad_norm": 0.38469047116127925, + "learning_rate": 0.00019574442220364767, + "loss": 0.7402, + "step": 301 + }, + { + "epoch": 0.1208, + "grad_norm": 0.4056538975856685, + "learning_rate": 0.00019570695129818926, + "loss": 0.6336, + "step": 302 + }, + { + "epoch": 0.1212, + "grad_norm": 0.3803875541548997, + "learning_rate": 0.0001956693197653119, + "loss": 0.7296, + "step": 303 + }, + { + "epoch": 0.1216, + "grad_norm": 0.39397850043949445, + "learning_rate": 0.00019563152766817354, + "loss": 0.6881, + "step": 304 + }, + { + "epoch": 0.122, + "grad_norm": 0.3934750562850638, + "learning_rate": 0.00019559357507020162, + "loss": 0.7417, + "step": 305 + }, + { + "epoch": 0.1224, + "grad_norm": 0.35516981921786084, + "learning_rate": 0.00019555546203509297, + "loss": 0.6466, + "step": 306 + }, + { + "epoch": 0.1228, + "grad_norm": 0.3871342400238589, + "learning_rate": 0.00019551718862681364, + "loss": 0.699, + "step": 307 + }, + { + "epoch": 0.1232, + "grad_norm": 0.37122507606125155, + "learning_rate": 0.00019547875490959885, + "loss": 0.7068, + "step": 308 + }, + { + "epoch": 0.1236, + "grad_norm": 0.5062668465863249, + "learning_rate": 0.00019544016094795295, + "loss": 0.6881, + "step": 309 + }, + { + "epoch": 0.124, + "grad_norm": 0.4088913754479982, + "learning_rate": 0.00019540140680664913, + "loss": 0.7788, + "step": 310 + }, + { + "epoch": 0.1244, + "grad_norm": 0.4080616458426694, + "learning_rate": 0.00019536249255072948, + "loss": 0.7358, + "step": 311 + }, + { + "epoch": 0.1248, + "grad_norm": 0.40713022875650223, + "learning_rate": 0.00019532341824550479, + "loss": 0.6608, + "step": 312 + }, + { + "epoch": 0.1252, + "grad_norm": 0.39832416169944584, + "learning_rate": 0.0001952841839565544, + "loss": 0.7263, + "step": 313 + }, + { + "epoch": 0.1256, + "grad_norm": 0.3832264350159671, + "learning_rate": 0.0001952447897497263, + "loss": 0.669, + "step": 314 + }, + { + "epoch": 0.126, + "grad_norm": 0.3875302932764967, + "learning_rate": 0.00019520523569113677, + "loss": 0.7009, + "step": 315 + }, + { + "epoch": 0.1264, + "grad_norm": 0.40318507984409185, + "learning_rate": 0.00019516552184717037, + "loss": 0.7279, + "step": 316 + }, + { + "epoch": 0.1268, + "grad_norm": 0.37726750077569626, + "learning_rate": 0.00019512564828447988, + "loss": 0.6794, + "step": 317 + }, + { + "epoch": 0.1272, + "grad_norm": 0.40181893918441336, + "learning_rate": 0.0001950856150699861, + "loss": 0.7269, + "step": 318 + }, + { + "epoch": 0.1276, + "grad_norm": 0.4019533837773805, + "learning_rate": 0.0001950454222708778, + "loss": 0.651, + "step": 319 + }, + { + "epoch": 0.128, + "grad_norm": 0.3797291101627864, + "learning_rate": 0.0001950050699546116, + "loss": 0.7245, + "step": 320 + }, + { + "epoch": 0.1284, + "grad_norm": 0.4041662776983708, + "learning_rate": 0.0001949645581889118, + "loss": 0.7394, + "step": 321 + }, + { + "epoch": 0.1288, + "grad_norm": 0.3907511069014927, + "learning_rate": 0.00019492388704177036, + "loss": 0.7226, + "step": 322 + }, + { + "epoch": 0.1292, + "grad_norm": 0.367624741020837, + "learning_rate": 0.00019488305658144667, + "loss": 0.7047, + "step": 323 + }, + { + "epoch": 0.1296, + "grad_norm": 0.3939626503174779, + "learning_rate": 0.00019484206687646753, + "loss": 0.7064, + "step": 324 + }, + { + "epoch": 0.13, + "grad_norm": 0.4157025875157999, + "learning_rate": 0.00019480091799562704, + "loss": 0.7476, + "step": 325 + }, + { + "epoch": 0.1304, + "grad_norm": 0.3805353828876863, + "learning_rate": 0.00019475961000798645, + "loss": 0.6952, + "step": 326 + }, + { + "epoch": 0.1308, + "grad_norm": 0.4169465960559036, + "learning_rate": 0.0001947181429828739, + "loss": 0.7243, + "step": 327 + }, + { + "epoch": 0.1312, + "grad_norm": 0.3604721681368048, + "learning_rate": 0.00019467651698988462, + "loss": 0.6753, + "step": 328 + }, + { + "epoch": 0.1316, + "grad_norm": 0.36477468252574086, + "learning_rate": 0.0001946347320988806, + "loss": 0.6663, + "step": 329 + }, + { + "epoch": 0.132, + "grad_norm": 0.4097890346629651, + "learning_rate": 0.00019459278837999046, + "loss": 0.7221, + "step": 330 + }, + { + "epoch": 0.1324, + "grad_norm": 0.39758712851129263, + "learning_rate": 0.00019455068590360942, + "loss": 0.7078, + "step": 331 + }, + { + "epoch": 0.1328, + "grad_norm": 0.3846785536341143, + "learning_rate": 0.00019450842474039913, + "loss": 0.7174, + "step": 332 + }, + { + "epoch": 0.1332, + "grad_norm": 0.38510034785734193, + "learning_rate": 0.00019446600496128758, + "loss": 0.6909, + "step": 333 + }, + { + "epoch": 0.1336, + "grad_norm": 0.40677707004711616, + "learning_rate": 0.00019442342663746902, + "loss": 0.7074, + "step": 334 + }, + { + "epoch": 0.134, + "grad_norm": 0.42544237574759275, + "learning_rate": 0.00019438068984040365, + "loss": 0.6776, + "step": 335 + }, + { + "epoch": 0.1344, + "grad_norm": 0.4053693843260777, + "learning_rate": 0.00019433779464181778, + "loss": 0.6747, + "step": 336 + }, + { + "epoch": 0.1348, + "grad_norm": 0.3913371200478165, + "learning_rate": 0.00019429474111370352, + "loss": 0.6924, + "step": 337 + }, + { + "epoch": 0.1352, + "grad_norm": 0.4149006096866092, + "learning_rate": 0.0001942515293283187, + "loss": 0.6904, + "step": 338 + }, + { + "epoch": 0.1356, + "grad_norm": 0.38030074742371944, + "learning_rate": 0.00019420815935818672, + "loss": 0.6468, + "step": 339 + }, + { + "epoch": 0.136, + "grad_norm": 0.4109580631805435, + "learning_rate": 0.00019416463127609656, + "loss": 0.6809, + "step": 340 + }, + { + "epoch": 0.1364, + "grad_norm": 0.3874486823898277, + "learning_rate": 0.00019412094515510248, + "loss": 0.7066, + "step": 341 + }, + { + "epoch": 0.1368, + "grad_norm": 0.400169839799612, + "learning_rate": 0.00019407710106852404, + "loss": 0.6735, + "step": 342 + }, + { + "epoch": 0.1372, + "grad_norm": 0.4413037177842987, + "learning_rate": 0.00019403309908994586, + "loss": 0.6974, + "step": 343 + }, + { + "epoch": 0.1376, + "grad_norm": 0.3742979700025569, + "learning_rate": 0.00019398893929321761, + "loss": 0.6784, + "step": 344 + }, + { + "epoch": 0.138, + "grad_norm": 0.42359247868849575, + "learning_rate": 0.00019394462175245381, + "loss": 0.7778, + "step": 345 + }, + { + "epoch": 0.1384, + "grad_norm": 0.4234055597178886, + "learning_rate": 0.00019390014654203369, + "loss": 0.7195, + "step": 346 + }, + { + "epoch": 0.1388, + "grad_norm": 0.4005993402368827, + "learning_rate": 0.0001938555137366011, + "loss": 0.7165, + "step": 347 + }, + { + "epoch": 0.1392, + "grad_norm": 0.3816229344928684, + "learning_rate": 0.00019381072341106452, + "loss": 0.7154, + "step": 348 + }, + { + "epoch": 0.1396, + "grad_norm": 0.40684887980158285, + "learning_rate": 0.0001937657756405966, + "loss": 0.7035, + "step": 349 + }, + { + "epoch": 0.14, + "grad_norm": 0.4008381489661814, + "learning_rate": 0.00019372067050063438, + "loss": 0.7016, + "step": 350 + }, + { + "epoch": 0.1404, + "grad_norm": 0.4033847431397458, + "learning_rate": 0.00019367540806687893, + "loss": 0.7286, + "step": 351 + }, + { + "epoch": 0.1408, + "grad_norm": 0.40743931660695076, + "learning_rate": 0.0001936299884152954, + "loss": 0.7008, + "step": 352 + }, + { + "epoch": 0.1412, + "grad_norm": 0.3762833708731107, + "learning_rate": 0.0001935844116221127, + "loss": 0.6785, + "step": 353 + }, + { + "epoch": 0.1416, + "grad_norm": 0.39511133029768386, + "learning_rate": 0.00019353867776382354, + "loss": 0.7313, + "step": 354 + }, + { + "epoch": 0.142, + "grad_norm": 0.41516019571816626, + "learning_rate": 0.00019349278691718427, + "loss": 0.7304, + "step": 355 + }, + { + "epoch": 0.1424, + "grad_norm": 0.4090372519670217, + "learning_rate": 0.0001934467391592146, + "loss": 0.7437, + "step": 356 + }, + { + "epoch": 0.1428, + "grad_norm": 0.367684000824685, + "learning_rate": 0.00019340053456719768, + "loss": 0.7387, + "step": 357 + }, + { + "epoch": 0.1432, + "grad_norm": 0.3634548661346829, + "learning_rate": 0.00019335417321867987, + "loss": 0.6985, + "step": 358 + }, + { + "epoch": 0.1436, + "grad_norm": 0.38248846649804163, + "learning_rate": 0.0001933076551914706, + "loss": 0.6946, + "step": 359 + }, + { + "epoch": 0.144, + "grad_norm": 0.3903837776823106, + "learning_rate": 0.00019326098056364222, + "loss": 0.738, + "step": 360 + }, + { + "epoch": 0.1444, + "grad_norm": 0.37316510939032765, + "learning_rate": 0.00019321414941353003, + "loss": 0.7023, + "step": 361 + }, + { + "epoch": 0.1448, + "grad_norm": 0.3843035531540929, + "learning_rate": 0.00019316716181973188, + "loss": 0.6897, + "step": 362 + }, + { + "epoch": 0.1452, + "grad_norm": 0.3960956773681474, + "learning_rate": 0.00019312001786110828, + "loss": 0.7433, + "step": 363 + }, + { + "epoch": 0.1456, + "grad_norm": 0.4101148329592276, + "learning_rate": 0.00019307271761678213, + "loss": 0.68, + "step": 364 + }, + { + "epoch": 0.146, + "grad_norm": 0.4035799883510049, + "learning_rate": 0.00019302526116613864, + "loss": 0.7756, + "step": 365 + }, + { + "epoch": 0.1464, + "grad_norm": 0.34335587702974607, + "learning_rate": 0.00019297764858882514, + "loss": 0.6584, + "step": 366 + }, + { + "epoch": 0.1468, + "grad_norm": 0.41575729551259694, + "learning_rate": 0.00019292987996475113, + "loss": 0.7255, + "step": 367 + }, + { + "epoch": 0.1472, + "grad_norm": 0.3987444219453461, + "learning_rate": 0.0001928819553740878, + "loss": 0.7267, + "step": 368 + }, + { + "epoch": 0.1476, + "grad_norm": 0.36645586098492455, + "learning_rate": 0.00019283387489726827, + "loss": 0.6392, + "step": 369 + }, + { + "epoch": 0.148, + "grad_norm": 0.3901779305178469, + "learning_rate": 0.00019278563861498723, + "loss": 0.7727, + "step": 370 + }, + { + "epoch": 0.1484, + "grad_norm": 0.3887595088054966, + "learning_rate": 0.00019273724660820088, + "loss": 0.6535, + "step": 371 + }, + { + "epoch": 0.1488, + "grad_norm": 0.3846140376662957, + "learning_rate": 0.00019268869895812672, + "loss": 0.7625, + "step": 372 + }, + { + "epoch": 0.1492, + "grad_norm": 0.4217259319075013, + "learning_rate": 0.00019263999574624355, + "loss": 0.7694, + "step": 373 + }, + { + "epoch": 0.1496, + "grad_norm": 0.38627934512584944, + "learning_rate": 0.0001925911370542912, + "loss": 0.7557, + "step": 374 + }, + { + "epoch": 0.15, + "grad_norm": 0.39726895858256805, + "learning_rate": 0.00019254212296427044, + "loss": 0.696, + "step": 375 + }, + { + "epoch": 0.1504, + "grad_norm": 0.39509375724648294, + "learning_rate": 0.00019249295355844285, + "loss": 0.6939, + "step": 376 + }, + { + "epoch": 0.1508, + "grad_norm": 0.39093740307759517, + "learning_rate": 0.00019244362891933077, + "loss": 0.7079, + "step": 377 + }, + { + "epoch": 0.1512, + "grad_norm": 0.3752349281466314, + "learning_rate": 0.00019239414912971696, + "loss": 0.6919, + "step": 378 + }, + { + "epoch": 0.1516, + "grad_norm": 0.3680847322213261, + "learning_rate": 0.0001923445142726446, + "loss": 0.7116, + "step": 379 + }, + { + "epoch": 0.152, + "grad_norm": 0.37204500135886637, + "learning_rate": 0.0001922947244314172, + "loss": 0.7019, + "step": 380 + }, + { + "epoch": 0.1524, + "grad_norm": 0.40124819438119996, + "learning_rate": 0.0001922447796895982, + "loss": 0.7217, + "step": 381 + }, + { + "epoch": 0.1528, + "grad_norm": 0.3865757435112636, + "learning_rate": 0.00019219468013101124, + "loss": 0.7401, + "step": 382 + }, + { + "epoch": 0.1532, + "grad_norm": 0.37950392105185105, + "learning_rate": 0.00019214442583973966, + "loss": 0.72, + "step": 383 + }, + { + "epoch": 0.1536, + "grad_norm": 0.36805410693362434, + "learning_rate": 0.00019209401690012653, + "loss": 0.7653, + "step": 384 + }, + { + "epoch": 0.154, + "grad_norm": 0.3574885525687318, + "learning_rate": 0.00019204345339677442, + "loss": 0.7135, + "step": 385 + }, + { + "epoch": 0.1544, + "grad_norm": 0.3628661116738235, + "learning_rate": 0.00019199273541454538, + "loss": 0.6446, + "step": 386 + }, + { + "epoch": 0.1548, + "grad_norm": 0.37492897305274625, + "learning_rate": 0.00019194186303856067, + "loss": 0.698, + "step": 387 + }, + { + "epoch": 0.1552, + "grad_norm": 0.38152196770466545, + "learning_rate": 0.00019189083635420075, + "loss": 0.693, + "step": 388 + }, + { + "epoch": 0.1556, + "grad_norm": 0.38450336508215077, + "learning_rate": 0.00019183965544710495, + "loss": 0.6905, + "step": 389 + }, + { + "epoch": 0.156, + "grad_norm": 0.3712084726550805, + "learning_rate": 0.00019178832040317155, + "loss": 0.6671, + "step": 390 + }, + { + "epoch": 0.1564, + "grad_norm": 0.3616802203123445, + "learning_rate": 0.0001917368313085574, + "loss": 0.6671, + "step": 391 + }, + { + "epoch": 0.1568, + "grad_norm": 0.3721551241618506, + "learning_rate": 0.00019168518824967795, + "loss": 0.7082, + "step": 392 + }, + { + "epoch": 0.1572, + "grad_norm": 0.3738096531629254, + "learning_rate": 0.00019163339131320718, + "loss": 0.6859, + "step": 393 + }, + { + "epoch": 0.1576, + "grad_norm": 0.3747029831223078, + "learning_rate": 0.00019158144058607708, + "loss": 0.6783, + "step": 394 + }, + { + "epoch": 0.158, + "grad_norm": 0.3643373612674067, + "learning_rate": 0.00019152933615547798, + "loss": 0.6779, + "step": 395 + }, + { + "epoch": 0.1584, + "grad_norm": 0.4025633496093721, + "learning_rate": 0.000191477078108858, + "loss": 0.6875, + "step": 396 + }, + { + "epoch": 0.1588, + "grad_norm": 0.3890515795999232, + "learning_rate": 0.00019142466653392318, + "loss": 0.7375, + "step": 397 + }, + { + "epoch": 0.1592, + "grad_norm": 0.41196475648797765, + "learning_rate": 0.0001913721015186372, + "loss": 0.8219, + "step": 398 + }, + { + "epoch": 0.1596, + "grad_norm": 0.36303264191560275, + "learning_rate": 0.0001913193831512213, + "loss": 0.6539, + "step": 399 + }, + { + "epoch": 0.16, + "grad_norm": 0.3733774345130003, + "learning_rate": 0.00019126651152015403, + "loss": 0.7258, + "step": 400 + }, + { + "epoch": 0.1604, + "grad_norm": 0.3600148652565467, + "learning_rate": 0.0001912134867141712, + "loss": 0.6763, + "step": 401 + }, + { + "epoch": 0.1608, + "grad_norm": 0.4130332089907557, + "learning_rate": 0.0001911603088222657, + "loss": 0.6829, + "step": 402 + }, + { + "epoch": 0.1612, + "grad_norm": 0.37915621184314446, + "learning_rate": 0.0001911069779336873, + "loss": 0.7099, + "step": 403 + }, + { + "epoch": 0.1616, + "grad_norm": 0.3982697677981037, + "learning_rate": 0.00019105349413794272, + "loss": 0.7706, + "step": 404 + }, + { + "epoch": 0.162, + "grad_norm": 0.38458590119885516, + "learning_rate": 0.00019099985752479506, + "loss": 0.7495, + "step": 405 + }, + { + "epoch": 0.1624, + "grad_norm": 0.40139329153688263, + "learning_rate": 0.00019094606818426403, + "loss": 0.7122, + "step": 406 + }, + { + "epoch": 0.1628, + "grad_norm": 0.38138079047942747, + "learning_rate": 0.00019089212620662568, + "loss": 0.7237, + "step": 407 + }, + { + "epoch": 0.1632, + "grad_norm": 0.367840917838084, + "learning_rate": 0.00019083803168241223, + "loss": 0.7042, + "step": 408 + }, + { + "epoch": 0.1636, + "grad_norm": 0.3719462302258643, + "learning_rate": 0.00019078378470241183, + "loss": 0.7003, + "step": 409 + }, + { + "epoch": 0.164, + "grad_norm": 0.3871663286058704, + "learning_rate": 0.00019072938535766865, + "loss": 0.699, + "step": 410 + }, + { + "epoch": 0.1644, + "grad_norm": 0.37122376588124145, + "learning_rate": 0.00019067483373948243, + "loss": 0.6396, + "step": 411 + }, + { + "epoch": 0.1648, + "grad_norm": 0.36458011191898626, + "learning_rate": 0.00019062012993940859, + "loss": 0.7197, + "step": 412 + }, + { + "epoch": 0.1652, + "grad_norm": 0.39366153882601723, + "learning_rate": 0.00019056527404925789, + "loss": 0.7104, + "step": 413 + }, + { + "epoch": 0.1656, + "grad_norm": 0.3772750003433443, + "learning_rate": 0.00019051026616109638, + "loss": 0.7374, + "step": 414 + }, + { + "epoch": 0.166, + "grad_norm": 0.3717203779567961, + "learning_rate": 0.0001904551063672452, + "loss": 0.7414, + "step": 415 + }, + { + "epoch": 0.1664, + "grad_norm": 0.366581005775702, + "learning_rate": 0.00019039979476028043, + "loss": 0.688, + "step": 416 + }, + { + "epoch": 0.1668, + "grad_norm": 0.39793014724181996, + "learning_rate": 0.000190344331433033, + "loss": 0.6706, + "step": 417 + }, + { + "epoch": 0.1672, + "grad_norm": 0.3909005122636842, + "learning_rate": 0.00019028871647858834, + "loss": 0.7149, + "step": 418 + }, + { + "epoch": 0.1676, + "grad_norm": 0.3827833190394904, + "learning_rate": 0.00019023294999028653, + "loss": 0.7067, + "step": 419 + }, + { + "epoch": 0.168, + "grad_norm": 0.3707544804332314, + "learning_rate": 0.00019017703206172185, + "loss": 0.7368, + "step": 420 + }, + { + "epoch": 0.1684, + "grad_norm": 0.387842364505732, + "learning_rate": 0.0001901209627867428, + "loss": 0.6957, + "step": 421 + }, + { + "epoch": 0.1688, + "grad_norm": 0.3586581412742944, + "learning_rate": 0.0001900647422594519, + "loss": 0.7095, + "step": 422 + }, + { + "epoch": 0.1692, + "grad_norm": 0.3776421164866786, + "learning_rate": 0.0001900083705742054, + "loss": 0.7314, + "step": 423 + }, + { + "epoch": 0.1696, + "grad_norm": 0.3518309566121195, + "learning_rate": 0.00018995184782561345, + "loss": 0.662, + "step": 424 + }, + { + "epoch": 0.17, + "grad_norm": 0.35175767335896485, + "learning_rate": 0.00018989517410853955, + "loss": 0.6484, + "step": 425 + }, + { + "epoch": 0.1704, + "grad_norm": 0.37014079477382306, + "learning_rate": 0.0001898383495181007, + "loss": 0.6885, + "step": 426 + }, + { + "epoch": 0.1708, + "grad_norm": 0.3999729258228388, + "learning_rate": 0.00018978137414966698, + "loss": 0.6745, + "step": 427 + }, + { + "epoch": 0.1712, + "grad_norm": 0.391206313530272, + "learning_rate": 0.0001897242480988617, + "loss": 0.6987, + "step": 428 + }, + { + "epoch": 0.1716, + "grad_norm": 0.3760066653882305, + "learning_rate": 0.00018966697146156092, + "loss": 0.6594, + "step": 429 + }, + { + "epoch": 0.172, + "grad_norm": 0.41634437705839444, + "learning_rate": 0.00018960954433389345, + "loss": 0.7287, + "step": 430 + }, + { + "epoch": 0.1724, + "grad_norm": 0.38316647013622684, + "learning_rate": 0.0001895519668122408, + "loss": 0.716, + "step": 431 + }, + { + "epoch": 0.1728, + "grad_norm": 0.3859147191673041, + "learning_rate": 0.0001894942389932367, + "loss": 0.688, + "step": 432 + }, + { + "epoch": 0.1732, + "grad_norm": 0.38536780897260897, + "learning_rate": 0.00018943636097376726, + "loss": 0.6973, + "step": 433 + }, + { + "epoch": 0.1736, + "grad_norm": 0.38022975855348395, + "learning_rate": 0.00018937833285097066, + "loss": 0.6921, + "step": 434 + }, + { + "epoch": 0.174, + "grad_norm": 0.3727868538535105, + "learning_rate": 0.00018932015472223693, + "loss": 0.6858, + "step": 435 + }, + { + "epoch": 0.1744, + "grad_norm": 0.39691838283866077, + "learning_rate": 0.00018926182668520792, + "loss": 0.7093, + "step": 436 + }, + { + "epoch": 0.1748, + "grad_norm": 0.386131787817755, + "learning_rate": 0.0001892033488377771, + "loss": 0.7128, + "step": 437 + }, + { + "epoch": 0.1752, + "grad_norm": 0.3866928717931557, + "learning_rate": 0.0001891447212780893, + "loss": 0.6833, + "step": 438 + }, + { + "epoch": 0.1756, + "grad_norm": 0.37440549427127356, + "learning_rate": 0.0001890859441045407, + "loss": 0.7292, + "step": 439 + }, + { + "epoch": 0.176, + "grad_norm": 0.3966367147518759, + "learning_rate": 0.0001890270174157784, + "loss": 0.7243, + "step": 440 + }, + { + "epoch": 0.1764, + "grad_norm": 0.3725940293890487, + "learning_rate": 0.00018896794131070073, + "loss": 0.6948, + "step": 441 + }, + { + "epoch": 0.1768, + "grad_norm": 0.38919233794834746, + "learning_rate": 0.0001889087158884565, + "loss": 0.7018, + "step": 442 + }, + { + "epoch": 0.1772, + "grad_norm": 0.37745830074894476, + "learning_rate": 0.00018884934124844532, + "loss": 0.7484, + "step": 443 + }, + { + "epoch": 0.1776, + "grad_norm": 0.3854960456283892, + "learning_rate": 0.00018878981749031716, + "loss": 0.676, + "step": 444 + }, + { + "epoch": 0.178, + "grad_norm": 0.38826431295550146, + "learning_rate": 0.00018873014471397224, + "loss": 0.6926, + "step": 445 + }, + { + "epoch": 0.1784, + "grad_norm": 0.3802235682610009, + "learning_rate": 0.00018867032301956088, + "loss": 0.6411, + "step": 446 + }, + { + "epoch": 0.1788, + "grad_norm": 0.3686143691302594, + "learning_rate": 0.00018861035250748343, + "loss": 0.6984, + "step": 447 + }, + { + "epoch": 0.1792, + "grad_norm": 0.37565487988507895, + "learning_rate": 0.00018855023327838983, + "loss": 0.7024, + "step": 448 + }, + { + "epoch": 0.1796, + "grad_norm": 0.3729423273925913, + "learning_rate": 0.00018848996543317982, + "loss": 0.7043, + "step": 449 + }, + { + "epoch": 0.18, + "grad_norm": 0.38784766614091365, + "learning_rate": 0.00018842954907300236, + "loss": 0.7266, + "step": 450 + }, + { + "epoch": 0.1804, + "grad_norm": 0.40027503237144674, + "learning_rate": 0.00018836898429925585, + "loss": 0.7562, + "step": 451 + }, + { + "epoch": 0.1808, + "grad_norm": 0.3828729125010602, + "learning_rate": 0.0001883082712135877, + "loss": 0.6473, + "step": 452 + }, + { + "epoch": 0.1812, + "grad_norm": 0.40764363051902, + "learning_rate": 0.00018824740991789415, + "loss": 0.7135, + "step": 453 + }, + { + "epoch": 0.1816, + "grad_norm": 0.3672450974926339, + "learning_rate": 0.00018818640051432035, + "loss": 0.7266, + "step": 454 + }, + { + "epoch": 0.182, + "grad_norm": 0.3638359282622377, + "learning_rate": 0.0001881252431052599, + "loss": 0.7062, + "step": 455 + }, + { + "epoch": 0.1824, + "grad_norm": 0.3718509816996577, + "learning_rate": 0.00018806393779335483, + "loss": 0.6764, + "step": 456 + }, + { + "epoch": 0.1828, + "grad_norm": 0.3703550388188459, + "learning_rate": 0.00018800248468149543, + "loss": 0.6773, + "step": 457 + }, + { + "epoch": 0.1832, + "grad_norm": 0.3880381643824319, + "learning_rate": 0.00018794088387282, + "loss": 0.6608, + "step": 458 + }, + { + "epoch": 0.1836, + "grad_norm": 0.5023937375660238, + "learning_rate": 0.00018787913547071484, + "loss": 0.6839, + "step": 459 + }, + { + "epoch": 0.184, + "grad_norm": 0.38790606135318434, + "learning_rate": 0.00018781723957881372, + "loss": 0.7163, + "step": 460 + }, + { + "epoch": 0.1844, + "grad_norm": 0.38612530797035416, + "learning_rate": 0.0001877551963009982, + "loss": 0.6792, + "step": 461 + }, + { + "epoch": 0.1848, + "grad_norm": 0.39648088910610624, + "learning_rate": 0.0001876930057413971, + "loss": 0.6931, + "step": 462 + }, + { + "epoch": 0.1852, + "grad_norm": 0.3620504076730805, + "learning_rate": 0.00018763066800438636, + "loss": 0.7105, + "step": 463 + }, + { + "epoch": 0.1856, + "grad_norm": 0.381143525075452, + "learning_rate": 0.00018756818319458907, + "loss": 0.7143, + "step": 464 + }, + { + "epoch": 0.186, + "grad_norm": 0.39980114683333307, + "learning_rate": 0.000187505551416875, + "loss": 0.6536, + "step": 465 + }, + { + "epoch": 0.1864, + "grad_norm": 0.36845926955292746, + "learning_rate": 0.0001874427727763607, + "loss": 0.7263, + "step": 466 + }, + { + "epoch": 0.1868, + "grad_norm": 0.38586647673684304, + "learning_rate": 0.0001873798473784092, + "loss": 0.6942, + "step": 467 + }, + { + "epoch": 0.1872, + "grad_norm": 0.39590597916453313, + "learning_rate": 0.00018731677532862976, + "loss": 0.6978, + "step": 468 + }, + { + "epoch": 0.1876, + "grad_norm": 0.3612337720822994, + "learning_rate": 0.00018725355673287778, + "loss": 0.6745, + "step": 469 + }, + { + "epoch": 0.188, + "grad_norm": 0.3977639592238108, + "learning_rate": 0.00018719019169725472, + "loss": 0.6738, + "step": 470 + }, + { + "epoch": 0.1884, + "grad_norm": 0.3718333358835038, + "learning_rate": 0.00018712668032810768, + "loss": 0.6761, + "step": 471 + }, + { + "epoch": 0.1888, + "grad_norm": 0.3836908486345542, + "learning_rate": 0.00018706302273202943, + "loss": 0.6954, + "step": 472 + }, + { + "epoch": 0.1892, + "grad_norm": 0.36142278549101325, + "learning_rate": 0.00018699921901585813, + "loss": 0.6762, + "step": 473 + }, + { + "epoch": 0.1896, + "grad_norm": 0.37599764474175124, + "learning_rate": 0.0001869352692866772, + "loss": 0.7131, + "step": 474 + }, + { + "epoch": 0.19, + "grad_norm": 0.37048262581086955, + "learning_rate": 0.00018687117365181512, + "loss": 0.6972, + "step": 475 + }, + { + "epoch": 0.1904, + "grad_norm": 0.3600691572898928, + "learning_rate": 0.00018680693221884517, + "loss": 0.7422, + "step": 476 + }, + { + "epoch": 0.1908, + "grad_norm": 0.3818288691556821, + "learning_rate": 0.00018674254509558544, + "loss": 0.7011, + "step": 477 + }, + { + "epoch": 0.1912, + "grad_norm": 0.3674052799125421, + "learning_rate": 0.00018667801239009846, + "loss": 0.705, + "step": 478 + }, + { + "epoch": 0.1916, + "grad_norm": 0.41930911669276566, + "learning_rate": 0.00018661333421069113, + "loss": 0.718, + "step": 479 + }, + { + "epoch": 0.192, + "grad_norm": 0.3913531615832862, + "learning_rate": 0.00018654851066591448, + "loss": 0.7438, + "step": 480 + }, + { + "epoch": 0.1924, + "grad_norm": 0.3557742897729816, + "learning_rate": 0.00018648354186456348, + "loss": 0.669, + "step": 481 + }, + { + "epoch": 0.1928, + "grad_norm": 0.3731384423646767, + "learning_rate": 0.000186418427915677, + "loss": 0.6502, + "step": 482 + }, + { + "epoch": 0.1932, + "grad_norm": 0.3867524755105473, + "learning_rate": 0.00018635316892853741, + "loss": 0.6892, + "step": 483 + }, + { + "epoch": 0.1936, + "grad_norm": 0.3650916952976356, + "learning_rate": 0.00018628776501267052, + "loss": 0.7087, + "step": 484 + }, + { + "epoch": 0.194, + "grad_norm": 0.3624557636642345, + "learning_rate": 0.0001862222162778454, + "loss": 0.6799, + "step": 485 + }, + { + "epoch": 0.1944, + "grad_norm": 0.3570561492880132, + "learning_rate": 0.0001861565228340742, + "loss": 0.7019, + "step": 486 + }, + { + "epoch": 0.1948, + "grad_norm": 0.37587842187370224, + "learning_rate": 0.00018609068479161182, + "loss": 0.692, + "step": 487 + }, + { + "epoch": 0.1952, + "grad_norm": 0.3778178257632083, + "learning_rate": 0.00018602470226095603, + "loss": 0.7456, + "step": 488 + }, + { + "epoch": 0.1956, + "grad_norm": 0.37179725458621526, + "learning_rate": 0.00018595857535284692, + "loss": 0.6848, + "step": 489 + }, + { + "epoch": 0.196, + "grad_norm": 0.40301659971709364, + "learning_rate": 0.00018589230417826697, + "loss": 0.7707, + "step": 490 + }, + { + "epoch": 0.1964, + "grad_norm": 0.3812236492427729, + "learning_rate": 0.00018582588884844084, + "loss": 0.6561, + "step": 491 + }, + { + "epoch": 0.1968, + "grad_norm": 0.36485508331427363, + "learning_rate": 0.00018575932947483502, + "loss": 0.6637, + "step": 492 + }, + { + "epoch": 0.1972, + "grad_norm": 0.3655147118213809, + "learning_rate": 0.00018569262616915784, + "loss": 0.6627, + "step": 493 + }, + { + "epoch": 0.1976, + "grad_norm": 0.35720316681945796, + "learning_rate": 0.00018562577904335912, + "loss": 0.6774, + "step": 494 + }, + { + "epoch": 0.198, + "grad_norm": 0.3662569867994764, + "learning_rate": 0.00018555878820963013, + "loss": 0.6054, + "step": 495 + }, + { + "epoch": 0.1984, + "grad_norm": 0.3840820833030506, + "learning_rate": 0.00018549165378040327, + "loss": 0.6556, + "step": 496 + }, + { + "epoch": 0.1988, + "grad_norm": 0.3896881309740263, + "learning_rate": 0.00018542437586835202, + "loss": 0.7036, + "step": 497 + }, + { + "epoch": 0.1992, + "grad_norm": 0.3948463129162245, + "learning_rate": 0.00018535695458639056, + "loss": 0.708, + "step": 498 + }, + { + "epoch": 0.1996, + "grad_norm": 0.41169778255777534, + "learning_rate": 0.00018528939004767376, + "loss": 0.715, + "step": 499 + }, + { + "epoch": 0.2, + "grad_norm": 0.3879039540215038, + "learning_rate": 0.00018522168236559695, + "loss": 0.7172, + "step": 500 + }, + { + "epoch": 0.2004, + "grad_norm": 0.355510555850886, + "learning_rate": 0.0001851538316537956, + "loss": 0.6596, + "step": 501 + }, + { + "epoch": 0.2008, + "grad_norm": 0.3610419600173354, + "learning_rate": 0.0001850858380261453, + "loss": 0.6548, + "step": 502 + }, + { + "epoch": 0.2012, + "grad_norm": 0.4072826093126328, + "learning_rate": 0.00018501770159676156, + "loss": 0.7069, + "step": 503 + }, + { + "epoch": 0.2016, + "grad_norm": 0.3734772225484452, + "learning_rate": 0.0001849494224799994, + "loss": 0.7277, + "step": 504 + }, + { + "epoch": 0.202, + "grad_norm": 0.3954106111018613, + "learning_rate": 0.00018488100079045344, + "loss": 0.7542, + "step": 505 + }, + { + "epoch": 0.2024, + "grad_norm": 0.36858612321909856, + "learning_rate": 0.0001848124366429576, + "loss": 0.6988, + "step": 506 + }, + { + "epoch": 0.2028, + "grad_norm": 0.4026041820405598, + "learning_rate": 0.00018474373015258473, + "loss": 0.7183, + "step": 507 + }, + { + "epoch": 0.2032, + "grad_norm": 0.3747512856185188, + "learning_rate": 0.0001846748814346468, + "loss": 0.6292, + "step": 508 + }, + { + "epoch": 0.2036, + "grad_norm": 0.40646861586250593, + "learning_rate": 0.00018460589060469425, + "loss": 0.7361, + "step": 509 + }, + { + "epoch": 0.204, + "grad_norm": 0.3712373168619134, + "learning_rate": 0.00018453675777851627, + "loss": 0.6965, + "step": 510 + }, + { + "epoch": 0.2044, + "grad_norm": 0.3822153730339224, + "learning_rate": 0.00018446748307214019, + "loss": 0.7495, + "step": 511 + }, + { + "epoch": 0.2048, + "grad_norm": 0.37312144201226494, + "learning_rate": 0.0001843980666018315, + "loss": 0.705, + "step": 512 + }, + { + "epoch": 0.2052, + "grad_norm": 0.385950831839263, + "learning_rate": 0.00018432850848409363, + "loss": 0.7575, + "step": 513 + }, + { + "epoch": 0.2056, + "grad_norm": 0.3741140719848538, + "learning_rate": 0.00018425880883566782, + "loss": 0.7222, + "step": 514 + }, + { + "epoch": 0.206, + "grad_norm": 0.3972815278946878, + "learning_rate": 0.0001841889677735327, + "loss": 0.7346, + "step": 515 + }, + { + "epoch": 0.2064, + "grad_norm": 0.38212963630442465, + "learning_rate": 0.00018411898541490434, + "loss": 0.6665, + "step": 516 + }, + { + "epoch": 0.2068, + "grad_norm": 0.3825583560569224, + "learning_rate": 0.0001840488618772359, + "loss": 0.6931, + "step": 517 + }, + { + "epoch": 0.2072, + "grad_norm": 0.39728934202351796, + "learning_rate": 0.00018397859727821748, + "loss": 0.7092, + "step": 518 + }, + { + "epoch": 0.2076, + "grad_norm": 0.40555236197334826, + "learning_rate": 0.00018390819173577598, + "loss": 0.6769, + "step": 519 + }, + { + "epoch": 0.208, + "grad_norm": 0.5609157488299549, + "learning_rate": 0.00018383764536807485, + "loss": 0.6946, + "step": 520 + }, + { + "epoch": 0.2084, + "grad_norm": 0.38043954785026474, + "learning_rate": 0.00018376695829351377, + "loss": 0.6994, + "step": 521 + }, + { + "epoch": 0.2088, + "grad_norm": 0.3716156223121875, + "learning_rate": 0.00018369613063072874, + "loss": 0.6695, + "step": 522 + }, + { + "epoch": 0.2092, + "grad_norm": 0.38068514825235367, + "learning_rate": 0.00018362516249859163, + "loss": 0.7272, + "step": 523 + }, + { + "epoch": 0.2096, + "grad_norm": 0.3754330491889234, + "learning_rate": 0.00018355405401621001, + "loss": 0.6926, + "step": 524 + }, + { + "epoch": 0.21, + "grad_norm": 0.3999675352290636, + "learning_rate": 0.00018348280530292713, + "loss": 0.716, + "step": 525 + }, + { + "epoch": 0.2104, + "grad_norm": 0.37528982607451183, + "learning_rate": 0.00018341141647832147, + "loss": 0.6607, + "step": 526 + }, + { + "epoch": 0.2108, + "grad_norm": 0.39089716567979277, + "learning_rate": 0.00018333988766220676, + "loss": 0.6711, + "step": 527 + }, + { + "epoch": 0.2112, + "grad_norm": 0.3658466970924529, + "learning_rate": 0.0001832682189746316, + "loss": 0.7213, + "step": 528 + }, + { + "epoch": 0.2116, + "grad_norm": 0.36261304242450976, + "learning_rate": 0.00018319641053587938, + "loss": 0.7012, + "step": 529 + }, + { + "epoch": 0.212, + "grad_norm": 0.3702824148173612, + "learning_rate": 0.0001831244624664681, + "loss": 0.7397, + "step": 530 + }, + { + "epoch": 0.2124, + "grad_norm": 0.3762171911426314, + "learning_rate": 0.00018305237488714995, + "loss": 0.6978, + "step": 531 + }, + { + "epoch": 0.2128, + "grad_norm": 0.3416142482473879, + "learning_rate": 0.00018298014791891137, + "loss": 0.6812, + "step": 532 + }, + { + "epoch": 0.2132, + "grad_norm": 0.3611274469188154, + "learning_rate": 0.00018290778168297277, + "loss": 0.7329, + "step": 533 + }, + { + "epoch": 0.2136, + "grad_norm": 0.3757944321097854, + "learning_rate": 0.00018283527630078825, + "loss": 0.6581, + "step": 534 + }, + { + "epoch": 0.214, + "grad_norm": 0.35800642170913016, + "learning_rate": 0.0001827626318940454, + "loss": 0.6836, + "step": 535 + }, + { + "epoch": 0.2144, + "grad_norm": 0.38446712295634733, + "learning_rate": 0.00018268984858466522, + "loss": 0.6543, + "step": 536 + }, + { + "epoch": 0.2148, + "grad_norm": 0.4016318412652301, + "learning_rate": 0.00018261692649480175, + "loss": 0.7244, + "step": 537 + }, + { + "epoch": 0.2152, + "grad_norm": 0.3809971119904624, + "learning_rate": 0.00018254386574684204, + "loss": 0.7082, + "step": 538 + }, + { + "epoch": 0.2156, + "grad_norm": 0.42070210464610197, + "learning_rate": 0.0001824706664634058, + "loss": 0.698, + "step": 539 + }, + { + "epoch": 0.216, + "grad_norm": 0.3654952949633688, + "learning_rate": 0.00018239732876734527, + "loss": 0.639, + "step": 540 + }, + { + "epoch": 0.2164, + "grad_norm": 0.3815611739066012, + "learning_rate": 0.0001823238527817449, + "loss": 0.7342, + "step": 541 + }, + { + "epoch": 0.2168, + "grad_norm": 0.3883767391244725, + "learning_rate": 0.00018225023862992142, + "loss": 0.7583, + "step": 542 + }, + { + "epoch": 0.2172, + "grad_norm": 0.3824779257270353, + "learning_rate": 0.00018217648643542323, + "loss": 0.6867, + "step": 543 + }, + { + "epoch": 0.2176, + "grad_norm": 0.3607319379743882, + "learning_rate": 0.0001821025963220306, + "loss": 0.6877, + "step": 544 + }, + { + "epoch": 0.218, + "grad_norm": 0.3860102243656817, + "learning_rate": 0.00018202856841375518, + "loss": 0.7322, + "step": 545 + }, + { + "epoch": 0.2184, + "grad_norm": 0.35714157126866863, + "learning_rate": 0.00018195440283483988, + "loss": 0.6767, + "step": 546 + }, + { + "epoch": 0.2188, + "grad_norm": 0.3824467030729281, + "learning_rate": 0.0001818800997097587, + "loss": 0.7056, + "step": 547 + }, + { + "epoch": 0.2192, + "grad_norm": 0.40057749915303886, + "learning_rate": 0.00018180565916321647, + "loss": 0.6617, + "step": 548 + }, + { + "epoch": 0.2196, + "grad_norm": 0.38911454764153103, + "learning_rate": 0.0001817310813201486, + "loss": 0.7121, + "step": 549 + }, + { + "epoch": 0.22, + "grad_norm": 0.3830635611411421, + "learning_rate": 0.0001816563663057211, + "loss": 0.6557, + "step": 550 + }, + { + "epoch": 0.2204, + "grad_norm": 0.36468480046935936, + "learning_rate": 0.00018158151424533002, + "loss": 0.6871, + "step": 551 + }, + { + "epoch": 0.2208, + "grad_norm": 0.3668031521515548, + "learning_rate": 0.00018150652526460146, + "loss": 0.6871, + "step": 552 + }, + { + "epoch": 0.2212, + "grad_norm": 0.3768692147099381, + "learning_rate": 0.00018143139948939137, + "loss": 0.669, + "step": 553 + }, + { + "epoch": 0.2216, + "grad_norm": 0.3832613262529067, + "learning_rate": 0.00018135613704578526, + "loss": 0.6778, + "step": 554 + }, + { + "epoch": 0.222, + "grad_norm": 0.3803908262267479, + "learning_rate": 0.000181280738060098, + "loss": 0.7378, + "step": 555 + }, + { + "epoch": 0.2224, + "grad_norm": 0.3662989151567502, + "learning_rate": 0.00018120520265887363, + "loss": 0.7141, + "step": 556 + }, + { + "epoch": 0.2228, + "grad_norm": 0.35040922293491916, + "learning_rate": 0.00018112953096888516, + "loss": 0.6102, + "step": 557 + }, + { + "epoch": 0.2232, + "grad_norm": 0.3674158329615544, + "learning_rate": 0.00018105372311713432, + "loss": 0.7266, + "step": 558 + }, + { + "epoch": 0.2236, + "grad_norm": 0.3616944009768484, + "learning_rate": 0.0001809777792308513, + "loss": 0.6999, + "step": 559 + }, + { + "epoch": 0.224, + "grad_norm": 0.36693103145374456, + "learning_rate": 0.00018090169943749476, + "loss": 0.6683, + "step": 560 + }, + { + "epoch": 0.2244, + "grad_norm": 0.36337409937433873, + "learning_rate": 0.0001808254838647513, + "loss": 0.648, + "step": 561 + }, + { + "epoch": 0.2248, + "grad_norm": 0.38399518685713585, + "learning_rate": 0.00018074913264053545, + "loss": 0.7389, + "step": 562 + }, + { + "epoch": 0.2252, + "grad_norm": 0.37042175198125565, + "learning_rate": 0.00018067264589298945, + "loss": 0.6548, + "step": 563 + }, + { + "epoch": 0.2256, + "grad_norm": 0.37049712923697325, + "learning_rate": 0.00018059602375048293, + "loss": 0.7153, + "step": 564 + }, + { + "epoch": 0.226, + "grad_norm": 0.3801771827533767, + "learning_rate": 0.00018051926634161282, + "loss": 0.6598, + "step": 565 + }, + { + "epoch": 0.2264, + "grad_norm": 0.38212696328862, + "learning_rate": 0.00018044237379520305, + "loss": 0.6387, + "step": 566 + }, + { + "epoch": 0.2268, + "grad_norm": 0.3613484282834108, + "learning_rate": 0.0001803653462403043, + "loss": 0.6351, + "step": 567 + }, + { + "epoch": 0.2272, + "grad_norm": 0.3685052769200213, + "learning_rate": 0.0001802881838061939, + "loss": 0.5813, + "step": 568 + }, + { + "epoch": 0.2276, + "grad_norm": 0.38269798265593397, + "learning_rate": 0.00018021088662237552, + "loss": 0.7184, + "step": 569 + }, + { + "epoch": 0.228, + "grad_norm": 0.3706152341283996, + "learning_rate": 0.00018013345481857903, + "loss": 0.6913, + "step": 570 + }, + { + "epoch": 0.2284, + "grad_norm": 0.35506592800145376, + "learning_rate": 0.00018005588852476015, + "loss": 0.6613, + "step": 571 + }, + { + "epoch": 0.2288, + "grad_norm": 0.3757597285473621, + "learning_rate": 0.00017997818787110042, + "loss": 0.6994, + "step": 572 + }, + { + "epoch": 0.2292, + "grad_norm": 0.3818200309145151, + "learning_rate": 0.0001799003529880068, + "loss": 0.696, + "step": 573 + }, + { + "epoch": 0.2296, + "grad_norm": 0.36612758373419035, + "learning_rate": 0.0001798223840061116, + "loss": 0.6853, + "step": 574 + }, + { + "epoch": 0.23, + "grad_norm": 0.3613173490452758, + "learning_rate": 0.00017974428105627208, + "loss": 0.7036, + "step": 575 + }, + { + "epoch": 0.2304, + "grad_norm": 0.3891757919963155, + "learning_rate": 0.00017966604426957047, + "loss": 0.7426, + "step": 576 + }, + { + "epoch": 0.2308, + "grad_norm": 0.3576378134002283, + "learning_rate": 0.00017958767377731358, + "loss": 0.6839, + "step": 577 + }, + { + "epoch": 0.2312, + "grad_norm": 0.3526132751603115, + "learning_rate": 0.00017950916971103259, + "loss": 0.6689, + "step": 578 + }, + { + "epoch": 0.2316, + "grad_norm": 0.35281699044640497, + "learning_rate": 0.00017943053220248283, + "loss": 0.6669, + "step": 579 + }, + { + "epoch": 0.232, + "grad_norm": 0.38185895530607183, + "learning_rate": 0.0001793517613836437, + "loss": 0.6896, + "step": 580 + }, + { + "epoch": 0.2324, + "grad_norm": 0.3808102528089049, + "learning_rate": 0.00017927285738671825, + "loss": 0.7001, + "step": 581 + }, + { + "epoch": 0.2328, + "grad_norm": 0.40342552440094603, + "learning_rate": 0.00017919382034413305, + "loss": 0.7306, + "step": 582 + }, + { + "epoch": 0.2332, + "grad_norm": 0.3927131868324498, + "learning_rate": 0.00017911465038853805, + "loss": 0.6665, + "step": 583 + }, + { + "epoch": 0.2336, + "grad_norm": 0.3778722392589643, + "learning_rate": 0.00017903534765280614, + "loss": 0.6318, + "step": 584 + }, + { + "epoch": 0.234, + "grad_norm": 0.4151287852900532, + "learning_rate": 0.00017895591227003315, + "loss": 0.7491, + "step": 585 + }, + { + "epoch": 0.2344, + "grad_norm": 0.4127180502215466, + "learning_rate": 0.00017887634437353754, + "loss": 0.7258, + "step": 586 + }, + { + "epoch": 0.2348, + "grad_norm": 0.3760329520227691, + "learning_rate": 0.00017879664409686008, + "loss": 0.6348, + "step": 587 + }, + { + "epoch": 0.2352, + "grad_norm": 0.38209501400968743, + "learning_rate": 0.00017871681157376383, + "loss": 0.7147, + "step": 588 + }, + { + "epoch": 0.2356, + "grad_norm": 0.3847708905853376, + "learning_rate": 0.00017863684693823374, + "loss": 0.6319, + "step": 589 + }, + { + "epoch": 0.236, + "grad_norm": 0.3840268112970994, + "learning_rate": 0.00017855675032447648, + "loss": 0.7028, + "step": 590 + }, + { + "epoch": 0.2364, + "grad_norm": 0.36321968809268107, + "learning_rate": 0.00017847652186692026, + "loss": 0.6773, + "step": 591 + }, + { + "epoch": 0.2368, + "grad_norm": 0.3889401405169338, + "learning_rate": 0.00017839616170021452, + "loss": 0.6702, + "step": 592 + }, + { + "epoch": 0.2372, + "grad_norm": 0.36552397912944695, + "learning_rate": 0.00017831566995922985, + "loss": 0.6149, + "step": 593 + }, + { + "epoch": 0.2376, + "grad_norm": 0.41483290567522496, + "learning_rate": 0.0001782350467790575, + "loss": 0.722, + "step": 594 + }, + { + "epoch": 0.238, + "grad_norm": 0.37673867043341613, + "learning_rate": 0.00017815429229500946, + "loss": 0.6631, + "step": 595 + }, + { + "epoch": 0.2384, + "grad_norm": 0.400236400217791, + "learning_rate": 0.00017807340664261802, + "loss": 0.7343, + "step": 596 + }, + { + "epoch": 0.2388, + "grad_norm": 0.3872262889336039, + "learning_rate": 0.00017799238995763568, + "loss": 0.6667, + "step": 597 + }, + { + "epoch": 0.2392, + "grad_norm": 0.36611894737878553, + "learning_rate": 0.00017791124237603477, + "loss": 0.6985, + "step": 598 + }, + { + "epoch": 0.2396, + "grad_norm": 0.3794928656592404, + "learning_rate": 0.00017782996403400736, + "loss": 0.6649, + "step": 599 + }, + { + "epoch": 0.24, + "grad_norm": 0.3737190197254289, + "learning_rate": 0.00017774855506796496, + "loss": 0.6962, + "step": 600 + }, + { + "epoch": 0.2404, + "grad_norm": 0.39463729811356335, + "learning_rate": 0.0001776670156145383, + "loss": 0.7304, + "step": 601 + }, + { + "epoch": 0.2408, + "grad_norm": 0.3831388487338996, + "learning_rate": 0.00017758534581057718, + "loss": 0.6629, + "step": 602 + }, + { + "epoch": 0.2412, + "grad_norm": 0.43051349105935655, + "learning_rate": 0.00017750354579315004, + "loss": 0.6777, + "step": 603 + }, + { + "epoch": 0.2416, + "grad_norm": 0.3815835573125915, + "learning_rate": 0.00017742161569954398, + "loss": 0.7249, + "step": 604 + }, + { + "epoch": 0.242, + "grad_norm": 0.40401973764275717, + "learning_rate": 0.0001773395556672644, + "loss": 0.6964, + "step": 605 + }, + { + "epoch": 0.2424, + "grad_norm": 0.3759793497162433, + "learning_rate": 0.0001772573658340347, + "loss": 0.6788, + "step": 606 + }, + { + "epoch": 0.2428, + "grad_norm": 0.362762388517486, + "learning_rate": 0.0001771750463377962, + "loss": 0.6848, + "step": 607 + }, + { + "epoch": 0.2432, + "grad_norm": 0.37107964853497377, + "learning_rate": 0.00017709259731670774, + "loss": 0.6923, + "step": 608 + }, + { + "epoch": 0.2436, + "grad_norm": 0.37303727524487695, + "learning_rate": 0.00017701001890914572, + "loss": 0.6886, + "step": 609 + }, + { + "epoch": 0.244, + "grad_norm": 0.3748572413204125, + "learning_rate": 0.00017692731125370354, + "loss": 0.6442, + "step": 610 + }, + { + "epoch": 0.2444, + "grad_norm": 0.3871237167689098, + "learning_rate": 0.00017684447448919154, + "loss": 0.7179, + "step": 611 + }, + { + "epoch": 0.2448, + "grad_norm": 0.3852499457759282, + "learning_rate": 0.00017676150875463686, + "loss": 0.6639, + "step": 612 + }, + { + "epoch": 0.2452, + "grad_norm": 0.39892523319546336, + "learning_rate": 0.0001766784141892829, + "loss": 0.6848, + "step": 613 + }, + { + "epoch": 0.2456, + "grad_norm": 0.38582941181479236, + "learning_rate": 0.0001765951909325895, + "loss": 0.7209, + "step": 614 + }, + { + "epoch": 0.246, + "grad_norm": 0.3693773371571141, + "learning_rate": 0.00017651183912423228, + "loss": 0.6578, + "step": 615 + }, + { + "epoch": 0.2464, + "grad_norm": 0.41863131309102863, + "learning_rate": 0.0001764283589041028, + "loss": 0.6928, + "step": 616 + }, + { + "epoch": 0.2468, + "grad_norm": 0.3729907993552498, + "learning_rate": 0.00017634475041230797, + "loss": 0.6805, + "step": 617 + }, + { + "epoch": 0.2472, + "grad_norm": 0.3652078874991456, + "learning_rate": 0.00017626101378917004, + "loss": 0.6434, + "step": 618 + }, + { + "epoch": 0.2476, + "grad_norm": 0.3841317724190862, + "learning_rate": 0.0001761771491752264, + "loss": 0.6609, + "step": 619 + }, + { + "epoch": 0.248, + "grad_norm": 0.38851731747912505, + "learning_rate": 0.0001760931567112291, + "loss": 0.7074, + "step": 620 + }, + { + "epoch": 0.2484, + "grad_norm": 0.370551859571575, + "learning_rate": 0.0001760090365381449, + "loss": 0.6655, + "step": 621 + }, + { + "epoch": 0.2488, + "grad_norm": 0.35499415306837123, + "learning_rate": 0.0001759247887971548, + "loss": 0.6747, + "step": 622 + }, + { + "epoch": 0.2492, + "grad_norm": 0.3845099556724429, + "learning_rate": 0.00017584041362965396, + "loss": 0.7022, + "step": 623 + }, + { + "epoch": 0.2496, + "grad_norm": 0.37613503730669373, + "learning_rate": 0.0001757559111772513, + "loss": 0.6742, + "step": 624 + }, + { + "epoch": 0.25, + "grad_norm": 0.3684496070745272, + "learning_rate": 0.00017567128158176953, + "loss": 0.6842, + "step": 625 + }, + { + "epoch": 0.2504, + "grad_norm": 0.3888794211882254, + "learning_rate": 0.0001755865249852446, + "loss": 0.6807, + "step": 626 + }, + { + "epoch": 0.2508, + "grad_norm": 0.40567984878124275, + "learning_rate": 0.00017550164152992573, + "loss": 0.7102, + "step": 627 + }, + { + "epoch": 0.2512, + "grad_norm": 0.4117379538927731, + "learning_rate": 0.00017541663135827492, + "loss": 0.6682, + "step": 628 + }, + { + "epoch": 0.2516, + "grad_norm": 0.3966339407445109, + "learning_rate": 0.000175331494612967, + "loss": 0.7165, + "step": 629 + }, + { + "epoch": 0.252, + "grad_norm": 0.3849902084616848, + "learning_rate": 0.00017524623143688902, + "loss": 0.674, + "step": 630 + }, + { + "epoch": 0.2524, + "grad_norm": 0.38338662340864577, + "learning_rate": 0.00017516084197314046, + "loss": 0.705, + "step": 631 + }, + { + "epoch": 0.2528, + "grad_norm": 0.36860128784352547, + "learning_rate": 0.00017507532636503256, + "loss": 0.7055, + "step": 632 + }, + { + "epoch": 0.2532, + "grad_norm": 0.3640341887601648, + "learning_rate": 0.00017498968475608838, + "loss": 0.6785, + "step": 633 + }, + { + "epoch": 0.2536, + "grad_norm": 0.3702796399289806, + "learning_rate": 0.00017490391729004244, + "loss": 0.6571, + "step": 634 + }, + { + "epoch": 0.254, + "grad_norm": 0.3982786533867165, + "learning_rate": 0.00017481802411084042, + "loss": 0.6869, + "step": 635 + }, + { + "epoch": 0.2544, + "grad_norm": 0.3913728183615705, + "learning_rate": 0.00017473200536263905, + "loss": 0.6273, + "step": 636 + }, + { + "epoch": 0.2548, + "grad_norm": 0.3644492049830817, + "learning_rate": 0.0001746458611898058, + "loss": 0.6912, + "step": 637 + }, + { + "epoch": 0.2552, + "grad_norm": 0.3850145886141273, + "learning_rate": 0.00017455959173691863, + "loss": 0.698, + "step": 638 + }, + { + "epoch": 0.2556, + "grad_norm": 0.4037588418859427, + "learning_rate": 0.00017447319714876579, + "loss": 0.6915, + "step": 639 + }, + { + "epoch": 0.256, + "grad_norm": 0.38514181666674996, + "learning_rate": 0.00017438667757034546, + "loss": 0.7011, + "step": 640 + }, + { + "epoch": 0.2564, + "grad_norm": 0.3804264348600396, + "learning_rate": 0.00017430003314686569, + "loss": 0.6562, + "step": 641 + }, + { + "epoch": 0.2568, + "grad_norm": 0.36701716408505514, + "learning_rate": 0.00017421326402374405, + "loss": 0.6247, + "step": 642 + }, + { + "epoch": 0.2572, + "grad_norm": 0.3791261640547457, + "learning_rate": 0.00017412637034660734, + "loss": 0.6627, + "step": 643 + }, + { + "epoch": 0.2576, + "grad_norm": 0.3590754847058719, + "learning_rate": 0.0001740393522612915, + "loss": 0.6937, + "step": 644 + }, + { + "epoch": 0.258, + "grad_norm": 0.35700417054100436, + "learning_rate": 0.0001739522099138411, + "loss": 0.7102, + "step": 645 + }, + { + "epoch": 0.2584, + "grad_norm": 0.3617050874886994, + "learning_rate": 0.00017386494345050942, + "loss": 0.7027, + "step": 646 + }, + { + "epoch": 0.2588, + "grad_norm": 0.3646321617837386, + "learning_rate": 0.000173777553017758, + "loss": 0.6648, + "step": 647 + }, + { + "epoch": 0.2592, + "grad_norm": 0.3625516760201923, + "learning_rate": 0.00017369003876225642, + "loss": 0.7316, + "step": 648 + }, + { + "epoch": 0.2596, + "grad_norm": 0.37263329756150554, + "learning_rate": 0.00017360240083088213, + "loss": 0.6924, + "step": 649 + }, + { + "epoch": 0.26, + "grad_norm": 0.376611064425148, + "learning_rate": 0.00017351463937072004, + "loss": 0.7019, + "step": 650 + }, + { + "epoch": 0.2604, + "grad_norm": 0.36198702197838944, + "learning_rate": 0.00017342675452906248, + "loss": 0.661, + "step": 651 + }, + { + "epoch": 0.2608, + "grad_norm": 0.3765301050003338, + "learning_rate": 0.00017333874645340884, + "loss": 0.6803, + "step": 652 + }, + { + "epoch": 0.2612, + "grad_norm": 0.369200244627375, + "learning_rate": 0.0001732506152914653, + "loss": 0.6466, + "step": 653 + }, + { + "epoch": 0.2616, + "grad_norm": 0.3707798765929919, + "learning_rate": 0.00017316236119114463, + "loss": 0.6924, + "step": 654 + }, + { + "epoch": 0.262, + "grad_norm": 0.35393038650976316, + "learning_rate": 0.00017307398430056593, + "loss": 0.6453, + "step": 655 + }, + { + "epoch": 0.2624, + "grad_norm": 0.3643741949790214, + "learning_rate": 0.00017298548476805446, + "loss": 0.6577, + "step": 656 + }, + { + "epoch": 0.2628, + "grad_norm": 0.3953980293463085, + "learning_rate": 0.00017289686274214118, + "loss": 0.7378, + "step": 657 + }, + { + "epoch": 0.2632, + "grad_norm": 0.37223081584921586, + "learning_rate": 0.00017280811837156268, + "loss": 0.6593, + "step": 658 + }, + { + "epoch": 0.2636, + "grad_norm": 0.38670120610896946, + "learning_rate": 0.00017271925180526094, + "loss": 0.6253, + "step": 659 + }, + { + "epoch": 0.264, + "grad_norm": 0.37613165176707375, + "learning_rate": 0.00017263026319238301, + "loss": 0.6898, + "step": 660 + }, + { + "epoch": 0.2644, + "grad_norm": 0.369258827355013, + "learning_rate": 0.0001725411526822807, + "loss": 0.6723, + "step": 661 + }, + { + "epoch": 0.2648, + "grad_norm": 0.35725508206210926, + "learning_rate": 0.0001724519204245105, + "loss": 0.6454, + "step": 662 + }, + { + "epoch": 0.2652, + "grad_norm": 0.36228632485926626, + "learning_rate": 0.0001723625665688331, + "loss": 0.6724, + "step": 663 + }, + { + "epoch": 0.2656, + "grad_norm": 0.3716104052474549, + "learning_rate": 0.00017227309126521348, + "loss": 0.7132, + "step": 664 + }, + { + "epoch": 0.266, + "grad_norm": 0.3785359212197475, + "learning_rate": 0.00017218349466382023, + "loss": 0.6516, + "step": 665 + }, + { + "epoch": 0.2664, + "grad_norm": 0.40494771256617085, + "learning_rate": 0.00017209377691502565, + "loss": 0.7157, + "step": 666 + }, + { + "epoch": 0.2668, + "grad_norm": 0.387566905753395, + "learning_rate": 0.0001720039381694053, + "loss": 0.6663, + "step": 667 + }, + { + "epoch": 0.2672, + "grad_norm": 0.4072399001753149, + "learning_rate": 0.00017191397857773788, + "loss": 0.6819, + "step": 668 + }, + { + "epoch": 0.2676, + "grad_norm": 0.35836036729376525, + "learning_rate": 0.00017182389829100485, + "loss": 0.7076, + "step": 669 + }, + { + "epoch": 0.268, + "grad_norm": 0.3724433995870969, + "learning_rate": 0.00017173369746039025, + "loss": 0.6463, + "step": 670 + }, + { + "epoch": 0.2684, + "grad_norm": 0.37686765784544657, + "learning_rate": 0.00017164337623728045, + "loss": 0.7049, + "step": 671 + }, + { + "epoch": 0.2688, + "grad_norm": 0.3873353017767109, + "learning_rate": 0.00017155293477326384, + "loss": 0.7006, + "step": 672 + }, + { + "epoch": 0.2692, + "grad_norm": 0.37233428622257264, + "learning_rate": 0.00017146237322013068, + "loss": 0.6929, + "step": 673 + }, + { + "epoch": 0.2696, + "grad_norm": 0.37389546581786853, + "learning_rate": 0.00017137169172987268, + "loss": 0.6666, + "step": 674 + }, + { + "epoch": 0.27, + "grad_norm": 0.3468004782863812, + "learning_rate": 0.00017128089045468294, + "loss": 0.6896, + "step": 675 + }, + { + "epoch": 0.2704, + "grad_norm": 0.3776082125284464, + "learning_rate": 0.00017118996954695553, + "loss": 0.6808, + "step": 676 + }, + { + "epoch": 0.2708, + "grad_norm": 0.3614927088426095, + "learning_rate": 0.00017109892915928535, + "loss": 0.6954, + "step": 677 + }, + { + "epoch": 0.2712, + "grad_norm": 0.4890730822409624, + "learning_rate": 0.00017100776944446781, + "loss": 0.7149, + "step": 678 + }, + { + "epoch": 0.2716, + "grad_norm": 0.35165166072488435, + "learning_rate": 0.00017091649055549855, + "loss": 0.676, + "step": 679 + }, + { + "epoch": 0.272, + "grad_norm": 0.3568818029823244, + "learning_rate": 0.0001708250926455733, + "loss": 0.7469, + "step": 680 + }, + { + "epoch": 0.2724, + "grad_norm": 0.350578974990701, + "learning_rate": 0.00017073357586808752, + "loss": 0.6526, + "step": 681 + }, + { + "epoch": 0.2728, + "grad_norm": 0.38843902431385663, + "learning_rate": 0.0001706419403766361, + "loss": 0.7067, + "step": 682 + }, + { + "epoch": 0.2732, + "grad_norm": 0.3842213506736467, + "learning_rate": 0.00017055018632501325, + "loss": 0.6636, + "step": 683 + }, + { + "epoch": 0.2736, + "grad_norm": 0.40072118060337003, + "learning_rate": 0.00017045831386721213, + "loss": 0.6932, + "step": 684 + }, + { + "epoch": 0.274, + "grad_norm": 0.3612018078843182, + "learning_rate": 0.00017036632315742462, + "loss": 0.6805, + "step": 685 + }, + { + "epoch": 0.2744, + "grad_norm": 0.34390762452297413, + "learning_rate": 0.00017027421435004112, + "loss": 0.6856, + "step": 686 + }, + { + "epoch": 0.2748, + "grad_norm": 0.41820407520924147, + "learning_rate": 0.00017018198759965016, + "loss": 0.665, + "step": 687 + }, + { + "epoch": 0.2752, + "grad_norm": 0.4158476097941947, + "learning_rate": 0.00017008964306103823, + "loss": 0.5753, + "step": 688 + }, + { + "epoch": 0.2756, + "grad_norm": 0.3693476703813386, + "learning_rate": 0.00016999718088918955, + "loss": 0.6863, + "step": 689 + }, + { + "epoch": 0.276, + "grad_norm": 0.3605022683275531, + "learning_rate": 0.00016990460123928575, + "loss": 0.6355, + "step": 690 + }, + { + "epoch": 0.2764, + "grad_norm": 0.35584585773469385, + "learning_rate": 0.0001698119042667056, + "loss": 0.6767, + "step": 691 + }, + { + "epoch": 0.2768, + "grad_norm": 0.40033377797346864, + "learning_rate": 0.00016971909012702483, + "loss": 0.6137, + "step": 692 + }, + { + "epoch": 0.2772, + "grad_norm": 0.3997013259402933, + "learning_rate": 0.00016962615897601573, + "loss": 0.6606, + "step": 693 + }, + { + "epoch": 0.2776, + "grad_norm": 0.3656529046170065, + "learning_rate": 0.00016953311096964705, + "loss": 0.6594, + "step": 694 + }, + { + "epoch": 0.278, + "grad_norm": 0.4052724353425028, + "learning_rate": 0.00016943994626408363, + "loss": 0.6518, + "step": 695 + }, + { + "epoch": 0.2784, + "grad_norm": 0.36842188704362194, + "learning_rate": 0.00016934666501568617, + "loss": 0.6173, + "step": 696 + }, + { + "epoch": 0.2788, + "grad_norm": 0.3647712480521673, + "learning_rate": 0.00016925326738101098, + "loss": 0.6558, + "step": 697 + }, + { + "epoch": 0.2792, + "grad_norm": 0.37886961387347196, + "learning_rate": 0.00016915975351680968, + "loss": 0.6386, + "step": 698 + }, + { + "epoch": 0.2796, + "grad_norm": 0.3685950283804296, + "learning_rate": 0.000169066123580029, + "loss": 0.6264, + "step": 699 + }, + { + "epoch": 0.28, + "grad_norm": 0.3633280590433798, + "learning_rate": 0.00016897237772781044, + "loss": 0.6638, + "step": 700 + }, + { + "epoch": 0.2804, + "grad_norm": 0.3732030051110284, + "learning_rate": 0.00016887851611749005, + "loss": 0.655, + "step": 701 + }, + { + "epoch": 0.2808, + "grad_norm": 0.3651617189481963, + "learning_rate": 0.00016878453890659814, + "loss": 0.7264, + "step": 702 + }, + { + "epoch": 0.2812, + "grad_norm": 0.36780411902884236, + "learning_rate": 0.0001686904462528591, + "loss": 0.6849, + "step": 703 + }, + { + "epoch": 0.2816, + "grad_norm": 0.35258092853914086, + "learning_rate": 0.000168596238314191, + "loss": 0.6552, + "step": 704 + }, + { + "epoch": 0.282, + "grad_norm": 0.3723815062634396, + "learning_rate": 0.00016850191524870546, + "loss": 0.6614, + "step": 705 + }, + { + "epoch": 0.2824, + "grad_norm": 0.3639460261569966, + "learning_rate": 0.00016840747721470731, + "loss": 0.6867, + "step": 706 + }, + { + "epoch": 0.2828, + "grad_norm": 0.37330295108405304, + "learning_rate": 0.00016831292437069427, + "loss": 0.6791, + "step": 707 + }, + { + "epoch": 0.2832, + "grad_norm": 0.3738837669495127, + "learning_rate": 0.00016821825687535674, + "loss": 0.6462, + "step": 708 + }, + { + "epoch": 0.2836, + "grad_norm": 0.3713143980678414, + "learning_rate": 0.00016812347488757772, + "loss": 0.6776, + "step": 709 + }, + { + "epoch": 0.284, + "grad_norm": 0.3956676625326154, + "learning_rate": 0.00016802857856643215, + "loss": 0.7719, + "step": 710 + }, + { + "epoch": 0.2844, + "grad_norm": 0.3633848095765644, + "learning_rate": 0.00016793356807118695, + "loss": 0.7062, + "step": 711 + }, + { + "epoch": 0.2848, + "grad_norm": 0.37781250994760474, + "learning_rate": 0.00016783844356130071, + "loss": 0.6277, + "step": 712 + }, + { + "epoch": 0.2852, + "grad_norm": 0.37066789686772716, + "learning_rate": 0.0001677432051964233, + "loss": 0.6883, + "step": 713 + }, + { + "epoch": 0.2856, + "grad_norm": 0.34506269849481763, + "learning_rate": 0.0001676478531363957, + "loss": 0.6532, + "step": 714 + }, + { + "epoch": 0.286, + "grad_norm": 0.3789970406349399, + "learning_rate": 0.00016755238754124965, + "loss": 0.6672, + "step": 715 + }, + { + "epoch": 0.2864, + "grad_norm": 0.36544272597592525, + "learning_rate": 0.00016745680857120757, + "loss": 0.6744, + "step": 716 + }, + { + "epoch": 0.2868, + "grad_norm": 0.3728910819407148, + "learning_rate": 0.00016736111638668204, + "loss": 0.6433, + "step": 717 + }, + { + "epoch": 0.2872, + "grad_norm": 0.3549090324306322, + "learning_rate": 0.00016726531114827573, + "loss": 0.5937, + "step": 718 + }, + { + "epoch": 0.2876, + "grad_norm": 0.35314191620570956, + "learning_rate": 0.00016716939301678098, + "loss": 0.6245, + "step": 719 + }, + { + "epoch": 0.288, + "grad_norm": 0.35969773999782906, + "learning_rate": 0.00016707336215317968, + "loss": 0.6847, + "step": 720 + }, + { + "epoch": 0.2884, + "grad_norm": 0.4214046623787497, + "learning_rate": 0.00016697721871864284, + "loss": 0.7331, + "step": 721 + }, + { + "epoch": 0.2888, + "grad_norm": 0.35052029199312573, + "learning_rate": 0.00016688096287453046, + "loss": 0.6652, + "step": 722 + }, + { + "epoch": 0.2892, + "grad_norm": 0.36484181678208727, + "learning_rate": 0.00016678459478239118, + "loss": 0.6677, + "step": 723 + }, + { + "epoch": 0.2896, + "grad_norm": 0.3790540606763471, + "learning_rate": 0.00016668811460396202, + "loss": 0.6642, + "step": 724 + }, + { + "epoch": 0.29, + "grad_norm": 0.36849657826638405, + "learning_rate": 0.00016659152250116812, + "loss": 0.6618, + "step": 725 + }, + { + "epoch": 0.2904, + "grad_norm": 0.3892714920294252, + "learning_rate": 0.00016649481863612248, + "loss": 0.6795, + "step": 726 + }, + { + "epoch": 0.2908, + "grad_norm": 0.35662611125712923, + "learning_rate": 0.0001663980031711257, + "loss": 0.5695, + "step": 727 + }, + { + "epoch": 0.2912, + "grad_norm": 0.40575267772496953, + "learning_rate": 0.00016630107626866558, + "loss": 0.7215, + "step": 728 + }, + { + "epoch": 0.2916, + "grad_norm": 0.372483669398152, + "learning_rate": 0.00016620403809141705, + "loss": 0.6797, + "step": 729 + }, + { + "epoch": 0.292, + "grad_norm": 0.36510913163052217, + "learning_rate": 0.00016610688880224178, + "loss": 0.6857, + "step": 730 + }, + { + "epoch": 0.2924, + "grad_norm": 0.38278306058860057, + "learning_rate": 0.00016600962856418782, + "loss": 0.632, + "step": 731 + }, + { + "epoch": 0.2928, + "grad_norm": 0.3610059628511899, + "learning_rate": 0.00016591225754048963, + "loss": 0.6671, + "step": 732 + }, + { + "epoch": 0.2932, + "grad_norm": 0.35661460366895475, + "learning_rate": 0.00016581477589456734, + "loss": 0.6547, + "step": 733 + }, + { + "epoch": 0.2936, + "grad_norm": 0.3476447348320809, + "learning_rate": 0.00016571718379002705, + "loss": 0.6387, + "step": 734 + }, + { + "epoch": 0.294, + "grad_norm": 0.3692442697306115, + "learning_rate": 0.00016561948139065996, + "loss": 0.7041, + "step": 735 + }, + { + "epoch": 0.2944, + "grad_norm": 0.35450747006735767, + "learning_rate": 0.00016552166886044253, + "loss": 0.7146, + "step": 736 + }, + { + "epoch": 0.2948, + "grad_norm": 0.3669567893885407, + "learning_rate": 0.00016542374636353604, + "loss": 0.6699, + "step": 737 + }, + { + "epoch": 0.2952, + "grad_norm": 0.3594868978946008, + "learning_rate": 0.0001653257140642863, + "loss": 0.6495, + "step": 738 + }, + { + "epoch": 0.2956, + "grad_norm": 0.3634931926296147, + "learning_rate": 0.00016522757212722344, + "loss": 0.6649, + "step": 739 + }, + { + "epoch": 0.296, + "grad_norm": 0.350291190620466, + "learning_rate": 0.00016512932071706152, + "loss": 0.6881, + "step": 740 + }, + { + "epoch": 0.2964, + "grad_norm": 0.3610260685199456, + "learning_rate": 0.0001650309599986985, + "loss": 0.6776, + "step": 741 + }, + { + "epoch": 0.2968, + "grad_norm": 0.3569659676953712, + "learning_rate": 0.00016493249013721558, + "loss": 0.6505, + "step": 742 + }, + { + "epoch": 0.2972, + "grad_norm": 0.36554646148471603, + "learning_rate": 0.00016483391129787727, + "loss": 0.6614, + "step": 743 + }, + { + "epoch": 0.2976, + "grad_norm": 0.350784920726585, + "learning_rate": 0.000164735223646131, + "loss": 0.6603, + "step": 744 + }, + { + "epoch": 0.298, + "grad_norm": 0.347788847751402, + "learning_rate": 0.0001646364273476067, + "loss": 0.6799, + "step": 745 + }, + { + "epoch": 0.2984, + "grad_norm": 0.3626403008330356, + "learning_rate": 0.00016453752256811674, + "loss": 0.6834, + "step": 746 + }, + { + "epoch": 0.2988, + "grad_norm": 0.3657655470550385, + "learning_rate": 0.00016443850947365558, + "loss": 0.6584, + "step": 747 + }, + { + "epoch": 0.2992, + "grad_norm": 0.34036129141990146, + "learning_rate": 0.0001643393882303994, + "loss": 0.6319, + "step": 748 + }, + { + "epoch": 0.2996, + "grad_norm": 0.35126675290236997, + "learning_rate": 0.00016424015900470587, + "loss": 0.6651, + "step": 749 + }, + { + "epoch": 0.3, + "grad_norm": 0.35391930145941913, + "learning_rate": 0.000164140821963114, + "loss": 0.6863, + "step": 750 + }, + { + "epoch": 0.3004, + "grad_norm": 0.370861156187132, + "learning_rate": 0.00016404137727234365, + "loss": 0.6459, + "step": 751 + }, + { + "epoch": 0.3008, + "grad_norm": 0.35987999591978354, + "learning_rate": 0.00016394182509929536, + "loss": 0.672, + "step": 752 + }, + { + "epoch": 0.3012, + "grad_norm": 0.3587536922915207, + "learning_rate": 0.00016384216561105014, + "loss": 0.6978, + "step": 753 + }, + { + "epoch": 0.3016, + "grad_norm": 0.3511980224029837, + "learning_rate": 0.000163742398974869, + "loss": 0.6422, + "step": 754 + }, + { + "epoch": 0.302, + "grad_norm": 0.34901958545175216, + "learning_rate": 0.00016364252535819282, + "loss": 0.6684, + "step": 755 + }, + { + "epoch": 0.3024, + "grad_norm": 0.33961478583723576, + "learning_rate": 0.00016354254492864211, + "loss": 0.6703, + "step": 756 + }, + { + "epoch": 0.3028, + "grad_norm": 0.34445952568829874, + "learning_rate": 0.00016344245785401653, + "loss": 0.6511, + "step": 757 + }, + { + "epoch": 0.3032, + "grad_norm": 0.35753721007263206, + "learning_rate": 0.00016334226430229475, + "loss": 0.7074, + "step": 758 + }, + { + "epoch": 0.3036, + "grad_norm": 0.37499914558856595, + "learning_rate": 0.00016324196444163423, + "loss": 0.7285, + "step": 759 + }, + { + "epoch": 0.304, + "grad_norm": 0.3630558022939141, + "learning_rate": 0.00016314155844037074, + "loss": 0.6575, + "step": 760 + }, + { + "epoch": 0.3044, + "grad_norm": 0.35329110139187386, + "learning_rate": 0.0001630410464670182, + "loss": 0.6434, + "step": 761 + }, + { + "epoch": 0.3048, + "grad_norm": 0.3639095040912134, + "learning_rate": 0.00016294042869026851, + "loss": 0.648, + "step": 762 + }, + { + "epoch": 0.3052, + "grad_norm": 0.3843760678407015, + "learning_rate": 0.000162839705278991, + "loss": 0.6508, + "step": 763 + }, + { + "epoch": 0.3056, + "grad_norm": 0.3872124608722155, + "learning_rate": 0.0001627388764022323, + "loss": 0.6649, + "step": 764 + }, + { + "epoch": 0.306, + "grad_norm": 0.3768132943244325, + "learning_rate": 0.0001626379422292162, + "loss": 0.6286, + "step": 765 + }, + { + "epoch": 0.3064, + "grad_norm": 0.36204946392202214, + "learning_rate": 0.000162536902929343, + "loss": 0.6278, + "step": 766 + }, + { + "epoch": 0.3068, + "grad_norm": 0.36082703461295695, + "learning_rate": 0.00016243575867218958, + "loss": 0.6616, + "step": 767 + }, + { + "epoch": 0.3072, + "grad_norm": 0.3484587533493218, + "learning_rate": 0.00016233450962750893, + "loss": 0.6315, + "step": 768 + }, + { + "epoch": 0.3076, + "grad_norm": 0.34636803888691087, + "learning_rate": 0.00016223315596522987, + "loss": 0.6898, + "step": 769 + }, + { + "epoch": 0.308, + "grad_norm": 0.38751438455311005, + "learning_rate": 0.0001621316978554569, + "loss": 0.7102, + "step": 770 + }, + { + "epoch": 0.3084, + "grad_norm": 0.36662104390459777, + "learning_rate": 0.00016203013546846966, + "loss": 0.7066, + "step": 771 + }, + { + "epoch": 0.3088, + "grad_norm": 0.35577550680251185, + "learning_rate": 0.00016192846897472297, + "loss": 0.6634, + "step": 772 + }, + { + "epoch": 0.3092, + "grad_norm": 0.36344365908813225, + "learning_rate": 0.0001618266985448463, + "loss": 0.6888, + "step": 773 + }, + { + "epoch": 0.3096, + "grad_norm": 0.35338050166466156, + "learning_rate": 0.00016172482434964353, + "loss": 0.6299, + "step": 774 + }, + { + "epoch": 0.31, + "grad_norm": 0.36305744063960965, + "learning_rate": 0.00016162284656009274, + "loss": 0.6935, + "step": 775 + }, + { + "epoch": 0.3104, + "grad_norm": 0.37054085691252775, + "learning_rate": 0.00016152076534734584, + "loss": 0.6571, + "step": 776 + }, + { + "epoch": 0.3108, + "grad_norm": 0.3950689393757759, + "learning_rate": 0.00016141858088272837, + "loss": 0.7024, + "step": 777 + }, + { + "epoch": 0.3112, + "grad_norm": 0.35527100014035695, + "learning_rate": 0.00016131629333773908, + "loss": 0.6599, + "step": 778 + }, + { + "epoch": 0.3116, + "grad_norm": 0.37204052038957597, + "learning_rate": 0.0001612139028840498, + "loss": 0.7039, + "step": 779 + }, + { + "epoch": 0.312, + "grad_norm": 0.35028572528380963, + "learning_rate": 0.00016111140969350503, + "loss": 0.6679, + "step": 780 + }, + { + "epoch": 0.3124, + "grad_norm": 0.3952564467547976, + "learning_rate": 0.0001610088139381217, + "loss": 0.7119, + "step": 781 + }, + { + "epoch": 0.3128, + "grad_norm": 0.3589953166918137, + "learning_rate": 0.00016090611579008888, + "loss": 0.6773, + "step": 782 + }, + { + "epoch": 0.3132, + "grad_norm": 0.35319758705458487, + "learning_rate": 0.00016080331542176753, + "loss": 0.6585, + "step": 783 + }, + { + "epoch": 0.3136, + "grad_norm": 0.3677613546602713, + "learning_rate": 0.00016070041300569012, + "loss": 0.6966, + "step": 784 + }, + { + "epoch": 0.314, + "grad_norm": 0.38700046926645354, + "learning_rate": 0.00016059740871456036, + "loss": 0.6679, + "step": 785 + }, + { + "epoch": 0.3144, + "grad_norm": 0.37625609996376963, + "learning_rate": 0.000160494302721253, + "loss": 0.6678, + "step": 786 + }, + { + "epoch": 0.3148, + "grad_norm": 0.37295043227958613, + "learning_rate": 0.0001603910951988135, + "loss": 0.685, + "step": 787 + }, + { + "epoch": 0.3152, + "grad_norm": 0.3560683487097046, + "learning_rate": 0.00016028778632045762, + "loss": 0.6134, + "step": 788 + }, + { + "epoch": 0.3156, + "grad_norm": 0.3601034178766075, + "learning_rate": 0.00016018437625957133, + "loss": 0.6901, + "step": 789 + }, + { + "epoch": 0.316, + "grad_norm": 0.3705501734874379, + "learning_rate": 0.00016008086518971037, + "loss": 0.7209, + "step": 790 + }, + { + "epoch": 0.3164, + "grad_norm": 0.3914966449738789, + "learning_rate": 0.0001599772532846, + "loss": 0.6661, + "step": 791 + }, + { + "epoch": 0.3168, + "grad_norm": 0.38500568784698896, + "learning_rate": 0.0001598735407181347, + "loss": 0.6657, + "step": 792 + }, + { + "epoch": 0.3172, + "grad_norm": 0.3667191488097956, + "learning_rate": 0.00015976972766437795, + "loss": 0.6769, + "step": 793 + }, + { + "epoch": 0.3176, + "grad_norm": 0.3740913595595519, + "learning_rate": 0.00015966581429756183, + "loss": 0.6601, + "step": 794 + }, + { + "epoch": 0.318, + "grad_norm": 0.354469593288666, + "learning_rate": 0.00015956180079208682, + "loss": 0.6903, + "step": 795 + }, + { + "epoch": 0.3184, + "grad_norm": 0.35637752265785166, + "learning_rate": 0.00015945768732252144, + "loss": 0.6529, + "step": 796 + }, + { + "epoch": 0.3188, + "grad_norm": 0.34671445467104656, + "learning_rate": 0.00015935347406360192, + "loss": 0.6213, + "step": 797 + }, + { + "epoch": 0.3192, + "grad_norm": 0.41260532402747196, + "learning_rate": 0.00015924916119023212, + "loss": 0.7666, + "step": 798 + }, + { + "epoch": 0.3196, + "grad_norm": 0.37980397342432365, + "learning_rate": 0.00015914474887748295, + "loss": 0.688, + "step": 799 + }, + { + "epoch": 0.32, + "grad_norm": 0.3471779827666835, + "learning_rate": 0.00015904023730059228, + "loss": 0.6414, + "step": 800 + }, + { + "epoch": 0.3204, + "grad_norm": 0.39442178437895997, + "learning_rate": 0.0001589356266349645, + "loss": 0.702, + "step": 801 + }, + { + "epoch": 0.3208, + "grad_norm": 0.36392050739465226, + "learning_rate": 0.00015883091705617045, + "loss": 0.685, + "step": 802 + }, + { + "epoch": 0.3212, + "grad_norm": 0.3730467393953074, + "learning_rate": 0.00015872610873994685, + "loss": 0.6942, + "step": 803 + }, + { + "epoch": 0.3216, + "grad_norm": 0.351615133925163, + "learning_rate": 0.00015862120186219613, + "loss": 0.6402, + "step": 804 + }, + { + "epoch": 0.322, + "grad_norm": 0.3595595588948544, + "learning_rate": 0.00015851619659898623, + "loss": 0.6736, + "step": 805 + }, + { + "epoch": 0.3224, + "grad_norm": 0.41516559446603896, + "learning_rate": 0.00015841109312655016, + "loss": 0.6576, + "step": 806 + }, + { + "epoch": 0.3228, + "grad_norm": 0.35512516476410827, + "learning_rate": 0.00015830589162128572, + "loss": 0.6371, + "step": 807 + }, + { + "epoch": 0.3232, + "grad_norm": 0.36836803104245197, + "learning_rate": 0.00015820059225975531, + "loss": 0.6896, + "step": 808 + }, + { + "epoch": 0.3236, + "grad_norm": 0.3968393493789091, + "learning_rate": 0.0001580951952186856, + "loss": 0.7132, + "step": 809 + }, + { + "epoch": 0.324, + "grad_norm": 0.36669128320751304, + "learning_rate": 0.000157989700674967, + "loss": 0.6834, + "step": 810 + }, + { + "epoch": 0.3244, + "grad_norm": 0.37771858320485735, + "learning_rate": 0.00015788410880565379, + "loss": 0.7076, + "step": 811 + }, + { + "epoch": 0.3248, + "grad_norm": 0.3653533497184648, + "learning_rate": 0.00015777841978796347, + "loss": 0.686, + "step": 812 + }, + { + "epoch": 0.3252, + "grad_norm": 0.3719804998835032, + "learning_rate": 0.0001576726337992766, + "loss": 0.7133, + "step": 813 + }, + { + "epoch": 0.3256, + "grad_norm": 0.34430137750629114, + "learning_rate": 0.00015756675101713657, + "loss": 0.6281, + "step": 814 + }, + { + "epoch": 0.326, + "grad_norm": 0.3874952604990868, + "learning_rate": 0.00015746077161924905, + "loss": 0.6883, + "step": 815 + }, + { + "epoch": 0.3264, + "grad_norm": 0.3602363795621833, + "learning_rate": 0.00015735469578348208, + "loss": 0.658, + "step": 816 + }, + { + "epoch": 0.3268, + "grad_norm": 0.3678885252566548, + "learning_rate": 0.00015724852368786537, + "loss": 0.6451, + "step": 817 + }, + { + "epoch": 0.3272, + "grad_norm": 0.37231861031745034, + "learning_rate": 0.0001571422555105903, + "loss": 0.6797, + "step": 818 + }, + { + "epoch": 0.3276, + "grad_norm": 0.36582627986791816, + "learning_rate": 0.0001570358914300094, + "loss": 0.6686, + "step": 819 + }, + { + "epoch": 0.328, + "grad_norm": 0.3751003188638191, + "learning_rate": 0.00015692943162463628, + "loss": 0.6865, + "step": 820 + }, + { + "epoch": 0.3284, + "grad_norm": 0.35279218250418054, + "learning_rate": 0.00015682287627314515, + "loss": 0.6436, + "step": 821 + }, + { + "epoch": 0.3288, + "grad_norm": 0.39811222024035187, + "learning_rate": 0.00015671622555437053, + "loss": 0.6913, + "step": 822 + }, + { + "epoch": 0.3292, + "grad_norm": 0.35173620983932047, + "learning_rate": 0.00015660947964730708, + "loss": 0.6351, + "step": 823 + }, + { + "epoch": 0.3296, + "grad_norm": 0.35186063275235907, + "learning_rate": 0.0001565026387311092, + "loss": 0.6351, + "step": 824 + }, + { + "epoch": 0.33, + "grad_norm": 0.3582217383025288, + "learning_rate": 0.00015639570298509064, + "loss": 0.6668, + "step": 825 + }, + { + "epoch": 0.3304, + "grad_norm": 0.3528654454610766, + "learning_rate": 0.0001562886725887245, + "loss": 0.6337, + "step": 826 + }, + { + "epoch": 0.3308, + "grad_norm": 0.3779509939019467, + "learning_rate": 0.00015618154772164256, + "loss": 0.7223, + "step": 827 + }, + { + "epoch": 0.3312, + "grad_norm": 0.36399592374656126, + "learning_rate": 0.00015607432856363525, + "loss": 0.6322, + "step": 828 + }, + { + "epoch": 0.3316, + "grad_norm": 0.3555705327070775, + "learning_rate": 0.00015596701529465117, + "loss": 0.609, + "step": 829 + }, + { + "epoch": 0.332, + "grad_norm": 0.37182042716754116, + "learning_rate": 0.00015585960809479696, + "loss": 0.6626, + "step": 830 + }, + { + "epoch": 0.3324, + "grad_norm": 0.3668089208734812, + "learning_rate": 0.00015575210714433686, + "loss": 0.6255, + "step": 831 + }, + { + "epoch": 0.3328, + "grad_norm": 0.3637504693784944, + "learning_rate": 0.00015564451262369247, + "loss": 0.6611, + "step": 832 + }, + { + "epoch": 0.3332, + "grad_norm": 0.36434482436667015, + "learning_rate": 0.00015553682471344238, + "loss": 0.7383, + "step": 833 + }, + { + "epoch": 0.3336, + "grad_norm": 0.39528366147657606, + "learning_rate": 0.00015542904359432198, + "loss": 0.6729, + "step": 834 + }, + { + "epoch": 0.334, + "grad_norm": 0.3606252529006789, + "learning_rate": 0.00015532116944722308, + "loss": 0.6206, + "step": 835 + }, + { + "epoch": 0.3344, + "grad_norm": 0.3431618001058578, + "learning_rate": 0.00015521320245319363, + "loss": 0.6346, + "step": 836 + }, + { + "epoch": 0.3348, + "grad_norm": 0.35978080940778, + "learning_rate": 0.00015510514279343734, + "loss": 0.6922, + "step": 837 + }, + { + "epoch": 0.3352, + "grad_norm": 0.3511258242838397, + "learning_rate": 0.00015499699064931355, + "loss": 0.6275, + "step": 838 + }, + { + "epoch": 0.3356, + "grad_norm": 0.35577057580095667, + "learning_rate": 0.00015488874620233674, + "loss": 0.6601, + "step": 839 + }, + { + "epoch": 0.336, + "grad_norm": 0.4011721496975597, + "learning_rate": 0.0001547804096341763, + "loss": 0.6742, + "step": 840 + }, + { + "epoch": 0.3364, + "grad_norm": 0.3781361898780127, + "learning_rate": 0.00015467198112665632, + "loss": 0.6873, + "step": 841 + }, + { + "epoch": 0.3368, + "grad_norm": 0.3585323822428953, + "learning_rate": 0.0001545634608617551, + "loss": 0.6562, + "step": 842 + }, + { + "epoch": 0.3372, + "grad_norm": 0.36587737160692985, + "learning_rate": 0.00015445484902160491, + "loss": 0.6777, + "step": 843 + }, + { + "epoch": 0.3376, + "grad_norm": 0.3789378464952347, + "learning_rate": 0.00015434614578849188, + "loss": 0.6335, + "step": 844 + }, + { + "epoch": 0.338, + "grad_norm": 0.3605160743547415, + "learning_rate": 0.00015423735134485536, + "loss": 0.6728, + "step": 845 + }, + { + "epoch": 0.3384, + "grad_norm": 0.3636761377770127, + "learning_rate": 0.00015412846587328782, + "loss": 0.6506, + "step": 846 + }, + { + "epoch": 0.3388, + "grad_norm": 0.3488872370925301, + "learning_rate": 0.0001540194895565346, + "loss": 0.65, + "step": 847 + }, + { + "epoch": 0.3392, + "grad_norm": 0.375624221182326, + "learning_rate": 0.00015391042257749336, + "loss": 0.671, + "step": 848 + }, + { + "epoch": 0.3396, + "grad_norm": 0.35330262247588673, + "learning_rate": 0.00015380126511921403, + "loss": 0.6714, + "step": 849 + }, + { + "epoch": 0.34, + "grad_norm": 0.35426489657914634, + "learning_rate": 0.0001536920173648984, + "loss": 0.669, + "step": 850 + }, + { + "epoch": 0.3404, + "grad_norm": 0.3723971472909655, + "learning_rate": 0.00015358267949789966, + "loss": 0.6926, + "step": 851 + }, + { + "epoch": 0.3408, + "grad_norm": 0.3925138623019826, + "learning_rate": 0.00015347325170172245, + "loss": 0.6884, + "step": 852 + }, + { + "epoch": 0.3412, + "grad_norm": 0.3737024778126771, + "learning_rate": 0.0001533637341600221, + "loss": 0.6918, + "step": 853 + }, + { + "epoch": 0.3416, + "grad_norm": 0.36923274420514496, + "learning_rate": 0.0001532541270566049, + "loss": 0.6423, + "step": 854 + }, + { + "epoch": 0.342, + "grad_norm": 0.3650733794413604, + "learning_rate": 0.00015314443057542703, + "loss": 0.6589, + "step": 855 + }, + { + "epoch": 0.3424, + "grad_norm": 0.39054808959550796, + "learning_rate": 0.00015303464490059506, + "loss": 0.6576, + "step": 856 + }, + { + "epoch": 0.3428, + "grad_norm": 0.3586992537018854, + "learning_rate": 0.00015292477021636497, + "loss": 0.6529, + "step": 857 + }, + { + "epoch": 0.3432, + "grad_norm": 0.37092971161160443, + "learning_rate": 0.0001528148067071423, + "loss": 0.6625, + "step": 858 + }, + { + "epoch": 0.3436, + "grad_norm": 0.33410515280317205, + "learning_rate": 0.00015270475455748166, + "loss": 0.632, + "step": 859 + }, + { + "epoch": 0.344, + "grad_norm": 0.3640275136273407, + "learning_rate": 0.00015259461395208628, + "loss": 0.662, + "step": 860 + }, + { + "epoch": 0.3444, + "grad_norm": 0.3613335383897598, + "learning_rate": 0.00015248438507580806, + "loss": 0.6969, + "step": 861 + }, + { + "epoch": 0.3448, + "grad_norm": 0.35592397239382606, + "learning_rate": 0.00015237406811364682, + "loss": 0.6969, + "step": 862 + }, + { + "epoch": 0.3452, + "grad_norm": 0.39705521362789375, + "learning_rate": 0.0001522636632507504, + "loss": 0.6707, + "step": 863 + }, + { + "epoch": 0.3456, + "grad_norm": 0.36112856011278505, + "learning_rate": 0.00015215317067241414, + "loss": 0.6631, + "step": 864 + }, + { + "epoch": 0.346, + "grad_norm": 0.38273950968861264, + "learning_rate": 0.00015204259056408046, + "loss": 0.667, + "step": 865 + }, + { + "epoch": 0.3464, + "grad_norm": 0.3783694059874637, + "learning_rate": 0.00015193192311133884, + "loss": 0.6663, + "step": 866 + }, + { + "epoch": 0.3468, + "grad_norm": 0.366434677879186, + "learning_rate": 0.00015182116849992526, + "loss": 0.6166, + "step": 867 + }, + { + "epoch": 0.3472, + "grad_norm": 0.3458935991132749, + "learning_rate": 0.00015171032691572206, + "loss": 0.6834, + "step": 868 + }, + { + "epoch": 0.3476, + "grad_norm": 0.36496087348892964, + "learning_rate": 0.00015159939854475743, + "loss": 0.6704, + "step": 869 + }, + { + "epoch": 0.348, + "grad_norm": 0.3760305334945727, + "learning_rate": 0.00015148838357320537, + "loss": 0.6935, + "step": 870 + }, + { + "epoch": 0.3484, + "grad_norm": 0.35850665616931604, + "learning_rate": 0.00015137728218738502, + "loss": 0.6872, + "step": 871 + }, + { + "epoch": 0.3488, + "grad_norm": 0.3928618937253879, + "learning_rate": 0.0001512660945737608, + "loss": 0.6643, + "step": 872 + }, + { + "epoch": 0.3492, + "grad_norm": 0.35046941936035353, + "learning_rate": 0.00015115482091894165, + "loss": 0.6654, + "step": 873 + }, + { + "epoch": 0.3496, + "grad_norm": 0.3685208949359589, + "learning_rate": 0.00015104346140968095, + "loss": 0.6759, + "step": 874 + }, + { + "epoch": 0.35, + "grad_norm": 0.34450173958566066, + "learning_rate": 0.00015093201623287631, + "loss": 0.6523, + "step": 875 + }, + { + "epoch": 0.3504, + "grad_norm": 0.36555178856164366, + "learning_rate": 0.00015082048557556893, + "loss": 0.625, + "step": 876 + }, + { + "epoch": 0.3508, + "grad_norm": 0.37279078820571543, + "learning_rate": 0.00015070886962494358, + "loss": 0.6755, + "step": 877 + }, + { + "epoch": 0.3512, + "grad_norm": 0.3467247782349474, + "learning_rate": 0.0001505971685683282, + "loss": 0.636, + "step": 878 + }, + { + "epoch": 0.3516, + "grad_norm": 0.35286283300836774, + "learning_rate": 0.00015048538259319346, + "loss": 0.6466, + "step": 879 + }, + { + "epoch": 0.352, + "grad_norm": 0.3396382068456816, + "learning_rate": 0.00015037351188715265, + "loss": 0.6633, + "step": 880 + }, + { + "epoch": 0.3524, + "grad_norm": 0.35316683921346637, + "learning_rate": 0.00015026155663796123, + "loss": 0.6227, + "step": 881 + }, + { + "epoch": 0.3528, + "grad_norm": 0.3846584919822831, + "learning_rate": 0.00015014951703351653, + "loss": 0.6311, + "step": 882 + }, + { + "epoch": 0.3532, + "grad_norm": 0.3569064084982436, + "learning_rate": 0.00015003739326185751, + "loss": 0.6781, + "step": 883 + }, + { + "epoch": 0.3536, + "grad_norm": 0.3649128002305385, + "learning_rate": 0.00014992518551116434, + "loss": 0.6731, + "step": 884 + }, + { + "epoch": 0.354, + "grad_norm": 0.3723253026564224, + "learning_rate": 0.00014981289396975817, + "loss": 0.6747, + "step": 885 + }, + { + "epoch": 0.3544, + "grad_norm": 0.363202738117168, + "learning_rate": 0.0001497005188261007, + "loss": 0.6734, + "step": 886 + }, + { + "epoch": 0.3548, + "grad_norm": 0.3548792469765704, + "learning_rate": 0.0001495880602687941, + "loss": 0.6462, + "step": 887 + }, + { + "epoch": 0.3552, + "grad_norm": 0.3474115630121927, + "learning_rate": 0.00014947551848658034, + "loss": 0.6577, + "step": 888 + }, + { + "epoch": 0.3556, + "grad_norm": 0.374218002987988, + "learning_rate": 0.00014936289366834123, + "loss": 0.6991, + "step": 889 + }, + { + "epoch": 0.356, + "grad_norm": 0.35089097506675154, + "learning_rate": 0.00014925018600309785, + "loss": 0.6623, + "step": 890 + }, + { + "epoch": 0.3564, + "grad_norm": 0.39265641433945575, + "learning_rate": 0.00014913739568001033, + "loss": 0.6654, + "step": 891 + }, + { + "epoch": 0.3568, + "grad_norm": 0.3540771581716744, + "learning_rate": 0.0001490245228883776, + "loss": 0.6691, + "step": 892 + }, + { + "epoch": 0.3572, + "grad_norm": 0.35062203052174024, + "learning_rate": 0.0001489115678176369, + "loss": 0.617, + "step": 893 + }, + { + "epoch": 0.3576, + "grad_norm": 0.3867056359915103, + "learning_rate": 0.00014879853065736365, + "loss": 0.6904, + "step": 894 + }, + { + "epoch": 0.358, + "grad_norm": 0.3645808187567261, + "learning_rate": 0.00014868541159727096, + "loss": 0.6453, + "step": 895 + }, + { + "epoch": 0.3584, + "grad_norm": 0.3624902031755957, + "learning_rate": 0.00014857221082720948, + "loss": 0.6612, + "step": 896 + }, + { + "epoch": 0.3588, + "grad_norm": 0.3542078534637287, + "learning_rate": 0.0001484589285371669, + "loss": 0.6572, + "step": 897 + }, + { + "epoch": 0.3592, + "grad_norm": 0.3629270925759756, + "learning_rate": 0.0001483455649172678, + "loss": 0.6454, + "step": 898 + }, + { + "epoch": 0.3596, + "grad_norm": 0.36443854767359396, + "learning_rate": 0.0001482321201577733, + "loss": 0.6501, + "step": 899 + }, + { + "epoch": 0.36, + "grad_norm": 0.35841159262345307, + "learning_rate": 0.00014811859444908052, + "loss": 0.674, + "step": 900 + }, + { + "epoch": 0.3604, + "grad_norm": 0.3478107592885514, + "learning_rate": 0.0001480049879817226, + "loss": 0.66, + "step": 901 + }, + { + "epoch": 0.3608, + "grad_norm": 0.34446415507051853, + "learning_rate": 0.0001478913009463682, + "loss": 0.6475, + "step": 902 + }, + { + "epoch": 0.3612, + "grad_norm": 0.3357149083896519, + "learning_rate": 0.00014777753353382119, + "loss": 0.6466, + "step": 903 + }, + { + "epoch": 0.3616, + "grad_norm": 0.3701287329337555, + "learning_rate": 0.00014766368593502026, + "loss": 0.6728, + "step": 904 + }, + { + "epoch": 0.362, + "grad_norm": 0.3652716631266611, + "learning_rate": 0.00014754975834103877, + "loss": 0.6557, + "step": 905 + }, + { + "epoch": 0.3624, + "grad_norm": 0.37214552315597893, + "learning_rate": 0.00014743575094308431, + "loss": 0.6652, + "step": 906 + }, + { + "epoch": 0.3628, + "grad_norm": 0.3555294949666416, + "learning_rate": 0.0001473216639324984, + "loss": 0.6431, + "step": 907 + }, + { + "epoch": 0.3632, + "grad_norm": 0.3521451446522717, + "learning_rate": 0.0001472074975007562, + "loss": 0.6534, + "step": 908 + }, + { + "epoch": 0.3636, + "grad_norm": 0.3476086077345556, + "learning_rate": 0.0001470932518394661, + "loss": 0.637, + "step": 909 + }, + { + "epoch": 0.364, + "grad_norm": 0.37783200576792464, + "learning_rate": 0.00014697892714036958, + "loss": 0.6883, + "step": 910 + }, + { + "epoch": 0.3644, + "grad_norm": 0.3494756499260396, + "learning_rate": 0.00014686452359534066, + "loss": 0.6536, + "step": 911 + }, + { + "epoch": 0.3648, + "grad_norm": 0.3491803081093194, + "learning_rate": 0.0001467500413963857, + "loss": 0.6926, + "step": 912 + }, + { + "epoch": 0.3652, + "grad_norm": 0.3747430627750039, + "learning_rate": 0.00014663548073564316, + "loss": 0.6423, + "step": 913 + }, + { + "epoch": 0.3656, + "grad_norm": 0.42084751004103793, + "learning_rate": 0.00014652084180538302, + "loss": 0.6585, + "step": 914 + }, + { + "epoch": 0.366, + "grad_norm": 0.3557562025796701, + "learning_rate": 0.00014640612479800686, + "loss": 0.6542, + "step": 915 + }, + { + "epoch": 0.3664, + "grad_norm": 0.38388321779715584, + "learning_rate": 0.00014629132990604706, + "loss": 0.6331, + "step": 916 + }, + { + "epoch": 0.3668, + "grad_norm": 0.3848509563350158, + "learning_rate": 0.00014617645732216685, + "loss": 0.6476, + "step": 917 + }, + { + "epoch": 0.3672, + "grad_norm": 0.36490760655892146, + "learning_rate": 0.00014606150723915984, + "loss": 0.6403, + "step": 918 + }, + { + "epoch": 0.3676, + "grad_norm": 0.3947551795151395, + "learning_rate": 0.00014594647984994964, + "loss": 0.6904, + "step": 919 + }, + { + "epoch": 0.368, + "grad_norm": 0.35124246715619073, + "learning_rate": 0.00014583137534758967, + "loss": 0.6447, + "step": 920 + }, + { + "epoch": 0.3684, + "grad_norm": 0.36635027190183794, + "learning_rate": 0.00014571619392526278, + "loss": 0.6458, + "step": 921 + }, + { + "epoch": 0.3688, + "grad_norm": 0.3581306372172641, + "learning_rate": 0.0001456009357762809, + "loss": 0.6528, + "step": 922 + }, + { + "epoch": 0.3692, + "grad_norm": 0.3615759086065034, + "learning_rate": 0.00014548560109408466, + "loss": 0.7031, + "step": 923 + }, + { + "epoch": 0.3696, + "grad_norm": 0.3807900684626195, + "learning_rate": 0.00014537019007224324, + "loss": 0.72, + "step": 924 + }, + { + "epoch": 0.37, + "grad_norm": 0.38231568945229655, + "learning_rate": 0.00014525470290445392, + "loss": 0.6839, + "step": 925 + }, + { + "epoch": 0.3704, + "grad_norm": 0.34273098488248055, + "learning_rate": 0.00014513913978454168, + "loss": 0.6582, + "step": 926 + }, + { + "epoch": 0.3708, + "grad_norm": 0.39393100124395825, + "learning_rate": 0.00014502350090645917, + "loss": 0.6721, + "step": 927 + }, + { + "epoch": 0.3712, + "grad_norm": 0.3400567054502143, + "learning_rate": 0.000144907786464286, + "loss": 0.6406, + "step": 928 + }, + { + "epoch": 0.3716, + "grad_norm": 0.3670211913921609, + "learning_rate": 0.0001447919966522287, + "loss": 0.6818, + "step": 929 + }, + { + "epoch": 0.372, + "grad_norm": 0.35772860406848145, + "learning_rate": 0.00014467613166462023, + "loss": 0.6296, + "step": 930 + }, + { + "epoch": 0.3724, + "grad_norm": 0.3526738213667584, + "learning_rate": 0.00014456019169591978, + "loss": 0.6407, + "step": 931 + }, + { + "epoch": 0.3728, + "grad_norm": 0.34854679382294573, + "learning_rate": 0.0001444441769407124, + "loss": 0.648, + "step": 932 + }, + { + "epoch": 0.3732, + "grad_norm": 0.3621509291016752, + "learning_rate": 0.00014432808759370854, + "loss": 0.7082, + "step": 933 + }, + { + "epoch": 0.3736, + "grad_norm": 0.35524954321215996, + "learning_rate": 0.00014421192384974396, + "loss": 0.6422, + "step": 934 + }, + { + "epoch": 0.374, + "grad_norm": 0.3558940796138577, + "learning_rate": 0.00014409568590377918, + "loss": 0.6626, + "step": 935 + }, + { + "epoch": 0.3744, + "grad_norm": 0.33592802217161094, + "learning_rate": 0.0001439793739508994, + "loss": 0.6347, + "step": 936 + }, + { + "epoch": 0.3748, + "grad_norm": 0.3820145016362385, + "learning_rate": 0.00014386298818631386, + "loss": 0.6434, + "step": 937 + }, + { + "epoch": 0.3752, + "grad_norm": 0.3927205004390802, + "learning_rate": 0.0001437465288053558, + "loss": 0.718, + "step": 938 + }, + { + "epoch": 0.3756, + "grad_norm": 0.3518561450001955, + "learning_rate": 0.00014362999600348196, + "loss": 0.6321, + "step": 939 + }, + { + "epoch": 0.376, + "grad_norm": 0.3627903941056215, + "learning_rate": 0.00014351338997627234, + "loss": 0.6589, + "step": 940 + }, + { + "epoch": 0.3764, + "grad_norm": 0.3419710630379931, + "learning_rate": 0.00014339671091942978, + "loss": 0.6235, + "step": 941 + }, + { + "epoch": 0.3768, + "grad_norm": 0.36879674846754606, + "learning_rate": 0.0001432799590287797, + "loss": 0.6674, + "step": 942 + }, + { + "epoch": 0.3772, + "grad_norm": 0.35366350852981354, + "learning_rate": 0.00014316313450026986, + "loss": 0.6382, + "step": 943 + }, + { + "epoch": 0.3776, + "grad_norm": 0.37921850015556424, + "learning_rate": 0.00014304623752996973, + "loss": 0.6263, + "step": 944 + }, + { + "epoch": 0.378, + "grad_norm": 0.3742507615630018, + "learning_rate": 0.00014292926831407061, + "loss": 0.6481, + "step": 945 + }, + { + "epoch": 0.3784, + "grad_norm": 0.37185581882115953, + "learning_rate": 0.0001428122270488848, + "loss": 0.6749, + "step": 946 + }, + { + "epoch": 0.3788, + "grad_norm": 0.36292397414328936, + "learning_rate": 0.00014269511393084572, + "loss": 0.6045, + "step": 947 + }, + { + "epoch": 0.3792, + "grad_norm": 0.36157941938220134, + "learning_rate": 0.00014257792915650728, + "loss": 0.615, + "step": 948 + }, + { + "epoch": 0.3796, + "grad_norm": 0.38269180434028977, + "learning_rate": 0.00014246067292254366, + "loss": 0.681, + "step": 949 + }, + { + "epoch": 0.38, + "grad_norm": 0.3827510153886556, + "learning_rate": 0.00014234334542574906, + "loss": 0.6877, + "step": 950 + }, + { + "epoch": 0.3804, + "grad_norm": 0.37997491138841605, + "learning_rate": 0.00014222594686303706, + "loss": 0.6867, + "step": 951 + }, + { + "epoch": 0.3808, + "grad_norm": 0.36575592804829177, + "learning_rate": 0.00014210847743144087, + "loss": 0.6839, + "step": 952 + }, + { + "epoch": 0.3812, + "grad_norm": 0.3882126265654071, + "learning_rate": 0.00014199093732811225, + "loss": 0.616, + "step": 953 + }, + { + "epoch": 0.3816, + "grad_norm": 0.3543677696755192, + "learning_rate": 0.00014187332675032188, + "loss": 0.5746, + "step": 954 + }, + { + "epoch": 0.382, + "grad_norm": 0.3460644642182079, + "learning_rate": 0.00014175564589545854, + "loss": 0.676, + "step": 955 + }, + { + "epoch": 0.3824, + "grad_norm": 0.35503051555346704, + "learning_rate": 0.00014163789496102902, + "loss": 0.6153, + "step": 956 + }, + { + "epoch": 0.3828, + "grad_norm": 0.34896213065649373, + "learning_rate": 0.0001415200741446577, + "loss": 0.6904, + "step": 957 + }, + { + "epoch": 0.3832, + "grad_norm": 0.346168689259183, + "learning_rate": 0.00014140218364408632, + "loss": 0.6332, + "step": 958 + }, + { + "epoch": 0.3836, + "grad_norm": 0.3524594723123227, + "learning_rate": 0.00014128422365717347, + "loss": 0.6225, + "step": 959 + }, + { + "epoch": 0.384, + "grad_norm": 0.4070227595189606, + "learning_rate": 0.0001411661943818944, + "loss": 0.6878, + "step": 960 + }, + { + "epoch": 0.3844, + "grad_norm": 0.35668228011669395, + "learning_rate": 0.0001410480960163407, + "loss": 0.704, + "step": 961 + }, + { + "epoch": 0.3848, + "grad_norm": 0.35134169124857884, + "learning_rate": 0.00014092992875871979, + "loss": 0.6662, + "step": 962 + }, + { + "epoch": 0.3852, + "grad_norm": 0.35423691459425, + "learning_rate": 0.00014081169280735488, + "loss": 0.6708, + "step": 963 + }, + { + "epoch": 0.3856, + "grad_norm": 0.35015610010335474, + "learning_rate": 0.00014069338836068433, + "loss": 0.609, + "step": 964 + }, + { + "epoch": 0.386, + "grad_norm": 0.3499702691181545, + "learning_rate": 0.00014057501561726157, + "loss": 0.6391, + "step": 965 + }, + { + "epoch": 0.3864, + "grad_norm": 0.32312770115386513, + "learning_rate": 0.00014045657477575448, + "loss": 0.5978, + "step": 966 + }, + { + "epoch": 0.3868, + "grad_norm": 0.3748104243838437, + "learning_rate": 0.0001403380660349455, + "loss": 0.6848, + "step": 967 + }, + { + "epoch": 0.3872, + "grad_norm": 0.3548359551898651, + "learning_rate": 0.00014021948959373076, + "loss": 0.6492, + "step": 968 + }, + { + "epoch": 0.3876, + "grad_norm": 0.35902687894741403, + "learning_rate": 0.0001401008456511202, + "loss": 0.681, + "step": 969 + }, + { + "epoch": 0.388, + "grad_norm": 0.34628114641184954, + "learning_rate": 0.0001399821344062369, + "loss": 0.6307, + "step": 970 + }, + { + "epoch": 0.3884, + "grad_norm": 0.3444827567722983, + "learning_rate": 0.00013986335605831705, + "loss": 0.6209, + "step": 971 + }, + { + "epoch": 0.3888, + "grad_norm": 0.36743538427244765, + "learning_rate": 0.00013974451080670934, + "loss": 0.6141, + "step": 972 + }, + { + "epoch": 0.3892, + "grad_norm": 0.3935183203217268, + "learning_rate": 0.0001396255988508748, + "loss": 0.6755, + "step": 973 + }, + { + "epoch": 0.3896, + "grad_norm": 0.37448677909376504, + "learning_rate": 0.00013950662039038643, + "loss": 0.6585, + "step": 974 + }, + { + "epoch": 0.39, + "grad_norm": 0.35524468770800793, + "learning_rate": 0.00013938757562492873, + "loss": 0.6477, + "step": 975 + }, + { + "epoch": 0.3904, + "grad_norm": 0.3527436400724316, + "learning_rate": 0.00013926846475429766, + "loss": 0.6934, + "step": 976 + }, + { + "epoch": 0.3908, + "grad_norm": 0.3918038008179953, + "learning_rate": 0.00013914928797839995, + "loss": 0.723, + "step": 977 + }, + { + "epoch": 0.3912, + "grad_norm": 0.34877061301820206, + "learning_rate": 0.0001390300454972531, + "loss": 0.6628, + "step": 978 + }, + { + "epoch": 0.3916, + "grad_norm": 0.3938563882435424, + "learning_rate": 0.0001389107375109848, + "loss": 0.6691, + "step": 979 + }, + { + "epoch": 0.392, + "grad_norm": 0.3526362195360042, + "learning_rate": 0.00013879136421983266, + "loss": 0.6761, + "step": 980 + }, + { + "epoch": 0.3924, + "grad_norm": 0.36384177194514367, + "learning_rate": 0.00013867192582414393, + "loss": 0.6573, + "step": 981 + }, + { + "epoch": 0.3928, + "grad_norm": 0.3483204591210129, + "learning_rate": 0.0001385524225243751, + "loss": 0.6345, + "step": 982 + }, + { + "epoch": 0.3932, + "grad_norm": 0.38026899558829796, + "learning_rate": 0.00013843285452109166, + "loss": 0.6617, + "step": 983 + }, + { + "epoch": 0.3936, + "grad_norm": 0.3779346317714003, + "learning_rate": 0.00013831322201496757, + "loss": 0.6454, + "step": 984 + }, + { + "epoch": 0.394, + "grad_norm": 0.4100717507480477, + "learning_rate": 0.0001381935252067852, + "loss": 0.6689, + "step": 985 + }, + { + "epoch": 0.3944, + "grad_norm": 0.3571062943128852, + "learning_rate": 0.00013807376429743467, + "loss": 0.6299, + "step": 986 + }, + { + "epoch": 0.3948, + "grad_norm": 0.35142189144611696, + "learning_rate": 0.00013795393948791383, + "loss": 0.6647, + "step": 987 + }, + { + "epoch": 0.3952, + "grad_norm": 0.3633902967535515, + "learning_rate": 0.0001378340509793277, + "loss": 0.6273, + "step": 988 + }, + { + "epoch": 0.3956, + "grad_norm": 0.3590667520188268, + "learning_rate": 0.00013771409897288822, + "loss": 0.6305, + "step": 989 + }, + { + "epoch": 0.396, + "grad_norm": 0.3605606122453971, + "learning_rate": 0.0001375940836699139, + "loss": 0.6957, + "step": 990 + }, + { + "epoch": 0.3964, + "grad_norm": 0.35831604074772716, + "learning_rate": 0.00013747400527182953, + "loss": 0.6694, + "step": 991 + }, + { + "epoch": 0.3968, + "grad_norm": 0.36450846842086443, + "learning_rate": 0.0001373538639801657, + "loss": 0.6336, + "step": 992 + }, + { + "epoch": 0.3972, + "grad_norm": 0.3504069378684052, + "learning_rate": 0.0001372336599965586, + "loss": 0.6714, + "step": 993 + }, + { + "epoch": 0.3976, + "grad_norm": 0.3628339670725981, + "learning_rate": 0.00013711339352274966, + "loss": 0.6611, + "step": 994 + }, + { + "epoch": 0.398, + "grad_norm": 0.35354002460595096, + "learning_rate": 0.0001369930647605852, + "loss": 0.6947, + "step": 995 + }, + { + "epoch": 0.3984, + "grad_norm": 0.3480541378031107, + "learning_rate": 0.00013687267391201605, + "loss": 0.6158, + "step": 996 + }, + { + "epoch": 0.3988, + "grad_norm": 0.3640618287141173, + "learning_rate": 0.00013675222117909717, + "loss": 0.6381, + "step": 997 + }, + { + "epoch": 0.3992, + "grad_norm": 0.35634838244330663, + "learning_rate": 0.00013663170676398752, + "loss": 0.6396, + "step": 998 + }, + { + "epoch": 0.3996, + "grad_norm": 0.3506478735382033, + "learning_rate": 0.00013651113086894952, + "loss": 0.6867, + "step": 999 + }, + { + "epoch": 0.4, + "grad_norm": 0.35888751195771684, + "learning_rate": 0.00013639049369634876, + "loss": 0.6703, + "step": 1000 + }, + { + "epoch": 0.4004, + "grad_norm": 0.345483947255706, + "learning_rate": 0.00013626979544865367, + "loss": 0.6409, + "step": 1001 + }, + { + "epoch": 0.4008, + "grad_norm": 0.3840870344565062, + "learning_rate": 0.00013614903632843523, + "loss": 0.7254, + "step": 1002 + }, + { + "epoch": 0.4012, + "grad_norm": 0.34028673798735415, + "learning_rate": 0.00013602821653836654, + "loss": 0.6108, + "step": 1003 + }, + { + "epoch": 0.4016, + "grad_norm": 0.3496563277340895, + "learning_rate": 0.0001359073362812225, + "loss": 0.649, + "step": 1004 + }, + { + "epoch": 0.402, + "grad_norm": 0.3590764777051446, + "learning_rate": 0.00013578639575987958, + "loss": 0.6573, + "step": 1005 + }, + { + "epoch": 0.4024, + "grad_norm": 0.36232084757956284, + "learning_rate": 0.00013566539517731536, + "loss": 0.5995, + "step": 1006 + }, + { + "epoch": 0.4028, + "grad_norm": 0.40296914583508997, + "learning_rate": 0.00013554433473660817, + "loss": 0.6468, + "step": 1007 + }, + { + "epoch": 0.4032, + "grad_norm": 0.34618356579834514, + "learning_rate": 0.0001354232146409368, + "loss": 0.6311, + "step": 1008 + }, + { + "epoch": 0.4036, + "grad_norm": 0.3699522185641742, + "learning_rate": 0.0001353020350935803, + "loss": 0.6515, + "step": 1009 + }, + { + "epoch": 0.404, + "grad_norm": 0.3402055249662885, + "learning_rate": 0.00013518079629791724, + "loss": 0.6117, + "step": 1010 + }, + { + "epoch": 0.4044, + "grad_norm": 0.3520890315363541, + "learning_rate": 0.00013505949845742598, + "loss": 0.6479, + "step": 1011 + }, + { + "epoch": 0.4048, + "grad_norm": 0.3563001708547387, + "learning_rate": 0.00013493814177568364, + "loss": 0.6317, + "step": 1012 + }, + { + "epoch": 0.4052, + "grad_norm": 0.36934129376300595, + "learning_rate": 0.00013481672645636626, + "loss": 0.6336, + "step": 1013 + }, + { + "epoch": 0.4056, + "grad_norm": 0.34467273873477794, + "learning_rate": 0.00013469525270324835, + "loss": 0.6515, + "step": 1014 + }, + { + "epoch": 0.406, + "grad_norm": 0.37520625303944943, + "learning_rate": 0.0001345737207202023, + "loss": 0.6268, + "step": 1015 + }, + { + "epoch": 0.4064, + "grad_norm": 0.37153699344781604, + "learning_rate": 0.0001344521307111984, + "loss": 0.6908, + "step": 1016 + }, + { + "epoch": 0.4068, + "grad_norm": 0.3525054790816169, + "learning_rate": 0.00013433048288030423, + "loss": 0.6686, + "step": 1017 + }, + { + "epoch": 0.4072, + "grad_norm": 0.35007987426870385, + "learning_rate": 0.00013420877743168449, + "loss": 0.6506, + "step": 1018 + }, + { + "epoch": 0.4076, + "grad_norm": 0.33329849886499396, + "learning_rate": 0.0001340870145696005, + "loss": 0.588, + "step": 1019 + }, + { + "epoch": 0.408, + "grad_norm": 0.3534071124831478, + "learning_rate": 0.00013396519449841005, + "loss": 0.6546, + "step": 1020 + }, + { + "epoch": 0.4084, + "grad_norm": 0.3583732560061697, + "learning_rate": 0.0001338433174225668, + "loss": 0.6202, + "step": 1021 + }, + { + "epoch": 0.4088, + "grad_norm": 0.3508190351005674, + "learning_rate": 0.0001337213835466202, + "loss": 0.6763, + "step": 1022 + }, + { + "epoch": 0.4092, + "grad_norm": 0.37536449187169085, + "learning_rate": 0.00013359939307521493, + "loss": 0.6846, + "step": 1023 + }, + { + "epoch": 0.4096, + "grad_norm": 0.3687071147005669, + "learning_rate": 0.00013347734621309076, + "loss": 0.6739, + "step": 1024 + }, + { + "epoch": 0.41, + "grad_norm": 0.3634034898624675, + "learning_rate": 0.00013335524316508208, + "loss": 0.6288, + "step": 1025 + }, + { + "epoch": 0.4104, + "grad_norm": 0.3732640277119394, + "learning_rate": 0.00013323308413611747, + "loss": 0.6606, + "step": 1026 + }, + { + "epoch": 0.4108, + "grad_norm": 0.351890195894751, + "learning_rate": 0.00013311086933121962, + "loss": 0.6635, + "step": 1027 + }, + { + "epoch": 0.4112, + "grad_norm": 0.39339991608869956, + "learning_rate": 0.00013298859895550472, + "loss": 0.6426, + "step": 1028 + }, + { + "epoch": 0.4116, + "grad_norm": 0.3302126076186118, + "learning_rate": 0.00013286627321418227, + "loss": 0.64, + "step": 1029 + }, + { + "epoch": 0.412, + "grad_norm": 0.33616046205245464, + "learning_rate": 0.00013274389231255466, + "loss": 0.6597, + "step": 1030 + }, + { + "epoch": 0.4124, + "grad_norm": 0.6888957360247192, + "learning_rate": 0.00013262145645601692, + "loss": 0.6904, + "step": 1031 + }, + { + "epoch": 0.4128, + "grad_norm": 0.3392597187306241, + "learning_rate": 0.00013249896585005628, + "loss": 0.6291, + "step": 1032 + }, + { + "epoch": 0.4132, + "grad_norm": 0.3592003259259342, + "learning_rate": 0.00013237642070025184, + "loss": 0.6974, + "step": 1033 + }, + { + "epoch": 0.4136, + "grad_norm": 0.4153925706260768, + "learning_rate": 0.0001322538212122742, + "loss": 0.6899, + "step": 1034 + }, + { + "epoch": 0.414, + "grad_norm": 0.37095096601650834, + "learning_rate": 0.00013213116759188523, + "loss": 0.6659, + "step": 1035 + }, + { + "epoch": 0.4144, + "grad_norm": 0.3592106483257176, + "learning_rate": 0.0001320084600449377, + "loss": 0.6997, + "step": 1036 + }, + { + "epoch": 0.4148, + "grad_norm": 0.36046345046306594, + "learning_rate": 0.00013188569877737474, + "loss": 0.6265, + "step": 1037 + }, + { + "epoch": 0.4152, + "grad_norm": 0.3613241628633839, + "learning_rate": 0.00013176288399522975, + "loss": 0.6563, + "step": 1038 + }, + { + "epoch": 0.4156, + "grad_norm": 0.3557392478888832, + "learning_rate": 0.0001316400159046259, + "loss": 0.6328, + "step": 1039 + }, + { + "epoch": 0.416, + "grad_norm": 0.36421798184827514, + "learning_rate": 0.00013151709471177588, + "loss": 0.6568, + "step": 1040 + }, + { + "epoch": 0.4164, + "grad_norm": 0.3449672427354774, + "learning_rate": 0.0001313941206229814, + "loss": 0.6471, + "step": 1041 + }, + { + "epoch": 0.4168, + "grad_norm": 0.5251559097727022, + "learning_rate": 0.0001312710938446331, + "loss": 0.6507, + "step": 1042 + }, + { + "epoch": 0.4172, + "grad_norm": 0.3860052644841409, + "learning_rate": 0.00013114801458320987, + "loss": 0.6842, + "step": 1043 + }, + { + "epoch": 0.4176, + "grad_norm": 0.8066263740122651, + "learning_rate": 0.0001310248830452788, + "loss": 0.6253, + "step": 1044 + }, + { + "epoch": 0.418, + "grad_norm": 0.35514986303183, + "learning_rate": 0.00013090169943749476, + "loss": 0.6209, + "step": 1045 + }, + { + "epoch": 0.4184, + "grad_norm": 0.36456989649396787, + "learning_rate": 0.00013077846396659985, + "loss": 0.6437, + "step": 1046 + }, + { + "epoch": 0.4188, + "grad_norm": 0.36435993468594363, + "learning_rate": 0.0001306551768394234, + "loss": 0.6373, + "step": 1047 + }, + { + "epoch": 0.4192, + "grad_norm": 0.35898161031201, + "learning_rate": 0.00013053183826288123, + "loss": 0.6257, + "step": 1048 + }, + { + "epoch": 0.4196, + "grad_norm": 0.34399594238893383, + "learning_rate": 0.00013040844844397574, + "loss": 0.6376, + "step": 1049 + }, + { + "epoch": 0.42, + "grad_norm": 0.3639089445483288, + "learning_rate": 0.00013028500758979506, + "loss": 0.6595, + "step": 1050 + }, + { + "epoch": 0.4204, + "grad_norm": 0.3430020351995961, + "learning_rate": 0.0001301615159075133, + "loss": 0.6597, + "step": 1051 + }, + { + "epoch": 0.4208, + "grad_norm": 0.38058307582533973, + "learning_rate": 0.0001300379736043896, + "loss": 0.6555, + "step": 1052 + }, + { + "epoch": 0.4212, + "grad_norm": 0.3546300742504651, + "learning_rate": 0.00012991438088776817, + "loss": 0.6224, + "step": 1053 + }, + { + "epoch": 0.4216, + "grad_norm": 0.3525371684642542, + "learning_rate": 0.00012979073796507787, + "loss": 0.6439, + "step": 1054 + }, + { + "epoch": 0.422, + "grad_norm": 0.3574218454915876, + "learning_rate": 0.00012966704504383168, + "loss": 0.6391, + "step": 1055 + }, + { + "epoch": 0.4224, + "grad_norm": 0.3383106126426222, + "learning_rate": 0.00012954330233162667, + "loss": 0.63, + "step": 1056 + }, + { + "epoch": 0.4228, + "grad_norm": 0.35807892428638477, + "learning_rate": 0.00012941951003614337, + "loss": 0.6249, + "step": 1057 + }, + { + "epoch": 0.4232, + "grad_norm": 0.3444483156769968, + "learning_rate": 0.00012929566836514554, + "loss": 0.6247, + "step": 1058 + }, + { + "epoch": 0.4236, + "grad_norm": 0.35707124361746045, + "learning_rate": 0.0001291717775264798, + "loss": 0.6771, + "step": 1059 + }, + { + "epoch": 0.424, + "grad_norm": 0.3590898231704314, + "learning_rate": 0.00012904783772807533, + "loss": 0.6532, + "step": 1060 + }, + { + "epoch": 0.4244, + "grad_norm": 0.3637411663002567, + "learning_rate": 0.00012892384917794346, + "loss": 0.6753, + "step": 1061 + }, + { + "epoch": 0.4248, + "grad_norm": 0.3461421346562393, + "learning_rate": 0.00012879981208417735, + "loss": 0.6455, + "step": 1062 + }, + { + "epoch": 0.4252, + "grad_norm": 0.35897628205140314, + "learning_rate": 0.00012867572665495157, + "loss": 0.6467, + "step": 1063 + }, + { + "epoch": 0.4256, + "grad_norm": 0.34729957726092625, + "learning_rate": 0.0001285515930985219, + "loss": 0.6485, + "step": 1064 + }, + { + "epoch": 0.426, + "grad_norm": 0.3471680462876662, + "learning_rate": 0.00012842741162322487, + "loss": 0.6186, + "step": 1065 + }, + { + "epoch": 0.4264, + "grad_norm": 0.4170020999870481, + "learning_rate": 0.00012830318243747736, + "loss": 0.633, + "step": 1066 + }, + { + "epoch": 0.4268, + "grad_norm": 0.3527236798223839, + "learning_rate": 0.00012817890574977646, + "loss": 0.6883, + "step": 1067 + }, + { + "epoch": 0.4272, + "grad_norm": 0.37101012646371456, + "learning_rate": 0.00012805458176869884, + "loss": 0.6815, + "step": 1068 + }, + { + "epoch": 0.4276, + "grad_norm": 0.36168116105218867, + "learning_rate": 0.00012793021070290066, + "loss": 0.7068, + "step": 1069 + }, + { + "epoch": 0.428, + "grad_norm": 0.3580616421424723, + "learning_rate": 0.00012780579276111702, + "loss": 0.6534, + "step": 1070 + }, + { + "epoch": 0.4284, + "grad_norm": 0.34562766642889714, + "learning_rate": 0.00012768132815216173, + "loss": 0.6857, + "step": 1071 + }, + { + "epoch": 0.4288, + "grad_norm": 0.36003250403947595, + "learning_rate": 0.00012755681708492695, + "loss": 0.632, + "step": 1072 + }, + { + "epoch": 0.4292, + "grad_norm": 0.3604809695713053, + "learning_rate": 0.00012743225976838274, + "loss": 0.6744, + "step": 1073 + }, + { + "epoch": 0.4296, + "grad_norm": 0.3538437373444088, + "learning_rate": 0.0001273076564115769, + "loss": 0.6561, + "step": 1074 + }, + { + "epoch": 0.43, + "grad_norm": 0.35581085943841284, + "learning_rate": 0.0001271830072236343, + "loss": 0.672, + "step": 1075 + }, + { + "epoch": 0.4304, + "grad_norm": 0.32045712249396435, + "learning_rate": 0.00012705831241375694, + "loss": 0.6231, + "step": 1076 + }, + { + "epoch": 0.4308, + "grad_norm": 0.34139655052392703, + "learning_rate": 0.0001269335721912233, + "loss": 0.677, + "step": 1077 + }, + { + "epoch": 0.4312, + "grad_norm": 0.3643977216109273, + "learning_rate": 0.00012680878676538804, + "loss": 0.6689, + "step": 1078 + }, + { + "epoch": 0.4316, + "grad_norm": 0.3660374205847108, + "learning_rate": 0.00012668395634568176, + "loss": 0.6489, + "step": 1079 + }, + { + "epoch": 0.432, + "grad_norm": 0.3368196263255544, + "learning_rate": 0.0001265590811416105, + "loss": 0.6129, + "step": 1080 + }, + { + "epoch": 0.4324, + "grad_norm": 0.3657687843961985, + "learning_rate": 0.00012643416136275557, + "loss": 0.6401, + "step": 1081 + }, + { + "epoch": 0.4328, + "grad_norm": 0.3448187133801033, + "learning_rate": 0.00012630919721877298, + "loss": 0.6328, + "step": 1082 + }, + { + "epoch": 0.4332, + "grad_norm": 0.35717875019015266, + "learning_rate": 0.0001261841889193932, + "loss": 0.6559, + "step": 1083 + }, + { + "epoch": 0.4336, + "grad_norm": 0.3685023753029261, + "learning_rate": 0.00012605913667442095, + "loss": 0.6799, + "step": 1084 + }, + { + "epoch": 0.434, + "grad_norm": 0.36087411813898496, + "learning_rate": 0.0001259340406937345, + "loss": 0.6428, + "step": 1085 + }, + { + "epoch": 0.4344, + "grad_norm": 0.37381189014081895, + "learning_rate": 0.00012580890118728572, + "loss": 0.6423, + "step": 1086 + }, + { + "epoch": 0.4348, + "grad_norm": 0.351860060042875, + "learning_rate": 0.00012568371836509936, + "loss": 0.6334, + "step": 1087 + }, + { + "epoch": 0.4352, + "grad_norm": 0.31926815801438274, + "learning_rate": 0.00012555849243727299, + "loss": 0.615, + "step": 1088 + }, + { + "epoch": 0.4356, + "grad_norm": 0.3232317581728528, + "learning_rate": 0.00012543322361397647, + "loss": 0.6696, + "step": 1089 + }, + { + "epoch": 0.436, + "grad_norm": 0.3420996415534882, + "learning_rate": 0.00012530791210545162, + "loss": 0.6164, + "step": 1090 + }, + { + "epoch": 0.4364, + "grad_norm": 0.38207290025601376, + "learning_rate": 0.00012518255812201203, + "loss": 0.6368, + "step": 1091 + }, + { + "epoch": 0.4368, + "grad_norm": 0.33912036701999765, + "learning_rate": 0.00012505716187404241, + "loss": 0.6518, + "step": 1092 + }, + { + "epoch": 0.4372, + "grad_norm": 0.36194506538681326, + "learning_rate": 0.00012493172357199857, + "loss": 0.698, + "step": 1093 + }, + { + "epoch": 0.4376, + "grad_norm": 0.34036246217276134, + "learning_rate": 0.00012480624342640673, + "loss": 0.6231, + "step": 1094 + }, + { + "epoch": 0.438, + "grad_norm": 0.3343319436104641, + "learning_rate": 0.0001246807216478634, + "loss": 0.6036, + "step": 1095 + }, + { + "epoch": 0.4384, + "grad_norm": 0.35032259157872203, + "learning_rate": 0.0001245551584470351, + "loss": 0.6167, + "step": 1096 + }, + { + "epoch": 0.4388, + "grad_norm": 0.4146810033641503, + "learning_rate": 0.00012442955403465768, + "loss": 0.6654, + "step": 1097 + }, + { + "epoch": 0.4392, + "grad_norm": 0.35832129820145203, + "learning_rate": 0.00012430390862153625, + "loss": 0.6339, + "step": 1098 + }, + { + "epoch": 0.4396, + "grad_norm": 0.358377151169997, + "learning_rate": 0.00012417822241854467, + "loss": 0.6366, + "step": 1099 + }, + { + "epoch": 0.44, + "grad_norm": 0.36373121113996665, + "learning_rate": 0.00012405249563662537, + "loss": 0.6565, + "step": 1100 + }, + { + "epoch": 0.4404, + "grad_norm": 0.3594541755442927, + "learning_rate": 0.00012392672848678877, + "loss": 0.6997, + "step": 1101 + }, + { + "epoch": 0.4408, + "grad_norm": 0.3629727624468758, + "learning_rate": 0.0001238009211801131, + "loss": 0.7129, + "step": 1102 + }, + { + "epoch": 0.4412, + "grad_norm": 0.3350770583209822, + "learning_rate": 0.00012367507392774398, + "loss": 0.6296, + "step": 1103 + }, + { + "epoch": 0.4416, + "grad_norm": 0.35016548464672453, + "learning_rate": 0.00012354918694089406, + "loss": 0.6335, + "step": 1104 + }, + { + "epoch": 0.442, + "grad_norm": 0.3519380630599524, + "learning_rate": 0.00012342326043084266, + "loss": 0.6988, + "step": 1105 + }, + { + "epoch": 0.4424, + "grad_norm": 0.35177244271116015, + "learning_rate": 0.00012329729460893552, + "loss": 0.6384, + "step": 1106 + }, + { + "epoch": 0.4428, + "grad_norm": 0.35467429467861333, + "learning_rate": 0.00012317128968658425, + "loss": 0.6653, + "step": 1107 + }, + { + "epoch": 0.4432, + "grad_norm": 0.3589107500094258, + "learning_rate": 0.0001230452458752661, + "loss": 0.6427, + "step": 1108 + }, + { + "epoch": 0.4436, + "grad_norm": 0.34188766267721576, + "learning_rate": 0.00012291916338652364, + "loss": 0.6229, + "step": 1109 + }, + { + "epoch": 0.444, + "grad_norm": 0.3337032531326285, + "learning_rate": 0.00012279304243196436, + "loss": 0.6307, + "step": 1110 + }, + { + "epoch": 0.4444, + "grad_norm": 0.3375032722495382, + "learning_rate": 0.00012266688322326024, + "loss": 0.6475, + "step": 1111 + }, + { + "epoch": 0.4448, + "grad_norm": 0.3611670117129286, + "learning_rate": 0.0001225406859721475, + "loss": 0.65, + "step": 1112 + }, + { + "epoch": 0.4452, + "grad_norm": 0.3616691548735018, + "learning_rate": 0.00012241445089042623, + "loss": 0.6473, + "step": 1113 + }, + { + "epoch": 0.4456, + "grad_norm": 0.37326942100056826, + "learning_rate": 0.00012228817818995996, + "loss": 0.6276, + "step": 1114 + }, + { + "epoch": 0.446, + "grad_norm": 0.3482463645852628, + "learning_rate": 0.00012216186808267546, + "loss": 0.6567, + "step": 1115 + }, + { + "epoch": 0.4464, + "grad_norm": 0.3525700826952431, + "learning_rate": 0.00012203552078056209, + "loss": 0.6498, + "step": 1116 + }, + { + "epoch": 0.4468, + "grad_norm": 0.37206347910603454, + "learning_rate": 0.00012190913649567184, + "loss": 0.6466, + "step": 1117 + }, + { + "epoch": 0.4472, + "grad_norm": 0.36237934031288455, + "learning_rate": 0.00012178271544011863, + "loss": 0.6255, + "step": 1118 + }, + { + "epoch": 0.4476, + "grad_norm": 0.3515339387422734, + "learning_rate": 0.00012165625782607817, + "loss": 0.5916, + "step": 1119 + }, + { + "epoch": 0.448, + "grad_norm": 0.3689400504529256, + "learning_rate": 0.0001215297638657875, + "loss": 0.6404, + "step": 1120 + }, + { + "epoch": 0.4484, + "grad_norm": 0.34157562776012124, + "learning_rate": 0.00012140323377154466, + "loss": 0.6106, + "step": 1121 + }, + { + "epoch": 0.4488, + "grad_norm": 0.368410902246951, + "learning_rate": 0.00012127666775570836, + "loss": 0.6499, + "step": 1122 + }, + { + "epoch": 0.4492, + "grad_norm": 0.3539844358616864, + "learning_rate": 0.0001211500660306975, + "loss": 0.6585, + "step": 1123 + }, + { + "epoch": 0.4496, + "grad_norm": 0.3673415854540668, + "learning_rate": 0.00012102342880899109, + "loss": 0.7017, + "step": 1124 + }, + { + "epoch": 0.45, + "grad_norm": 0.3518001322298715, + "learning_rate": 0.00012089675630312754, + "loss": 0.6741, + "step": 1125 + }, + { + "epoch": 0.4504, + "grad_norm": 0.3360602890643158, + "learning_rate": 0.00012077004872570454, + "loss": 0.5728, + "step": 1126 + }, + { + "epoch": 0.4508, + "grad_norm": 0.3458319940972609, + "learning_rate": 0.0001206433062893787, + "loss": 0.6127, + "step": 1127 + }, + { + "epoch": 0.4512, + "grad_norm": 0.3713090287879673, + "learning_rate": 0.00012051652920686505, + "loss": 0.6804, + "step": 1128 + }, + { + "epoch": 0.4516, + "grad_norm": 0.38617354952982746, + "learning_rate": 0.00012038971769093686, + "loss": 0.6677, + "step": 1129 + }, + { + "epoch": 0.452, + "grad_norm": 0.3524726737540465, + "learning_rate": 0.00012026287195442503, + "loss": 0.6201, + "step": 1130 + }, + { + "epoch": 0.4524, + "grad_norm": 0.3484102447747805, + "learning_rate": 0.0001201359922102181, + "loss": 0.6489, + "step": 1131 + }, + { + "epoch": 0.4528, + "grad_norm": 0.33740052898910877, + "learning_rate": 0.0001200090786712615, + "loss": 0.6275, + "step": 1132 + }, + { + "epoch": 0.4532, + "grad_norm": 0.33438727982873373, + "learning_rate": 0.00011988213155055754, + "loss": 0.6378, + "step": 1133 + }, + { + "epoch": 0.4536, + "grad_norm": 0.3631159101354578, + "learning_rate": 0.00011975515106116472, + "loss": 0.6554, + "step": 1134 + }, + { + "epoch": 0.454, + "grad_norm": 0.33194697447543714, + "learning_rate": 0.00011962813741619777, + "loss": 0.6633, + "step": 1135 + }, + { + "epoch": 0.4544, + "grad_norm": 0.36456216988924117, + "learning_rate": 0.00011950109082882681, + "loss": 0.6077, + "step": 1136 + }, + { + "epoch": 0.4548, + "grad_norm": 0.3834217067690752, + "learning_rate": 0.0001193740115122774, + "loss": 0.6044, + "step": 1137 + }, + { + "epoch": 0.4552, + "grad_norm": 0.35361345763259006, + "learning_rate": 0.00011924689967983006, + "loss": 0.6219, + "step": 1138 + }, + { + "epoch": 0.4556, + "grad_norm": 0.34553967278756625, + "learning_rate": 0.00011911975554481971, + "loss": 0.638, + "step": 1139 + }, + { + "epoch": 0.456, + "grad_norm": 0.3836534901465859, + "learning_rate": 0.0001189925793206357, + "loss": 0.6969, + "step": 1140 + }, + { + "epoch": 0.4564, + "grad_norm": 0.3638036872842273, + "learning_rate": 0.00011886537122072105, + "loss": 0.6725, + "step": 1141 + }, + { + "epoch": 0.4568, + "grad_norm": 0.39978315045217133, + "learning_rate": 0.00011873813145857249, + "loss": 0.6513, + "step": 1142 + }, + { + "epoch": 0.4572, + "grad_norm": 0.3806959135931955, + "learning_rate": 0.00011861086024773962, + "loss": 0.679, + "step": 1143 + }, + { + "epoch": 0.4576, + "grad_norm": 0.35341600190389044, + "learning_rate": 0.000118483557801825, + "loss": 0.6578, + "step": 1144 + }, + { + "epoch": 0.458, + "grad_norm": 0.3563699363910567, + "learning_rate": 0.00011835622433448361, + "loss": 0.6471, + "step": 1145 + }, + { + "epoch": 0.4584, + "grad_norm": 0.33067674651689005, + "learning_rate": 0.00011822886005942244, + "loss": 0.6387, + "step": 1146 + }, + { + "epoch": 0.4588, + "grad_norm": 0.35904413352647985, + "learning_rate": 0.00011810146519040021, + "loss": 0.6727, + "step": 1147 + }, + { + "epoch": 0.4592, + "grad_norm": 0.6630529372769212, + "learning_rate": 0.00011797403994122698, + "loss": 0.6404, + "step": 1148 + }, + { + "epoch": 0.4596, + "grad_norm": 0.32328732899515183, + "learning_rate": 0.00011784658452576378, + "loss": 0.6218, + "step": 1149 + }, + { + "epoch": 0.46, + "grad_norm": 0.3674146730299686, + "learning_rate": 0.0001177190991579223, + "loss": 0.6411, + "step": 1150 + }, + { + "epoch": 0.4604, + "grad_norm": 0.3464141037597477, + "learning_rate": 0.00011759158405166446, + "loss": 0.6021, + "step": 1151 + }, + { + "epoch": 0.4608, + "grad_norm": 0.3462192086231998, + "learning_rate": 0.00011746403942100215, + "loss": 0.6465, + "step": 1152 + }, + { + "epoch": 0.4612, + "grad_norm": 0.3373173125544461, + "learning_rate": 0.00011733646547999677, + "loss": 0.5826, + "step": 1153 + }, + { + "epoch": 0.4616, + "grad_norm": 0.35866904219362455, + "learning_rate": 0.00011720886244275893, + "loss": 0.639, + "step": 1154 + }, + { + "epoch": 0.462, + "grad_norm": 0.3731033563035224, + "learning_rate": 0.00011708123052344804, + "loss": 0.613, + "step": 1155 + }, + { + "epoch": 0.4624, + "grad_norm": 0.3693917713517358, + "learning_rate": 0.00011695356993627202, + "loss": 0.6582, + "step": 1156 + }, + { + "epoch": 0.4628, + "grad_norm": 0.3527565389122925, + "learning_rate": 0.00011682588089548692, + "loss": 0.643, + "step": 1157 + }, + { + "epoch": 0.4632, + "grad_norm": 0.4065757328066187, + "learning_rate": 0.00011669816361539647, + "loss": 0.6243, + "step": 1158 + }, + { + "epoch": 0.4636, + "grad_norm": 0.3401808120681071, + "learning_rate": 0.00011657041831035184, + "loss": 0.6475, + "step": 1159 + }, + { + "epoch": 0.464, + "grad_norm": 0.33491804020140525, + "learning_rate": 0.0001164426451947513, + "loss": 0.5667, + "step": 1160 + }, + { + "epoch": 0.4644, + "grad_norm": 0.3509247945129264, + "learning_rate": 0.00011631484448303965, + "loss": 0.6602, + "step": 1161 + }, + { + "epoch": 0.4648, + "grad_norm": 0.3448036143304661, + "learning_rate": 0.00011618701638970814, + "loss": 0.6396, + "step": 1162 + }, + { + "epoch": 0.4652, + "grad_norm": 0.35217793454086344, + "learning_rate": 0.00011605916112929388, + "loss": 0.6358, + "step": 1163 + }, + { + "epoch": 0.4656, + "grad_norm": 0.4157376625846788, + "learning_rate": 0.00011593127891637967, + "loss": 0.6287, + "step": 1164 + }, + { + "epoch": 0.466, + "grad_norm": 0.3518045641667168, + "learning_rate": 0.00011580336996559343, + "loss": 0.6457, + "step": 1165 + }, + { + "epoch": 0.4664, + "grad_norm": 0.3851283507513587, + "learning_rate": 0.00011567543449160809, + "loss": 0.6561, + "step": 1166 + }, + { + "epoch": 0.4668, + "grad_norm": 0.3467525558653285, + "learning_rate": 0.00011554747270914097, + "loss": 0.6543, + "step": 1167 + }, + { + "epoch": 0.4672, + "grad_norm": 0.36874210413197406, + "learning_rate": 0.00011541948483295357, + "loss": 0.6428, + "step": 1168 + }, + { + "epoch": 0.4676, + "grad_norm": 0.35625172641639313, + "learning_rate": 0.00011529147107785128, + "loss": 0.6327, + "step": 1169 + }, + { + "epoch": 0.468, + "grad_norm": 0.3391724767182548, + "learning_rate": 0.00011516343165868279, + "loss": 0.6185, + "step": 1170 + }, + { + "epoch": 0.4684, + "grad_norm": 0.3732951354423005, + "learning_rate": 0.00011503536679033999, + "loss": 0.6607, + "step": 1171 + }, + { + "epoch": 0.4688, + "grad_norm": 0.33778442308236173, + "learning_rate": 0.00011490727668775733, + "loss": 0.6351, + "step": 1172 + }, + { + "epoch": 0.4692, + "grad_norm": 0.3597063141045392, + "learning_rate": 0.00011477916156591179, + "loss": 0.6181, + "step": 1173 + }, + { + "epoch": 0.4696, + "grad_norm": 0.36179807923506846, + "learning_rate": 0.00011465102163982217, + "loss": 0.6746, + "step": 1174 + }, + { + "epoch": 0.47, + "grad_norm": 0.3330308143333079, + "learning_rate": 0.00011452285712454904, + "loss": 0.5767, + "step": 1175 + }, + { + "epoch": 0.4704, + "grad_norm": 0.35813905476401037, + "learning_rate": 0.00011439466823519414, + "loss": 0.6646, + "step": 1176 + }, + { + "epoch": 0.4708, + "grad_norm": 0.3450569829700074, + "learning_rate": 0.00011426645518690016, + "loss": 0.6575, + "step": 1177 + }, + { + "epoch": 0.4712, + "grad_norm": 0.4133442723018541, + "learning_rate": 0.00011413821819485035, + "loss": 0.6522, + "step": 1178 + }, + { + "epoch": 0.4716, + "grad_norm": 0.34402182080967436, + "learning_rate": 0.00011400995747426811, + "loss": 0.5987, + "step": 1179 + }, + { + "epoch": 0.472, + "grad_norm": 0.35033698500174304, + "learning_rate": 0.00011388167324041669, + "loss": 0.6686, + "step": 1180 + }, + { + "epoch": 0.4724, + "grad_norm": 0.36188935714893783, + "learning_rate": 0.00011375336570859876, + "loss": 0.6262, + "step": 1181 + }, + { + "epoch": 0.4728, + "grad_norm": 0.3719189511733754, + "learning_rate": 0.00011362503509415619, + "loss": 0.6258, + "step": 1182 + }, + { + "epoch": 0.4732, + "grad_norm": 0.3569183083026708, + "learning_rate": 0.00011349668161246944, + "loss": 0.6232, + "step": 1183 + }, + { + "epoch": 0.4736, + "grad_norm": 0.34895691234867987, + "learning_rate": 0.00011336830547895752, + "loss": 0.6486, + "step": 1184 + }, + { + "epoch": 0.474, + "grad_norm": 0.33847296120051706, + "learning_rate": 0.00011323990690907733, + "loss": 0.6376, + "step": 1185 + }, + { + "epoch": 0.4744, + "grad_norm": 0.3564539231903446, + "learning_rate": 0.00011311148611832345, + "loss": 0.6851, + "step": 1186 + }, + { + "epoch": 0.4748, + "grad_norm": 0.37641134678983584, + "learning_rate": 0.0001129830433222278, + "loss": 0.6495, + "step": 1187 + }, + { + "epoch": 0.4752, + "grad_norm": 0.3483141808653871, + "learning_rate": 0.00011285457873635921, + "loss": 0.642, + "step": 1188 + }, + { + "epoch": 0.4756, + "grad_norm": 0.3628071073948621, + "learning_rate": 0.00011272609257632305, + "loss": 0.6341, + "step": 1189 + }, + { + "epoch": 0.476, + "grad_norm": 0.3565896734004511, + "learning_rate": 0.00011259758505776092, + "loss": 0.6375, + "step": 1190 + }, + { + "epoch": 0.4764, + "grad_norm": 0.34875490237998463, + "learning_rate": 0.00011246905639635029, + "loss": 0.6194, + "step": 1191 + }, + { + "epoch": 0.4768, + "grad_norm": 0.3528192238282489, + "learning_rate": 0.00011234050680780406, + "loss": 0.622, + "step": 1192 + }, + { + "epoch": 0.4772, + "grad_norm": 0.33711625222077923, + "learning_rate": 0.00011221193650787032, + "loss": 0.6242, + "step": 1193 + }, + { + "epoch": 0.4776, + "grad_norm": 0.36364403618449903, + "learning_rate": 0.00011208334571233185, + "loss": 0.6329, + "step": 1194 + }, + { + "epoch": 0.478, + "grad_norm": 0.3458245750874432, + "learning_rate": 0.0001119547346370059, + "loss": 0.6953, + "step": 1195 + }, + { + "epoch": 0.4784, + "grad_norm": 0.35316328695100874, + "learning_rate": 0.0001118261034977437, + "loss": 0.6158, + "step": 1196 + }, + { + "epoch": 0.4788, + "grad_norm": 0.3682412673076238, + "learning_rate": 0.00011169745251043021, + "loss": 0.7115, + "step": 1197 + }, + { + "epoch": 0.4792, + "grad_norm": 0.33913215904512334, + "learning_rate": 0.00011156878189098356, + "loss": 0.6328, + "step": 1198 + }, + { + "epoch": 0.4796, + "grad_norm": 0.34887865709752547, + "learning_rate": 0.00011144009185535509, + "loss": 0.6243, + "step": 1199 + }, + { + "epoch": 0.48, + "grad_norm": 0.3663865307807514, + "learning_rate": 0.00011131138261952845, + "loss": 0.6191, + "step": 1200 + }, + { + "epoch": 0.4804, + "grad_norm": 0.3370232501645842, + "learning_rate": 0.00011118265439951967, + "loss": 0.5982, + "step": 1201 + }, + { + "epoch": 0.4808, + "grad_norm": 0.35896060343046443, + "learning_rate": 0.0001110539074113766, + "loss": 0.6726, + "step": 1202 + }, + { + "epoch": 0.4812, + "grad_norm": 0.3301155377914508, + "learning_rate": 0.00011092514187117864, + "loss": 0.6175, + "step": 1203 + }, + { + "epoch": 0.4816, + "grad_norm": 0.3419130759760223, + "learning_rate": 0.00011079635799503624, + "loss": 0.6229, + "step": 1204 + }, + { + "epoch": 0.482, + "grad_norm": 0.3548958405548371, + "learning_rate": 0.00011066755599909064, + "loss": 0.6612, + "step": 1205 + }, + { + "epoch": 0.4824, + "grad_norm": 0.33624645660686237, + "learning_rate": 0.00011053873609951362, + "loss": 0.652, + "step": 1206 + }, + { + "epoch": 0.4828, + "grad_norm": 0.3594937632014105, + "learning_rate": 0.00011040989851250678, + "loss": 0.6674, + "step": 1207 + }, + { + "epoch": 0.4832, + "grad_norm": 0.3513060841937797, + "learning_rate": 0.00011028104345430161, + "loss": 0.6305, + "step": 1208 + }, + { + "epoch": 0.4836, + "grad_norm": 0.34984900178680145, + "learning_rate": 0.00011015217114115883, + "loss": 0.6737, + "step": 1209 + }, + { + "epoch": 0.484, + "grad_norm": 0.34433824279802844, + "learning_rate": 0.00011002328178936811, + "loss": 0.636, + "step": 1210 + }, + { + "epoch": 0.4844, + "grad_norm": 0.35542548151075953, + "learning_rate": 0.00010989437561524776, + "loss": 0.6236, + "step": 1211 + }, + { + "epoch": 0.4848, + "grad_norm": 0.33325446198619896, + "learning_rate": 0.0001097654528351443, + "loss": 0.5909, + "step": 1212 + }, + { + "epoch": 0.4852, + "grad_norm": 0.3737512950007258, + "learning_rate": 0.00010963651366543213, + "loss": 0.615, + "step": 1213 + }, + { + "epoch": 0.4856, + "grad_norm": 0.34473431451785225, + "learning_rate": 0.0001095075583225131, + "loss": 0.6144, + "step": 1214 + }, + { + "epoch": 0.486, + "grad_norm": 0.36014593259355815, + "learning_rate": 0.00010937858702281631, + "loss": 0.6516, + "step": 1215 + }, + { + "epoch": 0.4864, + "grad_norm": 0.3862404785487015, + "learning_rate": 0.00010924959998279753, + "loss": 0.6791, + "step": 1216 + }, + { + "epoch": 0.4868, + "grad_norm": 0.3625573244343477, + "learning_rate": 0.00010912059741893908, + "loss": 0.6865, + "step": 1217 + }, + { + "epoch": 0.4872, + "grad_norm": 0.3289197732318271, + "learning_rate": 0.00010899157954774919, + "loss": 0.6207, + "step": 1218 + }, + { + "epoch": 0.4876, + "grad_norm": 0.3600277694995468, + "learning_rate": 0.00010886254658576184, + "loss": 0.6383, + "step": 1219 + }, + { + "epoch": 0.488, + "grad_norm": 0.3543077565948878, + "learning_rate": 0.0001087334987495364, + "loss": 0.6723, + "step": 1220 + }, + { + "epoch": 0.4884, + "grad_norm": 0.3309490300200102, + "learning_rate": 0.0001086044362556571, + "loss": 0.6172, + "step": 1221 + }, + { + "epoch": 0.4888, + "grad_norm": 0.366905751195663, + "learning_rate": 0.00010847535932073287, + "loss": 0.681, + "step": 1222 + }, + { + "epoch": 0.4892, + "grad_norm": 0.3968664880824747, + "learning_rate": 0.00010834626816139677, + "loss": 0.6659, + "step": 1223 + }, + { + "epoch": 0.4896, + "grad_norm": 0.3587325502034165, + "learning_rate": 0.00010821716299430578, + "loss": 0.6258, + "step": 1224 + }, + { + "epoch": 0.49, + "grad_norm": 0.3485719603322062, + "learning_rate": 0.00010808804403614043, + "loss": 0.6643, + "step": 1225 + }, + { + "epoch": 0.4904, + "grad_norm": 0.3484885283904212, + "learning_rate": 0.00010795891150360435, + "loss": 0.6372, + "step": 1226 + }, + { + "epoch": 0.4908, + "grad_norm": 0.3678839256648748, + "learning_rate": 0.00010782976561342398, + "loss": 0.6328, + "step": 1227 + }, + { + "epoch": 0.4912, + "grad_norm": 0.33509059890827697, + "learning_rate": 0.00010770060658234815, + "loss": 0.6262, + "step": 1228 + }, + { + "epoch": 0.4916, + "grad_norm": 0.33251123198984384, + "learning_rate": 0.00010757143462714777, + "loss": 0.6462, + "step": 1229 + }, + { + "epoch": 0.492, + "grad_norm": 0.35032556086450606, + "learning_rate": 0.0001074422499646154, + "loss": 0.6057, + "step": 1230 + }, + { + "epoch": 0.4924, + "grad_norm": 0.3744452549493245, + "learning_rate": 0.00010731305281156498, + "loss": 0.6257, + "step": 1231 + }, + { + "epoch": 0.4928, + "grad_norm": 0.3270291197362055, + "learning_rate": 0.00010718384338483141, + "loss": 0.6118, + "step": 1232 + }, + { + "epoch": 0.4932, + "grad_norm": 0.34676008033704087, + "learning_rate": 0.00010705462190127011, + "loss": 0.629, + "step": 1233 + }, + { + "epoch": 0.4936, + "grad_norm": 0.3663047890903659, + "learning_rate": 0.00010692538857775684, + "loss": 0.6203, + "step": 1234 + }, + { + "epoch": 0.494, + "grad_norm": 0.3272336309368646, + "learning_rate": 0.00010679614363118717, + "loss": 0.5978, + "step": 1235 + }, + { + "epoch": 0.4944, + "grad_norm": 0.35591471231657873, + "learning_rate": 0.00010666688727847621, + "loss": 0.6532, + "step": 1236 + }, + { + "epoch": 0.4948, + "grad_norm": 0.3452362011658614, + "learning_rate": 0.00010653761973655819, + "loss": 0.6126, + "step": 1237 + }, + { + "epoch": 0.4952, + "grad_norm": 0.3503817246383713, + "learning_rate": 0.00010640834122238606, + "loss": 0.6549, + "step": 1238 + }, + { + "epoch": 0.4956, + "grad_norm": 0.35621375880558626, + "learning_rate": 0.00010627905195293135, + "loss": 0.6651, + "step": 1239 + }, + { + "epoch": 0.496, + "grad_norm": 0.3757460056453115, + "learning_rate": 0.0001061497521451835, + "loss": 0.6685, + "step": 1240 + }, + { + "epoch": 0.4964, + "grad_norm": 0.3560824278739074, + "learning_rate": 0.00010602044201614965, + "loss": 0.6177, + "step": 1241 + }, + { + "epoch": 0.4968, + "grad_norm": 0.3426459162810895, + "learning_rate": 0.00010589112178285432, + "loss": 0.5979, + "step": 1242 + }, + { + "epoch": 0.4972, + "grad_norm": 0.3424236595236985, + "learning_rate": 0.00010576179166233895, + "loss": 0.6452, + "step": 1243 + }, + { + "epoch": 0.4976, + "grad_norm": 0.37326455403865544, + "learning_rate": 0.0001056324518716616, + "loss": 0.6839, + "step": 1244 + }, + { + "epoch": 0.498, + "grad_norm": 0.3447475867572372, + "learning_rate": 0.00010550310262789649, + "loss": 0.669, + "step": 1245 + }, + { + "epoch": 0.4984, + "grad_norm": 0.3645476627658833, + "learning_rate": 0.00010537374414813383, + "loss": 0.648, + "step": 1246 + }, + { + "epoch": 0.4988, + "grad_norm": 0.32084533765043544, + "learning_rate": 0.00010524437664947917, + "loss": 0.5702, + "step": 1247 + }, + { + "epoch": 0.4992, + "grad_norm": 0.3639071006713642, + "learning_rate": 0.0001051150003490534, + "loss": 0.6801, + "step": 1248 + }, + { + "epoch": 0.4996, + "grad_norm": 0.4188043715421482, + "learning_rate": 0.00010498561546399193, + "loss": 0.6284, + "step": 1249 + }, + { + "epoch": 0.5, + "grad_norm": 0.35399792562477883, + "learning_rate": 0.00010485622221144484, + "loss": 0.6328, + "step": 1250 + }, + { + "epoch": 0.5004, + "grad_norm": 0.3374870043789103, + "learning_rate": 0.00010472682080857606, + "loss": 0.6291, + "step": 1251 + }, + { + "epoch": 0.5008, + "grad_norm": 0.33333080790463976, + "learning_rate": 0.00010459741147256326, + "loss": 0.61, + "step": 1252 + }, + { + "epoch": 0.5012, + "grad_norm": 0.3454834033242172, + "learning_rate": 0.00010446799442059749, + "loss": 0.5589, + "step": 1253 + }, + { + "epoch": 0.5016, + "grad_norm": 0.34029312366152303, + "learning_rate": 0.0001043385698698826, + "loss": 0.6486, + "step": 1254 + }, + { + "epoch": 0.502, + "grad_norm": 0.3540017630496081, + "learning_rate": 0.00010420913803763521, + "loss": 0.6303, + "step": 1255 + }, + { + "epoch": 0.5024, + "grad_norm": 0.3488929724271329, + "learning_rate": 0.00010407969914108399, + "loss": 0.6315, + "step": 1256 + }, + { + "epoch": 0.5028, + "grad_norm": 0.35884525176733456, + "learning_rate": 0.00010395025339746964, + "loss": 0.5907, + "step": 1257 + }, + { + "epoch": 0.5032, + "grad_norm": 0.3254700117467145, + "learning_rate": 0.00010382080102404417, + "loss": 0.5824, + "step": 1258 + }, + { + "epoch": 0.5036, + "grad_norm": 0.3487802649810632, + "learning_rate": 0.00010369134223807082, + "loss": 0.5929, + "step": 1259 + }, + { + "epoch": 0.504, + "grad_norm": 0.34198688144213285, + "learning_rate": 0.00010356187725682359, + "loss": 0.6582, + "step": 1260 + }, + { + "epoch": 0.5044, + "grad_norm": 0.35645009121312554, + "learning_rate": 0.00010343240629758684, + "loss": 0.6518, + "step": 1261 + }, + { + "epoch": 0.5048, + "grad_norm": 0.3708941180601143, + "learning_rate": 0.00010330292957765501, + "loss": 0.675, + "step": 1262 + }, + { + "epoch": 0.5052, + "grad_norm": 0.3353731136131598, + "learning_rate": 0.00010317344731433216, + "loss": 0.5948, + "step": 1263 + }, + { + "epoch": 0.5056, + "grad_norm": 0.3469344087220699, + "learning_rate": 0.00010304395972493172, + "loss": 0.6692, + "step": 1264 + }, + { + "epoch": 0.506, + "grad_norm": 0.3609414613558415, + "learning_rate": 0.00010291446702677599, + "loss": 0.6696, + "step": 1265 + }, + { + "epoch": 0.5064, + "grad_norm": 0.3427685116161911, + "learning_rate": 0.00010278496943719584, + "loss": 0.6261, + "step": 1266 + }, + { + "epoch": 0.5068, + "grad_norm": 0.3392486050691237, + "learning_rate": 0.00010265546717353041, + "loss": 0.6241, + "step": 1267 + }, + { + "epoch": 0.5072, + "grad_norm": 0.3361004815479842, + "learning_rate": 0.00010252596045312666, + "loss": 0.5729, + "step": 1268 + }, + { + "epoch": 0.5076, + "grad_norm": 0.3463110258286205, + "learning_rate": 0.000102396449493339, + "loss": 0.6744, + "step": 1269 + }, + { + "epoch": 0.508, + "grad_norm": 0.3515275610021229, + "learning_rate": 0.000102266934511529, + "loss": 0.6336, + "step": 1270 + }, + { + "epoch": 0.5084, + "grad_norm": 0.33174849902873876, + "learning_rate": 0.00010213741572506497, + "loss": 0.6264, + "step": 1271 + }, + { + "epoch": 0.5088, + "grad_norm": 0.33455758686453246, + "learning_rate": 0.00010200789335132158, + "loss": 0.586, + "step": 1272 + }, + { + "epoch": 0.5092, + "grad_norm": 0.3545808144691893, + "learning_rate": 0.00010187836760767953, + "loss": 0.6328, + "step": 1273 + }, + { + "epoch": 0.5096, + "grad_norm": 0.36242011393728973, + "learning_rate": 0.00010174883871152516, + "loss": 0.6553, + "step": 1274 + }, + { + "epoch": 0.51, + "grad_norm": 0.34778223748948844, + "learning_rate": 0.00010161930688025017, + "loss": 0.6424, + "step": 1275 + }, + { + "epoch": 0.5104, + "grad_norm": 0.3359680423864884, + "learning_rate": 0.0001014897723312511, + "loss": 0.6044, + "step": 1276 + }, + { + "epoch": 0.5108, + "grad_norm": 0.35534263644066355, + "learning_rate": 0.0001013602352819291, + "loss": 0.6757, + "step": 1277 + }, + { + "epoch": 0.5112, + "grad_norm": 0.3494413570544863, + "learning_rate": 0.00010123069594968952, + "loss": 0.6295, + "step": 1278 + }, + { + "epoch": 0.5116, + "grad_norm": 0.35573065101587703, + "learning_rate": 0.00010110115455194156, + "loss": 0.6753, + "step": 1279 + }, + { + "epoch": 0.512, + "grad_norm": 0.33097794748432213, + "learning_rate": 0.00010097161130609773, + "loss": 0.6093, + "step": 1280 + }, + { + "epoch": 0.5124, + "grad_norm": 0.35570883664507547, + "learning_rate": 0.00010084206642957393, + "loss": 0.6317, + "step": 1281 + }, + { + "epoch": 0.5128, + "grad_norm": 0.35106539136984977, + "learning_rate": 0.0001007125201397885, + "loss": 0.614, + "step": 1282 + }, + { + "epoch": 0.5132, + "grad_norm": 0.34128861680097217, + "learning_rate": 0.00010058297265416234, + "loss": 0.592, + "step": 1283 + }, + { + "epoch": 0.5136, + "grad_norm": 0.342487405813307, + "learning_rate": 0.00010045342419011832, + "loss": 0.6505, + "step": 1284 + }, + { + "epoch": 0.514, + "grad_norm": 0.34305231840107203, + "learning_rate": 0.00010032387496508089, + "loss": 0.6369, + "step": 1285 + }, + { + "epoch": 0.5144, + "grad_norm": 0.33378806377069, + "learning_rate": 0.00010019432519647585, + "loss": 0.6445, + "step": 1286 + }, + { + "epoch": 0.5148, + "grad_norm": 0.3375143627285051, + "learning_rate": 0.00010006477510172985, + "loss": 0.6599, + "step": 1287 + }, + { + "epoch": 0.5152, + "grad_norm": 0.35097006621776994, + "learning_rate": 9.993522489827016e-05, + "loss": 0.6107, + "step": 1288 + }, + { + "epoch": 0.5156, + "grad_norm": 0.33587296058662613, + "learning_rate": 9.980567480352416e-05, + "loss": 0.6303, + "step": 1289 + }, + { + "epoch": 0.516, + "grad_norm": 0.35666680105213716, + "learning_rate": 9.967612503491914e-05, + "loss": 0.651, + "step": 1290 + }, + { + "epoch": 0.5164, + "grad_norm": 0.33545182714986216, + "learning_rate": 9.954657580988172e-05, + "loss": 0.6459, + "step": 1291 + }, + { + "epoch": 0.5168, + "grad_norm": 0.34485682456959477, + "learning_rate": 9.94170273458377e-05, + "loss": 0.6547, + "step": 1292 + }, + { + "epoch": 0.5172, + "grad_norm": 0.343013812873406, + "learning_rate": 9.928747986021152e-05, + "loss": 0.6334, + "step": 1293 + }, + { + "epoch": 0.5176, + "grad_norm": 0.34384390758781147, + "learning_rate": 9.91579335704261e-05, + "loss": 0.63, + "step": 1294 + }, + { + "epoch": 0.518, + "grad_norm": 0.3234300874086102, + "learning_rate": 9.902838869390229e-05, + "loss": 0.5909, + "step": 1295 + }, + { + "epoch": 0.5184, + "grad_norm": 0.3268275950873985, + "learning_rate": 9.88988454480585e-05, + "loss": 0.5996, + "step": 1296 + }, + { + "epoch": 0.5188, + "grad_norm": 0.3597660984295147, + "learning_rate": 9.876930405031047e-05, + "loss": 0.6504, + "step": 1297 + }, + { + "epoch": 0.5192, + "grad_norm": 0.35453593691446306, + "learning_rate": 9.863976471807089e-05, + "loss": 0.6323, + "step": 1298 + }, + { + "epoch": 0.5196, + "grad_norm": 0.3531127558340611, + "learning_rate": 9.851022766874893e-05, + "loss": 0.6227, + "step": 1299 + }, + { + "epoch": 0.52, + "grad_norm": 0.34845680554253494, + "learning_rate": 9.838069311974986e-05, + "loss": 0.5916, + "step": 1300 + }, + { + "epoch": 0.5204, + "grad_norm": 0.33831499539595655, + "learning_rate": 9.825116128847488e-05, + "loss": 0.6246, + "step": 1301 + }, + { + "epoch": 0.5208, + "grad_norm": 0.33747552361821015, + "learning_rate": 9.812163239232051e-05, + "loss": 0.6098, + "step": 1302 + }, + { + "epoch": 0.5212, + "grad_norm": 0.3487999262928088, + "learning_rate": 9.799210664867843e-05, + "loss": 0.6142, + "step": 1303 + }, + { + "epoch": 0.5216, + "grad_norm": 0.3519251218931331, + "learning_rate": 9.786258427493505e-05, + "loss": 0.6087, + "step": 1304 + }, + { + "epoch": 0.522, + "grad_norm": 0.3654639303417923, + "learning_rate": 9.7733065488471e-05, + "loss": 0.6872, + "step": 1305 + }, + { + "epoch": 0.5224, + "grad_norm": 0.3684929790585567, + "learning_rate": 9.760355050666102e-05, + "loss": 0.6679, + "step": 1306 + }, + { + "epoch": 0.5228, + "grad_norm": 0.34737766133230247, + "learning_rate": 9.747403954687334e-05, + "loss": 0.5503, + "step": 1307 + }, + { + "epoch": 0.5232, + "grad_norm": 0.3385619913992404, + "learning_rate": 9.734453282646961e-05, + "loss": 0.624, + "step": 1308 + }, + { + "epoch": 0.5236, + "grad_norm": 0.3375394800263621, + "learning_rate": 9.721503056280418e-05, + "loss": 0.6492, + "step": 1309 + }, + { + "epoch": 0.524, + "grad_norm": 0.4104809079299157, + "learning_rate": 9.708553297322406e-05, + "loss": 0.6269, + "step": 1310 + }, + { + "epoch": 0.5244, + "grad_norm": 0.34047952869390213, + "learning_rate": 9.695604027506829e-05, + "loss": 0.6216, + "step": 1311 + }, + { + "epoch": 0.5248, + "grad_norm": 0.32997830779488657, + "learning_rate": 9.682655268566783e-05, + "loss": 0.6367, + "step": 1312 + }, + { + "epoch": 0.5252, + "grad_norm": 0.3502591733179358, + "learning_rate": 9.669707042234501e-05, + "loss": 0.5962, + "step": 1313 + }, + { + "epoch": 0.5256, + "grad_norm": 0.36226227435034725, + "learning_rate": 9.656759370241319e-05, + "loss": 0.6322, + "step": 1314 + }, + { + "epoch": 0.526, + "grad_norm": 0.36136848736690047, + "learning_rate": 9.643812274317644e-05, + "loss": 0.701, + "step": 1315 + }, + { + "epoch": 0.5264, + "grad_norm": 0.33275918560578055, + "learning_rate": 9.630865776192918e-05, + "loss": 0.6091, + "step": 1316 + }, + { + "epoch": 0.5268, + "grad_norm": 0.3504721176055122, + "learning_rate": 9.617919897595586e-05, + "loss": 0.6337, + "step": 1317 + }, + { + "epoch": 0.5272, + "grad_norm": 0.3861122537747958, + "learning_rate": 9.604974660253037e-05, + "loss": 0.6548, + "step": 1318 + }, + { + "epoch": 0.5276, + "grad_norm": 0.33354428832287447, + "learning_rate": 9.592030085891602e-05, + "loss": 0.5992, + "step": 1319 + }, + { + "epoch": 0.528, + "grad_norm": 0.33783856158488534, + "learning_rate": 9.579086196236482e-05, + "loss": 0.6165, + "step": 1320 + }, + { + "epoch": 0.5284, + "grad_norm": 0.3397692219995272, + "learning_rate": 9.56614301301174e-05, + "loss": 0.5972, + "step": 1321 + }, + { + "epoch": 0.5288, + "grad_norm": 0.3726436179559889, + "learning_rate": 9.553200557940253e-05, + "loss": 0.6567, + "step": 1322 + }, + { + "epoch": 0.5292, + "grad_norm": 0.3740094636261001, + "learning_rate": 9.540258852743676e-05, + "loss": 0.6072, + "step": 1323 + }, + { + "epoch": 0.5296, + "grad_norm": 0.3473144180024521, + "learning_rate": 9.527317919142398e-05, + "loss": 0.6699, + "step": 1324 + }, + { + "epoch": 0.53, + "grad_norm": 0.35049605499757824, + "learning_rate": 9.514377778855521e-05, + "loss": 0.6654, + "step": 1325 + }, + { + "epoch": 0.5304, + "grad_norm": 0.33126843882063756, + "learning_rate": 9.501438453600807e-05, + "loss": 0.6267, + "step": 1326 + }, + { + "epoch": 0.5308, + "grad_norm": 0.35051466769057354, + "learning_rate": 9.488499965094664e-05, + "loss": 0.6817, + "step": 1327 + }, + { + "epoch": 0.5312, + "grad_norm": 0.36063296938358125, + "learning_rate": 9.475562335052086e-05, + "loss": 0.6122, + "step": 1328 + }, + { + "epoch": 0.5316, + "grad_norm": 0.35092141745658173, + "learning_rate": 9.462625585186622e-05, + "loss": 0.6638, + "step": 1329 + }, + { + "epoch": 0.532, + "grad_norm": 0.4218326001924773, + "learning_rate": 9.449689737210352e-05, + "loss": 0.5866, + "step": 1330 + }, + { + "epoch": 0.5324, + "grad_norm": 0.41197958155266357, + "learning_rate": 9.436754812833843e-05, + "loss": 0.6252, + "step": 1331 + }, + { + "epoch": 0.5328, + "grad_norm": 0.3366671562252165, + "learning_rate": 9.423820833766108e-05, + "loss": 0.6093, + "step": 1332 + }, + { + "epoch": 0.5332, + "grad_norm": 0.34610849975686436, + "learning_rate": 9.410887821714571e-05, + "loss": 0.5994, + "step": 1333 + }, + { + "epoch": 0.5336, + "grad_norm": 0.34033293503836815, + "learning_rate": 9.39795579838504e-05, + "loss": 0.6112, + "step": 1334 + }, + { + "epoch": 0.534, + "grad_norm": 0.33741213458653624, + "learning_rate": 9.385024785481654e-05, + "loss": 0.6204, + "step": 1335 + }, + { + "epoch": 0.5344, + "grad_norm": 0.3612458277810479, + "learning_rate": 9.372094804706867e-05, + "loss": 0.6882, + "step": 1336 + }, + { + "epoch": 0.5348, + "grad_norm": 0.33716664030270793, + "learning_rate": 9.359165877761397e-05, + "loss": 0.6369, + "step": 1337 + }, + { + "epoch": 0.5352, + "grad_norm": 0.34964309823147316, + "learning_rate": 9.346238026344186e-05, + "loss": 0.6314, + "step": 1338 + }, + { + "epoch": 0.5356, + "grad_norm": 0.34489818587790194, + "learning_rate": 9.333311272152386e-05, + "loss": 0.618, + "step": 1339 + }, + { + "epoch": 0.536, + "grad_norm": 0.3703409070384164, + "learning_rate": 9.320385636881283e-05, + "loss": 0.6189, + "step": 1340 + }, + { + "epoch": 0.5364, + "grad_norm": 0.32808582341634207, + "learning_rate": 9.307461142224318e-05, + "loss": 0.6187, + "step": 1341 + }, + { + "epoch": 0.5368, + "grad_norm": 0.3446626780159281, + "learning_rate": 9.294537809872991e-05, + "loss": 0.5834, + "step": 1342 + }, + { + "epoch": 0.5372, + "grad_norm": 0.3350436294866912, + "learning_rate": 9.281615661516864e-05, + "loss": 0.5831, + "step": 1343 + }, + { + "epoch": 0.5376, + "grad_norm": 0.34148315360487524, + "learning_rate": 9.268694718843503e-05, + "loss": 0.5881, + "step": 1344 + }, + { + "epoch": 0.538, + "grad_norm": 0.3601618590645133, + "learning_rate": 9.255775003538462e-05, + "loss": 0.636, + "step": 1345 + }, + { + "epoch": 0.5384, + "grad_norm": 0.3393424797889516, + "learning_rate": 9.242856537285227e-05, + "loss": 0.6274, + "step": 1346 + }, + { + "epoch": 0.5388, + "grad_norm": 0.343149145366101, + "learning_rate": 9.229939341765188e-05, + "loss": 0.6221, + "step": 1347 + }, + { + "epoch": 0.5392, + "grad_norm": 0.35692843628237547, + "learning_rate": 9.217023438657605e-05, + "loss": 0.6269, + "step": 1348 + }, + { + "epoch": 0.5396, + "grad_norm": 0.333803007205758, + "learning_rate": 9.204108849639565e-05, + "loss": 0.6241, + "step": 1349 + }, + { + "epoch": 0.54, + "grad_norm": 0.3844682892008765, + "learning_rate": 9.19119559638596e-05, + "loss": 0.6593, + "step": 1350 + }, + { + "epoch": 0.5404, + "grad_norm": 0.394809207078214, + "learning_rate": 9.178283700569424e-05, + "loss": 0.6066, + "step": 1351 + }, + { + "epoch": 0.5408, + "grad_norm": 0.34849520821286584, + "learning_rate": 9.165373183860328e-05, + "loss": 0.6417, + "step": 1352 + }, + { + "epoch": 0.5412, + "grad_norm": 0.328419664069886, + "learning_rate": 9.152464067926717e-05, + "loss": 0.6278, + "step": 1353 + }, + { + "epoch": 0.5416, + "grad_norm": 0.33761062084738297, + "learning_rate": 9.139556374434288e-05, + "loss": 0.588, + "step": 1354 + }, + { + "epoch": 0.542, + "grad_norm": 0.3342884235812433, + "learning_rate": 9.126650125046361e-05, + "loss": 0.5918, + "step": 1355 + }, + { + "epoch": 0.5424, + "grad_norm": 0.34043589106532285, + "learning_rate": 9.113745341423817e-05, + "loss": 0.6224, + "step": 1356 + }, + { + "epoch": 0.5428, + "grad_norm": 0.3333134613118826, + "learning_rate": 9.100842045225084e-05, + "loss": 0.631, + "step": 1357 + }, + { + "epoch": 0.5432, + "grad_norm": 0.36662979303788956, + "learning_rate": 9.087940258106093e-05, + "loss": 0.6044, + "step": 1358 + }, + { + "epoch": 0.5436, + "grad_norm": 0.350954421324191, + "learning_rate": 9.075040001720248e-05, + "loss": 0.581, + "step": 1359 + }, + { + "epoch": 0.544, + "grad_norm": 0.33733312179983765, + "learning_rate": 9.062141297718371e-05, + "loss": 0.6292, + "step": 1360 + }, + { + "epoch": 0.5444, + "grad_norm": 0.3549294970748782, + "learning_rate": 9.049244167748694e-05, + "loss": 0.6374, + "step": 1361 + }, + { + "epoch": 0.5448, + "grad_norm": 0.3602806648834775, + "learning_rate": 9.036348633456792e-05, + "loss": 0.6205, + "step": 1362 + }, + { + "epoch": 0.5452, + "grad_norm": 0.35985574803936327, + "learning_rate": 9.02345471648557e-05, + "loss": 0.6452, + "step": 1363 + }, + { + "epoch": 0.5456, + "grad_norm": 0.3793161645024034, + "learning_rate": 9.010562438475225e-05, + "loss": 0.6225, + "step": 1364 + }, + { + "epoch": 0.546, + "grad_norm": 0.3340735735140277, + "learning_rate": 8.997671821063191e-05, + "loss": 0.624, + "step": 1365 + }, + { + "epoch": 0.5464, + "grad_norm": 0.3350381095178896, + "learning_rate": 8.984782885884119e-05, + "loss": 0.5865, + "step": 1366 + }, + { + "epoch": 0.5468, + "grad_norm": 0.35258269054173075, + "learning_rate": 8.971895654569841e-05, + "loss": 0.6198, + "step": 1367 + }, + { + "epoch": 0.5472, + "grad_norm": 0.36295131136689934, + "learning_rate": 8.959010148749323e-05, + "loss": 0.6587, + "step": 1368 + }, + { + "epoch": 0.5476, + "grad_norm": 0.37638320613834875, + "learning_rate": 8.94612639004864e-05, + "loss": 0.6694, + "step": 1369 + }, + { + "epoch": 0.548, + "grad_norm": 0.3515046750583341, + "learning_rate": 8.933244400090937e-05, + "loss": 0.6454, + "step": 1370 + }, + { + "epoch": 0.5484, + "grad_norm": 0.3813241742673669, + "learning_rate": 8.920364200496379e-05, + "loss": 0.6057, + "step": 1371 + }, + { + "epoch": 0.5488, + "grad_norm": 0.32850438706934315, + "learning_rate": 8.907485812882137e-05, + "loss": 0.6019, + "step": 1372 + }, + { + "epoch": 0.5492, + "grad_norm": 0.34498959496933074, + "learning_rate": 8.894609258862339e-05, + "loss": 0.627, + "step": 1373 + }, + { + "epoch": 0.5496, + "grad_norm": 0.35667574091889104, + "learning_rate": 8.881734560048036e-05, + "loss": 0.6409, + "step": 1374 + }, + { + "epoch": 0.55, + "grad_norm": 0.35507706295378344, + "learning_rate": 8.868861738047158e-05, + "loss": 0.627, + "step": 1375 + }, + { + "epoch": 0.5504, + "grad_norm": 0.3711015801034512, + "learning_rate": 8.855990814464496e-05, + "loss": 0.6233, + "step": 1376 + }, + { + "epoch": 0.5508, + "grad_norm": 0.35645935528359046, + "learning_rate": 8.843121810901642e-05, + "loss": 0.6554, + "step": 1377 + }, + { + "epoch": 0.5512, + "grad_norm": 0.3403366655222805, + "learning_rate": 8.830254748956982e-05, + "loss": 0.629, + "step": 1378 + }, + { + "epoch": 0.5516, + "grad_norm": 0.37629625156447527, + "learning_rate": 8.817389650225631e-05, + "loss": 0.6182, + "step": 1379 + }, + { + "epoch": 0.552, + "grad_norm": 0.345875362567636, + "learning_rate": 8.804526536299413e-05, + "loss": 0.6055, + "step": 1380 + }, + { + "epoch": 0.5524, + "grad_norm": 0.33404949695921016, + "learning_rate": 8.791665428766818e-05, + "loss": 0.627, + "step": 1381 + }, + { + "epoch": 0.5528, + "grad_norm": 0.4068141191571966, + "learning_rate": 8.778806349212968e-05, + "loss": 0.6711, + "step": 1382 + }, + { + "epoch": 0.5532, + "grad_norm": 0.3374017182640437, + "learning_rate": 8.765949319219595e-05, + "loss": 0.6103, + "step": 1383 + }, + { + "epoch": 0.5536, + "grad_norm": 0.34207581665394704, + "learning_rate": 8.753094360364972e-05, + "loss": 0.65, + "step": 1384 + }, + { + "epoch": 0.554, + "grad_norm": 0.3267454950329156, + "learning_rate": 8.740241494223911e-05, + "loss": 0.5939, + "step": 1385 + }, + { + "epoch": 0.5544, + "grad_norm": 0.35810071664815823, + "learning_rate": 8.727390742367699e-05, + "loss": 0.6359, + "step": 1386 + }, + { + "epoch": 0.5548, + "grad_norm": 0.3551505732144722, + "learning_rate": 8.714542126364079e-05, + "loss": 0.6481, + "step": 1387 + }, + { + "epoch": 0.5552, + "grad_norm": 0.34768341140094017, + "learning_rate": 8.701695667777221e-05, + "loss": 0.6867, + "step": 1388 + }, + { + "epoch": 0.5556, + "grad_norm": 0.3395311643499959, + "learning_rate": 8.688851388167656e-05, + "loss": 0.6051, + "step": 1389 + }, + { + "epoch": 0.556, + "grad_norm": 0.34737002301926034, + "learning_rate": 8.676009309092272e-05, + "loss": 0.5916, + "step": 1390 + }, + { + "epoch": 0.5564, + "grad_norm": 0.35852557052994755, + "learning_rate": 8.663169452104247e-05, + "loss": 0.6135, + "step": 1391 + }, + { + "epoch": 0.5568, + "grad_norm": 0.33085468263069967, + "learning_rate": 8.650331838753057e-05, + "loss": 0.5551, + "step": 1392 + }, + { + "epoch": 0.5572, + "grad_norm": 0.36288272915032965, + "learning_rate": 8.637496490584385e-05, + "loss": 0.648, + "step": 1393 + }, + { + "epoch": 0.5576, + "grad_norm": 0.3593536841658371, + "learning_rate": 8.624663429140128e-05, + "loss": 0.6849, + "step": 1394 + }, + { + "epoch": 0.558, + "grad_norm": 0.3788466375356503, + "learning_rate": 8.611832675958336e-05, + "loss": 0.6558, + "step": 1395 + }, + { + "epoch": 0.5584, + "grad_norm": 0.3384717996689434, + "learning_rate": 8.59900425257319e-05, + "loss": 0.5839, + "step": 1396 + }, + { + "epoch": 0.5588, + "grad_norm": 0.3720588308067637, + "learning_rate": 8.586178180514968e-05, + "loss": 0.6398, + "step": 1397 + }, + { + "epoch": 0.5592, + "grad_norm": 0.3227102592124542, + "learning_rate": 8.573354481309985e-05, + "loss": 0.5731, + "step": 1398 + }, + { + "epoch": 0.5596, + "grad_norm": 0.36349581862674507, + "learning_rate": 8.560533176480587e-05, + "loss": 0.6128, + "step": 1399 + }, + { + "epoch": 0.56, + "grad_norm": 0.393821757193281, + "learning_rate": 8.5477142875451e-05, + "loss": 0.6547, + "step": 1400 + }, + { + "epoch": 0.5604, + "grad_norm": 0.36439378783743004, + "learning_rate": 8.534897836017784e-05, + "loss": 0.6512, + "step": 1401 + }, + { + "epoch": 0.5608, + "grad_norm": 0.3411692652437216, + "learning_rate": 8.522083843408823e-05, + "loss": 0.5871, + "step": 1402 + }, + { + "epoch": 0.5612, + "grad_norm": 0.33776851446598916, + "learning_rate": 8.509272331224269e-05, + "loss": 0.5583, + "step": 1403 + }, + { + "epoch": 0.5616, + "grad_norm": 0.3590738807699268, + "learning_rate": 8.496463320966005e-05, + "loss": 0.6391, + "step": 1404 + }, + { + "epoch": 0.562, + "grad_norm": 0.365703700029338, + "learning_rate": 8.48365683413172e-05, + "loss": 0.6617, + "step": 1405 + }, + { + "epoch": 0.5624, + "grad_norm": 0.3526122930372505, + "learning_rate": 8.470852892214874e-05, + "loss": 0.6446, + "step": 1406 + }, + { + "epoch": 0.5628, + "grad_norm": 0.36900049711306404, + "learning_rate": 8.458051516704644e-05, + "loss": 0.6279, + "step": 1407 + }, + { + "epoch": 0.5632, + "grad_norm": 0.36941146433944927, + "learning_rate": 8.445252729085906e-05, + "loss": 0.6276, + "step": 1408 + }, + { + "epoch": 0.5636, + "grad_norm": 0.3742118804570097, + "learning_rate": 8.432456550839195e-05, + "loss": 0.6203, + "step": 1409 + }, + { + "epoch": 0.564, + "grad_norm": 0.34650767561250045, + "learning_rate": 8.419663003440657e-05, + "loss": 0.6507, + "step": 1410 + }, + { + "epoch": 0.5644, + "grad_norm": 0.43898204956051495, + "learning_rate": 8.406872108362034e-05, + "loss": 0.6228, + "step": 1411 + }, + { + "epoch": 0.5648, + "grad_norm": 0.3607453694697088, + "learning_rate": 8.394083887070613e-05, + "loss": 0.6446, + "step": 1412 + }, + { + "epoch": 0.5652, + "grad_norm": 0.34182866489981933, + "learning_rate": 8.381298361029189e-05, + "loss": 0.6226, + "step": 1413 + }, + { + "epoch": 0.5656, + "grad_norm": 0.3742610368049245, + "learning_rate": 8.36851555169604e-05, + "loss": 0.6336, + "step": 1414 + }, + { + "epoch": 0.566, + "grad_norm": 0.3454849337082054, + "learning_rate": 8.355735480524874e-05, + "loss": 0.6192, + "step": 1415 + }, + { + "epoch": 0.5664, + "grad_norm": 0.3502224039208517, + "learning_rate": 8.342958168964817e-05, + "loss": 0.6369, + "step": 1416 + }, + { + "epoch": 0.5668, + "grad_norm": 0.3718134030541887, + "learning_rate": 8.330183638460356e-05, + "loss": 0.6492, + "step": 1417 + }, + { + "epoch": 0.5672, + "grad_norm": 0.3624606680464668, + "learning_rate": 8.317411910451313e-05, + "loss": 0.6141, + "step": 1418 + }, + { + "epoch": 0.5676, + "grad_norm": 0.34742545895317145, + "learning_rate": 8.304643006372797e-05, + "loss": 0.6098, + "step": 1419 + }, + { + "epoch": 0.568, + "grad_norm": 0.40057662795640137, + "learning_rate": 8.291876947655196e-05, + "loss": 0.6366, + "step": 1420 + }, + { + "epoch": 0.5684, + "grad_norm": 0.345727686669028, + "learning_rate": 8.279113755724111e-05, + "loss": 0.6614, + "step": 1421 + }, + { + "epoch": 0.5688, + "grad_norm": 0.3328107042605165, + "learning_rate": 8.266353452000326e-05, + "loss": 0.624, + "step": 1422 + }, + { + "epoch": 0.5692, + "grad_norm": 0.35151415097284505, + "learning_rate": 8.253596057899789e-05, + "loss": 0.5577, + "step": 1423 + }, + { + "epoch": 0.5696, + "grad_norm": 0.40594804322909833, + "learning_rate": 8.240841594833554e-05, + "loss": 0.5966, + "step": 1424 + }, + { + "epoch": 0.57, + "grad_norm": 0.35567838303548605, + "learning_rate": 8.228090084207774e-05, + "loss": 0.6188, + "step": 1425 + }, + { + "epoch": 0.5704, + "grad_norm": 0.33426743422578514, + "learning_rate": 8.215341547423624e-05, + "loss": 0.573, + "step": 1426 + }, + { + "epoch": 0.5708, + "grad_norm": 0.3600479894510643, + "learning_rate": 8.202596005877306e-05, + "loss": 0.6321, + "step": 1427 + }, + { + "epoch": 0.5712, + "grad_norm": 0.3486290301517772, + "learning_rate": 8.189853480959981e-05, + "loss": 0.6372, + "step": 1428 + }, + { + "epoch": 0.5716, + "grad_norm": 0.34223847404772584, + "learning_rate": 8.177113994057755e-05, + "loss": 0.6057, + "step": 1429 + }, + { + "epoch": 0.572, + "grad_norm": 0.35597344622619104, + "learning_rate": 8.16437756655164e-05, + "loss": 0.6159, + "step": 1430 + }, + { + "epoch": 0.5724, + "grad_norm": 0.3582317597717354, + "learning_rate": 8.1516442198175e-05, + "loss": 0.6325, + "step": 1431 + }, + { + "epoch": 0.5728, + "grad_norm": 0.39871033733728933, + "learning_rate": 8.138913975226044e-05, + "loss": 0.6031, + "step": 1432 + }, + { + "epoch": 0.5732, + "grad_norm": 0.3529099409918213, + "learning_rate": 8.126186854142752e-05, + "loss": 0.6357, + "step": 1433 + }, + { + "epoch": 0.5736, + "grad_norm": 0.3398109519070245, + "learning_rate": 8.113462877927893e-05, + "loss": 0.6173, + "step": 1434 + }, + { + "epoch": 0.574, + "grad_norm": 0.3559734332419003, + "learning_rate": 8.100742067936431e-05, + "loss": 0.6419, + "step": 1435 + }, + { + "epoch": 0.5744, + "grad_norm": 0.3327454815046734, + "learning_rate": 8.088024445518033e-05, + "loss": 0.626, + "step": 1436 + }, + { + "epoch": 0.5748, + "grad_norm": 0.3723573253181277, + "learning_rate": 8.075310032017e-05, + "loss": 0.6793, + "step": 1437 + }, + { + "epoch": 0.5752, + "grad_norm": 0.3626763820021229, + "learning_rate": 8.06259884877226e-05, + "loss": 0.6869, + "step": 1438 + }, + { + "epoch": 0.5756, + "grad_norm": 0.3369433680940479, + "learning_rate": 8.049890917117322e-05, + "loss": 0.587, + "step": 1439 + }, + { + "epoch": 0.576, + "grad_norm": 0.3620611041529325, + "learning_rate": 8.037186258380226e-05, + "loss": 0.5919, + "step": 1440 + }, + { + "epoch": 0.5764, + "grad_norm": 0.3307415356263632, + "learning_rate": 8.024484893883529e-05, + "loss": 0.6069, + "step": 1441 + }, + { + "epoch": 0.5768, + "grad_norm": 0.3703035450620265, + "learning_rate": 8.01178684494425e-05, + "loss": 0.6679, + "step": 1442 + }, + { + "epoch": 0.5772, + "grad_norm": 0.34436065944666616, + "learning_rate": 7.99909213287385e-05, + "loss": 0.6307, + "step": 1443 + }, + { + "epoch": 0.5776, + "grad_norm": 0.3634717918826862, + "learning_rate": 7.986400778978193e-05, + "loss": 0.6651, + "step": 1444 + }, + { + "epoch": 0.578, + "grad_norm": 0.3670417232457763, + "learning_rate": 7.973712804557501e-05, + "loss": 0.607, + "step": 1445 + }, + { + "epoch": 0.5784, + "grad_norm": 0.355642465027834, + "learning_rate": 7.96102823090632e-05, + "loss": 0.6181, + "step": 1446 + }, + { + "epoch": 0.5788, + "grad_norm": 0.3516485913587409, + "learning_rate": 7.948347079313494e-05, + "loss": 0.6482, + "step": 1447 + }, + { + "epoch": 0.5792, + "grad_norm": 0.3447910881142975, + "learning_rate": 7.935669371062133e-05, + "loss": 0.5983, + "step": 1448 + }, + { + "epoch": 0.5796, + "grad_norm": 0.3402079050458831, + "learning_rate": 7.922995127429548e-05, + "loss": 0.6111, + "step": 1449 + }, + { + "epoch": 0.58, + "grad_norm": 0.36801272918948597, + "learning_rate": 7.91032436968725e-05, + "loss": 0.6572, + "step": 1450 + }, + { + "epoch": 0.5804, + "grad_norm": 0.33889533218091145, + "learning_rate": 7.897657119100896e-05, + "loss": 0.6177, + "step": 1451 + }, + { + "epoch": 0.5808, + "grad_norm": 0.3644963953865768, + "learning_rate": 7.88499339693025e-05, + "loss": 0.5866, + "step": 1452 + }, + { + "epoch": 0.5812, + "grad_norm": 0.3588575229144209, + "learning_rate": 7.872333224429167e-05, + "loss": 0.6166, + "step": 1453 + }, + { + "epoch": 0.5816, + "grad_norm": 0.3372220593947156, + "learning_rate": 7.859676622845535e-05, + "loss": 0.5507, + "step": 1454 + }, + { + "epoch": 0.582, + "grad_norm": 0.3294336764623329, + "learning_rate": 7.847023613421251e-05, + "loss": 0.5814, + "step": 1455 + }, + { + "epoch": 0.5824, + "grad_norm": 0.3621068399864894, + "learning_rate": 7.834374217392188e-05, + "loss": 0.5946, + "step": 1456 + }, + { + "epoch": 0.5828, + "grad_norm": 0.33904287966850216, + "learning_rate": 7.82172845598814e-05, + "loss": 0.6063, + "step": 1457 + }, + { + "epoch": 0.5832, + "grad_norm": 0.36175090834503343, + "learning_rate": 7.809086350432819e-05, + "loss": 0.6005, + "step": 1458 + }, + { + "epoch": 0.5836, + "grad_norm": 0.3571985631122103, + "learning_rate": 7.796447921943792e-05, + "loss": 0.6325, + "step": 1459 + }, + { + "epoch": 0.584, + "grad_norm": 0.3955170703935457, + "learning_rate": 7.78381319173246e-05, + "loss": 0.645, + "step": 1460 + }, + { + "epoch": 0.5844, + "grad_norm": 0.33034514246320046, + "learning_rate": 7.771182181004005e-05, + "loss": 0.6078, + "step": 1461 + }, + { + "epoch": 0.5848, + "grad_norm": 0.374720598768528, + "learning_rate": 7.758554910957378e-05, + "loss": 0.6717, + "step": 1462 + }, + { + "epoch": 0.5852, + "grad_norm": 0.3536240940306809, + "learning_rate": 7.745931402785251e-05, + "loss": 0.5734, + "step": 1463 + }, + { + "epoch": 0.5856, + "grad_norm": 0.3632092428720265, + "learning_rate": 7.73331167767398e-05, + "loss": 0.6438, + "step": 1464 + }, + { + "epoch": 0.586, + "grad_norm": 0.38339157480175035, + "learning_rate": 7.72069575680357e-05, + "loss": 0.6561, + "step": 1465 + }, + { + "epoch": 0.5864, + "grad_norm": 0.34138554844386654, + "learning_rate": 7.708083661347637e-05, + "loss": 0.6167, + "step": 1466 + }, + { + "epoch": 0.5868, + "grad_norm": 0.35147349734926064, + "learning_rate": 7.695475412473391e-05, + "loss": 0.6651, + "step": 1467 + }, + { + "epoch": 0.5872, + "grad_norm": 0.32850754726847736, + "learning_rate": 7.682871031341578e-05, + "loss": 0.5925, + "step": 1468 + }, + { + "epoch": 0.5876, + "grad_norm": 0.3566837414245581, + "learning_rate": 7.670270539106451e-05, + "loss": 0.6453, + "step": 1469 + }, + { + "epoch": 0.588, + "grad_norm": 0.3327063941527943, + "learning_rate": 7.657673956915735e-05, + "loss": 0.6383, + "step": 1470 + }, + { + "epoch": 0.5884, + "grad_norm": 0.327214303925731, + "learning_rate": 7.645081305910595e-05, + "loss": 0.6093, + "step": 1471 + }, + { + "epoch": 0.5888, + "grad_norm": 0.3759942478506926, + "learning_rate": 7.632492607225604e-05, + "loss": 0.6267, + "step": 1472 + }, + { + "epoch": 0.5892, + "grad_norm": 0.335506912856553, + "learning_rate": 7.619907881988692e-05, + "loss": 0.5978, + "step": 1473 + }, + { + "epoch": 0.5896, + "grad_norm": 0.33225509143100557, + "learning_rate": 7.607327151321126e-05, + "loss": 0.6012, + "step": 1474 + }, + { + "epoch": 0.59, + "grad_norm": 0.3301363288014174, + "learning_rate": 7.594750436337467e-05, + "loss": 0.5967, + "step": 1475 + }, + { + "epoch": 0.5904, + "grad_norm": 0.3520220573224516, + "learning_rate": 7.582177758145532e-05, + "loss": 0.6253, + "step": 1476 + }, + { + "epoch": 0.5908, + "grad_norm": 0.35776203174188537, + "learning_rate": 7.569609137846376e-05, + "loss": 0.588, + "step": 1477 + }, + { + "epoch": 0.5912, + "grad_norm": 0.32838797616857823, + "learning_rate": 7.557044596534234e-05, + "loss": 0.6202, + "step": 1478 + }, + { + "epoch": 0.5916, + "grad_norm": 0.32378600448202055, + "learning_rate": 7.544484155296492e-05, + "loss": 0.6383, + "step": 1479 + }, + { + "epoch": 0.592, + "grad_norm": 0.34562514537934313, + "learning_rate": 7.531927835213656e-05, + "loss": 0.6047, + "step": 1480 + }, + { + "epoch": 0.5924, + "grad_norm": 0.3619700721042768, + "learning_rate": 7.519375657359331e-05, + "loss": 0.6079, + "step": 1481 + }, + { + "epoch": 0.5928, + "grad_norm": 0.3553866992644801, + "learning_rate": 7.506827642800145e-05, + "loss": 0.6085, + "step": 1482 + }, + { + "epoch": 0.5932, + "grad_norm": 0.3568363066675366, + "learning_rate": 7.494283812595761e-05, + "loss": 0.6437, + "step": 1483 + }, + { + "epoch": 0.5936, + "grad_norm": 0.3328699680755155, + "learning_rate": 7.4817441877988e-05, + "loss": 0.5958, + "step": 1484 + }, + { + "epoch": 0.594, + "grad_norm": 0.3476380867250527, + "learning_rate": 7.469208789454838e-05, + "loss": 0.6237, + "step": 1485 + }, + { + "epoch": 0.5944, + "grad_norm": 0.39764315061535754, + "learning_rate": 7.456677638602355e-05, + "loss": 0.6256, + "step": 1486 + }, + { + "epoch": 0.5948, + "grad_norm": 0.3175764987422378, + "learning_rate": 7.444150756272704e-05, + "loss": 0.5968, + "step": 1487 + }, + { + "epoch": 0.5952, + "grad_norm": 0.3471809013149163, + "learning_rate": 7.431628163490066e-05, + "loss": 0.6534, + "step": 1488 + }, + { + "epoch": 0.5956, + "grad_norm": 0.33912306226399297, + "learning_rate": 7.419109881271433e-05, + "loss": 0.6432, + "step": 1489 + }, + { + "epoch": 0.596, + "grad_norm": 0.3671412851365395, + "learning_rate": 7.40659593062655e-05, + "loss": 0.6833, + "step": 1490 + }, + { + "epoch": 0.5964, + "grad_norm": 0.34018469479626184, + "learning_rate": 7.394086332557906e-05, + "loss": 0.6123, + "step": 1491 + }, + { + "epoch": 0.5968, + "grad_norm": 0.3143271250669063, + "learning_rate": 7.38158110806068e-05, + "loss": 0.5757, + "step": 1492 + }, + { + "epoch": 0.5972, + "grad_norm": 0.3747457614576391, + "learning_rate": 7.369080278122705e-05, + "loss": 0.6242, + "step": 1493 + }, + { + "epoch": 0.5976, + "grad_norm": 0.3299584007994645, + "learning_rate": 7.356583863724442e-05, + "loss": 0.6144, + "step": 1494 + }, + { + "epoch": 0.598, + "grad_norm": 0.3413173438447968, + "learning_rate": 7.344091885838948e-05, + "loss": 0.5983, + "step": 1495 + }, + { + "epoch": 0.5984, + "grad_norm": 0.35731476648357896, + "learning_rate": 7.331604365431825e-05, + "loss": 0.6357, + "step": 1496 + }, + { + "epoch": 0.5988, + "grad_norm": 0.36985199008247144, + "learning_rate": 7.319121323461197e-05, + "loss": 0.6363, + "step": 1497 + }, + { + "epoch": 0.5992, + "grad_norm": 0.36139746941975087, + "learning_rate": 7.306642780877675e-05, + "loss": 0.6435, + "step": 1498 + }, + { + "epoch": 0.5996, + "grad_norm": 0.3681686163413637, + "learning_rate": 7.294168758624307e-05, + "loss": 0.6949, + "step": 1499 + }, + { + "epoch": 0.6, + "grad_norm": 0.3773067706293726, + "learning_rate": 7.281699277636572e-05, + "loss": 0.6184, + "step": 1500 + }, + { + "epoch": 0.6004, + "grad_norm": 0.3393541897664475, + "learning_rate": 7.269234358842314e-05, + "loss": 0.5896, + "step": 1501 + }, + { + "epoch": 0.6008, + "grad_norm": 0.32358105086572475, + "learning_rate": 7.256774023161728e-05, + "loss": 0.5677, + "step": 1502 + }, + { + "epoch": 0.6012, + "grad_norm": 0.3342513260527789, + "learning_rate": 7.244318291507309e-05, + "loss": 0.6213, + "step": 1503 + }, + { + "epoch": 0.6016, + "grad_norm": 0.31814745140145045, + "learning_rate": 7.231867184783826e-05, + "loss": 0.5922, + "step": 1504 + }, + { + "epoch": 0.602, + "grad_norm": 0.34784167366745294, + "learning_rate": 7.2194207238883e-05, + "loss": 0.63, + "step": 1505 + }, + { + "epoch": 0.6024, + "grad_norm": 0.36043986672576783, + "learning_rate": 7.206978929709935e-05, + "loss": 0.624, + "step": 1506 + }, + { + "epoch": 0.6028, + "grad_norm": 0.3711628694852042, + "learning_rate": 7.194541823130118e-05, + "loss": 0.5885, + "step": 1507 + }, + { + "epoch": 0.6032, + "grad_norm": 0.33722221627158105, + "learning_rate": 7.182109425022357e-05, + "loss": 0.5979, + "step": 1508 + }, + { + "epoch": 0.6036, + "grad_norm": 0.34251133433916997, + "learning_rate": 7.169681756252264e-05, + "loss": 0.5948, + "step": 1509 + }, + { + "epoch": 0.604, + "grad_norm": 0.3295678289072513, + "learning_rate": 7.157258837677514e-05, + "loss": 0.5835, + "step": 1510 + }, + { + "epoch": 0.6044, + "grad_norm": 0.31868964772249, + "learning_rate": 7.144840690147811e-05, + "loss": 0.6026, + "step": 1511 + }, + { + "epoch": 0.6048, + "grad_norm": 0.3407551877917364, + "learning_rate": 7.132427334504846e-05, + "loss": 0.5956, + "step": 1512 + }, + { + "epoch": 0.6052, + "grad_norm": 0.36301195896097443, + "learning_rate": 7.120018791582266e-05, + "loss": 0.6283, + "step": 1513 + }, + { + "epoch": 0.6056, + "grad_norm": 0.35832312783288844, + "learning_rate": 7.107615082205654e-05, + "loss": 0.6494, + "step": 1514 + }, + { + "epoch": 0.606, + "grad_norm": 0.3277676821441542, + "learning_rate": 7.095216227192467e-05, + "loss": 0.5795, + "step": 1515 + }, + { + "epoch": 0.6064, + "grad_norm": 0.34409138634054304, + "learning_rate": 7.082822247352023e-05, + "loss": 0.6172, + "step": 1516 + }, + { + "epoch": 0.6068, + "grad_norm": 0.33163630536725375, + "learning_rate": 7.07043316348545e-05, + "loss": 0.6148, + "step": 1517 + }, + { + "epoch": 0.6072, + "grad_norm": 0.3807080731525262, + "learning_rate": 7.058048996385664e-05, + "loss": 0.6268, + "step": 1518 + }, + { + "epoch": 0.6076, + "grad_norm": 0.3302425537167297, + "learning_rate": 7.045669766837333e-05, + "loss": 0.6181, + "step": 1519 + }, + { + "epoch": 0.608, + "grad_norm": 0.3416511838952011, + "learning_rate": 7.033295495616834e-05, + "loss": 0.6031, + "step": 1520 + }, + { + "epoch": 0.6084, + "grad_norm": 0.3708701738547392, + "learning_rate": 7.020926203492218e-05, + "loss": 0.6189, + "step": 1521 + }, + { + "epoch": 0.6088, + "grad_norm": 0.3337542319880197, + "learning_rate": 7.008561911223186e-05, + "loss": 0.6023, + "step": 1522 + }, + { + "epoch": 0.6092, + "grad_norm": 0.3685066491511751, + "learning_rate": 6.996202639561041e-05, + "loss": 0.6607, + "step": 1523 + }, + { + "epoch": 0.6096, + "grad_norm": 0.3530816436474091, + "learning_rate": 6.983848409248671e-05, + "loss": 0.6099, + "step": 1524 + }, + { + "epoch": 0.61, + "grad_norm": 0.33140625458221923, + "learning_rate": 6.971499241020495e-05, + "loss": 0.6135, + "step": 1525 + }, + { + "epoch": 0.6104, + "grad_norm": 0.33283277876250533, + "learning_rate": 6.959155155602433e-05, + "loss": 0.5767, + "step": 1526 + }, + { + "epoch": 0.6108, + "grad_norm": 0.36481283269146, + "learning_rate": 6.946816173711878e-05, + "loss": 0.6464, + "step": 1527 + }, + { + "epoch": 0.6112, + "grad_norm": 0.34026942238835184, + "learning_rate": 6.934482316057663e-05, + "loss": 0.6132, + "step": 1528 + }, + { + "epoch": 0.6116, + "grad_norm": 0.3378818728280427, + "learning_rate": 6.922153603340016e-05, + "loss": 0.5984, + "step": 1529 + }, + { + "epoch": 0.612, + "grad_norm": 0.34921102193272463, + "learning_rate": 6.909830056250527e-05, + "loss": 0.5649, + "step": 1530 + }, + { + "epoch": 0.6124, + "grad_norm": 0.3572602699436952, + "learning_rate": 6.897511695472123e-05, + "loss": 0.599, + "step": 1531 + }, + { + "epoch": 0.6128, + "grad_norm": 0.3487090468932393, + "learning_rate": 6.885198541679015e-05, + "loss": 0.586, + "step": 1532 + }, + { + "epoch": 0.6132, + "grad_norm": 0.33667924874566424, + "learning_rate": 6.872890615536694e-05, + "loss": 0.6496, + "step": 1533 + }, + { + "epoch": 0.6136, + "grad_norm": 0.33375982398075066, + "learning_rate": 6.860587937701862e-05, + "loss": 0.5801, + "step": 1534 + }, + { + "epoch": 0.614, + "grad_norm": 0.33669078924570117, + "learning_rate": 6.848290528822416e-05, + "loss": 0.6009, + "step": 1535 + }, + { + "epoch": 0.6144, + "grad_norm": 0.34801084739291366, + "learning_rate": 6.835998409537412e-05, + "loss": 0.601, + "step": 1536 + }, + { + "epoch": 0.6148, + "grad_norm": 0.35856953999733276, + "learning_rate": 6.823711600477025e-05, + "loss": 0.6364, + "step": 1537 + }, + { + "epoch": 0.6152, + "grad_norm": 0.4207855839311045, + "learning_rate": 6.811430122262529e-05, + "loss": 0.6066, + "step": 1538 + }, + { + "epoch": 0.6156, + "grad_norm": 0.34510379391800705, + "learning_rate": 6.799153995506233e-05, + "loss": 0.5881, + "step": 1539 + }, + { + "epoch": 0.616, + "grad_norm": 0.3483349424676601, + "learning_rate": 6.786883240811479e-05, + "loss": 0.6192, + "step": 1540 + }, + { + "epoch": 0.6164, + "grad_norm": 0.3426317429269225, + "learning_rate": 6.774617878772582e-05, + "loss": 0.5891, + "step": 1541 + }, + { + "epoch": 0.6168, + "grad_norm": 0.3570391863388374, + "learning_rate": 6.76235792997482e-05, + "loss": 0.6508, + "step": 1542 + }, + { + "epoch": 0.6172, + "grad_norm": 0.33436810655168264, + "learning_rate": 6.750103414994374e-05, + "loss": 0.6168, + "step": 1543 + }, + { + "epoch": 0.6176, + "grad_norm": 0.33635108547988973, + "learning_rate": 6.737854354398307e-05, + "loss": 0.6019, + "step": 1544 + }, + { + "epoch": 0.618, + "grad_norm": 0.4032601744512013, + "learning_rate": 6.725610768744534e-05, + "loss": 0.6302, + "step": 1545 + }, + { + "epoch": 0.6184, + "grad_norm": 0.3435958627550203, + "learning_rate": 6.713372678581774e-05, + "loss": 0.6156, + "step": 1546 + }, + { + "epoch": 0.6188, + "grad_norm": 0.34363295200916266, + "learning_rate": 6.70114010444953e-05, + "loss": 0.6079, + "step": 1547 + }, + { + "epoch": 0.6192, + "grad_norm": 0.34363602325097997, + "learning_rate": 6.688913066878039e-05, + "loss": 0.6334, + "step": 1548 + }, + { + "epoch": 0.6196, + "grad_norm": 0.32753115254913256, + "learning_rate": 6.676691586388255e-05, + "loss": 0.6111, + "step": 1549 + }, + { + "epoch": 0.62, + "grad_norm": 0.35690818367626614, + "learning_rate": 6.664475683491796e-05, + "loss": 0.62, + "step": 1550 + }, + { + "epoch": 0.6204, + "grad_norm": 0.34196251229699387, + "learning_rate": 6.652265378690922e-05, + "loss": 0.6442, + "step": 1551 + }, + { + "epoch": 0.6208, + "grad_norm": 0.33911356642323115, + "learning_rate": 6.640060692478509e-05, + "loss": 0.5776, + "step": 1552 + }, + { + "epoch": 0.6212, + "grad_norm": 0.3346480835046118, + "learning_rate": 6.627861645337984e-05, + "loss": 0.6016, + "step": 1553 + }, + { + "epoch": 0.6216, + "grad_norm": 0.347171889573341, + "learning_rate": 6.615668257743321e-05, + "loss": 0.6187, + "step": 1554 + }, + { + "epoch": 0.622, + "grad_norm": 0.3304898760604142, + "learning_rate": 6.603480550158995e-05, + "loss": 0.5886, + "step": 1555 + }, + { + "epoch": 0.6224, + "grad_norm": 0.3668158254051957, + "learning_rate": 6.591298543039949e-05, + "loss": 0.6175, + "step": 1556 + }, + { + "epoch": 0.6228, + "grad_norm": 0.36068648999637415, + "learning_rate": 6.579122256831551e-05, + "loss": 0.578, + "step": 1557 + }, + { + "epoch": 0.6232, + "grad_norm": 0.324636999369861, + "learning_rate": 6.56695171196958e-05, + "loss": 0.5967, + "step": 1558 + }, + { + "epoch": 0.6236, + "grad_norm": 0.3362041332820818, + "learning_rate": 6.554786928880164e-05, + "loss": 0.6431, + "step": 1559 + }, + { + "epoch": 0.624, + "grad_norm": 0.3182739366074914, + "learning_rate": 6.542627927979771e-05, + "loss": 0.5741, + "step": 1560 + }, + { + "epoch": 0.6244, + "grad_norm": 0.3301255157545653, + "learning_rate": 6.530474729675167e-05, + "loss": 0.6307, + "step": 1561 + }, + { + "epoch": 0.6248, + "grad_norm": 0.3159628923173484, + "learning_rate": 6.518327354363374e-05, + "loss": 0.6092, + "step": 1562 + }, + { + "epoch": 0.6252, + "grad_norm": 0.3219719456819235, + "learning_rate": 6.506185822431638e-05, + "loss": 0.6225, + "step": 1563 + }, + { + "epoch": 0.6256, + "grad_norm": 0.31618004869884997, + "learning_rate": 6.494050154257407e-05, + "loss": 0.601, + "step": 1564 + }, + { + "epoch": 0.626, + "grad_norm": 0.34916825527224976, + "learning_rate": 6.481920370208274e-05, + "loss": 0.5667, + "step": 1565 + }, + { + "epoch": 0.6264, + "grad_norm": 0.3492294364000783, + "learning_rate": 6.469796490641973e-05, + "loss": 0.6265, + "step": 1566 + }, + { + "epoch": 0.6268, + "grad_norm": 0.33940934685888674, + "learning_rate": 6.457678535906322e-05, + "loss": 0.548, + "step": 1567 + }, + { + "epoch": 0.6272, + "grad_norm": 0.3339632201390132, + "learning_rate": 6.445566526339188e-05, + "loss": 0.576, + "step": 1568 + }, + { + "epoch": 0.6276, + "grad_norm": 0.35130711286775096, + "learning_rate": 6.433460482268464e-05, + "loss": 0.621, + "step": 1569 + }, + { + "epoch": 0.628, + "grad_norm": 0.36342412613863156, + "learning_rate": 6.42136042401204e-05, + "loss": 0.6298, + "step": 1570 + }, + { + "epoch": 0.6284, + "grad_norm": 0.3365696836642408, + "learning_rate": 6.409266371877751e-05, + "loss": 0.605, + "step": 1571 + }, + { + "epoch": 0.6288, + "grad_norm": 0.35262966344092284, + "learning_rate": 6.397178346163349e-05, + "loss": 0.6144, + "step": 1572 + }, + { + "epoch": 0.6292, + "grad_norm": 0.3479282156154242, + "learning_rate": 6.38509636715648e-05, + "loss": 0.6308, + "step": 1573 + }, + { + "epoch": 0.6296, + "grad_norm": 0.35797898629366776, + "learning_rate": 6.373020455134634e-05, + "loss": 0.6108, + "step": 1574 + }, + { + "epoch": 0.63, + "grad_norm": 0.3581346498101776, + "learning_rate": 6.360950630365126e-05, + "loss": 0.6253, + "step": 1575 + }, + { + "epoch": 0.6304, + "grad_norm": 0.3415194837946404, + "learning_rate": 6.34888691310505e-05, + "loss": 0.5685, + "step": 1576 + }, + { + "epoch": 0.6308, + "grad_norm": 0.3476884710311523, + "learning_rate": 6.33682932360125e-05, + "loss": 0.5958, + "step": 1577 + }, + { + "epoch": 0.6312, + "grad_norm": 0.3535544533307336, + "learning_rate": 6.324777882090287e-05, + "loss": 0.6489, + "step": 1578 + }, + { + "epoch": 0.6316, + "grad_norm": 0.3247012241756796, + "learning_rate": 6.312732608798397e-05, + "loss": 0.5907, + "step": 1579 + }, + { + "epoch": 0.632, + "grad_norm": 0.35701198841586984, + "learning_rate": 6.300693523941482e-05, + "loss": 0.6268, + "step": 1580 + }, + { + "epoch": 0.6324, + "grad_norm": 0.3530680640497662, + "learning_rate": 6.288660647725034e-05, + "loss": 0.6148, + "step": 1581 + }, + { + "epoch": 0.6328, + "grad_norm": 0.32991564455360417, + "learning_rate": 6.276634000344143e-05, + "loss": 0.5772, + "step": 1582 + }, + { + "epoch": 0.6332, + "grad_norm": 0.3395810927203878, + "learning_rate": 6.264613601983435e-05, + "loss": 0.6066, + "step": 1583 + }, + { + "epoch": 0.6336, + "grad_norm": 0.3373112099227295, + "learning_rate": 6.25259947281705e-05, + "loss": 0.5861, + "step": 1584 + }, + { + "epoch": 0.634, + "grad_norm": 0.334228708671868, + "learning_rate": 6.24059163300861e-05, + "loss": 0.5617, + "step": 1585 + }, + { + "epoch": 0.6344, + "grad_norm": 0.345907115889204, + "learning_rate": 6.22859010271118e-05, + "loss": 0.6248, + "step": 1586 + }, + { + "epoch": 0.6348, + "grad_norm": 0.3779567983486365, + "learning_rate": 6.216594902067232e-05, + "loss": 0.6303, + "step": 1587 + }, + { + "epoch": 0.6352, + "grad_norm": 0.3583018978395348, + "learning_rate": 6.204606051208617e-05, + "loss": 0.6137, + "step": 1588 + }, + { + "epoch": 0.6356, + "grad_norm": 0.3433342008135954, + "learning_rate": 6.192623570256535e-05, + "loss": 0.6089, + "step": 1589 + }, + { + "epoch": 0.636, + "grad_norm": 0.3353786937623075, + "learning_rate": 6.180647479321485e-05, + "loss": 0.6303, + "step": 1590 + }, + { + "epoch": 0.6364, + "grad_norm": 0.34009862544113934, + "learning_rate": 6.168677798503247e-05, + "loss": 0.6307, + "step": 1591 + }, + { + "epoch": 0.6368, + "grad_norm": 0.3736520602666011, + "learning_rate": 6.156714547890838e-05, + "loss": 0.5946, + "step": 1592 + }, + { + "epoch": 0.6372, + "grad_norm": 0.3435425647296458, + "learning_rate": 6.144757747562489e-05, + "loss": 0.6302, + "step": 1593 + }, + { + "epoch": 0.6376, + "grad_norm": 0.32727000904697967, + "learning_rate": 6.13280741758561e-05, + "loss": 0.6127, + "step": 1594 + }, + { + "epoch": 0.638, + "grad_norm": 0.36753270428283846, + "learning_rate": 6.120863578016735e-05, + "loss": 0.632, + "step": 1595 + }, + { + "epoch": 0.6384, + "grad_norm": 0.32514548505631924, + "learning_rate": 6.108926248901521e-05, + "loss": 0.5782, + "step": 1596 + }, + { + "epoch": 0.6388, + "grad_norm": 0.3207337446453717, + "learning_rate": 6.096995450274692e-05, + "loss": 0.5836, + "step": 1597 + }, + { + "epoch": 0.6392, + "grad_norm": 0.358030142284751, + "learning_rate": 6.085071202160004e-05, + "loss": 0.6495, + "step": 1598 + }, + { + "epoch": 0.6396, + "grad_norm": 0.3508927594214803, + "learning_rate": 6.0731535245702366e-05, + "loss": 0.5946, + "step": 1599 + }, + { + "epoch": 0.64, + "grad_norm": 0.3386012278500723, + "learning_rate": 6.061242437507131e-05, + "loss": 0.6233, + "step": 1600 + }, + { + "epoch": 0.6404, + "grad_norm": 0.354317079178696, + "learning_rate": 6.049337960961362e-05, + "loss": 0.5686, + "step": 1601 + }, + { + "epoch": 0.6408, + "grad_norm": 0.3457713745654834, + "learning_rate": 6.0374401149125204e-05, + "loss": 0.6255, + "step": 1602 + }, + { + "epoch": 0.6412, + "grad_norm": 0.3433001073124816, + "learning_rate": 6.025548919329067e-05, + "loss": 0.629, + "step": 1603 + }, + { + "epoch": 0.6416, + "grad_norm": 0.3423525871043034, + "learning_rate": 6.013664394168297e-05, + "loss": 0.5807, + "step": 1604 + }, + { + "epoch": 0.642, + "grad_norm": 0.3474180322514183, + "learning_rate": 6.00178655937631e-05, + "loss": 0.6273, + "step": 1605 + }, + { + "epoch": 0.6424, + "grad_norm": 0.3614314983708964, + "learning_rate": 5.989915434887985e-05, + "loss": 0.5821, + "step": 1606 + }, + { + "epoch": 0.6428, + "grad_norm": 0.3478797951136011, + "learning_rate": 5.978051040626924e-05, + "loss": 0.5748, + "step": 1607 + }, + { + "epoch": 0.6432, + "grad_norm": 0.356975941297783, + "learning_rate": 5.9661933965054516e-05, + "loss": 0.5878, + "step": 1608 + }, + { + "epoch": 0.6436, + "grad_norm": 0.3435682743215697, + "learning_rate": 5.9543425224245534e-05, + "loss": 0.5934, + "step": 1609 + }, + { + "epoch": 0.644, + "grad_norm": 0.3425447440862132, + "learning_rate": 5.942498438273849e-05, + "loss": 0.6592, + "step": 1610 + }, + { + "epoch": 0.6444, + "grad_norm": 0.32859194465404135, + "learning_rate": 5.9306611639315724e-05, + "loss": 0.6402, + "step": 1611 + }, + { + "epoch": 0.6448, + "grad_norm": 0.32171709661226766, + "learning_rate": 5.9188307192645145e-05, + "loss": 0.5754, + "step": 1612 + }, + { + "epoch": 0.6452, + "grad_norm": 0.3664751697653012, + "learning_rate": 5.907007124128023e-05, + "loss": 0.6061, + "step": 1613 + }, + { + "epoch": 0.6456, + "grad_norm": 0.3561084222916176, + "learning_rate": 5.895190398365935e-05, + "loss": 0.6439, + "step": 1614 + }, + { + "epoch": 0.646, + "grad_norm": 0.35906093816510193, + "learning_rate": 5.883380561810563e-05, + "loss": 0.6493, + "step": 1615 + }, + { + "epoch": 0.6464, + "grad_norm": 0.3791646125351826, + "learning_rate": 5.871577634282654e-05, + "loss": 0.6317, + "step": 1616 + }, + { + "epoch": 0.6468, + "grad_norm": 0.3407556331076295, + "learning_rate": 5.8597816355913684e-05, + "loss": 0.5503, + "step": 1617 + }, + { + "epoch": 0.6472, + "grad_norm": 0.3465952031897078, + "learning_rate": 5.84799258553423e-05, + "loss": 0.6688, + "step": 1618 + }, + { + "epoch": 0.6476, + "grad_norm": 0.3397174114357965, + "learning_rate": 5.836210503897099e-05, + "loss": 0.6059, + "step": 1619 + }, + { + "epoch": 0.648, + "grad_norm": 0.34348908309352155, + "learning_rate": 5.82443541045415e-05, + "loss": 0.6413, + "step": 1620 + }, + { + "epoch": 0.6484, + "grad_norm": 0.3644566121091593, + "learning_rate": 5.812667324967813e-05, + "loss": 0.6031, + "step": 1621 + }, + { + "epoch": 0.6488, + "grad_norm": 0.33281137684175827, + "learning_rate": 5.8009062671887726e-05, + "loss": 0.5977, + "step": 1622 + }, + { + "epoch": 0.6492, + "grad_norm": 0.34054041000294205, + "learning_rate": 5.789152256855916e-05, + "loss": 0.6005, + "step": 1623 + }, + { + "epoch": 0.6496, + "grad_norm": 0.35050290727508115, + "learning_rate": 5.7774053136962935e-05, + "loss": 0.6152, + "step": 1624 + }, + { + "epoch": 0.65, + "grad_norm": 0.35763787377139644, + "learning_rate": 5.765665457425102e-05, + "loss": 0.614, + "step": 1625 + }, + { + "epoch": 0.6504, + "grad_norm": 0.3573031278750954, + "learning_rate": 5.753932707745635e-05, + "loss": 0.607, + "step": 1626 + }, + { + "epoch": 0.6508, + "grad_norm": 0.36203299996635585, + "learning_rate": 5.7422070843492734e-05, + "loss": 0.6577, + "step": 1627 + }, + { + "epoch": 0.6512, + "grad_norm": 0.3533041362444965, + "learning_rate": 5.730488606915429e-05, + "loss": 0.6193, + "step": 1628 + }, + { + "epoch": 0.6516, + "grad_norm": 0.39042971332239085, + "learning_rate": 5.7187772951115236e-05, + "loss": 0.5948, + "step": 1629 + }, + { + "epoch": 0.652, + "grad_norm": 0.3615896964890657, + "learning_rate": 5.707073168592942e-05, + "loss": 0.6288, + "step": 1630 + }, + { + "epoch": 0.6524, + "grad_norm": 0.3872827555876587, + "learning_rate": 5.695376247003025e-05, + "loss": 0.6421, + "step": 1631 + }, + { + "epoch": 0.6528, + "grad_norm": 0.3650559383497089, + "learning_rate": 5.6836865499730176e-05, + "loss": 0.61, + "step": 1632 + }, + { + "epoch": 0.6532, + "grad_norm": 0.3293487025294577, + "learning_rate": 5.6720040971220326e-05, + "loss": 0.5818, + "step": 1633 + }, + { + "epoch": 0.6536, + "grad_norm": 0.372959382045621, + "learning_rate": 5.660328908057028e-05, + "loss": 0.5676, + "step": 1634 + }, + { + "epoch": 0.654, + "grad_norm": 0.3387007113246456, + "learning_rate": 5.648661002372768e-05, + "loss": 0.5966, + "step": 1635 + }, + { + "epoch": 0.6544, + "grad_norm": 0.3264396720269018, + "learning_rate": 5.637000399651804e-05, + "loss": 0.5993, + "step": 1636 + }, + { + "epoch": 0.6548, + "grad_norm": 0.31000237024948457, + "learning_rate": 5.6253471194644214e-05, + "loss": 0.5697, + "step": 1637 + }, + { + "epoch": 0.6552, + "grad_norm": 0.3526078597111627, + "learning_rate": 5.613701181368618e-05, + "loss": 0.6248, + "step": 1638 + }, + { + "epoch": 0.6556, + "grad_norm": 0.32001206541055655, + "learning_rate": 5.602062604910063e-05, + "loss": 0.6204, + "step": 1639 + }, + { + "epoch": 0.656, + "grad_norm": 0.3496514013896952, + "learning_rate": 5.590431409622081e-05, + "loss": 0.6022, + "step": 1640 + }, + { + "epoch": 0.6564, + "grad_norm": 0.34586292300320254, + "learning_rate": 5.578807615025607e-05, + "loss": 0.598, + "step": 1641 + }, + { + "epoch": 0.6568, + "grad_norm": 0.3432127154514644, + "learning_rate": 5.567191240629151e-05, + "loss": 0.5738, + "step": 1642 + }, + { + "epoch": 0.6572, + "grad_norm": 0.3364872402176342, + "learning_rate": 5.555582305928766e-05, + "loss": 0.5496, + "step": 1643 + }, + { + "epoch": 0.6576, + "grad_norm": 0.35323619348013563, + "learning_rate": 5.543980830408022e-05, + "loss": 0.6186, + "step": 1644 + }, + { + "epoch": 0.658, + "grad_norm": 0.35721395185667604, + "learning_rate": 5.532386833537977e-05, + "loss": 0.6382, + "step": 1645 + }, + { + "epoch": 0.6584, + "grad_norm": 0.33627601356894526, + "learning_rate": 5.520800334777132e-05, + "loss": 0.594, + "step": 1646 + }, + { + "epoch": 0.6588, + "grad_norm": 0.34691227027823734, + "learning_rate": 5.5092213535714034e-05, + "loss": 0.6038, + "step": 1647 + }, + { + "epoch": 0.6592, + "grad_norm": 0.32413368663446984, + "learning_rate": 5.497649909354083e-05, + "loss": 0.5532, + "step": 1648 + }, + { + "epoch": 0.6596, + "grad_norm": 0.3294145113172244, + "learning_rate": 5.4860860215458286e-05, + "loss": 0.5638, + "step": 1649 + }, + { + "epoch": 0.66, + "grad_norm": 0.3496300712042131, + "learning_rate": 5.474529709554612e-05, + "loss": 0.5839, + "step": 1650 + }, + { + "epoch": 0.6604, + "grad_norm": 0.33031214213845245, + "learning_rate": 5.4629809927756794e-05, + "loss": 0.5767, + "step": 1651 + }, + { + "epoch": 0.6608, + "grad_norm": 0.37293263941655797, + "learning_rate": 5.451439890591539e-05, + "loss": 0.6267, + "step": 1652 + }, + { + "epoch": 0.6612, + "grad_norm": 0.33245490315332754, + "learning_rate": 5.439906422371914e-05, + "loss": 0.5575, + "step": 1653 + }, + { + "epoch": 0.6616, + "grad_norm": 0.32580261091835244, + "learning_rate": 5.42838060747372e-05, + "loss": 0.596, + "step": 1654 + }, + { + "epoch": 0.662, + "grad_norm": 0.35853580611736774, + "learning_rate": 5.416862465241033e-05, + "loss": 0.6202, + "step": 1655 + }, + { + "epoch": 0.6624, + "grad_norm": 0.3498791525095215, + "learning_rate": 5.4053520150050384e-05, + "loss": 0.563, + "step": 1656 + }, + { + "epoch": 0.6628, + "grad_norm": 0.39668693688774814, + "learning_rate": 5.393849276084018e-05, + "loss": 0.63, + "step": 1657 + }, + { + "epoch": 0.6632, + "grad_norm": 0.3454852702854712, + "learning_rate": 5.382354267783316e-05, + "loss": 0.6302, + "step": 1658 + }, + { + "epoch": 0.6636, + "grad_norm": 0.3763804250470061, + "learning_rate": 5.370867009395294e-05, + "loss": 0.6038, + "step": 1659 + }, + { + "epoch": 0.664, + "grad_norm": 0.36015198509194524, + "learning_rate": 5.3593875201993174e-05, + "loss": 0.5984, + "step": 1660 + }, + { + "epoch": 0.6644, + "grad_norm": 0.3507800181470306, + "learning_rate": 5.347915819461699e-05, + "loss": 0.5994, + "step": 1661 + }, + { + "epoch": 0.6648, + "grad_norm": 0.34420271337836417, + "learning_rate": 5.336451926435688e-05, + "loss": 0.5801, + "step": 1662 + }, + { + "epoch": 0.6652, + "grad_norm": 0.33850927670740566, + "learning_rate": 5.3249958603614305e-05, + "loss": 0.5739, + "step": 1663 + }, + { + "epoch": 0.6656, + "grad_norm": 0.351094793513127, + "learning_rate": 5.3135476404659366e-05, + "loss": 0.5978, + "step": 1664 + }, + { + "epoch": 0.666, + "grad_norm": 0.335916258589442, + "learning_rate": 5.302107285963045e-05, + "loss": 0.5728, + "step": 1665 + }, + { + "epoch": 0.6664, + "grad_norm": 0.30513854130361645, + "learning_rate": 5.290674816053389e-05, + "loss": 0.5611, + "step": 1666 + }, + { + "epoch": 0.6668, + "grad_norm": 0.35943567473275423, + "learning_rate": 5.279250249924383e-05, + "loss": 0.6203, + "step": 1667 + }, + { + "epoch": 0.6672, + "grad_norm": 0.3428927765200386, + "learning_rate": 5.26783360675016e-05, + "loss": 0.6008, + "step": 1668 + }, + { + "epoch": 0.6676, + "grad_norm": 0.35017777266739336, + "learning_rate": 5.25642490569157e-05, + "loss": 0.6162, + "step": 1669 + }, + { + "epoch": 0.668, + "grad_norm": 0.3885116261542779, + "learning_rate": 5.245024165896126e-05, + "loss": 0.6497, + "step": 1670 + }, + { + "epoch": 0.6684, + "grad_norm": 0.3400541914465486, + "learning_rate": 5.233631406497976e-05, + "loss": 0.6133, + "step": 1671 + }, + { + "epoch": 0.6688, + "grad_norm": 0.3275659916402285, + "learning_rate": 5.222246646617886e-05, + "loss": 0.5754, + "step": 1672 + }, + { + "epoch": 0.6692, + "grad_norm": 0.3630102866461434, + "learning_rate": 5.2108699053631784e-05, + "loss": 0.5595, + "step": 1673 + }, + { + "epoch": 0.6696, + "grad_norm": 0.36478597471794466, + "learning_rate": 5.199501201827741e-05, + "loss": 0.5864, + "step": 1674 + }, + { + "epoch": 0.67, + "grad_norm": 0.36960611381067765, + "learning_rate": 5.1881405550919493e-05, + "loss": 0.6221, + "step": 1675 + }, + { + "epoch": 0.6704, + "grad_norm": 0.3450897290147815, + "learning_rate": 5.176787984222674e-05, + "loss": 0.6098, + "step": 1676 + }, + { + "epoch": 0.6708, + "grad_norm": 0.3821327690534526, + "learning_rate": 5.1654435082732175e-05, + "loss": 0.6306, + "step": 1677 + }, + { + "epoch": 0.6712, + "grad_norm": 0.3413146874905284, + "learning_rate": 5.1541071462833115e-05, + "loss": 0.6033, + "step": 1678 + }, + { + "epoch": 0.6716, + "grad_norm": 0.3553980761339589, + "learning_rate": 5.1427789172790566e-05, + "loss": 0.6723, + "step": 1679 + }, + { + "epoch": 0.672, + "grad_norm": 0.34657476100242796, + "learning_rate": 5.1314588402729044e-05, + "loss": 0.6361, + "step": 1680 + }, + { + "epoch": 0.6724, + "grad_norm": 0.3495602397264736, + "learning_rate": 5.120146934263638e-05, + "loss": 0.6052, + "step": 1681 + }, + { + "epoch": 0.6728, + "grad_norm": 0.3308800927928125, + "learning_rate": 5.10884321823631e-05, + "loss": 0.5832, + "step": 1682 + }, + { + "epoch": 0.6732, + "grad_norm": 0.34180578026316344, + "learning_rate": 5.0975477111622426e-05, + "loss": 0.6362, + "step": 1683 + }, + { + "epoch": 0.6736, + "grad_norm": 0.316429545263734, + "learning_rate": 5.086260431998967e-05, + "loss": 0.5789, + "step": 1684 + }, + { + "epoch": 0.674, + "grad_norm": 0.3473239200277124, + "learning_rate": 5.074981399690218e-05, + "loss": 0.6228, + "step": 1685 + }, + { + "epoch": 0.6744, + "grad_norm": 0.3212290043833729, + "learning_rate": 5.063710633165881e-05, + "loss": 0.5681, + "step": 1686 + }, + { + "epoch": 0.6748, + "grad_norm": 0.3376077723916017, + "learning_rate": 5.052448151341967e-05, + "loss": 0.6022, + "step": 1687 + }, + { + "epoch": 0.6752, + "grad_norm": 0.34382234786732874, + "learning_rate": 5.0411939731205946e-05, + "loss": 0.5896, + "step": 1688 + }, + { + "epoch": 0.6756, + "grad_norm": 0.334168296603689, + "learning_rate": 5.0299481173899296e-05, + "loss": 0.6305, + "step": 1689 + }, + { + "epoch": 0.676, + "grad_norm": 0.32108508408255027, + "learning_rate": 5.018710603024187e-05, + "loss": 0.568, + "step": 1690 + }, + { + "epoch": 0.6764, + "grad_norm": 0.337426417658709, + "learning_rate": 5.0074814488835665e-05, + "loss": 0.6087, + "step": 1691 + }, + { + "epoch": 0.6768, + "grad_norm": 0.3325197469675423, + "learning_rate": 4.99626067381425e-05, + "loss": 0.5546, + "step": 1692 + }, + { + "epoch": 0.6772, + "grad_norm": 0.3344666617338685, + "learning_rate": 4.9850482966483455e-05, + "loss": 0.5597, + "step": 1693 + }, + { + "epoch": 0.6776, + "grad_norm": 0.35372497641394407, + "learning_rate": 4.973844336203879e-05, + "loss": 0.6546, + "step": 1694 + }, + { + "epoch": 0.678, + "grad_norm": 0.3461997501455237, + "learning_rate": 4.962648811284738e-05, + "loss": 0.6123, + "step": 1695 + }, + { + "epoch": 0.6784, + "grad_norm": 0.35165958624702814, + "learning_rate": 4.951461740680655e-05, + "loss": 0.6477, + "step": 1696 + }, + { + "epoch": 0.6788, + "grad_norm": 0.3395316114162352, + "learning_rate": 4.9402831431671834e-05, + "loss": 0.5678, + "step": 1697 + }, + { + "epoch": 0.6792, + "grad_norm": 0.3291753734724247, + "learning_rate": 4.929113037505641e-05, + "loss": 0.5985, + "step": 1698 + }, + { + "epoch": 0.6796, + "grad_norm": 0.36381547209597087, + "learning_rate": 4.91795144244311e-05, + "loss": 0.6167, + "step": 1699 + }, + { + "epoch": 0.68, + "grad_norm": 0.32974021690173844, + "learning_rate": 4.9067983767123736e-05, + "loss": 0.5671, + "step": 1700 + }, + { + "epoch": 0.6804, + "grad_norm": 0.3320824590238343, + "learning_rate": 4.8956538590319055e-05, + "loss": 0.5249, + "step": 1701 + }, + { + "epoch": 0.6808, + "grad_norm": 0.3419505556167843, + "learning_rate": 4.884517908105837e-05, + "loss": 0.5919, + "step": 1702 + }, + { + "epoch": 0.6812, + "grad_norm": 0.3397681816840063, + "learning_rate": 4.873390542623922e-05, + "loss": 0.5519, + "step": 1703 + }, + { + "epoch": 0.6816, + "grad_norm": 0.3875580450426956, + "learning_rate": 4.8622717812615e-05, + "loss": 0.653, + "step": 1704 + }, + { + "epoch": 0.682, + "grad_norm": 0.37924891875909367, + "learning_rate": 4.851161642679466e-05, + "loss": 0.5606, + "step": 1705 + }, + { + "epoch": 0.6824, + "grad_norm": 0.368679004842044, + "learning_rate": 4.840060145524254e-05, + "loss": 0.676, + "step": 1706 + }, + { + "epoch": 0.6828, + "grad_norm": 0.3276517811322601, + "learning_rate": 4.8289673084277954e-05, + "loss": 0.5866, + "step": 1707 + }, + { + "epoch": 0.6832, + "grad_norm": 0.35101894023359137, + "learning_rate": 4.817883150007474e-05, + "loss": 0.6115, + "step": 1708 + }, + { + "epoch": 0.6836, + "grad_norm": 0.34967835358066, + "learning_rate": 4.80680768886612e-05, + "loss": 0.6214, + "step": 1709 + }, + { + "epoch": 0.684, + "grad_norm": 0.337382406007058, + "learning_rate": 4.795740943591955e-05, + "loss": 0.6163, + "step": 1710 + }, + { + "epoch": 0.6844, + "grad_norm": 0.3232351705102931, + "learning_rate": 4.7846829327585876e-05, + "loss": 0.5935, + "step": 1711 + }, + { + "epoch": 0.6848, + "grad_norm": 0.32085864274891623, + "learning_rate": 4.77363367492496e-05, + "loss": 0.5997, + "step": 1712 + }, + { + "epoch": 0.6852, + "grad_norm": 0.33468587225981816, + "learning_rate": 4.762593188635321e-05, + "loss": 0.6233, + "step": 1713 + }, + { + "epoch": 0.6856, + "grad_norm": 0.39442272618975327, + "learning_rate": 4.751561492419202e-05, + "loss": 0.6117, + "step": 1714 + }, + { + "epoch": 0.686, + "grad_norm": 0.3461264133463066, + "learning_rate": 4.74053860479137e-05, + "loss": 0.5971, + "step": 1715 + }, + { + "epoch": 0.6864, + "grad_norm": 0.34582394392278937, + "learning_rate": 4.729524544251837e-05, + "loss": 0.6283, + "step": 1716 + }, + { + "epoch": 0.6868, + "grad_norm": 0.3709121160353713, + "learning_rate": 4.718519329285771e-05, + "loss": 0.612, + "step": 1717 + }, + { + "epoch": 0.6872, + "grad_norm": 0.3450289858246148, + "learning_rate": 4.707522978363508e-05, + "loss": 0.6151, + "step": 1718 + }, + { + "epoch": 0.6876, + "grad_norm": 0.3554537104560121, + "learning_rate": 4.696535509940498e-05, + "loss": 0.5926, + "step": 1719 + }, + { + "epoch": 0.688, + "grad_norm": 0.3700955022172051, + "learning_rate": 4.6855569424572955e-05, + "loss": 0.5758, + "step": 1720 + }, + { + "epoch": 0.6884, + "grad_norm": 0.3521617004846447, + "learning_rate": 4.674587294339513e-05, + "loss": 0.6001, + "step": 1721 + }, + { + "epoch": 0.6888, + "grad_norm": 0.37920595064499485, + "learning_rate": 4.663626583997789e-05, + "loss": 0.6172, + "step": 1722 + }, + { + "epoch": 0.6892, + "grad_norm": 0.3311674718568459, + "learning_rate": 4.652674829827761e-05, + "loss": 0.5296, + "step": 1723 + }, + { + "epoch": 0.6896, + "grad_norm": 0.34088698548975355, + "learning_rate": 4.6417320502100316e-05, + "loss": 0.6063, + "step": 1724 + }, + { + "epoch": 0.69, + "grad_norm": 0.33469346316607584, + "learning_rate": 4.630798263510162e-05, + "loss": 0.5783, + "step": 1725 + }, + { + "epoch": 0.6904, + "grad_norm": 0.34441486420204004, + "learning_rate": 4.6198734880785965e-05, + "loss": 0.5876, + "step": 1726 + }, + { + "epoch": 0.6908, + "grad_norm": 0.3521399585532281, + "learning_rate": 4.608957742250667e-05, + "loss": 0.5987, + "step": 1727 + }, + { + "epoch": 0.6912, + "grad_norm": 0.3326145599750792, + "learning_rate": 4.598051044346542e-05, + "loss": 0.5592, + "step": 1728 + }, + { + "epoch": 0.6916, + "grad_norm": 0.34024540042317375, + "learning_rate": 4.587153412671217e-05, + "loss": 0.5718, + "step": 1729 + }, + { + "epoch": 0.692, + "grad_norm": 0.33676043102294895, + "learning_rate": 4.5762648655144666e-05, + "loss": 0.6207, + "step": 1730 + }, + { + "epoch": 0.6924, + "grad_norm": 0.3585715026248596, + "learning_rate": 4.565385421150816e-05, + "loss": 0.5834, + "step": 1731 + }, + { + "epoch": 0.6928, + "grad_norm": 0.33116888316468274, + "learning_rate": 4.55451509783951e-05, + "loss": 0.5826, + "step": 1732 + }, + { + "epoch": 0.6932, + "grad_norm": 0.3244130276411306, + "learning_rate": 4.543653913824496e-05, + "loss": 0.5582, + "step": 1733 + }, + { + "epoch": 0.6936, + "grad_norm": 0.35764178230441473, + "learning_rate": 4.53280188733437e-05, + "loss": 0.6015, + "step": 1734 + }, + { + "epoch": 0.694, + "grad_norm": 0.34451089875575613, + "learning_rate": 4.5219590365823714e-05, + "loss": 0.5741, + "step": 1735 + }, + { + "epoch": 0.6944, + "grad_norm": 0.33862571879533104, + "learning_rate": 4.511125379766331e-05, + "loss": 0.6311, + "step": 1736 + }, + { + "epoch": 0.6948, + "grad_norm": 0.35252409439730675, + "learning_rate": 4.5003009350686474e-05, + "loss": 0.6392, + "step": 1737 + }, + { + "epoch": 0.6952, + "grad_norm": 0.3598745433163055, + "learning_rate": 4.489485720656266e-05, + "loss": 0.6006, + "step": 1738 + }, + { + "epoch": 0.6956, + "grad_norm": 0.3374253785220396, + "learning_rate": 4.478679754680639e-05, + "loss": 0.5838, + "step": 1739 + }, + { + "epoch": 0.696, + "grad_norm": 0.3342818500271891, + "learning_rate": 4.467883055277695e-05, + "loss": 0.6056, + "step": 1740 + }, + { + "epoch": 0.6964, + "grad_norm": 0.37278807358701976, + "learning_rate": 4.457095640567803e-05, + "loss": 0.6245, + "step": 1741 + }, + { + "epoch": 0.6968, + "grad_norm": 0.3601495067664831, + "learning_rate": 4.446317528655766e-05, + "loss": 0.6093, + "step": 1742 + }, + { + "epoch": 0.6972, + "grad_norm": 0.34751045601642516, + "learning_rate": 4.435548737630756e-05, + "loss": 0.607, + "step": 1743 + }, + { + "epoch": 0.6976, + "grad_norm": 0.3435708716862839, + "learning_rate": 4.424789285566316e-05, + "loss": 0.5987, + "step": 1744 + }, + { + "epoch": 0.698, + "grad_norm": 0.33328993609492175, + "learning_rate": 4.414039190520308e-05, + "loss": 0.6102, + "step": 1745 + }, + { + "epoch": 0.6984, + "grad_norm": 0.32839634033697485, + "learning_rate": 4.4032984705348845e-05, + "loss": 0.5584, + "step": 1746 + }, + { + "epoch": 0.6988, + "grad_norm": 0.3260876866685833, + "learning_rate": 4.3925671436364804e-05, + "loss": 0.6048, + "step": 1747 + }, + { + "epoch": 0.6992, + "grad_norm": 0.33469728950785477, + "learning_rate": 4.3818452278357445e-05, + "loss": 0.5648, + "step": 1748 + }, + { + "epoch": 0.6996, + "grad_norm": 0.3412017526101026, + "learning_rate": 4.371132741127553e-05, + "loss": 0.5905, + "step": 1749 + }, + { + "epoch": 0.7, + "grad_norm": 0.3983993751410565, + "learning_rate": 4.360429701490934e-05, + "loss": 0.5768, + "step": 1750 + }, + { + "epoch": 0.7004, + "grad_norm": 0.3611546245259949, + "learning_rate": 4.3497361268890834e-05, + "loss": 0.641, + "step": 1751 + }, + { + "epoch": 0.7008, + "grad_norm": 0.34650976591793875, + "learning_rate": 4.339052035269291e-05, + "loss": 0.6096, + "step": 1752 + }, + { + "epoch": 0.7012, + "grad_norm": 0.34040200369753926, + "learning_rate": 4.328377444562948e-05, + "loss": 0.5889, + "step": 1753 + }, + { + "epoch": 0.7016, + "grad_norm": 0.3665520290469235, + "learning_rate": 4.3177123726854896e-05, + "loss": 0.6222, + "step": 1754 + }, + { + "epoch": 0.702, + "grad_norm": 0.34719181249410996, + "learning_rate": 4.307056837536373e-05, + "loss": 0.5843, + "step": 1755 + }, + { + "epoch": 0.7024, + "grad_norm": 0.3409310044908438, + "learning_rate": 4.296410856999062e-05, + "loss": 0.5861, + "step": 1756 + }, + { + "epoch": 0.7028, + "grad_norm": 0.3281722642151921, + "learning_rate": 4.285774448940972e-05, + "loss": 0.567, + "step": 1757 + }, + { + "epoch": 0.7032, + "grad_norm": 0.34192081092616117, + "learning_rate": 4.275147631213465e-05, + "loss": 0.6002, + "step": 1758 + }, + { + "epoch": 0.7036, + "grad_norm": 0.3337898835715986, + "learning_rate": 4.2645304216517926e-05, + "loss": 0.5795, + "step": 1759 + }, + { + "epoch": 0.704, + "grad_norm": 0.358841520504681, + "learning_rate": 4.253922838075095e-05, + "loss": 0.6217, + "step": 1760 + }, + { + "epoch": 0.7044, + "grad_norm": 0.33134214333215883, + "learning_rate": 4.243324898286348e-05, + "loss": 0.5686, + "step": 1761 + }, + { + "epoch": 0.7048, + "grad_norm": 0.35510024493667014, + "learning_rate": 4.232736620072341e-05, + "loss": 0.5418, + "step": 1762 + }, + { + "epoch": 0.7052, + "grad_norm": 0.3230400921160106, + "learning_rate": 4.222158021203657e-05, + "loss": 0.6214, + "step": 1763 + }, + { + "epoch": 0.7056, + "grad_norm": 0.33404401976266973, + "learning_rate": 4.2115891194346224e-05, + "loss": 0.6302, + "step": 1764 + }, + { + "epoch": 0.706, + "grad_norm": 0.3624898248699408, + "learning_rate": 4.2010299325033034e-05, + "loss": 0.6381, + "step": 1765 + }, + { + "epoch": 0.7064, + "grad_norm": 0.33125456052914526, + "learning_rate": 4.1904804781314436e-05, + "loss": 0.6065, + "step": 1766 + }, + { + "epoch": 0.7068, + "grad_norm": 0.3291357243560303, + "learning_rate": 4.179940774024469e-05, + "loss": 0.5854, + "step": 1767 + }, + { + "epoch": 0.7072, + "grad_norm": 0.32261390402100604, + "learning_rate": 4.169410837871427e-05, + "loss": 0.5558, + "step": 1768 + }, + { + "epoch": 0.7076, + "grad_norm": 0.3321667286365183, + "learning_rate": 4.158890687344986e-05, + "loss": 0.5444, + "step": 1769 + }, + { + "epoch": 0.708, + "grad_norm": 0.33531451395956424, + "learning_rate": 4.1483803401013796e-05, + "loss": 0.5699, + "step": 1770 + }, + { + "epoch": 0.7084, + "grad_norm": 0.35701748113397946, + "learning_rate": 4.137879813780388e-05, + "loss": 0.5794, + "step": 1771 + }, + { + "epoch": 0.7088, + "grad_norm": 0.33230272416291334, + "learning_rate": 4.127389126005319e-05, + "loss": 0.5888, + "step": 1772 + }, + { + "epoch": 0.7092, + "grad_norm": 0.3468405819430816, + "learning_rate": 4.116908294382955e-05, + "loss": 0.604, + "step": 1773 + }, + { + "epoch": 0.7096, + "grad_norm": 0.33183066782789583, + "learning_rate": 4.10643733650355e-05, + "loss": 0.6377, + "step": 1774 + }, + { + "epoch": 0.71, + "grad_norm": 0.34181576143335957, + "learning_rate": 4.0959762699407766e-05, + "loss": 0.5729, + "step": 1775 + }, + { + "epoch": 0.7104, + "grad_norm": 0.36846919900828995, + "learning_rate": 4.0855251122517056e-05, + "loss": 0.6643, + "step": 1776 + }, + { + "epoch": 0.7108, + "grad_norm": 0.3434043943463887, + "learning_rate": 4.0750838809767875e-05, + "loss": 0.631, + "step": 1777 + }, + { + "epoch": 0.7112, + "grad_norm": 0.33783837112536425, + "learning_rate": 4.064652593639808e-05, + "loss": 0.5896, + "step": 1778 + }, + { + "epoch": 0.7116, + "grad_norm": 0.35646109372554274, + "learning_rate": 4.0542312677478614e-05, + "loss": 0.6301, + "step": 1779 + }, + { + "epoch": 0.712, + "grad_norm": 0.3322128925737181, + "learning_rate": 4.043819920791322e-05, + "loss": 0.5699, + "step": 1780 + }, + { + "epoch": 0.7124, + "grad_norm": 0.3401371606977615, + "learning_rate": 4.0334185702438185e-05, + "loss": 0.5688, + "step": 1781 + }, + { + "epoch": 0.7128, + "grad_norm": 0.3777176438153494, + "learning_rate": 4.0230272335622064e-05, + "loss": 0.6604, + "step": 1782 + }, + { + "epoch": 0.7132, + "grad_norm": 0.32841489423285625, + "learning_rate": 4.012645928186533e-05, + "loss": 0.5964, + "step": 1783 + }, + { + "epoch": 0.7136, + "grad_norm": 0.34963614464945214, + "learning_rate": 4.002274671540006e-05, + "loss": 0.627, + "step": 1784 + }, + { + "epoch": 0.714, + "grad_norm": 0.3869157745953258, + "learning_rate": 3.991913481028965e-05, + "loss": 0.6071, + "step": 1785 + }, + { + "epoch": 0.7144, + "grad_norm": 0.37521248581587024, + "learning_rate": 3.981562374042866e-05, + "loss": 0.6436, + "step": 1786 + }, + { + "epoch": 0.7148, + "grad_norm": 0.32794514427064697, + "learning_rate": 3.9712213679542385e-05, + "loss": 0.5686, + "step": 1787 + }, + { + "epoch": 0.7152, + "grad_norm": 0.3344873846850927, + "learning_rate": 3.960890480118653e-05, + "loss": 0.5606, + "step": 1788 + }, + { + "epoch": 0.7156, + "grad_norm": 0.3545953735117356, + "learning_rate": 3.950569727874703e-05, + "loss": 0.6344, + "step": 1789 + }, + { + "epoch": 0.716, + "grad_norm": 0.3552982691824513, + "learning_rate": 3.940259128543967e-05, + "loss": 0.5792, + "step": 1790 + }, + { + "epoch": 0.7164, + "grad_norm": 0.3398457451931291, + "learning_rate": 3.92995869943099e-05, + "loss": 0.6545, + "step": 1791 + }, + { + "epoch": 0.7168, + "grad_norm": 0.3452332071130584, + "learning_rate": 3.9196684578232476e-05, + "loss": 0.6335, + "step": 1792 + }, + { + "epoch": 0.7172, + "grad_norm": 0.32510328040183545, + "learning_rate": 3.9093884209911134e-05, + "loss": 0.585, + "step": 1793 + }, + { + "epoch": 0.7176, + "grad_norm": 0.34076165773645006, + "learning_rate": 3.8991186061878314e-05, + "loss": 0.5706, + "step": 1794 + }, + { + "epoch": 0.718, + "grad_norm": 0.3421915886836552, + "learning_rate": 3.8888590306494974e-05, + "loss": 0.5821, + "step": 1795 + }, + { + "epoch": 0.7184, + "grad_norm": 0.36172255238718126, + "learning_rate": 3.8786097115950214e-05, + "loss": 0.6032, + "step": 1796 + }, + { + "epoch": 0.7188, + "grad_norm": 0.3353589426405682, + "learning_rate": 3.868370666226094e-05, + "loss": 0.5814, + "step": 1797 + }, + { + "epoch": 0.7192, + "grad_norm": 0.32584203607046014, + "learning_rate": 3.858141911727168e-05, + "loss": 0.5738, + "step": 1798 + }, + { + "epoch": 0.7196, + "grad_norm": 0.34155613523147377, + "learning_rate": 3.8479234652654175e-05, + "loss": 0.5847, + "step": 1799 + }, + { + "epoch": 0.72, + "grad_norm": 0.36998435870649954, + "learning_rate": 3.8377153439907266e-05, + "loss": 0.5652, + "step": 1800 + }, + { + "epoch": 0.7204, + "grad_norm": 0.34777455503922344, + "learning_rate": 3.8275175650356485e-05, + "loss": 0.6085, + "step": 1801 + }, + { + "epoch": 0.7208, + "grad_norm": 0.33243987266833946, + "learning_rate": 3.817330145515374e-05, + "loss": 0.5815, + "step": 1802 + }, + { + "epoch": 0.7212, + "grad_norm": 0.3911085821400549, + "learning_rate": 3.807153102527704e-05, + "loss": 0.5978, + "step": 1803 + }, + { + "epoch": 0.7216, + "grad_norm": 0.3707021613874098, + "learning_rate": 3.7969864531530344e-05, + "loss": 0.5883, + "step": 1804 + }, + { + "epoch": 0.722, + "grad_norm": 0.3798153645094958, + "learning_rate": 3.786830214454315e-05, + "loss": 0.6166, + "step": 1805 + }, + { + "epoch": 0.7224, + "grad_norm": 0.34308854005452744, + "learning_rate": 3.776684403477015e-05, + "loss": 0.5656, + "step": 1806 + }, + { + "epoch": 0.7228, + "grad_norm": 0.36171966315223114, + "learning_rate": 3.766549037249112e-05, + "loss": 0.6234, + "step": 1807 + }, + { + "epoch": 0.7232, + "grad_norm": 0.34254198497350036, + "learning_rate": 3.756424132781043e-05, + "loss": 0.561, + "step": 1808 + }, + { + "epoch": 0.7236, + "grad_norm": 0.3493856315987089, + "learning_rate": 3.7463097070657e-05, + "loss": 0.589, + "step": 1809 + }, + { + "epoch": 0.724, + "grad_norm": 0.3294289895291501, + "learning_rate": 3.736205777078381e-05, + "loss": 0.5617, + "step": 1810 + }, + { + "epoch": 0.7244, + "grad_norm": 0.38191187657568854, + "learning_rate": 3.72611235977677e-05, + "loss": 0.5502, + "step": 1811 + }, + { + "epoch": 0.7248, + "grad_norm": 0.7463582450918105, + "learning_rate": 3.716029472100903e-05, + "loss": 0.6148, + "step": 1812 + }, + { + "epoch": 0.7252, + "grad_norm": 0.3627308277690777, + "learning_rate": 3.705957130973149e-05, + "loss": 0.6106, + "step": 1813 + }, + { + "epoch": 0.7256, + "grad_norm": 0.3509859534297781, + "learning_rate": 3.69589535329818e-05, + "loss": 0.5607, + "step": 1814 + }, + { + "epoch": 0.726, + "grad_norm": 0.3570227999609146, + "learning_rate": 3.6858441559629306e-05, + "loss": 0.5917, + "step": 1815 + }, + { + "epoch": 0.7264, + "grad_norm": 0.3211693703387772, + "learning_rate": 3.6758035558365825e-05, + "loss": 0.6232, + "step": 1816 + }, + { + "epoch": 0.7268, + "grad_norm": 0.3297213799425913, + "learning_rate": 3.665773569770526e-05, + "loss": 0.559, + "step": 1817 + }, + { + "epoch": 0.7272, + "grad_norm": 0.34321841832562183, + "learning_rate": 3.655754214598349e-05, + "loss": 0.5917, + "step": 1818 + }, + { + "epoch": 0.7276, + "grad_norm": 0.3439005678265292, + "learning_rate": 3.6457455071357916e-05, + "loss": 0.6591, + "step": 1819 + }, + { + "epoch": 0.728, + "grad_norm": 0.32660589437964493, + "learning_rate": 3.63574746418072e-05, + "loss": 0.5578, + "step": 1820 + }, + { + "epoch": 0.7284, + "grad_norm": 0.3443583477589589, + "learning_rate": 3.6257601025131026e-05, + "loss": 0.6174, + "step": 1821 + }, + { + "epoch": 0.7288, + "grad_norm": 0.3866299288827764, + "learning_rate": 3.615783438894991e-05, + "loss": 0.6492, + "step": 1822 + }, + { + "epoch": 0.7292, + "grad_norm": 0.392867925507431, + "learning_rate": 3.605817490070464e-05, + "loss": 0.6031, + "step": 1823 + }, + { + "epoch": 0.7296, + "grad_norm": 0.34054806404404225, + "learning_rate": 3.595862272765638e-05, + "loss": 0.5955, + "step": 1824 + }, + { + "epoch": 0.73, + "grad_norm": 0.36146903190102, + "learning_rate": 3.585917803688603e-05, + "loss": 0.5402, + "step": 1825 + }, + { + "epoch": 0.7304, + "grad_norm": 0.3420451815961708, + "learning_rate": 3.575984099529414e-05, + "loss": 0.6383, + "step": 1826 + }, + { + "epoch": 0.7308, + "grad_norm": 0.3163032617663247, + "learning_rate": 3.56606117696006e-05, + "loss": 0.5998, + "step": 1827 + }, + { + "epoch": 0.7312, + "grad_norm": 0.3507263830427564, + "learning_rate": 3.556149052634443e-05, + "loss": 0.5836, + "step": 1828 + }, + { + "epoch": 0.7316, + "grad_norm": 0.34326890644825564, + "learning_rate": 3.546247743188328e-05, + "loss": 0.5431, + "step": 1829 + }, + { + "epoch": 0.732, + "grad_norm": 0.33205724492765964, + "learning_rate": 3.5363572652393326e-05, + "loss": 0.6209, + "step": 1830 + }, + { + "epoch": 0.7324, + "grad_norm": 0.3166654437001939, + "learning_rate": 3.526477635386904e-05, + "loss": 0.559, + "step": 1831 + }, + { + "epoch": 0.7328, + "grad_norm": 0.3514293830851197, + "learning_rate": 3.5166088702122736e-05, + "loss": 0.6157, + "step": 1832 + }, + { + "epoch": 0.7332, + "grad_norm": 0.34952287180088665, + "learning_rate": 3.5067509862784454e-05, + "loss": 0.5588, + "step": 1833 + }, + { + "epoch": 0.7336, + "grad_norm": 0.37518819397486813, + "learning_rate": 3.496904000130151e-05, + "loss": 0.5552, + "step": 1834 + }, + { + "epoch": 0.734, + "grad_norm": 0.3316850789388843, + "learning_rate": 3.487067928293848e-05, + "loss": 0.6146, + "step": 1835 + }, + { + "epoch": 0.7344, + "grad_norm": 0.3422435199619646, + "learning_rate": 3.47724278727766e-05, + "loss": 0.6564, + "step": 1836 + }, + { + "epoch": 0.7348, + "grad_norm": 0.33318729202355385, + "learning_rate": 3.467428593571371e-05, + "loss": 0.5803, + "step": 1837 + }, + { + "epoch": 0.7352, + "grad_norm": 0.3548680033208881, + "learning_rate": 3.457625363646399e-05, + "loss": 0.6029, + "step": 1838 + }, + { + "epoch": 0.7356, + "grad_norm": 0.3668029311270937, + "learning_rate": 3.447833113955748e-05, + "loss": 0.6209, + "step": 1839 + }, + { + "epoch": 0.736, + "grad_norm": 0.37981508010159476, + "learning_rate": 3.4380518609340076e-05, + "loss": 0.6044, + "step": 1840 + }, + { + "epoch": 0.7364, + "grad_norm": 0.31997437312829297, + "learning_rate": 3.4282816209972956e-05, + "loss": 0.6278, + "step": 1841 + }, + { + "epoch": 0.7368, + "grad_norm": 0.3719363008912083, + "learning_rate": 3.4185224105432656e-05, + "loss": 0.6127, + "step": 1842 + }, + { + "epoch": 0.7372, + "grad_norm": 0.35338133590475707, + "learning_rate": 3.40877424595104e-05, + "loss": 0.5807, + "step": 1843 + }, + { + "epoch": 0.7376, + "grad_norm": 0.34220788950704023, + "learning_rate": 3.3990371435812187e-05, + "loss": 0.5847, + "step": 1844 + }, + { + "epoch": 0.738, + "grad_norm": 0.3466991009204236, + "learning_rate": 3.389311119775828e-05, + "loss": 0.5966, + "step": 1845 + }, + { + "epoch": 0.7384, + "grad_norm": 0.38253275922104424, + "learning_rate": 3.379596190858296e-05, + "loss": 0.6273, + "step": 1846 + }, + { + "epoch": 0.7388, + "grad_norm": 0.35428621430519175, + "learning_rate": 3.3698923731334453e-05, + "loss": 0.5979, + "step": 1847 + }, + { + "epoch": 0.7392, + "grad_norm": 0.3364181606731447, + "learning_rate": 3.3601996828874326e-05, + "loss": 0.6358, + "step": 1848 + }, + { + "epoch": 0.7396, + "grad_norm": 0.3383820336858019, + "learning_rate": 3.3505181363877535e-05, + "loss": 0.5772, + "step": 1849 + }, + { + "epoch": 0.74, + "grad_norm": 0.3567661646983929, + "learning_rate": 3.340847749883191e-05, + "loss": 0.6294, + "step": 1850 + }, + { + "epoch": 0.7404, + "grad_norm": 0.35059920283725304, + "learning_rate": 3.3311885396038e-05, + "loss": 0.5829, + "step": 1851 + }, + { + "epoch": 0.7408, + "grad_norm": 0.3359809129607562, + "learning_rate": 3.321540521760883e-05, + "loss": 0.619, + "step": 1852 + }, + { + "epoch": 0.7412, + "grad_norm": 0.35364262160446985, + "learning_rate": 3.3119037125469554e-05, + "loss": 0.5866, + "step": 1853 + }, + { + "epoch": 0.7416, + "grad_norm": 0.3368427844750738, + "learning_rate": 3.3022781281357186e-05, + "loss": 0.525, + "step": 1854 + }, + { + "epoch": 0.742, + "grad_norm": 0.34779304005488076, + "learning_rate": 3.292663784682036e-05, + "loss": 0.5958, + "step": 1855 + }, + { + "epoch": 0.7424, + "grad_norm": 0.3610464453397608, + "learning_rate": 3.2830606983219033e-05, + "loss": 0.5725, + "step": 1856 + }, + { + "epoch": 0.7428, + "grad_norm": 0.34493718175933263, + "learning_rate": 3.2734688851724274e-05, + "loss": 0.5806, + "step": 1857 + }, + { + "epoch": 0.7432, + "grad_norm": 0.34079854407821586, + "learning_rate": 3.2638883613317974e-05, + "loss": 0.5926, + "step": 1858 + }, + { + "epoch": 0.7436, + "grad_norm": 0.3374882582749612, + "learning_rate": 3.2543191428792465e-05, + "loss": 0.6039, + "step": 1859 + }, + { + "epoch": 0.744, + "grad_norm": 0.3423983829033276, + "learning_rate": 3.2447612458750365e-05, + "loss": 0.5932, + "step": 1860 + }, + { + "epoch": 0.7444, + "grad_norm": 0.36799890179009165, + "learning_rate": 3.235214686360432e-05, + "loss": 0.6211, + "step": 1861 + }, + { + "epoch": 0.7448, + "grad_norm": 0.39461159177466326, + "learning_rate": 3.2256794803576704e-05, + "loss": 0.6198, + "step": 1862 + }, + { + "epoch": 0.7452, + "grad_norm": 0.3395489404640023, + "learning_rate": 3.21615564386993e-05, + "loss": 0.5558, + "step": 1863 + }, + { + "epoch": 0.7456, + "grad_norm": 0.3431196916316968, + "learning_rate": 3.206643192881307e-05, + "loss": 0.5663, + "step": 1864 + }, + { + "epoch": 0.746, + "grad_norm": 0.34561539999980645, + "learning_rate": 3.197142143356787e-05, + "loss": 0.5618, + "step": 1865 + }, + { + "epoch": 0.7464, + "grad_norm": 0.3343052053046587, + "learning_rate": 3.1876525112422286e-05, + "loss": 0.5943, + "step": 1866 + }, + { + "epoch": 0.7468, + "grad_norm": 0.35098310618615475, + "learning_rate": 3.178174312464326e-05, + "loss": 0.6065, + "step": 1867 + }, + { + "epoch": 0.7472, + "grad_norm": 0.3766296479216891, + "learning_rate": 3.1687075629305786e-05, + "loss": 0.5428, + "step": 1868 + }, + { + "epoch": 0.7476, + "grad_norm": 0.3501028408374377, + "learning_rate": 3.159252278529271e-05, + "loss": 0.5944, + "step": 1869 + }, + { + "epoch": 0.748, + "grad_norm": 0.33062667658635936, + "learning_rate": 3.149808475129452e-05, + "loss": 0.5581, + "step": 1870 + }, + { + "epoch": 0.7484, + "grad_norm": 0.35539241907167646, + "learning_rate": 3.140376168580901e-05, + "loss": 0.6214, + "step": 1871 + }, + { + "epoch": 0.7488, + "grad_norm": 0.3341801415583605, + "learning_rate": 3.130955374714094e-05, + "loss": 0.6086, + "step": 1872 + }, + { + "epoch": 0.7492, + "grad_norm": 0.3542411336205624, + "learning_rate": 3.121546109340191e-05, + "loss": 0.6504, + "step": 1873 + }, + { + "epoch": 0.7496, + "grad_norm": 0.34736361535409627, + "learning_rate": 3.112148388250999e-05, + "loss": 0.6532, + "step": 1874 + }, + { + "epoch": 0.75, + "grad_norm": 0.3810169767837099, + "learning_rate": 3.102762227218957e-05, + "loss": 0.6157, + "step": 1875 + }, + { + "epoch": 0.7504, + "grad_norm": 0.35087605354461415, + "learning_rate": 3.093387641997101e-05, + "loss": 0.5933, + "step": 1876 + }, + { + "epoch": 0.7508, + "grad_norm": 0.3185970759422581, + "learning_rate": 3.084024648319034e-05, + "loss": 0.5531, + "step": 1877 + }, + { + "epoch": 0.7512, + "grad_norm": 0.3669794767755098, + "learning_rate": 3.074673261898903e-05, + "loss": 0.6106, + "step": 1878 + }, + { + "epoch": 0.7516, + "grad_norm": 0.3496011247614334, + "learning_rate": 3.0653334984313806e-05, + "loss": 0.6039, + "step": 1879 + }, + { + "epoch": 0.752, + "grad_norm": 0.3222656267876488, + "learning_rate": 3.056005373591637e-05, + "loss": 0.5845, + "step": 1880 + }, + { + "epoch": 0.7524, + "grad_norm": 0.35643373977727705, + "learning_rate": 3.0466889030352973e-05, + "loss": 0.5773, + "step": 1881 + }, + { + "epoch": 0.7528, + "grad_norm": 0.3541552847448694, + "learning_rate": 3.0373841023984306e-05, + "loss": 0.6217, + "step": 1882 + }, + { + "epoch": 0.7532, + "grad_norm": 0.34768359118936737, + "learning_rate": 3.0280909872975193e-05, + "loss": 0.5923, + "step": 1883 + }, + { + "epoch": 0.7536, + "grad_norm": 0.32586377284940893, + "learning_rate": 3.0188095733294386e-05, + "loss": 0.5702, + "step": 1884 + }, + { + "epoch": 0.754, + "grad_norm": 0.36000759146785977, + "learning_rate": 3.0095398760714267e-05, + "loss": 0.6107, + "step": 1885 + }, + { + "epoch": 0.7544, + "grad_norm": 0.35031617975658125, + "learning_rate": 3.0002819110810475e-05, + "loss": 0.6063, + "step": 1886 + }, + { + "epoch": 0.7548, + "grad_norm": 0.7165571029926521, + "learning_rate": 2.9910356938961782e-05, + "loss": 0.5694, + "step": 1887 + }, + { + "epoch": 0.7552, + "grad_norm": 0.3615817705066542, + "learning_rate": 2.981801240034985e-05, + "loss": 0.5888, + "step": 1888 + }, + { + "epoch": 0.7556, + "grad_norm": 0.3516248634035183, + "learning_rate": 2.9725785649958892e-05, + "loss": 0.6212, + "step": 1889 + }, + { + "epoch": 0.756, + "grad_norm": 0.3355532898864876, + "learning_rate": 2.9633676842575387e-05, + "loss": 0.5598, + "step": 1890 + }, + { + "epoch": 0.7564, + "grad_norm": 0.370978793994998, + "learning_rate": 2.9541686132787905e-05, + "loss": 0.6965, + "step": 1891 + }, + { + "epoch": 0.7568, + "grad_norm": 0.34118286549897464, + "learning_rate": 2.944981367498677e-05, + "loss": 0.5536, + "step": 1892 + }, + { + "epoch": 0.7572, + "grad_norm": 0.35082758802996605, + "learning_rate": 2.93580596233639e-05, + "loss": 0.5757, + "step": 1893 + }, + { + "epoch": 0.7576, + "grad_norm": 0.35326206507041236, + "learning_rate": 2.9266424131912497e-05, + "loss": 0.5663, + "step": 1894 + }, + { + "epoch": 0.758, + "grad_norm": 0.358877536846838, + "learning_rate": 2.9174907354426696e-05, + "loss": 0.6088, + "step": 1895 + }, + { + "epoch": 0.7584, + "grad_norm": 0.36721273543101884, + "learning_rate": 2.9083509444501432e-05, + "loss": 0.5916, + "step": 1896 + }, + { + "epoch": 0.7588, + "grad_norm": 0.34884960136342424, + "learning_rate": 2.899223055553221e-05, + "loss": 0.5862, + "step": 1897 + }, + { + "epoch": 0.7592, + "grad_norm": 0.3315108311402396, + "learning_rate": 2.890107084071465e-05, + "loss": 0.5436, + "step": 1898 + }, + { + "epoch": 0.7596, + "grad_norm": 0.35488601916656104, + "learning_rate": 2.8810030453044478e-05, + "loss": 0.6222, + "step": 1899 + }, + { + "epoch": 0.76, + "grad_norm": 0.34667998487899204, + "learning_rate": 2.8719109545317103e-05, + "loss": 0.5571, + "step": 1900 + }, + { + "epoch": 0.7604, + "grad_norm": 0.33645514984848884, + "learning_rate": 2.8628308270127335e-05, + "loss": 0.5897, + "step": 1901 + }, + { + "epoch": 0.7608, + "grad_norm": 0.41888725813179317, + "learning_rate": 2.853762677986932e-05, + "loss": 0.61, + "step": 1902 + }, + { + "epoch": 0.7612, + "grad_norm": 0.33913218882422785, + "learning_rate": 2.844706522673616e-05, + "loss": 0.5864, + "step": 1903 + }, + { + "epoch": 0.7616, + "grad_norm": 0.36078459614959707, + "learning_rate": 2.835662376271957e-05, + "loss": 0.5586, + "step": 1904 + }, + { + "epoch": 0.762, + "grad_norm": 0.3841008489119458, + "learning_rate": 2.8266302539609745e-05, + "loss": 0.551, + "step": 1905 + }, + { + "epoch": 0.7624, + "grad_norm": 0.35889182134957104, + "learning_rate": 2.817610170899517e-05, + "loss": 0.603, + "step": 1906 + }, + { + "epoch": 0.7628, + "grad_norm": 0.3304056752254963, + "learning_rate": 2.8086021422262122e-05, + "loss": 0.59, + "step": 1907 + }, + { + "epoch": 0.7632, + "grad_norm": 0.3398864641550042, + "learning_rate": 2.7996061830594712e-05, + "loss": 0.5649, + "step": 1908 + }, + { + "epoch": 0.7636, + "grad_norm": 0.33997270611237673, + "learning_rate": 2.7906223084974403e-05, + "loss": 0.5774, + "step": 1909 + }, + { + "epoch": 0.764, + "grad_norm": 0.358760540859238, + "learning_rate": 2.7816505336179798e-05, + "loss": 0.6156, + "step": 1910 + }, + { + "epoch": 0.7644, + "grad_norm": 0.3324780598022909, + "learning_rate": 2.772690873478656e-05, + "loss": 0.5653, + "step": 1911 + }, + { + "epoch": 0.7648, + "grad_norm": 0.3545338643402483, + "learning_rate": 2.7637433431166903e-05, + "loss": 0.5645, + "step": 1912 + }, + { + "epoch": 0.7652, + "grad_norm": 0.3314663072625293, + "learning_rate": 2.754807957548955e-05, + "loss": 0.6198, + "step": 1913 + }, + { + "epoch": 0.7656, + "grad_norm": 0.33597655025069056, + "learning_rate": 2.7458847317719305e-05, + "loss": 0.5753, + "step": 1914 + }, + { + "epoch": 0.766, + "grad_norm": 0.5223302936269263, + "learning_rate": 2.736973680761702e-05, + "loss": 0.5595, + "step": 1915 + }, + { + "epoch": 0.7664, + "grad_norm": 0.35334424448491386, + "learning_rate": 2.728074819473908e-05, + "loss": 0.5968, + "step": 1916 + }, + { + "epoch": 0.7668, + "grad_norm": 0.31679051950561166, + "learning_rate": 2.7191881628437333e-05, + "loss": 0.5927, + "step": 1917 + }, + { + "epoch": 0.7672, + "grad_norm": 0.35835588969750193, + "learning_rate": 2.7103137257858868e-05, + "loss": 0.6096, + "step": 1918 + }, + { + "epoch": 0.7676, + "grad_norm": 0.39784975047738036, + "learning_rate": 2.7014515231945557e-05, + "loss": 0.6012, + "step": 1919 + }, + { + "epoch": 0.768, + "grad_norm": 0.3418078968959598, + "learning_rate": 2.6926015699434072e-05, + "loss": 0.6177, + "step": 1920 + }, + { + "epoch": 0.7684, + "grad_norm": 0.3278648428829672, + "learning_rate": 2.683763880885538e-05, + "loss": 0.5762, + "step": 1921 + }, + { + "epoch": 0.7688, + "grad_norm": 0.35906205351496273, + "learning_rate": 2.674938470853472e-05, + "loss": 0.5897, + "step": 1922 + }, + { + "epoch": 0.7692, + "grad_norm": 0.34729834198526843, + "learning_rate": 2.6661253546591157e-05, + "loss": 0.5416, + "step": 1923 + }, + { + "epoch": 0.7696, + "grad_norm": 0.35561882001773815, + "learning_rate": 2.6573245470937523e-05, + "loss": 0.6232, + "step": 1924 + }, + { + "epoch": 0.77, + "grad_norm": 0.345277417829143, + "learning_rate": 2.6485360629279987e-05, + "loss": 0.5967, + "step": 1925 + }, + { + "epoch": 0.7704, + "grad_norm": 0.4115697957958416, + "learning_rate": 2.639759916911788e-05, + "loss": 0.612, + "step": 1926 + }, + { + "epoch": 0.7708, + "grad_norm": 0.33966309723785676, + "learning_rate": 2.6309961237743585e-05, + "loss": 0.618, + "step": 1927 + }, + { + "epoch": 0.7712, + "grad_norm": 0.3302196082114761, + "learning_rate": 2.6222446982242e-05, + "loss": 0.5555, + "step": 1928 + }, + { + "epoch": 0.7716, + "grad_norm": 0.34908164626731897, + "learning_rate": 2.61350565494906e-05, + "loss": 0.5759, + "step": 1929 + }, + { + "epoch": 0.772, + "grad_norm": 0.3270119125339375, + "learning_rate": 2.6047790086158952e-05, + "loss": 0.5945, + "step": 1930 + }, + { + "epoch": 0.7724, + "grad_norm": 0.32270857553503557, + "learning_rate": 2.5960647738708555e-05, + "loss": 0.5769, + "step": 1931 + }, + { + "epoch": 0.7728, + "grad_norm": 0.4308707041004166, + "learning_rate": 2.587362965339265e-05, + "loss": 0.6266, + "step": 1932 + }, + { + "epoch": 0.7732, + "grad_norm": 0.3334837630486001, + "learning_rate": 2.5786735976255973e-05, + "loss": 0.5902, + "step": 1933 + }, + { + "epoch": 0.7736, + "grad_norm": 0.352081859909484, + "learning_rate": 2.5699966853134337e-05, + "loss": 0.5881, + "step": 1934 + }, + { + "epoch": 0.774, + "grad_norm": 0.4876058517345553, + "learning_rate": 2.5613322429654574e-05, + "loss": 0.5469, + "step": 1935 + }, + { + "epoch": 0.7744, + "grad_norm": 0.34472447926214234, + "learning_rate": 2.5526802851234268e-05, + "loss": 0.6236, + "step": 1936 + }, + { + "epoch": 0.7748, + "grad_norm": 0.3429335963910842, + "learning_rate": 2.5440408263081382e-05, + "loss": 0.6022, + "step": 1937 + }, + { + "epoch": 0.7752, + "grad_norm": 0.3525901489542083, + "learning_rate": 2.5354138810194226e-05, + "loss": 0.5891, + "step": 1938 + }, + { + "epoch": 0.7756, + "grad_norm": 0.3390282795907182, + "learning_rate": 2.5267994637360993e-05, + "loss": 0.5652, + "step": 1939 + }, + { + "epoch": 0.776, + "grad_norm": 0.3288355057563831, + "learning_rate": 2.5181975889159615e-05, + "loss": 0.5666, + "step": 1940 + }, + { + "epoch": 0.7764, + "grad_norm": 0.3259738427143579, + "learning_rate": 2.509608270995758e-05, + "loss": 0.5871, + "step": 1941 + }, + { + "epoch": 0.7768, + "grad_norm": 0.3320669275361078, + "learning_rate": 2.501031524391163e-05, + "loss": 0.5315, + "step": 1942 + }, + { + "epoch": 0.7772, + "grad_norm": 0.3604358444439062, + "learning_rate": 2.4924673634967466e-05, + "loss": 0.6197, + "step": 1943 + }, + { + "epoch": 0.7776, + "grad_norm": 0.34102281009712276, + "learning_rate": 2.4839158026859587e-05, + "loss": 0.6, + "step": 1944 + }, + { + "epoch": 0.778, + "grad_norm": 0.3234797014190219, + "learning_rate": 2.475376856311097e-05, + "loss": 0.5779, + "step": 1945 + }, + { + "epoch": 0.7784, + "grad_norm": 0.31951513180750857, + "learning_rate": 2.4668505387033026e-05, + "loss": 0.5645, + "step": 1946 + }, + { + "epoch": 0.7788, + "grad_norm": 0.3530404230204508, + "learning_rate": 2.4583368641725078e-05, + "loss": 0.5602, + "step": 1947 + }, + { + "epoch": 0.7792, + "grad_norm": 0.32186540414463677, + "learning_rate": 2.44983584700743e-05, + "loss": 0.6049, + "step": 1948 + }, + { + "epoch": 0.7796, + "grad_norm": 0.3545125860463158, + "learning_rate": 2.4413475014755393e-05, + "loss": 0.5902, + "step": 1949 + }, + { + "epoch": 0.78, + "grad_norm": 0.3482011501090206, + "learning_rate": 2.432871841823047e-05, + "loss": 0.5999, + "step": 1950 + }, + { + "epoch": 0.7804, + "grad_norm": 0.36112621390059313, + "learning_rate": 2.42440888227487e-05, + "loss": 0.568, + "step": 1951 + }, + { + "epoch": 0.7808, + "grad_norm": 0.37731093017360207, + "learning_rate": 2.4159586370346088e-05, + "loss": 0.6142, + "step": 1952 + }, + { + "epoch": 0.7812, + "grad_norm": 0.3747022497765928, + "learning_rate": 2.4075211202845227e-05, + "loss": 0.5944, + "step": 1953 + }, + { + "epoch": 0.7816, + "grad_norm": 0.3555496374811139, + "learning_rate": 2.3990963461855075e-05, + "loss": 0.6317, + "step": 1954 + }, + { + "epoch": 0.782, + "grad_norm": 0.3407273756582039, + "learning_rate": 2.3906843288770886e-05, + "loss": 0.5854, + "step": 1955 + }, + { + "epoch": 0.7824, + "grad_norm": 0.3440908624146322, + "learning_rate": 2.3822850824773625e-05, + "loss": 0.5426, + "step": 1956 + }, + { + "epoch": 0.7828, + "grad_norm": 0.33841018116361676, + "learning_rate": 2.3738986210829993e-05, + "loss": 0.55, + "step": 1957 + }, + { + "epoch": 0.7832, + "grad_norm": 0.33270722732089786, + "learning_rate": 2.3655249587692073e-05, + "loss": 0.5519, + "step": 1958 + }, + { + "epoch": 0.7836, + "grad_norm": 0.3618477372722039, + "learning_rate": 2.3571641095897223e-05, + "loss": 0.5556, + "step": 1959 + }, + { + "epoch": 0.784, + "grad_norm": 0.3352707046874701, + "learning_rate": 2.3488160875767717e-05, + "loss": 0.5609, + "step": 1960 + }, + { + "epoch": 0.7844, + "grad_norm": 0.3827338795032483, + "learning_rate": 2.3404809067410525e-05, + "loss": 0.6086, + "step": 1961 + }, + { + "epoch": 0.7848, + "grad_norm": 0.3785897101425981, + "learning_rate": 2.3321585810717117e-05, + "loss": 0.6115, + "step": 1962 + }, + { + "epoch": 0.7852, + "grad_norm": 0.32939407360776796, + "learning_rate": 2.3238491245363147e-05, + "loss": 0.6083, + "step": 1963 + }, + { + "epoch": 0.7856, + "grad_norm": 0.3465189710411792, + "learning_rate": 2.315552551080845e-05, + "loss": 0.5899, + "step": 1964 + }, + { + "epoch": 0.786, + "grad_norm": 0.3585136683243423, + "learning_rate": 2.307268874629649e-05, + "loss": 0.6303, + "step": 1965 + }, + { + "epoch": 0.7864, + "grad_norm": 0.35488822785063706, + "learning_rate": 2.2989981090854305e-05, + "loss": 0.5763, + "step": 1966 + }, + { + "epoch": 0.7868, + "grad_norm": 0.36232148088412175, + "learning_rate": 2.290740268329227e-05, + "loss": 0.607, + "step": 1967 + }, + { + "epoch": 0.7872, + "grad_norm": 0.34905703501076474, + "learning_rate": 2.282495366220383e-05, + "loss": 0.5998, + "step": 1968 + }, + { + "epoch": 0.7876, + "grad_norm": 0.3343963766216138, + "learning_rate": 2.2742634165965316e-05, + "loss": 0.5806, + "step": 1969 + }, + { + "epoch": 0.788, + "grad_norm": 0.33554182680707184, + "learning_rate": 2.266044433273562e-05, + "loss": 0.5434, + "step": 1970 + }, + { + "epoch": 0.7884, + "grad_norm": 0.3442527655293528, + "learning_rate": 2.2578384300456014e-05, + "loss": 0.5984, + "step": 1971 + }, + { + "epoch": 0.7888, + "grad_norm": 0.3454136604067078, + "learning_rate": 2.249645420684998e-05, + "loss": 0.5778, + "step": 1972 + }, + { + "epoch": 0.7892, + "grad_norm": 0.3385411124771602, + "learning_rate": 2.2414654189422847e-05, + "loss": 0.589, + "step": 1973 + }, + { + "epoch": 0.7896, + "grad_norm": 0.33037457311455093, + "learning_rate": 2.233298438546172e-05, + "loss": 0.6205, + "step": 1974 + }, + { + "epoch": 0.79, + "grad_norm": 0.34447887093263985, + "learning_rate": 2.2251444932035094e-05, + "loss": 0.5498, + "step": 1975 + }, + { + "epoch": 0.7904, + "grad_norm": 0.33162742401137485, + "learning_rate": 2.2170035965992675e-05, + "loss": 0.5917, + "step": 1976 + }, + { + "epoch": 0.7908, + "grad_norm": 0.34456838936680356, + "learning_rate": 2.2088757623965262e-05, + "loss": 0.6382, + "step": 1977 + }, + { + "epoch": 0.7912, + "grad_norm": 0.3423977987340702, + "learning_rate": 2.2007610042364336e-05, + "loss": 0.5645, + "step": 1978 + }, + { + "epoch": 0.7916, + "grad_norm": 0.33175868418516596, + "learning_rate": 2.1926593357381996e-05, + "loss": 0.5869, + "step": 1979 + }, + { + "epoch": 0.792, + "grad_norm": 0.3259637184455592, + "learning_rate": 2.184570770499056e-05, + "loss": 0.595, + "step": 1980 + }, + { + "epoch": 0.7924, + "grad_norm": 0.33487038070045444, + "learning_rate": 2.176495322094254e-05, + "loss": 0.5693, + "step": 1981 + }, + { + "epoch": 0.7928, + "grad_norm": 0.40524246973378053, + "learning_rate": 2.1684330040770183e-05, + "loss": 0.6128, + "step": 1982 + }, + { + "epoch": 0.7932, + "grad_norm": 0.32186782971805505, + "learning_rate": 2.1603838299785484e-05, + "loss": 0.5763, + "step": 1983 + }, + { + "epoch": 0.7936, + "grad_norm": 0.361702443538986, + "learning_rate": 2.1523478133079777e-05, + "loss": 0.6091, + "step": 1984 + }, + { + "epoch": 0.794, + "grad_norm": 0.36141663884275566, + "learning_rate": 2.1443249675523536e-05, + "loss": 0.6281, + "step": 1985 + }, + { + "epoch": 0.7944, + "grad_norm": 0.36232498191397317, + "learning_rate": 2.1363153061766294e-05, + "loss": 0.559, + "step": 1986 + }, + { + "epoch": 0.7948, + "grad_norm": 0.32724249560950086, + "learning_rate": 2.1283188426236178e-05, + "loss": 0.5145, + "step": 1987 + }, + { + "epoch": 0.7952, + "grad_norm": 0.34170056116195524, + "learning_rate": 2.1203355903139933e-05, + "loss": 0.5922, + "step": 1988 + }, + { + "epoch": 0.7956, + "grad_norm": 0.3310127243944117, + "learning_rate": 2.112365562646248e-05, + "loss": 0.5938, + "step": 1989 + }, + { + "epoch": 0.796, + "grad_norm": 0.37493958928888, + "learning_rate": 2.1044087729966856e-05, + "loss": 0.5968, + "step": 1990 + }, + { + "epoch": 0.7964, + "grad_norm": 0.33613546556127055, + "learning_rate": 2.096465234719389e-05, + "loss": 0.5768, + "step": 1991 + }, + { + "epoch": 0.7968, + "grad_norm": 0.3325727050763233, + "learning_rate": 2.0885349611461967e-05, + "loss": 0.5461, + "step": 1992 + }, + { + "epoch": 0.7972, + "grad_norm": 0.33725501876277636, + "learning_rate": 2.0806179655866966e-05, + "loss": 0.5623, + "step": 1993 + }, + { + "epoch": 0.7976, + "grad_norm": 0.3633590484486614, + "learning_rate": 2.072714261328177e-05, + "loss": 0.5884, + "step": 1994 + }, + { + "epoch": 0.798, + "grad_norm": 0.3764466040216392, + "learning_rate": 2.0648238616356332e-05, + "loss": 0.5921, + "step": 1995 + }, + { + "epoch": 0.7984, + "grad_norm": 0.3145057249043822, + "learning_rate": 2.0569467797517173e-05, + "loss": 0.5603, + "step": 1996 + }, + { + "epoch": 0.7988, + "grad_norm": 0.324959610351305, + "learning_rate": 2.0490830288967444e-05, + "loss": 0.5872, + "step": 1997 + }, + { + "epoch": 0.7992, + "grad_norm": 0.340462206888166, + "learning_rate": 2.0412326222686418e-05, + "loss": 0.5936, + "step": 1998 + }, + { + "epoch": 0.7996, + "grad_norm": 0.34466338871786273, + "learning_rate": 2.033395573042952e-05, + "loss": 0.6344, + "step": 1999 + }, + { + "epoch": 0.8, + "grad_norm": 0.3300321247103113, + "learning_rate": 2.025571894372794e-05, + "loss": 0.5504, + "step": 2000 + }, + { + "epoch": 0.8004, + "grad_norm": 0.33346644760322913, + "learning_rate": 2.0177615993888422e-05, + "loss": 0.5694, + "step": 2001 + }, + { + "epoch": 0.8008, + "grad_norm": 0.3438563421476556, + "learning_rate": 2.0099647011993216e-05, + "loss": 0.5771, + "step": 2002 + }, + { + "epoch": 0.8012, + "grad_norm": 0.3429622590645044, + "learning_rate": 2.00218121288996e-05, + "loss": 0.5867, + "step": 2003 + }, + { + "epoch": 0.8016, + "grad_norm": 0.3375130326381454, + "learning_rate": 1.9944111475239867e-05, + "loss": 0.5606, + "step": 2004 + }, + { + "epoch": 0.802, + "grad_norm": 0.3532062037107467, + "learning_rate": 1.9866545181421013e-05, + "loss": 0.639, + "step": 2005 + }, + { + "epoch": 0.8024, + "grad_norm": 0.3377893264435425, + "learning_rate": 1.9789113377624502e-05, + "loss": 0.5882, + "step": 2006 + }, + { + "epoch": 0.8028, + "grad_norm": 0.35031490365634965, + "learning_rate": 1.971181619380611e-05, + "loss": 0.5503, + "step": 2007 + }, + { + "epoch": 0.8032, + "grad_norm": 0.34602423861902754, + "learning_rate": 1.963465375969572e-05, + "loss": 0.5966, + "step": 2008 + }, + { + "epoch": 0.8036, + "grad_norm": 0.36039315536662786, + "learning_rate": 1.9557626204796986e-05, + "loss": 0.6479, + "step": 2009 + }, + { + "epoch": 0.804, + "grad_norm": 0.3340153632783169, + "learning_rate": 1.9480733658387175e-05, + "loss": 0.5855, + "step": 2010 + }, + { + "epoch": 0.8044, + "grad_norm": 0.32314770029857476, + "learning_rate": 1.9403976249517085e-05, + "loss": 0.5736, + "step": 2011 + }, + { + "epoch": 0.8048, + "grad_norm": 0.34818143552495107, + "learning_rate": 1.9327354107010566e-05, + "loss": 0.5693, + "step": 2012 + }, + { + "epoch": 0.8052, + "grad_norm": 0.338316754813434, + "learning_rate": 1.9250867359464576e-05, + "loss": 0.5862, + "step": 2013 + }, + { + "epoch": 0.8056, + "grad_norm": 0.3454876898375605, + "learning_rate": 1.9174516135248744e-05, + "loss": 0.5996, + "step": 2014 + }, + { + "epoch": 0.806, + "grad_norm": 0.3445315276335558, + "learning_rate": 1.9098300562505266e-05, + "loss": 0.6002, + "step": 2015 + }, + { + "epoch": 0.8064, + "grad_norm": 0.3496144771363773, + "learning_rate": 1.902222076914869e-05, + "loss": 0.6209, + "step": 2016 + }, + { + "epoch": 0.8068, + "grad_norm": 0.3398542244163301, + "learning_rate": 1.894627688286571e-05, + "loss": 0.5329, + "step": 2017 + }, + { + "epoch": 0.8072, + "grad_norm": 0.3534623363527577, + "learning_rate": 1.8870469031114868e-05, + "loss": 0.5855, + "step": 2018 + }, + { + "epoch": 0.8076, + "grad_norm": 0.34738356781161317, + "learning_rate": 1.8794797341126402e-05, + "loss": 0.6095, + "step": 2019 + }, + { + "epoch": 0.808, + "grad_norm": 0.3455584906849207, + "learning_rate": 1.871926193990202e-05, + "loss": 0.5946, + "step": 2020 + }, + { + "epoch": 0.8084, + "grad_norm": 0.3326389491789541, + "learning_rate": 1.8643862954214754e-05, + "loss": 0.559, + "step": 2021 + }, + { + "epoch": 0.8088, + "grad_norm": 0.34172098276548246, + "learning_rate": 1.856860051060866e-05, + "loss": 0.6053, + "step": 2022 + }, + { + "epoch": 0.8092, + "grad_norm": 0.3434873966275109, + "learning_rate": 1.8493474735398576e-05, + "loss": 0.5983, + "step": 2023 + }, + { + "epoch": 0.8096, + "grad_norm": 0.3564931568143112, + "learning_rate": 1.841848575467001e-05, + "loss": 0.6121, + "step": 2024 + }, + { + "epoch": 0.81, + "grad_norm": 0.32283698622671636, + "learning_rate": 1.8343633694278895e-05, + "loss": 0.5339, + "step": 2025 + }, + { + "epoch": 0.8104, + "grad_norm": 0.33907509599918256, + "learning_rate": 1.8268918679851388e-05, + "loss": 0.6126, + "step": 2026 + }, + { + "epoch": 0.8108, + "grad_norm": 0.3456536891232961, + "learning_rate": 1.8194340836783563e-05, + "loss": 0.6003, + "step": 2027 + }, + { + "epoch": 0.8112, + "grad_norm": 0.5645567074909379, + "learning_rate": 1.811990029024133e-05, + "loss": 0.6047, + "step": 2028 + }, + { + "epoch": 0.8116, + "grad_norm": 0.33826711823216155, + "learning_rate": 1.8045597165160133e-05, + "loss": 0.5925, + "step": 2029 + }, + { + "epoch": 0.812, + "grad_norm": 0.3305265188201426, + "learning_rate": 1.7971431586244815e-05, + "loss": 0.5565, + "step": 2030 + }, + { + "epoch": 0.8124, + "grad_norm": 0.34634656094588795, + "learning_rate": 1.7897403677969403e-05, + "loss": 0.6143, + "step": 2031 + }, + { + "epoch": 0.8128, + "grad_norm": 0.3570019545422078, + "learning_rate": 1.782351356457679e-05, + "loss": 0.5901, + "step": 2032 + }, + { + "epoch": 0.8132, + "grad_norm": 0.3377890457099209, + "learning_rate": 1.774976137007861e-05, + "loss": 0.5996, + "step": 2033 + }, + { + "epoch": 0.8136, + "grad_norm": 0.3563023468880287, + "learning_rate": 1.767614721825509e-05, + "loss": 0.5776, + "step": 2034 + }, + { + "epoch": 0.814, + "grad_norm": 0.33953225292164674, + "learning_rate": 1.7602671232654754e-05, + "loss": 0.5339, + "step": 2035 + }, + { + "epoch": 0.8144, + "grad_norm": 0.36826665696608535, + "learning_rate": 1.7529333536594215e-05, + "loss": 0.5819, + "step": 2036 + }, + { + "epoch": 0.8148, + "grad_norm": 0.3323992173322127, + "learning_rate": 1.7456134253157975e-05, + "loss": 0.5531, + "step": 2037 + }, + { + "epoch": 0.8152, + "grad_norm": 0.3344180779819317, + "learning_rate": 1.7383073505198255e-05, + "loss": 0.6052, + "step": 2038 + }, + { + "epoch": 0.8156, + "grad_norm": 0.34888494102531564, + "learning_rate": 1.7310151415334798e-05, + "loss": 0.5617, + "step": 2039 + }, + { + "epoch": 0.816, + "grad_norm": 0.3502559122482306, + "learning_rate": 1.723736810595461e-05, + "loss": 0.5941, + "step": 2040 + }, + { + "epoch": 0.8164, + "grad_norm": 0.7209477016353654, + "learning_rate": 1.716472369921178e-05, + "loss": 0.5645, + "step": 2041 + }, + { + "epoch": 0.8168, + "grad_norm": 0.33700584654711524, + "learning_rate": 1.7092218317027232e-05, + "loss": 0.5765, + "step": 2042 + }, + { + "epoch": 0.8172, + "grad_norm": 0.3598343415422159, + "learning_rate": 1.7019852081088617e-05, + "loss": 0.6089, + "step": 2043 + }, + { + "epoch": 0.8176, + "grad_norm": 0.33946299005586367, + "learning_rate": 1.6947625112850073e-05, + "loss": 0.587, + "step": 2044 + }, + { + "epoch": 0.818, + "grad_norm": 0.5583069211823942, + "learning_rate": 1.6875537533531948e-05, + "loss": 0.5511, + "step": 2045 + }, + { + "epoch": 0.8184, + "grad_norm": 0.366562389586145, + "learning_rate": 1.680358946412064e-05, + "loss": 0.5958, + "step": 2046 + }, + { + "epoch": 0.8188, + "grad_norm": 0.33455470150572314, + "learning_rate": 1.673178102536842e-05, + "loss": 0.612, + "step": 2047 + }, + { + "epoch": 0.8192, + "grad_norm": 0.3686209541133758, + "learning_rate": 1.6660112337793256e-05, + "loss": 0.5552, + "step": 2048 + }, + { + "epoch": 0.8196, + "grad_norm": 0.35436738074759055, + "learning_rate": 1.6588583521678535e-05, + "loss": 0.5935, + "step": 2049 + }, + { + "epoch": 0.82, + "grad_norm": 0.3447122268173417, + "learning_rate": 1.65171946970729e-05, + "loss": 0.5845, + "step": 2050 + }, + { + "epoch": 0.8204, + "grad_norm": 0.36429045978656727, + "learning_rate": 1.644594598378999e-05, + "loss": 0.6181, + "step": 2051 + }, + { + "epoch": 0.8208, + "grad_norm": 0.38445558327193396, + "learning_rate": 1.6374837501408403e-05, + "loss": 0.5828, + "step": 2052 + }, + { + "epoch": 0.8212, + "grad_norm": 0.3926817490647689, + "learning_rate": 1.6303869369271264e-05, + "loss": 0.5558, + "step": 2053 + }, + { + "epoch": 0.8216, + "grad_norm": 0.36464265465604595, + "learning_rate": 1.623304170648625e-05, + "loss": 0.5766, + "step": 2054 + }, + { + "epoch": 0.822, + "grad_norm": 0.3433207753429021, + "learning_rate": 1.6162354631925204e-05, + "loss": 0.5556, + "step": 2055 + }, + { + "epoch": 0.8224, + "grad_norm": 0.3385924963633419, + "learning_rate": 1.609180826422404e-05, + "loss": 0.5877, + "step": 2056 + }, + { + "epoch": 0.8228, + "grad_norm": 0.38593124490029423, + "learning_rate": 1.6021402721782532e-05, + "loss": 0.5923, + "step": 2057 + }, + { + "epoch": 0.8232, + "grad_norm": 0.35482846986110306, + "learning_rate": 1.5951138122764132e-05, + "loss": 0.5648, + "step": 2058 + }, + { + "epoch": 0.8236, + "grad_norm": 0.3445861470215919, + "learning_rate": 1.58810145850957e-05, + "loss": 0.5827, + "step": 2059 + }, + { + "epoch": 0.824, + "grad_norm": 0.3472389735635199, + "learning_rate": 1.5811032226467305e-05, + "loss": 0.5944, + "step": 2060 + }, + { + "epoch": 0.8244, + "grad_norm": 0.3188110832300483, + "learning_rate": 1.574119116433219e-05, + "loss": 0.6018, + "step": 2061 + }, + { + "epoch": 0.8248, + "grad_norm": 0.3293400635332867, + "learning_rate": 1.5671491515906355e-05, + "loss": 0.5512, + "step": 2062 + }, + { + "epoch": 0.8252, + "grad_norm": 0.4845587784638902, + "learning_rate": 1.5601933398168522e-05, + "loss": 0.5875, + "step": 2063 + }, + { + "epoch": 0.8256, + "grad_norm": 0.3808356167565948, + "learning_rate": 1.553251692785985e-05, + "loss": 0.5855, + "step": 2064 + }, + { + "epoch": 0.826, + "grad_norm": 0.3582145981743341, + "learning_rate": 1.5463242221483743e-05, + "loss": 0.608, + "step": 2065 + }, + { + "epoch": 0.8264, + "grad_norm": 0.341944239074536, + "learning_rate": 1.5394109395305757e-05, + "loss": 0.5831, + "step": 2066 + }, + { + "epoch": 0.8268, + "grad_norm": 0.35544254885132803, + "learning_rate": 1.5325118565353234e-05, + "loss": 0.6136, + "step": 2067 + }, + { + "epoch": 0.8272, + "grad_norm": 0.31741929242365463, + "learning_rate": 1.5256269847415283e-05, + "loss": 0.529, + "step": 2068 + }, + { + "epoch": 0.8276, + "grad_norm": 0.3577898381015023, + "learning_rate": 1.5187563357042423e-05, + "loss": 0.6031, + "step": 2069 + }, + { + "epoch": 0.828, + "grad_norm": 0.4909330665553209, + "learning_rate": 1.5118999209546559e-05, + "loss": 0.5942, + "step": 2070 + }, + { + "epoch": 0.8284, + "grad_norm": 0.3459967010536719, + "learning_rate": 1.5050577520000607e-05, + "loss": 0.5861, + "step": 2071 + }, + { + "epoch": 0.8288, + "grad_norm": 0.3729678607753999, + "learning_rate": 1.4982298403238471e-05, + "loss": 0.594, + "step": 2072 + }, + { + "epoch": 0.8292, + "grad_norm": 0.4090867702653392, + "learning_rate": 1.4914161973854712e-05, + "loss": 0.6108, + "step": 2073 + }, + { + "epoch": 0.8296, + "grad_norm": 0.3379008513631914, + "learning_rate": 1.4846168346204425e-05, + "loss": 0.5701, + "step": 2074 + }, + { + "epoch": 0.83, + "grad_norm": 0.3479251378314484, + "learning_rate": 1.4778317634403083e-05, + "loss": 0.5894, + "step": 2075 + }, + { + "epoch": 0.8304, + "grad_norm": 0.34575192501995333, + "learning_rate": 1.4710609952326238e-05, + "loss": 0.5834, + "step": 2076 + }, + { + "epoch": 0.8308, + "grad_norm": 0.35259532439471253, + "learning_rate": 1.4643045413609458e-05, + "loss": 0.5619, + "step": 2077 + }, + { + "epoch": 0.8312, + "grad_norm": 0.36088566864808475, + "learning_rate": 1.457562413164799e-05, + "loss": 0.6035, + "step": 2078 + }, + { + "epoch": 0.8316, + "grad_norm": 0.3619762257003658, + "learning_rate": 1.4508346219596724e-05, + "loss": 0.6257, + "step": 2079 + }, + { + "epoch": 0.832, + "grad_norm": 0.33927954550870054, + "learning_rate": 1.444121179036989e-05, + "loss": 0.5573, + "step": 2080 + }, + { + "epoch": 0.8324, + "grad_norm": 0.6645996870320319, + "learning_rate": 1.4374220956640893e-05, + "loss": 0.6153, + "step": 2081 + }, + { + "epoch": 0.8328, + "grad_norm": 0.3429083338924902, + "learning_rate": 1.4307373830842174e-05, + "loss": 0.5926, + "step": 2082 + }, + { + "epoch": 0.8332, + "grad_norm": 0.3487607768016974, + "learning_rate": 1.424067052516499e-05, + "loss": 0.5658, + "step": 2083 + }, + { + "epoch": 0.8336, + "grad_norm": 0.3360185981609451, + "learning_rate": 1.4174111151559189e-05, + "loss": 0.571, + "step": 2084 + }, + { + "epoch": 0.834, + "grad_norm": 0.3808572645249857, + "learning_rate": 1.4107695821733025e-05, + "loss": 0.5747, + "step": 2085 + }, + { + "epoch": 0.8344, + "grad_norm": 0.3592965446953947, + "learning_rate": 1.4041424647153112e-05, + "loss": 0.6209, + "step": 2086 + }, + { + "epoch": 0.8348, + "grad_norm": 0.336451723889464, + "learning_rate": 1.3975297739043992e-05, + "loss": 0.5813, + "step": 2087 + }, + { + "epoch": 0.8352, + "grad_norm": 0.3452593481096945, + "learning_rate": 1.3909315208388184e-05, + "loss": 0.6158, + "step": 2088 + }, + { + "epoch": 0.8356, + "grad_norm": 0.35079055441792556, + "learning_rate": 1.3843477165925844e-05, + "loss": 0.5859, + "step": 2089 + }, + { + "epoch": 0.836, + "grad_norm": 0.3252303011864, + "learning_rate": 1.3777783722154603e-05, + "loss": 0.5465, + "step": 2090 + }, + { + "epoch": 0.8364, + "grad_norm": 0.3316008207825591, + "learning_rate": 1.3712234987329486e-05, + "loss": 0.5683, + "step": 2091 + }, + { + "epoch": 0.8368, + "grad_norm": 0.3426650526067097, + "learning_rate": 1.3646831071462607e-05, + "loss": 0.5636, + "step": 2092 + }, + { + "epoch": 0.8372, + "grad_norm": 0.34348157223588005, + "learning_rate": 1.3581572084323013e-05, + "loss": 0.5896, + "step": 2093 + }, + { + "epoch": 0.8376, + "grad_norm": 0.33426418782158246, + "learning_rate": 1.3516458135436538e-05, + "loss": 0.5741, + "step": 2094 + }, + { + "epoch": 0.838, + "grad_norm": 0.5450857217741052, + "learning_rate": 1.3451489334085554e-05, + "loss": 0.6091, + "step": 2095 + }, + { + "epoch": 0.8384, + "grad_norm": 0.37112334679589354, + "learning_rate": 1.3386665789308883e-05, + "loss": 0.5957, + "step": 2096 + }, + { + "epoch": 0.8388, + "grad_norm": 0.3650657952317654, + "learning_rate": 1.3321987609901554e-05, + "loss": 0.5804, + "step": 2097 + }, + { + "epoch": 0.8392, + "grad_norm": 0.3436873179991808, + "learning_rate": 1.325745490441458e-05, + "loss": 0.5974, + "step": 2098 + }, + { + "epoch": 0.8396, + "grad_norm": 0.3376329269121168, + "learning_rate": 1.3193067781154833e-05, + "loss": 0.5341, + "step": 2099 + }, + { + "epoch": 0.84, + "grad_norm": 0.35743497857189027, + "learning_rate": 1.3128826348184887e-05, + "loss": 0.5931, + "step": 2100 + }, + { + "epoch": 0.8404, + "grad_norm": 0.35691555443942813, + "learning_rate": 1.3064730713322792e-05, + "loss": 0.564, + "step": 2101 + }, + { + "epoch": 0.8408, + "grad_norm": 0.3488535101750259, + "learning_rate": 1.300078098414188e-05, + "loss": 0.6032, + "step": 2102 + }, + { + "epoch": 0.8412, + "grad_norm": 0.3172197040457403, + "learning_rate": 1.2936977267970596e-05, + "loss": 0.5563, + "step": 2103 + }, + { + "epoch": 0.8416, + "grad_norm": 0.3306876381261289, + "learning_rate": 1.2873319671892337e-05, + "loss": 0.5758, + "step": 2104 + }, + { + "epoch": 0.842, + "grad_norm": 0.3369166000365717, + "learning_rate": 1.2809808302745297e-05, + "loss": 0.5406, + "step": 2105 + }, + { + "epoch": 0.8424, + "grad_norm": 0.35732401228280025, + "learning_rate": 1.2746443267122233e-05, + "loss": 0.572, + "step": 2106 + }, + { + "epoch": 0.8428, + "grad_norm": 0.3599015927677342, + "learning_rate": 1.2683224671370286e-05, + "loss": 0.5777, + "step": 2107 + }, + { + "epoch": 0.8432, + "grad_norm": 0.35448239242111973, + "learning_rate": 1.2620152621590819e-05, + "loss": 0.5589, + "step": 2108 + }, + { + "epoch": 0.8436, + "grad_norm": 0.3365683844191083, + "learning_rate": 1.255722722363929e-05, + "loss": 0.5256, + "step": 2109 + }, + { + "epoch": 0.844, + "grad_norm": 0.36195666986890024, + "learning_rate": 1.2494448583125018e-05, + "loss": 0.604, + "step": 2110 + }, + { + "epoch": 0.8444, + "grad_norm": 0.34492524171948424, + "learning_rate": 1.2431816805410967e-05, + "loss": 0.5719, + "step": 2111 + }, + { + "epoch": 0.8448, + "grad_norm": 0.33899530490449875, + "learning_rate": 1.2369331995613665e-05, + "loss": 0.5785, + "step": 2112 + }, + { + "epoch": 0.8452, + "grad_norm": 0.3483370447155762, + "learning_rate": 1.2306994258602922e-05, + "loss": 0.5758, + "step": 2113 + }, + { + "epoch": 0.8456, + "grad_norm": 0.37141725085210964, + "learning_rate": 1.2244803699001783e-05, + "loss": 0.6501, + "step": 2114 + }, + { + "epoch": 0.846, + "grad_norm": 0.32631894828415153, + "learning_rate": 1.218276042118629e-05, + "loss": 0.6012, + "step": 2115 + }, + { + "epoch": 0.8464, + "grad_norm": 0.3443753329616641, + "learning_rate": 1.2120864529285203e-05, + "loss": 0.5654, + "step": 2116 + }, + { + "epoch": 0.8468, + "grad_norm": 0.32433973224085527, + "learning_rate": 1.2059116127179993e-05, + "loss": 0.5942, + "step": 2117 + }, + { + "epoch": 0.8472, + "grad_norm": 0.34710544152147893, + "learning_rate": 1.199751531850457e-05, + "loss": 0.5826, + "step": 2118 + }, + { + "epoch": 0.8476, + "grad_norm": 0.38471513154291287, + "learning_rate": 1.1936062206645182e-05, + "loss": 0.551, + "step": 2119 + }, + { + "epoch": 0.848, + "grad_norm": 0.33527920319158955, + "learning_rate": 1.1874756894740135e-05, + "loss": 0.5181, + "step": 2120 + }, + { + "epoch": 0.8484, + "grad_norm": 0.3398984166096767, + "learning_rate": 1.1813599485679683e-05, + "loss": 0.5839, + "step": 2121 + }, + { + "epoch": 0.8488, + "grad_norm": 0.3373449070753534, + "learning_rate": 1.1752590082105864e-05, + "loss": 0.5607, + "step": 2122 + }, + { + "epoch": 0.8492, + "grad_norm": 0.34089897512678086, + "learning_rate": 1.1691728786412316e-05, + "loss": 0.5652, + "step": 2123 + }, + { + "epoch": 0.8496, + "grad_norm": 0.35003168205330665, + "learning_rate": 1.1631015700744152e-05, + "loss": 0.5889, + "step": 2124 + }, + { + "epoch": 0.85, + "grad_norm": 0.3568026042940518, + "learning_rate": 1.1570450926997655e-05, + "loss": 0.6093, + "step": 2125 + }, + { + "epoch": 0.8504, + "grad_norm": 0.3827902928256842, + "learning_rate": 1.1510034566820204e-05, + "loss": 0.5776, + "step": 2126 + }, + { + "epoch": 0.8508, + "grad_norm": 0.3642162679937661, + "learning_rate": 1.1449766721610189e-05, + "loss": 0.6023, + "step": 2127 + }, + { + "epoch": 0.8512, + "grad_norm": 0.36057336322997025, + "learning_rate": 1.1389647492516598e-05, + "loss": 0.5186, + "step": 2128 + }, + { + "epoch": 0.8516, + "grad_norm": 0.3636241408296972, + "learning_rate": 1.132967698043913e-05, + "loss": 0.6217, + "step": 2129 + }, + { + "epoch": 0.852, + "grad_norm": 0.3317373651782228, + "learning_rate": 1.1269855286027797e-05, + "loss": 0.5544, + "step": 2130 + }, + { + "epoch": 0.8524, + "grad_norm": 0.33341520477991027, + "learning_rate": 1.1210182509682854e-05, + "loss": 0.6192, + "step": 2131 + }, + { + "epoch": 0.8528, + "grad_norm": 0.3412776256292698, + "learning_rate": 1.1150658751554665e-05, + "loss": 0.5709, + "step": 2132 + }, + { + "epoch": 0.8532, + "grad_norm": 0.3357547206020433, + "learning_rate": 1.1091284111543498e-05, + "loss": 0.5858, + "step": 2133 + }, + { + "epoch": 0.8536, + "grad_norm": 0.36459156646194935, + "learning_rate": 1.1032058689299296e-05, + "loss": 0.6304, + "step": 2134 + }, + { + "epoch": 0.854, + "grad_norm": 0.3287866467689789, + "learning_rate": 1.0972982584221592e-05, + "loss": 0.5963, + "step": 2135 + }, + { + "epoch": 0.8544, + "grad_norm": 0.33925962870122844, + "learning_rate": 1.0914055895459352e-05, + "loss": 0.5686, + "step": 2136 + }, + { + "epoch": 0.8548, + "grad_norm": 0.387450059725076, + "learning_rate": 1.08552787219107e-05, + "loss": 0.5654, + "step": 2137 + }, + { + "epoch": 0.8552, + "grad_norm": 0.3437873979196186, + "learning_rate": 1.0796651162222915e-05, + "loss": 0.5904, + "step": 2138 + }, + { + "epoch": 0.8556, + "grad_norm": 0.35215658105124237, + "learning_rate": 1.07381733147921e-05, + "loss": 0.5938, + "step": 2139 + }, + { + "epoch": 0.856, + "grad_norm": 0.3411921172604908, + "learning_rate": 1.067984527776309e-05, + "loss": 0.5878, + "step": 2140 + }, + { + "epoch": 0.8564, + "grad_norm": 0.33376281233262733, + "learning_rate": 1.0621667149029379e-05, + "loss": 0.619, + "step": 2141 + }, + { + "epoch": 0.8568, + "grad_norm": 0.33852640238096193, + "learning_rate": 1.056363902623274e-05, + "loss": 0.5891, + "step": 2142 + }, + { + "epoch": 0.8572, + "grad_norm": 0.34858034692372686, + "learning_rate": 1.0505761006763314e-05, + "loss": 0.594, + "step": 2143 + }, + { + "epoch": 0.8576, + "grad_norm": 0.34536636726515607, + "learning_rate": 1.0448033187759221e-05, + "loss": 0.6005, + "step": 2144 + }, + { + "epoch": 0.858, + "grad_norm": 0.35874099678350707, + "learning_rate": 1.0390455666106547e-05, + "loss": 0.5819, + "step": 2145 + }, + { + "epoch": 0.8584, + "grad_norm": 0.33635170634072625, + "learning_rate": 1.0333028538439094e-05, + "loss": 0.5632, + "step": 2146 + }, + { + "epoch": 0.8588, + "grad_norm": 0.33136983679728427, + "learning_rate": 1.027575190113832e-05, + "loss": 0.5834, + "step": 2147 + }, + { + "epoch": 0.8592, + "grad_norm": 0.35258195045669577, + "learning_rate": 1.0218625850333041e-05, + "loss": 0.5615, + "step": 2148 + }, + { + "epoch": 0.8596, + "grad_norm": 0.3430590005549731, + "learning_rate": 1.0161650481899342e-05, + "loss": 0.5392, + "step": 2149 + }, + { + "epoch": 0.86, + "grad_norm": 0.347786574887435, + "learning_rate": 1.010482589146048e-05, + "loss": 0.617, + "step": 2150 + }, + { + "epoch": 0.8604, + "grad_norm": 0.32974970272121834, + "learning_rate": 1.0048152174386583e-05, + "loss": 0.5241, + "step": 2151 + }, + { + "epoch": 0.8608, + "grad_norm": 0.3327895781603911, + "learning_rate": 9.991629425794623e-06, + "loss": 0.5575, + "step": 2152 + }, + { + "epoch": 0.8612, + "grad_norm": 0.3466328037932508, + "learning_rate": 9.935257740548143e-06, + "loss": 0.6101, + "step": 2153 + }, + { + "epoch": 0.8616, + "grad_norm": 0.39210566626994137, + "learning_rate": 9.879037213257213e-06, + "loss": 0.5923, + "step": 2154 + }, + { + "epoch": 0.862, + "grad_norm": 0.3250064069208217, + "learning_rate": 9.822967938278171e-06, + "loss": 0.526, + "step": 2155 + }, + { + "epoch": 0.8624, + "grad_norm": 0.33374959570321466, + "learning_rate": 9.767050009713474e-06, + "loss": 0.5672, + "step": 2156 + }, + { + "epoch": 0.8628, + "grad_norm": 0.33645634186278234, + "learning_rate": 9.711283521411674e-06, + "loss": 0.5337, + "step": 2157 + }, + { + "epoch": 0.8632, + "grad_norm": 0.40772098857340405, + "learning_rate": 9.655668566967025e-06, + "loss": 0.5643, + "step": 2158 + }, + { + "epoch": 0.8636, + "grad_norm": 0.34107492104769427, + "learning_rate": 9.600205239719584e-06, + "loss": 0.61, + "step": 2159 + }, + { + "epoch": 0.864, + "grad_norm": 0.38363834491880583, + "learning_rate": 9.544893632754814e-06, + "loss": 0.5668, + "step": 2160 + }, + { + "epoch": 0.8644, + "grad_norm": 0.3357640280010078, + "learning_rate": 9.489733838903647e-06, + "loss": 0.5896, + "step": 2161 + }, + { + "epoch": 0.8648, + "grad_norm": 0.346235620539173, + "learning_rate": 9.434725950742118e-06, + "loss": 0.616, + "step": 2162 + }, + { + "epoch": 0.8652, + "grad_norm": 0.33011483628748084, + "learning_rate": 9.379870060591434e-06, + "loss": 0.5937, + "step": 2163 + }, + { + "epoch": 0.8656, + "grad_norm": 0.34631395790079156, + "learning_rate": 9.325166260517592e-06, + "loss": 0.5647, + "step": 2164 + }, + { + "epoch": 0.866, + "grad_norm": 0.32898803954272465, + "learning_rate": 9.270614642331376e-06, + "loss": 0.5698, + "step": 2165 + }, + { + "epoch": 0.8664, + "grad_norm": 0.3464200472980868, + "learning_rate": 9.216215297588183e-06, + "loss": 0.576, + "step": 2166 + }, + { + "epoch": 0.8668, + "grad_norm": 0.343889344991586, + "learning_rate": 9.161968317587787e-06, + "loss": 0.6337, + "step": 2167 + }, + { + "epoch": 0.8672, + "grad_norm": 0.33164638899696103, + "learning_rate": 9.107873793374322e-06, + "loss": 0.6191, + "step": 2168 + }, + { + "epoch": 0.8676, + "grad_norm": 0.3574478499833753, + "learning_rate": 9.053931815735994e-06, + "loss": 0.6558, + "step": 2169 + }, + { + "epoch": 0.868, + "grad_norm": 0.36453568461488134, + "learning_rate": 9.000142475204964e-06, + "loss": 0.5521, + "step": 2170 + }, + { + "epoch": 0.8684, + "grad_norm": 0.38408131723261274, + "learning_rate": 8.946505862057286e-06, + "loss": 0.6281, + "step": 2171 + }, + { + "epoch": 0.8688, + "grad_norm": 0.3601526494169314, + "learning_rate": 8.893022066312672e-06, + "loss": 0.5922, + "step": 2172 + }, + { + "epoch": 0.8692, + "grad_norm": 0.34482270087578476, + "learning_rate": 8.839691177734322e-06, + "loss": 0.6064, + "step": 2173 + }, + { + "epoch": 0.8696, + "grad_norm": 0.32648666829842754, + "learning_rate": 8.786513285828834e-06, + "loss": 0.5437, + "step": 2174 + }, + { + "epoch": 0.87, + "grad_norm": 0.3936583852917972, + "learning_rate": 8.733488479845997e-06, + "loss": 0.6111, + "step": 2175 + }, + { + "epoch": 0.8704, + "grad_norm": 0.3175571242137894, + "learning_rate": 8.680616848778711e-06, + "loss": 0.5332, + "step": 2176 + }, + { + "epoch": 0.8708, + "grad_norm": 0.3686518734142378, + "learning_rate": 8.627898481362817e-06, + "loss": 0.6331, + "step": 2177 + }, + { + "epoch": 0.8712, + "grad_norm": 0.3539160746294602, + "learning_rate": 8.575333466076863e-06, + "loss": 0.6253, + "step": 2178 + }, + { + "epoch": 0.8716, + "grad_norm": 0.32778557025960625, + "learning_rate": 8.522921891142032e-06, + "loss": 0.553, + "step": 2179 + }, + { + "epoch": 0.872, + "grad_norm": 0.3485023348541899, + "learning_rate": 8.470663844522052e-06, + "loss": 0.5814, + "step": 2180 + }, + { + "epoch": 0.8724, + "grad_norm": 0.3310926997935631, + "learning_rate": 8.418559413922933e-06, + "loss": 0.5582, + "step": 2181 + }, + { + "epoch": 0.8728, + "grad_norm": 0.37437652214535044, + "learning_rate": 8.366608686792854e-06, + "loss": 0.5453, + "step": 2182 + }, + { + "epoch": 0.8732, + "grad_norm": 0.3481714449337977, + "learning_rate": 8.31481175032206e-06, + "loss": 0.5731, + "step": 2183 + }, + { + "epoch": 0.8736, + "grad_norm": 0.3446384338979933, + "learning_rate": 8.263168691442624e-06, + "loss": 0.6235, + "step": 2184 + }, + { + "epoch": 0.874, + "grad_norm": 0.35092100052470027, + "learning_rate": 8.21167959682848e-06, + "loss": 0.5939, + "step": 2185 + }, + { + "epoch": 0.8744, + "grad_norm": 0.33976686257110345, + "learning_rate": 8.16034455289506e-06, + "loss": 0.5432, + "step": 2186 + }, + { + "epoch": 0.8748, + "grad_norm": 0.3424977560024411, + "learning_rate": 8.109163645799267e-06, + "loss": 0.5625, + "step": 2187 + }, + { + "epoch": 0.8752, + "grad_norm": 0.3259931498126824, + "learning_rate": 8.058136961439333e-06, + "loss": 0.5553, + "step": 2188 + }, + { + "epoch": 0.8756, + "grad_norm": 0.329605914429993, + "learning_rate": 8.007264585454633e-06, + "loss": 0.5597, + "step": 2189 + }, + { + "epoch": 0.876, + "grad_norm": 0.3570522607877131, + "learning_rate": 7.956546603225601e-06, + "loss": 0.5992, + "step": 2190 + }, + { + "epoch": 0.8764, + "grad_norm": 0.37566719111983116, + "learning_rate": 7.905983099873504e-06, + "loss": 0.5553, + "step": 2191 + }, + { + "epoch": 0.8768, + "grad_norm": 0.3854131661562234, + "learning_rate": 7.85557416026037e-06, + "loss": 0.546, + "step": 2192 + }, + { + "epoch": 0.8772, + "grad_norm": 0.3247006170241847, + "learning_rate": 7.805319868988758e-06, + "loss": 0.5784, + "step": 2193 + }, + { + "epoch": 0.8776, + "grad_norm": 0.36393035734443796, + "learning_rate": 7.755220310401811e-06, + "loss": 0.6415, + "step": 2194 + }, + { + "epoch": 0.878, + "grad_norm": 0.3430330211948053, + "learning_rate": 7.705275568582848e-06, + "loss": 0.5825, + "step": 2195 + }, + { + "epoch": 0.8784, + "grad_norm": 0.3364752895914115, + "learning_rate": 7.655485727355415e-06, + "loss": 0.6064, + "step": 2196 + }, + { + "epoch": 0.8788, + "grad_norm": 0.32887588473018375, + "learning_rate": 7.605850870283049e-06, + "loss": 0.6015, + "step": 2197 + }, + { + "epoch": 0.8792, + "grad_norm": 0.37339551678069766, + "learning_rate": 7.556371080669222e-06, + "loss": 0.5891, + "step": 2198 + }, + { + "epoch": 0.8796, + "grad_norm": 0.3412578827035179, + "learning_rate": 7.5070464415571415e-06, + "loss": 0.6025, + "step": 2199 + }, + { + "epoch": 0.88, + "grad_norm": 0.3176814606146469, + "learning_rate": 7.457877035729588e-06, + "loss": 0.546, + "step": 2200 + }, + { + "epoch": 0.8804, + "grad_norm": 0.35333484894842804, + "learning_rate": 7.408862945708839e-06, + "loss": 0.5666, + "step": 2201 + }, + { + "epoch": 0.8808, + "grad_norm": 0.34067618744853434, + "learning_rate": 7.360004253756459e-06, + "loss": 0.5558, + "step": 2202 + }, + { + "epoch": 0.8812, + "grad_norm": 0.3493373501733574, + "learning_rate": 7.311301041873275e-06, + "loss": 0.5826, + "step": 2203 + }, + { + "epoch": 0.8816, + "grad_norm": 0.3470043637665803, + "learning_rate": 7.262753391799127e-06, + "loss": 0.5155, + "step": 2204 + }, + { + "epoch": 0.882, + "grad_norm": 0.3557963350107681, + "learning_rate": 7.21436138501278e-06, + "loss": 0.5693, + "step": 2205 + }, + { + "epoch": 0.8824, + "grad_norm": 0.4127331932220236, + "learning_rate": 7.166125102731735e-06, + "loss": 0.6331, + "step": 2206 + }, + { + "epoch": 0.8828, + "grad_norm": 0.34864615914110403, + "learning_rate": 7.118044625912213e-06, + "loss": 0.5679, + "step": 2207 + }, + { + "epoch": 0.8832, + "grad_norm": 0.3180934168730491, + "learning_rate": 7.070120035248906e-06, + "loss": 0.5818, + "step": 2208 + }, + { + "epoch": 0.8836, + "grad_norm": 0.3678221847733007, + "learning_rate": 7.022351411174866e-06, + "loss": 0.6132, + "step": 2209 + }, + { + "epoch": 0.884, + "grad_norm": 0.3802873506675512, + "learning_rate": 6.974738833861383e-06, + "loss": 0.5353, + "step": 2210 + }, + { + "epoch": 0.8844, + "grad_norm": 0.3893805654449064, + "learning_rate": 6.927282383217892e-06, + "loss": 0.566, + "step": 2211 + }, + { + "epoch": 0.8848, + "grad_norm": 0.33747086502280854, + "learning_rate": 6.879982138891716e-06, + "loss": 0.5934, + "step": 2212 + }, + { + "epoch": 0.8852, + "grad_norm": 0.34829980188505943, + "learning_rate": 6.83283818026812e-06, + "loss": 0.5513, + "step": 2213 + }, + { + "epoch": 0.8856, + "grad_norm": 0.33545995370726744, + "learning_rate": 6.785850586469989e-06, + "loss": 0.5382, + "step": 2214 + }, + { + "epoch": 0.886, + "grad_norm": 0.3686607696228201, + "learning_rate": 6.739019436357774e-06, + "loss": 0.545, + "step": 2215 + }, + { + "epoch": 0.8864, + "grad_norm": 0.3360522782306863, + "learning_rate": 6.692344808529427e-06, + "loss": 0.5816, + "step": 2216 + }, + { + "epoch": 0.8868, + "grad_norm": 0.3450348296150317, + "learning_rate": 6.645826781320142e-06, + "loss": 0.6355, + "step": 2217 + }, + { + "epoch": 0.8872, + "grad_norm": 0.32055865848150106, + "learning_rate": 6.599465432802332e-06, + "loss": 0.6024, + "step": 2218 + }, + { + "epoch": 0.8876, + "grad_norm": 0.34997373998894216, + "learning_rate": 6.553260840785414e-06, + "loss": 0.5816, + "step": 2219 + }, + { + "epoch": 0.888, + "grad_norm": 0.32760059318056384, + "learning_rate": 6.507213082815744e-06, + "loss": 0.5817, + "step": 2220 + }, + { + "epoch": 0.8884, + "grad_norm": 0.3242711784203104, + "learning_rate": 6.461322236176437e-06, + "loss": 0.5533, + "step": 2221 + }, + { + "epoch": 0.8888, + "grad_norm": 0.3316895678247226, + "learning_rate": 6.415588377887305e-06, + "loss": 0.5733, + "step": 2222 + }, + { + "epoch": 0.8892, + "grad_norm": 0.3450565066703003, + "learning_rate": 6.370011584704616e-06, + "loss": 0.5729, + "step": 2223 + }, + { + "epoch": 0.8896, + "grad_norm": 0.34878316171392754, + "learning_rate": 6.324591933121071e-06, + "loss": 0.6155, + "step": 2224 + }, + { + "epoch": 0.89, + "grad_norm": 0.3606929169768857, + "learning_rate": 6.2793294993656494e-06, + "loss": 0.629, + "step": 2225 + }, + { + "epoch": 0.8904, + "grad_norm": 0.35038468644965637, + "learning_rate": 6.2342243594034066e-06, + "loss": 0.5781, + "step": 2226 + }, + { + "epoch": 0.8908, + "grad_norm": 0.3490618098903304, + "learning_rate": 6.1892765889355e-06, + "loss": 0.5881, + "step": 2227 + }, + { + "epoch": 0.8912, + "grad_norm": 0.3414333206005217, + "learning_rate": 6.144486263398886e-06, + "loss": 0.6083, + "step": 2228 + }, + { + "epoch": 0.8916, + "grad_norm": 0.3346368217085105, + "learning_rate": 6.0998534579663425e-06, + "loss": 0.573, + "step": 2229 + }, + { + "epoch": 0.892, + "grad_norm": 0.33617848103996995, + "learning_rate": 6.055378247546218e-06, + "loss": 0.6062, + "step": 2230 + }, + { + "epoch": 0.8924, + "grad_norm": 0.35135871815177266, + "learning_rate": 6.01106070678239e-06, + "loss": 0.6165, + "step": 2231 + }, + { + "epoch": 0.8928, + "grad_norm": 0.3351654589423389, + "learning_rate": 5.96690091005414e-06, + "loss": 0.6054, + "step": 2232 + }, + { + "epoch": 0.8932, + "grad_norm": 0.33937639291603283, + "learning_rate": 5.922898931475973e-06, + "loss": 0.5504, + "step": 2233 + }, + { + "epoch": 0.8936, + "grad_norm": 0.37166808012426555, + "learning_rate": 5.879054844897536e-06, + "loss": 0.5873, + "step": 2234 + }, + { + "epoch": 0.894, + "grad_norm": 0.34574335975587034, + "learning_rate": 5.835368723903456e-06, + "loss": 0.626, + "step": 2235 + }, + { + "epoch": 0.8944, + "grad_norm": 0.345540554802604, + "learning_rate": 5.791840641813295e-06, + "loss": 0.5567, + "step": 2236 + }, + { + "epoch": 0.8948, + "grad_norm": 0.36032088988088107, + "learning_rate": 5.748470671681327e-06, + "loss": 0.6215, + "step": 2237 + }, + { + "epoch": 0.8952, + "grad_norm": 0.35939823610080457, + "learning_rate": 5.705258886296494e-06, + "loss": 0.5905, + "step": 2238 + }, + { + "epoch": 0.8956, + "grad_norm": 0.3908783146108529, + "learning_rate": 5.662205358182226e-06, + "loss": 0.5593, + "step": 2239 + }, + { + "epoch": 0.896, + "grad_norm": 0.35664593571911507, + "learning_rate": 5.6193101595963585e-06, + "loss": 0.5696, + "step": 2240 + }, + { + "epoch": 0.8964, + "grad_norm": 0.33399971612249935, + "learning_rate": 5.576573362531001e-06, + "loss": 0.5926, + "step": 2241 + }, + { + "epoch": 0.8968, + "grad_norm": 0.35303598976057604, + "learning_rate": 5.533995038712403e-06, + "loss": 0.5655, + "step": 2242 + }, + { + "epoch": 0.8972, + "grad_norm": 0.34392994527527554, + "learning_rate": 5.491575259600879e-06, + "loss": 0.6274, + "step": 2243 + }, + { + "epoch": 0.8976, + "grad_norm": 0.3531636604039412, + "learning_rate": 5.449314096390601e-06, + "loss": 0.5736, + "step": 2244 + }, + { + "epoch": 0.898, + "grad_norm": 0.3440271547583296, + "learning_rate": 5.407211620009544e-06, + "loss": 0.5564, + "step": 2245 + }, + { + "epoch": 0.8984, + "grad_norm": 0.35193330728329647, + "learning_rate": 5.365267901119397e-06, + "loss": 0.5596, + "step": 2246 + }, + { + "epoch": 0.8988, + "grad_norm": 0.3794083771602135, + "learning_rate": 5.323483010115382e-06, + "loss": 0.6279, + "step": 2247 + }, + { + "epoch": 0.8992, + "grad_norm": 0.3441663137581851, + "learning_rate": 5.281857017126124e-06, + "loss": 0.6086, + "step": 2248 + }, + { + "epoch": 0.8996, + "grad_norm": 0.35205692114369125, + "learning_rate": 5.240389992013606e-06, + "loss": 0.5958, + "step": 2249 + }, + { + "epoch": 0.9, + "grad_norm": 0.3255215354488011, + "learning_rate": 5.199082004372957e-06, + "loss": 0.5505, + "step": 2250 + }, + { + "epoch": 0.9004, + "grad_norm": 0.33028656101732806, + "learning_rate": 5.157933123532465e-06, + "loss": 0.5662, + "step": 2251 + }, + { + "epoch": 0.9008, + "grad_norm": 0.33655710024673235, + "learning_rate": 5.116943418553355e-06, + "loss": 0.5752, + "step": 2252 + }, + { + "epoch": 0.9012, + "grad_norm": 0.3466324797514499, + "learning_rate": 5.076112958229673e-06, + "loss": 0.5663, + "step": 2253 + }, + { + "epoch": 0.9016, + "grad_norm": 0.34687514423799, + "learning_rate": 5.035441811088204e-06, + "loss": 0.6246, + "step": 2254 + }, + { + "epoch": 0.902, + "grad_norm": 0.36854360123541985, + "learning_rate": 4.994930045388413e-06, + "loss": 0.6255, + "step": 2255 + }, + { + "epoch": 0.9024, + "grad_norm": 0.3475971068131044, + "learning_rate": 4.9545777291222116e-06, + "loss": 0.624, + "step": 2256 + }, + { + "epoch": 0.9028, + "grad_norm": 0.3484591062388061, + "learning_rate": 4.914384930013927e-06, + "loss": 0.6347, + "step": 2257 + }, + { + "epoch": 0.9032, + "grad_norm": 0.35781500647827147, + "learning_rate": 4.874351715520153e-06, + "loss": 0.6228, + "step": 2258 + }, + { + "epoch": 0.9036, + "grad_norm": 0.33479370202552666, + "learning_rate": 4.834478152829658e-06, + "loss": 0.5395, + "step": 2259 + }, + { + "epoch": 0.904, + "grad_norm": 0.3141925719904852, + "learning_rate": 4.794764308863242e-06, + "loss": 0.5663, + "step": 2260 + }, + { + "epoch": 0.9044, + "grad_norm": 0.33925843606100425, + "learning_rate": 4.7552102502737e-06, + "loss": 0.5878, + "step": 2261 + }, + { + "epoch": 0.9048, + "grad_norm": 0.31770909911930123, + "learning_rate": 4.715816043445609e-06, + "loss": 0.5423, + "step": 2262 + }, + { + "epoch": 0.9052, + "grad_norm": 0.33756385648246795, + "learning_rate": 4.676581754495235e-06, + "loss": 0.5288, + "step": 2263 + }, + { + "epoch": 0.9056, + "grad_norm": 0.33754218398571206, + "learning_rate": 4.637507449270517e-06, + "loss": 0.5738, + "step": 2264 + }, + { + "epoch": 0.906, + "grad_norm": 0.3398726259565656, + "learning_rate": 4.5985931933508754e-06, + "loss": 0.5553, + "step": 2265 + }, + { + "epoch": 0.9064, + "grad_norm": 0.3370387764338048, + "learning_rate": 4.559839052047066e-06, + "loss": 0.5934, + "step": 2266 + }, + { + "epoch": 0.9068, + "grad_norm": 0.3499969257394548, + "learning_rate": 4.521245090401172e-06, + "loss": 0.5721, + "step": 2267 + }, + { + "epoch": 0.9072, + "grad_norm": 0.3545203034920037, + "learning_rate": 4.482811373186402e-06, + "loss": 0.5903, + "step": 2268 + }, + { + "epoch": 0.9076, + "grad_norm": 0.3488243483737572, + "learning_rate": 4.444537964907058e-06, + "loss": 0.5761, + "step": 2269 + }, + { + "epoch": 0.908, + "grad_norm": 0.336752297507094, + "learning_rate": 4.406424929798403e-06, + "loss": 0.5814, + "step": 2270 + }, + { + "epoch": 0.9084, + "grad_norm": 0.3441597428004464, + "learning_rate": 4.368472331826478e-06, + "loss": 0.5953, + "step": 2271 + }, + { + "epoch": 0.9088, + "grad_norm": 0.34571888716720656, + "learning_rate": 4.330680234688112e-06, + "loss": 0.5509, + "step": 2272 + }, + { + "epoch": 0.9092, + "grad_norm": 0.347900098704324, + "learning_rate": 4.2930487018107424e-06, + "loss": 0.5934, + "step": 2273 + }, + { + "epoch": 0.9096, + "grad_norm": 0.39059031228716573, + "learning_rate": 4.25557779635235e-06, + "loss": 0.5872, + "step": 2274 + }, + { + "epoch": 0.91, + "grad_norm": 0.3115463345804607, + "learning_rate": 4.2182675812012965e-06, + "loss": 0.5412, + "step": 2275 + }, + { + "epoch": 0.9104, + "grad_norm": 0.35340103148990887, + "learning_rate": 4.1811181189762685e-06, + "loss": 0.5759, + "step": 2276 + }, + { + "epoch": 0.9108, + "grad_norm": 0.33121808970550043, + "learning_rate": 4.144129472026137e-06, + "loss": 0.5895, + "step": 2277 + }, + { + "epoch": 0.9112, + "grad_norm": 0.36392858232423886, + "learning_rate": 4.107301702429922e-06, + "loss": 0.5693, + "step": 2278 + }, + { + "epoch": 0.9116, + "grad_norm": 0.3376782724989046, + "learning_rate": 4.070634871996615e-06, + "loss": 0.5469, + "step": 2279 + }, + { + "epoch": 0.912, + "grad_norm": 0.3342447081785353, + "learning_rate": 4.034129042265066e-06, + "loss": 0.6196, + "step": 2280 + }, + { + "epoch": 0.9124, + "grad_norm": 0.3342148208439684, + "learning_rate": 3.997784274503946e-06, + "loss": 0.5585, + "step": 2281 + }, + { + "epoch": 0.9128, + "grad_norm": 0.3522583216373266, + "learning_rate": 3.961600629711615e-06, + "loss": 0.577, + "step": 2282 + }, + { + "epoch": 0.9132, + "grad_norm": 0.3430041773663385, + "learning_rate": 3.925578168616006e-06, + "loss": 0.5813, + "step": 2283 + }, + { + "epoch": 0.9136, + "grad_norm": 0.35823019371664233, + "learning_rate": 3.8897169516745495e-06, + "loss": 0.5895, + "step": 2284 + }, + { + "epoch": 0.914, + "grad_norm": 0.32901058067219846, + "learning_rate": 3.854017039074009e-06, + "loss": 0.5724, + "step": 2285 + }, + { + "epoch": 0.9144, + "grad_norm": 0.3483576351065279, + "learning_rate": 3.818478490730471e-06, + "loss": 0.5943, + "step": 2286 + }, + { + "epoch": 0.9148, + "grad_norm": 0.3599007095841397, + "learning_rate": 3.783101366289199e-06, + "loss": 0.6038, + "step": 2287 + }, + { + "epoch": 0.9152, + "grad_norm": 0.3497951907501343, + "learning_rate": 3.7478857251245227e-06, + "loss": 0.593, + "step": 2288 + }, + { + "epoch": 0.9156, + "grad_norm": 0.3302294964276083, + "learning_rate": 3.712831626339752e-06, + "loss": 0.5568, + "step": 2289 + }, + { + "epoch": 0.916, + "grad_norm": 0.3377498089915625, + "learning_rate": 3.6779391287670494e-06, + "loss": 0.5952, + "step": 2290 + }, + { + "epoch": 0.9164, + "grad_norm": 0.36302543364294015, + "learning_rate": 3.643208290967415e-06, + "loss": 0.5769, + "step": 2291 + }, + { + "epoch": 0.9168, + "grad_norm": 0.34320098024274204, + "learning_rate": 3.6086391712304878e-06, + "loss": 0.5187, + "step": 2292 + }, + { + "epoch": 0.9172, + "grad_norm": 0.3313292168229392, + "learning_rate": 3.5742318275745145e-06, + "loss": 0.5564, + "step": 2293 + }, + { + "epoch": 0.9176, + "grad_norm": 0.32071056996032454, + "learning_rate": 3.5399863177462024e-06, + "loss": 0.5473, + "step": 2294 + }, + { + "epoch": 0.918, + "grad_norm": 0.35446421413133045, + "learning_rate": 3.5059026992206647e-06, + "loss": 0.5955, + "step": 2295 + }, + { + "epoch": 0.9184, + "grad_norm": 0.359590208707977, + "learning_rate": 3.471981029201321e-06, + "loss": 0.5935, + "step": 2296 + }, + { + "epoch": 0.9188, + "grad_norm": 0.3309916716308859, + "learning_rate": 3.4382213646197757e-06, + "loss": 0.5802, + "step": 2297 + }, + { + "epoch": 0.9192, + "grad_norm": 0.3420185321533074, + "learning_rate": 3.404623762135728e-06, + "loss": 0.5919, + "step": 2298 + }, + { + "epoch": 0.9196, + "grad_norm": 0.3511245899584105, + "learning_rate": 3.371188278136883e-06, + "loss": 0.5894, + "step": 2299 + }, + { + "epoch": 0.92, + "grad_norm": 0.34254644203055135, + "learning_rate": 3.3379149687388867e-06, + "loss": 0.5692, + "step": 2300 + }, + { + "epoch": 0.9204, + "grad_norm": 0.358085252248771, + "learning_rate": 3.3048038897851573e-06, + "loss": 0.6141, + "step": 2301 + }, + { + "epoch": 0.9208, + "grad_norm": 0.33421788692383125, + "learning_rate": 3.271855096846899e-06, + "loss": 0.5761, + "step": 2302 + }, + { + "epoch": 0.9212, + "grad_norm": 0.3305924315223357, + "learning_rate": 3.239068645222898e-06, + "loss": 0.5717, + "step": 2303 + }, + { + "epoch": 0.9216, + "grad_norm": 0.31373356311523226, + "learning_rate": 3.2064445899394724e-06, + "loss": 0.5026, + "step": 2304 + }, + { + "epoch": 0.922, + "grad_norm": 0.3395025535682046, + "learning_rate": 3.1739829857504234e-06, + "loss": 0.5479, + "step": 2305 + }, + { + "epoch": 0.9224, + "grad_norm": 0.3355033876942166, + "learning_rate": 3.1416838871368924e-06, + "loss": 0.5423, + "step": 2306 + }, + { + "epoch": 0.9228, + "grad_norm": 0.32447934130362194, + "learning_rate": 3.1095473483072733e-06, + "loss": 0.5355, + "step": 2307 + }, + { + "epoch": 0.9232, + "grad_norm": 0.3288323850850867, + "learning_rate": 3.077573423197144e-06, + "loss": 0.6181, + "step": 2308 + }, + { + "epoch": 0.9236, + "grad_norm": 0.37778904569320426, + "learning_rate": 3.045762165469168e-06, + "loss": 0.5845, + "step": 2309 + }, + { + "epoch": 0.924, + "grad_norm": 0.3532701249550537, + "learning_rate": 3.014113628512982e-06, + "loss": 0.5839, + "step": 2310 + }, + { + "epoch": 0.9244, + "grad_norm": 0.35176239612432864, + "learning_rate": 2.982627865445109e-06, + "loss": 0.5754, + "step": 2311 + }, + { + "epoch": 0.9248, + "grad_norm": 0.3368894956902543, + "learning_rate": 2.9513049291089555e-06, + "loss": 0.5937, + "step": 2312 + }, + { + "epoch": 0.9252, + "grad_norm": 0.3402801417193046, + "learning_rate": 2.9201448720745706e-06, + "loss": 0.5909, + "step": 2313 + }, + { + "epoch": 0.9256, + "grad_norm": 0.33648464173002784, + "learning_rate": 2.8891477466386986e-06, + "loss": 0.5641, + "step": 2314 + }, + { + "epoch": 0.926, + "grad_norm": 0.5168140106464583, + "learning_rate": 2.8583136048245697e-06, + "loss": 0.5442, + "step": 2315 + }, + { + "epoch": 0.9264, + "grad_norm": 0.363563408061145, + "learning_rate": 2.827642498381955e-06, + "loss": 0.6082, + "step": 2316 + }, + { + "epoch": 0.9268, + "grad_norm": 0.3504629693467053, + "learning_rate": 2.797134478786911e-06, + "loss": 0.6349, + "step": 2317 + }, + { + "epoch": 0.9272, + "grad_norm": 0.3616532069744127, + "learning_rate": 2.76678959724187e-06, + "loss": 0.6088, + "step": 2318 + }, + { + "epoch": 0.9276, + "grad_norm": 0.3507151585532411, + "learning_rate": 2.7366079046753924e-06, + "loss": 0.5617, + "step": 2319 + }, + { + "epoch": 0.928, + "grad_norm": 0.340841838394149, + "learning_rate": 2.706589451742181e-06, + "loss": 0.5989, + "step": 2320 + }, + { + "epoch": 0.9284, + "grad_norm": 0.335911542938969, + "learning_rate": 2.6767342888229908e-06, + "loss": 0.5318, + "step": 2321 + }, + { + "epoch": 0.9288, + "grad_norm": 0.3734278475275637, + "learning_rate": 2.647042466024485e-06, + "loss": 0.6167, + "step": 2322 + }, + { + "epoch": 0.9292, + "grad_norm": 0.3364953235443596, + "learning_rate": 2.617514033179236e-06, + "loss": 0.5745, + "step": 2323 + }, + { + "epoch": 0.9296, + "grad_norm": 0.3365097083040503, + "learning_rate": 2.5881490398455332e-06, + "loss": 0.581, + "step": 2324 + }, + { + "epoch": 0.93, + "grad_norm": 0.3501391841424808, + "learning_rate": 2.5589475353073988e-06, + "loss": 0.6072, + "step": 2325 + }, + { + "epoch": 0.9304, + "grad_norm": 0.33519945683522734, + "learning_rate": 2.5299095685744735e-06, + "loss": 0.5621, + "step": 2326 + }, + { + "epoch": 0.9308, + "grad_norm": 0.3398021668968586, + "learning_rate": 2.5010351883819284e-06, + "loss": 0.598, + "step": 2327 + }, + { + "epoch": 0.9312, + "grad_norm": 0.35673857756315136, + "learning_rate": 2.472324443190355e-06, + "loss": 0.6154, + "step": 2328 + }, + { + "epoch": 0.9316, + "grad_norm": 0.33820518950481704, + "learning_rate": 2.44377738118573e-06, + "loss": 0.5879, + "step": 2329 + }, + { + "epoch": 0.932, + "grad_norm": 0.3528059102780005, + "learning_rate": 2.415394050279318e-06, + "loss": 0.5682, + "step": 2330 + }, + { + "epoch": 0.9324, + "grad_norm": 0.34393419476168063, + "learning_rate": 2.3871744981076136e-06, + "loss": 0.5976, + "step": 2331 + }, + { + "epoch": 0.9328, + "grad_norm": 0.3605738869567883, + "learning_rate": 2.359118772032176e-06, + "loss": 0.6091, + "step": 2332 + }, + { + "epoch": 0.9332, + "grad_norm": 0.3200807481244181, + "learning_rate": 2.331226919139662e-06, + "loss": 0.5778, + "step": 2333 + }, + { + "epoch": 0.9336, + "grad_norm": 0.33176492826326637, + "learning_rate": 2.30349898624167e-06, + "loss": 0.5517, + "step": 2334 + }, + { + "epoch": 0.934, + "grad_norm": 0.3647925866698948, + "learning_rate": 2.2759350198746976e-06, + "loss": 0.5866, + "step": 2335 + }, + { + "epoch": 0.9344, + "grad_norm": 0.3489184401906757, + "learning_rate": 2.2485350663000725e-06, + "loss": 0.5707, + "step": 2336 + }, + { + "epoch": 0.9348, + "grad_norm": 0.33620195135863973, + "learning_rate": 2.2212991715038324e-06, + "loss": 0.5142, + "step": 2337 + }, + { + "epoch": 0.9352, + "grad_norm": 0.3366421335199147, + "learning_rate": 2.1942273811966563e-06, + "loss": 0.5633, + "step": 2338 + }, + { + "epoch": 0.9356, + "grad_norm": 0.3504606894013506, + "learning_rate": 2.1673197408138115e-06, + "loss": 0.5986, + "step": 2339 + }, + { + "epoch": 0.936, + "grad_norm": 0.3643680697201967, + "learning_rate": 2.1405762955151176e-06, + "loss": 0.5975, + "step": 2340 + }, + { + "epoch": 0.9364, + "grad_norm": 0.36971268743873886, + "learning_rate": 2.1139970901847606e-06, + "loss": 0.6056, + "step": 2341 + }, + { + "epoch": 0.9368, + "grad_norm": 0.3205072351313767, + "learning_rate": 2.0875821694313013e-06, + "loss": 0.5336, + "step": 2342 + }, + { + "epoch": 0.9372, + "grad_norm": 0.3281437642775695, + "learning_rate": 2.061331577587566e-06, + "loss": 0.5739, + "step": 2343 + }, + { + "epoch": 0.9376, + "grad_norm": 0.35423646459317043, + "learning_rate": 2.035245358710591e-06, + "loss": 0.5491, + "step": 2344 + }, + { + "epoch": 0.938, + "grad_norm": 0.33130157775600444, + "learning_rate": 2.009323556581566e-06, + "loss": 0.5358, + "step": 2345 + }, + { + "epoch": 0.9384, + "grad_norm": 0.34238238221209616, + "learning_rate": 1.983566214705701e-06, + "loss": 0.5751, + "step": 2346 + }, + { + "epoch": 0.9388, + "grad_norm": 0.34453265639186, + "learning_rate": 1.9579733763121944e-06, + "loss": 0.5756, + "step": 2347 + }, + { + "epoch": 0.9392, + "grad_norm": 0.32732656269296073, + "learning_rate": 1.9325450843541536e-06, + "loss": 0.5608, + "step": 2348 + }, + { + "epoch": 0.9396, + "grad_norm": 0.3205611924412672, + "learning_rate": 1.9072813815085523e-06, + "loss": 0.572, + "step": 2349 + }, + { + "epoch": 0.94, + "grad_norm": 0.3421977666997643, + "learning_rate": 1.882182310176095e-06, + "loss": 0.5506, + "step": 2350 + }, + { + "epoch": 0.9404, + "grad_norm": 0.33785091419659685, + "learning_rate": 1.857247912481197e-06, + "loss": 0.5786, + "step": 2351 + }, + { + "epoch": 0.9408, + "grad_norm": 0.34156770747211374, + "learning_rate": 1.8324782302718834e-06, + "loss": 0.5906, + "step": 2352 + }, + { + "epoch": 0.9412, + "grad_norm": 0.35953608773240814, + "learning_rate": 1.807873305119756e-06, + "loss": 0.5629, + "step": 2353 + }, + { + "epoch": 0.9416, + "grad_norm": 0.33150979888496007, + "learning_rate": 1.7834331783198932e-06, + "loss": 0.539, + "step": 2354 + }, + { + "epoch": 0.942, + "grad_norm": 0.34936801364046166, + "learning_rate": 1.7591578908907724e-06, + "loss": 0.6048, + "step": 2355 + }, + { + "epoch": 0.9424, + "grad_norm": 0.3716423177680825, + "learning_rate": 1.7350474835742147e-06, + "loss": 0.6283, + "step": 2356 + }, + { + "epoch": 0.9428, + "grad_norm": 0.34291629219173464, + "learning_rate": 1.7111019968353626e-06, + "loss": 0.5801, + "step": 2357 + }, + { + "epoch": 0.9432, + "grad_norm": 0.3755431906469608, + "learning_rate": 1.687321470862524e-06, + "loss": 0.6079, + "step": 2358 + }, + { + "epoch": 0.9436, + "grad_norm": 0.3906127082745325, + "learning_rate": 1.6637059455671622e-06, + "loss": 0.6016, + "step": 2359 + }, + { + "epoch": 0.944, + "grad_norm": 0.34306552212420544, + "learning_rate": 1.6402554605838172e-06, + "loss": 0.6368, + "step": 2360 + }, + { + "epoch": 0.9444, + "grad_norm": 0.34742338711971815, + "learning_rate": 1.6169700552700284e-06, + "loss": 0.6125, + "step": 2361 + }, + { + "epoch": 0.9448, + "grad_norm": 0.3618315613050588, + "learning_rate": 1.5938497687062904e-06, + "loss": 0.6126, + "step": 2362 + }, + { + "epoch": 0.9452, + "grad_norm": 0.337892601102626, + "learning_rate": 1.5708946396959856e-06, + "loss": 0.5933, + "step": 2363 + }, + { + "epoch": 0.9456, + "grad_norm": 0.35492727607163377, + "learning_rate": 1.5481047067652743e-06, + "loss": 0.6258, + "step": 2364 + }, + { + "epoch": 0.946, + "grad_norm": 0.37793937027747654, + "learning_rate": 1.5254800081630826e-06, + "loss": 0.5969, + "step": 2365 + }, + { + "epoch": 0.9464, + "grad_norm": 0.33236758505131336, + "learning_rate": 1.5030205818610254e-06, + "loss": 0.6169, + "step": 2366 + }, + { + "epoch": 0.9468, + "grad_norm": 0.3740054825973699, + "learning_rate": 1.4807264655533281e-06, + "loss": 0.56, + "step": 2367 + }, + { + "epoch": 0.9472, + "grad_norm": 0.3341365422221843, + "learning_rate": 1.4585976966567828e-06, + "loss": 0.5486, + "step": 2368 + }, + { + "epoch": 0.9476, + "grad_norm": 0.34825714160269117, + "learning_rate": 1.4366343123106695e-06, + "loss": 0.6052, + "step": 2369 + }, + { + "epoch": 0.948, + "grad_norm": 0.35285631594915745, + "learning_rate": 1.4148363493766802e-06, + "loss": 0.598, + "step": 2370 + }, + { + "epoch": 0.9484, + "grad_norm": 0.3573694612073983, + "learning_rate": 1.3932038444389062e-06, + "loss": 0.5494, + "step": 2371 + }, + { + "epoch": 0.9488, + "grad_norm": 0.3695274957618561, + "learning_rate": 1.3717368338037161e-06, + "loss": 0.5738, + "step": 2372 + }, + { + "epoch": 0.9492, + "grad_norm": 0.3818659467699447, + "learning_rate": 1.3504353534997683e-06, + "loss": 0.6347, + "step": 2373 + }, + { + "epoch": 0.9496, + "grad_norm": 0.39038771420550783, + "learning_rate": 1.3292994392778536e-06, + "loss": 0.5303, + "step": 2374 + }, + { + "epoch": 0.95, + "grad_norm": 0.3677982950208698, + "learning_rate": 1.30832912661093e-06, + "loss": 0.6208, + "step": 2375 + }, + { + "epoch": 0.9504, + "grad_norm": 0.32153789890989554, + "learning_rate": 1.2875244506940109e-06, + "loss": 0.5584, + "step": 2376 + }, + { + "epoch": 0.9508, + "grad_norm": 0.3301731661423886, + "learning_rate": 1.2668854464441104e-06, + "loss": 0.5679, + "step": 2377 + }, + { + "epoch": 0.9512, + "grad_norm": 0.3558794472870126, + "learning_rate": 1.2464121485001978e-06, + "loss": 0.5833, + "step": 2378 + }, + { + "epoch": 0.9516, + "grad_norm": 0.33375321358163346, + "learning_rate": 1.2261045912231318e-06, + "loss": 0.5675, + "step": 2379 + }, + { + "epoch": 0.952, + "grad_norm": 0.35354128098230686, + "learning_rate": 1.2059628086956044e-06, + "loss": 0.5916, + "step": 2380 + }, + { + "epoch": 0.9524, + "grad_norm": 0.3491234364844722, + "learning_rate": 1.1859868347220749e-06, + "loss": 0.5967, + "step": 2381 + }, + { + "epoch": 0.9528, + "grad_norm": 0.33956987922311915, + "learning_rate": 1.1661767028287363e-06, + "loss": 0.5891, + "step": 2382 + }, + { + "epoch": 0.9532, + "grad_norm": 0.3426281690207688, + "learning_rate": 1.1465324462634375e-06, + "loss": 0.5967, + "step": 2383 + }, + { + "epoch": 0.9536, + "grad_norm": 0.38649212119752785, + "learning_rate": 1.1270540979956502e-06, + "loss": 0.5573, + "step": 2384 + }, + { + "epoch": 0.954, + "grad_norm": 0.3482598521397009, + "learning_rate": 1.1077416907163574e-06, + "loss": 0.6243, + "step": 2385 + }, + { + "epoch": 0.9544, + "grad_norm": 0.34723913994768063, + "learning_rate": 1.0885952568380764e-06, + "loss": 0.5999, + "step": 2386 + }, + { + "epoch": 0.9548, + "grad_norm": 0.3420298689457409, + "learning_rate": 1.0696148284947694e-06, + "loss": 0.5875, + "step": 2387 + }, + { + "epoch": 0.9552, + "grad_norm": 0.3475456256380816, + "learning_rate": 1.0508004375417546e-06, + "loss": 0.5913, + "step": 2388 + }, + { + "epoch": 0.9556, + "grad_norm": 0.3405188863402418, + "learning_rate": 1.0321521155557179e-06, + "loss": 0.5335, + "step": 2389 + }, + { + "epoch": 0.956, + "grad_norm": 0.35529313752021524, + "learning_rate": 1.0136698938346011e-06, + "loss": 0.5859, + "step": 2390 + }, + { + "epoch": 0.9564, + "grad_norm": 0.3430242218150067, + "learning_rate": 9.953538033975918e-07, + "loss": 0.6045, + "step": 2391 + }, + { + "epoch": 0.9568, + "grad_norm": 0.348871926036714, + "learning_rate": 9.772038749850665e-07, + "loss": 0.572, + "step": 2392 + }, + { + "epoch": 0.9572, + "grad_norm": 0.35317078131207735, + "learning_rate": 9.59220139058492e-07, + "loss": 0.6179, + "step": 2393 + }, + { + "epoch": 0.9576, + "grad_norm": 0.323743943833337, + "learning_rate": 9.414026258004582e-07, + "loss": 0.5333, + "step": 2394 + }, + { + "epoch": 0.958, + "grad_norm": 0.33483465085793146, + "learning_rate": 9.237513651145225e-07, + "loss": 0.5729, + "step": 2395 + }, + { + "epoch": 0.9584, + "grad_norm": 0.3253237012665603, + "learning_rate": 9.062663866252541e-07, + "loss": 0.5949, + "step": 2396 + }, + { + "epoch": 0.9588, + "grad_norm": 0.3325136181038339, + "learning_rate": 8.889477196781571e-07, + "loss": 0.5344, + "step": 2397 + }, + { + "epoch": 0.9592, + "grad_norm": 0.35296644109808906, + "learning_rate": 8.717953933395694e-07, + "loss": 0.605, + "step": 2398 + }, + { + "epoch": 0.9596, + "grad_norm": 0.3390970266581428, + "learning_rate": 8.548094363966974e-07, + "loss": 0.5447, + "step": 2399 + }, + { + "epoch": 0.96, + "grad_norm": 0.36166833688023653, + "learning_rate": 8.379898773574924e-07, + "loss": 0.5642, + "step": 2400 + }, + { + "epoch": 0.9604, + "grad_norm": 0.42843968940094684, + "learning_rate": 8.213367444506515e-07, + "loss": 0.5747, + "step": 2401 + }, + { + "epoch": 0.9608, + "grad_norm": 0.3472675809359243, + "learning_rate": 8.048500656255509e-07, + "loss": 0.6021, + "step": 2402 + }, + { + "epoch": 0.9612, + "grad_norm": 0.37501497520210736, + "learning_rate": 7.885298685522235e-07, + "loss": 0.5626, + "step": 2403 + }, + { + "epoch": 0.9616, + "grad_norm": 0.36453428855164266, + "learning_rate": 7.72376180621237e-07, + "loss": 0.5585, + "step": 2404 + }, + { + "epoch": 0.962, + "grad_norm": 0.36866230726234994, + "learning_rate": 7.563890289437825e-07, + "loss": 0.6233, + "step": 2405 + }, + { + "epoch": 0.9624, + "grad_norm": 0.357034665309984, + "learning_rate": 7.405684403514634e-07, + "loss": 0.5996, + "step": 2406 + }, + { + "epoch": 0.9628, + "grad_norm": 0.3462692994450068, + "learning_rate": 7.24914441396396e-07, + "loss": 0.5877, + "step": 2407 + }, + { + "epoch": 0.9632, + "grad_norm": 0.3442435233311984, + "learning_rate": 7.094270583510975e-07, + "loss": 0.5682, + "step": 2408 + }, + { + "epoch": 0.9636, + "grad_norm": 0.38064657260176105, + "learning_rate": 6.941063172084095e-07, + "loss": 0.5741, + "step": 2409 + }, + { + "epoch": 0.964, + "grad_norm": 0.3534661622519689, + "learning_rate": 6.78952243681541e-07, + "loss": 0.6255, + "step": 2410 + }, + { + "epoch": 0.9644, + "grad_norm": 0.3328590047781539, + "learning_rate": 6.639648632039697e-07, + "loss": 0.5752, + "step": 2411 + }, + { + "epoch": 0.9648, + "grad_norm": 0.3429833811094635, + "learning_rate": 6.491442009293858e-07, + "loss": 0.6012, + "step": 2412 + }, + { + "epoch": 0.9652, + "grad_norm": 0.3561655404495796, + "learning_rate": 6.344902817316812e-07, + "loss": 0.5503, + "step": 2413 + }, + { + "epoch": 0.9656, + "grad_norm": 0.33775128399922205, + "learning_rate": 6.200031302049047e-07, + "loss": 0.5851, + "step": 2414 + }, + { + "epoch": 0.966, + "grad_norm": 0.3627069667953671, + "learning_rate": 6.056827706632185e-07, + "loss": 0.6216, + "step": 2415 + }, + { + "epoch": 0.9664, + "grad_norm": 0.3493171408060569, + "learning_rate": 5.915292271408524e-07, + "loss": 0.6179, + "step": 2416 + }, + { + "epoch": 0.9668, + "grad_norm": 0.362713924190697, + "learning_rate": 5.775425233920495e-07, + "loss": 0.5923, + "step": 2417 + }, + { + "epoch": 0.9672, + "grad_norm": 0.3883190641575212, + "learning_rate": 5.637226828910436e-07, + "loss": 0.5861, + "step": 2418 + }, + { + "epoch": 0.9676, + "grad_norm": 0.34342397440804545, + "learning_rate": 5.500697288320478e-07, + "loss": 0.5277, + "step": 2419 + }, + { + "epoch": 0.968, + "grad_norm": 0.34098783955612, + "learning_rate": 5.365836841291438e-07, + "loss": 0.5932, + "step": 2420 + }, + { + "epoch": 0.9684, + "grad_norm": 0.37181080315426046, + "learning_rate": 5.232645714163265e-07, + "loss": 0.5796, + "step": 2421 + }, + { + "epoch": 0.9688, + "grad_norm": 0.34473297284696364, + "learning_rate": 5.101124130473811e-07, + "loss": 0.5895, + "step": 2422 + }, + { + "epoch": 0.9692, + "grad_norm": 0.37410772340681114, + "learning_rate": 4.971272310959063e-07, + "loss": 0.5757, + "step": 2423 + }, + { + "epoch": 0.9696, + "grad_norm": 0.3653164649937013, + "learning_rate": 4.843090473552913e-07, + "loss": 0.6452, + "step": 2424 + }, + { + "epoch": 0.97, + "grad_norm": 0.3593211432808053, + "learning_rate": 4.7165788333860536e-07, + "loss": 0.5985, + "step": 2425 + }, + { + "epoch": 0.9704, + "grad_norm": 0.35245013507684075, + "learning_rate": 4.5917376027861945e-07, + "loss": 0.5228, + "step": 2426 + }, + { + "epoch": 0.9708, + "grad_norm": 0.3537624277112952, + "learning_rate": 4.468566991277512e-07, + "loss": 0.5949, + "step": 2427 + }, + { + "epoch": 0.9712, + "grad_norm": 0.32705621542268176, + "learning_rate": 4.347067205580424e-07, + "loss": 0.5503, + "step": 2428 + }, + { + "epoch": 0.9716, + "grad_norm": 0.3382575227276866, + "learning_rate": 4.2272384496112594e-07, + "loss": 0.5573, + "step": 2429 + }, + { + "epoch": 0.972, + "grad_norm": 0.3584734009623166, + "learning_rate": 4.1090809244814785e-07, + "loss": 0.5704, + "step": 2430 + }, + { + "epoch": 0.9724, + "grad_norm": 0.32131468623218323, + "learning_rate": 3.9925948284980086e-07, + "loss": 0.566, + "step": 2431 + }, + { + "epoch": 0.9728, + "grad_norm": 0.35002803250987646, + "learning_rate": 3.877780357162353e-07, + "loss": 0.6152, + "step": 2432 + }, + { + "epoch": 0.9732, + "grad_norm": 0.3511181384629274, + "learning_rate": 3.7646377031705924e-07, + "loss": 0.5882, + "step": 2433 + }, + { + "epoch": 0.9736, + "grad_norm": 0.34404651785041235, + "learning_rate": 3.653167056413054e-07, + "loss": 0.6157, + "step": 2434 + }, + { + "epoch": 0.974, + "grad_norm": 0.3379154238877894, + "learning_rate": 3.543368603973529e-07, + "loss": 0.5449, + "step": 2435 + }, + { + "epoch": 0.9744, + "grad_norm": 0.3381776255081913, + "learning_rate": 3.4352425301297233e-07, + "loss": 0.5812, + "step": 2436 + }, + { + "epoch": 0.9748, + "grad_norm": 0.34012625463195734, + "learning_rate": 3.3287890163523626e-07, + "loss": 0.5641, + "step": 2437 + }, + { + "epoch": 0.9752, + "grad_norm": 0.36650941460648767, + "learning_rate": 3.2240082413049765e-07, + "loss": 0.6161, + "step": 2438 + }, + { + "epoch": 0.9756, + "grad_norm": 0.34436265712860376, + "learning_rate": 3.120900380844116e-07, + "loss": 0.597, + "step": 2439 + }, + { + "epoch": 0.976, + "grad_norm": 0.32650237175810803, + "learning_rate": 3.019465608018024e-07, + "loss": 0.5198, + "step": 2440 + }, + { + "epoch": 0.9764, + "grad_norm": 0.32621853345125035, + "learning_rate": 2.91970409306741e-07, + "loss": 0.5791, + "step": 2441 + }, + { + "epoch": 0.9768, + "grad_norm": 0.3340739671871019, + "learning_rate": 2.8216160034244543e-07, + "loss": 0.5695, + "step": 2442 + }, + { + "epoch": 0.9772, + "grad_norm": 0.3516249067193672, + "learning_rate": 2.7252015037131375e-07, + "loss": 0.5613, + "step": 2443 + }, + { + "epoch": 0.9776, + "grad_norm": 0.3548198648851666, + "learning_rate": 2.630460755748132e-07, + "loss": 0.6438, + "step": 2444 + }, + { + "epoch": 0.978, + "grad_norm": 0.33513881905909393, + "learning_rate": 2.537393918535358e-07, + "loss": 0.5676, + "step": 2445 + }, + { + "epoch": 0.9784, + "grad_norm": 0.3448911204069862, + "learning_rate": 2.4460011482713153e-07, + "loss": 0.5809, + "step": 2446 + }, + { + "epoch": 0.9788, + "grad_norm": 0.32549122686674664, + "learning_rate": 2.3562825983427516e-07, + "loss": 0.5408, + "step": 2447 + }, + { + "epoch": 0.9792, + "grad_norm": 0.3352492333611208, + "learning_rate": 2.2682384193266626e-07, + "loss": 0.5787, + "step": 2448 + }, + { + "epoch": 0.9796, + "grad_norm": 0.3511330639105213, + "learning_rate": 2.1818687589896246e-07, + "loss": 0.5626, + "step": 2449 + }, + { + "epoch": 0.98, + "grad_norm": 0.3334275035098752, + "learning_rate": 2.0971737622883515e-07, + "loss": 0.5496, + "step": 2450 + }, + { + "epoch": 0.9804, + "grad_norm": 0.3318676137989981, + "learning_rate": 2.01415357136836e-07, + "loss": 0.5564, + "step": 2451 + }, + { + "epoch": 0.9808, + "grad_norm": 0.39134888539753543, + "learning_rate": 1.93280832556475e-07, + "loss": 0.6444, + "step": 2452 + }, + { + "epoch": 0.9812, + "grad_norm": 0.35476133630040274, + "learning_rate": 1.853138161401313e-07, + "loss": 0.6449, + "step": 2453 + }, + { + "epoch": 0.9816, + "grad_norm": 0.34844717565920313, + "learning_rate": 1.7751432125903134e-07, + "loss": 0.6005, + "step": 2454 + }, + { + "epoch": 0.982, + "grad_norm": 0.3451148912982019, + "learning_rate": 1.6988236100329292e-07, + "loss": 0.6096, + "step": 2455 + }, + { + "epoch": 0.9824, + "grad_norm": 0.3760768751207625, + "learning_rate": 1.6241794818180333e-07, + "loss": 0.5715, + "step": 2456 + }, + { + "epoch": 0.9828, + "grad_norm": 0.34236614555096034, + "learning_rate": 1.5512109532229702e-07, + "loss": 0.5783, + "step": 2457 + }, + { + "epoch": 0.9832, + "grad_norm": 0.3648978677422674, + "learning_rate": 1.4799181467125556e-07, + "loss": 0.6236, + "step": 2458 + }, + { + "epoch": 0.9836, + "grad_norm": 0.3382695517918823, + "learning_rate": 1.4103011819395217e-07, + "loss": 0.6144, + "step": 2459 + }, + { + "epoch": 0.984, + "grad_norm": 0.33547126455341314, + "learning_rate": 1.3423601757436287e-07, + "loss": 0.5946, + "step": 2460 + }, + { + "epoch": 0.9844, + "grad_norm": 0.35722960414129934, + "learning_rate": 1.276095242151998e-07, + "loss": 0.5891, + "step": 2461 + }, + { + "epoch": 0.9848, + "grad_norm": 0.35703466561425545, + "learning_rate": 1.211506492378778e-07, + "loss": 0.5573, + "step": 2462 + }, + { + "epoch": 0.9852, + "grad_norm": 0.3600369050933721, + "learning_rate": 1.1485940348249235e-07, + "loss": 0.5731, + "step": 2463 + }, + { + "epoch": 0.9856, + "grad_norm": 0.35005886164808475, + "learning_rate": 1.0873579750780849e-07, + "loss": 0.6015, + "step": 2464 + }, + { + "epoch": 0.986, + "grad_norm": 0.38173089236257796, + "learning_rate": 1.0277984159122733e-07, + "loss": 0.5207, + "step": 2465 + }, + { + "epoch": 0.9864, + "grad_norm": 0.3389675973408878, + "learning_rate": 9.699154572877511e-08, + "loss": 0.5788, + "step": 2466 + }, + { + "epoch": 0.9868, + "grad_norm": 0.32971814138367084, + "learning_rate": 9.137091963510314e-08, + "loss": 0.5361, + "step": 2467 + }, + { + "epoch": 0.9872, + "grad_norm": 0.3278269739115661, + "learning_rate": 8.591797274344338e-08, + "loss": 0.5783, + "step": 2468 + }, + { + "epoch": 0.9876, + "grad_norm": 0.35134902862350553, + "learning_rate": 8.063271420563068e-08, + "loss": 0.5904, + "step": 2469 + }, + { + "epoch": 0.988, + "grad_norm": 0.3282824064837508, + "learning_rate": 7.551515289203615e-08, + "loss": 0.5787, + "step": 2470 + }, + { + "epoch": 0.9884, + "grad_norm": 0.39575686355806367, + "learning_rate": 7.056529739158934e-08, + "loss": 0.5738, + "step": 2471 + }, + { + "epoch": 0.9888, + "grad_norm": 0.34569557216135277, + "learning_rate": 6.578315601177831e-08, + "loss": 0.5655, + "step": 2472 + }, + { + "epoch": 0.9892, + "grad_norm": 0.35208160623381174, + "learning_rate": 6.116873677858292e-08, + "loss": 0.5656, + "step": 2473 + }, + { + "epoch": 0.9896, + "grad_norm": 0.3689703449415024, + "learning_rate": 5.6722047436497116e-08, + "loss": 0.546, + "step": 2474 + }, + { + "epoch": 0.99, + "grad_norm": 0.3988292102292916, + "learning_rate": 5.2443095448506674e-08, + "loss": 0.5201, + "step": 2475 + }, + { + "epoch": 0.9904, + "grad_norm": 0.3610586624211906, + "learning_rate": 4.8331887996100336e-08, + "loss": 0.6618, + "step": 2476 + }, + { + "epoch": 0.9908, + "grad_norm": 0.3610758732511304, + "learning_rate": 4.438843197922538e-08, + "loss": 0.5826, + "step": 2477 + }, + { + "epoch": 0.9912, + "grad_norm": 0.3249391309986885, + "learning_rate": 4.061273401627652e-08, + "loss": 0.5627, + "step": 2478 + }, + { + "epoch": 0.9916, + "grad_norm": 0.37379325075132186, + "learning_rate": 3.7004800444095935e-08, + "loss": 0.6008, + "step": 2479 + }, + { + "epoch": 0.992, + "grad_norm": 0.3391315745868408, + "learning_rate": 3.356463731798432e-08, + "loss": 0.5779, + "step": 2480 + }, + { + "epoch": 0.9924, + "grad_norm": 0.3519546846907048, + "learning_rate": 3.0292250411645404e-08, + "loss": 0.5718, + "step": 2481 + }, + { + "epoch": 0.9928, + "grad_norm": 0.3654826071464851, + "learning_rate": 2.718764521721928e-08, + "loss": 0.5915, + "step": 2482 + }, + { + "epoch": 0.9932, + "grad_norm": 0.3339716780119255, + "learning_rate": 2.4250826945226847e-08, + "loss": 0.531, + "step": 2483 + }, + { + "epoch": 0.9936, + "grad_norm": 0.3591835383870838, + "learning_rate": 2.148180052462534e-08, + "loss": 0.5962, + "step": 2484 + }, + { + "epoch": 0.994, + "grad_norm": 0.32906026676200706, + "learning_rate": 1.888057060274173e-08, + "loss": 0.5505, + "step": 2485 + }, + { + "epoch": 0.9944, + "grad_norm": 0.33556356391048286, + "learning_rate": 1.6447141545272715e-08, + "loss": 0.5274, + "step": 2486 + }, + { + "epoch": 0.9948, + "grad_norm": 0.3714304373448778, + "learning_rate": 1.4181517436306912e-08, + "loss": 0.6015, + "step": 2487 + }, + { + "epoch": 0.9952, + "grad_norm": 0.3610303975888095, + "learning_rate": 1.2083702078302672e-08, + "loss": 0.5901, + "step": 2488 + }, + { + "epoch": 0.9956, + "grad_norm": 0.3408485582669479, + "learning_rate": 1.0153698992088068e-08, + "loss": 0.5576, + "step": 2489 + }, + { + "epoch": 0.996, + "grad_norm": 0.47199005918345777, + "learning_rate": 8.391511416816489e-09, + "loss": 0.5967, + "step": 2490 + }, + { + "epoch": 0.9964, + "grad_norm": 0.35849436583308514, + "learning_rate": 6.797142310022153e-09, + "loss": 0.563, + "step": 2491 + }, + { + "epoch": 0.9968, + "grad_norm": 0.3162945272534749, + "learning_rate": 5.370594347575697e-09, + "loss": 0.5457, + "step": 2492 + }, + { + "epoch": 0.9972, + "grad_norm": 0.34495301983804705, + "learning_rate": 4.111869923684175e-09, + "loss": 0.5793, + "step": 2493 + }, + { + "epoch": 0.9976, + "grad_norm": 0.3482463638127228, + "learning_rate": 3.0209711509132654e-09, + "loss": 0.6114, + "step": 2494 + }, + { + "epoch": 0.998, + "grad_norm": 0.34834264419494415, + "learning_rate": 2.0978998601206556e-09, + "loss": 0.5633, + "step": 2495 + }, + { + "epoch": 0.9984, + "grad_norm": 0.37229288685720197, + "learning_rate": 1.342657600544861e-09, + "loss": 0.6379, + "step": 2496 + }, + { + "epoch": 0.9988, + "grad_norm": 0.32919562198756436, + "learning_rate": 7.552456397053043e-10, + "loss": 0.539, + "step": 2497 + }, + { + "epoch": 0.9992, + "grad_norm": 0.35739179206642846, + "learning_rate": 3.3566496349113353e-10, + "loss": 0.6179, + "step": 2498 + }, + { + "epoch": 0.9996, + "grad_norm": 0.377010254391696, + "learning_rate": 8.391627608350661e-11, + "loss": 0.6228, + "step": 2499 + }, + { + "epoch": 1.0, + "grad_norm": 0.36175449337551296, + "learning_rate": 0.0, + "loss": 0.6089, + "step": 2500 + }, + { + "epoch": 1.0, + "step": 2500, + "total_flos": 2225655608836096.0, + "train_loss": 0.6471238312482834, + "train_runtime": 39645.3417, + "train_samples_per_second": 1.009, + "train_steps_per_second": 0.063 + } + ], + "logging_steps": 1.0, + "max_steps": 2500, + "num_input_tokens_seen": 0, + "num_train_epochs": 1, + "save_steps": 500, + "stateful_callbacks": { + "TrainerControl": { + "args": { + "should_epoch_stop": false, + "should_evaluate": false, + "should_log": false, + "should_save": true, + "should_training_stop": true + }, + "attributes": {} + } + }, + "total_flos": 2225655608836096.0, + "train_batch_size": 8, + "trial_name": null, + "trial_params": null +} diff --git a/single_dataset/img2json/VideoGameBunny_v1_1-Llama-3-8B-V-img2json_dataset_50000_epochs_1_lora/README.md b/single_dataset/img2json/VideoGameBunny_v1_1-Llama-3-8B-V-img2json_dataset_50000_epochs_1_lora/README.md new file mode 100644 index 0000000000000000000000000000000000000000..4f291d8d6fdb82fcff274f83bcc52b834347797c --- /dev/null +++ b/single_dataset/img2json/VideoGameBunny_v1_1-Llama-3-8B-V-img2json_dataset_50000_epochs_1_lora/README.md @@ -0,0 +1,202 @@ +--- +library_name: peft +base_model: ./weights/Bunny-v1_1-Llama-3-8B-V +--- + +# Model Card for Model ID + + + + + +## Model Details + +### Model Description + + + + + +- **Developed by:** [More Information Needed] +- **Funded by [optional]:** [More Information Needed] +- **Shared by [optional]:** [More Information Needed] +- **Model type:** [More Information Needed] +- **Language(s) (NLP):** [More Information Needed] +- **License:** [More Information Needed] +- **Finetuned from model [optional]:** [More Information Needed] + +### Model Sources [optional] + + + +- **Repository:** [More Information Needed] +- **Paper [optional]:** [More Information Needed] +- **Demo [optional]:** [More Information Needed] + +## Uses + + + +### Direct Use + + + +[More Information Needed] + +### Downstream Use [optional] + + + +[More Information Needed] + +### Out-of-Scope Use + + + +[More Information Needed] + +## Bias, Risks, and Limitations + + + +[More Information Needed] + +### Recommendations + + + +Users (both direct and downstream) should be made aware of the risks, biases and limitations of the model. More information needed for further recommendations. + +## How to Get Started with the Model + +Use the code below to get started with the model. + +[More Information Needed] + +## Training Details + +### Training Data + + + +[More Information Needed] + +### Training Procedure + + + +#### Preprocessing [optional] + +[More Information Needed] + + +#### Training Hyperparameters + +- **Training regime:** [More Information Needed] + +#### Speeds, Sizes, Times [optional] + + + +[More Information Needed] + +## Evaluation + + + +### Testing Data, Factors & Metrics + +#### Testing Data + + + +[More Information Needed] + +#### Factors + + + +[More Information Needed] + +#### Metrics + + + +[More Information Needed] + +### Results + +[More Information Needed] + +#### Summary + + + +## Model Examination [optional] + + + +[More Information Needed] + +## Environmental Impact + + + +Carbon emissions can be estimated using the [Machine Learning Impact calculator](https://mlco2.github.io/impact#compute) presented in [Lacoste et al. (2019)](https://arxiv.org/abs/1910.09700). + +- **Hardware Type:** [More Information Needed] +- **Hours used:** [More Information Needed] +- **Cloud Provider:** [More Information Needed] +- **Compute Region:** [More Information Needed] +- **Carbon Emitted:** [More Information Needed] + +## Technical Specifications [optional] + +### Model Architecture and Objective + +[More Information Needed] + +### Compute Infrastructure + +[More Information Needed] + +#### Hardware + +[More Information Needed] + +#### Software + +[More Information Needed] + +## Citation [optional] + + + +**BibTeX:** + +[More Information Needed] + +**APA:** + +[More Information Needed] + +## Glossary [optional] + + + +[More Information Needed] + +## More Information [optional] + +[More Information Needed] + +## Model Card Authors [optional] + +[More Information Needed] + +## Model Card Contact + +[More Information Needed] +### Framework versions + +- PEFT 0.11.1 \ No newline at end of file diff --git a/single_dataset/img2json/VideoGameBunny_v1_1-Llama-3-8B-V-img2json_dataset_50000_epochs_1_lora/adapter_config.json b/single_dataset/img2json/VideoGameBunny_v1_1-Llama-3-8B-V-img2json_dataset_50000_epochs_1_lora/adapter_config.json new file mode 100644 index 0000000000000000000000000000000000000000..509139647e40f99c318a4486c4fd46b95c571fbc --- /dev/null +++ b/single_dataset/img2json/VideoGameBunny_v1_1-Llama-3-8B-V-img2json_dataset_50000_epochs_1_lora/adapter_config.json @@ -0,0 +1,34 @@ +{ + "alpha_pattern": {}, + "auto_mapping": null, + "base_model_name_or_path": "./weights/Bunny-v1_1-Llama-3-8B-V", + "bias": "none", + "fan_in_fan_out": false, + "inference_mode": true, + "init_lora_weights": true, + "layer_replication": null, + "layers_pattern": null, + "layers_to_transform": null, + "loftq_config": {}, + "lora_alpha": 256, + "lora_dropout": 0.1, + "megatron_config": null, + "megatron_core": "megatron.core", + "modules_to_save": null, + "peft_type": "LORA", + "r": 128, + "rank_pattern": {}, + "revision": null, + "target_modules": [ + "o_proj", + "k_proj", + "up_proj", + "gate_proj", + "q_proj", + "down_proj", + "v_proj" + ], + "task_type": "CAUSAL_LM", + "use_dora": false, + "use_rslora": false +} \ No newline at end of file diff --git a/single_dataset/img2json/VideoGameBunny_v1_1-Llama-3-8B-V-img2json_dataset_50000_epochs_1_lora/adapter_model.safetensors b/single_dataset/img2json/VideoGameBunny_v1_1-Llama-3-8B-V-img2json_dataset_50000_epochs_1_lora/adapter_model.safetensors new file mode 100644 index 0000000000000000000000000000000000000000..76f2f8522bd15a8067d741ce043c1cb784403ec4 --- /dev/null +++ b/single_dataset/img2json/VideoGameBunny_v1_1-Llama-3-8B-V-img2json_dataset_50000_epochs_1_lora/adapter_model.safetensors @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:2929435931b50af416e3c32a44e2be03d7c0e670a8dcac5ca650b01382413dff +size 671150064 diff --git a/single_dataset/img2json/VideoGameBunny_v1_1-Llama-3-8B-V-img2json_dataset_50000_epochs_1_lora/config.json b/single_dataset/img2json/VideoGameBunny_v1_1-Llama-3-8B-V-img2json_dataset_50000_epochs_1_lora/config.json new file mode 100644 index 0000000000000000000000000000000000000000..ee7ed86f4ff6f3381a6bb24981934be89aa376e7 --- /dev/null +++ b/single_dataset/img2json/VideoGameBunny_v1_1-Llama-3-8B-V-img2json_dataset_50000_epochs_1_lora/config.json @@ -0,0 +1,45 @@ +{ + "_name_or_path": "./weights/Bunny-v1_1-Llama-3-8B-V", + "architectures": [ + "BunnyLlamaForCausalLM" + ], + "attention_bias": false, + "attention_dropout": 0.0, + "auto_map": { + "AutoConfig": "configuration_bunny_llama.BunnyLlamaConfig", + "AutoModelForCausalLM": "modeling_bunny_llama.BunnyLlamaForCausalLM" + }, + "bos_token_id": 128000, + "continuous_training": false, + "eos_token_id": 128001, + "freeze_mm_mlp_adapter": false, + "hidden_act": "silu", + "hidden_size": 4096, + "image_aspect_ratio": "pad", + "initializer_range": 0.02, + "intermediate_size": 14336, + "max_position_embeddings": 8192, + "mm_hidden_size": 3456, + "mm_projector_lr": null, + "mm_projector_type": "mlp2x_gelu", + "mm_vision_tower": "./weights/siglip-so400m-patch14-384", + "model_type": "bunny-llama", + "num_attention_heads": 32, + "num_hidden_layers": 32, + "num_key_value_heads": 8, + "pretraining_tp": 1, + "rms_norm_eps": 1e-05, + "rope_scaling": null, + "rope_theta": 500000.0, + "tie_word_embeddings": false, + "tokenizer_model_max_length": 2048, + "tokenizer_padding_side": "right", + "torch_dtype": "float16", + "transformers_version": "4.41.2", + "tune_mm_mlp_adapter": false, + "unfreeze_vision_tower": true, + "use_cache": true, + "use_mm_proj": true, + "use_s2": true, + "vocab_size": 128256 +} diff --git a/single_dataset/img2json/VideoGameBunny_v1_1-Llama-3-8B-V-img2json_dataset_50000_epochs_1_lora/non_lora_trainables.bin b/single_dataset/img2json/VideoGameBunny_v1_1-Llama-3-8B-V-img2json_dataset_50000_epochs_1_lora/non_lora_trainables.bin new file mode 100644 index 0000000000000000000000000000000000000000..cf7c6c0c6a545a788c84b1a27a7646822f148cac --- /dev/null +++ b/single_dataset/img2json/VideoGameBunny_v1_1-Llama-3-8B-V-img2json_dataset_50000_epochs_1_lora/non_lora_trainables.bin @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:2002d93f399005e211b3ff01fdaca15d40b94b01b86add68364e0a8731bb38df +size 918507402 diff --git a/single_dataset/img2json/VideoGameBunny_v1_1-Llama-3-8B-V-img2json_dataset_50000_epochs_1_lora/trainer_state.json b/single_dataset/img2json/VideoGameBunny_v1_1-Llama-3-8B-V-img2json_dataset_50000_epochs_1_lora/trainer_state.json new file mode 100644 index 0000000000000000000000000000000000000000..6b7ce7ff70e213d78a3f684616aa6b58a9945a3b --- /dev/null +++ b/single_dataset/img2json/VideoGameBunny_v1_1-Llama-3-8B-V-img2json_dataset_50000_epochs_1_lora/trainer_state.json @@ -0,0 +1,21917 @@ +{ + "best_metric": null, + "best_model_checkpoint": null, + "epoch": 1.0, + "eval_steps": 500, + "global_step": 3125, + "is_hyper_param_search": false, + "is_local_process_zero": true, + "is_world_process_zero": true, + "log_history": [ + { + "epoch": 0.00032, + "grad_norm": 1.1731204456473057, + "learning_rate": 2.1276595744680853e-06, + "loss": 1.5375, + "step": 1 + }, + { + "epoch": 0.00064, + "grad_norm": 1.0874451922760042, + "learning_rate": 4.255319148936171e-06, + "loss": 1.5737, + "step": 2 + }, + { + "epoch": 0.00096, + "grad_norm": 1.1522666344335273, + "learning_rate": 6.3829787234042555e-06, + "loss": 1.574, + "step": 3 + }, + { + "epoch": 0.00128, + "grad_norm": 1.1438797548320527, + "learning_rate": 8.510638297872341e-06, + "loss": 1.5675, + "step": 4 + }, + { + "epoch": 0.0016, + "grad_norm": 1.1527978191782793, + "learning_rate": 1.0638297872340426e-05, + "loss": 1.4695, + "step": 5 + }, + { + "epoch": 0.00192, + "grad_norm": 1.030976505512538, + "learning_rate": 1.2765957446808511e-05, + "loss": 1.5025, + "step": 6 + }, + { + "epoch": 0.00224, + "grad_norm": 0.8994453718426768, + "learning_rate": 1.4893617021276596e-05, + "loss": 1.4405, + "step": 7 + }, + { + "epoch": 0.00256, + "grad_norm": 0.9866812823506541, + "learning_rate": 1.7021276595744682e-05, + "loss": 1.4088, + "step": 8 + }, + { + "epoch": 0.00288, + "grad_norm": 1.005930125260958, + "learning_rate": 1.9148936170212766e-05, + "loss": 1.3633, + "step": 9 + }, + { + "epoch": 0.0032, + "grad_norm": 1.002795962550857, + "learning_rate": 2.1276595744680852e-05, + "loss": 1.2755, + "step": 10 + }, + { + "epoch": 0.00352, + "grad_norm": 0.9506223845208505, + "learning_rate": 2.340425531914894e-05, + "loss": 1.2413, + "step": 11 + }, + { + "epoch": 0.00384, + "grad_norm": 1.002996990567505, + "learning_rate": 2.5531914893617022e-05, + "loss": 1.1836, + "step": 12 + }, + { + "epoch": 0.00416, + "grad_norm": 1.2914749552786817, + "learning_rate": 2.765957446808511e-05, + "loss": 1.1046, + "step": 13 + }, + { + "epoch": 0.00448, + "grad_norm": 0.8620511781430058, + "learning_rate": 2.9787234042553192e-05, + "loss": 0.9993, + "step": 14 + }, + { + "epoch": 0.0048, + "grad_norm": 0.8815276678933675, + "learning_rate": 3.191489361702128e-05, + "loss": 1.0141, + "step": 15 + }, + { + "epoch": 0.00512, + "grad_norm": 0.8892144594051491, + "learning_rate": 3.4042553191489365e-05, + "loss": 0.9935, + "step": 16 + }, + { + "epoch": 0.00544, + "grad_norm": 0.8060622568937051, + "learning_rate": 3.617021276595745e-05, + "loss": 1.0498, + "step": 17 + }, + { + "epoch": 0.00576, + "grad_norm": 0.7717226275464889, + "learning_rate": 3.829787234042553e-05, + "loss": 0.9595, + "step": 18 + }, + { + "epoch": 0.00608, + "grad_norm": 0.7214390063255575, + "learning_rate": 4.0425531914893614e-05, + "loss": 0.9034, + "step": 19 + }, + { + "epoch": 0.0064, + "grad_norm": 0.6569777573026117, + "learning_rate": 4.2553191489361704e-05, + "loss": 0.9259, + "step": 20 + }, + { + "epoch": 0.00672, + "grad_norm": 0.6105655595251536, + "learning_rate": 4.468085106382979e-05, + "loss": 1.0036, + "step": 21 + }, + { + "epoch": 0.00704, + "grad_norm": 0.5500381506972107, + "learning_rate": 4.680851063829788e-05, + "loss": 0.8723, + "step": 22 + }, + { + "epoch": 0.00736, + "grad_norm": 0.5960513728068643, + "learning_rate": 4.893617021276596e-05, + "loss": 1.0233, + "step": 23 + }, + { + "epoch": 0.00768, + "grad_norm": 0.5967296068618473, + "learning_rate": 5.1063829787234044e-05, + "loss": 0.8992, + "step": 24 + }, + { + "epoch": 0.008, + "grad_norm": 0.519443074201136, + "learning_rate": 5.319148936170213e-05, + "loss": 0.8753, + "step": 25 + }, + { + "epoch": 0.00832, + "grad_norm": 0.4925062984123281, + "learning_rate": 5.531914893617022e-05, + "loss": 0.8894, + "step": 26 + }, + { + "epoch": 0.00864, + "grad_norm": 0.5023296539264742, + "learning_rate": 5.744680851063831e-05, + "loss": 0.8267, + "step": 27 + }, + { + "epoch": 0.00896, + "grad_norm": 0.49306587234290833, + "learning_rate": 5.9574468085106384e-05, + "loss": 0.8744, + "step": 28 + }, + { + "epoch": 0.00928, + "grad_norm": 0.48486792040650717, + "learning_rate": 6.170212765957447e-05, + "loss": 0.8754, + "step": 29 + }, + { + "epoch": 0.0096, + "grad_norm": 0.5032819920258179, + "learning_rate": 6.382978723404256e-05, + "loss": 0.8425, + "step": 30 + }, + { + "epoch": 0.00992, + "grad_norm": 0.5246804064999607, + "learning_rate": 6.595744680851063e-05, + "loss": 0.8512, + "step": 31 + }, + { + "epoch": 0.01024, + "grad_norm": 0.4932282136254817, + "learning_rate": 6.808510638297873e-05, + "loss": 0.789, + "step": 32 + }, + { + "epoch": 0.01056, + "grad_norm": 0.4818690946226732, + "learning_rate": 7.021276595744681e-05, + "loss": 0.8577, + "step": 33 + }, + { + "epoch": 0.01088, + "grad_norm": 0.5182161730047697, + "learning_rate": 7.23404255319149e-05, + "loss": 0.8742, + "step": 34 + }, + { + "epoch": 0.0112, + "grad_norm": 0.5190136272248325, + "learning_rate": 7.446808510638298e-05, + "loss": 0.8694, + "step": 35 + }, + { + "epoch": 0.01152, + "grad_norm": 0.5013872882456878, + "learning_rate": 7.659574468085106e-05, + "loss": 0.7884, + "step": 36 + }, + { + "epoch": 0.01184, + "grad_norm": 0.4755846116681576, + "learning_rate": 7.872340425531916e-05, + "loss": 0.8186, + "step": 37 + }, + { + "epoch": 0.01216, + "grad_norm": 0.5023490769633178, + "learning_rate": 8.085106382978723e-05, + "loss": 0.8458, + "step": 38 + }, + { + "epoch": 0.01248, + "grad_norm": 0.5506541905065853, + "learning_rate": 8.297872340425533e-05, + "loss": 0.8365, + "step": 39 + }, + { + "epoch": 0.0128, + "grad_norm": 0.49883353079038545, + "learning_rate": 8.510638297872341e-05, + "loss": 0.8089, + "step": 40 + }, + { + "epoch": 0.01312, + "grad_norm": 0.5032322471078805, + "learning_rate": 8.723404255319149e-05, + "loss": 0.7647, + "step": 41 + }, + { + "epoch": 0.01344, + "grad_norm": 0.5300451060396111, + "learning_rate": 8.936170212765958e-05, + "loss": 0.8643, + "step": 42 + }, + { + "epoch": 0.01376, + "grad_norm": 0.5223459318507665, + "learning_rate": 9.148936170212766e-05, + "loss": 0.8625, + "step": 43 + }, + { + "epoch": 0.01408, + "grad_norm": 0.4933444414366919, + "learning_rate": 9.361702127659576e-05, + "loss": 0.8369, + "step": 44 + }, + { + "epoch": 0.0144, + "grad_norm": 0.45964304834337194, + "learning_rate": 9.574468085106384e-05, + "loss": 0.7714, + "step": 45 + }, + { + "epoch": 0.01472, + "grad_norm": 0.47183346334080883, + "learning_rate": 9.787234042553192e-05, + "loss": 0.8145, + "step": 46 + }, + { + "epoch": 0.01504, + "grad_norm": 0.4889165264405602, + "learning_rate": 0.0001, + "loss": 0.8636, + "step": 47 + }, + { + "epoch": 0.01536, + "grad_norm": 0.45880842758274565, + "learning_rate": 0.00010212765957446809, + "loss": 0.8467, + "step": 48 + }, + { + "epoch": 0.01568, + "grad_norm": 0.4546382255534783, + "learning_rate": 0.00010425531914893618, + "loss": 0.7218, + "step": 49 + }, + { + "epoch": 0.016, + "grad_norm": 0.45789199269746717, + "learning_rate": 0.00010638297872340425, + "loss": 0.8478, + "step": 50 + }, + { + "epoch": 0.01632, + "grad_norm": 0.46500190214603254, + "learning_rate": 0.00010851063829787234, + "loss": 0.8135, + "step": 51 + }, + { + "epoch": 0.01664, + "grad_norm": 0.46975396433824307, + "learning_rate": 0.00011063829787234043, + "loss": 0.7553, + "step": 52 + }, + { + "epoch": 0.01696, + "grad_norm": 0.4818383433872269, + "learning_rate": 0.00011276595744680852, + "loss": 0.7569, + "step": 53 + }, + { + "epoch": 0.01728, + "grad_norm": 0.8761618046394473, + "learning_rate": 0.00011489361702127661, + "loss": 0.7913, + "step": 54 + }, + { + "epoch": 0.0176, + "grad_norm": 0.43519748375111394, + "learning_rate": 0.00011702127659574468, + "loss": 0.7454, + "step": 55 + }, + { + "epoch": 0.01792, + "grad_norm": 0.44196300412314393, + "learning_rate": 0.00011914893617021277, + "loss": 0.7915, + "step": 56 + }, + { + "epoch": 0.01824, + "grad_norm": 0.4320803614200594, + "learning_rate": 0.00012127659574468086, + "loss": 0.7815, + "step": 57 + }, + { + "epoch": 0.01856, + "grad_norm": 0.434454215620284, + "learning_rate": 0.00012340425531914893, + "loss": 0.8358, + "step": 58 + }, + { + "epoch": 0.01888, + "grad_norm": 0.444109592699101, + "learning_rate": 0.00012553191489361702, + "loss": 0.8034, + "step": 59 + }, + { + "epoch": 0.0192, + "grad_norm": 0.44321008869127104, + "learning_rate": 0.00012765957446808513, + "loss": 0.7547, + "step": 60 + }, + { + "epoch": 0.01952, + "grad_norm": 0.465332043208612, + "learning_rate": 0.00012978723404255318, + "loss": 0.8228, + "step": 61 + }, + { + "epoch": 0.01984, + "grad_norm": 0.45905730231690656, + "learning_rate": 0.00013191489361702127, + "loss": 0.7571, + "step": 62 + }, + { + "epoch": 0.02016, + "grad_norm": 0.44239081478921777, + "learning_rate": 0.00013404255319148938, + "loss": 0.7157, + "step": 63 + }, + { + "epoch": 0.02048, + "grad_norm": 0.4376818526592498, + "learning_rate": 0.00013617021276595746, + "loss": 0.7761, + "step": 64 + }, + { + "epoch": 0.0208, + "grad_norm": 0.4641660188193671, + "learning_rate": 0.00013829787234042554, + "loss": 0.7672, + "step": 65 + }, + { + "epoch": 0.02112, + "grad_norm": 0.4437586543840469, + "learning_rate": 0.00014042553191489363, + "loss": 0.7645, + "step": 66 + }, + { + "epoch": 0.02144, + "grad_norm": 0.43784489225170686, + "learning_rate": 0.0001425531914893617, + "loss": 0.7887, + "step": 67 + }, + { + "epoch": 0.02176, + "grad_norm": 0.4348987607939572, + "learning_rate": 0.0001446808510638298, + "loss": 0.7532, + "step": 68 + }, + { + "epoch": 0.02208, + "grad_norm": 0.4439487163648935, + "learning_rate": 0.00014680851063829788, + "loss": 0.7627, + "step": 69 + }, + { + "epoch": 0.0224, + "grad_norm": 0.43865263620003636, + "learning_rate": 0.00014893617021276596, + "loss": 0.7902, + "step": 70 + }, + { + "epoch": 0.02272, + "grad_norm": 0.42266616421302383, + "learning_rate": 0.00015106382978723407, + "loss": 0.7184, + "step": 71 + }, + { + "epoch": 0.02304, + "grad_norm": 0.4305898725706256, + "learning_rate": 0.00015319148936170213, + "loss": 0.7616, + "step": 72 + }, + { + "epoch": 0.02336, + "grad_norm": 0.44985286747216346, + "learning_rate": 0.0001553191489361702, + "loss": 0.7949, + "step": 73 + }, + { + "epoch": 0.02368, + "grad_norm": 0.4431490716592776, + "learning_rate": 0.00015744680851063832, + "loss": 0.7553, + "step": 74 + }, + { + "epoch": 0.024, + "grad_norm": 0.42491119706935265, + "learning_rate": 0.00015957446808510637, + "loss": 0.784, + "step": 75 + }, + { + "epoch": 0.02432, + "grad_norm": 0.42683973187367136, + "learning_rate": 0.00016170212765957446, + "loss": 0.7743, + "step": 76 + }, + { + "epoch": 0.02464, + "grad_norm": 0.4424036302783349, + "learning_rate": 0.00016382978723404257, + "loss": 0.7532, + "step": 77 + }, + { + "epoch": 0.02496, + "grad_norm": 0.4939331210509091, + "learning_rate": 0.00016595744680851065, + "loss": 0.7956, + "step": 78 + }, + { + "epoch": 0.02528, + "grad_norm": 0.451524645795332, + "learning_rate": 0.00016808510638297873, + "loss": 0.8558, + "step": 79 + }, + { + "epoch": 0.0256, + "grad_norm": 0.46899146025448957, + "learning_rate": 0.00017021276595744682, + "loss": 0.7842, + "step": 80 + }, + { + "epoch": 0.02592, + "grad_norm": 0.4257699612799091, + "learning_rate": 0.0001723404255319149, + "loss": 0.7888, + "step": 81 + }, + { + "epoch": 0.02624, + "grad_norm": 0.4163742024238363, + "learning_rate": 0.00017446808510638298, + "loss": 0.7359, + "step": 82 + }, + { + "epoch": 0.02656, + "grad_norm": 0.43745821044811817, + "learning_rate": 0.00017659574468085107, + "loss": 0.8091, + "step": 83 + }, + { + "epoch": 0.02688, + "grad_norm": 0.46021608034195793, + "learning_rate": 0.00017872340425531915, + "loss": 0.8086, + "step": 84 + }, + { + "epoch": 0.0272, + "grad_norm": 0.4344927830913337, + "learning_rate": 0.00018085106382978726, + "loss": 0.7739, + "step": 85 + }, + { + "epoch": 0.02752, + "grad_norm": 0.4562564386414459, + "learning_rate": 0.00018297872340425532, + "loss": 0.8199, + "step": 86 + }, + { + "epoch": 0.02784, + "grad_norm": 0.4622391342149164, + "learning_rate": 0.0001851063829787234, + "loss": 0.7877, + "step": 87 + }, + { + "epoch": 0.02816, + "grad_norm": 0.43992782341599485, + "learning_rate": 0.0001872340425531915, + "loss": 0.7858, + "step": 88 + }, + { + "epoch": 0.02848, + "grad_norm": 0.47778531966000015, + "learning_rate": 0.00018936170212765957, + "loss": 0.7728, + "step": 89 + }, + { + "epoch": 0.0288, + "grad_norm": 0.43201711511780055, + "learning_rate": 0.00019148936170212768, + "loss": 0.7886, + "step": 90 + }, + { + "epoch": 0.02912, + "grad_norm": 0.4051612223053757, + "learning_rate": 0.00019361702127659576, + "loss": 0.7675, + "step": 91 + }, + { + "epoch": 0.02944, + "grad_norm": 0.43458334760183204, + "learning_rate": 0.00019574468085106384, + "loss": 0.7546, + "step": 92 + }, + { + "epoch": 0.02976, + "grad_norm": 0.44083664287493324, + "learning_rate": 0.00019787234042553193, + "loss": 0.7344, + "step": 93 + }, + { + "epoch": 0.03008, + "grad_norm": 0.4384926816131158, + "learning_rate": 0.0002, + "loss": 0.7635, + "step": 94 + }, + { + "epoch": 0.0304, + "grad_norm": 0.4593902461605018, + "learning_rate": 0.00019999994628472071, + "loss": 0.783, + "step": 95 + }, + { + "epoch": 0.03072, + "grad_norm": 0.40538192170834214, + "learning_rate": 0.00019999978513894056, + "loss": 0.7276, + "step": 96 + }, + { + "epoch": 0.03104, + "grad_norm": 0.4335493052472172, + "learning_rate": 0.00019999951656283268, + "loss": 0.7662, + "step": 97 + }, + { + "epoch": 0.03136, + "grad_norm": 0.4439500994385527, + "learning_rate": 0.00019999914055668561, + "loss": 0.8079, + "step": 98 + }, + { + "epoch": 0.03168, + "grad_norm": 0.4443586289405873, + "learning_rate": 0.00019999865712090327, + "loss": 0.7917, + "step": 99 + }, + { + "epoch": 0.032, + "grad_norm": 0.43339383197052117, + "learning_rate": 0.000199998066256005, + "loss": 0.7308, + "step": 100 + }, + { + "epoch": 0.03232, + "grad_norm": 0.42602679580634845, + "learning_rate": 0.00019999736796262564, + "loss": 0.7523, + "step": 101 + }, + { + "epoch": 0.03264, + "grad_norm": 0.43622610490088404, + "learning_rate": 0.00019999656224151528, + "loss": 0.8239, + "step": 102 + }, + { + "epoch": 0.03296, + "grad_norm": 0.38275105288948313, + "learning_rate": 0.00019999564909353962, + "loss": 0.7389, + "step": 103 + }, + { + "epoch": 0.03328, + "grad_norm": 0.4154825734979613, + "learning_rate": 0.00019999462851967952, + "loss": 0.7232, + "step": 104 + }, + { + "epoch": 0.0336, + "grad_norm": 0.4197450959577546, + "learning_rate": 0.00019999350052103153, + "loss": 0.6809, + "step": 105 + }, + { + "epoch": 0.03392, + "grad_norm": 0.4481345587105765, + "learning_rate": 0.00019999226509880735, + "loss": 0.7461, + "step": 106 + }, + { + "epoch": 0.03424, + "grad_norm": 0.43864292562090484, + "learning_rate": 0.00019999092225433428, + "loss": 0.7475, + "step": 107 + }, + { + "epoch": 0.03456, + "grad_norm": 0.4351432662380862, + "learning_rate": 0.0001999894719890549, + "loss": 0.7112, + "step": 108 + }, + { + "epoch": 0.03488, + "grad_norm": 0.4340270376809887, + "learning_rate": 0.0001999879143045273, + "loss": 0.8173, + "step": 109 + }, + { + "epoch": 0.0352, + "grad_norm": 0.4419247595812773, + "learning_rate": 0.00019998624920242482, + "loss": 0.7257, + "step": 110 + }, + { + "epoch": 0.03552, + "grad_norm": 0.42418034402832416, + "learning_rate": 0.00019998447668453633, + "loss": 0.7403, + "step": 111 + }, + { + "epoch": 0.03584, + "grad_norm": 0.42013132890142585, + "learning_rate": 0.00019998259675276607, + "loss": 0.7273, + "step": 112 + }, + { + "epoch": 0.03616, + "grad_norm": 0.4195797152062477, + "learning_rate": 0.00019998060940913366, + "loss": 0.7359, + "step": 113 + }, + { + "epoch": 0.03648, + "grad_norm": 0.47343513465246695, + "learning_rate": 0.0001999785146557741, + "loss": 0.7648, + "step": 114 + }, + { + "epoch": 0.0368, + "grad_norm": 0.4314934182495577, + "learning_rate": 0.0001999763124949378, + "loss": 0.7369, + "step": 115 + }, + { + "epoch": 0.03712, + "grad_norm": 0.4169870682876902, + "learning_rate": 0.00019997400292899055, + "loss": 0.7623, + "step": 116 + }, + { + "epoch": 0.03744, + "grad_norm": 0.42705686220444367, + "learning_rate": 0.00019997158596041353, + "loss": 0.7748, + "step": 117 + }, + { + "epoch": 0.03776, + "grad_norm": 0.43707761829477404, + "learning_rate": 0.00019996906159180334, + "loss": 0.7721, + "step": 118 + }, + { + "epoch": 0.03808, + "grad_norm": 0.4303830646358726, + "learning_rate": 0.00019996642982587182, + "loss": 0.7439, + "step": 119 + }, + { + "epoch": 0.0384, + "grad_norm": 0.4697790863576078, + "learning_rate": 0.00019996369066544643, + "loss": 0.7746, + "step": 120 + }, + { + "epoch": 0.03872, + "grad_norm": 0.41907770314531756, + "learning_rate": 0.00019996084411346975, + "loss": 0.7242, + "step": 121 + }, + { + "epoch": 0.03904, + "grad_norm": 0.4231403841755841, + "learning_rate": 0.0001999578901729999, + "loss": 0.7389, + "step": 122 + }, + { + "epoch": 0.03936, + "grad_norm": 0.4420251841404513, + "learning_rate": 0.0001999548288472103, + "loss": 0.7664, + "step": 123 + }, + { + "epoch": 0.03968, + "grad_norm": 0.4132985858471485, + "learning_rate": 0.00019995166013938976, + "loss": 0.7271, + "step": 124 + }, + { + "epoch": 0.04, + "grad_norm": 0.4186223243627318, + "learning_rate": 0.00019994838405294247, + "loss": 0.7653, + "step": 125 + }, + { + "epoch": 0.04032, + "grad_norm": 0.39363335788049236, + "learning_rate": 0.0001999450005913879, + "loss": 0.7418, + "step": 126 + }, + { + "epoch": 0.04064, + "grad_norm": 0.42970976851958553, + "learning_rate": 0.00019994150975836093, + "loss": 0.78, + "step": 127 + }, + { + "epoch": 0.04096, + "grad_norm": 0.4229964846342404, + "learning_rate": 0.0001999379115576118, + "loss": 0.7799, + "step": 128 + }, + { + "epoch": 0.04128, + "grad_norm": 0.44877061698998616, + "learning_rate": 0.00019993420599300602, + "loss": 0.8015, + "step": 129 + }, + { + "epoch": 0.0416, + "grad_norm": 0.4305286517221353, + "learning_rate": 0.00019993039306852458, + "loss": 0.8373, + "step": 130 + }, + { + "epoch": 0.04192, + "grad_norm": 0.42284280674688873, + "learning_rate": 0.00019992647278826368, + "loss": 0.7611, + "step": 131 + }, + { + "epoch": 0.04224, + "grad_norm": 0.4609941178480334, + "learning_rate": 0.0001999224451564349, + "loss": 0.7681, + "step": 132 + }, + { + "epoch": 0.04256, + "grad_norm": 0.4488488848602971, + "learning_rate": 0.00019991831017736518, + "loss": 0.8339, + "step": 133 + }, + { + "epoch": 0.04288, + "grad_norm": 0.44277414607454013, + "learning_rate": 0.0001999140678554967, + "loss": 0.7883, + "step": 134 + }, + { + "epoch": 0.0432, + "grad_norm": 0.3951794719168181, + "learning_rate": 0.00019990971819538707, + "loss": 0.7472, + "step": 135 + }, + { + "epoch": 0.04352, + "grad_norm": 0.4284900147790867, + "learning_rate": 0.00019990526120170908, + "loss": 0.6998, + "step": 136 + }, + { + "epoch": 0.04384, + "grad_norm": 0.4431910180861046, + "learning_rate": 0.00019990069687925098, + "loss": 0.7936, + "step": 137 + }, + { + "epoch": 0.04416, + "grad_norm": 0.43552565116953146, + "learning_rate": 0.0001998960252329162, + "loss": 0.7357, + "step": 138 + }, + { + "epoch": 0.04448, + "grad_norm": 0.44525793652397894, + "learning_rate": 0.00019989124626772353, + "loss": 0.756, + "step": 139 + }, + { + "epoch": 0.0448, + "grad_norm": 0.43809580788943686, + "learning_rate": 0.00019988635998880702, + "loss": 0.7998, + "step": 140 + }, + { + "epoch": 0.04512, + "grad_norm": 0.4019782963026797, + "learning_rate": 0.00019988136640141608, + "loss": 0.748, + "step": 141 + }, + { + "epoch": 0.04544, + "grad_norm": 0.49782136174784086, + "learning_rate": 0.00019987626551091526, + "loss": 0.7979, + "step": 142 + }, + { + "epoch": 0.04576, + "grad_norm": 0.5296834431801735, + "learning_rate": 0.00019987105732278458, + "loss": 0.7204, + "step": 143 + }, + { + "epoch": 0.04608, + "grad_norm": 0.4176342903284688, + "learning_rate": 0.00019986574184261912, + "loss": 0.7115, + "step": 144 + }, + { + "epoch": 0.0464, + "grad_norm": 0.3998532267253694, + "learning_rate": 0.0001998603190761294, + "loss": 0.7143, + "step": 145 + }, + { + "epoch": 0.04672, + "grad_norm": 0.4326528288413225, + "learning_rate": 0.00019985478902914114, + "loss": 0.7296, + "step": 146 + }, + { + "epoch": 0.04704, + "grad_norm": 0.42619185588922404, + "learning_rate": 0.00019984915170759526, + "loss": 0.7583, + "step": 147 + }, + { + "epoch": 0.04736, + "grad_norm": 0.40430419383774363, + "learning_rate": 0.00019984340711754796, + "loss": 0.6987, + "step": 148 + }, + { + "epoch": 0.04768, + "grad_norm": 0.4467205333763215, + "learning_rate": 0.00019983755526517075, + "loss": 0.732, + "step": 149 + }, + { + "epoch": 0.048, + "grad_norm": 0.388442786464422, + "learning_rate": 0.0001998315961567502, + "loss": 0.7369, + "step": 150 + }, + { + "epoch": 0.04832, + "grad_norm": 0.3880083864543083, + "learning_rate": 0.00019982552979868828, + "loss": 0.7397, + "step": 151 + }, + { + "epoch": 0.04864, + "grad_norm": 0.39891635644433154, + "learning_rate": 0.00019981935619750214, + "loss": 0.7184, + "step": 152 + }, + { + "epoch": 0.04896, + "grad_norm": 0.4135459316469171, + "learning_rate": 0.00019981307535982406, + "loss": 0.7289, + "step": 153 + }, + { + "epoch": 0.04928, + "grad_norm": 0.3861488505399555, + "learning_rate": 0.00019980668729240158, + "loss": 0.6985, + "step": 154 + }, + { + "epoch": 0.0496, + "grad_norm": 0.44608865752548255, + "learning_rate": 0.0001998001920020975, + "loss": 0.7259, + "step": 155 + }, + { + "epoch": 0.04992, + "grad_norm": 0.42100894479871004, + "learning_rate": 0.0001997935894958897, + "loss": 0.77, + "step": 156 + }, + { + "epoch": 0.05024, + "grad_norm": 0.4174110619045265, + "learning_rate": 0.00019978687978087126, + "loss": 0.7178, + "step": 157 + }, + { + "epoch": 0.05056, + "grad_norm": 0.4308663947925414, + "learning_rate": 0.0001997800628642505, + "loss": 0.8116, + "step": 158 + }, + { + "epoch": 0.05088, + "grad_norm": 0.4001049289467878, + "learning_rate": 0.0001997731387533509, + "loss": 0.718, + "step": 159 + }, + { + "epoch": 0.0512, + "grad_norm": 0.42079041121567273, + "learning_rate": 0.000199766107455611, + "loss": 0.7559, + "step": 160 + }, + { + "epoch": 0.05152, + "grad_norm": 0.400988421086698, + "learning_rate": 0.00019975896897858462, + "loss": 0.7585, + "step": 161 + }, + { + "epoch": 0.05184, + "grad_norm": 0.40519314433787235, + "learning_rate": 0.00019975172332994064, + "loss": 0.7746, + "step": 162 + }, + { + "epoch": 0.05216, + "grad_norm": 0.4011011567396514, + "learning_rate": 0.0001997443705174631, + "loss": 0.7249, + "step": 163 + }, + { + "epoch": 0.05248, + "grad_norm": 0.39896690231529963, + "learning_rate": 0.0001997369105490512, + "loss": 0.7207, + "step": 164 + }, + { + "epoch": 0.0528, + "grad_norm": 0.39029807861651156, + "learning_rate": 0.0001997293434327192, + "loss": 0.6764, + "step": 165 + }, + { + "epoch": 0.05312, + "grad_norm": 0.40871952633787084, + "learning_rate": 0.00019972166917659647, + "loss": 0.7614, + "step": 166 + }, + { + "epoch": 0.05344, + "grad_norm": 0.3979016647270798, + "learning_rate": 0.00019971388778892754, + "loss": 0.6654, + "step": 167 + }, + { + "epoch": 0.05376, + "grad_norm": 0.42043578133981985, + "learning_rate": 0.00019970599927807202, + "loss": 0.7452, + "step": 168 + }, + { + "epoch": 0.05408, + "grad_norm": 0.4072320901060178, + "learning_rate": 0.0001996980036525045, + "loss": 0.7072, + "step": 169 + }, + { + "epoch": 0.0544, + "grad_norm": 0.3973739493689002, + "learning_rate": 0.0001996899009208148, + "loss": 0.7701, + "step": 170 + }, + { + "epoch": 0.05472, + "grad_norm": 0.39021026431850786, + "learning_rate": 0.00019968169109170773, + "loss": 0.7587, + "step": 171 + }, + { + "epoch": 0.05504, + "grad_norm": 0.412900634118538, + "learning_rate": 0.00019967337417400313, + "loss": 0.7574, + "step": 172 + }, + { + "epoch": 0.05536, + "grad_norm": 0.43102406990920983, + "learning_rate": 0.0001996649501766359, + "loss": 0.7588, + "step": 173 + }, + { + "epoch": 0.05568, + "grad_norm": 0.41930394190492015, + "learning_rate": 0.000199656419108656, + "loss": 0.7442, + "step": 174 + }, + { + "epoch": 0.056, + "grad_norm": 0.43674689725598764, + "learning_rate": 0.0001996477809792284, + "loss": 0.7352, + "step": 175 + }, + { + "epoch": 0.05632, + "grad_norm": 0.4453725734876997, + "learning_rate": 0.00019963903579763313, + "loss": 0.7385, + "step": 176 + }, + { + "epoch": 0.05664, + "grad_norm": 0.42876817533957673, + "learning_rate": 0.0001996301835732651, + "loss": 0.7217, + "step": 177 + }, + { + "epoch": 0.05696, + "grad_norm": 0.418426365088918, + "learning_rate": 0.0001996212243156344, + "loss": 0.762, + "step": 178 + }, + { + "epoch": 0.05728, + "grad_norm": 0.4103204533243087, + "learning_rate": 0.00019961215803436595, + "loss": 0.7672, + "step": 179 + }, + { + "epoch": 0.0576, + "grad_norm": 0.38899188788309413, + "learning_rate": 0.00019960298473919972, + "loss": 0.7198, + "step": 180 + }, + { + "epoch": 0.05792, + "grad_norm": 0.42512049915701683, + "learning_rate": 0.00019959370443999063, + "loss": 0.7572, + "step": 181 + }, + { + "epoch": 0.05824, + "grad_norm": 0.39589404157816827, + "learning_rate": 0.00019958431714670857, + "loss": 0.7856, + "step": 182 + }, + { + "epoch": 0.05856, + "grad_norm": 0.4005547070868384, + "learning_rate": 0.00019957482286943838, + "loss": 0.7429, + "step": 183 + }, + { + "epoch": 0.05888, + "grad_norm": 0.4211613797439173, + "learning_rate": 0.00019956522161837975, + "loss": 0.7364, + "step": 184 + }, + { + "epoch": 0.0592, + "grad_norm": 0.39825753824106586, + "learning_rate": 0.00019955551340384743, + "loss": 0.73, + "step": 185 + }, + { + "epoch": 0.05952, + "grad_norm": 0.4033078721501204, + "learning_rate": 0.000199545698236271, + "loss": 0.7517, + "step": 186 + }, + { + "epoch": 0.05984, + "grad_norm": 0.409040308738097, + "learning_rate": 0.00019953577612619484, + "loss": 0.7586, + "step": 187 + }, + { + "epoch": 0.06016, + "grad_norm": 0.40795573538716995, + "learning_rate": 0.00019952574708427849, + "loss": 0.7151, + "step": 188 + }, + { + "epoch": 0.06048, + "grad_norm": 0.40579597124070016, + "learning_rate": 0.00019951561112129614, + "loss": 0.7847, + "step": 189 + }, + { + "epoch": 0.0608, + "grad_norm": 0.41542554720413943, + "learning_rate": 0.00019950536824813684, + "loss": 0.7643, + "step": 190 + }, + { + "epoch": 0.06112, + "grad_norm": 0.38383861271606445, + "learning_rate": 0.00019949501847580468, + "loss": 0.6891, + "step": 191 + }, + { + "epoch": 0.06144, + "grad_norm": 0.39631534097947313, + "learning_rate": 0.0001994845618154184, + "loss": 0.7638, + "step": 192 + }, + { + "epoch": 0.06176, + "grad_norm": 0.4136445289531319, + "learning_rate": 0.00019947399827821167, + "loss": 0.6831, + "step": 193 + }, + { + "epoch": 0.06208, + "grad_norm": 0.4193170159492938, + "learning_rate": 0.000199463327875533, + "loss": 0.7256, + "step": 194 + }, + { + "epoch": 0.0624, + "grad_norm": 0.4293844050838592, + "learning_rate": 0.00019945255061884558, + "loss": 0.7581, + "step": 195 + }, + { + "epoch": 0.06272, + "grad_norm": 0.42058192682057277, + "learning_rate": 0.00019944166651972753, + "loss": 0.7437, + "step": 196 + }, + { + "epoch": 0.06304, + "grad_norm": 0.4290448472953912, + "learning_rate": 0.00019943067558987173, + "loss": 0.7493, + "step": 197 + }, + { + "epoch": 0.06336, + "grad_norm": 0.37354230740165073, + "learning_rate": 0.0001994195778410857, + "loss": 0.7365, + "step": 198 + }, + { + "epoch": 0.06368, + "grad_norm": 0.44989861229606387, + "learning_rate": 0.0001994083732852919, + "loss": 0.7501, + "step": 199 + }, + { + "epoch": 0.064, + "grad_norm": 0.3999556571515187, + "learning_rate": 0.00019939706193452744, + "loss": 0.6995, + "step": 200 + }, + { + "epoch": 0.06432, + "grad_norm": 0.4906851537775836, + "learning_rate": 0.00019938564380094414, + "loss": 0.7537, + "step": 201 + }, + { + "epoch": 0.06464, + "grad_norm": 0.41015748077898057, + "learning_rate": 0.00019937411889680854, + "loss": 0.7069, + "step": 202 + }, + { + "epoch": 0.06496, + "grad_norm": 0.4336475847452223, + "learning_rate": 0.00019936248723450195, + "loss": 0.7481, + "step": 203 + }, + { + "epoch": 0.06528, + "grad_norm": 0.4055756795485628, + "learning_rate": 0.00019935074882652034, + "loss": 0.7076, + "step": 204 + }, + { + "epoch": 0.0656, + "grad_norm": 0.3932595761842593, + "learning_rate": 0.0001993389036854743, + "loss": 0.7103, + "step": 205 + }, + { + "epoch": 0.06592, + "grad_norm": 0.4390232786872728, + "learning_rate": 0.0001993269518240892, + "loss": 0.7166, + "step": 206 + }, + { + "epoch": 0.06624, + "grad_norm": 0.4096971237501023, + "learning_rate": 0.0001993148932552049, + "loss": 0.7667, + "step": 207 + }, + { + "epoch": 0.06656, + "grad_norm": 0.40859693813475584, + "learning_rate": 0.00019930272799177607, + "loss": 0.7489, + "step": 208 + }, + { + "epoch": 0.06688, + "grad_norm": 0.41221207010486643, + "learning_rate": 0.00019929045604687187, + "loss": 0.7128, + "step": 209 + }, + { + "epoch": 0.0672, + "grad_norm": 0.5434540483850174, + "learning_rate": 0.00019927807743367611, + "loss": 0.7341, + "step": 210 + }, + { + "epoch": 0.06752, + "grad_norm": 0.45360060601262314, + "learning_rate": 0.00019926559216548728, + "loss": 0.7629, + "step": 211 + }, + { + "epoch": 0.06784, + "grad_norm": 0.40185826295679233, + "learning_rate": 0.0001992530002557183, + "loss": 0.7708, + "step": 212 + }, + { + "epoch": 0.06816, + "grad_norm": 0.40964848035629653, + "learning_rate": 0.00019924030171789676, + "loss": 0.745, + "step": 213 + }, + { + "epoch": 0.06848, + "grad_norm": 0.3960611256344289, + "learning_rate": 0.00019922749656566476, + "loss": 0.6921, + "step": 214 + }, + { + "epoch": 0.0688, + "grad_norm": 0.4241656510189951, + "learning_rate": 0.00019921458481277895, + "loss": 0.7762, + "step": 215 + }, + { + "epoch": 0.06912, + "grad_norm": 0.387980076780542, + "learning_rate": 0.00019920156647311048, + "loss": 0.7131, + "step": 216 + }, + { + "epoch": 0.06944, + "grad_norm": 0.4054873163160114, + "learning_rate": 0.00019918844156064505, + "loss": 0.7034, + "step": 217 + }, + { + "epoch": 0.06976, + "grad_norm": 0.39539500463069105, + "learning_rate": 0.00019917521008948287, + "loss": 0.7339, + "step": 218 + }, + { + "epoch": 0.07008, + "grad_norm": 0.38885942769563575, + "learning_rate": 0.00019916187207383846, + "loss": 0.7737, + "step": 219 + }, + { + "epoch": 0.0704, + "grad_norm": 0.42177712975947823, + "learning_rate": 0.00019914842752804103, + "loss": 0.7967, + "step": 220 + }, + { + "epoch": 0.07072, + "grad_norm": 0.4210850229779522, + "learning_rate": 0.00019913487646653407, + "loss": 0.7173, + "step": 221 + }, + { + "epoch": 0.07104, + "grad_norm": 0.40742160560153656, + "learning_rate": 0.00019912121890387562, + "loss": 0.7331, + "step": 222 + }, + { + "epoch": 0.07136, + "grad_norm": 0.4191705551965322, + "learning_rate": 0.00019910745485473804, + "loss": 0.7154, + "step": 223 + }, + { + "epoch": 0.07168, + "grad_norm": 0.4233092491679564, + "learning_rate": 0.00019909358433390812, + "loss": 0.7744, + "step": 224 + }, + { + "epoch": 0.072, + "grad_norm": 0.4193130476377568, + "learning_rate": 0.00019907960735628704, + "loss": 0.7612, + "step": 225 + }, + { + "epoch": 0.07232, + "grad_norm": 0.40321576798819103, + "learning_rate": 0.00019906552393689038, + "loss": 0.7599, + "step": 226 + }, + { + "epoch": 0.07264, + "grad_norm": 0.38047122311599635, + "learning_rate": 0.000199051334090848, + "loss": 0.7178, + "step": 227 + }, + { + "epoch": 0.07296, + "grad_norm": 0.3951723954239479, + "learning_rate": 0.00019903703783340413, + "loss": 0.7889, + "step": 228 + }, + { + "epoch": 0.07328, + "grad_norm": 0.38408564960849323, + "learning_rate": 0.00019902263517991732, + "loss": 0.7144, + "step": 229 + }, + { + "epoch": 0.0736, + "grad_norm": 0.3835867778635915, + "learning_rate": 0.00019900812614586044, + "loss": 0.7334, + "step": 230 + }, + { + "epoch": 0.07392, + "grad_norm": 0.37616731257706504, + "learning_rate": 0.00019899351074682063, + "loss": 0.7208, + "step": 231 + }, + { + "epoch": 0.07424, + "grad_norm": 0.4059665060035399, + "learning_rate": 0.00019897878899849926, + "loss": 0.7589, + "step": 232 + }, + { + "epoch": 0.07456, + "grad_norm": 0.36633209765115576, + "learning_rate": 0.000198963960916712, + "loss": 0.6973, + "step": 233 + }, + { + "epoch": 0.07488, + "grad_norm": 0.3848590319578474, + "learning_rate": 0.00019894902651738878, + "loss": 0.6899, + "step": 234 + }, + { + "epoch": 0.0752, + "grad_norm": 0.37115991361388495, + "learning_rate": 0.00019893398581657365, + "loss": 0.7372, + "step": 235 + }, + { + "epoch": 0.07552, + "grad_norm": 0.38925631584518244, + "learning_rate": 0.00019891883883042496, + "loss": 0.77, + "step": 236 + }, + { + "epoch": 0.07584, + "grad_norm": 0.3921187708606024, + "learning_rate": 0.0001989035855752152, + "loss": 0.7476, + "step": 237 + }, + { + "epoch": 0.07616, + "grad_norm": 0.3856452255442644, + "learning_rate": 0.000198888226067331, + "loss": 0.7207, + "step": 238 + }, + { + "epoch": 0.07648, + "grad_norm": 0.3984872935543763, + "learning_rate": 0.00019887276032327318, + "loss": 0.736, + "step": 239 + }, + { + "epoch": 0.0768, + "grad_norm": 0.3915371543475769, + "learning_rate": 0.00019885718835965666, + "loss": 0.7146, + "step": 240 + }, + { + "epoch": 0.07712, + "grad_norm": 0.39266246316836856, + "learning_rate": 0.00019884151019321054, + "loss": 0.7087, + "step": 241 + }, + { + "epoch": 0.07744, + "grad_norm": 0.38461881795166664, + "learning_rate": 0.00019882572584077788, + "loss": 0.7825, + "step": 242 + }, + { + "epoch": 0.07776, + "grad_norm": 0.3924735719126863, + "learning_rate": 0.00019880983531931596, + "loss": 0.7353, + "step": 243 + }, + { + "epoch": 0.07808, + "grad_norm": 0.38095421708008886, + "learning_rate": 0.00019879383864589606, + "loss": 0.7371, + "step": 244 + }, + { + "epoch": 0.0784, + "grad_norm": 0.41911007880966034, + "learning_rate": 0.00019877773583770346, + "loss": 0.7486, + "step": 245 + }, + { + "epoch": 0.07872, + "grad_norm": 0.6036030468188308, + "learning_rate": 0.00019876152691203748, + "loss": 0.7411, + "step": 246 + }, + { + "epoch": 0.07904, + "grad_norm": 0.39942905648066546, + "learning_rate": 0.00019874521188631154, + "loss": 0.6852, + "step": 247 + }, + { + "epoch": 0.07936, + "grad_norm": 0.4011128910600863, + "learning_rate": 0.0001987287907780529, + "loss": 0.7182, + "step": 248 + }, + { + "epoch": 0.07968, + "grad_norm": 0.40190246584576583, + "learning_rate": 0.00019871226360490286, + "loss": 0.7275, + "step": 249 + }, + { + "epoch": 0.08, + "grad_norm": 0.39918816572839777, + "learning_rate": 0.00019869563038461664, + "loss": 0.7246, + "step": 250 + }, + { + "epoch": 0.08032, + "grad_norm": 0.3942953012704703, + "learning_rate": 0.00019867889113506343, + "loss": 0.7171, + "step": 251 + }, + { + "epoch": 0.08064, + "grad_norm": 0.37653074922940427, + "learning_rate": 0.00019866204587422627, + "loss": 0.6838, + "step": 252 + }, + { + "epoch": 0.08096, + "grad_norm": 0.4063054496037096, + "learning_rate": 0.00019864509462020217, + "loss": 0.7884, + "step": 253 + }, + { + "epoch": 0.08128, + "grad_norm": 0.3996311018272426, + "learning_rate": 0.0001986280373912019, + "loss": 0.7356, + "step": 254 + }, + { + "epoch": 0.0816, + "grad_norm": 0.3919731619260927, + "learning_rate": 0.00019861087420555018, + "loss": 0.7207, + "step": 255 + }, + { + "epoch": 0.08192, + "grad_norm": 0.3753944432703445, + "learning_rate": 0.00019859360508168544, + "loss": 0.6757, + "step": 256 + }, + { + "epoch": 0.08224, + "grad_norm": 0.3972143798271442, + "learning_rate": 0.00019857623003816013, + "loss": 0.7128, + "step": 257 + }, + { + "epoch": 0.08256, + "grad_norm": 0.394576659186477, + "learning_rate": 0.00019855874909364022, + "loss": 0.7093, + "step": 258 + }, + { + "epoch": 0.08288, + "grad_norm": 0.4306181660524957, + "learning_rate": 0.00019854116226690564, + "loss": 0.8011, + "step": 259 + }, + { + "epoch": 0.0832, + "grad_norm": 0.43917347187513456, + "learning_rate": 0.00019852346957685004, + "loss": 0.7001, + "step": 260 + }, + { + "epoch": 0.08352, + "grad_norm": 0.38619493331680604, + "learning_rate": 0.00019850567104248078, + "loss": 0.6714, + "step": 261 + }, + { + "epoch": 0.08384, + "grad_norm": 0.39185682881131534, + "learning_rate": 0.00019848776668291885, + "loss": 0.7346, + "step": 262 + }, + { + "epoch": 0.08416, + "grad_norm": 0.3970161198449849, + "learning_rate": 0.0001984697565173991, + "loss": 0.6804, + "step": 263 + }, + { + "epoch": 0.08448, + "grad_norm": 0.40749158498554805, + "learning_rate": 0.00019845164056526987, + "loss": 0.6896, + "step": 264 + }, + { + "epoch": 0.0848, + "grad_norm": 0.387205310428941, + "learning_rate": 0.00019843341884599326, + "loss": 0.6973, + "step": 265 + }, + { + "epoch": 0.08512, + "grad_norm": 0.3987324599604142, + "learning_rate": 0.000198415091379145, + "loss": 0.6619, + "step": 266 + }, + { + "epoch": 0.08544, + "grad_norm": 0.4285974439889262, + "learning_rate": 0.00019839665818441432, + "loss": 0.7207, + "step": 267 + }, + { + "epoch": 0.08576, + "grad_norm": 0.38351120241378284, + "learning_rate": 0.00019837811928160418, + "loss": 0.7648, + "step": 268 + }, + { + "epoch": 0.08608, + "grad_norm": 0.4365965888950885, + "learning_rate": 0.000198359474690631, + "loss": 0.7123, + "step": 269 + }, + { + "epoch": 0.0864, + "grad_norm": 0.38602629914454967, + "learning_rate": 0.0001983407244315247, + "loss": 0.7121, + "step": 270 + }, + { + "epoch": 0.08672, + "grad_norm": 0.3990678215990375, + "learning_rate": 0.0001983218685244289, + "loss": 0.7182, + "step": 271 + }, + { + "epoch": 0.08704, + "grad_norm": 0.4017389951871944, + "learning_rate": 0.00019830290698960053, + "loss": 0.7595, + "step": 272 + }, + { + "epoch": 0.08736, + "grad_norm": 0.4142872969203646, + "learning_rate": 0.00019828383984741007, + "loss": 0.7476, + "step": 273 + }, + { + "epoch": 0.08768, + "grad_norm": 0.4312517387355425, + "learning_rate": 0.0001982646671183415, + "loss": 0.7136, + "step": 274 + }, + { + "epoch": 0.088, + "grad_norm": 0.39073818316410924, + "learning_rate": 0.0001982453888229922, + "loss": 0.71, + "step": 275 + }, + { + "epoch": 0.08832, + "grad_norm": 0.40378235037568677, + "learning_rate": 0.0001982260049820729, + "loss": 0.7153, + "step": 276 + }, + { + "epoch": 0.08864, + "grad_norm": 0.41744686856989976, + "learning_rate": 0.00019820651561640778, + "loss": 0.7061, + "step": 277 + }, + { + "epoch": 0.08896, + "grad_norm": 0.44073694938968294, + "learning_rate": 0.00019818692074693441, + "loss": 0.7551, + "step": 278 + }, + { + "epoch": 0.08928, + "grad_norm": 0.41495195011598074, + "learning_rate": 0.00019816722039470364, + "loss": 0.7389, + "step": 279 + }, + { + "epoch": 0.0896, + "grad_norm": 0.3893739240546949, + "learning_rate": 0.00019814741458087966, + "loss": 0.7219, + "step": 280 + }, + { + "epoch": 0.08992, + "grad_norm": 0.39734050761821677, + "learning_rate": 0.00019812750332673997, + "loss": 0.7144, + "step": 281 + }, + { + "epoch": 0.09024, + "grad_norm": 0.3995677661543884, + "learning_rate": 0.00019810748665367536, + "loss": 0.7737, + "step": 282 + }, + { + "epoch": 0.09056, + "grad_norm": 0.38807398553849615, + "learning_rate": 0.00019808736458318987, + "loss": 0.7286, + "step": 283 + }, + { + "epoch": 0.09088, + "grad_norm": 0.3896745873389733, + "learning_rate": 0.00019806713713690067, + "loss": 0.7723, + "step": 284 + }, + { + "epoch": 0.0912, + "grad_norm": 0.39932755474156606, + "learning_rate": 0.0001980468043365383, + "loss": 0.7098, + "step": 285 + }, + { + "epoch": 0.09152, + "grad_norm": 0.4166439305222204, + "learning_rate": 0.0001980263662039464, + "loss": 0.7309, + "step": 286 + }, + { + "epoch": 0.09184, + "grad_norm": 0.43485967761182587, + "learning_rate": 0.00019800582276108172, + "loss": 0.7879, + "step": 287 + }, + { + "epoch": 0.09216, + "grad_norm": 0.4039692399472808, + "learning_rate": 0.00019798517403001422, + "loss": 0.6795, + "step": 288 + }, + { + "epoch": 0.09248, + "grad_norm": 0.4041694349935407, + "learning_rate": 0.00019796442003292697, + "loss": 0.725, + "step": 289 + }, + { + "epoch": 0.0928, + "grad_norm": 0.4313839714011346, + "learning_rate": 0.00019794356079211604, + "loss": 0.7963, + "step": 290 + }, + { + "epoch": 0.09312, + "grad_norm": 0.3858259405265454, + "learning_rate": 0.0001979225963299907, + "loss": 0.733, + "step": 291 + }, + { + "epoch": 0.09344, + "grad_norm": 0.43041826493714264, + "learning_rate": 0.00019790152666907318, + "loss": 0.7296, + "step": 292 + }, + { + "epoch": 0.09376, + "grad_norm": 0.3803588873535391, + "learning_rate": 0.00019788035183199867, + "loss": 0.7085, + "step": 293 + }, + { + "epoch": 0.09408, + "grad_norm": 0.3988119900129413, + "learning_rate": 0.0001978590718415155, + "loss": 0.6845, + "step": 294 + }, + { + "epoch": 0.0944, + "grad_norm": 0.38587862906097614, + "learning_rate": 0.00019783768672048484, + "loss": 0.6867, + "step": 295 + }, + { + "epoch": 0.09472, + "grad_norm": 0.37636745195633065, + "learning_rate": 0.0001978161964918808, + "loss": 0.7104, + "step": 296 + }, + { + "epoch": 0.09504, + "grad_norm": 0.4233309449118061, + "learning_rate": 0.00019779460117879056, + "loss": 0.7115, + "step": 297 + }, + { + "epoch": 0.09536, + "grad_norm": 0.38227248134881303, + "learning_rate": 0.00019777290080441403, + "loss": 0.6601, + "step": 298 + }, + { + "epoch": 0.09568, + "grad_norm": 0.38112738163132837, + "learning_rate": 0.000197751095392064, + "loss": 0.7168, + "step": 299 + }, + { + "epoch": 0.096, + "grad_norm": 0.40719871074341357, + "learning_rate": 0.00019772918496516618, + "loss": 0.7509, + "step": 300 + }, + { + "epoch": 0.09632, + "grad_norm": 0.3902519370404223, + "learning_rate": 0.0001977071695472591, + "loss": 0.736, + "step": 301 + }, + { + "epoch": 0.09664, + "grad_norm": 0.39785614818438686, + "learning_rate": 0.00019768504916199402, + "loss": 0.6749, + "step": 302 + }, + { + "epoch": 0.09696, + "grad_norm": 0.4337021476051533, + "learning_rate": 0.00019766282383313496, + "loss": 0.7215, + "step": 303 + }, + { + "epoch": 0.09728, + "grad_norm": 0.3994556451153538, + "learning_rate": 0.0001976404935845588, + "loss": 0.6886, + "step": 304 + }, + { + "epoch": 0.0976, + "grad_norm": 0.4423135171685121, + "learning_rate": 0.00019761805844025493, + "loss": 0.7452, + "step": 305 + }, + { + "epoch": 0.09792, + "grad_norm": 0.4002539317488952, + "learning_rate": 0.00019759551842432567, + "loss": 0.6484, + "step": 306 + }, + { + "epoch": 0.09824, + "grad_norm": 0.37680569680480824, + "learning_rate": 0.00019757287356098578, + "loss": 0.7572, + "step": 307 + }, + { + "epoch": 0.09856, + "grad_norm": 0.4098670439538843, + "learning_rate": 0.00019755012387456287, + "loss": 0.7531, + "step": 308 + }, + { + "epoch": 0.09888, + "grad_norm": 0.39289540752990076, + "learning_rate": 0.00019752726938949695, + "loss": 0.7133, + "step": 309 + }, + { + "epoch": 0.0992, + "grad_norm": 0.38922217354120847, + "learning_rate": 0.0001975043101303408, + "loss": 0.732, + "step": 310 + }, + { + "epoch": 0.09952, + "grad_norm": 0.3880880034800749, + "learning_rate": 0.00019748124612175964, + "loss": 0.6843, + "step": 311 + }, + { + "epoch": 0.09984, + "grad_norm": 0.3926781777740438, + "learning_rate": 0.00019745807738853129, + "loss": 0.7181, + "step": 312 + }, + { + "epoch": 0.10016, + "grad_norm": 0.3989927994943441, + "learning_rate": 0.000197434803955546, + "loss": 0.6273, + "step": 313 + }, + { + "epoch": 0.10048, + "grad_norm": 0.41350674532685083, + "learning_rate": 0.00019741142584780663, + "loss": 0.7536, + "step": 314 + }, + { + "epoch": 0.1008, + "grad_norm": 0.39078931167545056, + "learning_rate": 0.00019738794309042833, + "loss": 0.7497, + "step": 315 + }, + { + "epoch": 0.10112, + "grad_norm": 0.40601442946106014, + "learning_rate": 0.00019736435570863882, + "loss": 0.6929, + "step": 316 + }, + { + "epoch": 0.10144, + "grad_norm": 0.3675734639441937, + "learning_rate": 0.00019734066372777812, + "loss": 0.7149, + "step": 317 + }, + { + "epoch": 0.10176, + "grad_norm": 0.40099515701906197, + "learning_rate": 0.00019731686717329864, + "loss": 0.7306, + "step": 318 + }, + { + "epoch": 0.10208, + "grad_norm": 0.36265216681507756, + "learning_rate": 0.0001972929660707652, + "loss": 0.6564, + "step": 319 + }, + { + "epoch": 0.1024, + "grad_norm": 0.392931660116471, + "learning_rate": 0.00019726896044585486, + "loss": 0.7989, + "step": 320 + }, + { + "epoch": 0.10272, + "grad_norm": 0.41333241036388846, + "learning_rate": 0.000197244850324357, + "loss": 0.7486, + "step": 321 + }, + { + "epoch": 0.10304, + "grad_norm": 0.384478374445219, + "learning_rate": 0.00019722063573217327, + "loss": 0.7425, + "step": 322 + }, + { + "epoch": 0.10336, + "grad_norm": 0.38101090140120525, + "learning_rate": 0.0001971963166953175, + "loss": 0.6949, + "step": 323 + }, + { + "epoch": 0.10368, + "grad_norm": 0.37823225285318696, + "learning_rate": 0.00019717189323991584, + "loss": 0.6881, + "step": 324 + }, + { + "epoch": 0.104, + "grad_norm": 0.3655537604689273, + "learning_rate": 0.00019714736539220648, + "loss": 0.6682, + "step": 325 + }, + { + "epoch": 0.10432, + "grad_norm": 0.39197705762369495, + "learning_rate": 0.00019712273317853987, + "loss": 0.6997, + "step": 326 + }, + { + "epoch": 0.10464, + "grad_norm": 0.3922854893583484, + "learning_rate": 0.0001970979966253785, + "loss": 0.6911, + "step": 327 + }, + { + "epoch": 0.10496, + "grad_norm": 0.39687178433365566, + "learning_rate": 0.00019707315575929698, + "loss": 0.7163, + "step": 328 + }, + { + "epoch": 0.10528, + "grad_norm": 0.42409085617806075, + "learning_rate": 0.000197048210606982, + "loss": 0.7043, + "step": 329 + }, + { + "epoch": 0.1056, + "grad_norm": 0.43758902659978566, + "learning_rate": 0.00019702316119523235, + "loss": 0.7661, + "step": 330 + }, + { + "epoch": 0.10592, + "grad_norm": 0.38965988942002916, + "learning_rate": 0.00019699800755095865, + "loss": 0.6692, + "step": 331 + }, + { + "epoch": 0.10624, + "grad_norm": 0.41109767298007194, + "learning_rate": 0.00019697274970118366, + "loss": 0.705, + "step": 332 + }, + { + "epoch": 0.10656, + "grad_norm": 0.37753455435085426, + "learning_rate": 0.00019694738767304197, + "loss": 0.6923, + "step": 333 + }, + { + "epoch": 0.10688, + "grad_norm": 0.3648177509009567, + "learning_rate": 0.00019692192149378023, + "loss": 0.7262, + "step": 334 + }, + { + "epoch": 0.1072, + "grad_norm": 0.3916748545574145, + "learning_rate": 0.00019689635119075682, + "loss": 0.7081, + "step": 335 + }, + { + "epoch": 0.10752, + "grad_norm": 0.383792734246004, + "learning_rate": 0.00019687067679144212, + "loss": 0.6848, + "step": 336 + }, + { + "epoch": 0.10784, + "grad_norm": 0.3984452938273053, + "learning_rate": 0.00019684489832341826, + "loss": 0.713, + "step": 337 + }, + { + "epoch": 0.10816, + "grad_norm": 0.39431641919942684, + "learning_rate": 0.00019681901581437917, + "loss": 0.7119, + "step": 338 + }, + { + "epoch": 0.10848, + "grad_norm": 0.40844450591602066, + "learning_rate": 0.00019679302929213058, + "loss": 0.7191, + "step": 339 + }, + { + "epoch": 0.1088, + "grad_norm": 0.41398697679787083, + "learning_rate": 0.00019676693878459002, + "loss": 0.7771, + "step": 340 + }, + { + "epoch": 0.10912, + "grad_norm": 0.5955820097137865, + "learning_rate": 0.00019674074431978657, + "loss": 0.7569, + "step": 341 + }, + { + "epoch": 0.10944, + "grad_norm": 0.3919756720617864, + "learning_rate": 0.00019671444592586117, + "loss": 0.6646, + "step": 342 + }, + { + "epoch": 0.10976, + "grad_norm": 0.3703194327080098, + "learning_rate": 0.00019668804363106627, + "loss": 0.6757, + "step": 343 + }, + { + "epoch": 0.11008, + "grad_norm": 0.3765740441482955, + "learning_rate": 0.00019666153746376606, + "loss": 0.7035, + "step": 344 + }, + { + "epoch": 0.1104, + "grad_norm": 0.38879270566355456, + "learning_rate": 0.00019663492745243622, + "loss": 0.698, + "step": 345 + }, + { + "epoch": 0.11072, + "grad_norm": 0.40171601066504403, + "learning_rate": 0.00019660821362566403, + "loss": 0.7354, + "step": 346 + }, + { + "epoch": 0.11104, + "grad_norm": 0.37111334014738273, + "learning_rate": 0.00019658139601214835, + "loss": 0.7475, + "step": 347 + }, + { + "epoch": 0.11136, + "grad_norm": 0.3978166612221857, + "learning_rate": 0.00019655447464069945, + "loss": 0.7028, + "step": 348 + }, + { + "epoch": 0.11168, + "grad_norm": 0.3887547492967027, + "learning_rate": 0.00019652744954023912, + "loss": 0.6875, + "step": 349 + }, + { + "epoch": 0.112, + "grad_norm": 0.4111348797509523, + "learning_rate": 0.00019650032073980058, + "loss": 0.7415, + "step": 350 + }, + { + "epoch": 0.11232, + "grad_norm": 0.43309875952900573, + "learning_rate": 0.0001964730882685285, + "loss": 0.7014, + "step": 351 + }, + { + "epoch": 0.11264, + "grad_norm": 0.40540616161922716, + "learning_rate": 0.00019644575215567876, + "loss": 0.6965, + "step": 352 + }, + { + "epoch": 0.11296, + "grad_norm": 0.4096166178936405, + "learning_rate": 0.0001964183124306188, + "loss": 0.7619, + "step": 353 + }, + { + "epoch": 0.11328, + "grad_norm": 0.379456114577488, + "learning_rate": 0.0001963907691228272, + "loss": 0.7117, + "step": 354 + }, + { + "epoch": 0.1136, + "grad_norm": 0.4069196300965038, + "learning_rate": 0.00019636312226189399, + "loss": 0.7495, + "step": 355 + }, + { + "epoch": 0.11392, + "grad_norm": 0.37669048703806457, + "learning_rate": 0.00019633537187752022, + "loss": 0.681, + "step": 356 + }, + { + "epoch": 0.11424, + "grad_norm": 0.39215560481927914, + "learning_rate": 0.00019630751799951836, + "loss": 0.7227, + "step": 357 + }, + { + "epoch": 0.11456, + "grad_norm": 0.3712988338345372, + "learning_rate": 0.000196279560657812, + "loss": 0.6932, + "step": 358 + }, + { + "epoch": 0.11488, + "grad_norm": 0.374590012989257, + "learning_rate": 0.0001962514998824358, + "loss": 0.679, + "step": 359 + }, + { + "epoch": 0.1152, + "grad_norm": 0.367036087727824, + "learning_rate": 0.00019622333570353567, + "loss": 0.6418, + "step": 360 + }, + { + "epoch": 0.11552, + "grad_norm": 0.38005827094282785, + "learning_rate": 0.00019619506815136856, + "loss": 0.7146, + "step": 361 + }, + { + "epoch": 0.11584, + "grad_norm": 0.3934441512232598, + "learning_rate": 0.00019616669725630237, + "loss": 0.6961, + "step": 362 + }, + { + "epoch": 0.11616, + "grad_norm": 0.38764081166938164, + "learning_rate": 0.0001961382230488162, + "loss": 0.7108, + "step": 363 + }, + { + "epoch": 0.11648, + "grad_norm": 0.3843976388365349, + "learning_rate": 0.00019610964555949998, + "loss": 0.7215, + "step": 364 + }, + { + "epoch": 0.1168, + "grad_norm": 0.37067109235427065, + "learning_rate": 0.0001960809648190547, + "loss": 0.7085, + "step": 365 + }, + { + "epoch": 0.11712, + "grad_norm": 0.3770878409066295, + "learning_rate": 0.00019605218085829226, + "loss": 0.7356, + "step": 366 + }, + { + "epoch": 0.11744, + "grad_norm": 0.3890169918260607, + "learning_rate": 0.00019602329370813543, + "loss": 0.721, + "step": 367 + }, + { + "epoch": 0.11776, + "grad_norm": 0.40592105605295065, + "learning_rate": 0.00019599430339961777, + "loss": 0.7165, + "step": 368 + }, + { + "epoch": 0.11808, + "grad_norm": 0.3965771578828951, + "learning_rate": 0.0001959652099638838, + "loss": 0.7477, + "step": 369 + }, + { + "epoch": 0.1184, + "grad_norm": 0.3981457787965504, + "learning_rate": 0.00019593601343218873, + "loss": 0.6976, + "step": 370 + }, + { + "epoch": 0.11872, + "grad_norm": 0.8638849378376803, + "learning_rate": 0.00019590671383589857, + "loss": 0.6371, + "step": 371 + }, + { + "epoch": 0.11904, + "grad_norm": 0.42371243699872835, + "learning_rate": 0.00019587731120649006, + "loss": 0.732, + "step": 372 + }, + { + "epoch": 0.11936, + "grad_norm": 0.37303697851029777, + "learning_rate": 0.00019584780557555055, + "loss": 0.7099, + "step": 373 + }, + { + "epoch": 0.11968, + "grad_norm": 0.37309287181346984, + "learning_rate": 0.00019581819697477812, + "loss": 0.7065, + "step": 374 + }, + { + "epoch": 0.12, + "grad_norm": 0.37341602065644214, + "learning_rate": 0.0001957884854359815, + "loss": 0.712, + "step": 375 + }, + { + "epoch": 0.12032, + "grad_norm": 0.3938336337340652, + "learning_rate": 0.00019575867099107992, + "loss": 0.7176, + "step": 376 + }, + { + "epoch": 0.12064, + "grad_norm": 0.37294778150394053, + "learning_rate": 0.00019572875367210324, + "loss": 0.6682, + "step": 377 + }, + { + "epoch": 0.12096, + "grad_norm": 0.42590614261212495, + "learning_rate": 0.00019569873351119176, + "loss": 0.7735, + "step": 378 + }, + { + "epoch": 0.12128, + "grad_norm": 0.39030842770338153, + "learning_rate": 0.00019566861054059635, + "loss": 0.6681, + "step": 379 + }, + { + "epoch": 0.1216, + "grad_norm": 0.38551260912476676, + "learning_rate": 0.00019563838479267823, + "loss": 0.6603, + "step": 380 + }, + { + "epoch": 0.12192, + "grad_norm": 0.3724970175408012, + "learning_rate": 0.00019560805629990918, + "loss": 0.6926, + "step": 381 + }, + { + "epoch": 0.12224, + "grad_norm": 0.3642718535051343, + "learning_rate": 0.00019557762509487118, + "loss": 0.7204, + "step": 382 + }, + { + "epoch": 0.12256, + "grad_norm": 0.42099778531197657, + "learning_rate": 0.00019554709121025668, + "loss": 0.6775, + "step": 383 + }, + { + "epoch": 0.12288, + "grad_norm": 0.3748142124825435, + "learning_rate": 0.00019551645467886838, + "loss": 0.696, + "step": 384 + }, + { + "epoch": 0.1232, + "grad_norm": 0.397445605346402, + "learning_rate": 0.00019548571553361935, + "loss": 0.6832, + "step": 385 + }, + { + "epoch": 0.12352, + "grad_norm": 0.40512924448828924, + "learning_rate": 0.00019545487380753272, + "loss": 0.7361, + "step": 386 + }, + { + "epoch": 0.12384, + "grad_norm": 0.37382306080883454, + "learning_rate": 0.00019542392953374199, + "loss": 0.6811, + "step": 387 + }, + { + "epoch": 0.12416, + "grad_norm": 0.3767244954827213, + "learning_rate": 0.00019539288274549076, + "loss": 0.6763, + "step": 388 + }, + { + "epoch": 0.12448, + "grad_norm": 0.39692856518474445, + "learning_rate": 0.00019536173347613276, + "loss": 0.7161, + "step": 389 + }, + { + "epoch": 0.1248, + "grad_norm": 0.3714663251464577, + "learning_rate": 0.00019533048175913184, + "loss": 0.6755, + "step": 390 + }, + { + "epoch": 0.12512, + "grad_norm": 0.35768354200580715, + "learning_rate": 0.0001952991276280619, + "loss": 0.6706, + "step": 391 + }, + { + "epoch": 0.12544, + "grad_norm": 0.37211542233846645, + "learning_rate": 0.0001952676711166068, + "loss": 0.6699, + "step": 392 + }, + { + "epoch": 0.12576, + "grad_norm": 0.4191728240750743, + "learning_rate": 0.00019523611225856052, + "loss": 0.7522, + "step": 393 + }, + { + "epoch": 0.12608, + "grad_norm": 0.45781307337599053, + "learning_rate": 0.00019520445108782685, + "loss": 0.6705, + "step": 394 + }, + { + "epoch": 0.1264, + "grad_norm": 0.43145682281704706, + "learning_rate": 0.00019517268763841962, + "loss": 0.7601, + "step": 395 + }, + { + "epoch": 0.12672, + "grad_norm": 0.39568012091223576, + "learning_rate": 0.00019514082194446245, + "loss": 0.7347, + "step": 396 + }, + { + "epoch": 0.12704, + "grad_norm": 0.37485006940824006, + "learning_rate": 0.00019510885404018887, + "loss": 0.7333, + "step": 397 + }, + { + "epoch": 0.12736, + "grad_norm": 0.3839258549590152, + "learning_rate": 0.0001950767839599421, + "loss": 0.6951, + "step": 398 + }, + { + "epoch": 0.12768, + "grad_norm": 0.36938585152920694, + "learning_rate": 0.00019504461173817532, + "loss": 0.7345, + "step": 399 + }, + { + "epoch": 0.128, + "grad_norm": 0.3705899678112069, + "learning_rate": 0.0001950123374094512, + "loss": 0.697, + "step": 400 + }, + { + "epoch": 0.12832, + "grad_norm": 0.36234883123715833, + "learning_rate": 0.00019497996100844233, + "loss": 0.7085, + "step": 401 + }, + { + "epoch": 0.12864, + "grad_norm": 0.34592574496296163, + "learning_rate": 0.0001949474825699308, + "loss": 0.589, + "step": 402 + }, + { + "epoch": 0.12896, + "grad_norm": 0.37931984998800367, + "learning_rate": 0.00019491490212880842, + "loss": 0.7359, + "step": 403 + }, + { + "epoch": 0.12928, + "grad_norm": 0.40686094077555585, + "learning_rate": 0.00019488221972007653, + "loss": 0.7494, + "step": 404 + }, + { + "epoch": 0.1296, + "grad_norm": 0.3663956264428909, + "learning_rate": 0.000194849435378846, + "loss": 0.711, + "step": 405 + }, + { + "epoch": 0.12992, + "grad_norm": 0.3782423164452965, + "learning_rate": 0.00019481654914033723, + "loss": 0.6695, + "step": 406 + }, + { + "epoch": 0.13024, + "grad_norm": 0.38836255872524833, + "learning_rate": 0.00019478356103988013, + "loss": 0.77, + "step": 407 + }, + { + "epoch": 0.13056, + "grad_norm": 0.38528257338844074, + "learning_rate": 0.00019475047111291397, + "loss": 0.7114, + "step": 408 + }, + { + "epoch": 0.13088, + "grad_norm": 0.39360001304381337, + "learning_rate": 0.00019471727939498744, + "loss": 0.6888, + "step": 409 + }, + { + "epoch": 0.1312, + "grad_norm": 0.36464041211415976, + "learning_rate": 0.00019468398592175861, + "loss": 0.7163, + "step": 410 + }, + { + "epoch": 0.13152, + "grad_norm": 0.40454548738611545, + "learning_rate": 0.00019465059072899484, + "loss": 0.6832, + "step": 411 + }, + { + "epoch": 0.13184, + "grad_norm": 0.38499407028274757, + "learning_rate": 0.00019461709385257275, + "loss": 0.7236, + "step": 412 + }, + { + "epoch": 0.13216, + "grad_norm": 0.3816000803054317, + "learning_rate": 0.00019458349532847823, + "loss": 0.6653, + "step": 413 + }, + { + "epoch": 0.13248, + "grad_norm": 0.3733530215542171, + "learning_rate": 0.0001945497951928064, + "loss": 0.685, + "step": 414 + }, + { + "epoch": 0.1328, + "grad_norm": 0.36736445370040927, + "learning_rate": 0.00019451599348176143, + "loss": 0.6857, + "step": 415 + }, + { + "epoch": 0.13312, + "grad_norm": 0.3833224303268615, + "learning_rate": 0.00019448209023165675, + "loss": 0.6891, + "step": 416 + }, + { + "epoch": 0.13344, + "grad_norm": 0.37795485485322516, + "learning_rate": 0.0001944480854789148, + "loss": 0.6779, + "step": 417 + }, + { + "epoch": 0.13376, + "grad_norm": 0.4025319869991608, + "learning_rate": 0.00019441397926006705, + "loss": 0.7184, + "step": 418 + }, + { + "epoch": 0.13408, + "grad_norm": 0.3768627111149694, + "learning_rate": 0.00019437977161175401, + "loss": 0.6517, + "step": 419 + }, + { + "epoch": 0.1344, + "grad_norm": 0.39395613081756176, + "learning_rate": 0.00019434546257072517, + "loss": 0.7245, + "step": 420 + }, + { + "epoch": 0.13472, + "grad_norm": 0.37531513939740224, + "learning_rate": 0.0001943110521738389, + "loss": 0.6626, + "step": 421 + }, + { + "epoch": 0.13504, + "grad_norm": 0.3921708037929644, + "learning_rate": 0.0001942765404580625, + "loss": 0.7297, + "step": 422 + }, + { + "epoch": 0.13536, + "grad_norm": 0.37327316009240097, + "learning_rate": 0.00019424192746047208, + "loss": 0.7166, + "step": 423 + }, + { + "epoch": 0.13568, + "grad_norm": 0.37761914448685563, + "learning_rate": 0.0001942072132182526, + "loss": 0.7518, + "step": 424 + }, + { + "epoch": 0.136, + "grad_norm": 0.36188829160240127, + "learning_rate": 0.00019417239776869772, + "loss": 0.6891, + "step": 425 + }, + { + "epoch": 0.13632, + "grad_norm": 0.39134382392947276, + "learning_rate": 0.0001941374811492099, + "loss": 0.6923, + "step": 426 + }, + { + "epoch": 0.13664, + "grad_norm": 0.37912454753106717, + "learning_rate": 0.00019410246339730033, + "loss": 0.7185, + "step": 427 + }, + { + "epoch": 0.13696, + "grad_norm": 0.36769610925856816, + "learning_rate": 0.00019406734455058863, + "loss": 0.6443, + "step": 428 + }, + { + "epoch": 0.13728, + "grad_norm": 0.3634857691225331, + "learning_rate": 0.00019403212464680328, + "loss": 0.7168, + "step": 429 + }, + { + "epoch": 0.1376, + "grad_norm": 0.41051051790070753, + "learning_rate": 0.0001939968037237812, + "loss": 0.7195, + "step": 430 + }, + { + "epoch": 0.13792, + "grad_norm": 0.3919784059436654, + "learning_rate": 0.00019396138181946784, + "loss": 0.7431, + "step": 431 + }, + { + "epoch": 0.13824, + "grad_norm": 0.38074687066455964, + "learning_rate": 0.00019392585897191715, + "loss": 0.7228, + "step": 432 + }, + { + "epoch": 0.13856, + "grad_norm": 0.3910341618373511, + "learning_rate": 0.00019389023521929156, + "loss": 0.6545, + "step": 433 + }, + { + "epoch": 0.13888, + "grad_norm": 0.36586330213672674, + "learning_rate": 0.0001938545105998618, + "loss": 0.6702, + "step": 434 + }, + { + "epoch": 0.1392, + "grad_norm": 0.3961879535607793, + "learning_rate": 0.00019381868515200705, + "loss": 0.6999, + "step": 435 + }, + { + "epoch": 0.13952, + "grad_norm": 0.3815459570578981, + "learning_rate": 0.00019378275891421485, + "loss": 0.7007, + "step": 436 + }, + { + "epoch": 0.13984, + "grad_norm": 0.38614703987913424, + "learning_rate": 0.00019374673192508088, + "loss": 0.6883, + "step": 437 + }, + { + "epoch": 0.14016, + "grad_norm": 0.3768851582559416, + "learning_rate": 0.00019371060422330918, + "loss": 0.7067, + "step": 438 + }, + { + "epoch": 0.14048, + "grad_norm": 0.4007619168472735, + "learning_rate": 0.00019367437584771188, + "loss": 0.7185, + "step": 439 + }, + { + "epoch": 0.1408, + "grad_norm": 0.3759299339196291, + "learning_rate": 0.00019363804683720942, + "loss": 0.6766, + "step": 440 + }, + { + "epoch": 0.14112, + "grad_norm": 0.3790574386908885, + "learning_rate": 0.0001936016172308302, + "loss": 0.6759, + "step": 441 + }, + { + "epoch": 0.14144, + "grad_norm": 0.4018142631283464, + "learning_rate": 0.00019356508706771077, + "loss": 0.7008, + "step": 442 + }, + { + "epoch": 0.14176, + "grad_norm": 0.3715104629374323, + "learning_rate": 0.0001935284563870957, + "loss": 0.6847, + "step": 443 + }, + { + "epoch": 0.14208, + "grad_norm": 0.3915693736583732, + "learning_rate": 0.00019349172522833746, + "loss": 0.6917, + "step": 444 + }, + { + "epoch": 0.1424, + "grad_norm": 0.3922212702326427, + "learning_rate": 0.00019345489363089665, + "loss": 0.7113, + "step": 445 + }, + { + "epoch": 0.14272, + "grad_norm": 0.3511858399340752, + "learning_rate": 0.00019341796163434158, + "loss": 0.645, + "step": 446 + }, + { + "epoch": 0.14304, + "grad_norm": 0.3772806512240585, + "learning_rate": 0.00019338092927834855, + "loss": 0.652, + "step": 447 + }, + { + "epoch": 0.14336, + "grad_norm": 0.37316128053122677, + "learning_rate": 0.00019334379660270156, + "loss": 0.7183, + "step": 448 + }, + { + "epoch": 0.14368, + "grad_norm": 0.3706915047949919, + "learning_rate": 0.00019330656364729252, + "loss": 0.7094, + "step": 449 + }, + { + "epoch": 0.144, + "grad_norm": 0.36577702005688933, + "learning_rate": 0.00019326923045212096, + "loss": 0.6746, + "step": 450 + }, + { + "epoch": 0.14432, + "grad_norm": 0.4258226313588293, + "learning_rate": 0.0001932317970572942, + "loss": 0.7136, + "step": 451 + }, + { + "epoch": 0.14464, + "grad_norm": 0.339175107622932, + "learning_rate": 0.00019319426350302706, + "loss": 0.6854, + "step": 452 + }, + { + "epoch": 0.14496, + "grad_norm": 0.3643605592427322, + "learning_rate": 0.00019315662982964207, + "loss": 0.7035, + "step": 453 + }, + { + "epoch": 0.14528, + "grad_norm": 0.37048444487208876, + "learning_rate": 0.00019311889607756934, + "loss": 0.7052, + "step": 454 + }, + { + "epoch": 0.1456, + "grad_norm": 0.3878958949320137, + "learning_rate": 0.00019308106228734643, + "loss": 0.7187, + "step": 455 + }, + { + "epoch": 0.14592, + "grad_norm": 0.3757245798801654, + "learning_rate": 0.00019304312849961836, + "loss": 0.6823, + "step": 456 + }, + { + "epoch": 0.14624, + "grad_norm": 0.35536761510980036, + "learning_rate": 0.00019300509475513765, + "loss": 0.6889, + "step": 457 + }, + { + "epoch": 0.14656, + "grad_norm": 0.3949603453634783, + "learning_rate": 0.00019296696109476417, + "loss": 0.7278, + "step": 458 + }, + { + "epoch": 0.14688, + "grad_norm": 0.3964081864025015, + "learning_rate": 0.00019292872755946507, + "loss": 0.6649, + "step": 459 + }, + { + "epoch": 0.1472, + "grad_norm": 0.36046615405584714, + "learning_rate": 0.00019289039419031492, + "loss": 0.6674, + "step": 460 + }, + { + "epoch": 0.14752, + "grad_norm": 0.3658347620257223, + "learning_rate": 0.00019285196102849543, + "loss": 0.646, + "step": 461 + }, + { + "epoch": 0.14784, + "grad_norm": 0.37838847219314875, + "learning_rate": 0.00019281342811529556, + "loss": 0.6343, + "step": 462 + }, + { + "epoch": 0.14816, + "grad_norm": 0.39487930517644987, + "learning_rate": 0.00019277479549211144, + "loss": 0.6999, + "step": 463 + }, + { + "epoch": 0.14848, + "grad_norm": 0.3834375094430825, + "learning_rate": 0.00019273606320044628, + "loss": 0.6894, + "step": 464 + }, + { + "epoch": 0.1488, + "grad_norm": 0.3884922732476337, + "learning_rate": 0.00019269723128191048, + "loss": 0.7034, + "step": 465 + }, + { + "epoch": 0.14912, + "grad_norm": 0.47656592826429717, + "learning_rate": 0.00019265829977822133, + "loss": 0.7038, + "step": 466 + }, + { + "epoch": 0.14944, + "grad_norm": 0.368130993256656, + "learning_rate": 0.00019261926873120316, + "loss": 0.7235, + "step": 467 + }, + { + "epoch": 0.14976, + "grad_norm": 0.377347329743766, + "learning_rate": 0.00019258013818278726, + "loss": 0.7577, + "step": 468 + }, + { + "epoch": 0.15008, + "grad_norm": 0.3567250409471538, + "learning_rate": 0.0001925409081750118, + "loss": 0.6744, + "step": 469 + }, + { + "epoch": 0.1504, + "grad_norm": 0.3689876186039553, + "learning_rate": 0.00019250157875002176, + "loss": 0.6927, + "step": 470 + }, + { + "epoch": 0.15072, + "grad_norm": 0.38071445186778236, + "learning_rate": 0.000192462149950069, + "loss": 0.6704, + "step": 471 + }, + { + "epoch": 0.15104, + "grad_norm": 0.3582514404328002, + "learning_rate": 0.00019242262181751207, + "loss": 0.6678, + "step": 472 + }, + { + "epoch": 0.15136, + "grad_norm": 0.379749312867091, + "learning_rate": 0.00019238299439481633, + "loss": 0.6639, + "step": 473 + }, + { + "epoch": 0.15168, + "grad_norm": 0.36718418254493995, + "learning_rate": 0.00019234326772455364, + "loss": 0.7316, + "step": 474 + }, + { + "epoch": 0.152, + "grad_norm": 0.37382296540011056, + "learning_rate": 0.00019230344184940267, + "loss": 0.6704, + "step": 475 + }, + { + "epoch": 0.15232, + "grad_norm": 0.374160371229486, + "learning_rate": 0.00019226351681214855, + "loss": 0.6644, + "step": 476 + }, + { + "epoch": 0.15264, + "grad_norm": 0.38323432553888526, + "learning_rate": 0.00019222349265568292, + "loss": 0.6906, + "step": 477 + }, + { + "epoch": 0.15296, + "grad_norm": 0.3703821792666677, + "learning_rate": 0.000192183369423004, + "loss": 0.6516, + "step": 478 + }, + { + "epoch": 0.15328, + "grad_norm": 0.3988791062666491, + "learning_rate": 0.00019214314715721646, + "loss": 0.6673, + "step": 479 + }, + { + "epoch": 0.1536, + "grad_norm": 0.37737714726947547, + "learning_rate": 0.0001921028259015312, + "loss": 0.6981, + "step": 480 + }, + { + "epoch": 0.15392, + "grad_norm": 0.4074991460109341, + "learning_rate": 0.00019206240569926566, + "loss": 0.7292, + "step": 481 + }, + { + "epoch": 0.15424, + "grad_norm": 0.3766169697072659, + "learning_rate": 0.00019202188659384344, + "loss": 0.6762, + "step": 482 + }, + { + "epoch": 0.15456, + "grad_norm": 0.3868353365562728, + "learning_rate": 0.00019198126862879442, + "loss": 0.6593, + "step": 483 + }, + { + "epoch": 0.15488, + "grad_norm": 0.39493266875324456, + "learning_rate": 0.00019194055184775476, + "loss": 0.7303, + "step": 484 + }, + { + "epoch": 0.1552, + "grad_norm": 0.375576416682949, + "learning_rate": 0.00019189973629446668, + "loss": 0.7138, + "step": 485 + }, + { + "epoch": 0.15552, + "grad_norm": 0.36822796412355435, + "learning_rate": 0.0001918588220127786, + "loss": 0.6774, + "step": 486 + }, + { + "epoch": 0.15584, + "grad_norm": 0.37519671549304895, + "learning_rate": 0.00019181780904664497, + "loss": 0.6485, + "step": 487 + }, + { + "epoch": 0.15616, + "grad_norm": 0.3903561740445914, + "learning_rate": 0.00019177669744012616, + "loss": 0.7331, + "step": 488 + }, + { + "epoch": 0.15648, + "grad_norm": 0.39679592296013205, + "learning_rate": 0.0001917354872373887, + "loss": 0.7154, + "step": 489 + }, + { + "epoch": 0.1568, + "grad_norm": 0.3966690850434934, + "learning_rate": 0.0001916941784827049, + "loss": 0.708, + "step": 490 + }, + { + "epoch": 0.15712, + "grad_norm": 0.35572597980615833, + "learning_rate": 0.00019165277122045292, + "loss": 0.6958, + "step": 491 + }, + { + "epoch": 0.15744, + "grad_norm": 0.3684989298750791, + "learning_rate": 0.00019161126549511695, + "loss": 0.7078, + "step": 492 + }, + { + "epoch": 0.15776, + "grad_norm": 0.37494314522046396, + "learning_rate": 0.0001915696613512867, + "loss": 0.6826, + "step": 493 + }, + { + "epoch": 0.15808, + "grad_norm": 0.36800431369769, + "learning_rate": 0.00019152795883365783, + "loss": 0.7043, + "step": 494 + }, + { + "epoch": 0.1584, + "grad_norm": 0.3645216670638875, + "learning_rate": 0.00019148615798703146, + "loss": 0.6872, + "step": 495 + }, + { + "epoch": 0.15872, + "grad_norm": 0.38843250403631, + "learning_rate": 0.00019144425885631464, + "loss": 0.6895, + "step": 496 + }, + { + "epoch": 0.15904, + "grad_norm": 0.3671826558394155, + "learning_rate": 0.00019140226148651971, + "loss": 0.675, + "step": 497 + }, + { + "epoch": 0.15936, + "grad_norm": 0.38980079974319665, + "learning_rate": 0.00019136016592276477, + "loss": 0.7428, + "step": 498 + }, + { + "epoch": 0.15968, + "grad_norm": 0.3847898120536561, + "learning_rate": 0.0001913179722102732, + "loss": 0.6488, + "step": 499 + }, + { + "epoch": 0.16, + "grad_norm": 0.3815823017313607, + "learning_rate": 0.00019127568039437406, + "loss": 0.7112, + "step": 500 + }, + { + "epoch": 0.16032, + "grad_norm": 0.36283403121938096, + "learning_rate": 0.00019123329052050166, + "loss": 0.7008, + "step": 501 + }, + { + "epoch": 0.16064, + "grad_norm": 0.38139190035074727, + "learning_rate": 0.0001911908026341956, + "loss": 0.7263, + "step": 502 + }, + { + "epoch": 0.16096, + "grad_norm": 0.3852884330965596, + "learning_rate": 0.00019114821678110094, + "loss": 0.7231, + "step": 503 + }, + { + "epoch": 0.16128, + "grad_norm": 0.35468670230811244, + "learning_rate": 0.00019110553300696786, + "loss": 0.6469, + "step": 504 + }, + { + "epoch": 0.1616, + "grad_norm": 0.3916841680909505, + "learning_rate": 0.0001910627513576518, + "loss": 0.6672, + "step": 505 + }, + { + "epoch": 0.16192, + "grad_norm": 0.3923617194910343, + "learning_rate": 0.0001910198718791133, + "loss": 0.713, + "step": 506 + }, + { + "epoch": 0.16224, + "grad_norm": 0.3854529589005894, + "learning_rate": 0.00019097689461741802, + "loss": 0.7133, + "step": 507 + }, + { + "epoch": 0.16256, + "grad_norm": 0.3847808905280083, + "learning_rate": 0.00019093381961873671, + "loss": 0.693, + "step": 508 + }, + { + "epoch": 0.16288, + "grad_norm": 0.43780163447968024, + "learning_rate": 0.00019089064692934507, + "loss": 0.7216, + "step": 509 + }, + { + "epoch": 0.1632, + "grad_norm": 0.38998446331237996, + "learning_rate": 0.0001908473765956237, + "loss": 0.6999, + "step": 510 + }, + { + "epoch": 0.16352, + "grad_norm": 0.3861633732692842, + "learning_rate": 0.00019080400866405825, + "loss": 0.6912, + "step": 511 + }, + { + "epoch": 0.16384, + "grad_norm": 0.3561765068441513, + "learning_rate": 0.0001907605431812391, + "loss": 0.6302, + "step": 512 + }, + { + "epoch": 0.16416, + "grad_norm": 0.4026059651880242, + "learning_rate": 0.00019071698019386144, + "loss": 0.6915, + "step": 513 + }, + { + "epoch": 0.16448, + "grad_norm": 0.39333405385172815, + "learning_rate": 0.00019067331974872525, + "loss": 0.7218, + "step": 514 + }, + { + "epoch": 0.1648, + "grad_norm": 0.41295627909129, + "learning_rate": 0.0001906295618927352, + "loss": 0.6675, + "step": 515 + }, + { + "epoch": 0.16512, + "grad_norm": 0.3864243276726692, + "learning_rate": 0.00019058570667290051, + "loss": 0.7098, + "step": 516 + }, + { + "epoch": 0.16544, + "grad_norm": 0.4197640644035024, + "learning_rate": 0.00019054175413633524, + "loss": 0.7277, + "step": 517 + }, + { + "epoch": 0.16576, + "grad_norm": 0.38306491913821406, + "learning_rate": 0.00019049770433025772, + "loss": 0.6582, + "step": 518 + }, + { + "epoch": 0.16608, + "grad_norm": 0.37746987041373586, + "learning_rate": 0.00019045355730199097, + "loss": 0.731, + "step": 519 + }, + { + "epoch": 0.1664, + "grad_norm": 0.36792208957798095, + "learning_rate": 0.00019040931309896236, + "loss": 0.6683, + "step": 520 + }, + { + "epoch": 0.16672, + "grad_norm": 0.361020649600021, + "learning_rate": 0.0001903649717687037, + "loss": 0.7403, + "step": 521 + }, + { + "epoch": 0.16704, + "grad_norm": 0.4190848206181769, + "learning_rate": 0.00019032053335885112, + "loss": 0.6752, + "step": 522 + }, + { + "epoch": 0.16736, + "grad_norm": 0.3610270293977483, + "learning_rate": 0.00019027599791714503, + "loss": 0.6831, + "step": 523 + }, + { + "epoch": 0.16768, + "grad_norm": 0.4101385097947774, + "learning_rate": 0.00019023136549143016, + "loss": 0.7375, + "step": 524 + }, + { + "epoch": 0.168, + "grad_norm": 0.3959746087881117, + "learning_rate": 0.00019018663612965534, + "loss": 0.6642, + "step": 525 + }, + { + "epoch": 0.16832, + "grad_norm": 0.36808147170037214, + "learning_rate": 0.00019014180987987357, + "loss": 0.6343, + "step": 526 + }, + { + "epoch": 0.16864, + "grad_norm": 0.3561230421496128, + "learning_rate": 0.0001900968867902419, + "loss": 0.6437, + "step": 527 + }, + { + "epoch": 0.16896, + "grad_norm": 0.3621727179859261, + "learning_rate": 0.00019005186690902157, + "loss": 0.6547, + "step": 528 + }, + { + "epoch": 0.16928, + "grad_norm": 0.38771305701415426, + "learning_rate": 0.00019000675028457757, + "loss": 0.6949, + "step": 529 + }, + { + "epoch": 0.1696, + "grad_norm": 0.37576856789118235, + "learning_rate": 0.00018996153696537903, + "loss": 0.6691, + "step": 530 + }, + { + "epoch": 0.16992, + "grad_norm": 0.42470508485120706, + "learning_rate": 0.00018991622699999884, + "loss": 0.676, + "step": 531 + }, + { + "epoch": 0.17024, + "grad_norm": 0.403438500284725, + "learning_rate": 0.0001898708204371137, + "loss": 0.6939, + "step": 532 + }, + { + "epoch": 0.17056, + "grad_norm": 0.38949096830920715, + "learning_rate": 0.0001898253173255042, + "loss": 0.7248, + "step": 533 + }, + { + "epoch": 0.17088, + "grad_norm": 0.3930678925892035, + "learning_rate": 0.00018977971771405453, + "loss": 0.7135, + "step": 534 + }, + { + "epoch": 0.1712, + "grad_norm": 0.37157515098936317, + "learning_rate": 0.00018973402165175268, + "loss": 0.6971, + "step": 535 + }, + { + "epoch": 0.17152, + "grad_norm": 0.36635220243071737, + "learning_rate": 0.00018968822918769012, + "loss": 0.7514, + "step": 536 + }, + { + "epoch": 0.17184, + "grad_norm": 0.3891934321186216, + "learning_rate": 0.00018964234037106202, + "loss": 0.6485, + "step": 537 + }, + { + "epoch": 0.17216, + "grad_norm": 0.35785957346908515, + "learning_rate": 0.0001895963552511669, + "loss": 0.6761, + "step": 538 + }, + { + "epoch": 0.17248, + "grad_norm": 0.3766558157041046, + "learning_rate": 0.00018955027387740692, + "loss": 0.6772, + "step": 539 + }, + { + "epoch": 0.1728, + "grad_norm": 0.37474470871910476, + "learning_rate": 0.00018950409629928748, + "loss": 0.7119, + "step": 540 + }, + { + "epoch": 0.17312, + "grad_norm": 0.374195870224283, + "learning_rate": 0.00018945782256641746, + "loss": 0.6774, + "step": 541 + }, + { + "epoch": 0.17344, + "grad_norm": 0.39037509683718274, + "learning_rate": 0.00018941145272850899, + "loss": 0.712, + "step": 542 + }, + { + "epoch": 0.17376, + "grad_norm": 0.37084603389527393, + "learning_rate": 0.0001893649868353774, + "loss": 0.6646, + "step": 543 + }, + { + "epoch": 0.17408, + "grad_norm": 0.3667470241932476, + "learning_rate": 0.00018931842493694135, + "loss": 0.7298, + "step": 544 + }, + { + "epoch": 0.1744, + "grad_norm": 0.3736990472485872, + "learning_rate": 0.00018927176708322243, + "loss": 0.6659, + "step": 545 + }, + { + "epoch": 0.17472, + "grad_norm": 0.38070538068360926, + "learning_rate": 0.0001892250133243455, + "loss": 0.6895, + "step": 546 + }, + { + "epoch": 0.17504, + "grad_norm": 0.38498718865211484, + "learning_rate": 0.0001891781637105384, + "loss": 0.739, + "step": 547 + }, + { + "epoch": 0.17536, + "grad_norm": 0.3891829575900404, + "learning_rate": 0.00018913121829213186, + "loss": 0.7068, + "step": 548 + }, + { + "epoch": 0.17568, + "grad_norm": 0.38228959084028435, + "learning_rate": 0.00018908417711955972, + "loss": 0.7266, + "step": 549 + }, + { + "epoch": 0.176, + "grad_norm": 0.3686057452451368, + "learning_rate": 0.00018903704024335848, + "loss": 0.689, + "step": 550 + }, + { + "epoch": 0.17632, + "grad_norm": 0.3776215415512684, + "learning_rate": 0.00018898980771416755, + "loss": 0.7439, + "step": 551 + }, + { + "epoch": 0.17664, + "grad_norm": 0.3625188691744574, + "learning_rate": 0.00018894247958272916, + "loss": 0.6908, + "step": 552 + }, + { + "epoch": 0.17696, + "grad_norm": 0.4268217105640349, + "learning_rate": 0.00018889505589988814, + "loss": 0.6962, + "step": 553 + }, + { + "epoch": 0.17728, + "grad_norm": 0.4026366543037032, + "learning_rate": 0.000188847536716592, + "loss": 0.7042, + "step": 554 + }, + { + "epoch": 0.1776, + "grad_norm": 0.3689913377626984, + "learning_rate": 0.00018879992208389092, + "loss": 0.6568, + "step": 555 + }, + { + "epoch": 0.17792, + "grad_norm": 0.4156387132204756, + "learning_rate": 0.00018875221205293756, + "loss": 0.688, + "step": 556 + }, + { + "epoch": 0.17824, + "grad_norm": 0.36971848701890186, + "learning_rate": 0.00018870440667498702, + "loss": 0.6952, + "step": 557 + }, + { + "epoch": 0.17856, + "grad_norm": 0.3523042166866371, + "learning_rate": 0.00018865650600139694, + "loss": 0.6254, + "step": 558 + }, + { + "epoch": 0.17888, + "grad_norm": 0.3630790996315484, + "learning_rate": 0.00018860851008362724, + "loss": 0.737, + "step": 559 + }, + { + "epoch": 0.1792, + "grad_norm": 0.38745311865216464, + "learning_rate": 0.0001885604189732402, + "loss": 0.7309, + "step": 560 + }, + { + "epoch": 0.17952, + "grad_norm": 0.38121925471868995, + "learning_rate": 0.00018851223272190043, + "loss": 0.6639, + "step": 561 + }, + { + "epoch": 0.17984, + "grad_norm": 0.3853261193292461, + "learning_rate": 0.00018846395138137466, + "loss": 0.7131, + "step": 562 + }, + { + "epoch": 0.18016, + "grad_norm": 0.36237384884975193, + "learning_rate": 0.00018841557500353176, + "loss": 0.6762, + "step": 563 + }, + { + "epoch": 0.18048, + "grad_norm": 0.3858707335243159, + "learning_rate": 0.00018836710364034275, + "loss": 0.6952, + "step": 564 + }, + { + "epoch": 0.1808, + "grad_norm": 0.37621457315239776, + "learning_rate": 0.00018831853734388077, + "loss": 0.7272, + "step": 565 + }, + { + "epoch": 0.18112, + "grad_norm": 0.3842489228267907, + "learning_rate": 0.00018826987616632078, + "loss": 0.7303, + "step": 566 + }, + { + "epoch": 0.18144, + "grad_norm": 0.36767466702570084, + "learning_rate": 0.00018822112015993975, + "loss": 0.7073, + "step": 567 + }, + { + "epoch": 0.18176, + "grad_norm": 0.37858562503127563, + "learning_rate": 0.00018817226937711657, + "loss": 0.6884, + "step": 568 + }, + { + "epoch": 0.18208, + "grad_norm": 0.3745839077458561, + "learning_rate": 0.00018812332387033195, + "loss": 0.6947, + "step": 569 + }, + { + "epoch": 0.1824, + "grad_norm": 0.3727408909978038, + "learning_rate": 0.00018807428369216822, + "loss": 0.629, + "step": 570 + }, + { + "epoch": 0.18272, + "grad_norm": 0.39548208964953424, + "learning_rate": 0.00018802514889530958, + "loss": 0.7309, + "step": 571 + }, + { + "epoch": 0.18304, + "grad_norm": 0.4082997422118062, + "learning_rate": 0.0001879759195325418, + "loss": 0.6722, + "step": 572 + }, + { + "epoch": 0.18336, + "grad_norm": 0.38057927200287106, + "learning_rate": 0.0001879265956567523, + "loss": 0.7027, + "step": 573 + }, + { + "epoch": 0.18368, + "grad_norm": 0.3559895974761702, + "learning_rate": 0.0001878771773209299, + "loss": 0.6578, + "step": 574 + }, + { + "epoch": 0.184, + "grad_norm": 0.3745492614745584, + "learning_rate": 0.00018782766457816504, + "loss": 0.7065, + "step": 575 + }, + { + "epoch": 0.18432, + "grad_norm": 0.38370921252873413, + "learning_rate": 0.00018777805748164964, + "loss": 0.7137, + "step": 576 + }, + { + "epoch": 0.18464, + "grad_norm": 0.394791073280638, + "learning_rate": 0.0001877283560846767, + "loss": 0.6762, + "step": 577 + }, + { + "epoch": 0.18496, + "grad_norm": 0.3761152634292773, + "learning_rate": 0.00018767856044064085, + "loss": 0.7014, + "step": 578 + }, + { + "epoch": 0.18528, + "grad_norm": 0.36935472973616645, + "learning_rate": 0.00018762867060303774, + "loss": 0.6656, + "step": 579 + }, + { + "epoch": 0.1856, + "grad_norm": 0.44574612059928553, + "learning_rate": 0.00018757868662546437, + "loss": 0.646, + "step": 580 + }, + { + "epoch": 0.18592, + "grad_norm": 0.3664767911747923, + "learning_rate": 0.00018752860856161875, + "loss": 0.7, + "step": 581 + }, + { + "epoch": 0.18624, + "grad_norm": 0.37030213814865126, + "learning_rate": 0.00018747843646530006, + "loss": 0.7116, + "step": 582 + }, + { + "epoch": 0.18656, + "grad_norm": 0.3845202168507924, + "learning_rate": 0.00018742817039040844, + "loss": 0.7011, + "step": 583 + }, + { + "epoch": 0.18688, + "grad_norm": 0.37705149960866635, + "learning_rate": 0.00018737781039094502, + "loss": 0.6824, + "step": 584 + }, + { + "epoch": 0.1872, + "grad_norm": 0.37495802093433617, + "learning_rate": 0.00018732735652101184, + "loss": 0.6756, + "step": 585 + }, + { + "epoch": 0.18752, + "grad_norm": 0.3491924235481701, + "learning_rate": 0.0001872768088348118, + "loss": 0.7014, + "step": 586 + }, + { + "epoch": 0.18784, + "grad_norm": 0.3536450319376141, + "learning_rate": 0.00018722616738664851, + "loss": 0.6333, + "step": 587 + }, + { + "epoch": 0.18816, + "grad_norm": 0.3763522998163525, + "learning_rate": 0.00018717543223092638, + "loss": 0.7015, + "step": 588 + }, + { + "epoch": 0.18848, + "grad_norm": 0.4240282292057168, + "learning_rate": 0.00018712460342215046, + "loss": 0.6875, + "step": 589 + }, + { + "epoch": 0.1888, + "grad_norm": 0.3976781443918028, + "learning_rate": 0.00018707368101492645, + "loss": 0.692, + "step": 590 + }, + { + "epoch": 0.18912, + "grad_norm": 0.36554055739748414, + "learning_rate": 0.00018702266506396057, + "loss": 0.7052, + "step": 591 + }, + { + "epoch": 0.18944, + "grad_norm": 0.4053088186534514, + "learning_rate": 0.00018697155562405953, + "loss": 0.7275, + "step": 592 + }, + { + "epoch": 0.18976, + "grad_norm": 0.34807908938544707, + "learning_rate": 0.00018692035275013046, + "loss": 0.6079, + "step": 593 + }, + { + "epoch": 0.19008, + "grad_norm": 0.36271006865430816, + "learning_rate": 0.00018686905649718095, + "loss": 0.6911, + "step": 594 + }, + { + "epoch": 0.1904, + "grad_norm": 0.3852183272838404, + "learning_rate": 0.0001868176669203188, + "loss": 0.7336, + "step": 595 + }, + { + "epoch": 0.19072, + "grad_norm": 0.36141733539153165, + "learning_rate": 0.00018676618407475218, + "loss": 0.6923, + "step": 596 + }, + { + "epoch": 0.19104, + "grad_norm": 0.37200882287899356, + "learning_rate": 0.00018671460801578932, + "loss": 0.6247, + "step": 597 + }, + { + "epoch": 0.19136, + "grad_norm": 0.42640403770624996, + "learning_rate": 0.00018666293879883875, + "loss": 0.7021, + "step": 598 + }, + { + "epoch": 0.19168, + "grad_norm": 0.3767707196253604, + "learning_rate": 0.00018661117647940896, + "loss": 0.6896, + "step": 599 + }, + { + "epoch": 0.192, + "grad_norm": 0.44811369749897056, + "learning_rate": 0.00018655932111310848, + "loss": 0.7261, + "step": 600 + }, + { + "epoch": 0.19232, + "grad_norm": 0.3849516109506925, + "learning_rate": 0.00018650737275564583, + "loss": 0.7014, + "step": 601 + }, + { + "epoch": 0.19264, + "grad_norm": 0.37317183147710203, + "learning_rate": 0.00018645533146282946, + "loss": 0.6821, + "step": 602 + }, + { + "epoch": 0.19296, + "grad_norm": 0.37610310276188946, + "learning_rate": 0.00018640319729056753, + "loss": 0.6341, + "step": 603 + }, + { + "epoch": 0.19328, + "grad_norm": 0.3682102458628179, + "learning_rate": 0.0001863509702948682, + "loss": 0.666, + "step": 604 + }, + { + "epoch": 0.1936, + "grad_norm": 0.3860499748534257, + "learning_rate": 0.00018629865053183911, + "loss": 0.6757, + "step": 605 + }, + { + "epoch": 0.19392, + "grad_norm": 0.3856807367418405, + "learning_rate": 0.00018624623805768776, + "loss": 0.7219, + "step": 606 + }, + { + "epoch": 0.19424, + "grad_norm": 0.38209384833687055, + "learning_rate": 0.00018619373292872108, + "loss": 0.7063, + "step": 607 + }, + { + "epoch": 0.19456, + "grad_norm": 0.35863991143047497, + "learning_rate": 0.00018614113520134566, + "loss": 0.6827, + "step": 608 + }, + { + "epoch": 0.19488, + "grad_norm": 0.3823933385249185, + "learning_rate": 0.0001860884449320676, + "loss": 0.7287, + "step": 609 + }, + { + "epoch": 0.1952, + "grad_norm": 0.3581417414576101, + "learning_rate": 0.00018603566217749223, + "loss": 0.7085, + "step": 610 + }, + { + "epoch": 0.19552, + "grad_norm": 0.377786033844024, + "learning_rate": 0.00018598278699432443, + "loss": 0.7461, + "step": 611 + }, + { + "epoch": 0.19584, + "grad_norm": 0.3927384796181169, + "learning_rate": 0.0001859298194393683, + "loss": 0.7145, + "step": 612 + }, + { + "epoch": 0.19616, + "grad_norm": 0.38126302858738637, + "learning_rate": 0.00018587675956952717, + "loss": 0.7056, + "step": 613 + }, + { + "epoch": 0.19648, + "grad_norm": 0.4017051851174728, + "learning_rate": 0.00018582360744180356, + "loss": 0.7194, + "step": 614 + }, + { + "epoch": 0.1968, + "grad_norm": 0.3832804262944955, + "learning_rate": 0.0001857703631132991, + "loss": 0.7236, + "step": 615 + }, + { + "epoch": 0.19712, + "grad_norm": 0.3835507525150527, + "learning_rate": 0.00018571702664121445, + "loss": 0.7127, + "step": 616 + }, + { + "epoch": 0.19744, + "grad_norm": 0.3717950930513399, + "learning_rate": 0.0001856635980828493, + "loss": 0.6778, + "step": 617 + }, + { + "epoch": 0.19776, + "grad_norm": 0.3892396454500002, + "learning_rate": 0.00018561007749560223, + "loss": 0.6843, + "step": 618 + }, + { + "epoch": 0.19808, + "grad_norm": 0.3884263240541612, + "learning_rate": 0.00018555646493697073, + "loss": 0.6794, + "step": 619 + }, + { + "epoch": 0.1984, + "grad_norm": 0.36032717315297097, + "learning_rate": 0.00018550276046455107, + "loss": 0.6353, + "step": 620 + }, + { + "epoch": 0.19872, + "grad_norm": 0.3717455125989437, + "learning_rate": 0.00018544896413603824, + "loss": 0.6894, + "step": 621 + }, + { + "epoch": 0.19904, + "grad_norm": 0.4029382846007775, + "learning_rate": 0.00018539507600922597, + "loss": 0.6444, + "step": 622 + }, + { + "epoch": 0.19936, + "grad_norm": 0.35177763402127044, + "learning_rate": 0.00018534109614200652, + "loss": 0.6194, + "step": 623 + }, + { + "epoch": 0.19968, + "grad_norm": 0.37288226756772497, + "learning_rate": 0.00018528702459237083, + "loss": 0.6918, + "step": 624 + }, + { + "epoch": 0.2, + "grad_norm": 0.3649060754242833, + "learning_rate": 0.0001852328614184082, + "loss": 0.649, + "step": 625 + }, + { + "epoch": 0.20032, + "grad_norm": 0.36634977948284103, + "learning_rate": 0.00018517860667830648, + "loss": 0.7065, + "step": 626 + }, + { + "epoch": 0.20064, + "grad_norm": 0.37908346594012443, + "learning_rate": 0.00018512426043035184, + "loss": 0.7237, + "step": 627 + }, + { + "epoch": 0.20096, + "grad_norm": 0.34925470856243007, + "learning_rate": 0.00018506982273292874, + "loss": 0.7002, + "step": 628 + }, + { + "epoch": 0.20128, + "grad_norm": 0.34970529649550897, + "learning_rate": 0.00018501529364451993, + "loss": 0.6736, + "step": 629 + }, + { + "epoch": 0.2016, + "grad_norm": 0.3554469104415349, + "learning_rate": 0.00018496067322370627, + "loss": 0.6963, + "step": 630 + }, + { + "epoch": 0.20192, + "grad_norm": 0.3551417578484709, + "learning_rate": 0.0001849059615291668, + "loss": 0.629, + "step": 631 + }, + { + "epoch": 0.20224, + "grad_norm": 0.3618812513414055, + "learning_rate": 0.0001848511586196786, + "loss": 0.6343, + "step": 632 + }, + { + "epoch": 0.20256, + "grad_norm": 0.3691591254135277, + "learning_rate": 0.00018479626455411677, + "loss": 0.6497, + "step": 633 + }, + { + "epoch": 0.20288, + "grad_norm": 0.3791082050849236, + "learning_rate": 0.00018474127939145424, + "loss": 0.7174, + "step": 634 + }, + { + "epoch": 0.2032, + "grad_norm": 0.3826804218084318, + "learning_rate": 0.00018468620319076197, + "loss": 0.6696, + "step": 635 + }, + { + "epoch": 0.20352, + "grad_norm": 0.38073044965980335, + "learning_rate": 0.00018463103601120857, + "loss": 0.6615, + "step": 636 + }, + { + "epoch": 0.20384, + "grad_norm": 0.35355843658521047, + "learning_rate": 0.00018457577791206048, + "loss": 0.6882, + "step": 637 + }, + { + "epoch": 0.20416, + "grad_norm": 0.36121294050894603, + "learning_rate": 0.0001845204289526817, + "loss": 0.6584, + "step": 638 + }, + { + "epoch": 0.20448, + "grad_norm": 0.3349490954718071, + "learning_rate": 0.00018446498919253408, + "loss": 0.64, + "step": 639 + }, + { + "epoch": 0.2048, + "grad_norm": 0.37603377394994797, + "learning_rate": 0.00018440945869117675, + "loss": 0.7433, + "step": 640 + }, + { + "epoch": 0.20512, + "grad_norm": 0.3527726682879851, + "learning_rate": 0.00018435383750826643, + "loss": 0.6961, + "step": 641 + }, + { + "epoch": 0.20544, + "grad_norm": 0.45068127304993216, + "learning_rate": 0.00018429812570355732, + "loss": 0.642, + "step": 642 + }, + { + "epoch": 0.20576, + "grad_norm": 0.38538800083322367, + "learning_rate": 0.00018424232333690094, + "loss": 0.6647, + "step": 643 + }, + { + "epoch": 0.20608, + "grad_norm": 0.36591788596065095, + "learning_rate": 0.00018418643046824604, + "loss": 0.712, + "step": 644 + }, + { + "epoch": 0.2064, + "grad_norm": 0.3707034695442753, + "learning_rate": 0.00018413044715763862, + "loss": 0.7094, + "step": 645 + }, + { + "epoch": 0.20672, + "grad_norm": 0.4120339445920212, + "learning_rate": 0.00018407437346522194, + "loss": 0.7112, + "step": 646 + }, + { + "epoch": 0.20704, + "grad_norm": 0.3790663390655863, + "learning_rate": 0.0001840182094512362, + "loss": 0.7387, + "step": 647 + }, + { + "epoch": 0.20736, + "grad_norm": 0.3467089104227242, + "learning_rate": 0.00018396195517601875, + "loss": 0.6439, + "step": 648 + }, + { + "epoch": 0.20768, + "grad_norm": 0.3625763564568996, + "learning_rate": 0.00018390561070000388, + "loss": 0.6602, + "step": 649 + }, + { + "epoch": 0.208, + "grad_norm": 0.37642659346842217, + "learning_rate": 0.00018384917608372278, + "loss": 0.696, + "step": 650 + }, + { + "epoch": 0.20832, + "grad_norm": 0.37929555899812895, + "learning_rate": 0.00018379265138780343, + "loss": 0.7112, + "step": 651 + }, + { + "epoch": 0.20864, + "grad_norm": 0.374471560265218, + "learning_rate": 0.00018373603667297067, + "loss": 0.6724, + "step": 652 + }, + { + "epoch": 0.20896, + "grad_norm": 0.38324649849294595, + "learning_rate": 0.000183679332000046, + "loss": 0.6798, + "step": 653 + }, + { + "epoch": 0.20928, + "grad_norm": 0.37878220851681876, + "learning_rate": 0.00018362253742994756, + "loss": 0.6337, + "step": 654 + }, + { + "epoch": 0.2096, + "grad_norm": 0.392281924972558, + "learning_rate": 0.0001835656530236901, + "loss": 0.7025, + "step": 655 + }, + { + "epoch": 0.20992, + "grad_norm": 0.3602190368976346, + "learning_rate": 0.00018350867884238476, + "loss": 0.6626, + "step": 656 + }, + { + "epoch": 0.21024, + "grad_norm": 0.37026535929650906, + "learning_rate": 0.00018345161494723935, + "loss": 0.6639, + "step": 657 + }, + { + "epoch": 0.21056, + "grad_norm": 0.36671926006499656, + "learning_rate": 0.00018339446139955783, + "loss": 0.657, + "step": 658 + }, + { + "epoch": 0.21088, + "grad_norm": 0.3575424799140936, + "learning_rate": 0.00018333721826074064, + "loss": 0.6745, + "step": 659 + }, + { + "epoch": 0.2112, + "grad_norm": 0.37607042388640194, + "learning_rate": 0.00018327988559228438, + "loss": 0.6968, + "step": 660 + }, + { + "epoch": 0.21152, + "grad_norm": 0.3678610855760465, + "learning_rate": 0.0001832224634557818, + "loss": 0.6675, + "step": 661 + }, + { + "epoch": 0.21184, + "grad_norm": 0.3868273786573624, + "learning_rate": 0.00018316495191292195, + "loss": 0.6945, + "step": 662 + }, + { + "epoch": 0.21216, + "grad_norm": 0.3775942900351484, + "learning_rate": 0.00018310735102548972, + "loss": 0.6907, + "step": 663 + }, + { + "epoch": 0.21248, + "grad_norm": 0.3593499650917196, + "learning_rate": 0.00018304966085536602, + "loss": 0.6227, + "step": 664 + }, + { + "epoch": 0.2128, + "grad_norm": 0.43533651316148203, + "learning_rate": 0.0001829918814645278, + "loss": 0.6884, + "step": 665 + }, + { + "epoch": 0.21312, + "grad_norm": 1.39738823936067, + "learning_rate": 0.0001829340129150478, + "loss": 0.6519, + "step": 666 + }, + { + "epoch": 0.21344, + "grad_norm": 0.3893288733656194, + "learning_rate": 0.00018287605526909445, + "loss": 0.6419, + "step": 667 + }, + { + "epoch": 0.21376, + "grad_norm": 0.35844663505549856, + "learning_rate": 0.00018281800858893204, + "loss": 0.671, + "step": 668 + }, + { + "epoch": 0.21408, + "grad_norm": 0.34876632435901844, + "learning_rate": 0.00018275987293692034, + "loss": 0.6805, + "step": 669 + }, + { + "epoch": 0.2144, + "grad_norm": 0.3714197501969998, + "learning_rate": 0.00018270164837551494, + "loss": 0.6589, + "step": 670 + }, + { + "epoch": 0.21472, + "grad_norm": 0.3730668762325715, + "learning_rate": 0.0001826433349672667, + "loss": 0.6848, + "step": 671 + }, + { + "epoch": 0.21504, + "grad_norm": 0.38536987660894045, + "learning_rate": 0.00018258493277482213, + "loss": 0.6877, + "step": 672 + }, + { + "epoch": 0.21536, + "grad_norm": 0.4048527650106258, + "learning_rate": 0.00018252644186092298, + "loss": 0.7041, + "step": 673 + }, + { + "epoch": 0.21568, + "grad_norm": 0.3757092389015418, + "learning_rate": 0.00018246786228840635, + "loss": 0.7229, + "step": 674 + }, + { + "epoch": 0.216, + "grad_norm": 0.39499355090932425, + "learning_rate": 0.00018240919412020466, + "loss": 0.6775, + "step": 675 + }, + { + "epoch": 0.21632, + "grad_norm": 0.3703427030667556, + "learning_rate": 0.0001823504374193454, + "loss": 0.6511, + "step": 676 + }, + { + "epoch": 0.21664, + "grad_norm": 0.38373506154959813, + "learning_rate": 0.00018229159224895122, + "loss": 0.6533, + "step": 677 + }, + { + "epoch": 0.21696, + "grad_norm": 0.3696082782693057, + "learning_rate": 0.00018223265867223985, + "loss": 0.7134, + "step": 678 + }, + { + "epoch": 0.21728, + "grad_norm": 0.3797972880923753, + "learning_rate": 0.00018217363675252396, + "loss": 0.6932, + "step": 679 + }, + { + "epoch": 0.2176, + "grad_norm": 0.37870990288539186, + "learning_rate": 0.00018211452655321112, + "loss": 0.6726, + "step": 680 + }, + { + "epoch": 0.21792, + "grad_norm": 0.361952971987553, + "learning_rate": 0.0001820553281378037, + "loss": 0.6157, + "step": 681 + }, + { + "epoch": 0.21824, + "grad_norm": 0.3972171846290433, + "learning_rate": 0.00018199604156989897, + "loss": 0.6592, + "step": 682 + }, + { + "epoch": 0.21856, + "grad_norm": 0.36658247890412493, + "learning_rate": 0.00018193666691318874, + "loss": 0.67, + "step": 683 + }, + { + "epoch": 0.21888, + "grad_norm": 0.4110491988798091, + "learning_rate": 0.0001818772042314596, + "loss": 0.699, + "step": 684 + }, + { + "epoch": 0.2192, + "grad_norm": 0.3611727041903771, + "learning_rate": 0.00018181765358859261, + "loss": 0.7139, + "step": 685 + }, + { + "epoch": 0.21952, + "grad_norm": 0.386263582558337, + "learning_rate": 0.00018175801504856335, + "loss": 0.6923, + "step": 686 + }, + { + "epoch": 0.21984, + "grad_norm": 0.38675112190263383, + "learning_rate": 0.00018169828867544186, + "loss": 0.6989, + "step": 687 + }, + { + "epoch": 0.22016, + "grad_norm": 0.3674138936551115, + "learning_rate": 0.0001816384745333925, + "loss": 0.6743, + "step": 688 + }, + { + "epoch": 0.22048, + "grad_norm": 0.4213474729444471, + "learning_rate": 0.00018157857268667396, + "loss": 0.7251, + "step": 689 + }, + { + "epoch": 0.2208, + "grad_norm": 0.39289033366716836, + "learning_rate": 0.00018151858319963914, + "loss": 0.7161, + "step": 690 + }, + { + "epoch": 0.22112, + "grad_norm": 0.4010299377303811, + "learning_rate": 0.00018145850613673502, + "loss": 0.6701, + "step": 691 + }, + { + "epoch": 0.22144, + "grad_norm": 0.3978792715503202, + "learning_rate": 0.00018139834156250277, + "loss": 0.7011, + "step": 692 + }, + { + "epoch": 0.22176, + "grad_norm": 0.35428077731562635, + "learning_rate": 0.00018133808954157749, + "loss": 0.6718, + "step": 693 + }, + { + "epoch": 0.22208, + "grad_norm": 0.3692508625538983, + "learning_rate": 0.00018127775013868834, + "loss": 0.6909, + "step": 694 + }, + { + "epoch": 0.2224, + "grad_norm": 0.36310482844960207, + "learning_rate": 0.00018121732341865818, + "loss": 0.6494, + "step": 695 + }, + { + "epoch": 0.22272, + "grad_norm": 0.35451425732355163, + "learning_rate": 0.00018115680944640384, + "loss": 0.5966, + "step": 696 + }, + { + "epoch": 0.22304, + "grad_norm": 0.3759293895978303, + "learning_rate": 0.0001810962082869358, + "loss": 0.6973, + "step": 697 + }, + { + "epoch": 0.22336, + "grad_norm": 0.3968830045056176, + "learning_rate": 0.00018103552000535818, + "loss": 0.6588, + "step": 698 + }, + { + "epoch": 0.22368, + "grad_norm": 0.37493430010283585, + "learning_rate": 0.00018097474466686884, + "loss": 0.7354, + "step": 699 + }, + { + "epoch": 0.224, + "grad_norm": 0.4162348933142053, + "learning_rate": 0.00018091388233675896, + "loss": 0.6686, + "step": 700 + }, + { + "epoch": 0.22432, + "grad_norm": 0.5505324084612641, + "learning_rate": 0.00018085293308041335, + "loss": 0.6886, + "step": 701 + }, + { + "epoch": 0.22464, + "grad_norm": 0.4116182971170426, + "learning_rate": 0.00018079189696331013, + "loss": 0.7314, + "step": 702 + }, + { + "epoch": 0.22496, + "grad_norm": 0.4153567985231021, + "learning_rate": 0.00018073077405102072, + "loss": 0.6491, + "step": 703 + }, + { + "epoch": 0.22528, + "grad_norm": 0.3764911933775025, + "learning_rate": 0.0001806695644092098, + "loss": 0.6686, + "step": 704 + }, + { + "epoch": 0.2256, + "grad_norm": 0.3876964994224437, + "learning_rate": 0.00018060826810363523, + "loss": 0.7036, + "step": 705 + }, + { + "epoch": 0.22592, + "grad_norm": 0.35928093842692577, + "learning_rate": 0.000180546885200148, + "loss": 0.7131, + "step": 706 + }, + { + "epoch": 0.22624, + "grad_norm": 0.3708341293662108, + "learning_rate": 0.0001804854157646921, + "loss": 0.6764, + "step": 707 + }, + { + "epoch": 0.22656, + "grad_norm": 0.3641271955234546, + "learning_rate": 0.00018042385986330448, + "loss": 0.7012, + "step": 708 + }, + { + "epoch": 0.22688, + "grad_norm": 0.351875239335879, + "learning_rate": 0.000180362217562115, + "loss": 0.6925, + "step": 709 + }, + { + "epoch": 0.2272, + "grad_norm": 0.374666539626361, + "learning_rate": 0.0001803004889273463, + "loss": 0.6703, + "step": 710 + }, + { + "epoch": 0.22752, + "grad_norm": 0.36141450149841525, + "learning_rate": 0.0001802386740253138, + "loss": 0.6586, + "step": 711 + }, + { + "epoch": 0.22784, + "grad_norm": 0.3879825984734728, + "learning_rate": 0.00018017677292242562, + "loss": 0.6951, + "step": 712 + }, + { + "epoch": 0.22816, + "grad_norm": 0.3880697443654585, + "learning_rate": 0.00018011478568518246, + "loss": 0.6729, + "step": 713 + }, + { + "epoch": 0.22848, + "grad_norm": 0.36702448036834806, + "learning_rate": 0.00018005271238017754, + "loss": 0.6813, + "step": 714 + }, + { + "epoch": 0.2288, + "grad_norm": 0.3654198048597913, + "learning_rate": 0.00017999055307409657, + "loss": 0.6694, + "step": 715 + }, + { + "epoch": 0.22912, + "grad_norm": 0.3544780790270675, + "learning_rate": 0.00017992830783371763, + "loss": 0.681, + "step": 716 + }, + { + "epoch": 0.22944, + "grad_norm": 0.36471597622987206, + "learning_rate": 0.00017986597672591111, + "loss": 0.6634, + "step": 717 + }, + { + "epoch": 0.22976, + "grad_norm": 0.3746561832452932, + "learning_rate": 0.00017980355981763973, + "loss": 0.7001, + "step": 718 + }, + { + "epoch": 0.23008, + "grad_norm": 0.3789342838340834, + "learning_rate": 0.00017974105717595825, + "loss": 0.7074, + "step": 719 + }, + { + "epoch": 0.2304, + "grad_norm": 0.3802972783120846, + "learning_rate": 0.00017967846886801365, + "loss": 0.7069, + "step": 720 + }, + { + "epoch": 0.23072, + "grad_norm": 0.36526081968866997, + "learning_rate": 0.00017961579496104488, + "loss": 0.6373, + "step": 721 + }, + { + "epoch": 0.23104, + "grad_norm": 0.3672669737592333, + "learning_rate": 0.0001795530355223829, + "loss": 0.632, + "step": 722 + }, + { + "epoch": 0.23136, + "grad_norm": 0.3678652252568809, + "learning_rate": 0.00017949019061945046, + "loss": 0.6478, + "step": 723 + }, + { + "epoch": 0.23168, + "grad_norm": 0.39822846891907343, + "learning_rate": 0.0001794272603197623, + "loss": 0.7088, + "step": 724 + }, + { + "epoch": 0.232, + "grad_norm": 0.40129414401337665, + "learning_rate": 0.00017936424469092467, + "loss": 0.6848, + "step": 725 + }, + { + "epoch": 0.23232, + "grad_norm": 0.4058611280373537, + "learning_rate": 0.00017930114380063566, + "loss": 0.7179, + "step": 726 + }, + { + "epoch": 0.23264, + "grad_norm": 0.3657740769226459, + "learning_rate": 0.00017923795771668493, + "loss": 0.6549, + "step": 727 + }, + { + "epoch": 0.23296, + "grad_norm": 0.385324554704592, + "learning_rate": 0.00017917468650695365, + "loss": 0.6989, + "step": 728 + }, + { + "epoch": 0.23328, + "grad_norm": 0.3931880693189958, + "learning_rate": 0.00017911133023941443, + "loss": 0.6615, + "step": 729 + }, + { + "epoch": 0.2336, + "grad_norm": 0.3738320684033318, + "learning_rate": 0.0001790478889821312, + "loss": 0.6984, + "step": 730 + }, + { + "epoch": 0.23392, + "grad_norm": 0.37472414578951646, + "learning_rate": 0.0001789843628032593, + "loss": 0.7068, + "step": 731 + }, + { + "epoch": 0.23424, + "grad_norm": 0.3720541748071402, + "learning_rate": 0.0001789207517710453, + "loss": 0.6493, + "step": 732 + }, + { + "epoch": 0.23456, + "grad_norm": 0.49658220251926044, + "learning_rate": 0.00017885705595382682, + "loss": 0.6343, + "step": 733 + }, + { + "epoch": 0.23488, + "grad_norm": 0.4092338956235856, + "learning_rate": 0.00017879327542003265, + "loss": 0.7418, + "step": 734 + }, + { + "epoch": 0.2352, + "grad_norm": 0.38867382782007576, + "learning_rate": 0.0001787294102381826, + "loss": 0.6749, + "step": 735 + }, + { + "epoch": 0.23552, + "grad_norm": 0.3829738650156427, + "learning_rate": 0.00017866546047688736, + "loss": 0.6503, + "step": 736 + }, + { + "epoch": 0.23584, + "grad_norm": 0.3457762290921722, + "learning_rate": 0.0001786014262048486, + "loss": 0.6054, + "step": 737 + }, + { + "epoch": 0.23616, + "grad_norm": 0.3950841371672311, + "learning_rate": 0.00017853730749085856, + "loss": 0.6663, + "step": 738 + }, + { + "epoch": 0.23648, + "grad_norm": 0.3921900649317959, + "learning_rate": 0.0001784731044038004, + "loss": 0.7087, + "step": 739 + }, + { + "epoch": 0.2368, + "grad_norm": 0.37203824026401816, + "learning_rate": 0.0001784088170126479, + "loss": 0.6614, + "step": 740 + }, + { + "epoch": 0.23712, + "grad_norm": 0.4169831346176795, + "learning_rate": 0.00017834444538646527, + "loss": 0.6891, + "step": 741 + }, + { + "epoch": 0.23744, + "grad_norm": 0.3430654181829531, + "learning_rate": 0.00017827998959440736, + "loss": 0.6283, + "step": 742 + }, + { + "epoch": 0.23776, + "grad_norm": 0.3723546203784155, + "learning_rate": 0.0001782154497057194, + "loss": 0.6607, + "step": 743 + }, + { + "epoch": 0.23808, + "grad_norm": 0.38409773935642044, + "learning_rate": 0.00017815082578973693, + "loss": 0.6974, + "step": 744 + }, + { + "epoch": 0.2384, + "grad_norm": 0.3529225958218236, + "learning_rate": 0.00017808611791588584, + "loss": 0.6461, + "step": 745 + }, + { + "epoch": 0.23872, + "grad_norm": 0.38105695662774514, + "learning_rate": 0.00017802132615368205, + "loss": 0.713, + "step": 746 + }, + { + "epoch": 0.23904, + "grad_norm": 0.3926402910525965, + "learning_rate": 0.00017795645057273177, + "loss": 0.689, + "step": 747 + }, + { + "epoch": 0.23936, + "grad_norm": 0.3790104979677121, + "learning_rate": 0.00017789149124273123, + "loss": 0.6834, + "step": 748 + }, + { + "epoch": 0.23968, + "grad_norm": 0.4045459184686704, + "learning_rate": 0.00017782644823346658, + "loss": 0.6446, + "step": 749 + }, + { + "epoch": 0.24, + "grad_norm": 0.40319121564248955, + "learning_rate": 0.00017776132161481385, + "loss": 0.6753, + "step": 750 + }, + { + "epoch": 0.24032, + "grad_norm": 0.3634116972046028, + "learning_rate": 0.000177696111456739, + "loss": 0.6109, + "step": 751 + }, + { + "epoch": 0.24064, + "grad_norm": 0.3781068354581363, + "learning_rate": 0.00017763081782929757, + "loss": 0.665, + "step": 752 + }, + { + "epoch": 0.24096, + "grad_norm": 0.39310915820479103, + "learning_rate": 0.00017756544080263495, + "loss": 0.6554, + "step": 753 + }, + { + "epoch": 0.24128, + "grad_norm": 0.3761835536526668, + "learning_rate": 0.00017749998044698607, + "loss": 0.6648, + "step": 754 + }, + { + "epoch": 0.2416, + "grad_norm": 0.3748500098790218, + "learning_rate": 0.00017743443683267525, + "loss": 0.7238, + "step": 755 + }, + { + "epoch": 0.24192, + "grad_norm": 0.3506351603982856, + "learning_rate": 0.00017736881003011643, + "loss": 0.6503, + "step": 756 + }, + { + "epoch": 0.24224, + "grad_norm": 0.3604576790932431, + "learning_rate": 0.00017730310010981285, + "loss": 0.6839, + "step": 757 + }, + { + "epoch": 0.24256, + "grad_norm": 0.3789260711437009, + "learning_rate": 0.00017723730714235705, + "loss": 0.6355, + "step": 758 + }, + { + "epoch": 0.24288, + "grad_norm": 0.36539702949886815, + "learning_rate": 0.00017717143119843075, + "loss": 0.6508, + "step": 759 + }, + { + "epoch": 0.2432, + "grad_norm": 0.3897347490416631, + "learning_rate": 0.00017710547234880486, + "loss": 0.655, + "step": 760 + }, + { + "epoch": 0.24352, + "grad_norm": 0.38799776162309213, + "learning_rate": 0.00017703943066433935, + "loss": 0.6799, + "step": 761 + }, + { + "epoch": 0.24384, + "grad_norm": 0.34789732571574994, + "learning_rate": 0.00017697330621598313, + "loss": 0.5818, + "step": 762 + }, + { + "epoch": 0.24416, + "grad_norm": 0.3681128745265903, + "learning_rate": 0.00017690709907477412, + "loss": 0.6985, + "step": 763 + }, + { + "epoch": 0.24448, + "grad_norm": 0.36664982899708504, + "learning_rate": 0.000176840809311839, + "loss": 0.6823, + "step": 764 + }, + { + "epoch": 0.2448, + "grad_norm": 0.369593144412236, + "learning_rate": 0.0001767744369983932, + "loss": 0.7022, + "step": 765 + }, + { + "epoch": 0.24512, + "grad_norm": 0.3696850026902434, + "learning_rate": 0.0001767079822057409, + "loss": 0.7249, + "step": 766 + }, + { + "epoch": 0.24544, + "grad_norm": 0.36928768768576387, + "learning_rate": 0.0001766414450052749, + "loss": 0.707, + "step": 767 + }, + { + "epoch": 0.24576, + "grad_norm": 0.3608620469117216, + "learning_rate": 0.0001765748254684764, + "loss": 0.6627, + "step": 768 + }, + { + "epoch": 0.24608, + "grad_norm": 0.35888561707044647, + "learning_rate": 0.0001765081236669152, + "loss": 0.6559, + "step": 769 + }, + { + "epoch": 0.2464, + "grad_norm": 0.34718463650647857, + "learning_rate": 0.0001764413396722494, + "loss": 0.648, + "step": 770 + }, + { + "epoch": 0.24672, + "grad_norm": 0.3535437473758207, + "learning_rate": 0.00017637447355622538, + "loss": 0.6596, + "step": 771 + }, + { + "epoch": 0.24704, + "grad_norm": 0.38919238187688104, + "learning_rate": 0.00017630752539067785, + "loss": 0.6867, + "step": 772 + }, + { + "epoch": 0.24736, + "grad_norm": 0.36571627750283614, + "learning_rate": 0.00017624049524752954, + "loss": 0.6347, + "step": 773 + }, + { + "epoch": 0.24768, + "grad_norm": 0.35948035619621493, + "learning_rate": 0.00017617338319879136, + "loss": 0.6954, + "step": 774 + }, + { + "epoch": 0.248, + "grad_norm": 0.3636966174866779, + "learning_rate": 0.0001761061893165621, + "loss": 0.6368, + "step": 775 + }, + { + "epoch": 0.24832, + "grad_norm": 0.3623952898853704, + "learning_rate": 0.0001760389136730286, + "loss": 0.6919, + "step": 776 + }, + { + "epoch": 0.24864, + "grad_norm": 0.3744531382753775, + "learning_rate": 0.00017597155634046537, + "loss": 0.6654, + "step": 777 + }, + { + "epoch": 0.24896, + "grad_norm": 0.38270320727918494, + "learning_rate": 0.00017590411739123484, + "loss": 0.6765, + "step": 778 + }, + { + "epoch": 0.24928, + "grad_norm": 0.34822030214222854, + "learning_rate": 0.000175836596897787, + "loss": 0.6288, + "step": 779 + }, + { + "epoch": 0.2496, + "grad_norm": 0.38997789732359966, + "learning_rate": 0.00017576899493265954, + "loss": 0.6895, + "step": 780 + }, + { + "epoch": 0.24992, + "grad_norm": 0.3710814808434627, + "learning_rate": 0.00017570131156847756, + "loss": 0.6511, + "step": 781 + }, + { + "epoch": 0.25024, + "grad_norm": 0.37325666573165384, + "learning_rate": 0.00017563354687795375, + "loss": 0.7231, + "step": 782 + }, + { + "epoch": 0.25056, + "grad_norm": 0.38392593308239936, + "learning_rate": 0.00017556570093388806, + "loss": 0.6574, + "step": 783 + }, + { + "epoch": 0.25088, + "grad_norm": 0.42112599740387235, + "learning_rate": 0.00017549777380916777, + "loss": 0.6723, + "step": 784 + }, + { + "epoch": 0.2512, + "grad_norm": 0.3521731958080482, + "learning_rate": 0.00017542976557676738, + "loss": 0.6597, + "step": 785 + }, + { + "epoch": 0.25152, + "grad_norm": 0.3851579490484671, + "learning_rate": 0.00017536167630974854, + "loss": 0.6971, + "step": 786 + }, + { + "epoch": 0.25184, + "grad_norm": 0.41011682634144714, + "learning_rate": 0.00017529350608125986, + "loss": 0.7226, + "step": 787 + }, + { + "epoch": 0.25216, + "grad_norm": 0.39012475246529094, + "learning_rate": 0.00017522525496453702, + "loss": 0.7073, + "step": 788 + }, + { + "epoch": 0.25248, + "grad_norm": 0.37831221735486503, + "learning_rate": 0.00017515692303290262, + "loss": 0.6941, + "step": 789 + }, + { + "epoch": 0.2528, + "grad_norm": 0.3961407873087152, + "learning_rate": 0.00017508851035976598, + "loss": 0.6362, + "step": 790 + }, + { + "epoch": 0.25312, + "grad_norm": 0.38372475028444036, + "learning_rate": 0.00017502001701862323, + "loss": 0.7038, + "step": 791 + }, + { + "epoch": 0.25344, + "grad_norm": 0.39253207378164034, + "learning_rate": 0.0001749514430830572, + "loss": 0.7087, + "step": 792 + }, + { + "epoch": 0.25376, + "grad_norm": 0.37536172724992684, + "learning_rate": 0.0001748827886267372, + "loss": 0.6629, + "step": 793 + }, + { + "epoch": 0.25408, + "grad_norm": 0.37939017436966077, + "learning_rate": 0.0001748140537234191, + "loss": 0.7054, + "step": 794 + }, + { + "epoch": 0.2544, + "grad_norm": 0.4008613066398397, + "learning_rate": 0.00017474523844694518, + "loss": 0.6793, + "step": 795 + }, + { + "epoch": 0.25472, + "grad_norm": 0.37880333252362164, + "learning_rate": 0.00017467634287124414, + "loss": 0.7231, + "step": 796 + }, + { + "epoch": 0.25504, + "grad_norm": 0.362347756950756, + "learning_rate": 0.0001746073670703308, + "loss": 0.7087, + "step": 797 + }, + { + "epoch": 0.25536, + "grad_norm": 0.3660111047422492, + "learning_rate": 0.00017453831111830632, + "loss": 0.6523, + "step": 798 + }, + { + "epoch": 0.25568, + "grad_norm": 0.40853376479823095, + "learning_rate": 0.00017446917508935785, + "loss": 0.6982, + "step": 799 + }, + { + "epoch": 0.256, + "grad_norm": 0.3680173008859809, + "learning_rate": 0.0001743999590577586, + "loss": 0.7078, + "step": 800 + }, + { + "epoch": 0.25632, + "grad_norm": 0.38643790020373076, + "learning_rate": 0.00017433066309786779, + "loss": 0.6959, + "step": 801 + }, + { + "epoch": 0.25664, + "grad_norm": 0.39890479799005363, + "learning_rate": 0.0001742612872841304, + "loss": 0.7043, + "step": 802 + }, + { + "epoch": 0.25696, + "grad_norm": 0.3647697505304056, + "learning_rate": 0.00017419183169107728, + "loss": 0.6646, + "step": 803 + }, + { + "epoch": 0.25728, + "grad_norm": 0.38063519440723825, + "learning_rate": 0.00017412229639332497, + "loss": 0.6662, + "step": 804 + }, + { + "epoch": 0.2576, + "grad_norm": 0.3613607544656473, + "learning_rate": 0.00017405268146557565, + "loss": 0.6604, + "step": 805 + }, + { + "epoch": 0.25792, + "grad_norm": 0.3705790308535535, + "learning_rate": 0.00017398298698261696, + "loss": 0.6854, + "step": 806 + }, + { + "epoch": 0.25824, + "grad_norm": 0.369450868677443, + "learning_rate": 0.00017391321301932217, + "loss": 0.6787, + "step": 807 + }, + { + "epoch": 0.25856, + "grad_norm": 0.34498991261234885, + "learning_rate": 0.00017384335965064972, + "loss": 0.6087, + "step": 808 + }, + { + "epoch": 0.25888, + "grad_norm": 0.3544005279140098, + "learning_rate": 0.00017377342695164356, + "loss": 0.6523, + "step": 809 + }, + { + "epoch": 0.2592, + "grad_norm": 0.3703786236785271, + "learning_rate": 0.00017370341499743278, + "loss": 0.6886, + "step": 810 + }, + { + "epoch": 0.25952, + "grad_norm": 0.3586889275901231, + "learning_rate": 0.00017363332386323156, + "loss": 0.6845, + "step": 811 + }, + { + "epoch": 0.25984, + "grad_norm": 0.35213584457379365, + "learning_rate": 0.0001735631536243392, + "loss": 0.6585, + "step": 812 + }, + { + "epoch": 0.26016, + "grad_norm": 0.4054390943539348, + "learning_rate": 0.00017349290435614, + "loss": 0.6519, + "step": 813 + }, + { + "epoch": 0.26048, + "grad_norm": 0.36669723823752626, + "learning_rate": 0.0001734225761341032, + "loss": 0.697, + "step": 814 + }, + { + "epoch": 0.2608, + "grad_norm": 0.3579772708248754, + "learning_rate": 0.00017335216903378267, + "loss": 0.7142, + "step": 815 + }, + { + "epoch": 0.26112, + "grad_norm": 0.37007009053306683, + "learning_rate": 0.00017328168313081728, + "loss": 0.6559, + "step": 816 + }, + { + "epoch": 0.26144, + "grad_norm": 0.36960486757346084, + "learning_rate": 0.00017321111850093036, + "loss": 0.67, + "step": 817 + }, + { + "epoch": 0.26176, + "grad_norm": 0.3979872534062356, + "learning_rate": 0.00017314047521992993, + "loss": 0.7315, + "step": 818 + }, + { + "epoch": 0.26208, + "grad_norm": 0.3629816736881855, + "learning_rate": 0.0001730697533637084, + "loss": 0.6971, + "step": 819 + }, + { + "epoch": 0.2624, + "grad_norm": 0.34987269536959664, + "learning_rate": 0.0001729989530082427, + "loss": 0.6862, + "step": 820 + }, + { + "epoch": 0.26272, + "grad_norm": 0.3506826922755894, + "learning_rate": 0.00017292807422959402, + "loss": 0.6874, + "step": 821 + }, + { + "epoch": 0.26304, + "grad_norm": 0.35080493419930325, + "learning_rate": 0.00017285711710390787, + "loss": 0.6482, + "step": 822 + }, + { + "epoch": 0.26336, + "grad_norm": 0.36295944138367087, + "learning_rate": 0.00017278608170741383, + "loss": 0.7002, + "step": 823 + }, + { + "epoch": 0.26368, + "grad_norm": 0.3819450634574015, + "learning_rate": 0.0001727149681164257, + "loss": 0.709, + "step": 824 + }, + { + "epoch": 0.264, + "grad_norm": 0.3736935737242355, + "learning_rate": 0.00017264377640734114, + "loss": 0.6962, + "step": 825 + }, + { + "epoch": 0.26432, + "grad_norm": 0.38204111246911654, + "learning_rate": 0.00017257250665664179, + "loss": 0.691, + "step": 826 + }, + { + "epoch": 0.26464, + "grad_norm": 0.3513189667088068, + "learning_rate": 0.00017250115894089322, + "loss": 0.6449, + "step": 827 + }, + { + "epoch": 0.26496, + "grad_norm": 0.3644897631112619, + "learning_rate": 0.0001724297333367446, + "loss": 0.6863, + "step": 828 + }, + { + "epoch": 0.26528, + "grad_norm": 0.36027334107647974, + "learning_rate": 0.00017235822992092893, + "loss": 0.6706, + "step": 829 + }, + { + "epoch": 0.2656, + "grad_norm": 0.35122715715410546, + "learning_rate": 0.00017228664877026265, + "loss": 0.673, + "step": 830 + }, + { + "epoch": 0.26592, + "grad_norm": 0.3625251310005681, + "learning_rate": 0.00017221498996164582, + "loss": 0.6612, + "step": 831 + }, + { + "epoch": 0.26624, + "grad_norm": 0.35030684269057144, + "learning_rate": 0.00017214325357206193, + "loss": 0.647, + "step": 832 + }, + { + "epoch": 0.26656, + "grad_norm": 0.3675875937136502, + "learning_rate": 0.00017207143967857777, + "loss": 0.7279, + "step": 833 + }, + { + "epoch": 0.26688, + "grad_norm": 0.3727898885861703, + "learning_rate": 0.00017199954835834337, + "loss": 0.7008, + "step": 834 + }, + { + "epoch": 0.2672, + "grad_norm": 0.3618226189281512, + "learning_rate": 0.00017192757968859202, + "loss": 0.7105, + "step": 835 + }, + { + "epoch": 0.26752, + "grad_norm": 0.394255699092009, + "learning_rate": 0.00017185553374664004, + "loss": 0.6932, + "step": 836 + }, + { + "epoch": 0.26784, + "grad_norm": 0.349870418299385, + "learning_rate": 0.00017178341060988678, + "loss": 0.7202, + "step": 837 + }, + { + "epoch": 0.26816, + "grad_norm": 0.3651099621762893, + "learning_rate": 0.0001717112103558146, + "loss": 0.7125, + "step": 838 + }, + { + "epoch": 0.26848, + "grad_norm": 0.36389602459689674, + "learning_rate": 0.00017163893306198854, + "loss": 0.7007, + "step": 839 + }, + { + "epoch": 0.2688, + "grad_norm": 0.36075341264175675, + "learning_rate": 0.00017156657880605653, + "loss": 0.6866, + "step": 840 + }, + { + "epoch": 0.26912, + "grad_norm": 0.3834109027245434, + "learning_rate": 0.00017149414766574918, + "loss": 0.6775, + "step": 841 + }, + { + "epoch": 0.26944, + "grad_norm": 0.3746030932157122, + "learning_rate": 0.00017142163971887965, + "loss": 0.6482, + "step": 842 + }, + { + "epoch": 0.26976, + "grad_norm": 0.356226388303307, + "learning_rate": 0.00017134905504334364, + "loss": 0.6423, + "step": 843 + }, + { + "epoch": 0.27008, + "grad_norm": 0.36012166278949104, + "learning_rate": 0.00017127639371711926, + "loss": 0.7024, + "step": 844 + }, + { + "epoch": 0.2704, + "grad_norm": 0.3654792778007879, + "learning_rate": 0.000171203655818267, + "loss": 0.7008, + "step": 845 + }, + { + "epoch": 0.27072, + "grad_norm": 0.37270068868360284, + "learning_rate": 0.0001711308414249295, + "loss": 0.7033, + "step": 846 + }, + { + "epoch": 0.27104, + "grad_norm": 0.34807063715927555, + "learning_rate": 0.00017105795061533183, + "loss": 0.6358, + "step": 847 + }, + { + "epoch": 0.27136, + "grad_norm": 0.393270880591959, + "learning_rate": 0.0001709849834677809, + "loss": 0.6791, + "step": 848 + }, + { + "epoch": 0.27168, + "grad_norm": 0.3655470453282021, + "learning_rate": 0.00017091194006066572, + "loss": 0.6456, + "step": 849 + }, + { + "epoch": 0.272, + "grad_norm": 0.3506513116310917, + "learning_rate": 0.0001708388204724572, + "loss": 0.682, + "step": 850 + }, + { + "epoch": 0.27232, + "grad_norm": 0.3851875616113911, + "learning_rate": 0.00017076562478170822, + "loss": 0.6612, + "step": 851 + }, + { + "epoch": 0.27264, + "grad_norm": 0.37361029451719346, + "learning_rate": 0.00017069235306705323, + "loss": 0.6621, + "step": 852 + }, + { + "epoch": 0.27296, + "grad_norm": 0.4018506569535499, + "learning_rate": 0.0001706190054072085, + "loss": 0.6689, + "step": 853 + }, + { + "epoch": 0.27328, + "grad_norm": 0.3574705065373127, + "learning_rate": 0.0001705455818809718, + "loss": 0.666, + "step": 854 + }, + { + "epoch": 0.2736, + "grad_norm": 0.37118008203529135, + "learning_rate": 0.00017047208256722244, + "loss": 0.6447, + "step": 855 + }, + { + "epoch": 0.27392, + "grad_norm": 0.37339442339511913, + "learning_rate": 0.00017039850754492112, + "loss": 0.6366, + "step": 856 + }, + { + "epoch": 0.27424, + "grad_norm": 0.36554648138335893, + "learning_rate": 0.00017032485689310998, + "loss": 0.6809, + "step": 857 + }, + { + "epoch": 0.27456, + "grad_norm": 0.3764822919216519, + "learning_rate": 0.00017025113069091223, + "loss": 0.6637, + "step": 858 + }, + { + "epoch": 0.27488, + "grad_norm": 0.4207942342818041, + "learning_rate": 0.0001701773290175324, + "loss": 0.7455, + "step": 859 + }, + { + "epoch": 0.2752, + "grad_norm": 0.34667589640154944, + "learning_rate": 0.00017010345195225598, + "loss": 0.6685, + "step": 860 + }, + { + "epoch": 0.27552, + "grad_norm": 0.3581586943653799, + "learning_rate": 0.0001700294995744496, + "loss": 0.663, + "step": 861 + }, + { + "epoch": 0.27584, + "grad_norm": 0.371326003370556, + "learning_rate": 0.00016995547196356066, + "loss": 0.6911, + "step": 862 + }, + { + "epoch": 0.27616, + "grad_norm": 0.36538386375152104, + "learning_rate": 0.0001698813691991174, + "loss": 0.6506, + "step": 863 + }, + { + "epoch": 0.27648, + "grad_norm": 0.36767576315495165, + "learning_rate": 0.00016980719136072892, + "loss": 0.7034, + "step": 864 + }, + { + "epoch": 0.2768, + "grad_norm": 0.37065161276198183, + "learning_rate": 0.00016973293852808486, + "loss": 0.6778, + "step": 865 + }, + { + "epoch": 0.27712, + "grad_norm": 0.38682531338216397, + "learning_rate": 0.00016965861078095537, + "loss": 0.6658, + "step": 866 + }, + { + "epoch": 0.27744, + "grad_norm": 0.3810259829824189, + "learning_rate": 0.00016958420819919128, + "loss": 0.6738, + "step": 867 + }, + { + "epoch": 0.27776, + "grad_norm": 0.3749059191738754, + "learning_rate": 0.00016950973086272365, + "loss": 0.7156, + "step": 868 + }, + { + "epoch": 0.27808, + "grad_norm": 0.3872215251605679, + "learning_rate": 0.00016943517885156386, + "loss": 0.6865, + "step": 869 + }, + { + "epoch": 0.2784, + "grad_norm": 0.3738462248754198, + "learning_rate": 0.0001693605522458036, + "loss": 0.6918, + "step": 870 + }, + { + "epoch": 0.27872, + "grad_norm": 0.36877661490834046, + "learning_rate": 0.00016928585112561465, + "loss": 0.6679, + "step": 871 + }, + { + "epoch": 0.27904, + "grad_norm": 0.3544422874787804, + "learning_rate": 0.00016921107557124883, + "loss": 0.6272, + "step": 872 + }, + { + "epoch": 0.27936, + "grad_norm": 0.3708189893062282, + "learning_rate": 0.0001691362256630379, + "loss": 0.6806, + "step": 873 + }, + { + "epoch": 0.27968, + "grad_norm": 0.380215015229633, + "learning_rate": 0.00016906130148139364, + "loss": 0.7079, + "step": 874 + }, + { + "epoch": 0.28, + "grad_norm": 0.37096240233370503, + "learning_rate": 0.00016898630310680738, + "loss": 0.6733, + "step": 875 + }, + { + "epoch": 0.28032, + "grad_norm": 0.3941364301418198, + "learning_rate": 0.0001689112306198504, + "loss": 0.6839, + "step": 876 + }, + { + "epoch": 0.28064, + "grad_norm": 0.36432588768029983, + "learning_rate": 0.00016883608410117343, + "loss": 0.6583, + "step": 877 + }, + { + "epoch": 0.28096, + "grad_norm": 0.36934987086358184, + "learning_rate": 0.0001687608636315068, + "loss": 0.6532, + "step": 878 + }, + { + "epoch": 0.28128, + "grad_norm": 0.3694658537509994, + "learning_rate": 0.00016868556929166032, + "loss": 0.6482, + "step": 879 + }, + { + "epoch": 0.2816, + "grad_norm": 0.3713959384678602, + "learning_rate": 0.0001686102011625231, + "loss": 0.6642, + "step": 880 + }, + { + "epoch": 0.28192, + "grad_norm": 0.421555137852422, + "learning_rate": 0.00016853475932506352, + "loss": 0.6305, + "step": 881 + }, + { + "epoch": 0.28224, + "grad_norm": 0.3815269263698826, + "learning_rate": 0.00016845924386032918, + "loss": 0.674, + "step": 882 + }, + { + "epoch": 0.28256, + "grad_norm": 0.3671879656089705, + "learning_rate": 0.0001683836548494468, + "loss": 0.6754, + "step": 883 + }, + { + "epoch": 0.28288, + "grad_norm": 0.33850983586535077, + "learning_rate": 0.00016830799237362203, + "loss": 0.6268, + "step": 884 + }, + { + "epoch": 0.2832, + "grad_norm": 0.3521345262892056, + "learning_rate": 0.00016823225651413953, + "loss": 0.6193, + "step": 885 + }, + { + "epoch": 0.28352, + "grad_norm": 0.6904724303293136, + "learning_rate": 0.00016815644735236268, + "loss": 0.6971, + "step": 886 + }, + { + "epoch": 0.28384, + "grad_norm": 0.3898409987607257, + "learning_rate": 0.0001680805649697338, + "loss": 0.6596, + "step": 887 + }, + { + "epoch": 0.28416, + "grad_norm": 0.36430433774492677, + "learning_rate": 0.0001680046094477737, + "loss": 0.6383, + "step": 888 + }, + { + "epoch": 0.28448, + "grad_norm": 0.4237870824617201, + "learning_rate": 0.00016792858086808177, + "loss": 0.6757, + "step": 889 + }, + { + "epoch": 0.2848, + "grad_norm": 0.3792898105886306, + "learning_rate": 0.00016785247931233602, + "loss": 0.672, + "step": 890 + }, + { + "epoch": 0.28512, + "grad_norm": 0.4005473857322007, + "learning_rate": 0.00016777630486229273, + "loss": 0.7246, + "step": 891 + }, + { + "epoch": 0.28544, + "grad_norm": 0.3784335381702001, + "learning_rate": 0.00016770005759978655, + "loss": 0.6801, + "step": 892 + }, + { + "epoch": 0.28576, + "grad_norm": 0.3661674946140045, + "learning_rate": 0.00016762373760673035, + "loss": 0.7091, + "step": 893 + }, + { + "epoch": 0.28608, + "grad_norm": 0.35777449313307796, + "learning_rate": 0.00016754734496511514, + "loss": 0.6189, + "step": 894 + }, + { + "epoch": 0.2864, + "grad_norm": 0.35590285851087305, + "learning_rate": 0.00016747087975700988, + "loss": 0.6969, + "step": 895 + }, + { + "epoch": 0.28672, + "grad_norm": 0.402997502242191, + "learning_rate": 0.00016739434206456167, + "loss": 0.6836, + "step": 896 + }, + { + "epoch": 0.28704, + "grad_norm": 0.3859134510340487, + "learning_rate": 0.00016731773196999533, + "loss": 0.7321, + "step": 897 + }, + { + "epoch": 0.28736, + "grad_norm": 0.35399634339455766, + "learning_rate": 0.00016724104955561354, + "loss": 0.64, + "step": 898 + }, + { + "epoch": 0.28768, + "grad_norm": 0.35460270204765015, + "learning_rate": 0.0001671642949037966, + "loss": 0.6796, + "step": 899 + }, + { + "epoch": 0.288, + "grad_norm": 0.3577821876725225, + "learning_rate": 0.0001670874680970025, + "loss": 0.6261, + "step": 900 + }, + { + "epoch": 0.28832, + "grad_norm": 0.3772616328756418, + "learning_rate": 0.0001670105692177667, + "loss": 0.6755, + "step": 901 + }, + { + "epoch": 0.28864, + "grad_norm": 0.380094153877933, + "learning_rate": 0.00016693359834870207, + "loss": 0.6484, + "step": 902 + }, + { + "epoch": 0.28896, + "grad_norm": 0.3665405454149139, + "learning_rate": 0.00016685655557249887, + "loss": 0.6946, + "step": 903 + }, + { + "epoch": 0.28928, + "grad_norm": 0.3861010972957807, + "learning_rate": 0.0001667794409719246, + "loss": 0.6929, + "step": 904 + }, + { + "epoch": 0.2896, + "grad_norm": 0.3926124592299394, + "learning_rate": 0.00016670225462982386, + "loss": 0.6724, + "step": 905 + }, + { + "epoch": 0.28992, + "grad_norm": 0.3523672322317073, + "learning_rate": 0.0001666249966291184, + "loss": 0.6467, + "step": 906 + }, + { + "epoch": 0.29024, + "grad_norm": 0.3602395304708209, + "learning_rate": 0.00016654766705280694, + "loss": 0.6229, + "step": 907 + }, + { + "epoch": 0.29056, + "grad_norm": 0.3999067550893319, + "learning_rate": 0.00016647026598396505, + "loss": 0.6553, + "step": 908 + }, + { + "epoch": 0.29088, + "grad_norm": 0.3618976583977307, + "learning_rate": 0.0001663927935057451, + "loss": 0.6022, + "step": 909 + }, + { + "epoch": 0.2912, + "grad_norm": 0.3643539203957328, + "learning_rate": 0.0001663152497013763, + "loss": 0.6918, + "step": 910 + }, + { + "epoch": 0.29152, + "grad_norm": 0.38437017782753297, + "learning_rate": 0.00016623763465416425, + "loss": 0.6706, + "step": 911 + }, + { + "epoch": 0.29184, + "grad_norm": 0.39873148741296155, + "learning_rate": 0.0001661599484474914, + "loss": 0.7074, + "step": 912 + }, + { + "epoch": 0.29216, + "grad_norm": 0.3949016272609718, + "learning_rate": 0.0001660821911648163, + "loss": 0.7474, + "step": 913 + }, + { + "epoch": 0.29248, + "grad_norm": 0.3494489891991421, + "learning_rate": 0.00016600436288967418, + "loss": 0.6632, + "step": 914 + }, + { + "epoch": 0.2928, + "grad_norm": 0.3501319837881801, + "learning_rate": 0.0001659264637056763, + "loss": 0.6304, + "step": 915 + }, + { + "epoch": 0.29312, + "grad_norm": 0.3527166694363339, + "learning_rate": 0.00016584849369651026, + "loss": 0.6397, + "step": 916 + }, + { + "epoch": 0.29344, + "grad_norm": 0.381420613678227, + "learning_rate": 0.00016577045294593958, + "loss": 0.6531, + "step": 917 + }, + { + "epoch": 0.29376, + "grad_norm": 0.36213841323932616, + "learning_rate": 0.00016569234153780395, + "loss": 0.6513, + "step": 918 + }, + { + "epoch": 0.29408, + "grad_norm": 0.3623787905356813, + "learning_rate": 0.00016561415955601886, + "loss": 0.6275, + "step": 919 + }, + { + "epoch": 0.2944, + "grad_norm": 0.35633437654990074, + "learning_rate": 0.0001655359070845757, + "loss": 0.6678, + "step": 920 + }, + { + "epoch": 0.29472, + "grad_norm": 0.3746090132247638, + "learning_rate": 0.00016545758420754146, + "loss": 0.6892, + "step": 921 + }, + { + "epoch": 0.29504, + "grad_norm": 0.3425791549577302, + "learning_rate": 0.0001653791910090589, + "loss": 0.5937, + "step": 922 + }, + { + "epoch": 0.29536, + "grad_norm": 0.36255434070489573, + "learning_rate": 0.00016530072757334625, + "loss": 0.6745, + "step": 923 + }, + { + "epoch": 0.29568, + "grad_norm": 0.3776061396002599, + "learning_rate": 0.00016522219398469723, + "loss": 0.6909, + "step": 924 + }, + { + "epoch": 0.296, + "grad_norm": 0.38002185151870216, + "learning_rate": 0.00016514359032748088, + "loss": 0.6561, + "step": 925 + }, + { + "epoch": 0.29632, + "grad_norm": 0.37971148165006247, + "learning_rate": 0.0001650649166861416, + "loss": 0.7082, + "step": 926 + }, + { + "epoch": 0.29664, + "grad_norm": 0.36269364908054363, + "learning_rate": 0.00016498617314519886, + "loss": 0.6385, + "step": 927 + }, + { + "epoch": 0.29696, + "grad_norm": 0.3599052242365175, + "learning_rate": 0.00016490735978924733, + "loss": 0.6906, + "step": 928 + }, + { + "epoch": 0.29728, + "grad_norm": 0.351768734324277, + "learning_rate": 0.00016482847670295665, + "loss": 0.6703, + "step": 929 + }, + { + "epoch": 0.2976, + "grad_norm": 0.3812060990620126, + "learning_rate": 0.00016474952397107134, + "loss": 0.6426, + "step": 930 + }, + { + "epoch": 0.29792, + "grad_norm": 0.35627487037143313, + "learning_rate": 0.00016467050167841074, + "loss": 0.6475, + "step": 931 + }, + { + "epoch": 0.29824, + "grad_norm": 0.3646915738130938, + "learning_rate": 0.00016459140990986894, + "loss": 0.7014, + "step": 932 + }, + { + "epoch": 0.29856, + "grad_norm": 0.3771235652915896, + "learning_rate": 0.0001645122487504147, + "loss": 0.7274, + "step": 933 + }, + { + "epoch": 0.29888, + "grad_norm": 0.353189475012315, + "learning_rate": 0.0001644330182850913, + "loss": 0.5989, + "step": 934 + }, + { + "epoch": 0.2992, + "grad_norm": 0.37873884947495684, + "learning_rate": 0.00016435371859901645, + "loss": 0.735, + "step": 935 + }, + { + "epoch": 0.29952, + "grad_norm": 0.3794924661892233, + "learning_rate": 0.00016427434977738225, + "loss": 0.7076, + "step": 936 + }, + { + "epoch": 0.29984, + "grad_norm": 0.35027737099186884, + "learning_rate": 0.00016419491190545509, + "loss": 0.6959, + "step": 937 + }, + { + "epoch": 0.30016, + "grad_norm": 0.33591022962403055, + "learning_rate": 0.00016411540506857547, + "loss": 0.6204, + "step": 938 + }, + { + "epoch": 0.30048, + "grad_norm": 0.3539505955772323, + "learning_rate": 0.0001640358293521581, + "loss": 0.656, + "step": 939 + }, + { + "epoch": 0.3008, + "grad_norm": 0.3617125872942655, + "learning_rate": 0.0001639561848416915, + "loss": 0.6451, + "step": 940 + }, + { + "epoch": 0.30112, + "grad_norm": 0.37123614005159844, + "learning_rate": 0.00016387647162273837, + "loss": 0.7278, + "step": 941 + }, + { + "epoch": 0.30144, + "grad_norm": 0.3725088941668062, + "learning_rate": 0.00016379668978093491, + "loss": 0.6451, + "step": 942 + }, + { + "epoch": 0.30176, + "grad_norm": 0.34152340573341006, + "learning_rate": 0.00016371683940199133, + "loss": 0.6771, + "step": 943 + }, + { + "epoch": 0.30208, + "grad_norm": 0.3548490178109378, + "learning_rate": 0.00016363692057169124, + "loss": 0.6843, + "step": 944 + }, + { + "epoch": 0.3024, + "grad_norm": 0.35343157514639656, + "learning_rate": 0.00016355693337589196, + "loss": 0.688, + "step": 945 + }, + { + "epoch": 0.30272, + "grad_norm": 0.3640587759348646, + "learning_rate": 0.00016347687790052416, + "loss": 0.6717, + "step": 946 + }, + { + "epoch": 0.30304, + "grad_norm": 0.36144577101370634, + "learning_rate": 0.00016339675423159182, + "loss": 0.6268, + "step": 947 + }, + { + "epoch": 0.30336, + "grad_norm": 0.380090566499197, + "learning_rate": 0.0001633165624551723, + "loss": 0.6831, + "step": 948 + }, + { + "epoch": 0.30368, + "grad_norm": 0.3769643347765158, + "learning_rate": 0.0001632363026574161, + "loss": 0.6858, + "step": 949 + }, + { + "epoch": 0.304, + "grad_norm": 0.37556314716044387, + "learning_rate": 0.00016315597492454672, + "loss": 0.6933, + "step": 950 + }, + { + "epoch": 0.30432, + "grad_norm": 0.3680280472287063, + "learning_rate": 0.0001630755793428607, + "loss": 0.6713, + "step": 951 + }, + { + "epoch": 0.30464, + "grad_norm": 0.36623019970443227, + "learning_rate": 0.00016299511599872753, + "loss": 0.7097, + "step": 952 + }, + { + "epoch": 0.30496, + "grad_norm": 0.3744260530128633, + "learning_rate": 0.0001629145849785893, + "loss": 0.6656, + "step": 953 + }, + { + "epoch": 0.30528, + "grad_norm": 0.3609342240323625, + "learning_rate": 0.00016283398636896107, + "loss": 0.6421, + "step": 954 + }, + { + "epoch": 0.3056, + "grad_norm": 0.35399280203812694, + "learning_rate": 0.00016275332025643028, + "loss": 0.6587, + "step": 955 + }, + { + "epoch": 0.30592, + "grad_norm": 0.3698257987725609, + "learning_rate": 0.000162672586727657, + "loss": 0.6756, + "step": 956 + }, + { + "epoch": 0.30624, + "grad_norm": 0.36418119602728144, + "learning_rate": 0.00016259178586937377, + "loss": 0.6538, + "step": 957 + }, + { + "epoch": 0.30656, + "grad_norm": 0.35738670930122973, + "learning_rate": 0.00016251091776838536, + "loss": 0.6733, + "step": 958 + }, + { + "epoch": 0.30688, + "grad_norm": 0.3883558384165783, + "learning_rate": 0.00016242998251156883, + "loss": 0.6671, + "step": 959 + }, + { + "epoch": 0.3072, + "grad_norm": 0.36084244676274785, + "learning_rate": 0.00016234898018587337, + "loss": 0.6477, + "step": 960 + }, + { + "epoch": 0.30752, + "grad_norm": 0.35322104987763, + "learning_rate": 0.00016226791087832024, + "loss": 0.6534, + "step": 961 + }, + { + "epoch": 0.30784, + "grad_norm": 0.36973136564250303, + "learning_rate": 0.00016218677467600264, + "loss": 0.6583, + "step": 962 + }, + { + "epoch": 0.30816, + "grad_norm": 0.3824178268494372, + "learning_rate": 0.00016210557166608562, + "loss": 0.6576, + "step": 963 + }, + { + "epoch": 0.30848, + "grad_norm": 0.3411237488526102, + "learning_rate": 0.0001620243019358061, + "loss": 0.684, + "step": 964 + }, + { + "epoch": 0.3088, + "grad_norm": 0.37440464927829414, + "learning_rate": 0.00016194296557247255, + "loss": 0.652, + "step": 965 + }, + { + "epoch": 0.30912, + "grad_norm": 0.35787823750772685, + "learning_rate": 0.0001618615626634651, + "loss": 0.6943, + "step": 966 + }, + { + "epoch": 0.30944, + "grad_norm": 0.3579971211954716, + "learning_rate": 0.00016178009329623533, + "loss": 0.6607, + "step": 967 + }, + { + "epoch": 0.30976, + "grad_norm": 0.37491312284581807, + "learning_rate": 0.00016169855755830627, + "loss": 0.7079, + "step": 968 + }, + { + "epoch": 0.31008, + "grad_norm": 0.3758810732262012, + "learning_rate": 0.0001616169555372722, + "loss": 0.696, + "step": 969 + }, + { + "epoch": 0.3104, + "grad_norm": 0.3685168908904295, + "learning_rate": 0.0001615352873207986, + "loss": 0.6437, + "step": 970 + }, + { + "epoch": 0.31072, + "grad_norm": 0.3472159295621595, + "learning_rate": 0.00016145355299662211, + "loss": 0.625, + "step": 971 + }, + { + "epoch": 0.31104, + "grad_norm": 0.3760137565048465, + "learning_rate": 0.0001613717526525504, + "loss": 0.663, + "step": 972 + }, + { + "epoch": 0.31136, + "grad_norm": 0.3705790528335316, + "learning_rate": 0.00016128988637646204, + "loss": 0.6884, + "step": 973 + }, + { + "epoch": 0.31168, + "grad_norm": 0.37541867430056264, + "learning_rate": 0.00016120795425630634, + "loss": 0.6854, + "step": 974 + }, + { + "epoch": 0.312, + "grad_norm": 0.38349964714380863, + "learning_rate": 0.00016112595638010353, + "loss": 0.6917, + "step": 975 + }, + { + "epoch": 0.31232, + "grad_norm": 0.3640848867841372, + "learning_rate": 0.00016104389283594435, + "loss": 0.633, + "step": 976 + }, + { + "epoch": 0.31264, + "grad_norm": 0.36504894306850255, + "learning_rate": 0.00016096176371199015, + "loss": 0.6975, + "step": 977 + }, + { + "epoch": 0.31296, + "grad_norm": 0.3743788002825349, + "learning_rate": 0.00016087956909647264, + "loss": 0.6332, + "step": 978 + }, + { + "epoch": 0.31328, + "grad_norm": 0.38754508345126265, + "learning_rate": 0.000160797309077694, + "loss": 0.6625, + "step": 979 + }, + { + "epoch": 0.3136, + "grad_norm": 0.3982264654764306, + "learning_rate": 0.00016071498374402665, + "loss": 0.735, + "step": 980 + }, + { + "epoch": 0.31392, + "grad_norm": 0.364892912346316, + "learning_rate": 0.00016063259318391314, + "loss": 0.6332, + "step": 981 + }, + { + "epoch": 0.31424, + "grad_norm": 0.3603837381919519, + "learning_rate": 0.00016055013748586606, + "loss": 0.6539, + "step": 982 + }, + { + "epoch": 0.31456, + "grad_norm": 0.40015024801271154, + "learning_rate": 0.0001604676167384681, + "loss": 0.6703, + "step": 983 + }, + { + "epoch": 0.31488, + "grad_norm": 0.35001951598663866, + "learning_rate": 0.0001603850310303717, + "loss": 0.6098, + "step": 984 + }, + { + "epoch": 0.3152, + "grad_norm": 0.3668523738327645, + "learning_rate": 0.00016030238045029916, + "loss": 0.645, + "step": 985 + }, + { + "epoch": 0.31552, + "grad_norm": 0.35403577782505474, + "learning_rate": 0.00016021966508704253, + "loss": 0.6603, + "step": 986 + }, + { + "epoch": 0.31584, + "grad_norm": 0.3789129050894694, + "learning_rate": 0.0001601368850294633, + "loss": 0.6628, + "step": 987 + }, + { + "epoch": 0.31616, + "grad_norm": 0.4647907425339079, + "learning_rate": 0.00016005404036649256, + "loss": 0.6876, + "step": 988 + }, + { + "epoch": 0.31648, + "grad_norm": 0.3594507397418406, + "learning_rate": 0.00015997113118713086, + "loss": 0.5998, + "step": 989 + }, + { + "epoch": 0.3168, + "grad_norm": 0.3652042297314683, + "learning_rate": 0.00015988815758044792, + "loss": 0.6838, + "step": 990 + }, + { + "epoch": 0.31712, + "grad_norm": 0.3729665870482187, + "learning_rate": 0.00015980511963558278, + "loss": 0.6495, + "step": 991 + }, + { + "epoch": 0.31744, + "grad_norm": 0.36933924818636055, + "learning_rate": 0.00015972201744174352, + "loss": 0.6757, + "step": 992 + }, + { + "epoch": 0.31776, + "grad_norm": 0.38679551637565257, + "learning_rate": 0.00015963885108820743, + "loss": 0.6541, + "step": 993 + }, + { + "epoch": 0.31808, + "grad_norm": 0.3603785868451744, + "learning_rate": 0.00015955562066432042, + "loss": 0.6163, + "step": 994 + }, + { + "epoch": 0.3184, + "grad_norm": 0.3929824324946899, + "learning_rate": 0.0001594723262594975, + "loss": 0.6849, + "step": 995 + }, + { + "epoch": 0.31872, + "grad_norm": 0.3559659322460444, + "learning_rate": 0.0001593889679632223, + "loss": 0.6379, + "step": 996 + }, + { + "epoch": 0.31904, + "grad_norm": 0.37883766953321385, + "learning_rate": 0.00015930554586504706, + "loss": 0.6729, + "step": 997 + }, + { + "epoch": 0.31936, + "grad_norm": 0.3729989821806419, + "learning_rate": 0.00015922206005459266, + "loss": 0.6889, + "step": 998 + }, + { + "epoch": 0.31968, + "grad_norm": 0.35918826467125653, + "learning_rate": 0.00015913851062154835, + "loss": 0.5814, + "step": 999 + }, + { + "epoch": 0.32, + "grad_norm": 0.35906920939136566, + "learning_rate": 0.00015905489765567172, + "loss": 0.6608, + "step": 1000 + }, + { + "epoch": 0.32032, + "grad_norm": 0.383067845796615, + "learning_rate": 0.0001589712212467887, + "loss": 0.6538, + "step": 1001 + }, + { + "epoch": 0.32064, + "grad_norm": 0.39490816143678525, + "learning_rate": 0.00015888748148479328, + "loss": 0.7145, + "step": 1002 + }, + { + "epoch": 0.32096, + "grad_norm": 0.3649317674576256, + "learning_rate": 0.0001588036784596476, + "loss": 0.6571, + "step": 1003 + }, + { + "epoch": 0.32128, + "grad_norm": 0.3522141223196636, + "learning_rate": 0.00015871981226138173, + "loss": 0.6314, + "step": 1004 + }, + { + "epoch": 0.3216, + "grad_norm": 0.3737060606617921, + "learning_rate": 0.00015863588298009352, + "loss": 0.6448, + "step": 1005 + }, + { + "epoch": 0.32192, + "grad_norm": 0.4006255670960862, + "learning_rate": 0.00015855189070594866, + "loss": 0.6446, + "step": 1006 + }, + { + "epoch": 0.32224, + "grad_norm": 0.3788146630252767, + "learning_rate": 0.00015846783552918062, + "loss": 0.6669, + "step": 1007 + }, + { + "epoch": 0.32256, + "grad_norm": 0.3353138151804068, + "learning_rate": 0.00015838371754009028, + "loss": 0.6304, + "step": 1008 + }, + { + "epoch": 0.32288, + "grad_norm": 0.37891280362065266, + "learning_rate": 0.00015829953682904607, + "loss": 0.6662, + "step": 1009 + }, + { + "epoch": 0.3232, + "grad_norm": 0.36893994601189634, + "learning_rate": 0.0001582152934864838, + "loss": 0.6723, + "step": 1010 + }, + { + "epoch": 0.32352, + "grad_norm": 0.3718370779966313, + "learning_rate": 0.00015813098760290658, + "loss": 0.6795, + "step": 1011 + }, + { + "epoch": 0.32384, + "grad_norm": 0.37614222168808964, + "learning_rate": 0.00015804661926888466, + "loss": 0.689, + "step": 1012 + }, + { + "epoch": 0.32416, + "grad_norm": 0.35470906810610897, + "learning_rate": 0.00015796218857505546, + "loss": 0.6787, + "step": 1013 + }, + { + "epoch": 0.32448, + "grad_norm": 0.3868230064365295, + "learning_rate": 0.0001578776956121233, + "loss": 0.6919, + "step": 1014 + }, + { + "epoch": 0.3248, + "grad_norm": 0.3584080977191767, + "learning_rate": 0.00015779314047085946, + "loss": 0.649, + "step": 1015 + }, + { + "epoch": 0.32512, + "grad_norm": 0.3581072006980192, + "learning_rate": 0.00015770852324210202, + "loss": 0.6889, + "step": 1016 + }, + { + "epoch": 0.32544, + "grad_norm": 0.362762782023143, + "learning_rate": 0.00015762384401675567, + "loss": 0.6816, + "step": 1017 + }, + { + "epoch": 0.32576, + "grad_norm": 0.371193053002529, + "learning_rate": 0.00015753910288579184, + "loss": 0.6565, + "step": 1018 + }, + { + "epoch": 0.32608, + "grad_norm": 0.38115451431628883, + "learning_rate": 0.0001574542999402484, + "loss": 0.6811, + "step": 1019 + }, + { + "epoch": 0.3264, + "grad_norm": 0.36106395785399764, + "learning_rate": 0.00015736943527122963, + "loss": 0.661, + "step": 1020 + }, + { + "epoch": 0.32672, + "grad_norm": 0.35735851646054667, + "learning_rate": 0.00015728450896990606, + "loss": 0.6663, + "step": 1021 + }, + { + "epoch": 0.32704, + "grad_norm": 0.3623931389838278, + "learning_rate": 0.0001571995211275146, + "loss": 0.7253, + "step": 1022 + }, + { + "epoch": 0.32736, + "grad_norm": 0.35506019809511985, + "learning_rate": 0.00015711447183535806, + "loss": 0.618, + "step": 1023 + }, + { + "epoch": 0.32768, + "grad_norm": 0.3489857022479348, + "learning_rate": 0.0001570293611848054, + "loss": 0.6222, + "step": 1024 + }, + { + "epoch": 0.328, + "grad_norm": 0.37680837981183957, + "learning_rate": 0.00015694418926729146, + "loss": 0.6799, + "step": 1025 + }, + { + "epoch": 0.32832, + "grad_norm": 0.36911157265370625, + "learning_rate": 0.00015685895617431695, + "loss": 0.6583, + "step": 1026 + }, + { + "epoch": 0.32864, + "grad_norm": 0.36159265607345803, + "learning_rate": 0.0001567736619974482, + "loss": 0.661, + "step": 1027 + }, + { + "epoch": 0.32896, + "grad_norm": 0.36901441587991685, + "learning_rate": 0.00015668830682831724, + "loss": 0.6655, + "step": 1028 + }, + { + "epoch": 0.32928, + "grad_norm": 0.36093552394155415, + "learning_rate": 0.00015660289075862164, + "loss": 0.5978, + "step": 1029 + }, + { + "epoch": 0.3296, + "grad_norm": 0.37342315971667994, + "learning_rate": 0.00015651741388012432, + "loss": 0.6677, + "step": 1030 + }, + { + "epoch": 0.32992, + "grad_norm": 0.4098215899518884, + "learning_rate": 0.0001564318762846536, + "loss": 0.6449, + "step": 1031 + }, + { + "epoch": 0.33024, + "grad_norm": 0.4050113949889007, + "learning_rate": 0.00015634627806410296, + "loss": 0.6834, + "step": 1032 + }, + { + "epoch": 0.33056, + "grad_norm": 0.3539258347240515, + "learning_rate": 0.00015626061931043106, + "loss": 0.6924, + "step": 1033 + }, + { + "epoch": 0.33088, + "grad_norm": 0.3806068426402795, + "learning_rate": 0.0001561749001156616, + "loss": 0.6653, + "step": 1034 + }, + { + "epoch": 0.3312, + "grad_norm": 0.36582037767946063, + "learning_rate": 0.00015608912057188317, + "loss": 0.639, + "step": 1035 + }, + { + "epoch": 0.33152, + "grad_norm": 0.370538404130026, + "learning_rate": 0.0001560032807712492, + "loss": 0.6589, + "step": 1036 + }, + { + "epoch": 0.33184, + "grad_norm": 0.366378492392802, + "learning_rate": 0.0001559173808059779, + "loss": 0.6549, + "step": 1037 + }, + { + "epoch": 0.33216, + "grad_norm": 0.3522169174480128, + "learning_rate": 0.00015583142076835204, + "loss": 0.6711, + "step": 1038 + }, + { + "epoch": 0.33248, + "grad_norm": 0.34943426259617355, + "learning_rate": 0.000155745400750719, + "loss": 0.6653, + "step": 1039 + }, + { + "epoch": 0.3328, + "grad_norm": 0.36946892742706056, + "learning_rate": 0.00015565932084549058, + "loss": 0.6565, + "step": 1040 + }, + { + "epoch": 0.33312, + "grad_norm": 0.3793852309877599, + "learning_rate": 0.00015557318114514285, + "loss": 0.675, + "step": 1041 + }, + { + "epoch": 0.33344, + "grad_norm": 0.36173698458850406, + "learning_rate": 0.00015548698174221626, + "loss": 0.6613, + "step": 1042 + }, + { + "epoch": 0.33376, + "grad_norm": 0.33469803714633256, + "learning_rate": 0.00015540072272931518, + "loss": 0.6195, + "step": 1043 + }, + { + "epoch": 0.33408, + "grad_norm": 0.36164971660089984, + "learning_rate": 0.00015531440419910828, + "loss": 0.6679, + "step": 1044 + }, + { + "epoch": 0.3344, + "grad_norm": 0.3639937211916124, + "learning_rate": 0.00015522802624432796, + "loss": 0.6121, + "step": 1045 + }, + { + "epoch": 0.33472, + "grad_norm": 0.37109129267506613, + "learning_rate": 0.00015514158895777054, + "loss": 0.6926, + "step": 1046 + }, + { + "epoch": 0.33504, + "grad_norm": 0.3714485316690472, + "learning_rate": 0.00015505509243229614, + "loss": 0.6987, + "step": 1047 + }, + { + "epoch": 0.33536, + "grad_norm": 0.39142606845217615, + "learning_rate": 0.0001549685367608284, + "loss": 0.7089, + "step": 1048 + }, + { + "epoch": 0.33568, + "grad_norm": 0.36259030690561067, + "learning_rate": 0.0001548819220363546, + "loss": 0.6253, + "step": 1049 + }, + { + "epoch": 0.336, + "grad_norm": 0.35952691110741264, + "learning_rate": 0.0001547952483519254, + "loss": 0.6555, + "step": 1050 + }, + { + "epoch": 0.33632, + "grad_norm": 0.3548597404926695, + "learning_rate": 0.0001547085158006548, + "loss": 0.6308, + "step": 1051 + }, + { + "epoch": 0.33664, + "grad_norm": 0.38402158450852814, + "learning_rate": 0.00015462172447572013, + "loss": 0.634, + "step": 1052 + }, + { + "epoch": 0.33696, + "grad_norm": 0.34441548175393877, + "learning_rate": 0.00015453487447036172, + "loss": 0.6589, + "step": 1053 + }, + { + "epoch": 0.33728, + "grad_norm": 0.344323551106791, + "learning_rate": 0.00015444796587788307, + "loss": 0.6249, + "step": 1054 + }, + { + "epoch": 0.3376, + "grad_norm": 0.38643166482499186, + "learning_rate": 0.00015436099879165055, + "loss": 0.7023, + "step": 1055 + }, + { + "epoch": 0.33792, + "grad_norm": 0.3563266249988576, + "learning_rate": 0.0001542739733050934, + "loss": 0.6786, + "step": 1056 + }, + { + "epoch": 0.33824, + "grad_norm": 0.35397941503812475, + "learning_rate": 0.00015418688951170356, + "loss": 0.6575, + "step": 1057 + }, + { + "epoch": 0.33856, + "grad_norm": 0.3790624743469411, + "learning_rate": 0.00015409974750503564, + "loss": 0.7061, + "step": 1058 + }, + { + "epoch": 0.33888, + "grad_norm": 0.3574400160617647, + "learning_rate": 0.00015401254737870682, + "loss": 0.6708, + "step": 1059 + }, + { + "epoch": 0.3392, + "grad_norm": 0.36844606935547664, + "learning_rate": 0.00015392528922639662, + "loss": 0.6051, + "step": 1060 + }, + { + "epoch": 0.33952, + "grad_norm": 0.37658332480738516, + "learning_rate": 0.00015383797314184704, + "loss": 0.6514, + "step": 1061 + }, + { + "epoch": 0.33984, + "grad_norm": 0.38675895372917346, + "learning_rate": 0.00015375059921886213, + "loss": 0.6716, + "step": 1062 + }, + { + "epoch": 0.34016, + "grad_norm": 0.3421617167538485, + "learning_rate": 0.00015366316755130829, + "loss": 0.6229, + "step": 1063 + }, + { + "epoch": 0.34048, + "grad_norm": 0.35342929523103755, + "learning_rate": 0.0001535756782331138, + "loss": 0.6561, + "step": 1064 + }, + { + "epoch": 0.3408, + "grad_norm": 0.36779895789196637, + "learning_rate": 0.00015348813135826893, + "loss": 0.675, + "step": 1065 + }, + { + "epoch": 0.34112, + "grad_norm": 0.361010410788992, + "learning_rate": 0.00015340052702082576, + "loss": 0.6069, + "step": 1066 + }, + { + "epoch": 0.34144, + "grad_norm": 0.34595375035287296, + "learning_rate": 0.00015331286531489817, + "loss": 0.6734, + "step": 1067 + }, + { + "epoch": 0.34176, + "grad_norm": 0.3483443428908466, + "learning_rate": 0.00015322514633466154, + "loss": 0.642, + "step": 1068 + }, + { + "epoch": 0.34208, + "grad_norm": 0.3955374802065069, + "learning_rate": 0.00015313737017435294, + "loss": 0.7165, + "step": 1069 + }, + { + "epoch": 0.3424, + "grad_norm": 0.37830914007613364, + "learning_rate": 0.00015304953692827074, + "loss": 0.6725, + "step": 1070 + }, + { + "epoch": 0.34272, + "grad_norm": 0.36273349137274824, + "learning_rate": 0.0001529616466907747, + "loss": 0.6709, + "step": 1071 + }, + { + "epoch": 0.34304, + "grad_norm": 0.34624603419985345, + "learning_rate": 0.0001528736995562858, + "loss": 0.6273, + "step": 1072 + }, + { + "epoch": 0.34336, + "grad_norm": 0.3648264327047193, + "learning_rate": 0.00015278569561928614, + "loss": 0.6467, + "step": 1073 + }, + { + "epoch": 0.34368, + "grad_norm": 0.37493698819062815, + "learning_rate": 0.00015269763497431882, + "loss": 0.6715, + "step": 1074 + }, + { + "epoch": 0.344, + "grad_norm": 0.3777423462146128, + "learning_rate": 0.0001526095177159879, + "loss": 0.6988, + "step": 1075 + }, + { + "epoch": 0.34432, + "grad_norm": 0.34828002107038014, + "learning_rate": 0.00015252134393895826, + "loss": 0.6032, + "step": 1076 + }, + { + "epoch": 0.34464, + "grad_norm": 0.3711510117528022, + "learning_rate": 0.0001524331137379554, + "loss": 0.6086, + "step": 1077 + }, + { + "epoch": 0.34496, + "grad_norm": 0.36452581357905783, + "learning_rate": 0.00015234482720776564, + "loss": 0.6448, + "step": 1078 + }, + { + "epoch": 0.34528, + "grad_norm": 0.3980780639622504, + "learning_rate": 0.0001522564844432356, + "loss": 0.6608, + "step": 1079 + }, + { + "epoch": 0.3456, + "grad_norm": 0.40193983367291936, + "learning_rate": 0.00015216808553927247, + "loss": 0.6543, + "step": 1080 + }, + { + "epoch": 0.34592, + "grad_norm": 0.3447463363113517, + "learning_rate": 0.00015207963059084357, + "loss": 0.6593, + "step": 1081 + }, + { + "epoch": 0.34624, + "grad_norm": 0.35587151155889213, + "learning_rate": 0.00015199111969297672, + "loss": 0.6565, + "step": 1082 + }, + { + "epoch": 0.34656, + "grad_norm": 0.3657730208797738, + "learning_rate": 0.00015190255294075951, + "loss": 0.642, + "step": 1083 + }, + { + "epoch": 0.34688, + "grad_norm": 0.3761834500421161, + "learning_rate": 0.00015181393042933982, + "loss": 0.7161, + "step": 1084 + }, + { + "epoch": 0.3472, + "grad_norm": 0.368373353833742, + "learning_rate": 0.00015172525225392526, + "loss": 0.6403, + "step": 1085 + }, + { + "epoch": 0.34752, + "grad_norm": 0.36157521739080134, + "learning_rate": 0.00015163651850978323, + "loss": 0.6209, + "step": 1086 + }, + { + "epoch": 0.34784, + "grad_norm": 0.37804306769636536, + "learning_rate": 0.00015154772929224097, + "loss": 0.6342, + "step": 1087 + }, + { + "epoch": 0.34816, + "grad_norm": 0.3659468089331613, + "learning_rate": 0.0001514588846966852, + "loss": 0.6017, + "step": 1088 + }, + { + "epoch": 0.34848, + "grad_norm": 0.4005300627479996, + "learning_rate": 0.00015136998481856217, + "loss": 0.6926, + "step": 1089 + }, + { + "epoch": 0.3488, + "grad_norm": 0.36799319928078683, + "learning_rate": 0.00015128102975337751, + "loss": 0.6702, + "step": 1090 + }, + { + "epoch": 0.34912, + "grad_norm": 0.4246392386445891, + "learning_rate": 0.00015119201959669617, + "loss": 0.6892, + "step": 1091 + }, + { + "epoch": 0.34944, + "grad_norm": 0.3623757114099262, + "learning_rate": 0.00015110295444414223, + "loss": 0.6689, + "step": 1092 + }, + { + "epoch": 0.34976, + "grad_norm": 0.3893692892556752, + "learning_rate": 0.00015101383439139885, + "loss": 0.6879, + "step": 1093 + }, + { + "epoch": 0.35008, + "grad_norm": 0.33343620870386564, + "learning_rate": 0.00015092465953420826, + "loss": 0.6139, + "step": 1094 + }, + { + "epoch": 0.3504, + "grad_norm": 0.3431220788079396, + "learning_rate": 0.0001508354299683715, + "loss": 0.6167, + "step": 1095 + }, + { + "epoch": 0.35072, + "grad_norm": 0.3460788286937098, + "learning_rate": 0.00015074614578974838, + "loss": 0.6743, + "step": 1096 + }, + { + "epoch": 0.35104, + "grad_norm": 0.35435269334017727, + "learning_rate": 0.00015065680709425736, + "loss": 0.6402, + "step": 1097 + }, + { + "epoch": 0.35136, + "grad_norm": 0.3955384008580851, + "learning_rate": 0.00015056741397787552, + "loss": 0.7266, + "step": 1098 + }, + { + "epoch": 0.35168, + "grad_norm": 0.3720773510787015, + "learning_rate": 0.00015047796653663842, + "loss": 0.7068, + "step": 1099 + }, + { + "epoch": 0.352, + "grad_norm": 0.36846361069983113, + "learning_rate": 0.00015038846486663992, + "loss": 0.6619, + "step": 1100 + }, + { + "epoch": 0.35232, + "grad_norm": 0.3731832828827827, + "learning_rate": 0.00015029890906403216, + "loss": 0.655, + "step": 1101 + }, + { + "epoch": 0.35264, + "grad_norm": 0.36390205898133987, + "learning_rate": 0.00015020929922502542, + "loss": 0.6291, + "step": 1102 + }, + { + "epoch": 0.35296, + "grad_norm": 0.37820250573305875, + "learning_rate": 0.00015011963544588806, + "loss": 0.653, + "step": 1103 + }, + { + "epoch": 0.35328, + "grad_norm": 0.37319611633025596, + "learning_rate": 0.00015002991782294643, + "loss": 0.6602, + "step": 1104 + }, + { + "epoch": 0.3536, + "grad_norm": 0.38281627790075523, + "learning_rate": 0.00014994014645258462, + "loss": 0.7075, + "step": 1105 + }, + { + "epoch": 0.35392, + "grad_norm": 0.36365256193161577, + "learning_rate": 0.0001498503214312445, + "loss": 0.679, + "step": 1106 + }, + { + "epoch": 0.35424, + "grad_norm": 0.38319966869436156, + "learning_rate": 0.00014976044285542562, + "loss": 0.6504, + "step": 1107 + }, + { + "epoch": 0.35456, + "grad_norm": 0.3902505783573718, + "learning_rate": 0.00014967051082168505, + "loss": 0.658, + "step": 1108 + }, + { + "epoch": 0.35488, + "grad_norm": 0.38244811334158846, + "learning_rate": 0.00014958052542663727, + "loss": 0.6883, + "step": 1109 + }, + { + "epoch": 0.3552, + "grad_norm": 0.36380406935396126, + "learning_rate": 0.0001494904867669541, + "loss": 0.635, + "step": 1110 + }, + { + "epoch": 0.35552, + "grad_norm": 0.36506113091519193, + "learning_rate": 0.00014940039493936452, + "loss": 0.6451, + "step": 1111 + }, + { + "epoch": 0.35584, + "grad_norm": 0.36786582084111225, + "learning_rate": 0.00014931025004065476, + "loss": 0.6817, + "step": 1112 + }, + { + "epoch": 0.35616, + "grad_norm": 0.3352380674175264, + "learning_rate": 0.00014922005216766793, + "loss": 0.6073, + "step": 1113 + }, + { + "epoch": 0.35648, + "grad_norm": 0.3740820351794441, + "learning_rate": 0.00014912980141730412, + "loss": 0.667, + "step": 1114 + }, + { + "epoch": 0.3568, + "grad_norm": 0.3451447785259785, + "learning_rate": 0.00014903949788652024, + "loss": 0.6027, + "step": 1115 + }, + { + "epoch": 0.35712, + "grad_norm": 0.33993376634128447, + "learning_rate": 0.00014894914167232987, + "loss": 0.6378, + "step": 1116 + }, + { + "epoch": 0.35744, + "grad_norm": 0.36844012858525965, + "learning_rate": 0.00014885873287180318, + "loss": 0.6775, + "step": 1117 + }, + { + "epoch": 0.35776, + "grad_norm": 0.36975490164036573, + "learning_rate": 0.00014876827158206686, + "loss": 0.689, + "step": 1118 + }, + { + "epoch": 0.35808, + "grad_norm": 0.36327834327863406, + "learning_rate": 0.00014867775790030398, + "loss": 0.6936, + "step": 1119 + }, + { + "epoch": 0.3584, + "grad_norm": 0.33425609426282454, + "learning_rate": 0.00014858719192375387, + "loss": 0.6049, + "step": 1120 + }, + { + "epoch": 0.35872, + "grad_norm": 0.3655351336389169, + "learning_rate": 0.0001484965737497121, + "loss": 0.6658, + "step": 1121 + }, + { + "epoch": 0.35904, + "grad_norm": 0.3442095695084306, + "learning_rate": 0.00014840590347553028, + "loss": 0.6263, + "step": 1122 + }, + { + "epoch": 0.35936, + "grad_norm": 0.3464205548065845, + "learning_rate": 0.00014831518119861597, + "loss": 0.6626, + "step": 1123 + }, + { + "epoch": 0.35968, + "grad_norm": 0.40890607498456205, + "learning_rate": 0.0001482244070164326, + "loss": 0.7037, + "step": 1124 + }, + { + "epoch": 0.36, + "grad_norm": 0.3576923789465385, + "learning_rate": 0.00014813358102649943, + "loss": 0.601, + "step": 1125 + }, + { + "epoch": 0.36032, + "grad_norm": 0.3623456455719011, + "learning_rate": 0.00014804270332639133, + "loss": 0.7096, + "step": 1126 + }, + { + "epoch": 0.36064, + "grad_norm": 0.3543078306536569, + "learning_rate": 0.00014795177401373866, + "loss": 0.6231, + "step": 1127 + }, + { + "epoch": 0.36096, + "grad_norm": 0.37705917299442426, + "learning_rate": 0.00014786079318622732, + "loss": 0.703, + "step": 1128 + }, + { + "epoch": 0.36128, + "grad_norm": 0.36222471159585806, + "learning_rate": 0.00014776976094159854, + "loss": 0.6418, + "step": 1129 + }, + { + "epoch": 0.3616, + "grad_norm": 0.3633471700280255, + "learning_rate": 0.00014767867737764876, + "loss": 0.6537, + "step": 1130 + }, + { + "epoch": 0.36192, + "grad_norm": 0.362633231449119, + "learning_rate": 0.00014758754259222955, + "loss": 0.6712, + "step": 1131 + }, + { + "epoch": 0.36224, + "grad_norm": 0.34320157126623524, + "learning_rate": 0.00014749635668324755, + "loss": 0.6682, + "step": 1132 + }, + { + "epoch": 0.36256, + "grad_norm": 0.3666364646050061, + "learning_rate": 0.00014740511974866425, + "loss": 0.7158, + "step": 1133 + }, + { + "epoch": 0.36288, + "grad_norm": 0.3670979884963596, + "learning_rate": 0.00014731383188649596, + "loss": 0.6555, + "step": 1134 + }, + { + "epoch": 0.3632, + "grad_norm": 0.35817734403737456, + "learning_rate": 0.00014722249319481384, + "loss": 0.6652, + "step": 1135 + }, + { + "epoch": 0.36352, + "grad_norm": 0.3839821900635269, + "learning_rate": 0.00014713110377174356, + "loss": 0.6639, + "step": 1136 + }, + { + "epoch": 0.36384, + "grad_norm": 0.3388437054909136, + "learning_rate": 0.00014703966371546517, + "loss": 0.6246, + "step": 1137 + }, + { + "epoch": 0.36416, + "grad_norm": 0.35378792334740317, + "learning_rate": 0.0001469481731242133, + "loss": 0.6469, + "step": 1138 + }, + { + "epoch": 0.36448, + "grad_norm": 0.3755234393310129, + "learning_rate": 0.00014685663209627688, + "loss": 0.6616, + "step": 1139 + }, + { + "epoch": 0.3648, + "grad_norm": 0.3663473507265074, + "learning_rate": 0.0001467650407299988, + "loss": 0.6296, + "step": 1140 + }, + { + "epoch": 0.36512, + "grad_norm": 0.3489172688394357, + "learning_rate": 0.0001466733991237763, + "loss": 0.6181, + "step": 1141 + }, + { + "epoch": 0.36544, + "grad_norm": 0.34005200209869935, + "learning_rate": 0.00014658170737606038, + "loss": 0.5694, + "step": 1142 + }, + { + "epoch": 0.36576, + "grad_norm": 0.3480946148868714, + "learning_rate": 0.00014648996558535606, + "loss": 0.6509, + "step": 1143 + }, + { + "epoch": 0.36608, + "grad_norm": 0.38332042967451024, + "learning_rate": 0.00014639817385022206, + "loss": 0.6612, + "step": 1144 + }, + { + "epoch": 0.3664, + "grad_norm": 0.3764488223683972, + "learning_rate": 0.00014630633226927068, + "loss": 0.6569, + "step": 1145 + }, + { + "epoch": 0.36672, + "grad_norm": 0.35554693281053207, + "learning_rate": 0.00014621444094116792, + "loss": 0.6779, + "step": 1146 + }, + { + "epoch": 0.36704, + "grad_norm": 0.35042745424800764, + "learning_rate": 0.0001461224999646331, + "loss": 0.6335, + "step": 1147 + }, + { + "epoch": 0.36736, + "grad_norm": 0.37137182264251023, + "learning_rate": 0.00014603050943843898, + "loss": 0.6843, + "step": 1148 + }, + { + "epoch": 0.36768, + "grad_norm": 0.3805604460905857, + "learning_rate": 0.00014593846946141142, + "loss": 0.6908, + "step": 1149 + }, + { + "epoch": 0.368, + "grad_norm": 0.35950530577255946, + "learning_rate": 0.00014584638013242953, + "loss": 0.6292, + "step": 1150 + }, + { + "epoch": 0.36832, + "grad_norm": 0.3480550930098258, + "learning_rate": 0.00014575424155042536, + "loss": 0.6137, + "step": 1151 + }, + { + "epoch": 0.36864, + "grad_norm": 0.351338579024904, + "learning_rate": 0.00014566205381438395, + "loss": 0.6323, + "step": 1152 + }, + { + "epoch": 0.36896, + "grad_norm": 0.38514115687406036, + "learning_rate": 0.0001455698170233431, + "loss": 0.6585, + "step": 1153 + }, + { + "epoch": 0.36928, + "grad_norm": 0.38456616315153086, + "learning_rate": 0.00014547753127639324, + "loss": 0.6892, + "step": 1154 + }, + { + "epoch": 0.3696, + "grad_norm": 0.36196125122106154, + "learning_rate": 0.00014538519667267754, + "loss": 0.6207, + "step": 1155 + }, + { + "epoch": 0.36992, + "grad_norm": 0.34800232564616895, + "learning_rate": 0.00014529281331139153, + "loss": 0.6546, + "step": 1156 + }, + { + "epoch": 0.37024, + "grad_norm": 0.36055734006233836, + "learning_rate": 0.0001452003812917832, + "loss": 0.6532, + "step": 1157 + }, + { + "epoch": 0.37056, + "grad_norm": 0.4048817129042739, + "learning_rate": 0.00014510790071315278, + "loss": 0.6248, + "step": 1158 + }, + { + "epoch": 0.37088, + "grad_norm": 0.3607998948655766, + "learning_rate": 0.00014501537167485267, + "loss": 0.6351, + "step": 1159 + }, + { + "epoch": 0.3712, + "grad_norm": 0.3508315751082414, + "learning_rate": 0.0001449227942762873, + "loss": 0.6193, + "step": 1160 + }, + { + "epoch": 0.37152, + "grad_norm": 0.4014065812136743, + "learning_rate": 0.0001448301686169131, + "loss": 0.6222, + "step": 1161 + }, + { + "epoch": 0.37184, + "grad_norm": 0.3464640299726335, + "learning_rate": 0.0001447374947962384, + "loss": 0.6205, + "step": 1162 + }, + { + "epoch": 0.37216, + "grad_norm": 0.36150707547216376, + "learning_rate": 0.00014464477291382315, + "loss": 0.6773, + "step": 1163 + }, + { + "epoch": 0.37248, + "grad_norm": 0.37605695631058256, + "learning_rate": 0.00014455200306927893, + "loss": 0.6612, + "step": 1164 + }, + { + "epoch": 0.3728, + "grad_norm": 0.3605930599009112, + "learning_rate": 0.00014445918536226904, + "loss": 0.6628, + "step": 1165 + }, + { + "epoch": 0.37312, + "grad_norm": 0.37657130051925736, + "learning_rate": 0.00014436631989250793, + "loss": 0.622, + "step": 1166 + }, + { + "epoch": 0.37344, + "grad_norm": 0.3360705081746471, + "learning_rate": 0.00014427340675976158, + "loss": 0.6472, + "step": 1167 + }, + { + "epoch": 0.37376, + "grad_norm": 0.3692092099637814, + "learning_rate": 0.000144180446063847, + "loss": 0.643, + "step": 1168 + }, + { + "epoch": 0.37408, + "grad_norm": 0.3530391333825638, + "learning_rate": 0.00014408743790463247, + "loss": 0.6797, + "step": 1169 + }, + { + "epoch": 0.3744, + "grad_norm": 0.36097002707572645, + "learning_rate": 0.00014399438238203716, + "loss": 0.6374, + "step": 1170 + }, + { + "epoch": 0.37472, + "grad_norm": 0.35146178271190726, + "learning_rate": 0.00014390127959603108, + "loss": 0.6303, + "step": 1171 + }, + { + "epoch": 0.37504, + "grad_norm": 0.37091439055456177, + "learning_rate": 0.00014380812964663513, + "loss": 0.7094, + "step": 1172 + }, + { + "epoch": 0.37536, + "grad_norm": 0.36032936147731837, + "learning_rate": 0.0001437149326339208, + "loss": 0.7257, + "step": 1173 + }, + { + "epoch": 0.37568, + "grad_norm": 0.36897899886159996, + "learning_rate": 0.00014362168865801017, + "loss": 0.696, + "step": 1174 + }, + { + "epoch": 0.376, + "grad_norm": 0.3660038800899447, + "learning_rate": 0.00014352839781907578, + "loss": 0.6875, + "step": 1175 + }, + { + "epoch": 0.37632, + "grad_norm": 0.3358799316426415, + "learning_rate": 0.00014343506021734044, + "loss": 0.6353, + "step": 1176 + }, + { + "epoch": 0.37664, + "grad_norm": 0.3453952039192468, + "learning_rate": 0.00014334167595307732, + "loss": 0.6392, + "step": 1177 + }, + { + "epoch": 0.37696, + "grad_norm": 0.3493594546931393, + "learning_rate": 0.0001432482451266096, + "loss": 0.7301, + "step": 1178 + }, + { + "epoch": 0.37728, + "grad_norm": 0.35460348035497186, + "learning_rate": 0.0001431547678383106, + "loss": 0.6647, + "step": 1179 + }, + { + "epoch": 0.3776, + "grad_norm": 0.35492765885097327, + "learning_rate": 0.00014306124418860347, + "loss": 0.6736, + "step": 1180 + }, + { + "epoch": 0.37792, + "grad_norm": 0.34242203428065143, + "learning_rate": 0.00014296767427796116, + "loss": 0.6222, + "step": 1181 + }, + { + "epoch": 0.37824, + "grad_norm": 0.3733630052080278, + "learning_rate": 0.00014287405820690636, + "loss": 0.659, + "step": 1182 + }, + { + "epoch": 0.37856, + "grad_norm": 0.36465866936407954, + "learning_rate": 0.00014278039607601136, + "loss": 0.6454, + "step": 1183 + }, + { + "epoch": 0.37888, + "grad_norm": 0.35384191111172697, + "learning_rate": 0.00014268668798589793, + "loss": 0.6069, + "step": 1184 + }, + { + "epoch": 0.3792, + "grad_norm": 0.3512339262141367, + "learning_rate": 0.00014259293403723716, + "loss": 0.6114, + "step": 1185 + }, + { + "epoch": 0.37952, + "grad_norm": 0.34354806060619186, + "learning_rate": 0.0001424991343307494, + "loss": 0.6582, + "step": 1186 + }, + { + "epoch": 0.37984, + "grad_norm": 0.391948641056348, + "learning_rate": 0.0001424052889672043, + "loss": 0.6364, + "step": 1187 + }, + { + "epoch": 0.38016, + "grad_norm": 0.3587244271259095, + "learning_rate": 0.00014231139804742036, + "loss": 0.6301, + "step": 1188 + }, + { + "epoch": 0.38048, + "grad_norm": 0.369912259095861, + "learning_rate": 0.00014221746167226518, + "loss": 0.6767, + "step": 1189 + }, + { + "epoch": 0.3808, + "grad_norm": 0.36033411888946815, + "learning_rate": 0.00014212347994265508, + "loss": 0.6407, + "step": 1190 + }, + { + "epoch": 0.38112, + "grad_norm": 0.35168942156212024, + "learning_rate": 0.0001420294529595552, + "loss": 0.6869, + "step": 1191 + }, + { + "epoch": 0.38144, + "grad_norm": 0.366334501959747, + "learning_rate": 0.00014193538082397927, + "loss": 0.6365, + "step": 1192 + }, + { + "epoch": 0.38176, + "grad_norm": 0.3551506950560667, + "learning_rate": 0.0001418412636369895, + "loss": 0.6427, + "step": 1193 + }, + { + "epoch": 0.38208, + "grad_norm": 0.349252047418438, + "learning_rate": 0.00014174710149969646, + "loss": 0.6261, + "step": 1194 + }, + { + "epoch": 0.3824, + "grad_norm": 0.344027119598442, + "learning_rate": 0.00014165289451325907, + "loss": 0.6468, + "step": 1195 + }, + { + "epoch": 0.38272, + "grad_norm": 0.3517115187781731, + "learning_rate": 0.0001415586427788845, + "loss": 0.6103, + "step": 1196 + }, + { + "epoch": 0.38304, + "grad_norm": 0.3538346711167031, + "learning_rate": 0.00014146434639782782, + "loss": 0.6111, + "step": 1197 + }, + { + "epoch": 0.38336, + "grad_norm": 0.3643372781369026, + "learning_rate": 0.00014137000547139223, + "loss": 0.7294, + "step": 1198 + }, + { + "epoch": 0.38368, + "grad_norm": 0.35356490707573734, + "learning_rate": 0.00014127562010092865, + "loss": 0.6645, + "step": 1199 + }, + { + "epoch": 0.384, + "grad_norm": 0.35479395393239455, + "learning_rate": 0.00014118119038783588, + "loss": 0.6414, + "step": 1200 + }, + { + "epoch": 0.38432, + "grad_norm": 0.3724724382878393, + "learning_rate": 0.00014108671643356025, + "loss": 0.6267, + "step": 1201 + }, + { + "epoch": 0.38464, + "grad_norm": 0.3498487938229309, + "learning_rate": 0.00014099219833959564, + "loss": 0.5792, + "step": 1202 + }, + { + "epoch": 0.38496, + "grad_norm": 0.4074467207222055, + "learning_rate": 0.00014089763620748339, + "loss": 0.6823, + "step": 1203 + }, + { + "epoch": 0.38528, + "grad_norm": 0.3571418637491229, + "learning_rate": 0.00014080303013881207, + "loss": 0.6407, + "step": 1204 + }, + { + "epoch": 0.3856, + "grad_norm": 0.36884655527109417, + "learning_rate": 0.00014070838023521763, + "loss": 0.6547, + "step": 1205 + }, + { + "epoch": 0.38592, + "grad_norm": 0.3825316556449097, + "learning_rate": 0.00014061368659838293, + "loss": 0.6766, + "step": 1206 + }, + { + "epoch": 0.38624, + "grad_norm": 0.37688498463873227, + "learning_rate": 0.00014051894933003782, + "loss": 0.6741, + "step": 1207 + }, + { + "epoch": 0.38656, + "grad_norm": 0.3457593953938326, + "learning_rate": 0.00014042416853195914, + "loss": 0.645, + "step": 1208 + }, + { + "epoch": 0.38688, + "grad_norm": 0.3545257775885476, + "learning_rate": 0.0001403293443059704, + "loss": 0.6533, + "step": 1209 + }, + { + "epoch": 0.3872, + "grad_norm": 0.37147392400843676, + "learning_rate": 0.0001402344767539418, + "loss": 0.6434, + "step": 1210 + }, + { + "epoch": 0.38752, + "grad_norm": 0.34665788800852526, + "learning_rate": 0.0001401395659777901, + "loss": 0.6197, + "step": 1211 + }, + { + "epoch": 0.38784, + "grad_norm": 0.36470091984128866, + "learning_rate": 0.00014004461207947847, + "loss": 0.6977, + "step": 1212 + }, + { + "epoch": 0.38816, + "grad_norm": 0.40017899300612314, + "learning_rate": 0.00013994961516101642, + "loss": 0.6801, + "step": 1213 + }, + { + "epoch": 0.38848, + "grad_norm": 0.3508062826536708, + "learning_rate": 0.00013985457532445962, + "loss": 0.6162, + "step": 1214 + }, + { + "epoch": 0.3888, + "grad_norm": 0.3565330428544234, + "learning_rate": 0.00013975949267190996, + "loss": 0.6848, + "step": 1215 + }, + { + "epoch": 0.38912, + "grad_norm": 0.36592503414841004, + "learning_rate": 0.00013966436730551525, + "loss": 0.6225, + "step": 1216 + }, + { + "epoch": 0.38944, + "grad_norm": 0.3631019284931274, + "learning_rate": 0.00013956919932746914, + "loss": 0.6931, + "step": 1217 + }, + { + "epoch": 0.38976, + "grad_norm": 0.499376685599847, + "learning_rate": 0.00013947398884001121, + "loss": 0.6957, + "step": 1218 + }, + { + "epoch": 0.39008, + "grad_norm": 0.3669103263170906, + "learning_rate": 0.0001393787359454266, + "loss": 0.6645, + "step": 1219 + }, + { + "epoch": 0.3904, + "grad_norm": 0.36108966074941806, + "learning_rate": 0.00013928344074604597, + "loss": 0.6159, + "step": 1220 + }, + { + "epoch": 0.39072, + "grad_norm": 0.3310372149611766, + "learning_rate": 0.00013918810334424554, + "loss": 0.605, + "step": 1221 + }, + { + "epoch": 0.39104, + "grad_norm": 0.36391317255929906, + "learning_rate": 0.00013909272384244679, + "loss": 0.6329, + "step": 1222 + }, + { + "epoch": 0.39136, + "grad_norm": 0.36071028090097496, + "learning_rate": 0.00013899730234311644, + "loss": 0.6457, + "step": 1223 + }, + { + "epoch": 0.39168, + "grad_norm": 0.3606414251260815, + "learning_rate": 0.00013890183894876642, + "loss": 0.6474, + "step": 1224 + }, + { + "epoch": 0.392, + "grad_norm": 0.3855067095452181, + "learning_rate": 0.00013880633376195348, + "loss": 0.5796, + "step": 1225 + }, + { + "epoch": 0.39232, + "grad_norm": 0.3698567294505442, + "learning_rate": 0.00013871078688527943, + "loss": 0.6322, + "step": 1226 + }, + { + "epoch": 0.39264, + "grad_norm": 0.3999914294986726, + "learning_rate": 0.00013861519842139078, + "loss": 0.6893, + "step": 1227 + }, + { + "epoch": 0.39296, + "grad_norm": 0.3774958160029902, + "learning_rate": 0.00013851956847297882, + "loss": 0.6137, + "step": 1228 + }, + { + "epoch": 0.39328, + "grad_norm": 0.3527654104745092, + "learning_rate": 0.00013842389714277927, + "loss": 0.6006, + "step": 1229 + }, + { + "epoch": 0.3936, + "grad_norm": 0.3610566136205061, + "learning_rate": 0.0001383281845335724, + "loss": 0.6642, + "step": 1230 + }, + { + "epoch": 0.39392, + "grad_norm": 0.3663583619345716, + "learning_rate": 0.00013823243074818277, + "loss": 0.647, + "step": 1231 + }, + { + "epoch": 0.39424, + "grad_norm": 0.3504005929992112, + "learning_rate": 0.00013813663588947925, + "loss": 0.6456, + "step": 1232 + }, + { + "epoch": 0.39456, + "grad_norm": 0.3508202934329224, + "learning_rate": 0.00013804080006037478, + "loss": 0.6111, + "step": 1233 + }, + { + "epoch": 0.39488, + "grad_norm": 0.3447725408829163, + "learning_rate": 0.00013794492336382635, + "loss": 0.6204, + "step": 1234 + }, + { + "epoch": 0.3952, + "grad_norm": 0.36481364877718714, + "learning_rate": 0.00013784900590283473, + "loss": 0.6495, + "step": 1235 + }, + { + "epoch": 0.39552, + "grad_norm": 0.36043626294563597, + "learning_rate": 0.0001377530477804447, + "loss": 0.6193, + "step": 1236 + }, + { + "epoch": 0.39584, + "grad_norm": 0.47577918253085855, + "learning_rate": 0.0001376570490997446, + "loss": 0.6589, + "step": 1237 + }, + { + "epoch": 0.39616, + "grad_norm": 0.36035416220427785, + "learning_rate": 0.00013756100996386626, + "loss": 0.6718, + "step": 1238 + }, + { + "epoch": 0.39648, + "grad_norm": 0.3787175747447074, + "learning_rate": 0.00013746493047598512, + "loss": 0.616, + "step": 1239 + }, + { + "epoch": 0.3968, + "grad_norm": 0.35635790868036293, + "learning_rate": 0.00013736881073931993, + "loss": 0.6541, + "step": 1240 + }, + { + "epoch": 0.39712, + "grad_norm": 0.38499276580247926, + "learning_rate": 0.00013727265085713264, + "loss": 0.6556, + "step": 1241 + }, + { + "epoch": 0.39744, + "grad_norm": 0.3492776702835988, + "learning_rate": 0.00013717645093272833, + "loss": 0.6559, + "step": 1242 + }, + { + "epoch": 0.39776, + "grad_norm": 0.3321756495386628, + "learning_rate": 0.00013708021106945514, + "loss": 0.6064, + "step": 1243 + }, + { + "epoch": 0.39808, + "grad_norm": 0.3758795103904307, + "learning_rate": 0.00013698393137070403, + "loss": 0.6852, + "step": 1244 + }, + { + "epoch": 0.3984, + "grad_norm": 0.40288062578089195, + "learning_rate": 0.00013688761193990888, + "loss": 0.6303, + "step": 1245 + }, + { + "epoch": 0.39872, + "grad_norm": 0.3494164272649773, + "learning_rate": 0.00013679125288054621, + "loss": 0.6037, + "step": 1246 + }, + { + "epoch": 0.39904, + "grad_norm": 0.35293284207782044, + "learning_rate": 0.00013669485429613506, + "loss": 0.6114, + "step": 1247 + }, + { + "epoch": 0.39936, + "grad_norm": 0.3566926311089219, + "learning_rate": 0.00013659841629023696, + "loss": 0.6565, + "step": 1248 + }, + { + "epoch": 0.39968, + "grad_norm": 0.3573981006039202, + "learning_rate": 0.00013650193896645583, + "loss": 0.643, + "step": 1249 + }, + { + "epoch": 0.4, + "grad_norm": 0.3462914767129748, + "learning_rate": 0.00013640542242843778, + "loss": 0.5933, + "step": 1250 + }, + { + "epoch": 0.40032, + "grad_norm": 0.38179837226718827, + "learning_rate": 0.00013630886677987107, + "loss": 0.6943, + "step": 1251 + }, + { + "epoch": 0.40064, + "grad_norm": 0.36061348926406517, + "learning_rate": 0.00013621227212448598, + "loss": 0.636, + "step": 1252 + }, + { + "epoch": 0.40096, + "grad_norm": 0.3816641614097142, + "learning_rate": 0.00013611563856605463, + "loss": 0.6302, + "step": 1253 + }, + { + "epoch": 0.40128, + "grad_norm": 0.37524463793177804, + "learning_rate": 0.00013601896620839108, + "loss": 0.6351, + "step": 1254 + }, + { + "epoch": 0.4016, + "grad_norm": 0.3819320271157617, + "learning_rate": 0.00013592225515535094, + "loss": 0.6754, + "step": 1255 + }, + { + "epoch": 0.40192, + "grad_norm": 0.3635548982949283, + "learning_rate": 0.00013582550551083142, + "loss": 0.6361, + "step": 1256 + }, + { + "epoch": 0.40224, + "grad_norm": 0.3580769880672606, + "learning_rate": 0.0001357287173787712, + "loss": 0.6461, + "step": 1257 + }, + { + "epoch": 0.40256, + "grad_norm": 0.372916927434414, + "learning_rate": 0.0001356318908631504, + "loss": 0.6322, + "step": 1258 + }, + { + "epoch": 0.40288, + "grad_norm": 0.3976583442627668, + "learning_rate": 0.00013553502606799018, + "loss": 0.6387, + "step": 1259 + }, + { + "epoch": 0.4032, + "grad_norm": 0.36388274254846326, + "learning_rate": 0.00013543812309735296, + "loss": 0.6673, + "step": 1260 + }, + { + "epoch": 0.40352, + "grad_norm": 0.3617105015278976, + "learning_rate": 0.00013534118205534216, + "loss": 0.6036, + "step": 1261 + }, + { + "epoch": 0.40384, + "grad_norm": 0.3564492535214902, + "learning_rate": 0.0001352442030461021, + "loss": 0.6728, + "step": 1262 + }, + { + "epoch": 0.40416, + "grad_norm": 0.3875792443666019, + "learning_rate": 0.00013514718617381778, + "loss": 0.6609, + "step": 1263 + }, + { + "epoch": 0.40448, + "grad_norm": 0.3850308563040858, + "learning_rate": 0.0001350501315427151, + "loss": 0.6836, + "step": 1264 + }, + { + "epoch": 0.4048, + "grad_norm": 0.33943646109194714, + "learning_rate": 0.0001349530392570603, + "loss": 0.6344, + "step": 1265 + }, + { + "epoch": 0.40512, + "grad_norm": 0.3465323506995736, + "learning_rate": 0.00013485590942116017, + "loss": 0.556, + "step": 1266 + }, + { + "epoch": 0.40544, + "grad_norm": 0.3605078381500674, + "learning_rate": 0.00013475874213936189, + "loss": 0.5765, + "step": 1267 + }, + { + "epoch": 0.40576, + "grad_norm": 0.3554480016174343, + "learning_rate": 0.00013466153751605275, + "loss": 0.6432, + "step": 1268 + }, + { + "epoch": 0.40608, + "grad_norm": 0.33018202183452494, + "learning_rate": 0.00013456429565566027, + "loss": 0.6288, + "step": 1269 + }, + { + "epoch": 0.4064, + "grad_norm": 0.3690413857324501, + "learning_rate": 0.0001344670166626519, + "loss": 0.6723, + "step": 1270 + }, + { + "epoch": 0.40672, + "grad_norm": 0.4280733453069583, + "learning_rate": 0.000134369700641535, + "loss": 0.6293, + "step": 1271 + }, + { + "epoch": 0.40704, + "grad_norm": 0.34315319646574005, + "learning_rate": 0.00013427234769685674, + "loss": 0.6653, + "step": 1272 + }, + { + "epoch": 0.40736, + "grad_norm": 0.453466689810093, + "learning_rate": 0.0001341749579332039, + "loss": 0.659, + "step": 1273 + }, + { + "epoch": 0.40768, + "grad_norm": 0.35292056287522733, + "learning_rate": 0.00013407753145520287, + "loss": 0.6499, + "step": 1274 + }, + { + "epoch": 0.408, + "grad_norm": 0.38587237468717983, + "learning_rate": 0.00013398006836751945, + "loss": 0.6537, + "step": 1275 + }, + { + "epoch": 0.40832, + "grad_norm": 0.3529306070695278, + "learning_rate": 0.0001338825687748588, + "loss": 0.6817, + "step": 1276 + }, + { + "epoch": 0.40864, + "grad_norm": 0.36051844834341557, + "learning_rate": 0.00013378503278196522, + "loss": 0.6337, + "step": 1277 + }, + { + "epoch": 0.40896, + "grad_norm": 0.343321159940691, + "learning_rate": 0.00013368746049362225, + "loss": 0.6529, + "step": 1278 + }, + { + "epoch": 0.40928, + "grad_norm": 0.3351048408709637, + "learning_rate": 0.00013358985201465226, + "loss": 0.6232, + "step": 1279 + }, + { + "epoch": 0.4096, + "grad_norm": 0.3435219396633898, + "learning_rate": 0.00013349220744991665, + "loss": 0.6401, + "step": 1280 + }, + { + "epoch": 0.40992, + "grad_norm": 0.4067975516341677, + "learning_rate": 0.0001333945269043155, + "loss": 0.7252, + "step": 1281 + }, + { + "epoch": 0.41024, + "grad_norm": 0.35214924212694954, + "learning_rate": 0.0001332968104827876, + "loss": 0.6165, + "step": 1282 + }, + { + "epoch": 0.41056, + "grad_norm": 0.34850375228792707, + "learning_rate": 0.00013319905829031016, + "loss": 0.6418, + "step": 1283 + }, + { + "epoch": 0.41088, + "grad_norm": 0.34318201572025187, + "learning_rate": 0.000133101270431899, + "loss": 0.7001, + "step": 1284 + }, + { + "epoch": 0.4112, + "grad_norm": 0.3733743863546783, + "learning_rate": 0.0001330034470126081, + "loss": 0.6991, + "step": 1285 + }, + { + "epoch": 0.41152, + "grad_norm": 0.3770948893491864, + "learning_rate": 0.00013290558813752976, + "loss": 0.6659, + "step": 1286 + }, + { + "epoch": 0.41184, + "grad_norm": 0.35994327221867084, + "learning_rate": 0.00013280769391179427, + "loss": 0.6326, + "step": 1287 + }, + { + "epoch": 0.41216, + "grad_norm": 0.34861119605672714, + "learning_rate": 0.00013270976444056993, + "loss": 0.618, + "step": 1288 + }, + { + "epoch": 0.41248, + "grad_norm": 0.3568665716847094, + "learning_rate": 0.00013261179982906296, + "loss": 0.6713, + "step": 1289 + }, + { + "epoch": 0.4128, + "grad_norm": 0.36238510851647426, + "learning_rate": 0.0001325138001825173, + "loss": 0.6598, + "step": 1290 + }, + { + "epoch": 0.41312, + "grad_norm": 0.3366664810713671, + "learning_rate": 0.00013241576560621445, + "loss": 0.621, + "step": 1291 + }, + { + "epoch": 0.41344, + "grad_norm": 0.3669410714488144, + "learning_rate": 0.00013231769620547358, + "loss": 0.6661, + "step": 1292 + }, + { + "epoch": 0.41376, + "grad_norm": 0.34475722878964293, + "learning_rate": 0.00013221959208565114, + "loss": 0.6483, + "step": 1293 + }, + { + "epoch": 0.41408, + "grad_norm": 0.3460746296692432, + "learning_rate": 0.00013212145335214097, + "loss": 0.6528, + "step": 1294 + }, + { + "epoch": 0.4144, + "grad_norm": 0.3730770689420592, + "learning_rate": 0.00013202328011037404, + "loss": 0.6444, + "step": 1295 + }, + { + "epoch": 0.41472, + "grad_norm": 0.3568421824960019, + "learning_rate": 0.0001319250724658184, + "loss": 0.6464, + "step": 1296 + }, + { + "epoch": 0.41504, + "grad_norm": 0.3575319229223752, + "learning_rate": 0.0001318268305239791, + "loss": 0.6099, + "step": 1297 + }, + { + "epoch": 0.41536, + "grad_norm": 0.37343788399550126, + "learning_rate": 0.00013172855439039802, + "loss": 0.6049, + "step": 1298 + }, + { + "epoch": 0.41568, + "grad_norm": 0.3764856116892189, + "learning_rate": 0.0001316302441706537, + "loss": 0.6894, + "step": 1299 + }, + { + "epoch": 0.416, + "grad_norm": 0.3573642349959537, + "learning_rate": 0.00013153189997036142, + "loss": 0.661, + "step": 1300 + }, + { + "epoch": 0.41632, + "grad_norm": 0.36075971225401227, + "learning_rate": 0.00013143352189517283, + "loss": 0.6371, + "step": 1301 + }, + { + "epoch": 0.41664, + "grad_norm": 0.35425791072986146, + "learning_rate": 0.0001313351100507761, + "loss": 0.6725, + "step": 1302 + }, + { + "epoch": 0.41696, + "grad_norm": 0.34591639250099865, + "learning_rate": 0.00013123666454289566, + "loss": 0.7016, + "step": 1303 + }, + { + "epoch": 0.41728, + "grad_norm": 0.33614920683151617, + "learning_rate": 0.00013113818547729202, + "loss": 0.6235, + "step": 1304 + }, + { + "epoch": 0.4176, + "grad_norm": 0.3673677373910372, + "learning_rate": 0.00013103967295976179, + "loss": 0.6848, + "step": 1305 + }, + { + "epoch": 0.41792, + "grad_norm": 0.34566980754791926, + "learning_rate": 0.00013094112709613747, + "loss": 0.671, + "step": 1306 + }, + { + "epoch": 0.41824, + "grad_norm": 0.3303812462162933, + "learning_rate": 0.00013084254799228753, + "loss": 0.5806, + "step": 1307 + }, + { + "epoch": 0.41856, + "grad_norm": 0.34825215783206426, + "learning_rate": 0.000130743935754116, + "loss": 0.5705, + "step": 1308 + }, + { + "epoch": 0.41888, + "grad_norm": 0.35838984534257606, + "learning_rate": 0.00013064529048756256, + "loss": 0.6494, + "step": 1309 + }, + { + "epoch": 0.4192, + "grad_norm": 0.34119239755194375, + "learning_rate": 0.00013054661229860238, + "loss": 0.63, + "step": 1310 + }, + { + "epoch": 0.41952, + "grad_norm": 0.36455797899709114, + "learning_rate": 0.000130447901293246, + "loss": 0.6876, + "step": 1311 + }, + { + "epoch": 0.41984, + "grad_norm": 0.34262004353926406, + "learning_rate": 0.00013034915757753916, + "loss": 0.6085, + "step": 1312 + }, + { + "epoch": 0.42016, + "grad_norm": 0.3503190021222328, + "learning_rate": 0.00013025038125756284, + "loss": 0.606, + "step": 1313 + }, + { + "epoch": 0.42048, + "grad_norm": 0.3816539330707386, + "learning_rate": 0.0001301515724394329, + "loss": 0.6845, + "step": 1314 + }, + { + "epoch": 0.4208, + "grad_norm": 0.3594555903017935, + "learning_rate": 0.00013005273122930036, + "loss": 0.6854, + "step": 1315 + }, + { + "epoch": 0.42112, + "grad_norm": 0.34466991551002474, + "learning_rate": 0.00012995385773335074, + "loss": 0.6392, + "step": 1316 + }, + { + "epoch": 0.42144, + "grad_norm": 0.32219332502638137, + "learning_rate": 0.00012985495205780447, + "loss": 0.6103, + "step": 1317 + }, + { + "epoch": 0.42176, + "grad_norm": 0.3416425229543437, + "learning_rate": 0.00012975601430891643, + "loss": 0.6383, + "step": 1318 + }, + { + "epoch": 0.42208, + "grad_norm": 0.36489715660687144, + "learning_rate": 0.00012965704459297602, + "loss": 0.6809, + "step": 1319 + }, + { + "epoch": 0.4224, + "grad_norm": 0.346798826084252, + "learning_rate": 0.00012955804301630693, + "loss": 0.6624, + "step": 1320 + }, + { + "epoch": 0.42272, + "grad_norm": 0.36112835877244714, + "learning_rate": 0.00012945900968526716, + "loss": 0.6684, + "step": 1321 + }, + { + "epoch": 0.42304, + "grad_norm": 0.3652280722113376, + "learning_rate": 0.00012935994470624875, + "loss": 0.6076, + "step": 1322 + }, + { + "epoch": 0.42336, + "grad_norm": 0.3574492527504861, + "learning_rate": 0.0001292608481856777, + "loss": 0.6127, + "step": 1323 + }, + { + "epoch": 0.42368, + "grad_norm": 0.3504809839004475, + "learning_rate": 0.00012916172023001406, + "loss": 0.6577, + "step": 1324 + }, + { + "epoch": 0.424, + "grad_norm": 0.3844463042997439, + "learning_rate": 0.00012906256094575146, + "loss": 0.6172, + "step": 1325 + }, + { + "epoch": 0.42432, + "grad_norm": 0.3797990055410563, + "learning_rate": 0.0001289633704394173, + "loss": 0.6419, + "step": 1326 + }, + { + "epoch": 0.42464, + "grad_norm": 0.3868650002573214, + "learning_rate": 0.00012886414881757246, + "loss": 0.6701, + "step": 1327 + }, + { + "epoch": 0.42496, + "grad_norm": 0.3528851337849618, + "learning_rate": 0.00012876489618681135, + "loss": 0.6075, + "step": 1328 + }, + { + "epoch": 0.42528, + "grad_norm": 0.34703654994128025, + "learning_rate": 0.0001286656126537616, + "loss": 0.6267, + "step": 1329 + }, + { + "epoch": 0.4256, + "grad_norm": 0.34437613160416275, + "learning_rate": 0.00012856629832508408, + "loss": 0.6553, + "step": 1330 + }, + { + "epoch": 0.42592, + "grad_norm": 0.33037427360110766, + "learning_rate": 0.00012846695330747266, + "loss": 0.6217, + "step": 1331 + }, + { + "epoch": 0.42624, + "grad_norm": 0.3762704604148889, + "learning_rate": 0.00012836757770765434, + "loss": 0.6614, + "step": 1332 + }, + { + "epoch": 0.42656, + "grad_norm": 0.33931856387279397, + "learning_rate": 0.0001282681716323888, + "loss": 0.6262, + "step": 1333 + }, + { + "epoch": 0.42688, + "grad_norm": 0.3827684139348457, + "learning_rate": 0.00012816873518846863, + "loss": 0.6486, + "step": 1334 + }, + { + "epoch": 0.4272, + "grad_norm": 0.35601274758876245, + "learning_rate": 0.00012806926848271886, + "loss": 0.6637, + "step": 1335 + }, + { + "epoch": 0.42752, + "grad_norm": 0.3510290212188483, + "learning_rate": 0.00012796977162199717, + "loss": 0.5754, + "step": 1336 + }, + { + "epoch": 0.42784, + "grad_norm": 0.35443467930466266, + "learning_rate": 0.00012787024471319362, + "loss": 0.6089, + "step": 1337 + }, + { + "epoch": 0.42816, + "grad_norm": 0.3475015658292733, + "learning_rate": 0.0001277706878632305, + "loss": 0.6315, + "step": 1338 + }, + { + "epoch": 0.42848, + "grad_norm": 0.37814373328016304, + "learning_rate": 0.0001276711011790623, + "loss": 0.6634, + "step": 1339 + }, + { + "epoch": 0.4288, + "grad_norm": 0.3701372610710627, + "learning_rate": 0.00012757148476767553, + "loss": 0.6833, + "step": 1340 + }, + { + "epoch": 0.42912, + "grad_norm": 0.3387632680188932, + "learning_rate": 0.00012747183873608865, + "loss": 0.6667, + "step": 1341 + }, + { + "epoch": 0.42944, + "grad_norm": 0.35593204722664024, + "learning_rate": 0.00012737216319135198, + "loss": 0.6439, + "step": 1342 + }, + { + "epoch": 0.42976, + "grad_norm": 0.3772028607822921, + "learning_rate": 0.00012727245824054753, + "loss": 0.6426, + "step": 1343 + }, + { + "epoch": 0.43008, + "grad_norm": 0.3845568482838058, + "learning_rate": 0.00012717272399078884, + "loss": 0.6448, + "step": 1344 + }, + { + "epoch": 0.4304, + "grad_norm": 0.3447425244248575, + "learning_rate": 0.00012707296054922097, + "loss": 0.6513, + "step": 1345 + }, + { + "epoch": 0.43072, + "grad_norm": 0.33810265020031316, + "learning_rate": 0.00012697316802302036, + "loss": 0.6762, + "step": 1346 + }, + { + "epoch": 0.43104, + "grad_norm": 0.3475298850931401, + "learning_rate": 0.0001268733465193947, + "loss": 0.6061, + "step": 1347 + }, + { + "epoch": 0.43136, + "grad_norm": 0.3608790325277824, + "learning_rate": 0.0001267734961455828, + "loss": 0.6478, + "step": 1348 + }, + { + "epoch": 0.43168, + "grad_norm": 0.36508800193478946, + "learning_rate": 0.0001266736170088544, + "loss": 0.6109, + "step": 1349 + }, + { + "epoch": 0.432, + "grad_norm": 0.3383799099306968, + "learning_rate": 0.00012657370921651025, + "loss": 0.6334, + "step": 1350 + }, + { + "epoch": 0.43232, + "grad_norm": 0.35618257726364355, + "learning_rate": 0.00012647377287588186, + "loss": 0.6594, + "step": 1351 + }, + { + "epoch": 0.43264, + "grad_norm": 0.35177084347016874, + "learning_rate": 0.00012637380809433143, + "loss": 0.6484, + "step": 1352 + }, + { + "epoch": 0.43296, + "grad_norm": 0.3548705046761641, + "learning_rate": 0.00012627381497925163, + "loss": 0.6384, + "step": 1353 + }, + { + "epoch": 0.43328, + "grad_norm": 0.40688751781160165, + "learning_rate": 0.00012617379363806563, + "loss": 0.6618, + "step": 1354 + }, + { + "epoch": 0.4336, + "grad_norm": 0.3452045247527599, + "learning_rate": 0.00012607374417822695, + "loss": 0.6391, + "step": 1355 + }, + { + "epoch": 0.43392, + "grad_norm": 0.35552910751039707, + "learning_rate": 0.00012597366670721925, + "loss": 0.6074, + "step": 1356 + }, + { + "epoch": 0.43424, + "grad_norm": 0.3764260118454921, + "learning_rate": 0.00012587356133255632, + "loss": 0.6755, + "step": 1357 + }, + { + "epoch": 0.43456, + "grad_norm": 0.3533763806443953, + "learning_rate": 0.00012577342816178194, + "loss": 0.6562, + "step": 1358 + }, + { + "epoch": 0.43488, + "grad_norm": 0.3544995820110034, + "learning_rate": 0.0001256732673024697, + "loss": 0.6499, + "step": 1359 + }, + { + "epoch": 0.4352, + "grad_norm": 0.37073666265091326, + "learning_rate": 0.00012557307886222304, + "loss": 0.6247, + "step": 1360 + }, + { + "epoch": 0.43552, + "grad_norm": 0.37107082767575766, + "learning_rate": 0.00012547286294867487, + "loss": 0.5801, + "step": 1361 + }, + { + "epoch": 0.43584, + "grad_norm": 0.35227638320394655, + "learning_rate": 0.00012537261966948777, + "loss": 0.6513, + "step": 1362 + }, + { + "epoch": 0.43616, + "grad_norm": 0.3601658309861874, + "learning_rate": 0.00012527234913235362, + "loss": 0.6262, + "step": 1363 + }, + { + "epoch": 0.43648, + "grad_norm": 0.35008795436825585, + "learning_rate": 0.00012517205144499366, + "loss": 0.6378, + "step": 1364 + }, + { + "epoch": 0.4368, + "grad_norm": 0.346105470523686, + "learning_rate": 0.00012507172671515822, + "loss": 0.6192, + "step": 1365 + }, + { + "epoch": 0.43712, + "grad_norm": 0.3773640996777558, + "learning_rate": 0.00012497137505062674, + "loss": 0.6452, + "step": 1366 + }, + { + "epoch": 0.43744, + "grad_norm": 0.3468633481588172, + "learning_rate": 0.00012487099655920757, + "loss": 0.6582, + "step": 1367 + }, + { + "epoch": 0.43776, + "grad_norm": 0.38851105590023044, + "learning_rate": 0.00012477059134873784, + "loss": 0.6304, + "step": 1368 + }, + { + "epoch": 0.43808, + "grad_norm": 0.34867703247550724, + "learning_rate": 0.00012467015952708348, + "loss": 0.6342, + "step": 1369 + }, + { + "epoch": 0.4384, + "grad_norm": 0.3563014878857632, + "learning_rate": 0.00012456970120213896, + "loss": 0.6053, + "step": 1370 + }, + { + "epoch": 0.43872, + "grad_norm": 0.3449361759382152, + "learning_rate": 0.00012446921648182716, + "loss": 0.6923, + "step": 1371 + }, + { + "epoch": 0.43904, + "grad_norm": 0.3641615446986733, + "learning_rate": 0.00012436870547409944, + "loss": 0.6648, + "step": 1372 + }, + { + "epoch": 0.43936, + "grad_norm": 0.3395863365971903, + "learning_rate": 0.0001242681682869353, + "loss": 0.5922, + "step": 1373 + }, + { + "epoch": 0.43968, + "grad_norm": 0.3539280047765845, + "learning_rate": 0.00012416760502834243, + "loss": 0.6219, + "step": 1374 + }, + { + "epoch": 0.44, + "grad_norm": 0.34918862471192, + "learning_rate": 0.0001240670158063565, + "loss": 0.7103, + "step": 1375 + }, + { + "epoch": 0.44032, + "grad_norm": 0.3595735599927838, + "learning_rate": 0.00012396640072904103, + "loss": 0.5902, + "step": 1376 + }, + { + "epoch": 0.44064, + "grad_norm": 0.35856148872984034, + "learning_rate": 0.00012386575990448742, + "loss": 0.6201, + "step": 1377 + }, + { + "epoch": 0.44096, + "grad_norm": 0.37700670637280187, + "learning_rate": 0.0001237650934408146, + "loss": 0.6418, + "step": 1378 + }, + { + "epoch": 0.44128, + "grad_norm": 0.36989058657011725, + "learning_rate": 0.00012366440144616917, + "loss": 0.6457, + "step": 1379 + }, + { + "epoch": 0.4416, + "grad_norm": 0.4419841347795627, + "learning_rate": 0.0001235636840287251, + "loss": 0.6498, + "step": 1380 + }, + { + "epoch": 0.44192, + "grad_norm": 0.34323897155294086, + "learning_rate": 0.00012346294129668366, + "loss": 0.6242, + "step": 1381 + }, + { + "epoch": 0.44224, + "grad_norm": 0.34643679938270927, + "learning_rate": 0.0001233621733582733, + "loss": 0.6126, + "step": 1382 + }, + { + "epoch": 0.44256, + "grad_norm": 0.3356235258047478, + "learning_rate": 0.00012326138032174965, + "loss": 0.6364, + "step": 1383 + }, + { + "epoch": 0.44288, + "grad_norm": 0.3575902692954344, + "learning_rate": 0.00012316056229539518, + "loss": 0.6369, + "step": 1384 + }, + { + "epoch": 0.4432, + "grad_norm": 0.3393711534808123, + "learning_rate": 0.00012305971938751924, + "loss": 0.6429, + "step": 1385 + }, + { + "epoch": 0.44352, + "grad_norm": 0.3684626498108965, + "learning_rate": 0.00012295885170645796, + "loss": 0.5705, + "step": 1386 + }, + { + "epoch": 0.44384, + "grad_norm": 0.3423815982347849, + "learning_rate": 0.00012285795936057406, + "loss": 0.6181, + "step": 1387 + }, + { + "epoch": 0.44416, + "grad_norm": 0.3656618249888552, + "learning_rate": 0.00012275704245825678, + "loss": 0.6802, + "step": 1388 + }, + { + "epoch": 0.44448, + "grad_norm": 0.3490014887116999, + "learning_rate": 0.0001226561011079216, + "loss": 0.6106, + "step": 1389 + }, + { + "epoch": 0.4448, + "grad_norm": 0.3429221228928354, + "learning_rate": 0.00012255513541801049, + "loss": 0.6224, + "step": 1390 + }, + { + "epoch": 0.44512, + "grad_norm": 0.3491161411693488, + "learning_rate": 0.00012245414549699144, + "loss": 0.5897, + "step": 1391 + }, + { + "epoch": 0.44544, + "grad_norm": 0.3623171659441398, + "learning_rate": 0.00012235313145335844, + "loss": 0.6423, + "step": 1392 + }, + { + "epoch": 0.44576, + "grad_norm": 0.3420727006074005, + "learning_rate": 0.00012225209339563145, + "loss": 0.6036, + "step": 1393 + }, + { + "epoch": 0.44608, + "grad_norm": 0.3512269788663449, + "learning_rate": 0.00012215103143235623, + "loss": 0.6378, + "step": 1394 + }, + { + "epoch": 0.4464, + "grad_norm": 0.3466087062962386, + "learning_rate": 0.00012204994567210426, + "loss": 0.583, + "step": 1395 + }, + { + "epoch": 0.44672, + "grad_norm": 0.38119716890420635, + "learning_rate": 0.00012194883622347246, + "loss": 0.6351, + "step": 1396 + }, + { + "epoch": 0.44704, + "grad_norm": 0.3625595031674755, + "learning_rate": 0.0001218477031950833, + "loss": 0.621, + "step": 1397 + }, + { + "epoch": 0.44736, + "grad_norm": 0.351603026050947, + "learning_rate": 0.00012174654669558454, + "loss": 0.6253, + "step": 1398 + }, + { + "epoch": 0.44768, + "grad_norm": 0.34827861710726954, + "learning_rate": 0.00012164536683364925, + "loss": 0.6171, + "step": 1399 + }, + { + "epoch": 0.448, + "grad_norm": 0.3602366774986206, + "learning_rate": 0.00012154416371797543, + "loss": 0.6534, + "step": 1400 + }, + { + "epoch": 0.44832, + "grad_norm": 0.35683792396738884, + "learning_rate": 0.0001214429374572862, + "loss": 0.6668, + "step": 1401 + }, + { + "epoch": 0.44864, + "grad_norm": 0.34068186682117463, + "learning_rate": 0.00012134168816032949, + "loss": 0.6373, + "step": 1402 + }, + { + "epoch": 0.44896, + "grad_norm": 0.343090577485209, + "learning_rate": 0.00012124041593587798, + "loss": 0.6159, + "step": 1403 + }, + { + "epoch": 0.44928, + "grad_norm": 0.34224599156117047, + "learning_rate": 0.00012113912089272898, + "loss": 0.6572, + "step": 1404 + }, + { + "epoch": 0.4496, + "grad_norm": 0.33193464365254555, + "learning_rate": 0.00012103780313970435, + "loss": 0.6303, + "step": 1405 + }, + { + "epoch": 0.44992, + "grad_norm": 0.369626764959218, + "learning_rate": 0.00012093646278565029, + "loss": 0.6286, + "step": 1406 + }, + { + "epoch": 0.45024, + "grad_norm": 0.35471716045787755, + "learning_rate": 0.00012083509993943732, + "loss": 0.646, + "step": 1407 + }, + { + "epoch": 0.45056, + "grad_norm": 0.3582100147411708, + "learning_rate": 0.00012073371470996009, + "loss": 0.6869, + "step": 1408 + }, + { + "epoch": 0.45088, + "grad_norm": 0.37186706964832433, + "learning_rate": 0.00012063230720613734, + "loss": 0.6737, + "step": 1409 + }, + { + "epoch": 0.4512, + "grad_norm": 0.3191968885135222, + "learning_rate": 0.00012053087753691172, + "loss": 0.5869, + "step": 1410 + }, + { + "epoch": 0.45152, + "grad_norm": 0.3691993077316804, + "learning_rate": 0.00012042942581124967, + "loss": 0.6264, + "step": 1411 + }, + { + "epoch": 0.45184, + "grad_norm": 0.3695780460379041, + "learning_rate": 0.00012032795213814136, + "loss": 0.6712, + "step": 1412 + }, + { + "epoch": 0.45216, + "grad_norm": 0.3568156626276025, + "learning_rate": 0.00012022645662660054, + "loss": 0.6089, + "step": 1413 + }, + { + "epoch": 0.45248, + "grad_norm": 0.3655528196906202, + "learning_rate": 0.0001201249393856644, + "loss": 0.6389, + "step": 1414 + }, + { + "epoch": 0.4528, + "grad_norm": 0.3386700688822135, + "learning_rate": 0.00012002340052439345, + "loss": 0.6517, + "step": 1415 + }, + { + "epoch": 0.45312, + "grad_norm": 0.3449331101654647, + "learning_rate": 0.00011992184015187145, + "loss": 0.6082, + "step": 1416 + }, + { + "epoch": 0.45344, + "grad_norm": 0.35254729205610263, + "learning_rate": 0.00011982025837720532, + "loss": 0.6636, + "step": 1417 + }, + { + "epoch": 0.45376, + "grad_norm": 0.35069952676512073, + "learning_rate": 0.00011971865530952491, + "loss": 0.5966, + "step": 1418 + }, + { + "epoch": 0.45408, + "grad_norm": 0.3515586909700109, + "learning_rate": 0.00011961703105798297, + "loss": 0.653, + "step": 1419 + }, + { + "epoch": 0.4544, + "grad_norm": 0.35351295595603843, + "learning_rate": 0.00011951538573175494, + "loss": 0.6445, + "step": 1420 + }, + { + "epoch": 0.45472, + "grad_norm": 0.37666401087814305, + "learning_rate": 0.00011941371944003905, + "loss": 0.6309, + "step": 1421 + }, + { + "epoch": 0.45504, + "grad_norm": 0.3469615941342529, + "learning_rate": 0.00011931203229205596, + "loss": 0.6541, + "step": 1422 + }, + { + "epoch": 0.45536, + "grad_norm": 0.35931076286043323, + "learning_rate": 0.00011921032439704867, + "loss": 0.646, + "step": 1423 + }, + { + "epoch": 0.45568, + "grad_norm": 0.3481688003934372, + "learning_rate": 0.00011910859586428258, + "loss": 0.649, + "step": 1424 + }, + { + "epoch": 0.456, + "grad_norm": 0.3953700777295994, + "learning_rate": 0.00011900684680304521, + "loss": 0.6735, + "step": 1425 + }, + { + "epoch": 0.45632, + "grad_norm": 0.35507200417141105, + "learning_rate": 0.00011890507732264616, + "loss": 0.5973, + "step": 1426 + }, + { + "epoch": 0.45664, + "grad_norm": 0.4579588714968065, + "learning_rate": 0.00011880328753241694, + "loss": 0.6464, + "step": 1427 + }, + { + "epoch": 0.45696, + "grad_norm": 0.3526760816498687, + "learning_rate": 0.00011870147754171093, + "loss": 0.6487, + "step": 1428 + }, + { + "epoch": 0.45728, + "grad_norm": 0.3613568385566994, + "learning_rate": 0.00011859964745990308, + "loss": 0.6025, + "step": 1429 + }, + { + "epoch": 0.4576, + "grad_norm": 0.3604373311550373, + "learning_rate": 0.00011849779739639012, + "loss": 0.6658, + "step": 1430 + }, + { + "epoch": 0.45792, + "grad_norm": 0.36857102682555254, + "learning_rate": 0.00011839592746059008, + "loss": 0.6973, + "step": 1431 + }, + { + "epoch": 0.45824, + "grad_norm": 0.37162056867268184, + "learning_rate": 0.0001182940377619424, + "loss": 0.6365, + "step": 1432 + }, + { + "epoch": 0.45856, + "grad_norm": 0.33902705479978135, + "learning_rate": 0.00011819212840990778, + "loss": 0.6301, + "step": 1433 + }, + { + "epoch": 0.45888, + "grad_norm": 0.37413568959543353, + "learning_rate": 0.00011809019951396799, + "loss": 0.6239, + "step": 1434 + }, + { + "epoch": 0.4592, + "grad_norm": 0.35477794355157105, + "learning_rate": 0.00011798825118362582, + "loss": 0.6425, + "step": 1435 + }, + { + "epoch": 0.45952, + "grad_norm": 0.3522078087107167, + "learning_rate": 0.00011788628352840494, + "loss": 0.6418, + "step": 1436 + }, + { + "epoch": 0.45984, + "grad_norm": 0.35103797118150487, + "learning_rate": 0.00011778429665784978, + "loss": 0.6072, + "step": 1437 + }, + { + "epoch": 0.46016, + "grad_norm": 0.3353757349855207, + "learning_rate": 0.00011768229068152532, + "loss": 0.6002, + "step": 1438 + }, + { + "epoch": 0.46048, + "grad_norm": 0.38094559653695437, + "learning_rate": 0.00011758026570901726, + "loss": 0.6438, + "step": 1439 + }, + { + "epoch": 0.4608, + "grad_norm": 0.3822728591127505, + "learning_rate": 0.00011747822184993153, + "loss": 0.6757, + "step": 1440 + }, + { + "epoch": 0.46112, + "grad_norm": 0.3507043929942193, + "learning_rate": 0.00011737615921389444, + "loss": 0.6217, + "step": 1441 + }, + { + "epoch": 0.46144, + "grad_norm": 0.3410123569919182, + "learning_rate": 0.00011727407791055244, + "loss": 0.6435, + "step": 1442 + }, + { + "epoch": 0.46176, + "grad_norm": 0.34433236456279837, + "learning_rate": 0.00011717197804957207, + "loss": 0.6401, + "step": 1443 + }, + { + "epoch": 0.46208, + "grad_norm": 0.3455912774447328, + "learning_rate": 0.00011706985974063978, + "loss": 0.5887, + "step": 1444 + }, + { + "epoch": 0.4624, + "grad_norm": 0.3875625719562774, + "learning_rate": 0.00011696772309346182, + "loss": 0.6422, + "step": 1445 + }, + { + "epoch": 0.46272, + "grad_norm": 0.3654056864366787, + "learning_rate": 0.00011686556821776415, + "loss": 0.7212, + "step": 1446 + }, + { + "epoch": 0.46304, + "grad_norm": 0.35367123382665144, + "learning_rate": 0.00011676339522329232, + "loss": 0.6469, + "step": 1447 + }, + { + "epoch": 0.46336, + "grad_norm": 0.3338090449098278, + "learning_rate": 0.00011666120421981139, + "loss": 0.5892, + "step": 1448 + }, + { + "epoch": 0.46368, + "grad_norm": 0.3587477742861889, + "learning_rate": 0.0001165589953171057, + "loss": 0.6474, + "step": 1449 + }, + { + "epoch": 0.464, + "grad_norm": 0.3910356686835087, + "learning_rate": 0.00011645676862497885, + "loss": 0.662, + "step": 1450 + }, + { + "epoch": 0.46432, + "grad_norm": 0.3688443749662049, + "learning_rate": 0.00011635452425325355, + "loss": 0.6625, + "step": 1451 + }, + { + "epoch": 0.46464, + "grad_norm": 0.3621249395050262, + "learning_rate": 0.00011625226231177149, + "loss": 0.581, + "step": 1452 + }, + { + "epoch": 0.46496, + "grad_norm": 0.34116364664587356, + "learning_rate": 0.00011614998291039326, + "loss": 0.608, + "step": 1453 + }, + { + "epoch": 0.46528, + "grad_norm": 0.3321749101489626, + "learning_rate": 0.00011604768615899817, + "loss": 0.6658, + "step": 1454 + }, + { + "epoch": 0.4656, + "grad_norm": 0.34336578026686626, + "learning_rate": 0.0001159453721674842, + "loss": 0.6447, + "step": 1455 + }, + { + "epoch": 0.46592, + "grad_norm": 0.3390950292435595, + "learning_rate": 0.00011584304104576781, + "loss": 0.6386, + "step": 1456 + }, + { + "epoch": 0.46624, + "grad_norm": 0.33614897882321054, + "learning_rate": 0.00011574069290378398, + "loss": 0.5837, + "step": 1457 + }, + { + "epoch": 0.46656, + "grad_norm": 0.34076010531929685, + "learning_rate": 0.00011563832785148583, + "loss": 0.6049, + "step": 1458 + }, + { + "epoch": 0.46688, + "grad_norm": 0.3673040159836117, + "learning_rate": 0.00011553594599884471, + "loss": 0.6478, + "step": 1459 + }, + { + "epoch": 0.4672, + "grad_norm": 0.37603155481592304, + "learning_rate": 0.00011543354745585003, + "loss": 0.6341, + "step": 1460 + }, + { + "epoch": 0.46752, + "grad_norm": 0.32945147104457057, + "learning_rate": 0.00011533113233250911, + "loss": 0.5852, + "step": 1461 + }, + { + "epoch": 0.46784, + "grad_norm": 0.3516200094308783, + "learning_rate": 0.0001152287007388471, + "loss": 0.6567, + "step": 1462 + }, + { + "epoch": 0.46816, + "grad_norm": 0.36357213082683193, + "learning_rate": 0.00011512625278490683, + "loss": 0.6643, + "step": 1463 + }, + { + "epoch": 0.46848, + "grad_norm": 0.43220321793729743, + "learning_rate": 0.00011502378858074869, + "loss": 0.614, + "step": 1464 + }, + { + "epoch": 0.4688, + "grad_norm": 0.34989334390190063, + "learning_rate": 0.00011492130823645056, + "loss": 0.6317, + "step": 1465 + }, + { + "epoch": 0.46912, + "grad_norm": 0.34628728884582216, + "learning_rate": 0.00011481881186210765, + "loss": 0.6291, + "step": 1466 + }, + { + "epoch": 0.46944, + "grad_norm": 0.3294803821248172, + "learning_rate": 0.00011471629956783239, + "loss": 0.6026, + "step": 1467 + }, + { + "epoch": 0.46976, + "grad_norm": 0.35180452368672444, + "learning_rate": 0.0001146137714637543, + "loss": 0.6349, + "step": 1468 + }, + { + "epoch": 0.47008, + "grad_norm": 0.3855999411847994, + "learning_rate": 0.00011451122766001987, + "loss": 0.6258, + "step": 1469 + }, + { + "epoch": 0.4704, + "grad_norm": 0.343574916980757, + "learning_rate": 0.00011440866826679254, + "loss": 0.6301, + "step": 1470 + }, + { + "epoch": 0.47072, + "grad_norm": 0.3523512323973055, + "learning_rate": 0.0001143060933942524, + "loss": 0.6183, + "step": 1471 + }, + { + "epoch": 0.47104, + "grad_norm": 0.39126196332369406, + "learning_rate": 0.00011420350315259622, + "loss": 0.6002, + "step": 1472 + }, + { + "epoch": 0.47136, + "grad_norm": 0.38312920437104464, + "learning_rate": 0.00011410089765203724, + "loss": 0.6444, + "step": 1473 + }, + { + "epoch": 0.47168, + "grad_norm": 0.3406849317885865, + "learning_rate": 0.00011399827700280519, + "loss": 0.6166, + "step": 1474 + }, + { + "epoch": 0.472, + "grad_norm": 0.35156970174795626, + "learning_rate": 0.00011389564131514596, + "loss": 0.648, + "step": 1475 + }, + { + "epoch": 0.47232, + "grad_norm": 0.3622574065950502, + "learning_rate": 0.00011379299069932165, + "loss": 0.5963, + "step": 1476 + }, + { + "epoch": 0.47264, + "grad_norm": 0.3479842152568595, + "learning_rate": 0.00011369032526561039, + "loss": 0.5882, + "step": 1477 + }, + { + "epoch": 0.47296, + "grad_norm": 0.3329170605694178, + "learning_rate": 0.00011358764512430622, + "loss": 0.5978, + "step": 1478 + }, + { + "epoch": 0.47328, + "grad_norm": 0.3861496322294955, + "learning_rate": 0.000113484950385719, + "loss": 0.6236, + "step": 1479 + }, + { + "epoch": 0.4736, + "grad_norm": 0.3529911339843644, + "learning_rate": 0.00011338224116017423, + "loss": 0.6618, + "step": 1480 + }, + { + "epoch": 0.47392, + "grad_norm": 0.3392940403089316, + "learning_rate": 0.00011327951755801307, + "loss": 0.6002, + "step": 1481 + }, + { + "epoch": 0.47424, + "grad_norm": 0.34836953650491914, + "learning_rate": 0.000113176779689592, + "loss": 0.6337, + "step": 1482 + }, + { + "epoch": 0.47456, + "grad_norm": 0.34293545389884234, + "learning_rate": 0.00011307402766528293, + "loss": 0.6376, + "step": 1483 + }, + { + "epoch": 0.47488, + "grad_norm": 0.34734856793025476, + "learning_rate": 0.0001129712615954729, + "loss": 0.5877, + "step": 1484 + }, + { + "epoch": 0.4752, + "grad_norm": 0.35392265174991977, + "learning_rate": 0.00011286848159056409, + "loss": 0.6429, + "step": 1485 + }, + { + "epoch": 0.47552, + "grad_norm": 0.3586802859631239, + "learning_rate": 0.0001127656877609736, + "loss": 0.6371, + "step": 1486 + }, + { + "epoch": 0.47584, + "grad_norm": 0.3891543667036396, + "learning_rate": 0.00011266288021713347, + "loss": 0.6663, + "step": 1487 + }, + { + "epoch": 0.47616, + "grad_norm": 0.34439856456847573, + "learning_rate": 0.00011256005906949041, + "loss": 0.6127, + "step": 1488 + }, + { + "epoch": 0.47648, + "grad_norm": 0.3634753787510308, + "learning_rate": 0.0001124572244285057, + "loss": 0.6373, + "step": 1489 + }, + { + "epoch": 0.4768, + "grad_norm": 0.3406557813425304, + "learning_rate": 0.00011235437640465522, + "loss": 0.6313, + "step": 1490 + }, + { + "epoch": 0.47712, + "grad_norm": 0.3512969725436879, + "learning_rate": 0.00011225151510842917, + "loss": 0.6322, + "step": 1491 + }, + { + "epoch": 0.47744, + "grad_norm": 0.35554341878234125, + "learning_rate": 0.00011214864065033202, + "loss": 0.6267, + "step": 1492 + }, + { + "epoch": 0.47776, + "grad_norm": 0.34908971553433915, + "learning_rate": 0.00011204575314088233, + "loss": 0.6171, + "step": 1493 + }, + { + "epoch": 0.47808, + "grad_norm": 0.4069273132712551, + "learning_rate": 0.00011194285269061277, + "loss": 0.6421, + "step": 1494 + }, + { + "epoch": 0.4784, + "grad_norm": 0.34053587415385045, + "learning_rate": 0.00011183993941006983, + "loss": 0.6132, + "step": 1495 + }, + { + "epoch": 0.47872, + "grad_norm": 0.34548132317073693, + "learning_rate": 0.00011173701340981386, + "loss": 0.6181, + "step": 1496 + }, + { + "epoch": 0.47904, + "grad_norm": 0.33489113586140046, + "learning_rate": 0.00011163407480041885, + "loss": 0.563, + "step": 1497 + }, + { + "epoch": 0.47936, + "grad_norm": 0.3530909412452826, + "learning_rate": 0.0001115311236924723, + "loss": 0.6428, + "step": 1498 + }, + { + "epoch": 0.47968, + "grad_norm": 0.34212937452029313, + "learning_rate": 0.00011142816019657514, + "loss": 0.6316, + "step": 1499 + }, + { + "epoch": 0.48, + "grad_norm": 0.3821487759207229, + "learning_rate": 0.00011132518442334161, + "loss": 0.626, + "step": 1500 + }, + { + "epoch": 0.48032, + "grad_norm": 0.3373358912775132, + "learning_rate": 0.00011122219648339924, + "loss": 0.6226, + "step": 1501 + }, + { + "epoch": 0.48064, + "grad_norm": 0.342414496716076, + "learning_rate": 0.00011111919648738851, + "loss": 0.6716, + "step": 1502 + }, + { + "epoch": 0.48096, + "grad_norm": 0.3999028830058103, + "learning_rate": 0.00011101618454596287, + "loss": 0.6415, + "step": 1503 + }, + { + "epoch": 0.48128, + "grad_norm": 0.3407819948407285, + "learning_rate": 0.00011091316076978866, + "loss": 0.6295, + "step": 1504 + }, + { + "epoch": 0.4816, + "grad_norm": 0.34683982320339674, + "learning_rate": 0.00011081012526954486, + "loss": 0.6432, + "step": 1505 + }, + { + "epoch": 0.48192, + "grad_norm": 0.3742143463070764, + "learning_rate": 0.0001107070781559231, + "loss": 0.6295, + "step": 1506 + }, + { + "epoch": 0.48224, + "grad_norm": 0.34628225365856147, + "learning_rate": 0.00011060401953962748, + "loss": 0.6281, + "step": 1507 + }, + { + "epoch": 0.48256, + "grad_norm": 0.3506858097180935, + "learning_rate": 0.00011050094953137444, + "loss": 0.6436, + "step": 1508 + }, + { + "epoch": 0.48288, + "grad_norm": 0.34512210750820227, + "learning_rate": 0.00011039786824189263, + "loss": 0.6678, + "step": 1509 + }, + { + "epoch": 0.4832, + "grad_norm": 0.34878405937210083, + "learning_rate": 0.00011029477578192291, + "loss": 0.6399, + "step": 1510 + }, + { + "epoch": 0.48352, + "grad_norm": 0.36579689331874526, + "learning_rate": 0.00011019167226221808, + "loss": 0.6664, + "step": 1511 + }, + { + "epoch": 0.48384, + "grad_norm": 0.36442956929914655, + "learning_rate": 0.00011008855779354281, + "loss": 0.6621, + "step": 1512 + }, + { + "epoch": 0.48416, + "grad_norm": 0.3490222252368435, + "learning_rate": 0.00010998543248667352, + "loss": 0.6165, + "step": 1513 + }, + { + "epoch": 0.48448, + "grad_norm": 0.3293433339691941, + "learning_rate": 0.00010988229645239836, + "loss": 0.6435, + "step": 1514 + }, + { + "epoch": 0.4848, + "grad_norm": 0.3593454046175193, + "learning_rate": 0.00010977914980151691, + "loss": 0.6223, + "step": 1515 + }, + { + "epoch": 0.48512, + "grad_norm": 0.3714878582059504, + "learning_rate": 0.00010967599264484024, + "loss": 0.6899, + "step": 1516 + }, + { + "epoch": 0.48544, + "grad_norm": 0.3390951918633382, + "learning_rate": 0.00010957282509319056, + "loss": 0.5989, + "step": 1517 + }, + { + "epoch": 0.48576, + "grad_norm": 0.3650563201215915, + "learning_rate": 0.00010946964725740145, + "loss": 0.6377, + "step": 1518 + }, + { + "epoch": 0.48608, + "grad_norm": 0.3384077845207565, + "learning_rate": 0.0001093664592483174, + "loss": 0.6301, + "step": 1519 + }, + { + "epoch": 0.4864, + "grad_norm": 0.3365703675980268, + "learning_rate": 0.00010926326117679388, + "loss": 0.6149, + "step": 1520 + }, + { + "epoch": 0.48672, + "grad_norm": 0.3509719189816956, + "learning_rate": 0.00010916005315369713, + "loss": 0.6384, + "step": 1521 + }, + { + "epoch": 0.48704, + "grad_norm": 0.34555108446779503, + "learning_rate": 0.00010905683528990406, + "loss": 0.5871, + "step": 1522 + }, + { + "epoch": 0.48736, + "grad_norm": 0.33944086739768, + "learning_rate": 0.0001089536076963023, + "loss": 0.614, + "step": 1523 + }, + { + "epoch": 0.48768, + "grad_norm": 0.3470535535643965, + "learning_rate": 0.00010885037048378977, + "loss": 0.6223, + "step": 1524 + }, + { + "epoch": 0.488, + "grad_norm": 0.36763022084148006, + "learning_rate": 0.00010874712376327481, + "loss": 0.6508, + "step": 1525 + }, + { + "epoch": 0.48832, + "grad_norm": 0.35657647304209655, + "learning_rate": 0.00010864386764567588, + "loss": 0.6309, + "step": 1526 + }, + { + "epoch": 0.48864, + "grad_norm": 0.3591461764707076, + "learning_rate": 0.00010854060224192171, + "loss": 0.6358, + "step": 1527 + }, + { + "epoch": 0.48896, + "grad_norm": 0.48561248508741794, + "learning_rate": 0.0001084373276629508, + "loss": 0.6348, + "step": 1528 + }, + { + "epoch": 0.48928, + "grad_norm": 0.3937691425705274, + "learning_rate": 0.0001083340440197117, + "loss": 0.6763, + "step": 1529 + }, + { + "epoch": 0.4896, + "grad_norm": 0.3694699208895203, + "learning_rate": 0.00010823075142316254, + "loss": 0.6427, + "step": 1530 + }, + { + "epoch": 0.48992, + "grad_norm": 0.3340047751988796, + "learning_rate": 0.00010812744998427113, + "loss": 0.687, + "step": 1531 + }, + { + "epoch": 0.49024, + "grad_norm": 0.3189005679326198, + "learning_rate": 0.00010802413981401483, + "loss": 0.5775, + "step": 1532 + }, + { + "epoch": 0.49056, + "grad_norm": 0.3325406920938598, + "learning_rate": 0.0001079208210233803, + "loss": 0.6136, + "step": 1533 + }, + { + "epoch": 0.49088, + "grad_norm": 0.3355965863247086, + "learning_rate": 0.00010781749372336352, + "loss": 0.6077, + "step": 1534 + }, + { + "epoch": 0.4912, + "grad_norm": 0.33799882207111426, + "learning_rate": 0.00010771415802496955, + "loss": 0.6723, + "step": 1535 + }, + { + "epoch": 0.49152, + "grad_norm": 0.3363724588202362, + "learning_rate": 0.00010761081403921254, + "loss": 0.6255, + "step": 1536 + }, + { + "epoch": 0.49184, + "grad_norm": 0.34712555670684897, + "learning_rate": 0.00010750746187711549, + "loss": 0.5984, + "step": 1537 + }, + { + "epoch": 0.49216, + "grad_norm": 0.35267736141035483, + "learning_rate": 0.00010740410164971019, + "loss": 0.654, + "step": 1538 + }, + { + "epoch": 0.49248, + "grad_norm": 0.342106239419953, + "learning_rate": 0.00010730073346803713, + "loss": 0.5809, + "step": 1539 + }, + { + "epoch": 0.4928, + "grad_norm": 0.35593989300415674, + "learning_rate": 0.00010719735744314534, + "loss": 0.6922, + "step": 1540 + }, + { + "epoch": 0.49312, + "grad_norm": 0.337651959706354, + "learning_rate": 0.00010709397368609227, + "loss": 0.636, + "step": 1541 + }, + { + "epoch": 0.49344, + "grad_norm": 0.33531385524069957, + "learning_rate": 0.00010699058230794361, + "loss": 0.6228, + "step": 1542 + }, + { + "epoch": 0.49376, + "grad_norm": 0.3859082536656671, + "learning_rate": 0.00010688718341977336, + "loss": 0.6683, + "step": 1543 + }, + { + "epoch": 0.49408, + "grad_norm": 0.3485561180895661, + "learning_rate": 0.00010678377713266345, + "loss": 0.5995, + "step": 1544 + }, + { + "epoch": 0.4944, + "grad_norm": 0.34372235738037266, + "learning_rate": 0.0001066803635577039, + "loss": 0.5981, + "step": 1545 + }, + { + "epoch": 0.49472, + "grad_norm": 0.33920912386132973, + "learning_rate": 0.00010657694280599249, + "loss": 0.6103, + "step": 1546 + }, + { + "epoch": 0.49504, + "grad_norm": 0.3414769670888569, + "learning_rate": 0.00010647351498863464, + "loss": 0.6016, + "step": 1547 + }, + { + "epoch": 0.49536, + "grad_norm": 0.3428845304428137, + "learning_rate": 0.00010637008021674351, + "loss": 0.6115, + "step": 1548 + }, + { + "epoch": 0.49568, + "grad_norm": 0.3444076882469626, + "learning_rate": 0.00010626663860143962, + "loss": 0.6308, + "step": 1549 + }, + { + "epoch": 0.496, + "grad_norm": 0.35318799940328427, + "learning_rate": 0.00010616319025385089, + "loss": 0.6047, + "step": 1550 + }, + { + "epoch": 0.49632, + "grad_norm": 0.3552816630964503, + "learning_rate": 0.00010605973528511241, + "loss": 0.6381, + "step": 1551 + }, + { + "epoch": 0.49664, + "grad_norm": 0.3686555284800937, + "learning_rate": 0.0001059562738063665, + "loss": 0.6443, + "step": 1552 + }, + { + "epoch": 0.49696, + "grad_norm": 0.37288971430522566, + "learning_rate": 0.00010585280592876233, + "loss": 0.6661, + "step": 1553 + }, + { + "epoch": 0.49728, + "grad_norm": 0.34161361243175975, + "learning_rate": 0.00010574933176345607, + "loss": 0.6363, + "step": 1554 + }, + { + "epoch": 0.4976, + "grad_norm": 0.35552535199216256, + "learning_rate": 0.0001056458514216106, + "loss": 0.6575, + "step": 1555 + }, + { + "epoch": 0.49792, + "grad_norm": 0.3475335229702711, + "learning_rate": 0.0001055423650143954, + "loss": 0.6406, + "step": 1556 + }, + { + "epoch": 0.49824, + "grad_norm": 0.3574275794821226, + "learning_rate": 0.00010543887265298651, + "loss": 0.6583, + "step": 1557 + }, + { + "epoch": 0.49856, + "grad_norm": 0.3405013725914652, + "learning_rate": 0.00010533537444856636, + "loss": 0.5741, + "step": 1558 + }, + { + "epoch": 0.49888, + "grad_norm": 0.32599557351795333, + "learning_rate": 0.00010523187051232361, + "loss": 0.5653, + "step": 1559 + }, + { + "epoch": 0.4992, + "grad_norm": 0.35905268027963827, + "learning_rate": 0.00010512836095545318, + "loss": 0.6315, + "step": 1560 + }, + { + "epoch": 0.49952, + "grad_norm": 0.34767210200709264, + "learning_rate": 0.00010502484588915591, + "loss": 0.6434, + "step": 1561 + }, + { + "epoch": 0.49984, + "grad_norm": 0.34094109716070703, + "learning_rate": 0.00010492132542463866, + "loss": 0.6337, + "step": 1562 + }, + { + "epoch": 0.50016, + "grad_norm": 0.35053766408601883, + "learning_rate": 0.000104817799673114, + "loss": 0.6426, + "step": 1563 + }, + { + "epoch": 0.50048, + "grad_norm": 0.35446862941735086, + "learning_rate": 0.00010471426874580023, + "loss": 0.6024, + "step": 1564 + }, + { + "epoch": 0.5008, + "grad_norm": 0.3514784473335216, + "learning_rate": 0.00010461073275392124, + "loss": 0.6033, + "step": 1565 + }, + { + "epoch": 0.50112, + "grad_norm": 0.3518396437748681, + "learning_rate": 0.00010450719180870625, + "loss": 0.6208, + "step": 1566 + }, + { + "epoch": 0.50144, + "grad_norm": 0.33915986199309217, + "learning_rate": 0.00010440364602138997, + "loss": 0.6363, + "step": 1567 + }, + { + "epoch": 0.50176, + "grad_norm": 0.32840816260361627, + "learning_rate": 0.00010430009550321216, + "loss": 0.6207, + "step": 1568 + }, + { + "epoch": 0.50208, + "grad_norm": 0.34851701116767964, + "learning_rate": 0.00010419654036541773, + "loss": 0.6527, + "step": 1569 + }, + { + "epoch": 0.5024, + "grad_norm": 0.3526953113108774, + "learning_rate": 0.0001040929807192565, + "loss": 0.6297, + "step": 1570 + }, + { + "epoch": 0.50272, + "grad_norm": 0.3406735973567741, + "learning_rate": 0.00010398941667598328, + "loss": 0.6308, + "step": 1571 + }, + { + "epoch": 0.50304, + "grad_norm": 0.34907120340842374, + "learning_rate": 0.00010388584834685744, + "loss": 0.6219, + "step": 1572 + }, + { + "epoch": 0.50336, + "grad_norm": 0.34831451268479496, + "learning_rate": 0.000103782275843143, + "loss": 0.6516, + "step": 1573 + }, + { + "epoch": 0.50368, + "grad_norm": 0.33467870310641157, + "learning_rate": 0.00010367869927610849, + "loss": 0.618, + "step": 1574 + }, + { + "epoch": 0.504, + "grad_norm": 0.34355951534504026, + "learning_rate": 0.0001035751187570268, + "loss": 0.6188, + "step": 1575 + }, + { + "epoch": 0.50432, + "grad_norm": 0.3456240834891537, + "learning_rate": 0.0001034715343971751, + "loss": 0.6338, + "step": 1576 + }, + { + "epoch": 0.50464, + "grad_norm": 0.33625577000451945, + "learning_rate": 0.00010336794630783457, + "loss": 0.648, + "step": 1577 + }, + { + "epoch": 0.50496, + "grad_norm": 0.37623334603388203, + "learning_rate": 0.00010326435460029052, + "loss": 0.6795, + "step": 1578 + }, + { + "epoch": 0.50528, + "grad_norm": 0.34629085630422224, + "learning_rate": 0.00010316075938583206, + "loss": 0.6097, + "step": 1579 + }, + { + "epoch": 0.5056, + "grad_norm": 0.34475625719241837, + "learning_rate": 0.00010305716077575215, + "loss": 0.6303, + "step": 1580 + }, + { + "epoch": 0.50592, + "grad_norm": 0.3480966711092534, + "learning_rate": 0.00010295355888134737, + "loss": 0.6121, + "step": 1581 + }, + { + "epoch": 0.50624, + "grad_norm": 0.32857632761879896, + "learning_rate": 0.00010284995381391776, + "loss": 0.579, + "step": 1582 + }, + { + "epoch": 0.50656, + "grad_norm": 0.3671680513839694, + "learning_rate": 0.00010274634568476687, + "loss": 0.6944, + "step": 1583 + }, + { + "epoch": 0.50688, + "grad_norm": 0.33882969595327056, + "learning_rate": 0.00010264273460520144, + "loss": 0.6094, + "step": 1584 + }, + { + "epoch": 0.5072, + "grad_norm": 0.34246469015874414, + "learning_rate": 0.00010253912068653146, + "loss": 0.6132, + "step": 1585 + }, + { + "epoch": 0.50752, + "grad_norm": 0.3430856300357972, + "learning_rate": 0.00010243550404006998, + "loss": 0.6277, + "step": 1586 + }, + { + "epoch": 0.50784, + "grad_norm": 0.3637497325407064, + "learning_rate": 0.00010233188477713289, + "loss": 0.5708, + "step": 1587 + }, + { + "epoch": 0.50816, + "grad_norm": 0.36489311092522864, + "learning_rate": 0.00010222826300903896, + "loss": 0.6436, + "step": 1588 + }, + { + "epoch": 0.50848, + "grad_norm": 0.347896629202345, + "learning_rate": 0.00010212463884710963, + "loss": 0.6708, + "step": 1589 + }, + { + "epoch": 0.5088, + "grad_norm": 0.3987583220077781, + "learning_rate": 0.00010202101240266893, + "loss": 0.6293, + "step": 1590 + }, + { + "epoch": 0.50912, + "grad_norm": 0.34133588390452735, + "learning_rate": 0.00010191738378704332, + "loss": 0.6026, + "step": 1591 + }, + { + "epoch": 0.50944, + "grad_norm": 0.34308336912178355, + "learning_rate": 0.00010181375311156157, + "loss": 0.6212, + "step": 1592 + }, + { + "epoch": 0.50976, + "grad_norm": 0.3546976501552943, + "learning_rate": 0.00010171012048755472, + "loss": 0.6723, + "step": 1593 + }, + { + "epoch": 0.51008, + "grad_norm": 0.3452054980769207, + "learning_rate": 0.0001016064860263559, + "loss": 0.6097, + "step": 1594 + }, + { + "epoch": 0.5104, + "grad_norm": 0.3634600412889473, + "learning_rate": 0.00010150284983930016, + "loss": 0.6639, + "step": 1595 + }, + { + "epoch": 0.51072, + "grad_norm": 0.3790210411846465, + "learning_rate": 0.00010139921203772446, + "loss": 0.6021, + "step": 1596 + }, + { + "epoch": 0.51104, + "grad_norm": 0.36305368367853424, + "learning_rate": 0.00010129557273296741, + "loss": 0.6586, + "step": 1597 + }, + { + "epoch": 0.51136, + "grad_norm": 0.34875358539597456, + "learning_rate": 0.00010119193203636939, + "loss": 0.6376, + "step": 1598 + }, + { + "epoch": 0.51168, + "grad_norm": 0.35160684463697656, + "learning_rate": 0.0001010882900592721, + "loss": 0.6172, + "step": 1599 + }, + { + "epoch": 0.512, + "grad_norm": 0.3510462536991429, + "learning_rate": 0.00010098464691301873, + "loss": 0.6245, + "step": 1600 + }, + { + "epoch": 0.51232, + "grad_norm": 0.3517323450812867, + "learning_rate": 0.00010088100270895364, + "loss": 0.6318, + "step": 1601 + }, + { + "epoch": 0.51264, + "grad_norm": 0.34741836930236275, + "learning_rate": 0.00010077735755842249, + "loss": 0.6483, + "step": 1602 + }, + { + "epoch": 0.51296, + "grad_norm": 0.3445827873634932, + "learning_rate": 0.00010067371157277172, + "loss": 0.6363, + "step": 1603 + }, + { + "epoch": 0.51328, + "grad_norm": 0.340521729146734, + "learning_rate": 0.00010057006486334886, + "loss": 0.6328, + "step": 1604 + }, + { + "epoch": 0.5136, + "grad_norm": 0.3442250946520639, + "learning_rate": 0.00010046641754150214, + "loss": 0.5945, + "step": 1605 + }, + { + "epoch": 0.51392, + "grad_norm": 0.34561763070868484, + "learning_rate": 0.00010036276971858043, + "loss": 0.6182, + "step": 1606 + }, + { + "epoch": 0.51424, + "grad_norm": 0.6792569101809796, + "learning_rate": 0.0001002591215059332, + "loss": 0.6232, + "step": 1607 + }, + { + "epoch": 0.51456, + "grad_norm": 0.32707872563052687, + "learning_rate": 0.00010015547301491029, + "loss": 0.6305, + "step": 1608 + }, + { + "epoch": 0.51488, + "grad_norm": 0.3418782185323137, + "learning_rate": 0.00010005182435686185, + "loss": 0.6282, + "step": 1609 + }, + { + "epoch": 0.5152, + "grad_norm": 0.3277883216323344, + "learning_rate": 9.994817564313819e-05, + "loss": 0.5916, + "step": 1610 + }, + { + "epoch": 0.51552, + "grad_norm": 0.3758405996209342, + "learning_rate": 9.984452698508976e-05, + "loss": 0.5882, + "step": 1611 + }, + { + "epoch": 0.51584, + "grad_norm": 0.35309424359566505, + "learning_rate": 9.974087849406683e-05, + "loss": 0.6298, + "step": 1612 + }, + { + "epoch": 0.51616, + "grad_norm": 0.35958186876854126, + "learning_rate": 9.963723028141958e-05, + "loss": 0.6351, + "step": 1613 + }, + { + "epoch": 0.51648, + "grad_norm": 0.35877317848063567, + "learning_rate": 9.953358245849791e-05, + "loss": 0.593, + "step": 1614 + }, + { + "epoch": 0.5168, + "grad_norm": 0.3470855461805342, + "learning_rate": 9.942993513665115e-05, + "loss": 0.6275, + "step": 1615 + }, + { + "epoch": 0.51712, + "grad_norm": 0.35562411903011554, + "learning_rate": 9.932628842722833e-05, + "loss": 0.6568, + "step": 1616 + }, + { + "epoch": 0.51744, + "grad_norm": 0.3374542824126831, + "learning_rate": 9.922264244157755e-05, + "loss": 0.6127, + "step": 1617 + }, + { + "epoch": 0.51776, + "grad_norm": 0.4641883257339624, + "learning_rate": 9.911899729104636e-05, + "loss": 0.6304, + "step": 1618 + }, + { + "epoch": 0.51808, + "grad_norm": 0.36001288775377394, + "learning_rate": 9.901535308698134e-05, + "loss": 0.6493, + "step": 1619 + }, + { + "epoch": 0.5184, + "grad_norm": 0.34512170873528053, + "learning_rate": 9.891170994072793e-05, + "loss": 0.6653, + "step": 1620 + }, + { + "epoch": 0.51872, + "grad_norm": 0.34124183509899075, + "learning_rate": 9.880806796363062e-05, + "loss": 0.6237, + "step": 1621 + }, + { + "epoch": 0.51904, + "grad_norm": 0.35446878909065016, + "learning_rate": 9.870442726703261e-05, + "loss": 0.6246, + "step": 1622 + }, + { + "epoch": 0.51936, + "grad_norm": 0.3255396906990857, + "learning_rate": 9.860078796227556e-05, + "loss": 0.5546, + "step": 1623 + }, + { + "epoch": 0.51968, + "grad_norm": 0.3243820177044248, + "learning_rate": 9.849715016069986e-05, + "loss": 0.639, + "step": 1624 + }, + { + "epoch": 0.52, + "grad_norm": 0.3497987341346313, + "learning_rate": 9.839351397364411e-05, + "loss": 0.5908, + "step": 1625 + }, + { + "epoch": 0.52032, + "grad_norm": 0.3611728666287869, + "learning_rate": 9.828987951244528e-05, + "loss": 0.6432, + "step": 1626 + }, + { + "epoch": 0.52064, + "grad_norm": 0.3514356489710369, + "learning_rate": 9.818624688843846e-05, + "loss": 0.6276, + "step": 1627 + }, + { + "epoch": 0.52096, + "grad_norm": 0.34513254592759046, + "learning_rate": 9.808261621295672e-05, + "loss": 0.6102, + "step": 1628 + }, + { + "epoch": 0.52128, + "grad_norm": 0.33850586628037443, + "learning_rate": 9.79789875973311e-05, + "loss": 0.6, + "step": 1629 + }, + { + "epoch": 0.5216, + "grad_norm": 0.3518783954718028, + "learning_rate": 9.787536115289038e-05, + "loss": 0.6132, + "step": 1630 + }, + { + "epoch": 0.52192, + "grad_norm": 0.3375530884733763, + "learning_rate": 9.777173699096107e-05, + "loss": 0.5779, + "step": 1631 + }, + { + "epoch": 0.52224, + "grad_norm": 0.3405406523913444, + "learning_rate": 9.766811522286712e-05, + "loss": 0.6513, + "step": 1632 + }, + { + "epoch": 0.52256, + "grad_norm": 0.3618874965518611, + "learning_rate": 9.756449595993004e-05, + "loss": 0.6352, + "step": 1633 + }, + { + "epoch": 0.52288, + "grad_norm": 0.36369634483609947, + "learning_rate": 9.746087931346852e-05, + "loss": 0.6321, + "step": 1634 + }, + { + "epoch": 0.5232, + "grad_norm": 0.3338864944394306, + "learning_rate": 9.73572653947986e-05, + "loss": 0.5672, + "step": 1635 + }, + { + "epoch": 0.52352, + "grad_norm": 0.34625940166135283, + "learning_rate": 9.725365431523315e-05, + "loss": 0.6156, + "step": 1636 + }, + { + "epoch": 0.52384, + "grad_norm": 0.3340723236290614, + "learning_rate": 9.715004618608228e-05, + "loss": 0.6129, + "step": 1637 + }, + { + "epoch": 0.52416, + "grad_norm": 0.3402868286590741, + "learning_rate": 9.704644111865265e-05, + "loss": 0.6148, + "step": 1638 + }, + { + "epoch": 0.52448, + "grad_norm": 0.3459932283985027, + "learning_rate": 9.694283922424784e-05, + "loss": 0.6356, + "step": 1639 + }, + { + "epoch": 0.5248, + "grad_norm": 0.33908552214619375, + "learning_rate": 9.683924061416797e-05, + "loss": 0.6038, + "step": 1640 + }, + { + "epoch": 0.52512, + "grad_norm": 0.3572073149458451, + "learning_rate": 9.673564539970951e-05, + "loss": 0.6583, + "step": 1641 + }, + { + "epoch": 0.52544, + "grad_norm": 0.3643586303290633, + "learning_rate": 9.663205369216548e-05, + "loss": 0.6052, + "step": 1642 + }, + { + "epoch": 0.52576, + "grad_norm": 0.3388210109305512, + "learning_rate": 9.652846560282494e-05, + "loss": 0.5858, + "step": 1643 + }, + { + "epoch": 0.52608, + "grad_norm": 0.3550710451875338, + "learning_rate": 9.64248812429732e-05, + "loss": 0.6887, + "step": 1644 + }, + { + "epoch": 0.5264, + "grad_norm": 0.379058341960421, + "learning_rate": 9.632130072389152e-05, + "loss": 0.6832, + "step": 1645 + }, + { + "epoch": 0.52672, + "grad_norm": 0.33756503615840416, + "learning_rate": 9.621772415685703e-05, + "loss": 0.6563, + "step": 1646 + }, + { + "epoch": 0.52704, + "grad_norm": 0.3514272125488278, + "learning_rate": 9.61141516531426e-05, + "loss": 0.6819, + "step": 1647 + }, + { + "epoch": 0.52736, + "grad_norm": 0.3621922446497168, + "learning_rate": 9.601058332401673e-05, + "loss": 0.6365, + "step": 1648 + }, + { + "epoch": 0.52768, + "grad_norm": 0.3521641110158746, + "learning_rate": 9.590701928074348e-05, + "loss": 0.6375, + "step": 1649 + }, + { + "epoch": 0.528, + "grad_norm": 0.34077196030814916, + "learning_rate": 9.580345963458233e-05, + "loss": 0.6038, + "step": 1650 + }, + { + "epoch": 0.52832, + "grad_norm": 0.3435477626400518, + "learning_rate": 9.569990449678787e-05, + "loss": 0.6136, + "step": 1651 + }, + { + "epoch": 0.52864, + "grad_norm": 0.3483526386704811, + "learning_rate": 9.559635397861004e-05, + "loss": 0.6346, + "step": 1652 + }, + { + "epoch": 0.52896, + "grad_norm": 0.3296478462200726, + "learning_rate": 9.549280819129377e-05, + "loss": 0.644, + "step": 1653 + }, + { + "epoch": 0.52928, + "grad_norm": 0.35505382681576264, + "learning_rate": 9.53892672460788e-05, + "loss": 0.6001, + "step": 1654 + }, + { + "epoch": 0.5296, + "grad_norm": 0.37060064973685763, + "learning_rate": 9.52857312541998e-05, + "loss": 0.6296, + "step": 1655 + }, + { + "epoch": 0.52992, + "grad_norm": 0.35545535495174535, + "learning_rate": 9.518220032688603e-05, + "loss": 0.6013, + "step": 1656 + }, + { + "epoch": 0.53024, + "grad_norm": 0.36337776428354174, + "learning_rate": 9.507867457536138e-05, + "loss": 0.6843, + "step": 1657 + }, + { + "epoch": 0.53056, + "grad_norm": 0.367860694950824, + "learning_rate": 9.49751541108441e-05, + "loss": 0.6435, + "step": 1658 + }, + { + "epoch": 0.53088, + "grad_norm": 0.39661411613344955, + "learning_rate": 9.487163904454685e-05, + "loss": 0.6509, + "step": 1659 + }, + { + "epoch": 0.5312, + "grad_norm": 0.34600133675729755, + "learning_rate": 9.47681294876764e-05, + "loss": 0.6312, + "step": 1660 + }, + { + "epoch": 0.53152, + "grad_norm": 0.3456184877652772, + "learning_rate": 9.466462555143368e-05, + "loss": 0.63, + "step": 1661 + }, + { + "epoch": 0.53184, + "grad_norm": 0.3420147724316903, + "learning_rate": 9.456112734701349e-05, + "loss": 0.6298, + "step": 1662 + }, + { + "epoch": 0.53216, + "grad_norm": 0.33553414127144215, + "learning_rate": 9.445763498560463e-05, + "loss": 0.5536, + "step": 1663 + }, + { + "epoch": 0.53248, + "grad_norm": 0.33471678110632297, + "learning_rate": 9.435414857838942e-05, + "loss": 0.6177, + "step": 1664 + }, + { + "epoch": 0.5328, + "grad_norm": 0.34285722592754014, + "learning_rate": 9.425066823654393e-05, + "loss": 0.6187, + "step": 1665 + }, + { + "epoch": 0.53312, + "grad_norm": 0.336502798863346, + "learning_rate": 9.41471940712377e-05, + "loss": 0.631, + "step": 1666 + }, + { + "epoch": 0.53344, + "grad_norm": 0.3672435941706159, + "learning_rate": 9.404372619363353e-05, + "loss": 0.6211, + "step": 1667 + }, + { + "epoch": 0.53376, + "grad_norm": 0.3290893712675334, + "learning_rate": 9.394026471488762e-05, + "loss": 0.5746, + "step": 1668 + }, + { + "epoch": 0.53408, + "grad_norm": 0.3558363473580683, + "learning_rate": 9.383680974614915e-05, + "loss": 0.6478, + "step": 1669 + }, + { + "epoch": 0.5344, + "grad_norm": 0.33461305565651456, + "learning_rate": 9.373336139856039e-05, + "loss": 0.6204, + "step": 1670 + }, + { + "epoch": 0.53472, + "grad_norm": 0.3572442378059015, + "learning_rate": 9.36299197832565e-05, + "loss": 0.6767, + "step": 1671 + }, + { + "epoch": 0.53504, + "grad_norm": 0.34831788962831867, + "learning_rate": 9.352648501136538e-05, + "loss": 0.6407, + "step": 1672 + }, + { + "epoch": 0.53536, + "grad_norm": 0.3466679800660956, + "learning_rate": 9.342305719400755e-05, + "loss": 0.5952, + "step": 1673 + }, + { + "epoch": 0.53568, + "grad_norm": 0.36002830709514616, + "learning_rate": 9.331963644229611e-05, + "loss": 0.6099, + "step": 1674 + }, + { + "epoch": 0.536, + "grad_norm": 0.3383518386169015, + "learning_rate": 9.321622286733655e-05, + "loss": 0.5839, + "step": 1675 + }, + { + "epoch": 0.53632, + "grad_norm": 0.3436071802430533, + "learning_rate": 9.31128165802267e-05, + "loss": 0.5881, + "step": 1676 + }, + { + "epoch": 0.53664, + "grad_norm": 0.3385875097113795, + "learning_rate": 9.30094176920564e-05, + "loss": 0.6477, + "step": 1677 + }, + { + "epoch": 0.53696, + "grad_norm": 0.34485333697226567, + "learning_rate": 9.290602631390774e-05, + "loss": 0.615, + "step": 1678 + }, + { + "epoch": 0.53728, + "grad_norm": 0.34265052061935997, + "learning_rate": 9.280264255685467e-05, + "loss": 0.642, + "step": 1679 + }, + { + "epoch": 0.5376, + "grad_norm": 0.340332040773954, + "learning_rate": 9.269926653196286e-05, + "loss": 0.6265, + "step": 1680 + }, + { + "epoch": 0.53792, + "grad_norm": 0.35983120342480945, + "learning_rate": 9.259589835028985e-05, + "loss": 0.5691, + "step": 1681 + }, + { + "epoch": 0.53824, + "grad_norm": 0.3613563884457701, + "learning_rate": 9.249253812288454e-05, + "loss": 0.6326, + "step": 1682 + }, + { + "epoch": 0.53856, + "grad_norm": 0.3347482561034258, + "learning_rate": 9.238918596078746e-05, + "loss": 0.6058, + "step": 1683 + }, + { + "epoch": 0.53888, + "grad_norm": 0.33377818378359564, + "learning_rate": 9.228584197503047e-05, + "loss": 0.6075, + "step": 1684 + }, + { + "epoch": 0.5392, + "grad_norm": 0.34869157014223023, + "learning_rate": 9.21825062766365e-05, + "loss": 0.6221, + "step": 1685 + }, + { + "epoch": 0.53952, + "grad_norm": 0.3550502704944002, + "learning_rate": 9.207917897661971e-05, + "loss": 0.5982, + "step": 1686 + }, + { + "epoch": 0.53984, + "grad_norm": 0.3642034198511041, + "learning_rate": 9.197586018598518e-05, + "loss": 0.6285, + "step": 1687 + }, + { + "epoch": 0.54016, + "grad_norm": 0.330179119689698, + "learning_rate": 9.187255001572886e-05, + "loss": 0.5946, + "step": 1688 + }, + { + "epoch": 0.54048, + "grad_norm": 0.3545109241322245, + "learning_rate": 9.17692485768375e-05, + "loss": 0.6497, + "step": 1689 + }, + { + "epoch": 0.5408, + "grad_norm": 0.33016124052617013, + "learning_rate": 9.166595598028832e-05, + "loss": 0.6192, + "step": 1690 + }, + { + "epoch": 0.54112, + "grad_norm": 0.3260067961830586, + "learning_rate": 9.156267233704922e-05, + "loss": 0.5986, + "step": 1691 + }, + { + "epoch": 0.54144, + "grad_norm": 0.3478611469874791, + "learning_rate": 9.145939775807833e-05, + "loss": 0.615, + "step": 1692 + }, + { + "epoch": 0.54176, + "grad_norm": 0.34285832246410863, + "learning_rate": 9.135613235432413e-05, + "loss": 0.6247, + "step": 1693 + }, + { + "epoch": 0.54208, + "grad_norm": 0.3657478913658266, + "learning_rate": 9.125287623672525e-05, + "loss": 0.6072, + "step": 1694 + }, + { + "epoch": 0.5424, + "grad_norm": 0.3476063432549975, + "learning_rate": 9.114962951621024e-05, + "loss": 0.6605, + "step": 1695 + }, + { + "epoch": 0.54272, + "grad_norm": 0.34453039917363415, + "learning_rate": 9.104639230369769e-05, + "loss": 0.6025, + "step": 1696 + }, + { + "epoch": 0.54304, + "grad_norm": 0.3425646347782955, + "learning_rate": 9.094316471009596e-05, + "loss": 0.6119, + "step": 1697 + }, + { + "epoch": 0.54336, + "grad_norm": 0.32810171938466276, + "learning_rate": 9.083994684630289e-05, + "loss": 0.6159, + "step": 1698 + }, + { + "epoch": 0.54368, + "grad_norm": 0.33519412706874185, + "learning_rate": 9.073673882320615e-05, + "loss": 0.5634, + "step": 1699 + }, + { + "epoch": 0.544, + "grad_norm": 0.36123679999363023, + "learning_rate": 9.063354075168262e-05, + "loss": 0.6588, + "step": 1700 + }, + { + "epoch": 0.54432, + "grad_norm": 0.3389970013734657, + "learning_rate": 9.053035274259855e-05, + "loss": 0.5957, + "step": 1701 + }, + { + "epoch": 0.54464, + "grad_norm": 0.3674292130209685, + "learning_rate": 9.042717490680946e-05, + "loss": 0.6384, + "step": 1702 + }, + { + "epoch": 0.54496, + "grad_norm": 0.36273330396477954, + "learning_rate": 9.03240073551598e-05, + "loss": 0.6003, + "step": 1703 + }, + { + "epoch": 0.54528, + "grad_norm": 0.3384260519568992, + "learning_rate": 9.022085019848314e-05, + "loss": 0.6247, + "step": 1704 + }, + { + "epoch": 0.5456, + "grad_norm": 0.3455683340536113, + "learning_rate": 9.011770354760168e-05, + "loss": 0.6395, + "step": 1705 + }, + { + "epoch": 0.54592, + "grad_norm": 0.34367493334249694, + "learning_rate": 9.001456751332649e-05, + "loss": 0.6621, + "step": 1706 + }, + { + "epoch": 0.54624, + "grad_norm": 0.35644784070146923, + "learning_rate": 8.991144220645724e-05, + "loss": 0.6655, + "step": 1707 + }, + { + "epoch": 0.54656, + "grad_norm": 0.3534638713268805, + "learning_rate": 8.980832773778193e-05, + "loss": 0.6318, + "step": 1708 + }, + { + "epoch": 0.54688, + "grad_norm": 0.3409241916958398, + "learning_rate": 8.970522421807707e-05, + "loss": 0.609, + "step": 1709 + }, + { + "epoch": 0.5472, + "grad_norm": 0.35038036509249415, + "learning_rate": 8.960213175810738e-05, + "loss": 0.6631, + "step": 1710 + }, + { + "epoch": 0.54752, + "grad_norm": 0.35825908020786396, + "learning_rate": 8.94990504686256e-05, + "loss": 0.661, + "step": 1711 + }, + { + "epoch": 0.54784, + "grad_norm": 0.34664905650836964, + "learning_rate": 8.939598046037257e-05, + "loss": 0.6769, + "step": 1712 + }, + { + "epoch": 0.54816, + "grad_norm": 0.35853567678098225, + "learning_rate": 8.929292184407692e-05, + "loss": 0.6495, + "step": 1713 + }, + { + "epoch": 0.54848, + "grad_norm": 0.3174661452196107, + "learning_rate": 8.918987473045517e-05, + "loss": 0.6152, + "step": 1714 + }, + { + "epoch": 0.5488, + "grad_norm": 0.3495590900347859, + "learning_rate": 8.908683923021137e-05, + "loss": 0.637, + "step": 1715 + }, + { + "epoch": 0.54912, + "grad_norm": 0.33932804328146104, + "learning_rate": 8.898381545403714e-05, + "loss": 0.5833, + "step": 1716 + }, + { + "epoch": 0.54944, + "grad_norm": 0.38013174749029327, + "learning_rate": 8.888080351261154e-05, + "loss": 0.623, + "step": 1717 + }, + { + "epoch": 0.54976, + "grad_norm": 0.3633381849551441, + "learning_rate": 8.877780351660078e-05, + "loss": 0.6319, + "step": 1718 + }, + { + "epoch": 0.55008, + "grad_norm": 0.34351967998456356, + "learning_rate": 8.867481557665837e-05, + "loss": 0.5893, + "step": 1719 + }, + { + "epoch": 0.5504, + "grad_norm": 0.35912950579023756, + "learning_rate": 8.857183980342491e-05, + "loss": 0.5895, + "step": 1720 + }, + { + "epoch": 0.55072, + "grad_norm": 0.3599373228414958, + "learning_rate": 8.846887630752774e-05, + "loss": 0.5929, + "step": 1721 + }, + { + "epoch": 0.55104, + "grad_norm": 0.36908549605691704, + "learning_rate": 8.836592519958118e-05, + "loss": 0.6235, + "step": 1722 + }, + { + "epoch": 0.55136, + "grad_norm": 0.3565050343212028, + "learning_rate": 8.826298659018615e-05, + "loss": 0.6202, + "step": 1723 + }, + { + "epoch": 0.55168, + "grad_norm": 0.33184316237640576, + "learning_rate": 8.816006058993018e-05, + "loss": 0.6089, + "step": 1724 + }, + { + "epoch": 0.552, + "grad_norm": 0.3480609683269693, + "learning_rate": 8.805714730938728e-05, + "loss": 0.597, + "step": 1725 + }, + { + "epoch": 0.55232, + "grad_norm": 0.32641504693056705, + "learning_rate": 8.795424685911769e-05, + "loss": 0.6014, + "step": 1726 + }, + { + "epoch": 0.55264, + "grad_norm": 0.3370306759354407, + "learning_rate": 8.785135934966802e-05, + "loss": 0.5981, + "step": 1727 + }, + { + "epoch": 0.55296, + "grad_norm": 0.3524600351484558, + "learning_rate": 8.774848489157085e-05, + "loss": 0.6521, + "step": 1728 + }, + { + "epoch": 0.55328, + "grad_norm": 0.34147473912059895, + "learning_rate": 8.76456235953448e-05, + "loss": 0.5806, + "step": 1729 + }, + { + "epoch": 0.5536, + "grad_norm": 0.34229875747408445, + "learning_rate": 8.754277557149431e-05, + "loss": 0.6229, + "step": 1730 + }, + { + "epoch": 0.55392, + "grad_norm": 0.3556211593332454, + "learning_rate": 8.743994093050963e-05, + "loss": 0.6357, + "step": 1731 + }, + { + "epoch": 0.55424, + "grad_norm": 0.3459976872304538, + "learning_rate": 8.733711978286652e-05, + "loss": 0.606, + "step": 1732 + }, + { + "epoch": 0.55456, + "grad_norm": 0.32759608492010256, + "learning_rate": 8.723431223902642e-05, + "loss": 0.5674, + "step": 1733 + }, + { + "epoch": 0.55488, + "grad_norm": 0.3591738982655165, + "learning_rate": 8.713151840943593e-05, + "loss": 0.6368, + "step": 1734 + }, + { + "epoch": 0.5552, + "grad_norm": 0.3548210898890839, + "learning_rate": 8.702873840452715e-05, + "loss": 0.6256, + "step": 1735 + }, + { + "epoch": 0.55552, + "grad_norm": 0.3420524668970695, + "learning_rate": 8.69259723347171e-05, + "loss": 0.5913, + "step": 1736 + }, + { + "epoch": 0.55584, + "grad_norm": 0.3677436150713398, + "learning_rate": 8.6823220310408e-05, + "loss": 0.6019, + "step": 1737 + }, + { + "epoch": 0.55616, + "grad_norm": 0.758768611701704, + "learning_rate": 8.672048244198696e-05, + "loss": 0.6071, + "step": 1738 + }, + { + "epoch": 0.55648, + "grad_norm": 0.4664334827903925, + "learning_rate": 8.661775883982578e-05, + "loss": 0.5909, + "step": 1739 + }, + { + "epoch": 0.5568, + "grad_norm": 0.3512120492042207, + "learning_rate": 8.651504961428103e-05, + "loss": 0.5943, + "step": 1740 + }, + { + "epoch": 0.55712, + "grad_norm": 0.3494784468671295, + "learning_rate": 8.641235487569381e-05, + "loss": 0.6019, + "step": 1741 + }, + { + "epoch": 0.55744, + "grad_norm": 0.40159765671756575, + "learning_rate": 8.630967473438965e-05, + "loss": 0.6165, + "step": 1742 + }, + { + "epoch": 0.55776, + "grad_norm": 0.3516738509161463, + "learning_rate": 8.620700930067837e-05, + "loss": 0.6166, + "step": 1743 + }, + { + "epoch": 0.55808, + "grad_norm": 0.3453203645406318, + "learning_rate": 8.610435868485406e-05, + "loss": 0.6038, + "step": 1744 + }, + { + "epoch": 0.5584, + "grad_norm": 0.3478563315262649, + "learning_rate": 8.60017229971948e-05, + "loss": 0.6395, + "step": 1745 + }, + { + "epoch": 0.55872, + "grad_norm": 0.34843144616470656, + "learning_rate": 8.589910234796277e-05, + "loss": 0.6315, + "step": 1746 + }, + { + "epoch": 0.55904, + "grad_norm": 0.3614937658969523, + "learning_rate": 8.57964968474038e-05, + "loss": 0.6416, + "step": 1747 + }, + { + "epoch": 0.55936, + "grad_norm": 0.3514392345357859, + "learning_rate": 8.569390660574764e-05, + "loss": 0.5702, + "step": 1748 + }, + { + "epoch": 0.55968, + "grad_norm": 0.3775582118729312, + "learning_rate": 8.559133173320749e-05, + "loss": 0.6333, + "step": 1749 + }, + { + "epoch": 0.56, + "grad_norm": 0.34841874680400164, + "learning_rate": 8.548877233998014e-05, + "loss": 0.6002, + "step": 1750 + }, + { + "epoch": 0.56032, + "grad_norm": 0.36064135754041476, + "learning_rate": 8.538622853624575e-05, + "loss": 0.6102, + "step": 1751 + }, + { + "epoch": 0.56064, + "grad_norm": 0.3498893351562969, + "learning_rate": 8.528370043216763e-05, + "loss": 0.6583, + "step": 1752 + }, + { + "epoch": 0.56096, + "grad_norm": 0.33541127767302875, + "learning_rate": 8.518118813789237e-05, + "loss": 0.6345, + "step": 1753 + }, + { + "epoch": 0.56128, + "grad_norm": 0.3444274881783159, + "learning_rate": 8.507869176354945e-05, + "loss": 0.5759, + "step": 1754 + }, + { + "epoch": 0.5616, + "grad_norm": 0.359571877500463, + "learning_rate": 8.497621141925134e-05, + "loss": 0.6181, + "step": 1755 + }, + { + "epoch": 0.56192, + "grad_norm": 0.3364808308963259, + "learning_rate": 8.48737472150932e-05, + "loss": 0.6301, + "step": 1756 + }, + { + "epoch": 0.56224, + "grad_norm": 0.35686018816248494, + "learning_rate": 8.477129926115292e-05, + "loss": 0.5969, + "step": 1757 + }, + { + "epoch": 0.56256, + "grad_norm": 0.3768640326394994, + "learning_rate": 8.46688676674909e-05, + "loss": 0.6699, + "step": 1758 + }, + { + "epoch": 0.56288, + "grad_norm": 0.3335654500656375, + "learning_rate": 8.456645254414998e-05, + "loss": 0.5696, + "step": 1759 + }, + { + "epoch": 0.5632, + "grad_norm": 0.3311188794008257, + "learning_rate": 8.44640540011553e-05, + "loss": 0.6219, + "step": 1760 + }, + { + "epoch": 0.56352, + "grad_norm": 0.3687414880614699, + "learning_rate": 8.43616721485142e-05, + "loss": 0.6129, + "step": 1761 + }, + { + "epoch": 0.56384, + "grad_norm": 0.3605932703963821, + "learning_rate": 8.425930709621603e-05, + "loss": 0.6274, + "step": 1762 + }, + { + "epoch": 0.56416, + "grad_norm": 0.36229678089782624, + "learning_rate": 8.415695895423217e-05, + "loss": 0.6263, + "step": 1763 + }, + { + "epoch": 0.56448, + "grad_norm": 0.36506050857138017, + "learning_rate": 8.405462783251584e-05, + "loss": 0.6862, + "step": 1764 + }, + { + "epoch": 0.5648, + "grad_norm": 0.34292050339086494, + "learning_rate": 8.395231384100186e-05, + "loss": 0.6257, + "step": 1765 + }, + { + "epoch": 0.56512, + "grad_norm": 0.3646566164278792, + "learning_rate": 8.38500170896068e-05, + "loss": 0.634, + "step": 1766 + }, + { + "epoch": 0.56544, + "grad_norm": 0.3189982683044711, + "learning_rate": 8.374773768822852e-05, + "loss": 0.6274, + "step": 1767 + }, + { + "epoch": 0.56576, + "grad_norm": 0.3414819172629874, + "learning_rate": 8.364547574674646e-05, + "loss": 0.6319, + "step": 1768 + }, + { + "epoch": 0.56608, + "grad_norm": 0.35254524861186665, + "learning_rate": 8.354323137502116e-05, + "loss": 0.5951, + "step": 1769 + }, + { + "epoch": 0.5664, + "grad_norm": 0.33019863111545944, + "learning_rate": 8.344100468289432e-05, + "loss": 0.5548, + "step": 1770 + }, + { + "epoch": 0.56672, + "grad_norm": 0.36060299596327494, + "learning_rate": 8.33387957801886e-05, + "loss": 0.6434, + "step": 1771 + }, + { + "epoch": 0.56704, + "grad_norm": 0.32756797011689986, + "learning_rate": 8.32366047767077e-05, + "loss": 0.5958, + "step": 1772 + }, + { + "epoch": 0.56736, + "grad_norm": 0.33393303877597424, + "learning_rate": 8.313443178223588e-05, + "loss": 0.5528, + "step": 1773 + }, + { + "epoch": 0.56768, + "grad_norm": 0.3182382517355006, + "learning_rate": 8.303227690653823e-05, + "loss": 0.5875, + "step": 1774 + }, + { + "epoch": 0.568, + "grad_norm": 0.34022211187839557, + "learning_rate": 8.293014025936025e-05, + "loss": 0.5936, + "step": 1775 + }, + { + "epoch": 0.56832, + "grad_norm": 0.35043308249476285, + "learning_rate": 8.282802195042791e-05, + "loss": 0.6393, + "step": 1776 + }, + { + "epoch": 0.56864, + "grad_norm": 0.33363217945429957, + "learning_rate": 8.272592208944757e-05, + "loss": 0.5875, + "step": 1777 + }, + { + "epoch": 0.56896, + "grad_norm": 0.3669529817187083, + "learning_rate": 8.262384078610557e-05, + "loss": 0.6068, + "step": 1778 + }, + { + "epoch": 0.56928, + "grad_norm": 0.33683705702974326, + "learning_rate": 8.25217781500685e-05, + "loss": 0.5844, + "step": 1779 + }, + { + "epoch": 0.5696, + "grad_norm": 0.33157252838040185, + "learning_rate": 8.241973429098278e-05, + "loss": 0.6147, + "step": 1780 + }, + { + "epoch": 0.56992, + "grad_norm": 0.3601380705008049, + "learning_rate": 8.231770931847468e-05, + "loss": 0.6203, + "step": 1781 + }, + { + "epoch": 0.57024, + "grad_norm": 0.3416796462478041, + "learning_rate": 8.221570334215028e-05, + "loss": 0.5912, + "step": 1782 + }, + { + "epoch": 0.57056, + "grad_norm": 0.3566078336017613, + "learning_rate": 8.211371647159508e-05, + "loss": 0.6134, + "step": 1783 + }, + { + "epoch": 0.57088, + "grad_norm": 0.34778276936001096, + "learning_rate": 8.201174881637418e-05, + "loss": 0.5982, + "step": 1784 + }, + { + "epoch": 0.5712, + "grad_norm": 0.3450000549538375, + "learning_rate": 8.190980048603202e-05, + "loss": 0.6235, + "step": 1785 + }, + { + "epoch": 0.57152, + "grad_norm": 0.31995352297599994, + "learning_rate": 8.180787159009224e-05, + "loss": 0.5897, + "step": 1786 + }, + { + "epoch": 0.57184, + "grad_norm": 0.3476253266014655, + "learning_rate": 8.170596223805764e-05, + "loss": 0.6175, + "step": 1787 + }, + { + "epoch": 0.57216, + "grad_norm": 0.35229632600308786, + "learning_rate": 8.160407253940996e-05, + "loss": 0.602, + "step": 1788 + }, + { + "epoch": 0.57248, + "grad_norm": 0.344557938524413, + "learning_rate": 8.15022026036099e-05, + "loss": 0.5972, + "step": 1789 + }, + { + "epoch": 0.5728, + "grad_norm": 0.3446825091651972, + "learning_rate": 8.140035254009694e-05, + "loss": 0.5881, + "step": 1790 + }, + { + "epoch": 0.57312, + "grad_norm": 0.35506574119094275, + "learning_rate": 8.129852245828911e-05, + "loss": 0.6082, + "step": 1791 + }, + { + "epoch": 0.57344, + "grad_norm": 0.33758397072840335, + "learning_rate": 8.119671246758309e-05, + "loss": 0.5517, + "step": 1792 + }, + { + "epoch": 0.57376, + "grad_norm": 0.3592248986561124, + "learning_rate": 8.109492267735385e-05, + "loss": 0.6164, + "step": 1793 + }, + { + "epoch": 0.57408, + "grad_norm": 0.4102568831719911, + "learning_rate": 8.09931531969548e-05, + "loss": 0.652, + "step": 1794 + }, + { + "epoch": 0.5744, + "grad_norm": 0.346546650174362, + "learning_rate": 8.089140413571747e-05, + "loss": 0.6199, + "step": 1795 + }, + { + "epoch": 0.57472, + "grad_norm": 0.3417509959458073, + "learning_rate": 8.078967560295135e-05, + "loss": 0.6402, + "step": 1796 + }, + { + "epoch": 0.57504, + "grad_norm": 0.3593332033690645, + "learning_rate": 8.068796770794409e-05, + "loss": 0.6201, + "step": 1797 + }, + { + "epoch": 0.57536, + "grad_norm": 0.3344397831757427, + "learning_rate": 8.058628055996097e-05, + "loss": 0.5854, + "step": 1798 + }, + { + "epoch": 0.57568, + "grad_norm": 0.33669693105473986, + "learning_rate": 8.048461426824504e-05, + "loss": 0.6071, + "step": 1799 + }, + { + "epoch": 0.576, + "grad_norm": 0.3322098668524984, + "learning_rate": 8.038296894201709e-05, + "loss": 0.6007, + "step": 1800 + }, + { + "epoch": 0.57632, + "grad_norm": 0.36398466596258644, + "learning_rate": 8.028134469047511e-05, + "loss": 0.6501, + "step": 1801 + }, + { + "epoch": 0.57664, + "grad_norm": 0.3817692776522116, + "learning_rate": 8.017974162279468e-05, + "loss": 0.622, + "step": 1802 + }, + { + "epoch": 0.57696, + "grad_norm": 0.3318927429557493, + "learning_rate": 8.007815984812858e-05, + "loss": 0.613, + "step": 1803 + }, + { + "epoch": 0.57728, + "grad_norm": 0.3355035577313009, + "learning_rate": 7.997659947560657e-05, + "loss": 0.5825, + "step": 1804 + }, + { + "epoch": 0.5776, + "grad_norm": 0.35134403475325027, + "learning_rate": 7.987506061433566e-05, + "loss": 0.6403, + "step": 1805 + }, + { + "epoch": 0.57792, + "grad_norm": 0.34429640725696375, + "learning_rate": 7.977354337339947e-05, + "loss": 0.6203, + "step": 1806 + }, + { + "epoch": 0.57824, + "grad_norm": 0.32812620362219835, + "learning_rate": 7.967204786185862e-05, + "loss": 0.5783, + "step": 1807 + }, + { + "epoch": 0.57856, + "grad_norm": 0.3632573766266116, + "learning_rate": 7.957057418875035e-05, + "loss": 0.6556, + "step": 1808 + }, + { + "epoch": 0.57888, + "grad_norm": 0.3581287231356658, + "learning_rate": 7.94691224630883e-05, + "loss": 0.574, + "step": 1809 + }, + { + "epoch": 0.5792, + "grad_norm": 0.3513606175773462, + "learning_rate": 7.93676927938627e-05, + "loss": 0.5968, + "step": 1810 + }, + { + "epoch": 0.57952, + "grad_norm": 0.3537518775102126, + "learning_rate": 7.926628529003993e-05, + "loss": 0.6246, + "step": 1811 + }, + { + "epoch": 0.57984, + "grad_norm": 0.32630686990256164, + "learning_rate": 7.916490006056272e-05, + "loss": 0.5991, + "step": 1812 + }, + { + "epoch": 0.58016, + "grad_norm": 0.3368490146509299, + "learning_rate": 7.906353721434976e-05, + "loss": 0.5983, + "step": 1813 + }, + { + "epoch": 0.58048, + "grad_norm": 0.3301949361054577, + "learning_rate": 7.896219686029568e-05, + "loss": 0.6067, + "step": 1814 + }, + { + "epoch": 0.5808, + "grad_norm": 0.3290942403417462, + "learning_rate": 7.886087910727102e-05, + "loss": 0.5935, + "step": 1815 + }, + { + "epoch": 0.58112, + "grad_norm": 0.3768347447878006, + "learning_rate": 7.875958406412205e-05, + "loss": 0.5981, + "step": 1816 + }, + { + "epoch": 0.58144, + "grad_norm": 0.35085799962451825, + "learning_rate": 7.865831183967052e-05, + "loss": 0.57, + "step": 1817 + }, + { + "epoch": 0.58176, + "grad_norm": 0.3362152555262188, + "learning_rate": 7.855706254271383e-05, + "loss": 0.5961, + "step": 1818 + }, + { + "epoch": 0.58208, + "grad_norm": 0.3382900287554013, + "learning_rate": 7.845583628202458e-05, + "loss": 0.562, + "step": 1819 + }, + { + "epoch": 0.5824, + "grad_norm": 0.3640850708981999, + "learning_rate": 7.835463316635076e-05, + "loss": 0.6246, + "step": 1820 + }, + { + "epoch": 0.58272, + "grad_norm": 0.34738804178115684, + "learning_rate": 7.825345330441547e-05, + "loss": 0.5975, + "step": 1821 + }, + { + "epoch": 0.58304, + "grad_norm": 0.3494235111301023, + "learning_rate": 7.815229680491672e-05, + "loss": 0.6198, + "step": 1822 + }, + { + "epoch": 0.58336, + "grad_norm": 0.34502638106532346, + "learning_rate": 7.805116377652759e-05, + "loss": 0.5754, + "step": 1823 + }, + { + "epoch": 0.58368, + "grad_norm": 0.32515833449673626, + "learning_rate": 7.795005432789578e-05, + "loss": 0.6183, + "step": 1824 + }, + { + "epoch": 0.584, + "grad_norm": 0.3721671465972435, + "learning_rate": 7.784896856764378e-05, + "loss": 0.6612, + "step": 1825 + }, + { + "epoch": 0.58432, + "grad_norm": 0.35012917233931085, + "learning_rate": 7.774790660436858e-05, + "loss": 0.6351, + "step": 1826 + }, + { + "epoch": 0.58464, + "grad_norm": 0.35949715438534474, + "learning_rate": 7.76468685466416e-05, + "loss": 0.5921, + "step": 1827 + }, + { + "epoch": 0.58496, + "grad_norm": 0.3522800868429467, + "learning_rate": 7.754585450300857e-05, + "loss": 0.5872, + "step": 1828 + }, + { + "epoch": 0.58528, + "grad_norm": 0.3535606901082648, + "learning_rate": 7.744486458198952e-05, + "loss": 0.6495, + "step": 1829 + }, + { + "epoch": 0.5856, + "grad_norm": 0.35126459942610294, + "learning_rate": 7.73438988920784e-05, + "loss": 0.5455, + "step": 1830 + }, + { + "epoch": 0.58592, + "grad_norm": 0.3295892469369936, + "learning_rate": 7.724295754174329e-05, + "loss": 0.5707, + "step": 1831 + }, + { + "epoch": 0.58624, + "grad_norm": 0.355488538883084, + "learning_rate": 7.714204063942596e-05, + "loss": 0.6039, + "step": 1832 + }, + { + "epoch": 0.58656, + "grad_norm": 0.35156084918248576, + "learning_rate": 7.704114829354205e-05, + "loss": 0.6032, + "step": 1833 + }, + { + "epoch": 0.58688, + "grad_norm": 0.332690882577147, + "learning_rate": 7.69402806124808e-05, + "loss": 0.6067, + "step": 1834 + }, + { + "epoch": 0.5872, + "grad_norm": 0.3662284033432509, + "learning_rate": 7.683943770460486e-05, + "loss": 0.5819, + "step": 1835 + }, + { + "epoch": 0.58752, + "grad_norm": 0.3729064154870074, + "learning_rate": 7.67386196782504e-05, + "loss": 0.6241, + "step": 1836 + }, + { + "epoch": 0.58784, + "grad_norm": 0.35438731610718377, + "learning_rate": 7.66378266417267e-05, + "loss": 0.6178, + "step": 1837 + }, + { + "epoch": 0.58816, + "grad_norm": 0.3445148338757892, + "learning_rate": 7.653705870331637e-05, + "loss": 0.6472, + "step": 1838 + }, + { + "epoch": 0.58848, + "grad_norm": 0.338572057256935, + "learning_rate": 7.643631597127491e-05, + "loss": 0.5935, + "step": 1839 + }, + { + "epoch": 0.5888, + "grad_norm": 0.3743576130506236, + "learning_rate": 7.633559855383083e-05, + "loss": 0.6354, + "step": 1840 + }, + { + "epoch": 0.58912, + "grad_norm": 0.3518943998436618, + "learning_rate": 7.623490655918542e-05, + "loss": 0.5951, + "step": 1841 + }, + { + "epoch": 0.58944, + "grad_norm": 0.3478277490872392, + "learning_rate": 7.613424009551262e-05, + "loss": 0.6344, + "step": 1842 + }, + { + "epoch": 0.58976, + "grad_norm": 0.3804717195769369, + "learning_rate": 7.603359927095898e-05, + "loss": 0.6052, + "step": 1843 + }, + { + "epoch": 0.59008, + "grad_norm": 0.36456673440541854, + "learning_rate": 7.593298419364354e-05, + "loss": 0.5552, + "step": 1844 + }, + { + "epoch": 0.5904, + "grad_norm": 0.3436990292346027, + "learning_rate": 7.583239497165758e-05, + "loss": 0.5816, + "step": 1845 + }, + { + "epoch": 0.59072, + "grad_norm": 0.40387712776280615, + "learning_rate": 7.57318317130647e-05, + "loss": 0.6242, + "step": 1846 + }, + { + "epoch": 0.59104, + "grad_norm": 0.5860322861133728, + "learning_rate": 7.563129452590058e-05, + "loss": 0.6576, + "step": 1847 + }, + { + "epoch": 0.59136, + "grad_norm": 0.33843346953001474, + "learning_rate": 7.553078351817284e-05, + "loss": 0.604, + "step": 1848 + }, + { + "epoch": 0.59168, + "grad_norm": 0.3459407252738132, + "learning_rate": 7.54302987978611e-05, + "loss": 0.6372, + "step": 1849 + }, + { + "epoch": 0.592, + "grad_norm": 0.3160897855957368, + "learning_rate": 7.532984047291653e-05, + "loss": 0.5559, + "step": 1850 + }, + { + "epoch": 0.59232, + "grad_norm": 0.39557856056846075, + "learning_rate": 7.522940865126218e-05, + "loss": 0.6034, + "step": 1851 + }, + { + "epoch": 0.59264, + "grad_norm": 0.3740328579661679, + "learning_rate": 7.512900344079248e-05, + "loss": 0.649, + "step": 1852 + }, + { + "epoch": 0.59296, + "grad_norm": 0.38069317942855857, + "learning_rate": 7.502862494937328e-05, + "loss": 0.6618, + "step": 1853 + }, + { + "epoch": 0.59328, + "grad_norm": 0.36904855598923475, + "learning_rate": 7.49282732848418e-05, + "loss": 0.6338, + "step": 1854 + }, + { + "epoch": 0.5936, + "grad_norm": 0.32679068632894726, + "learning_rate": 7.482794855500637e-05, + "loss": 0.6064, + "step": 1855 + }, + { + "epoch": 0.59392, + "grad_norm": 0.33673646719340805, + "learning_rate": 7.472765086764636e-05, + "loss": 0.5717, + "step": 1856 + }, + { + "epoch": 0.59424, + "grad_norm": 0.36930881435386403, + "learning_rate": 7.462738033051226e-05, + "loss": 0.5982, + "step": 1857 + }, + { + "epoch": 0.59456, + "grad_norm": 0.34455375193479976, + "learning_rate": 7.452713705132515e-05, + "loss": 0.5935, + "step": 1858 + }, + { + "epoch": 0.59488, + "grad_norm": 0.3366345570825285, + "learning_rate": 7.442692113777698e-05, + "loss": 0.5467, + "step": 1859 + }, + { + "epoch": 0.5952, + "grad_norm": 0.34071383754876755, + "learning_rate": 7.432673269753033e-05, + "loss": 0.6109, + "step": 1860 + }, + { + "epoch": 0.59552, + "grad_norm": 0.3355050561880696, + "learning_rate": 7.422657183821807e-05, + "loss": 0.6128, + "step": 1861 + }, + { + "epoch": 0.59584, + "grad_norm": 0.3441311191900993, + "learning_rate": 7.41264386674437e-05, + "loss": 0.5988, + "step": 1862 + }, + { + "epoch": 0.59616, + "grad_norm": 0.35050460869341926, + "learning_rate": 7.402633329278077e-05, + "loss": 0.562, + "step": 1863 + }, + { + "epoch": 0.59648, + "grad_norm": 0.35492535444768214, + "learning_rate": 7.392625582177305e-05, + "loss": 0.6161, + "step": 1864 + }, + { + "epoch": 0.5968, + "grad_norm": 0.39570849017754073, + "learning_rate": 7.382620636193438e-05, + "loss": 0.6329, + "step": 1865 + }, + { + "epoch": 0.59712, + "grad_norm": 0.3567276679686721, + "learning_rate": 7.372618502074839e-05, + "loss": 0.6025, + "step": 1866 + }, + { + "epoch": 0.59744, + "grad_norm": 0.35229351240295026, + "learning_rate": 7.362619190566859e-05, + "loss": 0.6299, + "step": 1867 + }, + { + "epoch": 0.59776, + "grad_norm": 0.3968985455395164, + "learning_rate": 7.352622712411815e-05, + "loss": 0.5881, + "step": 1868 + }, + { + "epoch": 0.59808, + "grad_norm": 0.3728693452838266, + "learning_rate": 7.342629078348975e-05, + "loss": 0.6099, + "step": 1869 + }, + { + "epoch": 0.5984, + "grad_norm": 0.3265477445107486, + "learning_rate": 7.332638299114564e-05, + "loss": 0.5906, + "step": 1870 + }, + { + "epoch": 0.59872, + "grad_norm": 0.3980449396194788, + "learning_rate": 7.322650385441723e-05, + "loss": 0.5897, + "step": 1871 + }, + { + "epoch": 0.59904, + "grad_norm": 0.441773930417171, + "learning_rate": 7.312665348060533e-05, + "loss": 0.6003, + "step": 1872 + }, + { + "epoch": 0.59936, + "grad_norm": 0.3623260207228797, + "learning_rate": 7.302683197697965e-05, + "loss": 0.6494, + "step": 1873 + }, + { + "epoch": 0.59968, + "grad_norm": 0.3448128798098835, + "learning_rate": 7.292703945077903e-05, + "loss": 0.551, + "step": 1874 + }, + { + "epoch": 0.6, + "grad_norm": 0.3519105284427964, + "learning_rate": 7.28272760092112e-05, + "loss": 0.582, + "step": 1875 + }, + { + "epoch": 0.60032, + "grad_norm": 0.33984203968889287, + "learning_rate": 7.27275417594525e-05, + "loss": 0.597, + "step": 1876 + }, + { + "epoch": 0.60064, + "grad_norm": 0.3308544795799944, + "learning_rate": 7.2627836808648e-05, + "loss": 0.5887, + "step": 1877 + }, + { + "epoch": 0.60096, + "grad_norm": 0.33063541285678194, + "learning_rate": 7.252816126391137e-05, + "loss": 0.5823, + "step": 1878 + }, + { + "epoch": 0.60128, + "grad_norm": 0.36841842374340583, + "learning_rate": 7.242851523232448e-05, + "loss": 0.6681, + "step": 1879 + }, + { + "epoch": 0.6016, + "grad_norm": 0.3333904385291622, + "learning_rate": 7.232889882093774e-05, + "loss": 0.5593, + "step": 1880 + }, + { + "epoch": 0.60192, + "grad_norm": 0.3394299704793546, + "learning_rate": 7.222931213676953e-05, + "loss": 0.6011, + "step": 1881 + }, + { + "epoch": 0.60224, + "grad_norm": 0.3421787595036792, + "learning_rate": 7.212975528680639e-05, + "loss": 0.5722, + "step": 1882 + }, + { + "epoch": 0.60256, + "grad_norm": 0.3474842414269521, + "learning_rate": 7.203022837800286e-05, + "loss": 0.6073, + "step": 1883 + }, + { + "epoch": 0.60288, + "grad_norm": 0.3393776810551021, + "learning_rate": 7.193073151728117e-05, + "loss": 0.6158, + "step": 1884 + }, + { + "epoch": 0.6032, + "grad_norm": 0.3512342639452111, + "learning_rate": 7.183126481153144e-05, + "loss": 0.6121, + "step": 1885 + }, + { + "epoch": 0.60352, + "grad_norm": 0.3270891255036287, + "learning_rate": 7.173182836761121e-05, + "loss": 0.5311, + "step": 1886 + }, + { + "epoch": 0.60384, + "grad_norm": 0.3703712729881265, + "learning_rate": 7.163242229234569e-05, + "loss": 0.5523, + "step": 1887 + }, + { + "epoch": 0.60416, + "grad_norm": 0.3558917837259812, + "learning_rate": 7.153304669252736e-05, + "loss": 0.6299, + "step": 1888 + }, + { + "epoch": 0.60448, + "grad_norm": 0.43647224378839683, + "learning_rate": 7.143370167491596e-05, + "loss": 0.6375, + "step": 1889 + }, + { + "epoch": 0.6048, + "grad_norm": 0.3509631190257302, + "learning_rate": 7.13343873462384e-05, + "loss": 0.6277, + "step": 1890 + }, + { + "epoch": 0.60512, + "grad_norm": 0.3818035649517864, + "learning_rate": 7.123510381318867e-05, + "loss": 0.594, + "step": 1891 + }, + { + "epoch": 0.60544, + "grad_norm": 0.33709759866295746, + "learning_rate": 7.113585118242754e-05, + "loss": 0.6027, + "step": 1892 + }, + { + "epoch": 0.60576, + "grad_norm": 0.33019830804874145, + "learning_rate": 7.103662956058277e-05, + "loss": 0.5849, + "step": 1893 + }, + { + "epoch": 0.60608, + "grad_norm": 0.33536534872635776, + "learning_rate": 7.09374390542486e-05, + "loss": 0.6014, + "step": 1894 + }, + { + "epoch": 0.6064, + "grad_norm": 0.34081409825453657, + "learning_rate": 7.083827976998599e-05, + "loss": 0.6308, + "step": 1895 + }, + { + "epoch": 0.60672, + "grad_norm": 0.3457388818466461, + "learning_rate": 7.073915181432233e-05, + "loss": 0.5743, + "step": 1896 + }, + { + "epoch": 0.60704, + "grad_norm": 0.36262196211565395, + "learning_rate": 7.064005529375128e-05, + "loss": 0.5803, + "step": 1897 + }, + { + "epoch": 0.60736, + "grad_norm": 0.3409727779590013, + "learning_rate": 7.054099031473287e-05, + "loss": 0.6189, + "step": 1898 + }, + { + "epoch": 0.60768, + "grad_norm": 0.337990287180337, + "learning_rate": 7.044195698369307e-05, + "loss": 0.5831, + "step": 1899 + }, + { + "epoch": 0.608, + "grad_norm": 0.3707543164317421, + "learning_rate": 7.034295540702397e-05, + "loss": 0.6591, + "step": 1900 + }, + { + "epoch": 0.60832, + "grad_norm": 0.35627947450473185, + "learning_rate": 7.024398569108359e-05, + "loss": 0.6008, + "step": 1901 + }, + { + "epoch": 0.60864, + "grad_norm": 0.3582176582922628, + "learning_rate": 7.014504794219554e-05, + "loss": 0.5976, + "step": 1902 + }, + { + "epoch": 0.60896, + "grad_norm": 0.32730720026573235, + "learning_rate": 7.004614226664925e-05, + "loss": 0.6055, + "step": 1903 + }, + { + "epoch": 0.60928, + "grad_norm": 0.37083839654458967, + "learning_rate": 6.994726877069968e-05, + "loss": 0.561, + "step": 1904 + }, + { + "epoch": 0.6096, + "grad_norm": 0.35244138574927625, + "learning_rate": 6.984842756056708e-05, + "loss": 0.6148, + "step": 1905 + }, + { + "epoch": 0.60992, + "grad_norm": 0.33856137853151946, + "learning_rate": 6.974961874243722e-05, + "loss": 0.6232, + "step": 1906 + }, + { + "epoch": 0.61024, + "grad_norm": 0.3460405354598853, + "learning_rate": 6.965084242246088e-05, + "loss": 0.58, + "step": 1907 + }, + { + "epoch": 0.61056, + "grad_norm": 0.34610336494131433, + "learning_rate": 6.955209870675403e-05, + "loss": 0.6447, + "step": 1908 + }, + { + "epoch": 0.61088, + "grad_norm": 0.3563410678571309, + "learning_rate": 6.945338770139764e-05, + "loss": 0.6213, + "step": 1909 + }, + { + "epoch": 0.6112, + "grad_norm": 0.4954793242972794, + "learning_rate": 6.935470951243745e-05, + "loss": 0.6023, + "step": 1910 + }, + { + "epoch": 0.61152, + "grad_norm": 0.40267918022635496, + "learning_rate": 6.925606424588405e-05, + "loss": 0.6531, + "step": 1911 + }, + { + "epoch": 0.61184, + "grad_norm": 0.32319198390328585, + "learning_rate": 6.915745200771248e-05, + "loss": 0.5721, + "step": 1912 + }, + { + "epoch": 0.61216, + "grad_norm": 0.3885673830240352, + "learning_rate": 6.905887290386253e-05, + "loss": 0.6086, + "step": 1913 + }, + { + "epoch": 0.61248, + "grad_norm": 0.31374233624224723, + "learning_rate": 6.896032704023826e-05, + "loss": 0.556, + "step": 1914 + }, + { + "epoch": 0.6128, + "grad_norm": 0.35042864174812055, + "learning_rate": 6.8861814522708e-05, + "loss": 0.624, + "step": 1915 + }, + { + "epoch": 0.61312, + "grad_norm": 0.3238677912260108, + "learning_rate": 6.876333545710436e-05, + "loss": 0.5661, + "step": 1916 + }, + { + "epoch": 0.61344, + "grad_norm": 0.360876791391741, + "learning_rate": 6.866488994922388e-05, + "loss": 0.6589, + "step": 1917 + }, + { + "epoch": 0.61376, + "grad_norm": 0.3364148105094434, + "learning_rate": 6.856647810482715e-05, + "loss": 0.6098, + "step": 1918 + }, + { + "epoch": 0.61408, + "grad_norm": 0.33953150057904763, + "learning_rate": 6.846810002963862e-05, + "loss": 0.5835, + "step": 1919 + }, + { + "epoch": 0.6144, + "grad_norm": 0.33444006812866056, + "learning_rate": 6.83697558293463e-05, + "loss": 0.6008, + "step": 1920 + }, + { + "epoch": 0.61472, + "grad_norm": 0.3450236893827937, + "learning_rate": 6.8271445609602e-05, + "loss": 0.5528, + "step": 1921 + }, + { + "epoch": 0.61504, + "grad_norm": 0.3541289842123425, + "learning_rate": 6.81731694760209e-05, + "loss": 0.575, + "step": 1922 + }, + { + "epoch": 0.61536, + "grad_norm": 0.33859576529313284, + "learning_rate": 6.807492753418161e-05, + "loss": 0.617, + "step": 1923 + }, + { + "epoch": 0.61568, + "grad_norm": 0.31719951883031655, + "learning_rate": 6.7976719889626e-05, + "loss": 0.5352, + "step": 1924 + }, + { + "epoch": 0.616, + "grad_norm": 0.3309750549522487, + "learning_rate": 6.787854664785906e-05, + "loss": 0.5947, + "step": 1925 + }, + { + "epoch": 0.61632, + "grad_norm": 0.35567505854354714, + "learning_rate": 6.778040791434887e-05, + "loss": 0.5997, + "step": 1926 + }, + { + "epoch": 0.61664, + "grad_norm": 0.3596150308996054, + "learning_rate": 6.768230379452647e-05, + "loss": 0.6021, + "step": 1927 + }, + { + "epoch": 0.61696, + "grad_norm": 0.378101334854469, + "learning_rate": 6.758423439378556e-05, + "loss": 0.6678, + "step": 1928 + }, + { + "epoch": 0.61728, + "grad_norm": 0.3581054545682819, + "learning_rate": 6.748619981748276e-05, + "loss": 0.6109, + "step": 1929 + }, + { + "epoch": 0.6176, + "grad_norm": 0.3556626346138884, + "learning_rate": 6.738820017093706e-05, + "loss": 0.5685, + "step": 1930 + }, + { + "epoch": 0.61792, + "grad_norm": 0.35201691923831, + "learning_rate": 6.729023555943008e-05, + "loss": 0.6298, + "step": 1931 + }, + { + "epoch": 0.61824, + "grad_norm": 0.35059339899423375, + "learning_rate": 6.71923060882058e-05, + "loss": 0.5487, + "step": 1932 + }, + { + "epoch": 0.61856, + "grad_norm": 0.35013531370272016, + "learning_rate": 6.709441186247027e-05, + "loss": 0.6421, + "step": 1933 + }, + { + "epoch": 0.61888, + "grad_norm": 0.33988654085275827, + "learning_rate": 6.699655298739191e-05, + "loss": 0.5705, + "step": 1934 + }, + { + "epoch": 0.6192, + "grad_norm": 0.37528410622289504, + "learning_rate": 6.689872956810102e-05, + "loss": 0.6252, + "step": 1935 + }, + { + "epoch": 0.61952, + "grad_norm": 0.3640375305941058, + "learning_rate": 6.680094170968984e-05, + "loss": 0.5609, + "step": 1936 + }, + { + "epoch": 0.61984, + "grad_norm": 0.34663352050728613, + "learning_rate": 6.670318951721244e-05, + "loss": 0.6163, + "step": 1937 + }, + { + "epoch": 0.62016, + "grad_norm": 0.3362295699160863, + "learning_rate": 6.660547309568453e-05, + "loss": 0.6443, + "step": 1938 + }, + { + "epoch": 0.62048, + "grad_norm": 0.47610771558551523, + "learning_rate": 6.650779255008335e-05, + "loss": 0.6535, + "step": 1939 + }, + { + "epoch": 0.6208, + "grad_norm": 0.33397382452848845, + "learning_rate": 6.641014798534777e-05, + "loss": 0.633, + "step": 1940 + }, + { + "epoch": 0.62112, + "grad_norm": 0.3480349224542998, + "learning_rate": 6.631253950637779e-05, + "loss": 0.5857, + "step": 1941 + }, + { + "epoch": 0.62144, + "grad_norm": 0.330826962424729, + "learning_rate": 6.621496721803482e-05, + "loss": 0.587, + "step": 1942 + }, + { + "epoch": 0.62176, + "grad_norm": 0.3331030000813696, + "learning_rate": 6.611743122514125e-05, + "loss": 0.576, + "step": 1943 + }, + { + "epoch": 0.62208, + "grad_norm": 0.333396476832253, + "learning_rate": 6.601993163248056e-05, + "loss": 0.6313, + "step": 1944 + }, + { + "epoch": 0.6224, + "grad_norm": 0.3498400431441658, + "learning_rate": 6.592246854479716e-05, + "loss": 0.6292, + "step": 1945 + }, + { + "epoch": 0.62272, + "grad_norm": 0.38921573964876904, + "learning_rate": 6.582504206679612e-05, + "loss": 0.6416, + "step": 1946 + }, + { + "epoch": 0.62304, + "grad_norm": 0.34675016742971393, + "learning_rate": 6.57276523031433e-05, + "loss": 0.6093, + "step": 1947 + }, + { + "epoch": 0.62336, + "grad_norm": 0.34304511632347195, + "learning_rate": 6.563029935846501e-05, + "loss": 0.6031, + "step": 1948 + }, + { + "epoch": 0.62368, + "grad_norm": 0.35691656699875085, + "learning_rate": 6.553298333734812e-05, + "loss": 0.648, + "step": 1949 + }, + { + "epoch": 0.624, + "grad_norm": 0.33030219390031523, + "learning_rate": 6.543570434433974e-05, + "loss": 0.5982, + "step": 1950 + }, + { + "epoch": 0.62432, + "grad_norm": 0.35822092901925984, + "learning_rate": 6.533846248394726e-05, + "loss": 0.5655, + "step": 1951 + }, + { + "epoch": 0.62464, + "grad_norm": 0.33147031143132805, + "learning_rate": 6.524125786063812e-05, + "loss": 0.6276, + "step": 1952 + }, + { + "epoch": 0.62496, + "grad_norm": 0.3505576625422144, + "learning_rate": 6.514409057883985e-05, + "loss": 0.5837, + "step": 1953 + }, + { + "epoch": 0.62528, + "grad_norm": 0.33463326459661547, + "learning_rate": 6.504696074293973e-05, + "loss": 0.6308, + "step": 1954 + }, + { + "epoch": 0.6256, + "grad_norm": 0.3479494892905318, + "learning_rate": 6.494986845728495e-05, + "loss": 0.6095, + "step": 1955 + }, + { + "epoch": 0.62592, + "grad_norm": 0.3350495826604109, + "learning_rate": 6.485281382618222e-05, + "loss": 0.5712, + "step": 1956 + }, + { + "epoch": 0.62624, + "grad_norm": 0.3455116328868799, + "learning_rate": 6.475579695389793e-05, + "loss": 0.6226, + "step": 1957 + }, + { + "epoch": 0.62656, + "grad_norm": 0.34347161806365567, + "learning_rate": 6.465881794465786e-05, + "loss": 0.6437, + "step": 1958 + }, + { + "epoch": 0.62688, + "grad_norm": 0.333188456743107, + "learning_rate": 6.456187690264705e-05, + "loss": 0.5938, + "step": 1959 + }, + { + "epoch": 0.6272, + "grad_norm": 0.34214147323845223, + "learning_rate": 6.446497393200985e-05, + "loss": 0.6036, + "step": 1960 + }, + { + "epoch": 0.62752, + "grad_norm": 0.37549074299350527, + "learning_rate": 6.436810913684963e-05, + "loss": 0.6248, + "step": 1961 + }, + { + "epoch": 0.62784, + "grad_norm": 0.3768302593197824, + "learning_rate": 6.427128262122877e-05, + "loss": 0.5912, + "step": 1962 + }, + { + "epoch": 0.62816, + "grad_norm": 0.357635947655499, + "learning_rate": 6.41744944891686e-05, + "loss": 0.5933, + "step": 1963 + }, + { + "epoch": 0.62848, + "grad_norm": 0.3902771304039085, + "learning_rate": 6.40777448446491e-05, + "loss": 0.5732, + "step": 1964 + }, + { + "epoch": 0.6288, + "grad_norm": 0.32872369588529426, + "learning_rate": 6.398103379160894e-05, + "loss": 0.604, + "step": 1965 + }, + { + "epoch": 0.62912, + "grad_norm": 0.344510616958961, + "learning_rate": 6.38843614339454e-05, + "loss": 0.6127, + "step": 1966 + }, + { + "epoch": 0.62944, + "grad_norm": 0.33791665167147994, + "learning_rate": 6.378772787551406e-05, + "loss": 0.6079, + "step": 1967 + }, + { + "epoch": 0.62976, + "grad_norm": 0.3644273641741268, + "learning_rate": 6.369113322012898e-05, + "loss": 0.6121, + "step": 1968 + }, + { + "epoch": 0.63008, + "grad_norm": 0.3471073608458236, + "learning_rate": 6.359457757156225e-05, + "loss": 0.6552, + "step": 1969 + }, + { + "epoch": 0.6304, + "grad_norm": 0.33374082385825, + "learning_rate": 6.349806103354417e-05, + "loss": 0.5831, + "step": 1970 + }, + { + "epoch": 0.63072, + "grad_norm": 0.334394370938585, + "learning_rate": 6.340158370976306e-05, + "loss": 0.6162, + "step": 1971 + }, + { + "epoch": 0.63104, + "grad_norm": 0.3800632214695881, + "learning_rate": 6.330514570386495e-05, + "loss": 0.5994, + "step": 1972 + }, + { + "epoch": 0.63136, + "grad_norm": 0.3599124750253699, + "learning_rate": 6.320874711945382e-05, + "loss": 0.643, + "step": 1973 + }, + { + "epoch": 0.63168, + "grad_norm": 0.3223482280689595, + "learning_rate": 6.311238806009112e-05, + "loss": 0.5583, + "step": 1974 + }, + { + "epoch": 0.632, + "grad_norm": 0.3569872180637835, + "learning_rate": 6.301606862929599e-05, + "loss": 0.6399, + "step": 1975 + }, + { + "epoch": 0.63232, + "grad_norm": 0.4037901641847413, + "learning_rate": 6.291978893054493e-05, + "loss": 0.5964, + "step": 1976 + }, + { + "epoch": 0.63264, + "grad_norm": 0.3777917756077199, + "learning_rate": 6.28235490672717e-05, + "loss": 0.6511, + "step": 1977 + }, + { + "epoch": 0.63296, + "grad_norm": 0.3351399149280321, + "learning_rate": 6.272734914286738e-05, + "loss": 0.6158, + "step": 1978 + }, + { + "epoch": 0.63328, + "grad_norm": 0.42345018526463113, + "learning_rate": 6.26311892606801e-05, + "loss": 0.6283, + "step": 1979 + }, + { + "epoch": 0.6336, + "grad_norm": 0.35112881188089906, + "learning_rate": 6.253506952401486e-05, + "loss": 0.5964, + "step": 1980 + }, + { + "epoch": 0.63392, + "grad_norm": 0.3509198575726522, + "learning_rate": 6.243899003613378e-05, + "loss": 0.6295, + "step": 1981 + }, + { + "epoch": 0.63424, + "grad_norm": 0.34702519733189763, + "learning_rate": 6.234295090025543e-05, + "loss": 0.6483, + "step": 1982 + }, + { + "epoch": 0.63456, + "grad_norm": 0.3365075547843539, + "learning_rate": 6.224695221955528e-05, + "loss": 0.5985, + "step": 1983 + }, + { + "epoch": 0.63488, + "grad_norm": 0.3427907029581174, + "learning_rate": 6.215099409716527e-05, + "loss": 0.5935, + "step": 1984 + }, + { + "epoch": 0.6352, + "grad_norm": 0.3898604857758913, + "learning_rate": 6.205507663617369e-05, + "loss": 0.594, + "step": 1985 + }, + { + "epoch": 0.63552, + "grad_norm": 0.36899032854667785, + "learning_rate": 6.195919993962526e-05, + "loss": 0.59, + "step": 1986 + }, + { + "epoch": 0.63584, + "grad_norm": 0.3647593849049401, + "learning_rate": 6.186336411052076e-05, + "loss": 0.6184, + "step": 1987 + }, + { + "epoch": 0.63616, + "grad_norm": 0.36788914135648504, + "learning_rate": 6.176756925181724e-05, + "loss": 0.5693, + "step": 1988 + }, + { + "epoch": 0.63648, + "grad_norm": 0.35888830905427066, + "learning_rate": 6.167181546642765e-05, + "loss": 0.5862, + "step": 1989 + }, + { + "epoch": 0.6368, + "grad_norm": 0.3916968509376385, + "learning_rate": 6.157610285722075e-05, + "loss": 0.6235, + "step": 1990 + }, + { + "epoch": 0.63712, + "grad_norm": 0.36725234260748274, + "learning_rate": 6.148043152702123e-05, + "loss": 0.5974, + "step": 1991 + }, + { + "epoch": 0.63744, + "grad_norm": 0.3528027520142461, + "learning_rate": 6.138480157860921e-05, + "loss": 0.5824, + "step": 1992 + }, + { + "epoch": 0.63776, + "grad_norm": 0.33292930868441833, + "learning_rate": 6.12892131147206e-05, + "loss": 0.5512, + "step": 1993 + }, + { + "epoch": 0.63808, + "grad_norm": 0.34146409838349157, + "learning_rate": 6.119366623804657e-05, + "loss": 0.5762, + "step": 1994 + }, + { + "epoch": 0.6384, + "grad_norm": 0.37487891872563855, + "learning_rate": 6.109816105123362e-05, + "loss": 0.6288, + "step": 1995 + }, + { + "epoch": 0.63872, + "grad_norm": 0.34305373560782226, + "learning_rate": 6.1002697656883534e-05, + "loss": 0.6155, + "step": 1996 + }, + { + "epoch": 0.63904, + "grad_norm": 0.35759278621831475, + "learning_rate": 6.090727615755323e-05, + "loss": 0.6356, + "step": 1997 + }, + { + "epoch": 0.63936, + "grad_norm": 0.34842299986583225, + "learning_rate": 6.0811896655754465e-05, + "loss": 0.5795, + "step": 1998 + }, + { + "epoch": 0.63968, + "grad_norm": 0.3542660813580204, + "learning_rate": 6.0716559253954066e-05, + "loss": 0.5973, + "step": 1999 + }, + { + "epoch": 0.64, + "grad_norm": 0.37031274294214567, + "learning_rate": 6.0621264054573435e-05, + "loss": 0.6216, + "step": 2000 + }, + { + "epoch": 0.64032, + "grad_norm": 0.354375054376305, + "learning_rate": 6.052601115998878e-05, + "loss": 0.612, + "step": 2001 + }, + { + "epoch": 0.64064, + "grad_norm": 0.34356793991137885, + "learning_rate": 6.0430800672530876e-05, + "loss": 0.5865, + "step": 2002 + }, + { + "epoch": 0.64096, + "grad_norm": 0.34524776507829036, + "learning_rate": 6.0335632694484786e-05, + "loss": 0.566, + "step": 2003 + }, + { + "epoch": 0.64128, + "grad_norm": 0.3490977700367447, + "learning_rate": 6.024050732809008e-05, + "loss": 0.5903, + "step": 2004 + }, + { + "epoch": 0.6416, + "grad_norm": 0.323204570629607, + "learning_rate": 6.0145424675540394e-05, + "loss": 0.5869, + "step": 2005 + }, + { + "epoch": 0.64192, + "grad_norm": 0.3839359250539005, + "learning_rate": 6.005038483898362e-05, + "loss": 0.6245, + "step": 2006 + }, + { + "epoch": 0.64224, + "grad_norm": 0.36390599085911796, + "learning_rate": 5.9955387920521556e-05, + "loss": 0.6403, + "step": 2007 + }, + { + "epoch": 0.64256, + "grad_norm": 0.3506411929622666, + "learning_rate": 5.986043402220991e-05, + "loss": 0.6148, + "step": 2008 + }, + { + "epoch": 0.64288, + "grad_norm": 0.36104334103629493, + "learning_rate": 5.97655232460582e-05, + "loss": 0.5755, + "step": 2009 + }, + { + "epoch": 0.6432, + "grad_norm": 0.36005742059183676, + "learning_rate": 5.967065569402963e-05, + "loss": 0.6218, + "step": 2010 + }, + { + "epoch": 0.64352, + "grad_norm": 0.3401721909877025, + "learning_rate": 5.957583146804089e-05, + "loss": 0.5984, + "step": 2011 + }, + { + "epoch": 0.64384, + "grad_norm": 0.3494695725479328, + "learning_rate": 5.948105066996221e-05, + "loss": 0.6064, + "step": 2012 + }, + { + "epoch": 0.64416, + "grad_norm": 0.3554958485339228, + "learning_rate": 5.938631340161711e-05, + "loss": 0.6191, + "step": 2013 + }, + { + "epoch": 0.64448, + "grad_norm": 0.37812082673335734, + "learning_rate": 5.929161976478237e-05, + "loss": 0.6146, + "step": 2014 + }, + { + "epoch": 0.6448, + "grad_norm": 0.3504930724082363, + "learning_rate": 5.919696986118792e-05, + "loss": 0.6244, + "step": 2015 + }, + { + "epoch": 0.64512, + "grad_norm": 0.34501863572185304, + "learning_rate": 5.910236379251664e-05, + "loss": 0.6019, + "step": 2016 + }, + { + "epoch": 0.64544, + "grad_norm": 0.347046718981164, + "learning_rate": 5.9007801660404406e-05, + "loss": 0.6037, + "step": 2017 + }, + { + "epoch": 0.64576, + "grad_norm": 0.334516084814039, + "learning_rate": 5.891328356643979e-05, + "loss": 0.5849, + "step": 2018 + }, + { + "epoch": 0.64608, + "grad_norm": 0.3486600052010709, + "learning_rate": 5.881880961216415e-05, + "loss": 0.5822, + "step": 2019 + }, + { + "epoch": 0.6464, + "grad_norm": 0.37517557288961817, + "learning_rate": 5.872437989907136e-05, + "loss": 0.6097, + "step": 2020 + }, + { + "epoch": 0.64672, + "grad_norm": 0.35633671254185595, + "learning_rate": 5.86299945286078e-05, + "loss": 0.6159, + "step": 2021 + }, + { + "epoch": 0.64704, + "grad_norm": 0.33265969768336545, + "learning_rate": 5.85356536021722e-05, + "loss": 0.5683, + "step": 2022 + }, + { + "epoch": 0.64736, + "grad_norm": 0.3455329694067352, + "learning_rate": 5.844135722111555e-05, + "loss": 0.5739, + "step": 2023 + }, + { + "epoch": 0.64768, + "grad_norm": 0.35329932653586066, + "learning_rate": 5.8347105486740906e-05, + "loss": 0.5872, + "step": 2024 + }, + { + "epoch": 0.648, + "grad_norm": 0.36760192003971637, + "learning_rate": 5.8252898500303575e-05, + "loss": 0.6332, + "step": 2025 + }, + { + "epoch": 0.64832, + "grad_norm": 0.3529618057725791, + "learning_rate": 5.8158736363010526e-05, + "loss": 0.5883, + "step": 2026 + }, + { + "epoch": 0.64864, + "grad_norm": 0.3567232711947976, + "learning_rate": 5.806461917602074e-05, + "loss": 0.5695, + "step": 2027 + }, + { + "epoch": 0.64896, + "grad_norm": 0.32973668181108357, + "learning_rate": 5.7970547040444826e-05, + "loss": 0.6436, + "step": 2028 + }, + { + "epoch": 0.64928, + "grad_norm": 0.36989591043910014, + "learning_rate": 5.787652005734494e-05, + "loss": 0.6, + "step": 2029 + }, + { + "epoch": 0.6496, + "grad_norm": 0.3512108851293808, + "learning_rate": 5.7782538327734884e-05, + "loss": 0.5723, + "step": 2030 + }, + { + "epoch": 0.64992, + "grad_norm": 0.36034410354167296, + "learning_rate": 5.768860195257968e-05, + "loss": 0.6217, + "step": 2031 + }, + { + "epoch": 0.65024, + "grad_norm": 0.3634043706264434, + "learning_rate": 5.7594711032795736e-05, + "loss": 0.6413, + "step": 2032 + }, + { + "epoch": 0.65056, + "grad_norm": 0.34696679449036355, + "learning_rate": 5.7500865669250626e-05, + "loss": 0.6035, + "step": 2033 + }, + { + "epoch": 0.65088, + "grad_norm": 0.36921144484327034, + "learning_rate": 5.7407065962762875e-05, + "loss": 0.638, + "step": 2034 + }, + { + "epoch": 0.6512, + "grad_norm": 0.3604245985171455, + "learning_rate": 5.731331201410211e-05, + "loss": 0.6358, + "step": 2035 + }, + { + "epoch": 0.65152, + "grad_norm": 0.33336876492518613, + "learning_rate": 5.721960392398864e-05, + "loss": 0.5855, + "step": 2036 + }, + { + "epoch": 0.65184, + "grad_norm": 0.35765372253095246, + "learning_rate": 5.712594179309363e-05, + "loss": 0.5929, + "step": 2037 + }, + { + "epoch": 0.65216, + "grad_norm": 0.3225003554617414, + "learning_rate": 5.703232572203887e-05, + "loss": 0.6205, + "step": 2038 + }, + { + "epoch": 0.65248, + "grad_norm": 0.3435914377123781, + "learning_rate": 5.693875581139656e-05, + "loss": 0.5663, + "step": 2039 + }, + { + "epoch": 0.6528, + "grad_norm": 0.3528038112643403, + "learning_rate": 5.68452321616894e-05, + "loss": 0.695, + "step": 2040 + }, + { + "epoch": 0.65312, + "grad_norm": 0.34626246994151005, + "learning_rate": 5.675175487339042e-05, + "loss": 0.6674, + "step": 2041 + }, + { + "epoch": 0.65344, + "grad_norm": 0.35168272875291506, + "learning_rate": 5.66583240469227e-05, + "loss": 0.6012, + "step": 2042 + }, + { + "epoch": 0.65376, + "grad_norm": 0.3549995660404429, + "learning_rate": 5.65649397826596e-05, + "loss": 0.6469, + "step": 2043 + }, + { + "epoch": 0.65408, + "grad_norm": 0.34065114536047747, + "learning_rate": 5.647160218092424e-05, + "loss": 0.5899, + "step": 2044 + }, + { + "epoch": 0.6544, + "grad_norm": 0.343485285841103, + "learning_rate": 5.637831134198982e-05, + "loss": 0.6092, + "step": 2045 + }, + { + "epoch": 0.65472, + "grad_norm": 0.3442168880757106, + "learning_rate": 5.6285067366079214e-05, + "loss": 0.6046, + "step": 2046 + }, + { + "epoch": 0.65504, + "grad_norm": 0.314384420811312, + "learning_rate": 5.6191870353364864e-05, + "loss": 0.5792, + "step": 2047 + }, + { + "epoch": 0.65536, + "grad_norm": 0.3546328917788084, + "learning_rate": 5.609872040396894e-05, + "loss": 0.5736, + "step": 2048 + }, + { + "epoch": 0.65568, + "grad_norm": 0.3309075071145526, + "learning_rate": 5.600561761796286e-05, + "loss": 0.5955, + "step": 2049 + }, + { + "epoch": 0.656, + "grad_norm": 0.3464703362931646, + "learning_rate": 5.5912562095367514e-05, + "loss": 0.6404, + "step": 2050 + }, + { + "epoch": 0.65632, + "grad_norm": 0.37007410033329685, + "learning_rate": 5.5819553936153e-05, + "loss": 0.5883, + "step": 2051 + }, + { + "epoch": 0.65664, + "grad_norm": 0.35701392801641474, + "learning_rate": 5.5726593240238436e-05, + "loss": 0.5559, + "step": 2052 + }, + { + "epoch": 0.65696, + "grad_norm": 0.34402146381815724, + "learning_rate": 5.563368010749208e-05, + "loss": 0.5914, + "step": 2053 + }, + { + "epoch": 0.65728, + "grad_norm": 0.332056145194669, + "learning_rate": 5.554081463773098e-05, + "loss": 0.5983, + "step": 2054 + }, + { + "epoch": 0.6576, + "grad_norm": 0.34100466230285387, + "learning_rate": 5.544799693072107e-05, + "loss": 0.5451, + "step": 2055 + }, + { + "epoch": 0.65792, + "grad_norm": 0.3560960301887002, + "learning_rate": 5.535522708617686e-05, + "loss": 0.6306, + "step": 2056 + }, + { + "epoch": 0.65824, + "grad_norm": 0.3403216703658889, + "learning_rate": 5.5262505203761624e-05, + "loss": 0.6284, + "step": 2057 + }, + { + "epoch": 0.65856, + "grad_norm": 0.3366642146627937, + "learning_rate": 5.516983138308689e-05, + "loss": 0.5765, + "step": 2058 + }, + { + "epoch": 0.65888, + "grad_norm": 0.3502813843702922, + "learning_rate": 5.5077205723712745e-05, + "loss": 0.614, + "step": 2059 + }, + { + "epoch": 0.6592, + "grad_norm": 0.36374933081863037, + "learning_rate": 5.498462832514737e-05, + "loss": 0.6305, + "step": 2060 + }, + { + "epoch": 0.65952, + "grad_norm": 0.36312266218067474, + "learning_rate": 5.4892099286847274e-05, + "loss": 0.5706, + "step": 2061 + }, + { + "epoch": 0.65984, + "grad_norm": 0.3431109901545333, + "learning_rate": 5.479961870821683e-05, + "loss": 0.5773, + "step": 2062 + }, + { + "epoch": 0.66016, + "grad_norm": 0.346553441248706, + "learning_rate": 5.470718668860848e-05, + "loss": 0.5697, + "step": 2063 + }, + { + "epoch": 0.66048, + "grad_norm": 0.3350030065506168, + "learning_rate": 5.461480332732249e-05, + "loss": 0.6016, + "step": 2064 + }, + { + "epoch": 0.6608, + "grad_norm": 0.34105759765938687, + "learning_rate": 5.4522468723606766e-05, + "loss": 0.5919, + "step": 2065 + }, + { + "epoch": 0.66112, + "grad_norm": 0.34790499613377257, + "learning_rate": 5.4430182976656944e-05, + "loss": 0.6302, + "step": 2066 + }, + { + "epoch": 0.66144, + "grad_norm": 0.3197844662148609, + "learning_rate": 5.433794618561605e-05, + "loss": 0.5738, + "step": 2067 + }, + { + "epoch": 0.66176, + "grad_norm": 0.3824322801478392, + "learning_rate": 5.424575844957462e-05, + "loss": 0.5626, + "step": 2068 + }, + { + "epoch": 0.66208, + "grad_norm": 0.35418669909269734, + "learning_rate": 5.41536198675705e-05, + "loss": 0.5816, + "step": 2069 + }, + { + "epoch": 0.6624, + "grad_norm": 0.35688396732221106, + "learning_rate": 5.40615305385886e-05, + "loss": 0.6281, + "step": 2070 + }, + { + "epoch": 0.66272, + "grad_norm": 0.3121624609259152, + "learning_rate": 5.396949056156104e-05, + "loss": 0.5537, + "step": 2071 + }, + { + "epoch": 0.66304, + "grad_norm": 0.3397661727705769, + "learning_rate": 5.387750003536691e-05, + "loss": 0.5835, + "step": 2072 + }, + { + "epoch": 0.66336, + "grad_norm": 0.3582237880936574, + "learning_rate": 5.378555905883209e-05, + "loss": 0.6073, + "step": 2073 + }, + { + "epoch": 0.66368, + "grad_norm": 0.35398878554064056, + "learning_rate": 5.369366773072935e-05, + "loss": 0.5629, + "step": 2074 + }, + { + "epoch": 0.664, + "grad_norm": 0.41663713321345, + "learning_rate": 5.3601826149777966e-05, + "loss": 0.6176, + "step": 2075 + }, + { + "epoch": 0.66432, + "grad_norm": 0.45132563293139605, + "learning_rate": 5.3510034414643926e-05, + "loss": 0.6003, + "step": 2076 + }, + { + "epoch": 0.66464, + "grad_norm": 0.3487035577840251, + "learning_rate": 5.341829262393962e-05, + "loss": 0.614, + "step": 2077 + }, + { + "epoch": 0.66496, + "grad_norm": 0.346383983874882, + "learning_rate": 5.33266008762237e-05, + "loss": 0.5896, + "step": 2078 + }, + { + "epoch": 0.66528, + "grad_norm": 0.3498860534467345, + "learning_rate": 5.323495927000121e-05, + "loss": 0.5743, + "step": 2079 + }, + { + "epoch": 0.6656, + "grad_norm": 0.32260245557893713, + "learning_rate": 5.314336790372314e-05, + "loss": 0.6027, + "step": 2080 + }, + { + "epoch": 0.66592, + "grad_norm": 0.3525826953611603, + "learning_rate": 5.305182687578669e-05, + "loss": 0.6021, + "step": 2081 + }, + { + "epoch": 0.66624, + "grad_norm": 0.34509038475476195, + "learning_rate": 5.296033628453484e-05, + "loss": 0.5911, + "step": 2082 + }, + { + "epoch": 0.66656, + "grad_norm": 0.3451670125169388, + "learning_rate": 5.28688962282565e-05, + "loss": 0.6002, + "step": 2083 + }, + { + "epoch": 0.66688, + "grad_norm": 0.3438255109041832, + "learning_rate": 5.277750680518616e-05, + "loss": 0.6187, + "step": 2084 + }, + { + "epoch": 0.6672, + "grad_norm": 0.4060436564166929, + "learning_rate": 5.2686168113504065e-05, + "loss": 0.6208, + "step": 2085 + }, + { + "epoch": 0.66752, + "grad_norm": 0.3526091690642306, + "learning_rate": 5.259488025133581e-05, + "loss": 0.6036, + "step": 2086 + }, + { + "epoch": 0.66784, + "grad_norm": 0.3399029864410244, + "learning_rate": 5.2503643316752525e-05, + "loss": 0.5794, + "step": 2087 + }, + { + "epoch": 0.66816, + "grad_norm": 0.3793844190152782, + "learning_rate": 5.241245740777048e-05, + "loss": 0.5522, + "step": 2088 + }, + { + "epoch": 0.66848, + "grad_norm": 0.335563960813538, + "learning_rate": 5.2321322622351254e-05, + "loss": 0.5831, + "step": 2089 + }, + { + "epoch": 0.6688, + "grad_norm": 0.3644425321825224, + "learning_rate": 5.22302390584015e-05, + "loss": 0.6042, + "step": 2090 + }, + { + "epoch": 0.66912, + "grad_norm": 0.3451372037548831, + "learning_rate": 5.213920681377269e-05, + "loss": 0.6461, + "step": 2091 + }, + { + "epoch": 0.66944, + "grad_norm": 0.35458415976060564, + "learning_rate": 5.20482259862614e-05, + "loss": 0.5987, + "step": 2092 + }, + { + "epoch": 0.66976, + "grad_norm": 0.3343760722253775, + "learning_rate": 5.195729667360871e-05, + "loss": 0.5951, + "step": 2093 + }, + { + "epoch": 0.67008, + "grad_norm": 0.32043943548255815, + "learning_rate": 5.1866418973500575e-05, + "loss": 0.5356, + "step": 2094 + }, + { + "epoch": 0.6704, + "grad_norm": 0.35326541827919694, + "learning_rate": 5.1775592983567426e-05, + "loss": 0.6788, + "step": 2095 + }, + { + "epoch": 0.67072, + "grad_norm": 0.34535253136486044, + "learning_rate": 5.168481880138405e-05, + "loss": 0.5929, + "step": 2096 + }, + { + "epoch": 0.67104, + "grad_norm": 0.3491851474045318, + "learning_rate": 5.159409652446976e-05, + "loss": 0.6167, + "step": 2097 + }, + { + "epoch": 0.67136, + "grad_norm": 0.35441336377883786, + "learning_rate": 5.15034262502879e-05, + "loss": 0.5981, + "step": 2098 + }, + { + "epoch": 0.67168, + "grad_norm": 0.338926333841282, + "learning_rate": 5.1412808076246123e-05, + "loss": 0.5557, + "step": 2099 + }, + { + "epoch": 0.672, + "grad_norm": 0.3196784137658592, + "learning_rate": 5.132224209969605e-05, + "loss": 0.5487, + "step": 2100 + }, + { + "epoch": 0.67232, + "grad_norm": 0.34804145701447764, + "learning_rate": 5.123172841793315e-05, + "loss": 0.5714, + "step": 2101 + }, + { + "epoch": 0.67264, + "grad_norm": 0.37430255623358516, + "learning_rate": 5.1141267128196804e-05, + "loss": 0.6019, + "step": 2102 + }, + { + "epoch": 0.67296, + "grad_norm": 0.33396059911421894, + "learning_rate": 5.1050858327670136e-05, + "loss": 0.5526, + "step": 2103 + }, + { + "epoch": 0.67328, + "grad_norm": 0.34771881718642605, + "learning_rate": 5.096050211347975e-05, + "loss": 0.577, + "step": 2104 + }, + { + "epoch": 0.6736, + "grad_norm": 0.3362442533689697, + "learning_rate": 5.087019858269588e-05, + "loss": 0.617, + "step": 2105 + }, + { + "epoch": 0.67392, + "grad_norm": 0.3726473249121704, + "learning_rate": 5.0779947832332074e-05, + "loss": 0.6207, + "step": 2106 + }, + { + "epoch": 0.67424, + "grad_norm": 0.33868489525988194, + "learning_rate": 5.068974995934523e-05, + "loss": 0.5762, + "step": 2107 + }, + { + "epoch": 0.67456, + "grad_norm": 0.34236111482465825, + "learning_rate": 5.059960506063548e-05, + "loss": 0.6233, + "step": 2108 + }, + { + "epoch": 0.67488, + "grad_norm": 0.36793989757759804, + "learning_rate": 5.05095132330459e-05, + "loss": 0.6449, + "step": 2109 + }, + { + "epoch": 0.6752, + "grad_norm": 0.3468855130341154, + "learning_rate": 5.041947457336274e-05, + "loss": 0.5863, + "step": 2110 + }, + { + "epoch": 0.67552, + "grad_norm": 0.3463551120740465, + "learning_rate": 5.0329489178314974e-05, + "loss": 0.5534, + "step": 2111 + }, + { + "epoch": 0.67584, + "grad_norm": 0.41554523742853994, + "learning_rate": 5.023955714457439e-05, + "loss": 0.64, + "step": 2112 + }, + { + "epoch": 0.67616, + "grad_norm": 0.3500498558145719, + "learning_rate": 5.0149678568755545e-05, + "loss": 0.598, + "step": 2113 + }, + { + "epoch": 0.67648, + "grad_norm": 0.3515305974392414, + "learning_rate": 5.005985354741543e-05, + "loss": 0.5571, + "step": 2114 + }, + { + "epoch": 0.6768, + "grad_norm": 0.34353499770713497, + "learning_rate": 4.99700821770536e-05, + "loss": 0.568, + "step": 2115 + }, + { + "epoch": 0.67712, + "grad_norm": 0.3287209481117013, + "learning_rate": 4.988036455411197e-05, + "loss": 0.5798, + "step": 2116 + }, + { + "epoch": 0.67744, + "grad_norm": 0.32849266582937475, + "learning_rate": 4.9790700774974605e-05, + "loss": 0.5828, + "step": 2117 + }, + { + "epoch": 0.67776, + "grad_norm": 0.3595703565476983, + "learning_rate": 4.97010909359679e-05, + "loss": 0.6021, + "step": 2118 + }, + { + "epoch": 0.67808, + "grad_norm": 0.6601814705838766, + "learning_rate": 4.961153513336011e-05, + "loss": 0.6094, + "step": 2119 + }, + { + "epoch": 0.6784, + "grad_norm": 0.3317350540717859, + "learning_rate": 4.952203346336158e-05, + "loss": 0.542, + "step": 2120 + }, + { + "epoch": 0.67872, + "grad_norm": 0.3544282957512177, + "learning_rate": 4.9432586022124494e-05, + "loss": 0.6089, + "step": 2121 + }, + { + "epoch": 0.67904, + "grad_norm": 0.3654747649938245, + "learning_rate": 4.934319290574266e-05, + "loss": 0.5781, + "step": 2122 + }, + { + "epoch": 0.67936, + "grad_norm": 0.3118886344431738, + "learning_rate": 4.925385421025167e-05, + "loss": 0.5373, + "step": 2123 + }, + { + "epoch": 0.67968, + "grad_norm": 0.3451120407212237, + "learning_rate": 4.916457003162852e-05, + "loss": 0.6043, + "step": 2124 + }, + { + "epoch": 0.68, + "grad_norm": 0.35425978467866664, + "learning_rate": 4.907534046579173e-05, + "loss": 0.6149, + "step": 2125 + }, + { + "epoch": 0.68032, + "grad_norm": 0.3482058751281036, + "learning_rate": 4.898616560860116e-05, + "loss": 0.5905, + "step": 2126 + }, + { + "epoch": 0.68064, + "grad_norm": 0.3737144520124246, + "learning_rate": 4.88970455558578e-05, + "loss": 0.6353, + "step": 2127 + }, + { + "epoch": 0.68096, + "grad_norm": 0.4134031552056952, + "learning_rate": 4.880798040330383e-05, + "loss": 0.5448, + "step": 2128 + }, + { + "epoch": 0.68128, + "grad_norm": 0.378894264940317, + "learning_rate": 4.8718970246622496e-05, + "loss": 0.5831, + "step": 2129 + }, + { + "epoch": 0.6816, + "grad_norm": 0.3403382839635182, + "learning_rate": 4.8630015181437826e-05, + "loss": 0.592, + "step": 2130 + }, + { + "epoch": 0.68192, + "grad_norm": 0.32596992803050007, + "learning_rate": 4.8541115303314824e-05, + "loss": 0.6038, + "step": 2131 + }, + { + "epoch": 0.68224, + "grad_norm": 0.3523430624349347, + "learning_rate": 4.845227070775903e-05, + "loss": 0.5373, + "step": 2132 + }, + { + "epoch": 0.68256, + "grad_norm": 0.36324300667545956, + "learning_rate": 4.8363481490216754e-05, + "loss": 0.6118, + "step": 2133 + }, + { + "epoch": 0.68288, + "grad_norm": 0.35990878032248785, + "learning_rate": 4.827474774607478e-05, + "loss": 0.5571, + "step": 2134 + }, + { + "epoch": 0.6832, + "grad_norm": 0.35246368836583347, + "learning_rate": 4.8186069570660175e-05, + "loss": 0.6019, + "step": 2135 + }, + { + "epoch": 0.68352, + "grad_norm": 0.33820762145884403, + "learning_rate": 4.809744705924049e-05, + "loss": 0.5542, + "step": 2136 + }, + { + "epoch": 0.68384, + "grad_norm": 0.36396619324937507, + "learning_rate": 4.8008880307023286e-05, + "loss": 0.6071, + "step": 2137 + }, + { + "epoch": 0.68416, + "grad_norm": 0.34297769819882357, + "learning_rate": 4.792036940915642e-05, + "loss": 0.6004, + "step": 2138 + }, + { + "epoch": 0.68448, + "grad_norm": 0.3446707277874218, + "learning_rate": 4.783191446072757e-05, + "loss": 0.5795, + "step": 2139 + }, + { + "epoch": 0.6848, + "grad_norm": 0.34247701064963976, + "learning_rate": 4.774351555676443e-05, + "loss": 0.5997, + "step": 2140 + }, + { + "epoch": 0.68512, + "grad_norm": 0.34031480373250517, + "learning_rate": 4.7655172792234416e-05, + "loss": 0.5971, + "step": 2141 + }, + { + "epoch": 0.68544, + "grad_norm": 0.3418328397157584, + "learning_rate": 4.756688626204462e-05, + "loss": 0.5388, + "step": 2142 + }, + { + "epoch": 0.68576, + "grad_norm": 0.33830150878911225, + "learning_rate": 4.7478656061041785e-05, + "loss": 0.6027, + "step": 2143 + }, + { + "epoch": 0.68608, + "grad_norm": 0.3290347537066264, + "learning_rate": 4.7390482284012137e-05, + "loss": 0.5667, + "step": 2144 + }, + { + "epoch": 0.6864, + "grad_norm": 0.322758433693767, + "learning_rate": 4.7302365025681206e-05, + "loss": 0.5579, + "step": 2145 + }, + { + "epoch": 0.68672, + "grad_norm": 0.32916432132305706, + "learning_rate": 4.7214304380713883e-05, + "loss": 0.5601, + "step": 2146 + }, + { + "epoch": 0.68704, + "grad_norm": 0.35069694685103214, + "learning_rate": 4.7126300443714235e-05, + "loss": 0.6728, + "step": 2147 + }, + { + "epoch": 0.68736, + "grad_norm": 0.3372776585334965, + "learning_rate": 4.703835330922531e-05, + "loss": 0.5553, + "step": 2148 + }, + { + "epoch": 0.68768, + "grad_norm": 0.35081736607009983, + "learning_rate": 4.69504630717293e-05, + "loss": 0.5835, + "step": 2149 + }, + { + "epoch": 0.688, + "grad_norm": 0.3421663522711319, + "learning_rate": 4.686262982564709e-05, + "loss": 0.5625, + "step": 2150 + }, + { + "epoch": 0.68832, + "grad_norm": 0.330158782268883, + "learning_rate": 4.677485366533846e-05, + "loss": 0.5669, + "step": 2151 + }, + { + "epoch": 0.68864, + "grad_norm": 0.3322203301560736, + "learning_rate": 4.6687134685101866e-05, + "loss": 0.6449, + "step": 2152 + }, + { + "epoch": 0.68896, + "grad_norm": 0.3379636582895683, + "learning_rate": 4.6599472979174244e-05, + "loss": 0.5857, + "step": 2153 + }, + { + "epoch": 0.68928, + "grad_norm": 0.3717812856756649, + "learning_rate": 4.6511868641731104e-05, + "loss": 0.568, + "step": 2154 + }, + { + "epoch": 0.6896, + "grad_norm": 0.3515730424662826, + "learning_rate": 4.6424321766886215e-05, + "loss": 0.591, + "step": 2155 + }, + { + "epoch": 0.68992, + "grad_norm": 0.37438832198048366, + "learning_rate": 4.633683244869172e-05, + "loss": 0.6282, + "step": 2156 + }, + { + "epoch": 0.69024, + "grad_norm": 0.33930012347487964, + "learning_rate": 4.624940078113789e-05, + "loss": 0.567, + "step": 2157 + }, + { + "epoch": 0.69056, + "grad_norm": 0.35189737408355254, + "learning_rate": 4.616202685815299e-05, + "loss": 0.583, + "step": 2158 + }, + { + "epoch": 0.69088, + "grad_norm": 0.3600147955581338, + "learning_rate": 4.607471077360337e-05, + "loss": 0.6119, + "step": 2159 + }, + { + "epoch": 0.6912, + "grad_norm": 0.34892846844248593, + "learning_rate": 4.59874526212932e-05, + "loss": 0.5768, + "step": 2160 + }, + { + "epoch": 0.69152, + "grad_norm": 0.33590850093963653, + "learning_rate": 4.590025249496436e-05, + "loss": 0.6, + "step": 2161 + }, + { + "epoch": 0.69184, + "grad_norm": 0.3273754220673678, + "learning_rate": 4.581311048829646e-05, + "loss": 0.5733, + "step": 2162 + }, + { + "epoch": 0.69216, + "grad_norm": 0.36343403171179073, + "learning_rate": 4.572602669490661e-05, + "loss": 0.5946, + "step": 2163 + }, + { + "epoch": 0.69248, + "grad_norm": 0.3528486629323474, + "learning_rate": 4.563900120834946e-05, + "loss": 0.6293, + "step": 2164 + }, + { + "epoch": 0.6928, + "grad_norm": 0.33337786487290694, + "learning_rate": 4.5552034122116936e-05, + "loss": 0.5556, + "step": 2165 + }, + { + "epoch": 0.69312, + "grad_norm": 0.34427510895914637, + "learning_rate": 4.5465125529638305e-05, + "loss": 0.6029, + "step": 2166 + }, + { + "epoch": 0.69344, + "grad_norm": 0.353709840783666, + "learning_rate": 4.53782755242799e-05, + "loss": 0.5748, + "step": 2167 + }, + { + "epoch": 0.69376, + "grad_norm": 0.35153681066549636, + "learning_rate": 4.5291484199345234e-05, + "loss": 0.5909, + "step": 2168 + }, + { + "epoch": 0.69408, + "grad_norm": 0.35000499888657527, + "learning_rate": 4.5204751648074636e-05, + "loss": 0.5761, + "step": 2169 + }, + { + "epoch": 0.6944, + "grad_norm": 0.3566588805630135, + "learning_rate": 4.5118077963645445e-05, + "loss": 0.5875, + "step": 2170 + }, + { + "epoch": 0.69472, + "grad_norm": 0.32566392052613685, + "learning_rate": 4.503146323917162e-05, + "loss": 0.5598, + "step": 2171 + }, + { + "epoch": 0.69504, + "grad_norm": 0.34817018348189954, + "learning_rate": 4.49449075677039e-05, + "loss": 0.606, + "step": 2172 + }, + { + "epoch": 0.69536, + "grad_norm": 0.3493169142378676, + "learning_rate": 4.4858411042229465e-05, + "loss": 0.5809, + "step": 2173 + }, + { + "epoch": 0.69568, + "grad_norm": 0.34416584714624077, + "learning_rate": 4.477197375567206e-05, + "loss": 0.5982, + "step": 2174 + }, + { + "epoch": 0.696, + "grad_norm": 0.3236725756890863, + "learning_rate": 4.468559580089175e-05, + "loss": 0.5704, + "step": 2175 + }, + { + "epoch": 0.69632, + "grad_norm": 0.3343878917653586, + "learning_rate": 4.4599277270684824e-05, + "loss": 0.5765, + "step": 2176 + }, + { + "epoch": 0.69664, + "grad_norm": 0.3372671578789749, + "learning_rate": 4.451301825778376e-05, + "loss": 0.5752, + "step": 2177 + }, + { + "epoch": 0.69696, + "grad_norm": 0.3148645999782816, + "learning_rate": 4.4426818854857155e-05, + "loss": 0.5564, + "step": 2178 + }, + { + "epoch": 0.69728, + "grad_norm": 0.3396740773552133, + "learning_rate": 4.4340679154509424e-05, + "loss": 0.5569, + "step": 2179 + }, + { + "epoch": 0.6976, + "grad_norm": 0.3415364337182879, + "learning_rate": 4.4254599249281016e-05, + "loss": 0.5836, + "step": 2180 + }, + { + "epoch": 0.69792, + "grad_norm": 0.3946165939020095, + "learning_rate": 4.416857923164798e-05, + "loss": 0.6023, + "step": 2181 + }, + { + "epoch": 0.69824, + "grad_norm": 0.37093013746780495, + "learning_rate": 4.4082619194022115e-05, + "loss": 0.648, + "step": 2182 + }, + { + "epoch": 0.69856, + "grad_norm": 0.3450062761631485, + "learning_rate": 4.3996719228750826e-05, + "loss": 0.6124, + "step": 2183 + }, + { + "epoch": 0.69888, + "grad_norm": 0.34441174318580886, + "learning_rate": 4.391087942811685e-05, + "loss": 0.5923, + "step": 2184 + }, + { + "epoch": 0.6992, + "grad_norm": 0.36483531970324995, + "learning_rate": 4.382509988433844e-05, + "loss": 0.5991, + "step": 2185 + }, + { + "epoch": 0.69952, + "grad_norm": 0.3427550850478959, + "learning_rate": 4.3739380689568955e-05, + "loss": 0.5472, + "step": 2186 + }, + { + "epoch": 0.69984, + "grad_norm": 0.35052673086789415, + "learning_rate": 4.365372193589704e-05, + "loss": 0.587, + "step": 2187 + }, + { + "epoch": 0.70016, + "grad_norm": 0.3411611014102783, + "learning_rate": 4.356812371534643e-05, + "loss": 0.5848, + "step": 2188 + }, + { + "epoch": 0.70048, + "grad_norm": 0.32664012386565394, + "learning_rate": 4.348258611987568e-05, + "loss": 0.5444, + "step": 2189 + }, + { + "epoch": 0.7008, + "grad_norm": 0.3227412475218645, + "learning_rate": 4.339710924137835e-05, + "loss": 0.5493, + "step": 2190 + }, + { + "epoch": 0.70112, + "grad_norm": 0.34809704278662396, + "learning_rate": 4.3311693171682765e-05, + "loss": 0.578, + "step": 2191 + }, + { + "epoch": 0.70144, + "grad_norm": 0.3614951388063878, + "learning_rate": 4.3226338002551806e-05, + "loss": 0.6186, + "step": 2192 + }, + { + "epoch": 0.70176, + "grad_norm": 0.33865981322451133, + "learning_rate": 4.314104382568308e-05, + "loss": 0.5779, + "step": 2193 + }, + { + "epoch": 0.70208, + "grad_norm": 0.34010188618196313, + "learning_rate": 4.305581073270858e-05, + "loss": 0.5643, + "step": 2194 + }, + { + "epoch": 0.7024, + "grad_norm": 0.3449882766085503, + "learning_rate": 4.297063881519463e-05, + "loss": 0.5997, + "step": 2195 + }, + { + "epoch": 0.70272, + "grad_norm": 0.3880839796316396, + "learning_rate": 4.2885528164642e-05, + "loss": 0.5456, + "step": 2196 + }, + { + "epoch": 0.70304, + "grad_norm": 0.3452914945529851, + "learning_rate": 4.280047887248544e-05, + "loss": 0.5778, + "step": 2197 + }, + { + "epoch": 0.70336, + "grad_norm": 0.34027732818941414, + "learning_rate": 4.271549103009396e-05, + "loss": 0.5757, + "step": 2198 + }, + { + "epoch": 0.70368, + "grad_norm": 0.3716579564549702, + "learning_rate": 4.2630564728770396e-05, + "loss": 0.5908, + "step": 2199 + }, + { + "epoch": 0.704, + "grad_norm": 0.3368628859692047, + "learning_rate": 4.25457000597516e-05, + "loss": 0.5946, + "step": 2200 + }, + { + "epoch": 0.70432, + "grad_norm": 0.34163950275425115, + "learning_rate": 4.2460897114208173e-05, + "loss": 0.6224, + "step": 2201 + }, + { + "epoch": 0.70464, + "grad_norm": 0.3504970826067996, + "learning_rate": 4.237615598324435e-05, + "loss": 0.5534, + "step": 2202 + }, + { + "epoch": 0.70496, + "grad_norm": 0.3572226016436674, + "learning_rate": 4.229147675789801e-05, + "loss": 0.58, + "step": 2203 + }, + { + "epoch": 0.70528, + "grad_norm": 0.34691137822027407, + "learning_rate": 4.220685952914057e-05, + "loss": 0.5924, + "step": 2204 + }, + { + "epoch": 0.7056, + "grad_norm": 0.33739703850569674, + "learning_rate": 4.212230438787671e-05, + "loss": 0.5333, + "step": 2205 + }, + { + "epoch": 0.70592, + "grad_norm": 0.3409260034699299, + "learning_rate": 4.2037811424944574e-05, + "loss": 0.5875, + "step": 2206 + }, + { + "epoch": 0.70624, + "grad_norm": 0.3427695122891294, + "learning_rate": 4.1953380731115346e-05, + "loss": 0.5939, + "step": 2207 + }, + { + "epoch": 0.70656, + "grad_norm": 0.33617391846693423, + "learning_rate": 4.1869012397093424e-05, + "loss": 0.5588, + "step": 2208 + }, + { + "epoch": 0.70688, + "grad_norm": 0.3390340944840326, + "learning_rate": 4.1784706513516214e-05, + "loss": 0.5594, + "step": 2209 + }, + { + "epoch": 0.7072, + "grad_norm": 0.32951850359568313, + "learning_rate": 4.170046317095393e-05, + "loss": 0.6086, + "step": 2210 + }, + { + "epoch": 0.70752, + "grad_norm": 0.34260843039120775, + "learning_rate": 4.161628245990975e-05, + "loss": 0.5784, + "step": 2211 + }, + { + "epoch": 0.70784, + "grad_norm": 0.325536739055374, + "learning_rate": 4.153216447081939e-05, + "loss": 0.5634, + "step": 2212 + }, + { + "epoch": 0.70816, + "grad_norm": 0.3386819031421406, + "learning_rate": 4.144810929405132e-05, + "loss": 0.5809, + "step": 2213 + }, + { + "epoch": 0.70848, + "grad_norm": 0.34231274950439816, + "learning_rate": 4.136411701990652e-05, + "loss": 0.5819, + "step": 2214 + }, + { + "epoch": 0.7088, + "grad_norm": 0.3394653712190638, + "learning_rate": 4.12801877386183e-05, + "loss": 0.5837, + "step": 2215 + }, + { + "epoch": 0.70912, + "grad_norm": 0.35804269805922173, + "learning_rate": 4.119632154035241e-05, + "loss": 0.5952, + "step": 2216 + }, + { + "epoch": 0.70944, + "grad_norm": 0.3586488862051871, + "learning_rate": 4.111251851520671e-05, + "loss": 0.5905, + "step": 2217 + }, + { + "epoch": 0.70976, + "grad_norm": 0.3349094131065414, + "learning_rate": 4.102877875321129e-05, + "loss": 0.5509, + "step": 2218 + }, + { + "epoch": 0.71008, + "grad_norm": 0.36753271991452285, + "learning_rate": 4.09451023443283e-05, + "loss": 0.6154, + "step": 2219 + }, + { + "epoch": 0.7104, + "grad_norm": 0.37342710333992696, + "learning_rate": 4.086148937845167e-05, + "loss": 0.5742, + "step": 2220 + }, + { + "epoch": 0.71072, + "grad_norm": 0.36672738966642876, + "learning_rate": 4.0777939945407375e-05, + "loss": 0.5519, + "step": 2221 + }, + { + "epoch": 0.71104, + "grad_norm": 0.3599664623076852, + "learning_rate": 4.069445413495295e-05, + "loss": 0.6215, + "step": 2222 + }, + { + "epoch": 0.71136, + "grad_norm": 0.3749111463513934, + "learning_rate": 4.061103203677774e-05, + "loss": 0.6112, + "step": 2223 + }, + { + "epoch": 0.71168, + "grad_norm": 0.34354172664774496, + "learning_rate": 4.052767374050255e-05, + "loss": 0.5861, + "step": 2224 + }, + { + "epoch": 0.712, + "grad_norm": 0.3340136786370096, + "learning_rate": 4.04443793356796e-05, + "loss": 0.5552, + "step": 2225 + }, + { + "epoch": 0.71232, + "grad_norm": 0.3189664004815407, + "learning_rate": 4.03611489117926e-05, + "loss": 0.6129, + "step": 2226 + }, + { + "epoch": 0.71264, + "grad_norm": 0.3337073767832239, + "learning_rate": 4.027798255825648e-05, + "loss": 0.5344, + "step": 2227 + }, + { + "epoch": 0.71296, + "grad_norm": 0.332111140498855, + "learning_rate": 4.019488036441725e-05, + "loss": 0.5527, + "step": 2228 + }, + { + "epoch": 0.71328, + "grad_norm": 0.34633997300019187, + "learning_rate": 4.011184241955213e-05, + "loss": 0.5889, + "step": 2229 + }, + { + "epoch": 0.7136, + "grad_norm": 0.35046950969274326, + "learning_rate": 4.002886881286917e-05, + "loss": 0.5922, + "step": 2230 + }, + { + "epoch": 0.71392, + "grad_norm": 0.3418008484457355, + "learning_rate": 3.9945959633507435e-05, + "loss": 0.5939, + "step": 2231 + }, + { + "epoch": 0.71424, + "grad_norm": 0.3332284797781076, + "learning_rate": 3.986311497053673e-05, + "loss": 0.584, + "step": 2232 + }, + { + "epoch": 0.71456, + "grad_norm": 0.370652608795854, + "learning_rate": 3.97803349129575e-05, + "loss": 0.5694, + "step": 2233 + }, + { + "epoch": 0.71488, + "grad_norm": 0.353716723198577, + "learning_rate": 3.969761954970082e-05, + "loss": 0.6438, + "step": 2234 + }, + { + "epoch": 0.7152, + "grad_norm": 0.349414260786171, + "learning_rate": 3.961496896962832e-05, + "loss": 0.5817, + "step": 2235 + }, + { + "epoch": 0.71552, + "grad_norm": 0.327908196522291, + "learning_rate": 3.953238326153193e-05, + "loss": 0.573, + "step": 2236 + }, + { + "epoch": 0.71584, + "grad_norm": 0.34876399927957835, + "learning_rate": 3.9449862514133975e-05, + "loss": 0.6092, + "step": 2237 + }, + { + "epoch": 0.71616, + "grad_norm": 0.33970579566706016, + "learning_rate": 3.936740681608689e-05, + "loss": 0.5903, + "step": 2238 + }, + { + "epoch": 0.71648, + "grad_norm": 0.3467371703056036, + "learning_rate": 3.928501625597335e-05, + "loss": 0.5812, + "step": 2239 + }, + { + "epoch": 0.7168, + "grad_norm": 0.3618289787290079, + "learning_rate": 3.920269092230601e-05, + "loss": 0.5753, + "step": 2240 + }, + { + "epoch": 0.71712, + "grad_norm": 0.33453430651132277, + "learning_rate": 3.912043090352737e-05, + "loss": 0.5742, + "step": 2241 + }, + { + "epoch": 0.71744, + "grad_norm": 0.33237639957635434, + "learning_rate": 3.90382362880099e-05, + "loss": 0.5806, + "step": 2242 + }, + { + "epoch": 0.71776, + "grad_norm": 0.36920254349985016, + "learning_rate": 3.8956107164055656e-05, + "loss": 0.6318, + "step": 2243 + }, + { + "epoch": 0.71808, + "grad_norm": 0.33312389210356563, + "learning_rate": 3.887404361989646e-05, + "loss": 0.6117, + "step": 2244 + }, + { + "epoch": 0.7184, + "grad_norm": 0.3624401730891918, + "learning_rate": 3.8792045743693674e-05, + "loss": 0.6194, + "step": 2245 + }, + { + "epoch": 0.71872, + "grad_norm": 0.3812328820490707, + "learning_rate": 3.871011362353798e-05, + "loss": 0.6183, + "step": 2246 + }, + { + "epoch": 0.71904, + "grad_norm": 0.34115584079319067, + "learning_rate": 3.862824734744961e-05, + "loss": 0.5807, + "step": 2247 + }, + { + "epoch": 0.71936, + "grad_norm": 0.3277684077637942, + "learning_rate": 3.854644700337788e-05, + "loss": 0.5814, + "step": 2248 + }, + { + "epoch": 0.71968, + "grad_norm": 0.3576097084128524, + "learning_rate": 3.846471267920143e-05, + "loss": 0.5993, + "step": 2249 + }, + { + "epoch": 0.72, + "grad_norm": 0.3411302003205912, + "learning_rate": 3.8383044462727826e-05, + "loss": 0.6164, + "step": 2250 + }, + { + "epoch": 0.72032, + "grad_norm": 0.37426285149209554, + "learning_rate": 3.830144244169377e-05, + "loss": 0.6164, + "step": 2251 + }, + { + "epoch": 0.72064, + "grad_norm": 0.37352400249170115, + "learning_rate": 3.821990670376468e-05, + "loss": 0.598, + "step": 2252 + }, + { + "epoch": 0.72096, + "grad_norm": 0.33679111597156436, + "learning_rate": 3.813843733653494e-05, + "loss": 0.6082, + "step": 2253 + }, + { + "epoch": 0.72128, + "grad_norm": 0.3572988792391773, + "learning_rate": 3.805703442752747e-05, + "loss": 0.5666, + "step": 2254 + }, + { + "epoch": 0.7216, + "grad_norm": 0.3321327187613777, + "learning_rate": 3.797569806419394e-05, + "loss": 0.6219, + "step": 2255 + }, + { + "epoch": 0.72192, + "grad_norm": 0.34449603882458674, + "learning_rate": 3.78944283339144e-05, + "loss": 0.6428, + "step": 2256 + }, + { + "epoch": 0.72224, + "grad_norm": 0.36480563636854263, + "learning_rate": 3.7813225323997394e-05, + "loss": 0.6036, + "step": 2257 + }, + { + "epoch": 0.72256, + "grad_norm": 0.41505054186153895, + "learning_rate": 3.77320891216798e-05, + "loss": 0.5695, + "step": 2258 + }, + { + "epoch": 0.72288, + "grad_norm": 0.3376316800430768, + "learning_rate": 3.7651019814126654e-05, + "loss": 0.6033, + "step": 2259 + }, + { + "epoch": 0.7232, + "grad_norm": 0.35426071634403483, + "learning_rate": 3.757001748843121e-05, + "loss": 0.5696, + "step": 2260 + }, + { + "epoch": 0.72352, + "grad_norm": 0.35349342910908493, + "learning_rate": 3.748908223161466e-05, + "loss": 0.6225, + "step": 2261 + }, + { + "epoch": 0.72384, + "grad_norm": 0.3660884093482465, + "learning_rate": 3.7408214130626226e-05, + "loss": 0.609, + "step": 2262 + }, + { + "epoch": 0.72416, + "grad_norm": 0.36732128749850107, + "learning_rate": 3.732741327234301e-05, + "loss": 0.6025, + "step": 2263 + }, + { + "epoch": 0.72448, + "grad_norm": 0.32087709387809354, + "learning_rate": 3.7246679743569736e-05, + "loss": 0.5603, + "step": 2264 + }, + { + "epoch": 0.7248, + "grad_norm": 0.3918461582081944, + "learning_rate": 3.716601363103894e-05, + "loss": 0.5982, + "step": 2265 + }, + { + "epoch": 0.72512, + "grad_norm": 0.3382516923879686, + "learning_rate": 3.7085415021410706e-05, + "loss": 0.5844, + "step": 2266 + }, + { + "epoch": 0.72544, + "grad_norm": 0.3586032250212653, + "learning_rate": 3.7004884001272486e-05, + "loss": 0.5933, + "step": 2267 + }, + { + "epoch": 0.72576, + "grad_norm": 0.3578244878400089, + "learning_rate": 3.6924420657139304e-05, + "loss": 0.6197, + "step": 2268 + }, + { + "epoch": 0.72608, + "grad_norm": 0.3274635023599598, + "learning_rate": 3.684402507545329e-05, + "loss": 0.586, + "step": 2269 + }, + { + "epoch": 0.7264, + "grad_norm": 0.3620469443544567, + "learning_rate": 3.6763697342583905e-05, + "loss": 0.5895, + "step": 2270 + }, + { + "epoch": 0.72672, + "grad_norm": 0.34616989961039274, + "learning_rate": 3.66834375448277e-05, + "loss": 0.5577, + "step": 2271 + }, + { + "epoch": 0.72704, + "grad_norm": 0.3198872973298801, + "learning_rate": 3.660324576840819e-05, + "loss": 0.5394, + "step": 2272 + }, + { + "epoch": 0.72736, + "grad_norm": 0.32009968478428624, + "learning_rate": 3.652312209947589e-05, + "loss": 0.5802, + "step": 2273 + }, + { + "epoch": 0.72768, + "grad_norm": 0.36490577158852383, + "learning_rate": 3.644306662410805e-05, + "loss": 0.6055, + "step": 2274 + }, + { + "epoch": 0.728, + "grad_norm": 0.36374517482268487, + "learning_rate": 3.6363079428308776e-05, + "loss": 0.6059, + "step": 2275 + }, + { + "epoch": 0.72832, + "grad_norm": 0.34302612410676386, + "learning_rate": 3.628316059800868e-05, + "loss": 0.5782, + "step": 2276 + }, + { + "epoch": 0.72864, + "grad_norm": 0.3668817865393265, + "learning_rate": 3.62033102190651e-05, + "loss": 0.5935, + "step": 2277 + }, + { + "epoch": 0.72896, + "grad_norm": 0.362158383809741, + "learning_rate": 3.612352837726166e-05, + "loss": 0.5743, + "step": 2278 + }, + { + "epoch": 0.72928, + "grad_norm": 0.33252307212965837, + "learning_rate": 3.6043815158308516e-05, + "loss": 0.5968, + "step": 2279 + }, + { + "epoch": 0.7296, + "grad_norm": 0.34244771372003796, + "learning_rate": 3.5964170647841943e-05, + "loss": 0.5985, + "step": 2280 + }, + { + "epoch": 0.72992, + "grad_norm": 0.34611776184090504, + "learning_rate": 3.588459493142456e-05, + "loss": 0.5963, + "step": 2281 + }, + { + "epoch": 0.73024, + "grad_norm": 0.3562953662835525, + "learning_rate": 3.580508809454494e-05, + "loss": 0.5682, + "step": 2282 + }, + { + "epoch": 0.73056, + "grad_norm": 0.3277345038138114, + "learning_rate": 3.572565022261775e-05, + "loss": 0.5756, + "step": 2283 + }, + { + "epoch": 0.73088, + "grad_norm": 0.32874532005793394, + "learning_rate": 3.5646281400983574e-05, + "loss": 0.5859, + "step": 2284 + }, + { + "epoch": 0.7312, + "grad_norm": 0.3401882822854291, + "learning_rate": 3.556698171490871e-05, + "loss": 0.5899, + "step": 2285 + }, + { + "epoch": 0.73152, + "grad_norm": 0.405455404993615, + "learning_rate": 3.548775124958532e-05, + "loss": 0.5457, + "step": 2286 + }, + { + "epoch": 0.73184, + "grad_norm": 0.391038851640563, + "learning_rate": 3.540859009013108e-05, + "loss": 0.6236, + "step": 2287 + }, + { + "epoch": 0.73216, + "grad_norm": 0.3603780442231415, + "learning_rate": 3.532949832158928e-05, + "loss": 0.5599, + "step": 2288 + }, + { + "epoch": 0.73248, + "grad_norm": 0.35753046410297185, + "learning_rate": 3.5250476028928715e-05, + "loss": 0.5793, + "step": 2289 + }, + { + "epoch": 0.7328, + "grad_norm": 0.325083022215054, + "learning_rate": 3.517152329704337e-05, + "loss": 0.5893, + "step": 2290 + }, + { + "epoch": 0.73312, + "grad_norm": 0.33934670102174425, + "learning_rate": 3.509264021075269e-05, + "loss": 0.5592, + "step": 2291 + }, + { + "epoch": 0.73344, + "grad_norm": 0.3629510582047156, + "learning_rate": 3.501382685480116e-05, + "loss": 0.5867, + "step": 2292 + }, + { + "epoch": 0.73376, + "grad_norm": 0.33460678087081486, + "learning_rate": 3.493508331385842e-05, + "loss": 0.5617, + "step": 2293 + }, + { + "epoch": 0.73408, + "grad_norm": 0.36053140140845014, + "learning_rate": 3.485640967251914e-05, + "loss": 0.5853, + "step": 2294 + }, + { + "epoch": 0.7344, + "grad_norm": 0.38112213895245, + "learning_rate": 3.4777806015302796e-05, + "loss": 0.6554, + "step": 2295 + }, + { + "epoch": 0.73472, + "grad_norm": 0.3155893188801853, + "learning_rate": 3.469927242665375e-05, + "loss": 0.5347, + "step": 2296 + }, + { + "epoch": 0.73504, + "grad_norm": 0.33097273265734717, + "learning_rate": 3.462080899094111e-05, + "loss": 0.6048, + "step": 2297 + }, + { + "epoch": 0.73536, + "grad_norm": 0.32406528689576003, + "learning_rate": 3.454241579245854e-05, + "loss": 0.5445, + "step": 2298 + }, + { + "epoch": 0.73568, + "grad_norm": 0.3352555994003734, + "learning_rate": 3.446409291542433e-05, + "loss": 0.5804, + "step": 2299 + }, + { + "epoch": 0.736, + "grad_norm": 0.33904697657405386, + "learning_rate": 3.438584044398113e-05, + "loss": 0.5923, + "step": 2300 + }, + { + "epoch": 0.73632, + "grad_norm": 0.3274710727691608, + "learning_rate": 3.430765846219603e-05, + "loss": 0.5771, + "step": 2301 + }, + { + "epoch": 0.73664, + "grad_norm": 0.3312216807151814, + "learning_rate": 3.422954705406043e-05, + "loss": 0.5868, + "step": 2302 + }, + { + "epoch": 0.73696, + "grad_norm": 0.3505180134891135, + "learning_rate": 3.415150630348977e-05, + "loss": 0.5796, + "step": 2303 + }, + { + "epoch": 0.73728, + "grad_norm": 0.3376211050147904, + "learning_rate": 3.4073536294323705e-05, + "loss": 0.5807, + "step": 2304 + }, + { + "epoch": 0.7376, + "grad_norm": 0.3384012217882616, + "learning_rate": 3.399563711032583e-05, + "loss": 0.5528, + "step": 2305 + }, + { + "epoch": 0.73792, + "grad_norm": 0.32379561738608786, + "learning_rate": 3.3917808835183706e-05, + "loss": 0.6018, + "step": 2306 + }, + { + "epoch": 0.73824, + "grad_norm": 0.33816519346897606, + "learning_rate": 3.384005155250867e-05, + "loss": 0.6139, + "step": 2307 + }, + { + "epoch": 0.73856, + "grad_norm": 0.3361173981310898, + "learning_rate": 3.376236534583576e-05, + "loss": 0.5965, + "step": 2308 + }, + { + "epoch": 0.73888, + "grad_norm": 0.376368344839782, + "learning_rate": 3.368475029862373e-05, + "loss": 0.6307, + "step": 2309 + }, + { + "epoch": 0.7392, + "grad_norm": 0.3400849938851062, + "learning_rate": 3.3607206494254914e-05, + "loss": 0.6181, + "step": 2310 + }, + { + "epoch": 0.73952, + "grad_norm": 0.3465621007928963, + "learning_rate": 3.352973401603499e-05, + "loss": 0.5998, + "step": 2311 + }, + { + "epoch": 0.73984, + "grad_norm": 0.3606098793187089, + "learning_rate": 3.34523329471931e-05, + "loss": 0.5891, + "step": 2312 + }, + { + "epoch": 0.74016, + "grad_norm": 0.33224624071446074, + "learning_rate": 3.337500337088162e-05, + "loss": 0.5275, + "step": 2313 + }, + { + "epoch": 0.74048, + "grad_norm": 0.3252397944789401, + "learning_rate": 3.329774537017616e-05, + "loss": 0.5804, + "step": 2314 + }, + { + "epoch": 0.7408, + "grad_norm": 0.35389135430673174, + "learning_rate": 3.322055902807545e-05, + "loss": 0.5961, + "step": 2315 + }, + { + "epoch": 0.74112, + "grad_norm": 0.3349695026013135, + "learning_rate": 3.314344442750116e-05, + "loss": 0.5687, + "step": 2316 + }, + { + "epoch": 0.74144, + "grad_norm": 0.34821436421442353, + "learning_rate": 3.306640165129799e-05, + "loss": 0.639, + "step": 2317 + }, + { + "epoch": 0.74176, + "grad_norm": 0.3422837278270685, + "learning_rate": 3.298943078223334e-05, + "loss": 0.5472, + "step": 2318 + }, + { + "epoch": 0.74208, + "grad_norm": 0.37652125465124586, + "learning_rate": 3.2912531902997524e-05, + "loss": 0.6127, + "step": 2319 + }, + { + "epoch": 0.7424, + "grad_norm": 0.3417975229253376, + "learning_rate": 3.283570509620344e-05, + "loss": 0.5816, + "step": 2320 + }, + { + "epoch": 0.74272, + "grad_norm": 0.336177798926716, + "learning_rate": 3.275895044438649e-05, + "loss": 0.5837, + "step": 2321 + }, + { + "epoch": 0.74304, + "grad_norm": 0.3697089660276437, + "learning_rate": 3.26822680300047e-05, + "loss": 0.5831, + "step": 2322 + }, + { + "epoch": 0.74336, + "grad_norm": 0.3402634791319304, + "learning_rate": 3.260565793543835e-05, + "loss": 0.5591, + "step": 2323 + }, + { + "epoch": 0.74368, + "grad_norm": 0.3717472540096618, + "learning_rate": 3.252912024299012e-05, + "loss": 0.5955, + "step": 2324 + }, + { + "epoch": 0.744, + "grad_norm": 0.3514658273470325, + "learning_rate": 3.24526550348849e-05, + "loss": 0.6062, + "step": 2325 + }, + { + "epoch": 0.74432, + "grad_norm": 0.33652106281200506, + "learning_rate": 3.237626239326965e-05, + "loss": 0.5467, + "step": 2326 + }, + { + "epoch": 0.74464, + "grad_norm": 0.32989357684968657, + "learning_rate": 3.2299942400213446e-05, + "loss": 0.6034, + "step": 2327 + }, + { + "epoch": 0.74496, + "grad_norm": 0.32928431954703236, + "learning_rate": 3.222369513770729e-05, + "loss": 0.5549, + "step": 2328 + }, + { + "epoch": 0.74528, + "grad_norm": 0.3451671890137395, + "learning_rate": 3.214752068766399e-05, + "loss": 0.5612, + "step": 2329 + }, + { + "epoch": 0.7456, + "grad_norm": 0.3415621954459219, + "learning_rate": 3.207141913191826e-05, + "loss": 0.6022, + "step": 2330 + }, + { + "epoch": 0.74592, + "grad_norm": 0.3989017659278773, + "learning_rate": 3.1995390552226336e-05, + "loss": 0.6029, + "step": 2331 + }, + { + "epoch": 0.74624, + "grad_norm": 0.381837055963759, + "learning_rate": 3.191943503026622e-05, + "loss": 0.6244, + "step": 2332 + }, + { + "epoch": 0.74656, + "grad_norm": 0.33126585934733654, + "learning_rate": 3.184355264763731e-05, + "loss": 0.6068, + "step": 2333 + }, + { + "epoch": 0.74688, + "grad_norm": 0.3536341899877415, + "learning_rate": 3.176774348586051e-05, + "loss": 0.616, + "step": 2334 + }, + { + "epoch": 0.7472, + "grad_norm": 0.3301539930652797, + "learning_rate": 3.1692007626377985e-05, + "loss": 0.5831, + "step": 2335 + }, + { + "epoch": 0.74752, + "grad_norm": 0.3369603625025938, + "learning_rate": 3.161634515055323e-05, + "loss": 0.5537, + "step": 2336 + }, + { + "epoch": 0.74784, + "grad_norm": 0.34788287232024545, + "learning_rate": 3.154075613967082e-05, + "loss": 0.5715, + "step": 2337 + }, + { + "epoch": 0.74816, + "grad_norm": 0.34199407523362596, + "learning_rate": 3.1465240674936514e-05, + "loss": 0.5558, + "step": 2338 + }, + { + "epoch": 0.74848, + "grad_norm": 0.3450282930035882, + "learning_rate": 3.138979883747692e-05, + "loss": 0.6083, + "step": 2339 + }, + { + "epoch": 0.7488, + "grad_norm": 0.34494890222555324, + "learning_rate": 3.131443070833968e-05, + "loss": 0.5348, + "step": 2340 + }, + { + "epoch": 0.74912, + "grad_norm": 0.33275235016148924, + "learning_rate": 3.1239136368493216e-05, + "loss": 0.542, + "step": 2341 + }, + { + "epoch": 0.74944, + "grad_norm": 0.33857138631779304, + "learning_rate": 3.116391589882659e-05, + "loss": 0.5713, + "step": 2342 + }, + { + "epoch": 0.74976, + "grad_norm": 0.33607981110956336, + "learning_rate": 3.108876938014964e-05, + "loss": 0.6235, + "step": 2343 + }, + { + "epoch": 0.75008, + "grad_norm": 0.3468769088146216, + "learning_rate": 3.101369689319263e-05, + "loss": 0.5832, + "step": 2344 + }, + { + "epoch": 0.7504, + "grad_norm": 0.3316342090040698, + "learning_rate": 3.093869851860638e-05, + "loss": 0.5911, + "step": 2345 + }, + { + "epoch": 0.75072, + "grad_norm": 0.3581437347975455, + "learning_rate": 3.08637743369621e-05, + "loss": 0.5667, + "step": 2346 + }, + { + "epoch": 0.75104, + "grad_norm": 0.3465195910789064, + "learning_rate": 3.078892442875119e-05, + "loss": 0.5399, + "step": 2347 + }, + { + "epoch": 0.75136, + "grad_norm": 0.3556462190377705, + "learning_rate": 3.071414887438537e-05, + "loss": 0.5732, + "step": 2348 + }, + { + "epoch": 0.75168, + "grad_norm": 0.34595846905129507, + "learning_rate": 3.063944775419641e-05, + "loss": 0.5484, + "step": 2349 + }, + { + "epoch": 0.752, + "grad_norm": 0.3446135367032066, + "learning_rate": 3.056482114843614e-05, + "loss": 0.5553, + "step": 2350 + }, + { + "epoch": 0.75232, + "grad_norm": 0.3607712135522439, + "learning_rate": 3.0490269137276394e-05, + "loss": 0.626, + "step": 2351 + }, + { + "epoch": 0.75264, + "grad_norm": 0.3402565405559765, + "learning_rate": 3.0415791800808723e-05, + "loss": 0.6187, + "step": 2352 + }, + { + "epoch": 0.75296, + "grad_norm": 0.3295917465282104, + "learning_rate": 3.0341389219044615e-05, + "loss": 0.5493, + "step": 2353 + }, + { + "epoch": 0.75328, + "grad_norm": 0.3519275515043085, + "learning_rate": 3.026706147191517e-05, + "loss": 0.5767, + "step": 2354 + }, + { + "epoch": 0.7536, + "grad_norm": 0.3408333723271029, + "learning_rate": 3.0192808639271065e-05, + "loss": 0.6295, + "step": 2355 + }, + { + "epoch": 0.75392, + "grad_norm": 0.3470666538922749, + "learning_rate": 3.0118630800882596e-05, + "loss": 0.5814, + "step": 2356 + }, + { + "epoch": 0.75424, + "grad_norm": 0.31666937870659334, + "learning_rate": 3.0044528036439357e-05, + "loss": 0.5683, + "step": 2357 + }, + { + "epoch": 0.75456, + "grad_norm": 0.33637773717051944, + "learning_rate": 2.9970500425550417e-05, + "loss": 0.5963, + "step": 2358 + }, + { + "epoch": 0.75488, + "grad_norm": 0.33786096292751916, + "learning_rate": 2.989654804774401e-05, + "loss": 0.5816, + "step": 2359 + }, + { + "epoch": 0.7552, + "grad_norm": 0.32677482985465583, + "learning_rate": 2.9822670982467637e-05, + "loss": 0.5942, + "step": 2360 + }, + { + "epoch": 0.75552, + "grad_norm": 0.3338809730277252, + "learning_rate": 2.9748869309087778e-05, + "loss": 0.5483, + "step": 2361 + }, + { + "epoch": 0.75584, + "grad_norm": 0.33174851524524673, + "learning_rate": 2.9675143106890056e-05, + "loss": 0.5796, + "step": 2362 + }, + { + "epoch": 0.75616, + "grad_norm": 0.3324989595799846, + "learning_rate": 2.9601492455078872e-05, + "loss": 0.6124, + "step": 2363 + }, + { + "epoch": 0.75648, + "grad_norm": 0.3362730718367246, + "learning_rate": 2.9527917432777595e-05, + "loss": 0.603, + "step": 2364 + }, + { + "epoch": 0.7568, + "grad_norm": 0.3617593581459504, + "learning_rate": 2.9454418119028227e-05, + "loss": 0.603, + "step": 2365 + }, + { + "epoch": 0.75712, + "grad_norm": 0.3471047988231789, + "learning_rate": 2.9380994592791545e-05, + "loss": 0.5695, + "step": 2366 + }, + { + "epoch": 0.75744, + "grad_norm": 0.3400950061793918, + "learning_rate": 2.9307646932946797e-05, + "loss": 0.5897, + "step": 2367 + }, + { + "epoch": 0.75776, + "grad_norm": 0.348662092469313, + "learning_rate": 2.923437521829181e-05, + "loss": 0.5527, + "step": 2368 + }, + { + "epoch": 0.75808, + "grad_norm": 0.3236855574460617, + "learning_rate": 2.9161179527542827e-05, + "loss": 0.5643, + "step": 2369 + }, + { + "epoch": 0.7584, + "grad_norm": 0.34981066030108027, + "learning_rate": 2.9088059939334332e-05, + "loss": 0.6093, + "step": 2370 + }, + { + "epoch": 0.75872, + "grad_norm": 0.3289082649092464, + "learning_rate": 2.9015016532219132e-05, + "loss": 0.5921, + "step": 2371 + }, + { + "epoch": 0.75904, + "grad_norm": 0.32879029315275066, + "learning_rate": 2.89420493846682e-05, + "loss": 0.5972, + "step": 2372 + }, + { + "epoch": 0.75936, + "grad_norm": 0.32476863522355104, + "learning_rate": 2.8869158575070488e-05, + "loss": 0.5829, + "step": 2373 + }, + { + "epoch": 0.75968, + "grad_norm": 0.33679510274302665, + "learning_rate": 2.8796344181733058e-05, + "loss": 0.5977, + "step": 2374 + }, + { + "epoch": 0.76, + "grad_norm": 0.36180129427645297, + "learning_rate": 2.8723606282880765e-05, + "loss": 0.6201, + "step": 2375 + }, + { + "epoch": 0.76032, + "grad_norm": 0.3343661758410183, + "learning_rate": 2.865094495665638e-05, + "loss": 0.5824, + "step": 2376 + }, + { + "epoch": 0.76064, + "grad_norm": 0.352082421340757, + "learning_rate": 2.8578360281120377e-05, + "loss": 0.5986, + "step": 2377 + }, + { + "epoch": 0.76096, + "grad_norm": 0.35789151503896116, + "learning_rate": 2.8505852334250825e-05, + "loss": 0.6062, + "step": 2378 + }, + { + "epoch": 0.76128, + "grad_norm": 0.3685475388743482, + "learning_rate": 2.84334211939435e-05, + "loss": 0.6083, + "step": 2379 + }, + { + "epoch": 0.7616, + "grad_norm": 0.3340106042654049, + "learning_rate": 2.836106693801148e-05, + "loss": 0.5777, + "step": 2380 + }, + { + "epoch": 0.76192, + "grad_norm": 0.38552664950055376, + "learning_rate": 2.828878964418542e-05, + "loss": 0.5968, + "step": 2381 + }, + { + "epoch": 0.76224, + "grad_norm": 0.3445694135476144, + "learning_rate": 2.8216589390113214e-05, + "loss": 0.5659, + "step": 2382 + }, + { + "epoch": 0.76256, + "grad_norm": 0.3295777802569321, + "learning_rate": 2.814446625335997e-05, + "loss": 0.5686, + "step": 2383 + }, + { + "epoch": 0.76288, + "grad_norm": 0.3446027220236407, + "learning_rate": 2.8072420311407977e-05, + "loss": 0.6051, + "step": 2384 + }, + { + "epoch": 0.7632, + "grad_norm": 0.33973972123166163, + "learning_rate": 2.8000451641656633e-05, + "loss": 0.5734, + "step": 2385 + }, + { + "epoch": 0.76352, + "grad_norm": 0.36079199185827376, + "learning_rate": 2.7928560321422237e-05, + "loss": 0.5711, + "step": 2386 + }, + { + "epoch": 0.76384, + "grad_norm": 0.34253463173699983, + "learning_rate": 2.7856746427938073e-05, + "loss": 0.5762, + "step": 2387 + }, + { + "epoch": 0.76416, + "grad_norm": 0.35533724153056895, + "learning_rate": 2.7785010038354197e-05, + "loss": 0.5945, + "step": 2388 + }, + { + "epoch": 0.76448, + "grad_norm": 0.34504298045201975, + "learning_rate": 2.7713351229737372e-05, + "loss": 0.5778, + "step": 2389 + }, + { + "epoch": 0.7648, + "grad_norm": 0.3487361865298093, + "learning_rate": 2.7641770079071127e-05, + "loss": 0.5675, + "step": 2390 + }, + { + "epoch": 0.76512, + "grad_norm": 0.3452140897641801, + "learning_rate": 2.7570266663255417e-05, + "loss": 0.5891, + "step": 2391 + }, + { + "epoch": 0.76544, + "grad_norm": 0.32948686627018636, + "learning_rate": 2.7498841059106827e-05, + "loss": 0.5769, + "step": 2392 + }, + { + "epoch": 0.76576, + "grad_norm": 0.3430492211172593, + "learning_rate": 2.7427493343358224e-05, + "loss": 0.5989, + "step": 2393 + }, + { + "epoch": 0.76608, + "grad_norm": 0.33875464109493547, + "learning_rate": 2.735622359265889e-05, + "loss": 0.5895, + "step": 2394 + }, + { + "epoch": 0.7664, + "grad_norm": 0.3439041615207522, + "learning_rate": 2.728503188357434e-05, + "loss": 0.5534, + "step": 2395 + }, + { + "epoch": 0.76672, + "grad_norm": 0.34886016012964727, + "learning_rate": 2.7213918292586173e-05, + "loss": 0.5861, + "step": 2396 + }, + { + "epoch": 0.76704, + "grad_norm": 0.32669939047169744, + "learning_rate": 2.714288289609217e-05, + "loss": 0.6137, + "step": 2397 + }, + { + "epoch": 0.76736, + "grad_norm": 0.36101603744040395, + "learning_rate": 2.7071925770405992e-05, + "loss": 0.5998, + "step": 2398 + }, + { + "epoch": 0.76768, + "grad_norm": 0.3328698049958207, + "learning_rate": 2.700104699175732e-05, + "loss": 0.5831, + "step": 2399 + }, + { + "epoch": 0.768, + "grad_norm": 0.3510927935259629, + "learning_rate": 2.6930246636291635e-05, + "loss": 0.5856, + "step": 2400 + }, + { + "epoch": 0.76832, + "grad_norm": 0.3312101945503044, + "learning_rate": 2.68595247800701e-05, + "loss": 0.5997, + "step": 2401 + }, + { + "epoch": 0.76864, + "grad_norm": 0.3430926524554503, + "learning_rate": 2.678888149906964e-05, + "loss": 0.6051, + "step": 2402 + }, + { + "epoch": 0.76896, + "grad_norm": 0.3358767848755449, + "learning_rate": 2.6718316869182735e-05, + "loss": 0.5756, + "step": 2403 + }, + { + "epoch": 0.76928, + "grad_norm": 0.3391095082693979, + "learning_rate": 2.6647830966217325e-05, + "loss": 0.5882, + "step": 2404 + }, + { + "epoch": 0.7696, + "grad_norm": 0.341842787369802, + "learning_rate": 2.6577423865896856e-05, + "loss": 0.557, + "step": 2405 + }, + { + "epoch": 0.76992, + "grad_norm": 0.3239114474441497, + "learning_rate": 2.650709564386e-05, + "loss": 0.577, + "step": 2406 + }, + { + "epoch": 0.77024, + "grad_norm": 0.34586815567672, + "learning_rate": 2.6436846375660816e-05, + "loss": 0.5922, + "step": 2407 + }, + { + "epoch": 0.77056, + "grad_norm": 0.334961876383252, + "learning_rate": 2.6366676136768486e-05, + "loss": 0.6024, + "step": 2408 + }, + { + "epoch": 0.77088, + "grad_norm": 0.3791633433224865, + "learning_rate": 2.6296585002567252e-05, + "loss": 0.5835, + "step": 2409 + }, + { + "epoch": 0.7712, + "grad_norm": 0.349845300823539, + "learning_rate": 2.622657304835646e-05, + "loss": 0.5667, + "step": 2410 + }, + { + "epoch": 0.77152, + "grad_norm": 0.3551498303535744, + "learning_rate": 2.615664034935028e-05, + "loss": 0.5732, + "step": 2411 + }, + { + "epoch": 0.77184, + "grad_norm": 0.31879527794421286, + "learning_rate": 2.6086786980677837e-05, + "loss": 0.5355, + "step": 2412 + }, + { + "epoch": 0.77216, + "grad_norm": 0.3262907843763252, + "learning_rate": 2.601701301738303e-05, + "loss": 0.543, + "step": 2413 + }, + { + "epoch": 0.77248, + "grad_norm": 0.35623855490669876, + "learning_rate": 2.5947318534424346e-05, + "loss": 0.573, + "step": 2414 + }, + { + "epoch": 0.7728, + "grad_norm": 0.3767124025771466, + "learning_rate": 2.587770360667503e-05, + "loss": 0.5501, + "step": 2415 + }, + { + "epoch": 0.77312, + "grad_norm": 0.32759350047708313, + "learning_rate": 2.580816830892272e-05, + "loss": 0.5474, + "step": 2416 + }, + { + "epoch": 0.77344, + "grad_norm": 0.35442865582215455, + "learning_rate": 2.573871271586963e-05, + "loss": 0.5801, + "step": 2417 + }, + { + "epoch": 0.77376, + "grad_norm": 0.36348390933380514, + "learning_rate": 2.5669336902132234e-05, + "loss": 0.5636, + "step": 2418 + }, + { + "epoch": 0.77408, + "grad_norm": 0.3337592582572294, + "learning_rate": 2.560004094224143e-05, + "loss": 0.568, + "step": 2419 + }, + { + "epoch": 0.7744, + "grad_norm": 0.33280285788443853, + "learning_rate": 2.5530824910642183e-05, + "loss": 0.5749, + "step": 2420 + }, + { + "epoch": 0.77472, + "grad_norm": 0.3448406590938855, + "learning_rate": 2.5461688881693723e-05, + "loss": 0.5969, + "step": 2421 + }, + { + "epoch": 0.77504, + "grad_norm": 0.3515999885584849, + "learning_rate": 2.5392632929669213e-05, + "loss": 0.5886, + "step": 2422 + }, + { + "epoch": 0.77536, + "grad_norm": 0.33735108637060596, + "learning_rate": 2.5323657128755895e-05, + "loss": 0.5806, + "step": 2423 + }, + { + "epoch": 0.77568, + "grad_norm": 0.35121809997582326, + "learning_rate": 2.525476155305483e-05, + "loss": 0.6119, + "step": 2424 + }, + { + "epoch": 0.776, + "grad_norm": 0.33504541137736965, + "learning_rate": 2.5185946276580918e-05, + "loss": 0.5855, + "step": 2425 + }, + { + "epoch": 0.77632, + "grad_norm": 0.342514163567369, + "learning_rate": 2.511721137326284e-05, + "loss": 0.5453, + "step": 2426 + }, + { + "epoch": 0.77664, + "grad_norm": 0.3469120002174987, + "learning_rate": 2.5048556916942824e-05, + "loss": 0.55, + "step": 2427 + }, + { + "epoch": 0.77696, + "grad_norm": 0.34586812850714027, + "learning_rate": 2.497998298137676e-05, + "loss": 0.5992, + "step": 2428 + }, + { + "epoch": 0.77728, + "grad_norm": 0.33724972199239284, + "learning_rate": 2.4911489640234055e-05, + "loss": 0.6001, + "step": 2429 + }, + { + "epoch": 0.7776, + "grad_norm": 0.3512325189234438, + "learning_rate": 2.484307696709741e-05, + "loss": 0.5819, + "step": 2430 + }, + { + "epoch": 0.77792, + "grad_norm": 0.349654684211299, + "learning_rate": 2.4774745035463008e-05, + "loss": 0.6303, + "step": 2431 + }, + { + "epoch": 0.77824, + "grad_norm": 0.33453337928277854, + "learning_rate": 2.470649391874017e-05, + "loss": 0.5359, + "step": 2432 + }, + { + "epoch": 0.77856, + "grad_norm": 0.3416789705540942, + "learning_rate": 2.4638323690251486e-05, + "loss": 0.5783, + "step": 2433 + }, + { + "epoch": 0.77888, + "grad_norm": 0.34156817967059255, + "learning_rate": 2.457023442323262e-05, + "loss": 0.5715, + "step": 2434 + }, + { + "epoch": 0.7792, + "grad_norm": 0.3368542957143385, + "learning_rate": 2.4502226190832222e-05, + "loss": 0.5813, + "step": 2435 + }, + { + "epoch": 0.77952, + "grad_norm": 0.327260707366308, + "learning_rate": 2.4434299066111953e-05, + "loss": 0.5449, + "step": 2436 + }, + { + "epoch": 0.77984, + "grad_norm": 0.3524152885666758, + "learning_rate": 2.4366453122046263e-05, + "loss": 0.6044, + "step": 2437 + }, + { + "epoch": 0.78016, + "grad_norm": 0.35477651008206396, + "learning_rate": 2.429868843152243e-05, + "loss": 0.5971, + "step": 2438 + }, + { + "epoch": 0.78048, + "grad_norm": 0.4093698158410259, + "learning_rate": 2.4231005067340508e-05, + "loss": 0.6219, + "step": 2439 + }, + { + "epoch": 0.7808, + "grad_norm": 0.3528251215152049, + "learning_rate": 2.4163403102213012e-05, + "loss": 0.5619, + "step": 2440 + }, + { + "epoch": 0.78112, + "grad_norm": 0.35393343997845517, + "learning_rate": 2.4095882608765196e-05, + "loss": 0.5801, + "step": 2441 + }, + { + "epoch": 0.78144, + "grad_norm": 0.3434472851711151, + "learning_rate": 2.4028443659534638e-05, + "loss": 0.5451, + "step": 2442 + }, + { + "epoch": 0.78176, + "grad_norm": 0.4105730561393544, + "learning_rate": 2.396108632697145e-05, + "loss": 0.5907, + "step": 2443 + }, + { + "epoch": 0.78208, + "grad_norm": 0.3629371386000478, + "learning_rate": 2.38938106834379e-05, + "loss": 0.5737, + "step": 2444 + }, + { + "epoch": 0.7824, + "grad_norm": 0.33696633482943883, + "learning_rate": 2.382661680120868e-05, + "loss": 0.584, + "step": 2445 + }, + { + "epoch": 0.78272, + "grad_norm": 0.3449131704792935, + "learning_rate": 2.3759504752470463e-05, + "loss": 0.573, + "step": 2446 + }, + { + "epoch": 0.78304, + "grad_norm": 0.3727591769601757, + "learning_rate": 2.369247460932219e-05, + "loss": 0.6211, + "step": 2447 + }, + { + "epoch": 0.78336, + "grad_norm": 0.32671458489505556, + "learning_rate": 2.3625526443774636e-05, + "loss": 0.5821, + "step": 2448 + }, + { + "epoch": 0.78368, + "grad_norm": 0.354124806828755, + "learning_rate": 2.3558660327750647e-05, + "loss": 0.5517, + "step": 2449 + }, + { + "epoch": 0.784, + "grad_norm": 0.34426795256547404, + "learning_rate": 2.349187633308483e-05, + "loss": 0.5745, + "step": 2450 + }, + { + "epoch": 0.78432, + "grad_norm": 0.3494513559440474, + "learning_rate": 2.3425174531523596e-05, + "loss": 0.6134, + "step": 2451 + }, + { + "epoch": 0.78464, + "grad_norm": 0.35765403671344015, + "learning_rate": 2.3358554994725123e-05, + "loss": 0.6041, + "step": 2452 + }, + { + "epoch": 0.78496, + "grad_norm": 0.35954927876655834, + "learning_rate": 2.329201779425909e-05, + "loss": 0.5512, + "step": 2453 + }, + { + "epoch": 0.78528, + "grad_norm": 0.37114122005555067, + "learning_rate": 2.322556300160682e-05, + "loss": 0.6187, + "step": 2454 + }, + { + "epoch": 0.7856, + "grad_norm": 0.351944774645019, + "learning_rate": 2.3159190688161038e-05, + "loss": 0.5522, + "step": 2455 + }, + { + "epoch": 0.78592, + "grad_norm": 0.39093401850285614, + "learning_rate": 2.3092900925225903e-05, + "loss": 0.6108, + "step": 2456 + }, + { + "epoch": 0.78624, + "grad_norm": 0.3630710819278059, + "learning_rate": 2.3026693784016896e-05, + "loss": 0.5769, + "step": 2457 + }, + { + "epoch": 0.78656, + "grad_norm": 0.3253875198388068, + "learning_rate": 2.2960569335660685e-05, + "loss": 0.5478, + "step": 2458 + }, + { + "epoch": 0.78688, + "grad_norm": 0.3906710897143039, + "learning_rate": 2.2894527651195152e-05, + "loss": 0.603, + "step": 2459 + }, + { + "epoch": 0.7872, + "grad_norm": 0.3607411091633684, + "learning_rate": 2.2828568801569283e-05, + "loss": 0.5763, + "step": 2460 + }, + { + "epoch": 0.78752, + "grad_norm": 0.35173782387112157, + "learning_rate": 2.2762692857642963e-05, + "loss": 0.5981, + "step": 2461 + }, + { + "epoch": 0.78784, + "grad_norm": 0.339858955189516, + "learning_rate": 2.2696899890187162e-05, + "loss": 0.6017, + "step": 2462 + }, + { + "epoch": 0.78816, + "grad_norm": 0.34544866050885736, + "learning_rate": 2.263118996988357e-05, + "loss": 0.5363, + "step": 2463 + }, + { + "epoch": 0.78848, + "grad_norm": 0.34761095872013703, + "learning_rate": 2.2565563167324743e-05, + "loss": 0.5989, + "step": 2464 + }, + { + "epoch": 0.7888, + "grad_norm": 0.31530810134808945, + "learning_rate": 2.2500019553013964e-05, + "loss": 0.5473, + "step": 2465 + }, + { + "epoch": 0.78912, + "grad_norm": 0.33415143701761924, + "learning_rate": 2.2434559197365034e-05, + "loss": 0.5469, + "step": 2466 + }, + { + "epoch": 0.78944, + "grad_norm": 0.35692277498717956, + "learning_rate": 2.236918217070244e-05, + "loss": 0.5848, + "step": 2467 + }, + { + "epoch": 0.78976, + "grad_norm": 0.347516591358437, + "learning_rate": 2.2303888543261032e-05, + "loss": 0.5526, + "step": 2468 + }, + { + "epoch": 0.79008, + "grad_norm": 0.34620092675831654, + "learning_rate": 2.223867838518615e-05, + "loss": 0.5462, + "step": 2469 + }, + { + "epoch": 0.7904, + "grad_norm": 0.33557138741524, + "learning_rate": 2.217355176653345e-05, + "loss": 0.5257, + "step": 2470 + }, + { + "epoch": 0.79072, + "grad_norm": 0.3697668097801008, + "learning_rate": 2.21085087572688e-05, + "loss": 0.5585, + "step": 2471 + }, + { + "epoch": 0.79104, + "grad_norm": 0.3715956488757461, + "learning_rate": 2.204354942726824e-05, + "loss": 0.5508, + "step": 2472 + }, + { + "epoch": 0.79136, + "grad_norm": 0.3243701754045067, + "learning_rate": 2.1978673846318e-05, + "loss": 0.5759, + "step": 2473 + }, + { + "epoch": 0.79168, + "grad_norm": 0.36072046537223734, + "learning_rate": 2.191388208411421e-05, + "loss": 0.5758, + "step": 2474 + }, + { + "epoch": 0.792, + "grad_norm": 0.33967224222953046, + "learning_rate": 2.184917421026309e-05, + "loss": 0.5881, + "step": 2475 + }, + { + "epoch": 0.79232, + "grad_norm": 0.3465350920768046, + "learning_rate": 2.1784550294280616e-05, + "loss": 0.615, + "step": 2476 + }, + { + "epoch": 0.79264, + "grad_norm": 0.37908477222437176, + "learning_rate": 2.172001040559264e-05, + "loss": 0.6177, + "step": 2477 + }, + { + "epoch": 0.79296, + "grad_norm": 0.3278710982429467, + "learning_rate": 2.1655554613534767e-05, + "loss": 0.5232, + "step": 2478 + }, + { + "epoch": 0.79328, + "grad_norm": 0.3384039255396264, + "learning_rate": 2.1591182987352142e-05, + "loss": 0.5578, + "step": 2479 + }, + { + "epoch": 0.7936, + "grad_norm": 0.3403251723616923, + "learning_rate": 2.1526895596199626e-05, + "loss": 0.6037, + "step": 2480 + }, + { + "epoch": 0.79392, + "grad_norm": 0.3710591109731778, + "learning_rate": 2.1462692509141467e-05, + "loss": 0.6148, + "step": 2481 + }, + { + "epoch": 0.79424, + "grad_norm": 0.3470562337667696, + "learning_rate": 2.1398573795151432e-05, + "loss": 0.5675, + "step": 2482 + }, + { + "epoch": 0.79456, + "grad_norm": 0.3473910022037806, + "learning_rate": 2.133453952311264e-05, + "loss": 0.5619, + "step": 2483 + }, + { + "epoch": 0.79488, + "grad_norm": 0.3438608091876821, + "learning_rate": 2.1270589761817407e-05, + "loss": 0.6002, + "step": 2484 + }, + { + "epoch": 0.7952, + "grad_norm": 0.3446990832392834, + "learning_rate": 2.1206724579967373e-05, + "loss": 0.5499, + "step": 2485 + }, + { + "epoch": 0.79552, + "grad_norm": 0.3545567847508976, + "learning_rate": 2.1142944046173207e-05, + "loss": 0.6121, + "step": 2486 + }, + { + "epoch": 0.79584, + "grad_norm": 0.3289303546556512, + "learning_rate": 2.1079248228954718e-05, + "loss": 0.5824, + "step": 2487 + }, + { + "epoch": 0.79616, + "grad_norm": 0.33370158019647944, + "learning_rate": 2.1015637196740712e-05, + "loss": 0.5954, + "step": 2488 + }, + { + "epoch": 0.79648, + "grad_norm": 0.33882023411268536, + "learning_rate": 2.0952111017868813e-05, + "loss": 0.5876, + "step": 2489 + }, + { + "epoch": 0.7968, + "grad_norm": 0.32884890388514093, + "learning_rate": 2.088866976058559e-05, + "loss": 0.554, + "step": 2490 + }, + { + "epoch": 0.79712, + "grad_norm": 0.33133134284141186, + "learning_rate": 2.082531349304636e-05, + "loss": 0.545, + "step": 2491 + }, + { + "epoch": 0.79744, + "grad_norm": 0.33783537894596144, + "learning_rate": 2.0762042283315052e-05, + "loss": 0.5587, + "step": 2492 + }, + { + "epoch": 0.79776, + "grad_norm": 0.3557898528807314, + "learning_rate": 2.0698856199364348e-05, + "loss": 0.5588, + "step": 2493 + }, + { + "epoch": 0.79808, + "grad_norm": 0.3500657192864167, + "learning_rate": 2.0635755309075343e-05, + "loss": 0.5911, + "step": 2494 + }, + { + "epoch": 0.7984, + "grad_norm": 0.3647386915647849, + "learning_rate": 2.0572739680237717e-05, + "loss": 0.599, + "step": 2495 + }, + { + "epoch": 0.79872, + "grad_norm": 0.3591643550741855, + "learning_rate": 2.0509809380549537e-05, + "loss": 0.5871, + "step": 2496 + }, + { + "epoch": 0.79904, + "grad_norm": 0.3517398474758034, + "learning_rate": 2.0446964477617116e-05, + "loss": 0.5943, + "step": 2497 + }, + { + "epoch": 0.79936, + "grad_norm": 0.3490242405490287, + "learning_rate": 2.0384205038955127e-05, + "loss": 0.5749, + "step": 2498 + }, + { + "epoch": 0.79968, + "grad_norm": 0.33831548102579173, + "learning_rate": 2.032153113198636e-05, + "loss": 0.5579, + "step": 2499 + }, + { + "epoch": 0.8, + "grad_norm": 0.3634872835470982, + "learning_rate": 2.025894282404177e-05, + "loss": 0.612, + "step": 2500 + }, + { + "epoch": 0.80032, + "grad_norm": 0.3425232023996621, + "learning_rate": 2.019644018236029e-05, + "loss": 0.5536, + "step": 2501 + }, + { + "epoch": 0.80064, + "grad_norm": 0.3480662902670338, + "learning_rate": 2.0134023274088898e-05, + "loss": 0.5832, + "step": 2502 + }, + { + "epoch": 0.80096, + "grad_norm": 0.3329767821258931, + "learning_rate": 2.0071692166282384e-05, + "loss": 0.6013, + "step": 2503 + }, + { + "epoch": 0.80128, + "grad_norm": 0.33173035226101194, + "learning_rate": 2.0009446925903462e-05, + "loss": 0.5412, + "step": 2504 + }, + { + "epoch": 0.8016, + "grad_norm": 0.35892749701366916, + "learning_rate": 1.9947287619822474e-05, + "loss": 0.6004, + "step": 2505 + }, + { + "epoch": 0.80192, + "grad_norm": 0.32380929116089685, + "learning_rate": 1.9885214314817568e-05, + "loss": 0.5298, + "step": 2506 + }, + { + "epoch": 0.80224, + "grad_norm": 0.3648924820132169, + "learning_rate": 1.9823227077574392e-05, + "loss": 0.5264, + "step": 2507 + }, + { + "epoch": 0.80256, + "grad_norm": 0.3476892297982287, + "learning_rate": 1.9761325974686208e-05, + "loss": 0.5717, + "step": 2508 + }, + { + "epoch": 0.80288, + "grad_norm": 0.49034137620021845, + "learning_rate": 1.9699511072653733e-05, + "loss": 0.5594, + "step": 2509 + }, + { + "epoch": 0.8032, + "grad_norm": 0.33899973081415063, + "learning_rate": 1.9637782437885023e-05, + "loss": 0.5813, + "step": 2510 + }, + { + "epoch": 0.80352, + "grad_norm": 0.34594804939489354, + "learning_rate": 1.9576140136695542e-05, + "loss": 0.5848, + "step": 2511 + }, + { + "epoch": 0.80384, + "grad_norm": 0.35349589268410203, + "learning_rate": 1.951458423530791e-05, + "loss": 0.5972, + "step": 2512 + }, + { + "epoch": 0.80416, + "grad_norm": 0.34116101498955914, + "learning_rate": 1.945311479985199e-05, + "loss": 0.5527, + "step": 2513 + }, + { + "epoch": 0.80448, + "grad_norm": 0.3452356052708206, + "learning_rate": 1.9391731896364784e-05, + "loss": 0.5324, + "step": 2514 + }, + { + "epoch": 0.8048, + "grad_norm": 0.3203982896423424, + "learning_rate": 1.933043559079022e-05, + "loss": 0.5573, + "step": 2515 + }, + { + "epoch": 0.80512, + "grad_norm": 0.3402648463973037, + "learning_rate": 1.926922594897932e-05, + "loss": 0.5038, + "step": 2516 + }, + { + "epoch": 0.80544, + "grad_norm": 0.34359612396387357, + "learning_rate": 1.9208103036689894e-05, + "loss": 0.5888, + "step": 2517 + }, + { + "epoch": 0.80576, + "grad_norm": 0.3442759319141374, + "learning_rate": 1.9147066919586644e-05, + "loss": 0.5831, + "step": 2518 + }, + { + "epoch": 0.80608, + "grad_norm": 0.3272406428240339, + "learning_rate": 1.9086117663241055e-05, + "loss": 0.5817, + "step": 2519 + }, + { + "epoch": 0.8064, + "grad_norm": 0.3742487291696982, + "learning_rate": 1.9025255333131176e-05, + "loss": 0.6346, + "step": 2520 + }, + { + "epoch": 0.80672, + "grad_norm": 0.40930377904831494, + "learning_rate": 1.8964479994641805e-05, + "loss": 0.5793, + "step": 2521 + }, + { + "epoch": 0.80704, + "grad_norm": 0.3565577889922621, + "learning_rate": 1.8903791713064233e-05, + "loss": 0.5882, + "step": 2522 + }, + { + "epoch": 0.80736, + "grad_norm": 0.3530479024802253, + "learning_rate": 1.884319055359617e-05, + "loss": 0.5817, + "step": 2523 + }, + { + "epoch": 0.80768, + "grad_norm": 0.3724923440038362, + "learning_rate": 1.878267658134184e-05, + "loss": 0.6316, + "step": 2524 + }, + { + "epoch": 0.808, + "grad_norm": 0.34200673881188254, + "learning_rate": 1.872224986131168e-05, + "loss": 0.5546, + "step": 2525 + }, + { + "epoch": 0.80832, + "grad_norm": 0.34782820641086815, + "learning_rate": 1.8661910458422514e-05, + "loss": 0.5609, + "step": 2526 + }, + { + "epoch": 0.80864, + "grad_norm": 0.3815838807460841, + "learning_rate": 1.860165843749725e-05, + "loss": 0.6105, + "step": 2527 + }, + { + "epoch": 0.80896, + "grad_norm": 0.32259838169392785, + "learning_rate": 1.8541493863265e-05, + "loss": 0.5511, + "step": 2528 + }, + { + "epoch": 0.80928, + "grad_norm": 0.3717571350885919, + "learning_rate": 1.8481416800360872e-05, + "loss": 0.5916, + "step": 2529 + }, + { + "epoch": 0.8096, + "grad_norm": 0.33874743755926234, + "learning_rate": 1.8421427313326046e-05, + "loss": 0.5645, + "step": 2530 + }, + { + "epoch": 0.80992, + "grad_norm": 0.32945992612330516, + "learning_rate": 1.8361525466607488e-05, + "loss": 0.5256, + "step": 2531 + }, + { + "epoch": 0.81024, + "grad_norm": 0.3408395461618973, + "learning_rate": 1.8301711324558158e-05, + "loss": 0.5444, + "step": 2532 + }, + { + "epoch": 0.81056, + "grad_norm": 0.3360345310477082, + "learning_rate": 1.8241984951436665e-05, + "loss": 0.5082, + "step": 2533 + }, + { + "epoch": 0.81088, + "grad_norm": 0.3619912782367946, + "learning_rate": 1.8182346411407412e-05, + "loss": 0.5975, + "step": 2534 + }, + { + "epoch": 0.8112, + "grad_norm": 0.36009948064504005, + "learning_rate": 1.8122795768540435e-05, + "loss": 0.566, + "step": 2535 + }, + { + "epoch": 0.81152, + "grad_norm": 0.3418557525512484, + "learning_rate": 1.8063333086811272e-05, + "loss": 0.5837, + "step": 2536 + }, + { + "epoch": 0.81184, + "grad_norm": 0.34353632916754195, + "learning_rate": 1.8003958430101085e-05, + "loss": 0.6053, + "step": 2537 + }, + { + "epoch": 0.81216, + "grad_norm": 0.3466534323420069, + "learning_rate": 1.7944671862196316e-05, + "loss": 0.569, + "step": 2538 + }, + { + "epoch": 0.81248, + "grad_norm": 0.33448332937251646, + "learning_rate": 1.7885473446788913e-05, + "loss": 0.5097, + "step": 2539 + }, + { + "epoch": 0.8128, + "grad_norm": 0.338864461741268, + "learning_rate": 1.7826363247476062e-05, + "loss": 0.5711, + "step": 2540 + }, + { + "epoch": 0.81312, + "grad_norm": 0.3961894658231154, + "learning_rate": 1.7767341327760155e-05, + "loss": 0.6439, + "step": 2541 + }, + { + "epoch": 0.81344, + "grad_norm": 0.3389672150998602, + "learning_rate": 1.7708407751048804e-05, + "loss": 0.5311, + "step": 2542 + }, + { + "epoch": 0.81376, + "grad_norm": 0.3304649549560958, + "learning_rate": 1.7649562580654632e-05, + "loss": 0.5521, + "step": 2543 + }, + { + "epoch": 0.81408, + "grad_norm": 0.31581192258434576, + "learning_rate": 1.7590805879795356e-05, + "loss": 0.554, + "step": 2544 + }, + { + "epoch": 0.8144, + "grad_norm": 0.3598882048796307, + "learning_rate": 1.7532137711593665e-05, + "loss": 0.5951, + "step": 2545 + }, + { + "epoch": 0.81472, + "grad_norm": 0.36921175003301293, + "learning_rate": 1.747355813907704e-05, + "loss": 0.6196, + "step": 2546 + }, + { + "epoch": 0.81504, + "grad_norm": 0.3247460409488971, + "learning_rate": 1.7415067225177893e-05, + "loss": 0.5536, + "step": 2547 + }, + { + "epoch": 0.81536, + "grad_norm": 0.34154135482010906, + "learning_rate": 1.73566650327333e-05, + "loss": 0.5606, + "step": 2548 + }, + { + "epoch": 0.81568, + "grad_norm": 0.3584651750407313, + "learning_rate": 1.7298351624485065e-05, + "loss": 0.5707, + "step": 2549 + }, + { + "epoch": 0.816, + "grad_norm": 0.3517416392835679, + "learning_rate": 1.724012706307966e-05, + "loss": 0.5829, + "step": 2550 + }, + { + "epoch": 0.81632, + "grad_norm": 0.32107824931031526, + "learning_rate": 1.7181991411067987e-05, + "loss": 0.5364, + "step": 2551 + }, + { + "epoch": 0.81664, + "grad_norm": 0.319947206131299, + "learning_rate": 1.712394473090555e-05, + "loss": 0.5448, + "step": 2552 + }, + { + "epoch": 0.81696, + "grad_norm": 0.3446025975101544, + "learning_rate": 1.7065987084952217e-05, + "loss": 0.5848, + "step": 2553 + }, + { + "epoch": 0.81728, + "grad_norm": 0.357778438430661, + "learning_rate": 1.7008118535472196e-05, + "loss": 0.5725, + "step": 2554 + }, + { + "epoch": 0.8176, + "grad_norm": 0.3549516723717266, + "learning_rate": 1.6950339144633975e-05, + "loss": 0.589, + "step": 2555 + }, + { + "epoch": 0.81792, + "grad_norm": 0.3535820407730016, + "learning_rate": 1.6892648974510328e-05, + "loss": 0.5997, + "step": 2556 + }, + { + "epoch": 0.81824, + "grad_norm": 0.3443859652005636, + "learning_rate": 1.6835048087078075e-05, + "loss": 0.5959, + "step": 2557 + }, + { + "epoch": 0.81856, + "grad_norm": 0.35135030865264794, + "learning_rate": 1.677753654421821e-05, + "loss": 0.5563, + "step": 2558 + }, + { + "epoch": 0.81888, + "grad_norm": 0.3517772511502157, + "learning_rate": 1.6720114407715658e-05, + "loss": 0.583, + "step": 2559 + }, + { + "epoch": 0.8192, + "grad_norm": 0.37629848204855904, + "learning_rate": 1.6662781739259403e-05, + "loss": 0.5675, + "step": 2560 + }, + { + "epoch": 0.81952, + "grad_norm": 0.3329615188076578, + "learning_rate": 1.6605538600442194e-05, + "loss": 0.5936, + "step": 2561 + }, + { + "epoch": 0.81984, + "grad_norm": 0.34608915327448414, + "learning_rate": 1.6548385052760674e-05, + "loss": 0.5779, + "step": 2562 + }, + { + "epoch": 0.82016, + "grad_norm": 0.38482917890870877, + "learning_rate": 1.6491321157615257e-05, + "loss": 0.539, + "step": 2563 + }, + { + "epoch": 0.82048, + "grad_norm": 0.354495920622266, + "learning_rate": 1.6434346976309943e-05, + "loss": 0.5032, + "step": 2564 + }, + { + "epoch": 0.8208, + "grad_norm": 0.3342699345517741, + "learning_rate": 1.6377462570052438e-05, + "loss": 0.5739, + "step": 2565 + }, + { + "epoch": 0.82112, + "grad_norm": 0.3476272880919875, + "learning_rate": 1.632066799995401e-05, + "loss": 0.5887, + "step": 2566 + }, + { + "epoch": 0.82144, + "grad_norm": 0.47916521297807313, + "learning_rate": 1.626396332702933e-05, + "loss": 0.5443, + "step": 2567 + }, + { + "epoch": 0.82176, + "grad_norm": 0.3178166462315333, + "learning_rate": 1.620734861219658e-05, + "loss": 0.5439, + "step": 2568 + }, + { + "epoch": 0.82208, + "grad_norm": 0.34288874484912857, + "learning_rate": 1.6150823916277248e-05, + "loss": 0.5582, + "step": 2569 + }, + { + "epoch": 0.8224, + "grad_norm": 0.41067229081441986, + "learning_rate": 1.6094389299996125e-05, + "loss": 0.6094, + "step": 2570 + }, + { + "epoch": 0.82272, + "grad_norm": 0.3618375438025809, + "learning_rate": 1.603804482398127e-05, + "loss": 0.6114, + "step": 2571 + }, + { + "epoch": 0.82304, + "grad_norm": 0.3790824543863109, + "learning_rate": 1.598179054876382e-05, + "loss": 0.6148, + "step": 2572 + }, + { + "epoch": 0.82336, + "grad_norm": 0.3485899110997312, + "learning_rate": 1.5925626534778103e-05, + "loss": 0.557, + "step": 2573 + }, + { + "epoch": 0.82368, + "grad_norm": 0.3514882907126632, + "learning_rate": 1.5869552842361378e-05, + "loss": 0.5372, + "step": 2574 + }, + { + "epoch": 0.824, + "grad_norm": 0.3364951753060846, + "learning_rate": 1.5813569531753968e-05, + "loss": 0.605, + "step": 2575 + }, + { + "epoch": 0.82432, + "grad_norm": 0.3684388543836776, + "learning_rate": 1.5757676663099076e-05, + "loss": 0.6105, + "step": 2576 + }, + { + "epoch": 0.82464, + "grad_norm": 0.3321301381734369, + "learning_rate": 1.5701874296442665e-05, + "loss": 0.5641, + "step": 2577 + }, + { + "epoch": 0.82496, + "grad_norm": 0.33324934501204545, + "learning_rate": 1.564616249173355e-05, + "loss": 0.5505, + "step": 2578 + }, + { + "epoch": 0.82528, + "grad_norm": 0.3543134291092468, + "learning_rate": 1.559054130882327e-05, + "loss": 0.529, + "step": 2579 + }, + { + "epoch": 0.8256, + "grad_norm": 0.3581079442657288, + "learning_rate": 1.553501080746592e-05, + "loss": 0.5957, + "step": 2580 + }, + { + "epoch": 0.82592, + "grad_norm": 0.34821991077229714, + "learning_rate": 1.5479571047318287e-05, + "loss": 0.5619, + "step": 2581 + }, + { + "epoch": 0.82624, + "grad_norm": 0.3446841489137521, + "learning_rate": 1.5424222087939544e-05, + "loss": 0.5579, + "step": 2582 + }, + { + "epoch": 0.82656, + "grad_norm": 0.3512213595332296, + "learning_rate": 1.5368963988791453e-05, + "loss": 0.548, + "step": 2583 + }, + { + "epoch": 0.82688, + "grad_norm": 0.35407948504044434, + "learning_rate": 1.5313796809238057e-05, + "loss": 0.5291, + "step": 2584 + }, + { + "epoch": 0.8272, + "grad_norm": 0.35336483465835566, + "learning_rate": 1.5258720608545762e-05, + "loss": 0.6354, + "step": 2585 + }, + { + "epoch": 0.82752, + "grad_norm": 0.35100577830346985, + "learning_rate": 1.5203735445883282e-05, + "loss": 0.5833, + "step": 2586 + }, + { + "epoch": 0.82784, + "grad_norm": 0.37158239057050696, + "learning_rate": 1.514884138032142e-05, + "loss": 0.5611, + "step": 2587 + }, + { + "epoch": 0.82816, + "grad_norm": 0.3591429055911957, + "learning_rate": 1.5094038470833217e-05, + "loss": 0.6189, + "step": 2588 + }, + { + "epoch": 0.82848, + "grad_norm": 0.3624361997159548, + "learning_rate": 1.5039326776293771e-05, + "loss": 0.6094, + "step": 2589 + }, + { + "epoch": 0.8288, + "grad_norm": 0.348654141636165, + "learning_rate": 1.4984706355480094e-05, + "loss": 0.6149, + "step": 2590 + }, + { + "epoch": 0.82912, + "grad_norm": 0.33611335623002886, + "learning_rate": 1.4930177267071277e-05, + "loss": 0.5616, + "step": 2591 + }, + { + "epoch": 0.82944, + "grad_norm": 0.546330809890968, + "learning_rate": 1.4875739569648172e-05, + "loss": 0.5911, + "step": 2592 + }, + { + "epoch": 0.82976, + "grad_norm": 0.3929949240286303, + "learning_rate": 1.4821393321693523e-05, + "loss": 0.5749, + "step": 2593 + }, + { + "epoch": 0.83008, + "grad_norm": 0.36194757328710503, + "learning_rate": 1.4767138581591822e-05, + "loss": 0.5784, + "step": 2594 + }, + { + "epoch": 0.8304, + "grad_norm": 0.3404384657653635, + "learning_rate": 1.4712975407629203e-05, + "loss": 0.5713, + "step": 2595 + }, + { + "epoch": 0.83072, + "grad_norm": 0.33359288488877453, + "learning_rate": 1.4658903857993489e-05, + "loss": 0.5369, + "step": 2596 + }, + { + "epoch": 0.83104, + "grad_norm": 0.33777074819030184, + "learning_rate": 1.4604923990774067e-05, + "loss": 0.5617, + "step": 2597 + }, + { + "epoch": 0.83136, + "grad_norm": 0.4123918560404225, + "learning_rate": 1.455103586396177e-05, + "loss": 0.6266, + "step": 2598 + }, + { + "epoch": 0.83168, + "grad_norm": 0.3408086516335801, + "learning_rate": 1.449723953544896e-05, + "loss": 0.5591, + "step": 2599 + }, + { + "epoch": 0.832, + "grad_norm": 0.33205955167379664, + "learning_rate": 1.4443535063029279e-05, + "loss": 0.5408, + "step": 2600 + }, + { + "epoch": 0.83232, + "grad_norm": 0.35731959101580363, + "learning_rate": 1.4389922504397769e-05, + "loss": 0.5763, + "step": 2601 + }, + { + "epoch": 0.83264, + "grad_norm": 0.3351865605996006, + "learning_rate": 1.433640191715072e-05, + "loss": 0.5408, + "step": 2602 + }, + { + "epoch": 0.83296, + "grad_norm": 0.34730184802784503, + "learning_rate": 1.4282973358785557e-05, + "loss": 0.586, + "step": 2603 + }, + { + "epoch": 0.83328, + "grad_norm": 0.3669686168935996, + "learning_rate": 1.4229636886700925e-05, + "loss": 0.5721, + "step": 2604 + }, + { + "epoch": 0.8336, + "grad_norm": 0.33472540609377066, + "learning_rate": 1.417639255819645e-05, + "loss": 0.587, + "step": 2605 + }, + { + "epoch": 0.83392, + "grad_norm": 0.37710979799483263, + "learning_rate": 1.4123240430472828e-05, + "loss": 0.5949, + "step": 2606 + }, + { + "epoch": 0.83424, + "grad_norm": 0.3253109261838384, + "learning_rate": 1.4070180560631707e-05, + "loss": 0.5777, + "step": 2607 + }, + { + "epoch": 0.83456, + "grad_norm": 0.34239123450888104, + "learning_rate": 1.4017213005675567e-05, + "loss": 0.5544, + "step": 2608 + }, + { + "epoch": 0.83488, + "grad_norm": 0.33994658984642845, + "learning_rate": 1.396433782250779e-05, + "loss": 0.6011, + "step": 2609 + }, + { + "epoch": 0.8352, + "grad_norm": 0.3441195074068338, + "learning_rate": 1.3911555067932425e-05, + "loss": 0.5852, + "step": 2610 + }, + { + "epoch": 0.83552, + "grad_norm": 0.3555807272291283, + "learning_rate": 1.3858864798654347e-05, + "loss": 0.5839, + "step": 2611 + }, + { + "epoch": 0.83584, + "grad_norm": 0.36325441830243516, + "learning_rate": 1.3806267071278934e-05, + "loss": 0.5695, + "step": 2612 + }, + { + "epoch": 0.83616, + "grad_norm": 0.3953561428405037, + "learning_rate": 1.3753761942312294e-05, + "loss": 0.6256, + "step": 2613 + }, + { + "epoch": 0.83648, + "grad_norm": 0.3652391929111533, + "learning_rate": 1.3701349468160906e-05, + "loss": 0.6038, + "step": 2614 + }, + { + "epoch": 0.8368, + "grad_norm": 0.3470206100630685, + "learning_rate": 1.3649029705131844e-05, + "loss": 0.5945, + "step": 2615 + }, + { + "epoch": 0.83712, + "grad_norm": 0.341386609067006, + "learning_rate": 1.3596802709432466e-05, + "loss": 0.5598, + "step": 2616 + }, + { + "epoch": 0.83744, + "grad_norm": 0.31995790343333264, + "learning_rate": 1.354466853717059e-05, + "loss": 0.5911, + "step": 2617 + }, + { + "epoch": 0.83776, + "grad_norm": 0.33143148180840015, + "learning_rate": 1.3492627244354195e-05, + "loss": 0.5413, + "step": 2618 + }, + { + "epoch": 0.83808, + "grad_norm": 0.34003158625988406, + "learning_rate": 1.3440678886891544e-05, + "loss": 0.5552, + "step": 2619 + }, + { + "epoch": 0.8384, + "grad_norm": 0.3340954964269716, + "learning_rate": 1.3388823520591077e-05, + "loss": 0.5752, + "step": 2620 + }, + { + "epoch": 0.83872, + "grad_norm": 0.3412906441356953, + "learning_rate": 1.333706120116126e-05, + "loss": 0.5518, + "step": 2621 + }, + { + "epoch": 0.83904, + "grad_norm": 0.518691759525488, + "learning_rate": 1.3285391984210694e-05, + "loss": 0.5325, + "step": 2622 + }, + { + "epoch": 0.83936, + "grad_norm": 0.34948417036987467, + "learning_rate": 1.3233815925247839e-05, + "loss": 0.5793, + "step": 2623 + }, + { + "epoch": 0.83968, + "grad_norm": 0.33387704714373895, + "learning_rate": 1.3182333079681197e-05, + "loss": 0.5559, + "step": 2624 + }, + { + "epoch": 0.84, + "grad_norm": 0.3478150704785004, + "learning_rate": 1.3130943502819082e-05, + "loss": 0.587, + "step": 2625 + }, + { + "epoch": 0.84032, + "grad_norm": 0.34808433684798074, + "learning_rate": 1.3079647249869554e-05, + "loss": 0.5573, + "step": 2626 + }, + { + "epoch": 0.84064, + "grad_norm": 0.3555303762744616, + "learning_rate": 1.3028444375940496e-05, + "loss": 0.5422, + "step": 2627 + }, + { + "epoch": 0.84096, + "grad_norm": 0.36693657979142713, + "learning_rate": 1.2977334936039454e-05, + "loss": 0.5699, + "step": 2628 + }, + { + "epoch": 0.84128, + "grad_norm": 0.3888044978014789, + "learning_rate": 1.292631898507356e-05, + "loss": 0.598, + "step": 2629 + }, + { + "epoch": 0.8416, + "grad_norm": 0.35891004519502656, + "learning_rate": 1.2875396577849552e-05, + "loss": 0.5889, + "step": 2630 + }, + { + "epoch": 0.84192, + "grad_norm": 0.36315079256525856, + "learning_rate": 1.2824567769073636e-05, + "loss": 0.5662, + "step": 2631 + }, + { + "epoch": 0.84224, + "grad_norm": 0.3586950693191108, + "learning_rate": 1.27738326133515e-05, + "loss": 0.5605, + "step": 2632 + }, + { + "epoch": 0.84256, + "grad_norm": 0.3704429464120191, + "learning_rate": 1.2723191165188219e-05, + "loss": 0.555, + "step": 2633 + }, + { + "epoch": 0.84288, + "grad_norm": 0.35981620667429703, + "learning_rate": 1.2672643478988144e-05, + "loss": 0.5897, + "step": 2634 + }, + { + "epoch": 0.8432, + "grad_norm": 0.37174072493699917, + "learning_rate": 1.2622189609054979e-05, + "loss": 0.5559, + "step": 2635 + }, + { + "epoch": 0.84352, + "grad_norm": 0.3366993256397614, + "learning_rate": 1.2571829609591568e-05, + "loss": 0.5548, + "step": 2636 + }, + { + "epoch": 0.84384, + "grad_norm": 0.34727323293654794, + "learning_rate": 1.2521563534699964e-05, + "loss": 0.5758, + "step": 2637 + }, + { + "epoch": 0.84416, + "grad_norm": 0.3275944921982183, + "learning_rate": 1.2471391438381264e-05, + "loss": 0.5549, + "step": 2638 + }, + { + "epoch": 0.84448, + "grad_norm": 0.3639262908404435, + "learning_rate": 1.242131337453567e-05, + "loss": 0.5556, + "step": 2639 + }, + { + "epoch": 0.8448, + "grad_norm": 0.35231147612482977, + "learning_rate": 1.2371329396962273e-05, + "loss": 0.5532, + "step": 2640 + }, + { + "epoch": 0.84512, + "grad_norm": 0.3830053274716528, + "learning_rate": 1.2321439559359193e-05, + "loss": 0.6275, + "step": 2641 + }, + { + "epoch": 0.84544, + "grad_norm": 0.31845931261951443, + "learning_rate": 1.2271643915323317e-05, + "loss": 0.5472, + "step": 2642 + }, + { + "epoch": 0.84576, + "grad_norm": 0.3427011213998191, + "learning_rate": 1.2221942518350415e-05, + "loss": 0.5658, + "step": 2643 + }, + { + "epoch": 0.84608, + "grad_norm": 0.3513626484067751, + "learning_rate": 1.2172335421834957e-05, + "loss": 0.5861, + "step": 2644 + }, + { + "epoch": 0.8464, + "grad_norm": 0.35135266995205605, + "learning_rate": 1.2122822679070122e-05, + "loss": 0.6017, + "step": 2645 + }, + { + "epoch": 0.84672, + "grad_norm": 0.33436155632830905, + "learning_rate": 1.2073404343247752e-05, + "loss": 0.5248, + "step": 2646 + }, + { + "epoch": 0.84704, + "grad_norm": 0.3374426104451427, + "learning_rate": 1.202408046745821e-05, + "loss": 0.5784, + "step": 2647 + }, + { + "epoch": 0.84736, + "grad_norm": 0.3453557098555006, + "learning_rate": 1.1974851104690444e-05, + "loss": 0.5397, + "step": 2648 + }, + { + "epoch": 0.84768, + "grad_norm": 0.3625372888969604, + "learning_rate": 1.192571630783179e-05, + "loss": 0.5768, + "step": 2649 + }, + { + "epoch": 0.848, + "grad_norm": 0.3459052337748261, + "learning_rate": 1.1876676129668075e-05, + "loss": 0.5807, + "step": 2650 + }, + { + "epoch": 0.84832, + "grad_norm": 0.40475236982731055, + "learning_rate": 1.1827730622883425e-05, + "loss": 0.5746, + "step": 2651 + }, + { + "epoch": 0.84864, + "grad_norm": 0.34425907542118983, + "learning_rate": 1.1778879840060253e-05, + "loss": 0.5915, + "step": 2652 + }, + { + "epoch": 0.84896, + "grad_norm": 0.5933119668400415, + "learning_rate": 1.173012383367923e-05, + "loss": 0.576, + "step": 2653 + }, + { + "epoch": 0.84928, + "grad_norm": 0.3331184263117235, + "learning_rate": 1.1681462656119257e-05, + "loss": 0.5637, + "step": 2654 + }, + { + "epoch": 0.8496, + "grad_norm": 0.37719069707307695, + "learning_rate": 1.163289635965723e-05, + "loss": 0.558, + "step": 2655 + }, + { + "epoch": 0.84992, + "grad_norm": 0.37043432070547594, + "learning_rate": 1.1584424996468268e-05, + "loss": 0.6001, + "step": 2656 + }, + { + "epoch": 0.85024, + "grad_norm": 0.36164734024045697, + "learning_rate": 1.1536048618625362e-05, + "loss": 0.6163, + "step": 2657 + }, + { + "epoch": 0.85056, + "grad_norm": 0.34034283702173906, + "learning_rate": 1.148776727809956e-05, + "loss": 0.552, + "step": 2658 + }, + { + "epoch": 0.85088, + "grad_norm": 0.38567877856658006, + "learning_rate": 1.1439581026759783e-05, + "loss": 0.6218, + "step": 2659 + }, + { + "epoch": 0.8512, + "grad_norm": 0.3591955610438624, + "learning_rate": 1.1391489916372766e-05, + "loss": 0.579, + "step": 2660 + }, + { + "epoch": 0.85152, + "grad_norm": 0.33245191012461534, + "learning_rate": 1.1343493998603083e-05, + "loss": 0.5753, + "step": 2661 + }, + { + "epoch": 0.85184, + "grad_norm": 0.47976499849916715, + "learning_rate": 1.1295593325012988e-05, + "loss": 0.5957, + "step": 2662 + }, + { + "epoch": 0.85216, + "grad_norm": 0.35215599710990786, + "learning_rate": 1.124778794706245e-05, + "loss": 0.5688, + "step": 2663 + }, + { + "epoch": 0.85248, + "grad_norm": 0.40121587600172964, + "learning_rate": 1.1200077916109075e-05, + "loss": 0.5883, + "step": 2664 + }, + { + "epoch": 0.8528, + "grad_norm": 0.37006485265448724, + "learning_rate": 1.1152463283407987e-05, + "loss": 0.5781, + "step": 2665 + }, + { + "epoch": 0.85312, + "grad_norm": 0.3712484528133468, + "learning_rate": 1.1104944100111891e-05, + "loss": 0.5832, + "step": 2666 + }, + { + "epoch": 0.85344, + "grad_norm": 0.3470463640747448, + "learning_rate": 1.1057520417270873e-05, + "loss": 0.5591, + "step": 2667 + }, + { + "epoch": 0.85376, + "grad_norm": 0.35187055682418883, + "learning_rate": 1.1010192285832466e-05, + "loss": 0.5796, + "step": 2668 + }, + { + "epoch": 0.85408, + "grad_norm": 0.363763686915762, + "learning_rate": 1.0962959756641566e-05, + "loss": 0.6015, + "step": 2669 + }, + { + "epoch": 0.8544, + "grad_norm": 0.35589410689397755, + "learning_rate": 1.0915822880440308e-05, + "loss": 0.6153, + "step": 2670 + }, + { + "epoch": 0.85472, + "grad_norm": 0.3686280230439948, + "learning_rate": 1.0868781707868126e-05, + "loss": 0.5925, + "step": 2671 + }, + { + "epoch": 0.85504, + "grad_norm": 0.333496968547741, + "learning_rate": 1.0821836289461628e-05, + "loss": 0.5258, + "step": 2672 + }, + { + "epoch": 0.85536, + "grad_norm": 0.3534845869134167, + "learning_rate": 1.0774986675654509e-05, + "loss": 0.6064, + "step": 2673 + }, + { + "epoch": 0.85568, + "grad_norm": 0.3285824331187314, + "learning_rate": 1.0728232916777604e-05, + "loss": 0.5045, + "step": 2674 + }, + { + "epoch": 0.856, + "grad_norm": 0.4422402636765624, + "learning_rate": 1.068157506305869e-05, + "loss": 0.5988, + "step": 2675 + }, + { + "epoch": 0.85632, + "grad_norm": 0.3277821779984777, + "learning_rate": 1.0635013164622598e-05, + "loss": 0.5255, + "step": 2676 + }, + { + "epoch": 0.85664, + "grad_norm": 0.3527888405250758, + "learning_rate": 1.0588547271491033e-05, + "loss": 0.564, + "step": 2677 + }, + { + "epoch": 0.85696, + "grad_norm": 0.31995112530676284, + "learning_rate": 1.0542177433582545e-05, + "loss": 0.5438, + "step": 2678 + }, + { + "epoch": 0.85728, + "grad_norm": 0.3721048764363539, + "learning_rate": 1.049590370071254e-05, + "loss": 0.6162, + "step": 2679 + }, + { + "epoch": 0.8576, + "grad_norm": 0.36379566636130883, + "learning_rate": 1.0449726122593107e-05, + "loss": 0.5804, + "step": 2680 + }, + { + "epoch": 0.85792, + "grad_norm": 0.3694160420463327, + "learning_rate": 1.0403644748833097e-05, + "loss": 0.5664, + "step": 2681 + }, + { + "epoch": 0.85824, + "grad_norm": 0.3438838456015583, + "learning_rate": 1.035765962893801e-05, + "loss": 0.5638, + "step": 2682 + }, + { + "epoch": 0.85856, + "grad_norm": 0.3407604631279433, + "learning_rate": 1.0311770812309873e-05, + "loss": 0.597, + "step": 2683 + }, + { + "epoch": 0.85888, + "grad_norm": 0.34980604356141215, + "learning_rate": 1.0265978348247318e-05, + "loss": 0.5294, + "step": 2684 + }, + { + "epoch": 0.8592, + "grad_norm": 0.33074675296681794, + "learning_rate": 1.022028228594547e-05, + "loss": 0.5592, + "step": 2685 + }, + { + "epoch": 0.85952, + "grad_norm": 0.35646341842882845, + "learning_rate": 1.0174682674495827e-05, + "loss": 0.5884, + "step": 2686 + }, + { + "epoch": 0.85984, + "grad_norm": 0.3327559283842682, + "learning_rate": 1.0129179562886327e-05, + "loss": 0.591, + "step": 2687 + }, + { + "epoch": 0.86016, + "grad_norm": 0.3562247424285518, + "learning_rate": 1.0083773000001207e-05, + "loss": 0.5409, + "step": 2688 + }, + { + "epoch": 0.86048, + "grad_norm": 0.34271633882254676, + "learning_rate": 1.0038463034620982e-05, + "loss": 0.5602, + "step": 2689 + }, + { + "epoch": 0.8608, + "grad_norm": 0.45829064243611684, + "learning_rate": 9.993249715422437e-06, + "loss": 0.5338, + "step": 2690 + }, + { + "epoch": 0.86112, + "grad_norm": 0.3571020177254091, + "learning_rate": 9.94813309097844e-06, + "loss": 0.5799, + "step": 2691 + }, + { + "epoch": 0.86144, + "grad_norm": 0.7690030167190615, + "learning_rate": 9.903113209758096e-06, + "loss": 0.5733, + "step": 2692 + }, + { + "epoch": 0.86176, + "grad_norm": 0.36397672579443835, + "learning_rate": 9.858190120126454e-06, + "loss": 0.5551, + "step": 2693 + }, + { + "epoch": 0.86208, + "grad_norm": 0.33307799914059033, + "learning_rate": 9.813363870344683e-06, + "loss": 0.59, + "step": 2694 + }, + { + "epoch": 0.8624, + "grad_norm": 0.35105460157092433, + "learning_rate": 9.76863450856984e-06, + "loss": 0.5773, + "step": 2695 + }, + { + "epoch": 0.86272, + "grad_norm": 0.344957999458551, + "learning_rate": 9.724002082854977e-06, + "loss": 0.5622, + "step": 2696 + }, + { + "epoch": 0.86304, + "grad_norm": 0.36117553927246615, + "learning_rate": 9.679466641148916e-06, + "loss": 0.6403, + "step": 2697 + }, + { + "epoch": 0.86336, + "grad_norm": 0.34524868159182515, + "learning_rate": 9.635028231296327e-06, + "loss": 0.5476, + "step": 2698 + }, + { + "epoch": 0.86368, + "grad_norm": 0.362278650943625, + "learning_rate": 9.590686901037648e-06, + "loss": 0.5699, + "step": 2699 + }, + { + "epoch": 0.864, + "grad_norm": 0.34422212501217686, + "learning_rate": 9.546442698009061e-06, + "loss": 0.5619, + "step": 2700 + }, + { + "epoch": 0.86432, + "grad_norm": 0.42291120569078267, + "learning_rate": 9.502295669742289e-06, + "loss": 0.551, + "step": 2701 + }, + { + "epoch": 0.86464, + "grad_norm": 0.35497718610586765, + "learning_rate": 9.458245863664783e-06, + "loss": 0.5773, + "step": 2702 + }, + { + "epoch": 0.86496, + "grad_norm": 0.3589322565167151, + "learning_rate": 9.414293327099489e-06, + "loss": 0.6141, + "step": 2703 + }, + { + "epoch": 0.86528, + "grad_norm": 0.3579489197133505, + "learning_rate": 9.370438107264846e-06, + "loss": 0.5811, + "step": 2704 + }, + { + "epoch": 0.8656, + "grad_norm": 0.3361526389029414, + "learning_rate": 9.326680251274777e-06, + "loss": 0.5775, + "step": 2705 + }, + { + "epoch": 0.86592, + "grad_norm": 0.3620336094950284, + "learning_rate": 9.283019806138582e-06, + "loss": 0.6233, + "step": 2706 + }, + { + "epoch": 0.86624, + "grad_norm": 0.35552574324499514, + "learning_rate": 9.239456818760905e-06, + "loss": 0.604, + "step": 2707 + }, + { + "epoch": 0.86656, + "grad_norm": 0.3626207169312369, + "learning_rate": 9.195991335941756e-06, + "loss": 0.5535, + "step": 2708 + }, + { + "epoch": 0.86688, + "grad_norm": 0.36165536414025484, + "learning_rate": 9.152623404376293e-06, + "loss": 0.5794, + "step": 2709 + }, + { + "epoch": 0.8672, + "grad_norm": 0.34087441439371674, + "learning_rate": 9.10935307065497e-06, + "loss": 0.4998, + "step": 2710 + }, + { + "epoch": 0.86752, + "grad_norm": 0.32784235444910514, + "learning_rate": 9.0661803812633e-06, + "loss": 0.4953, + "step": 2711 + }, + { + "epoch": 0.86784, + "grad_norm": 0.33921486197812983, + "learning_rate": 9.023105382581975e-06, + "loss": 0.5409, + "step": 2712 + }, + { + "epoch": 0.86816, + "grad_norm": 0.9129041946953053, + "learning_rate": 8.980128120886722e-06, + "loss": 0.5119, + "step": 2713 + }, + { + "epoch": 0.86848, + "grad_norm": 0.38430006608115663, + "learning_rate": 8.937248642348218e-06, + "loss": 0.5736, + "step": 2714 + }, + { + "epoch": 0.8688, + "grad_norm": 0.35065551886106366, + "learning_rate": 8.894466993032147e-06, + "loss": 0.5856, + "step": 2715 + }, + { + "epoch": 0.86912, + "grad_norm": 0.33416199414926295, + "learning_rate": 8.85178321889908e-06, + "loss": 0.5528, + "step": 2716 + }, + { + "epoch": 0.86944, + "grad_norm": 0.37447740057728146, + "learning_rate": 8.809197365804401e-06, + "loss": 0.6337, + "step": 2717 + }, + { + "epoch": 0.86976, + "grad_norm": 0.336091885343065, + "learning_rate": 8.76670947949838e-06, + "loss": 0.565, + "step": 2718 + }, + { + "epoch": 0.87008, + "grad_norm": 0.34521337704582594, + "learning_rate": 8.72431960562594e-06, + "loss": 0.5484, + "step": 2719 + }, + { + "epoch": 0.8704, + "grad_norm": 0.35216516000853687, + "learning_rate": 8.68202778972681e-06, + "loss": 0.5447, + "step": 2720 + }, + { + "epoch": 0.87072, + "grad_norm": 0.32365542660433744, + "learning_rate": 8.639834077235266e-06, + "loss": 0.5384, + "step": 2721 + }, + { + "epoch": 0.87104, + "grad_norm": 0.3659657423997534, + "learning_rate": 8.597738513480302e-06, + "loss": 0.5619, + "step": 2722 + }, + { + "epoch": 0.87136, + "grad_norm": 0.3617534147684829, + "learning_rate": 8.555741143685381e-06, + "loss": 0.558, + "step": 2723 + }, + { + "epoch": 0.87168, + "grad_norm": 0.3370336801159234, + "learning_rate": 8.513842012968543e-06, + "loss": 0.5563, + "step": 2724 + }, + { + "epoch": 0.872, + "grad_norm": 0.32865741894191847, + "learning_rate": 8.472041166342216e-06, + "loss": 0.5532, + "step": 2725 + }, + { + "epoch": 0.87232, + "grad_norm": 0.3187491837381094, + "learning_rate": 8.430338648713332e-06, + "loss": 0.5469, + "step": 2726 + }, + { + "epoch": 0.87264, + "grad_norm": 0.4030900100656536, + "learning_rate": 8.388734504883088e-06, + "loss": 0.6269, + "step": 2727 + }, + { + "epoch": 0.87296, + "grad_norm": 0.3622155743104728, + "learning_rate": 8.347228779547078e-06, + "loss": 0.5491, + "step": 2728 + }, + { + "epoch": 0.87328, + "grad_norm": 0.34111289607719547, + "learning_rate": 8.305821517295154e-06, + "loss": 0.5929, + "step": 2729 + }, + { + "epoch": 0.8736, + "grad_norm": 0.35619153641275386, + "learning_rate": 8.264512762611321e-06, + "loss": 0.5512, + "step": 2730 + }, + { + "epoch": 0.87392, + "grad_norm": 0.44335790091425353, + "learning_rate": 8.223302559873857e-06, + "loss": 0.5873, + "step": 2731 + }, + { + "epoch": 0.87424, + "grad_norm": 0.3340282323353771, + "learning_rate": 8.182190953355063e-06, + "loss": 0.5574, + "step": 2732 + }, + { + "epoch": 0.87456, + "grad_norm": 0.34343220247981865, + "learning_rate": 8.141177987221394e-06, + "loss": 0.5417, + "step": 2733 + }, + { + "epoch": 0.87488, + "grad_norm": 0.313879474653681, + "learning_rate": 8.100263705533317e-06, + "loss": 0.5172, + "step": 2734 + }, + { + "epoch": 0.8752, + "grad_norm": 0.3594818762354667, + "learning_rate": 8.059448152245242e-06, + "loss": 0.5766, + "step": 2735 + }, + { + "epoch": 0.87552, + "grad_norm": 0.32364856195429936, + "learning_rate": 8.01873137120559e-06, + "loss": 0.5567, + "step": 2736 + }, + { + "epoch": 0.87584, + "grad_norm": 0.3749724363509122, + "learning_rate": 7.978113406156584e-06, + "loss": 0.5476, + "step": 2737 + }, + { + "epoch": 0.87616, + "grad_norm": 0.3472650981403551, + "learning_rate": 7.93759430073434e-06, + "loss": 0.5652, + "step": 2738 + }, + { + "epoch": 0.87648, + "grad_norm": 0.3533542922260968, + "learning_rate": 7.897174098468797e-06, + "loss": 0.5877, + "step": 2739 + }, + { + "epoch": 0.8768, + "grad_norm": 0.34917540582951834, + "learning_rate": 7.856852842783547e-06, + "loss": 0.5656, + "step": 2740 + }, + { + "epoch": 0.87712, + "grad_norm": 0.3623230991737994, + "learning_rate": 7.816630576995987e-06, + "loss": 0.5675, + "step": 2741 + }, + { + "epoch": 0.87744, + "grad_norm": 0.3508426262275108, + "learning_rate": 7.776507344317097e-06, + "loss": 0.5662, + "step": 2742 + }, + { + "epoch": 0.87776, + "grad_norm": 0.3533153531625396, + "learning_rate": 7.736483187851484e-06, + "loss": 0.5869, + "step": 2743 + }, + { + "epoch": 0.87808, + "grad_norm": 0.3706637973494316, + "learning_rate": 7.696558150597356e-06, + "loss": 0.5505, + "step": 2744 + }, + { + "epoch": 0.8784, + "grad_norm": 0.40496628588296135, + "learning_rate": 7.656732275446366e-06, + "loss": 0.6336, + "step": 2745 + }, + { + "epoch": 0.87872, + "grad_norm": 0.32218960924854395, + "learning_rate": 7.61700560518368e-06, + "loss": 0.5454, + "step": 2746 + }, + { + "epoch": 0.87904, + "grad_norm": 0.3303135408331546, + "learning_rate": 7.577378182487926e-06, + "loss": 0.5644, + "step": 2747 + }, + { + "epoch": 0.87936, + "grad_norm": 0.353959313934574, + "learning_rate": 7.537850049931006e-06, + "loss": 0.5964, + "step": 2748 + }, + { + "epoch": 0.87968, + "grad_norm": 0.3494529531061012, + "learning_rate": 7.498421249978249e-06, + "loss": 0.5526, + "step": 2749 + }, + { + "epoch": 0.88, + "grad_norm": 0.3695466281187068, + "learning_rate": 7.459091824988229e-06, + "loss": 0.5667, + "step": 2750 + }, + { + "epoch": 0.88032, + "grad_norm": 0.3394420751070245, + "learning_rate": 7.419861817212758e-06, + "loss": 0.5498, + "step": 2751 + }, + { + "epoch": 0.88064, + "grad_norm": 0.3751188943499607, + "learning_rate": 7.380731268796859e-06, + "loss": 0.6279, + "step": 2752 + }, + { + "epoch": 0.88096, + "grad_norm": 0.3474335882234515, + "learning_rate": 7.341700221778691e-06, + "loss": 0.613, + "step": 2753 + }, + { + "epoch": 0.88128, + "grad_norm": 0.3828786412677813, + "learning_rate": 7.3027687180895475e-06, + "loss": 0.6296, + "step": 2754 + }, + { + "epoch": 0.8816, + "grad_norm": 0.32437881156472254, + "learning_rate": 7.263936799553728e-06, + "loss": 0.5602, + "step": 2755 + }, + { + "epoch": 0.88192, + "grad_norm": 0.3295470948671417, + "learning_rate": 7.2252045078885945e-06, + "loss": 0.5965, + "step": 2756 + }, + { + "epoch": 0.88224, + "grad_norm": 0.3635410050690401, + "learning_rate": 7.186571884704474e-06, + "loss": 0.6021, + "step": 2757 + }, + { + "epoch": 0.88256, + "grad_norm": 0.33213646315506595, + "learning_rate": 7.1480389715046e-06, + "loss": 0.5428, + "step": 2758 + }, + { + "epoch": 0.88288, + "grad_norm": 0.3653316523278171, + "learning_rate": 7.109605809685094e-06, + "loss": 0.5797, + "step": 2759 + }, + { + "epoch": 0.8832, + "grad_norm": 0.33986015532916414, + "learning_rate": 7.071272440534937e-06, + "loss": 0.5734, + "step": 2760 + }, + { + "epoch": 0.88352, + "grad_norm": 0.3552870208145124, + "learning_rate": 7.033038905235845e-06, + "loss": 0.575, + "step": 2761 + }, + { + "epoch": 0.88384, + "grad_norm": 0.3576248551259227, + "learning_rate": 6.994905244862349e-06, + "loss": 0.587, + "step": 2762 + }, + { + "epoch": 0.88416, + "grad_norm": 0.3636714688443094, + "learning_rate": 6.956871500381634e-06, + "loss": 0.577, + "step": 2763 + }, + { + "epoch": 0.88448, + "grad_norm": 0.3482394817957405, + "learning_rate": 6.918937712653584e-06, + "loss": 0.5612, + "step": 2764 + }, + { + "epoch": 0.8848, + "grad_norm": 0.34634732600544005, + "learning_rate": 6.881103922430665e-06, + "loss": 0.561, + "step": 2765 + }, + { + "epoch": 0.88512, + "grad_norm": 0.375902593868274, + "learning_rate": 6.843370170357932e-06, + "loss": 0.555, + "step": 2766 + }, + { + "epoch": 0.88544, + "grad_norm": 0.3376862125987125, + "learning_rate": 6.80573649697297e-06, + "loss": 0.5674, + "step": 2767 + }, + { + "epoch": 0.88576, + "grad_norm": 0.861545912519331, + "learning_rate": 6.7682029427058365e-06, + "loss": 0.5946, + "step": 2768 + }, + { + "epoch": 0.88608, + "grad_norm": 0.34865465601876977, + "learning_rate": 6.7307695478790345e-06, + "loss": 0.546, + "step": 2769 + }, + { + "epoch": 0.8864, + "grad_norm": 0.34870578512428874, + "learning_rate": 6.693436352707494e-06, + "loss": 0.5892, + "step": 2770 + }, + { + "epoch": 0.88672, + "grad_norm": 0.35889278838134153, + "learning_rate": 6.656203397298433e-06, + "loss": 0.5831, + "step": 2771 + }, + { + "epoch": 0.88704, + "grad_norm": 0.33848557579498934, + "learning_rate": 6.619070721651477e-06, + "loss": 0.5757, + "step": 2772 + }, + { + "epoch": 0.88736, + "grad_norm": 0.4172341171161688, + "learning_rate": 6.5820383656584165e-06, + "loss": 0.5692, + "step": 2773 + }, + { + "epoch": 0.88768, + "grad_norm": 0.3879464582293012, + "learning_rate": 6.545106369103349e-06, + "loss": 0.5605, + "step": 2774 + }, + { + "epoch": 0.888, + "grad_norm": 0.350582038472011, + "learning_rate": 6.5082747716625255e-06, + "loss": 0.558, + "step": 2775 + }, + { + "epoch": 0.88832, + "grad_norm": 0.3329044895418247, + "learning_rate": 6.471543612904319e-06, + "loss": 0.576, + "step": 2776 + }, + { + "epoch": 0.88864, + "grad_norm": 0.3491995049014739, + "learning_rate": 6.434912932289228e-06, + "loss": 0.5561, + "step": 2777 + }, + { + "epoch": 0.88896, + "grad_norm": 0.3300639753565361, + "learning_rate": 6.398382769169786e-06, + "loss": 0.5523, + "step": 2778 + }, + { + "epoch": 0.88928, + "grad_norm": 0.3490842110958772, + "learning_rate": 6.3619531627905904e-06, + "loss": 0.5391, + "step": 2779 + }, + { + "epoch": 0.8896, + "grad_norm": 0.3439085395427416, + "learning_rate": 6.325624152288123e-06, + "loss": 0.5892, + "step": 2780 + }, + { + "epoch": 0.88992, + "grad_norm": 0.34509556617704645, + "learning_rate": 6.289395776690854e-06, + "loss": 0.6108, + "step": 2781 + }, + { + "epoch": 0.89024, + "grad_norm": 0.33386783087969407, + "learning_rate": 6.253268074919138e-06, + "loss": 0.5524, + "step": 2782 + }, + { + "epoch": 0.89056, + "grad_norm": 0.37653527935550135, + "learning_rate": 6.217241085785186e-06, + "loss": 0.5884, + "step": 2783 + }, + { + "epoch": 0.89088, + "grad_norm": 0.32243983124289327, + "learning_rate": 6.181314847992959e-06, + "loss": 0.5435, + "step": 2784 + }, + { + "epoch": 0.8912, + "grad_norm": 0.3421011995759712, + "learning_rate": 6.145489400138238e-06, + "loss": 0.5316, + "step": 2785 + }, + { + "epoch": 0.89152, + "grad_norm": 0.37015341445689787, + "learning_rate": 6.109764780708482e-06, + "loss": 0.5693, + "step": 2786 + }, + { + "epoch": 0.89184, + "grad_norm": 0.34810680782887027, + "learning_rate": 6.074141028082858e-06, + "loss": 0.5555, + "step": 2787 + }, + { + "epoch": 0.89216, + "grad_norm": 0.36949507551060756, + "learning_rate": 6.038618180532174e-06, + "loss": 0.571, + "step": 2788 + }, + { + "epoch": 0.89248, + "grad_norm": 0.3991748582992206, + "learning_rate": 6.003196276218814e-06, + "loss": 0.5827, + "step": 2789 + }, + { + "epoch": 0.8928, + "grad_norm": 0.3215523687754182, + "learning_rate": 5.9678753531967165e-06, + "loss": 0.5765, + "step": 2790 + }, + { + "epoch": 0.89312, + "grad_norm": 0.32743649458301116, + "learning_rate": 5.932655449411384e-06, + "loss": 0.5618, + "step": 2791 + }, + { + "epoch": 0.89344, + "grad_norm": 0.35078959112555913, + "learning_rate": 5.8975366026997046e-06, + "loss": 0.6208, + "step": 2792 + }, + { + "epoch": 0.89376, + "grad_norm": 0.34974224167142387, + "learning_rate": 5.862518850790099e-06, + "loss": 0.5734, + "step": 2793 + }, + { + "epoch": 0.89408, + "grad_norm": 0.33248483049491073, + "learning_rate": 5.8276022313022875e-06, + "loss": 0.5415, + "step": 2794 + }, + { + "epoch": 0.8944, + "grad_norm": 0.33611933596137067, + "learning_rate": 5.792786781747428e-06, + "loss": 0.5702, + "step": 2795 + }, + { + "epoch": 0.89472, + "grad_norm": 0.3501405426469906, + "learning_rate": 5.7580725395279366e-06, + "loss": 0.5837, + "step": 2796 + }, + { + "epoch": 0.89504, + "grad_norm": 0.3528040892370007, + "learning_rate": 5.723459541937515e-06, + "loss": 0.6133, + "step": 2797 + }, + { + "epoch": 0.89536, + "grad_norm": 0.354212207145996, + "learning_rate": 5.688947826161117e-06, + "loss": 0.6047, + "step": 2798 + }, + { + "epoch": 0.89568, + "grad_norm": 0.3704975451646289, + "learning_rate": 5.654537429274842e-06, + "loss": 0.6079, + "step": 2799 + }, + { + "epoch": 0.896, + "grad_norm": 0.3448700631938704, + "learning_rate": 5.620228388245996e-06, + "loss": 0.607, + "step": 2800 + }, + { + "epoch": 0.89632, + "grad_norm": 0.33290262588162295, + "learning_rate": 5.586020739932973e-06, + "loss": 0.5606, + "step": 2801 + }, + { + "epoch": 0.89664, + "grad_norm": 0.3398627304422116, + "learning_rate": 5.5519145210852105e-06, + "loss": 0.5032, + "step": 2802 + }, + { + "epoch": 0.89696, + "grad_norm": 0.33170687758390277, + "learning_rate": 5.517909768343254e-06, + "loss": 0.5335, + "step": 2803 + }, + { + "epoch": 0.89728, + "grad_norm": 0.37169788403853293, + "learning_rate": 5.4840065182385716e-06, + "loss": 0.5569, + "step": 2804 + }, + { + "epoch": 0.8976, + "grad_norm": 0.34250040944622306, + "learning_rate": 5.450204807193626e-06, + "loss": 0.5897, + "step": 2805 + }, + { + "epoch": 0.89792, + "grad_norm": 0.3676708902237842, + "learning_rate": 5.416504671521772e-06, + "loss": 0.6352, + "step": 2806 + }, + { + "epoch": 0.89824, + "grad_norm": 0.36384276177088926, + "learning_rate": 5.382906147427269e-06, + "loss": 0.6064, + "step": 2807 + }, + { + "epoch": 0.89856, + "grad_norm": 0.34099146203541597, + "learning_rate": 5.349409271005168e-06, + "loss": 0.5474, + "step": 2808 + }, + { + "epoch": 0.89888, + "grad_norm": 0.3345924397023783, + "learning_rate": 5.316014078241393e-06, + "loss": 0.583, + "step": 2809 + }, + { + "epoch": 0.8992, + "grad_norm": 0.3335290728397917, + "learning_rate": 5.2827206050125585e-06, + "loss": 0.5806, + "step": 2810 + }, + { + "epoch": 0.89952, + "grad_norm": 0.3602870780474008, + "learning_rate": 5.249528887086052e-06, + "loss": 0.5963, + "step": 2811 + }, + { + "epoch": 0.89984, + "grad_norm": 0.33506594255551925, + "learning_rate": 5.216438960119885e-06, + "loss": 0.5606, + "step": 2812 + }, + { + "epoch": 0.90016, + "grad_norm": 0.34868184719748685, + "learning_rate": 5.183450859662764e-06, + "loss": 0.5765, + "step": 2813 + }, + { + "epoch": 0.90048, + "grad_norm": 0.3550045997969656, + "learning_rate": 5.150564621154019e-06, + "loss": 0.5898, + "step": 2814 + }, + { + "epoch": 0.9008, + "grad_norm": 0.33614302670948576, + "learning_rate": 5.11778027992349e-06, + "loss": 0.5363, + "step": 2815 + }, + { + "epoch": 0.90112, + "grad_norm": 0.3614194175997036, + "learning_rate": 5.085097871191591e-06, + "loss": 0.5268, + "step": 2816 + }, + { + "epoch": 0.90144, + "grad_norm": 0.3814564448995357, + "learning_rate": 5.052517430069204e-06, + "loss": 0.6076, + "step": 2817 + }, + { + "epoch": 0.90176, + "grad_norm": 0.36852320553559564, + "learning_rate": 5.020038991557674e-06, + "loss": 0.5706, + "step": 2818 + }, + { + "epoch": 0.90208, + "grad_norm": 0.34752850994780604, + "learning_rate": 4.9876625905488025e-06, + "loss": 0.5648, + "step": 2819 + }, + { + "epoch": 0.9024, + "grad_norm": 0.36553003238350545, + "learning_rate": 4.955388261824712e-06, + "loss": 0.5857, + "step": 2820 + }, + { + "epoch": 0.90272, + "grad_norm": 0.34547469445158907, + "learning_rate": 4.923216040057887e-06, + "loss": 0.5966, + "step": 2821 + }, + { + "epoch": 0.90304, + "grad_norm": 0.3577917172679757, + "learning_rate": 4.89114595981115e-06, + "loss": 0.5645, + "step": 2822 + }, + { + "epoch": 0.90336, + "grad_norm": 0.3451850821887828, + "learning_rate": 4.859178055537539e-06, + "loss": 0.5389, + "step": 2823 + }, + { + "epoch": 0.90368, + "grad_norm": 0.34628777076126466, + "learning_rate": 4.8273123615803825e-06, + "loss": 0.5482, + "step": 2824 + }, + { + "epoch": 0.904, + "grad_norm": 0.34300289902896913, + "learning_rate": 4.7955489121731535e-06, + "loss": 0.5727, + "step": 2825 + }, + { + "epoch": 0.90432, + "grad_norm": 0.3678335467865954, + "learning_rate": 4.763887741439499e-06, + "loss": 0.6289, + "step": 2826 + }, + { + "epoch": 0.90464, + "grad_norm": 0.3377063031338014, + "learning_rate": 4.732328883393211e-06, + "loss": 0.582, + "step": 2827 + }, + { + "epoch": 0.90496, + "grad_norm": 0.3442615691288297, + "learning_rate": 4.700872371938125e-06, + "loss": 0.5621, + "step": 2828 + }, + { + "epoch": 0.90528, + "grad_norm": 0.3340253158301437, + "learning_rate": 4.669518240868176e-06, + "loss": 0.5284, + "step": 2829 + }, + { + "epoch": 0.9056, + "grad_norm": 0.33567183285964836, + "learning_rate": 4.6382665238672405e-06, + "loss": 0.5576, + "step": 2830 + }, + { + "epoch": 0.90592, + "grad_norm": 0.34379715853428794, + "learning_rate": 4.607117254509241e-06, + "loss": 0.581, + "step": 2831 + }, + { + "epoch": 0.90624, + "grad_norm": 0.32591788634552477, + "learning_rate": 4.57607046625802e-06, + "loss": 0.5321, + "step": 2832 + }, + { + "epoch": 0.90656, + "grad_norm": 0.38041021998211644, + "learning_rate": 4.545126192467308e-06, + "loss": 0.6333, + "step": 2833 + }, + { + "epoch": 0.90688, + "grad_norm": 0.33676411330147826, + "learning_rate": 4.514284466380692e-06, + "loss": 0.5579, + "step": 2834 + }, + { + "epoch": 0.9072, + "grad_norm": 0.3343253948899122, + "learning_rate": 4.483545321131632e-06, + "loss": 0.5703, + "step": 2835 + }, + { + "epoch": 0.90752, + "grad_norm": 0.3656624643540776, + "learning_rate": 4.452908789743337e-06, + "loss": 0.5768, + "step": 2836 + }, + { + "epoch": 0.90784, + "grad_norm": 0.32530824513401746, + "learning_rate": 4.422374905128846e-06, + "loss": 0.5652, + "step": 2837 + }, + { + "epoch": 0.90816, + "grad_norm": 0.36160556475942635, + "learning_rate": 4.391943700090839e-06, + "loss": 0.5405, + "step": 2838 + }, + { + "epoch": 0.90848, + "grad_norm": 0.3410437113763867, + "learning_rate": 4.361615207321756e-06, + "loss": 0.5786, + "step": 2839 + }, + { + "epoch": 0.9088, + "grad_norm": 0.367931721500388, + "learning_rate": 4.331389459403668e-06, + "loss": 0.5827, + "step": 2840 + }, + { + "epoch": 0.90912, + "grad_norm": 0.3716330792114727, + "learning_rate": 4.3012664888082424e-06, + "loss": 0.6251, + "step": 2841 + }, + { + "epoch": 0.90944, + "grad_norm": 0.3323411659132477, + "learning_rate": 4.271246327896783e-06, + "loss": 0.5965, + "step": 2842 + }, + { + "epoch": 0.90976, + "grad_norm": 0.3369799588012157, + "learning_rate": 4.241329008920081e-06, + "loss": 0.5185, + "step": 2843 + }, + { + "epoch": 0.91008, + "grad_norm": 0.3376043256633684, + "learning_rate": 4.211514564018515e-06, + "loss": 0.5655, + "step": 2844 + }, + { + "epoch": 0.9104, + "grad_norm": 0.33634762663920625, + "learning_rate": 4.181803025221898e-06, + "loss": 0.5191, + "step": 2845 + }, + { + "epoch": 0.91072, + "grad_norm": 0.3458550837125521, + "learning_rate": 4.152194424449485e-06, + "loss": 0.5759, + "step": 2846 + }, + { + "epoch": 0.91104, + "grad_norm": 0.3768007158563421, + "learning_rate": 4.122688793509988e-06, + "loss": 0.5652, + "step": 2847 + }, + { + "epoch": 0.91136, + "grad_norm": 0.3585454547843405, + "learning_rate": 4.0932861641014394e-06, + "loss": 0.5752, + "step": 2848 + }, + { + "epoch": 0.91168, + "grad_norm": 0.3341302029503999, + "learning_rate": 4.063986567811273e-06, + "loss": 0.5889, + "step": 2849 + }, + { + "epoch": 0.912, + "grad_norm": 0.34103806009772053, + "learning_rate": 4.034790036116209e-06, + "loss": 0.5406, + "step": 2850 + }, + { + "epoch": 0.91232, + "grad_norm": 0.34257989757405, + "learning_rate": 4.005696600382236e-06, + "loss": 0.5369, + "step": 2851 + }, + { + "epoch": 0.91264, + "grad_norm": 0.3246156810822316, + "learning_rate": 3.976706291864596e-06, + "loss": 0.5641, + "step": 2852 + }, + { + "epoch": 0.91296, + "grad_norm": 0.34452475901077434, + "learning_rate": 3.947819141707742e-06, + "loss": 0.552, + "step": 2853 + }, + { + "epoch": 0.91328, + "grad_norm": 0.32860189656766997, + "learning_rate": 3.919035180945297e-06, + "loss": 0.563, + "step": 2854 + }, + { + "epoch": 0.9136, + "grad_norm": 0.35123397263055917, + "learning_rate": 3.890354440500032e-06, + "loss": 0.582, + "step": 2855 + }, + { + "epoch": 0.91392, + "grad_norm": 0.3599111154000626, + "learning_rate": 3.8617769511838264e-06, + "loss": 0.5754, + "step": 2856 + }, + { + "epoch": 0.91424, + "grad_norm": 0.3723849372923179, + "learning_rate": 3.833302743697631e-06, + "loss": 0.5771, + "step": 2857 + }, + { + "epoch": 0.91456, + "grad_norm": 0.34801736815451534, + "learning_rate": 3.8049318486314657e-06, + "loss": 0.5659, + "step": 2858 + }, + { + "epoch": 0.91488, + "grad_norm": 0.499823699582421, + "learning_rate": 3.776664296464316e-06, + "loss": 0.5526, + "step": 2859 + }, + { + "epoch": 0.9152, + "grad_norm": 0.33305508076608026, + "learning_rate": 3.748500117564191e-06, + "loss": 0.5425, + "step": 2860 + }, + { + "epoch": 0.91552, + "grad_norm": 0.359265150726778, + "learning_rate": 3.7204393421880203e-06, + "loss": 0.5856, + "step": 2861 + }, + { + "epoch": 0.91584, + "grad_norm": 0.3511973531693837, + "learning_rate": 3.692482000481645e-06, + "loss": 0.6103, + "step": 2862 + }, + { + "epoch": 0.91616, + "grad_norm": 0.3485342444756007, + "learning_rate": 3.6646281224798075e-06, + "loss": 0.6073, + "step": 2863 + }, + { + "epoch": 0.91648, + "grad_norm": 0.3331230979882821, + "learning_rate": 3.6368777381060483e-06, + "loss": 0.5628, + "step": 2864 + }, + { + "epoch": 0.9168, + "grad_norm": 0.360453748430889, + "learning_rate": 3.609230877172798e-06, + "loss": 0.519, + "step": 2865 + }, + { + "epoch": 0.91712, + "grad_norm": 0.35306952896812904, + "learning_rate": 3.5816875693812314e-06, + "loss": 0.5667, + "step": 2866 + }, + { + "epoch": 0.91744, + "grad_norm": 0.3479611248165948, + "learning_rate": 3.554247844321257e-06, + "loss": 0.6038, + "step": 2867 + }, + { + "epoch": 0.91776, + "grad_norm": 0.32958588483131146, + "learning_rate": 3.5269117314715495e-06, + "loss": 0.5415, + "step": 2868 + }, + { + "epoch": 0.91808, + "grad_norm": 0.3472610183107072, + "learning_rate": 3.4996792601994287e-06, + "loss": 0.5858, + "step": 2869 + }, + { + "epoch": 0.9184, + "grad_norm": 0.3343199575343456, + "learning_rate": 3.4725504597608816e-06, + "loss": 0.5404, + "step": 2870 + }, + { + "epoch": 0.91872, + "grad_norm": 0.33591339801874, + "learning_rate": 3.445525359300561e-06, + "loss": 0.587, + "step": 2871 + }, + { + "epoch": 0.91904, + "grad_norm": 0.3519781818579308, + "learning_rate": 3.4186039878516653e-06, + "loss": 0.559, + "step": 2872 + }, + { + "epoch": 0.91936, + "grad_norm": 0.3655017032831667, + "learning_rate": 3.3917863743359813e-06, + "loss": 0.5908, + "step": 2873 + }, + { + "epoch": 0.91968, + "grad_norm": 0.33429192412287345, + "learning_rate": 3.365072547563797e-06, + "loss": 0.5728, + "step": 2874 + }, + { + "epoch": 0.92, + "grad_norm": 0.34623141842115757, + "learning_rate": 3.338462536233955e-06, + "loss": 0.5545, + "step": 2875 + }, + { + "epoch": 0.92032, + "grad_norm": 0.3384627725450504, + "learning_rate": 3.311956368933733e-06, + "loss": 0.5382, + "step": 2876 + }, + { + "epoch": 0.92064, + "grad_norm": 0.5896368414027378, + "learning_rate": 3.2855540741388414e-06, + "loss": 0.5648, + "step": 2877 + }, + { + "epoch": 0.92096, + "grad_norm": 0.3486875469918097, + "learning_rate": 3.2592556802134244e-06, + "loss": 0.5481, + "step": 2878 + }, + { + "epoch": 0.92128, + "grad_norm": 0.34310645471152623, + "learning_rate": 3.2330612154099936e-06, + "loss": 0.568, + "step": 2879 + }, + { + "epoch": 0.9216, + "grad_norm": 0.35614170986681676, + "learning_rate": 3.2069707078694057e-06, + "loss": 0.5888, + "step": 2880 + }, + { + "epoch": 0.92192, + "grad_norm": 0.3167398265351191, + "learning_rate": 3.180984185620839e-06, + "loss": 0.5221, + "step": 2881 + }, + { + "epoch": 0.92224, + "grad_norm": 0.3700634777584648, + "learning_rate": 3.155101676581762e-06, + "loss": 0.5251, + "step": 2882 + }, + { + "epoch": 0.92256, + "grad_norm": 0.38675264625911526, + "learning_rate": 3.1293232085578883e-06, + "loss": 0.6143, + "step": 2883 + }, + { + "epoch": 0.92288, + "grad_norm": 0.3400966490316834, + "learning_rate": 3.103648809243187e-06, + "loss": 0.5509, + "step": 2884 + }, + { + "epoch": 0.9232, + "grad_norm": 0.36867499153736294, + "learning_rate": 3.078078506219784e-06, + "loss": 0.5717, + "step": 2885 + }, + { + "epoch": 0.92352, + "grad_norm": 0.3229408702567602, + "learning_rate": 3.0526123269580377e-06, + "loss": 0.5635, + "step": 2886 + }, + { + "epoch": 0.92384, + "grad_norm": 0.39804029753224696, + "learning_rate": 3.027250298816364e-06, + "loss": 0.5803, + "step": 2887 + }, + { + "epoch": 0.92416, + "grad_norm": 0.36141868305260705, + "learning_rate": 3.0019924490413685e-06, + "loss": 0.5929, + "step": 2888 + }, + { + "epoch": 0.92448, + "grad_norm": 0.340395146432603, + "learning_rate": 2.976838804767668e-06, + "loss": 0.5675, + "step": 2889 + }, + { + "epoch": 0.9248, + "grad_norm": 0.33833354131370824, + "learning_rate": 2.9517893930179785e-06, + "loss": 0.5409, + "step": 2890 + }, + { + "epoch": 0.92512, + "grad_norm": 0.34764889418401584, + "learning_rate": 2.9268442407030196e-06, + "loss": 0.5576, + "step": 2891 + }, + { + "epoch": 0.92544, + "grad_norm": 0.34512214030953275, + "learning_rate": 2.9020033746215313e-06, + "loss": 0.6083, + "step": 2892 + }, + { + "epoch": 0.92576, + "grad_norm": 0.34386328514826, + "learning_rate": 2.877266821460145e-06, + "loss": 0.5805, + "step": 2893 + }, + { + "epoch": 0.92608, + "grad_norm": 0.3675681102152316, + "learning_rate": 2.852634607793525e-06, + "loss": 0.5813, + "step": 2894 + }, + { + "epoch": 0.9264, + "grad_norm": 0.3387088572847811, + "learning_rate": 2.8281067600841705e-06, + "loss": 0.5569, + "step": 2895 + }, + { + "epoch": 0.92672, + "grad_norm": 0.3384830103818643, + "learning_rate": 2.8036833046824917e-06, + "loss": 0.533, + "step": 2896 + }, + { + "epoch": 0.92704, + "grad_norm": 0.34017090324653076, + "learning_rate": 2.7793642678267563e-06, + "loss": 0.5812, + "step": 2897 + }, + { + "epoch": 0.92736, + "grad_norm": 0.3375396695080064, + "learning_rate": 2.7551496756430094e-06, + "loss": 0.5948, + "step": 2898 + }, + { + "epoch": 0.92768, + "grad_norm": 0.3714118755082262, + "learning_rate": 2.731039554145165e-06, + "loss": 0.5902, + "step": 2899 + }, + { + "epoch": 0.928, + "grad_norm": 0.36010523570409236, + "learning_rate": 2.7070339292348147e-06, + "loss": 0.5735, + "step": 2900 + }, + { + "epoch": 0.92832, + "grad_norm": 0.3445063157061339, + "learning_rate": 2.6831328267013624e-06, + "loss": 0.5686, + "step": 2901 + }, + { + "epoch": 0.92864, + "grad_norm": 0.33452317572382473, + "learning_rate": 2.659336272221913e-06, + "loss": 0.5453, + "step": 2902 + }, + { + "epoch": 0.92896, + "grad_norm": 0.34418474725527026, + "learning_rate": 2.6356442913612054e-06, + "loss": 0.6022, + "step": 2903 + }, + { + "epoch": 0.92928, + "grad_norm": 0.3417053895305936, + "learning_rate": 2.6120569095716806e-06, + "loss": 0.5401, + "step": 2904 + }, + { + "epoch": 0.9296, + "grad_norm": 0.4323828368821946, + "learning_rate": 2.5885741521933902e-06, + "loss": 0.5795, + "step": 2905 + }, + { + "epoch": 0.92992, + "grad_norm": 0.34626103032463873, + "learning_rate": 2.565196044453988e-06, + "loss": 0.5449, + "step": 2906 + }, + { + "epoch": 0.93024, + "grad_norm": 0.3482572315889939, + "learning_rate": 2.541922611468728e-06, + "loss": 0.5427, + "step": 2907 + }, + { + "epoch": 0.93056, + "grad_norm": 0.34881581725772226, + "learning_rate": 2.518753878240365e-06, + "loss": 0.5575, + "step": 2908 + }, + { + "epoch": 0.93088, + "grad_norm": 0.3634452371924472, + "learning_rate": 2.4956898696592124e-06, + "loss": 0.5936, + "step": 2909 + }, + { + "epoch": 0.9312, + "grad_norm": 0.32644715446893524, + "learning_rate": 2.47273061050306e-06, + "loss": 0.5412, + "step": 2910 + }, + { + "epoch": 0.93152, + "grad_norm": 0.3472435572486151, + "learning_rate": 2.449876125437156e-06, + "loss": 0.5803, + "step": 2911 + }, + { + "epoch": 0.93184, + "grad_norm": 0.33880253692206175, + "learning_rate": 2.4271264390142267e-06, + "loss": 0.6003, + "step": 2912 + }, + { + "epoch": 0.93216, + "grad_norm": 0.371720063380355, + "learning_rate": 2.4044815756743553e-06, + "loss": 0.6396, + "step": 2913 + }, + { + "epoch": 0.93248, + "grad_norm": 0.32725071098545716, + "learning_rate": 2.3819415597450825e-06, + "loss": 0.5768, + "step": 2914 + }, + { + "epoch": 0.9328, + "grad_norm": 0.3479775228639434, + "learning_rate": 2.3595064154412374e-06, + "loss": 0.583, + "step": 2915 + }, + { + "epoch": 0.93312, + "grad_norm": 0.37140340929061805, + "learning_rate": 2.3371761668650404e-06, + "loss": 0.5962, + "step": 2916 + }, + { + "epoch": 0.93344, + "grad_norm": 0.338309123875935, + "learning_rate": 2.3149508380060025e-06, + "loss": 0.5742, + "step": 2917 + }, + { + "epoch": 0.93376, + "grad_norm": 0.4000009841628731, + "learning_rate": 2.2928304527409127e-06, + "loss": 0.5643, + "step": 2918 + }, + { + "epoch": 0.93408, + "grad_norm": 0.34963884051711597, + "learning_rate": 2.2708150348338176e-06, + "loss": 0.5929, + "step": 2919 + }, + { + "epoch": 0.9344, + "grad_norm": 0.34593758884301407, + "learning_rate": 2.2489046079360198e-06, + "loss": 0.5518, + "step": 2920 + }, + { + "epoch": 0.93472, + "grad_norm": 0.36935830768837036, + "learning_rate": 2.227099195586002e-06, + "loss": 0.6002, + "step": 2921 + }, + { + "epoch": 0.93504, + "grad_norm": 0.3593240870270949, + "learning_rate": 2.205398821209459e-06, + "loss": 0.5714, + "step": 2922 + }, + { + "epoch": 0.93536, + "grad_norm": 0.355966251151385, + "learning_rate": 2.1838035081191866e-06, + "loss": 0.6101, + "step": 2923 + }, + { + "epoch": 0.93568, + "grad_norm": 0.343728318953142, + "learning_rate": 2.1623132795151824e-06, + "loss": 0.5109, + "step": 2924 + }, + { + "epoch": 0.936, + "grad_norm": 0.34648754539327065, + "learning_rate": 2.140928158484523e-06, + "loss": 0.5327, + "step": 2925 + }, + { + "epoch": 0.93632, + "grad_norm": 0.3749897065975177, + "learning_rate": 2.1196481680013314e-06, + "loss": 0.5875, + "step": 2926 + }, + { + "epoch": 0.93664, + "grad_norm": 0.3841768188244641, + "learning_rate": 2.0984733309268424e-06, + "loss": 0.5799, + "step": 2927 + }, + { + "epoch": 0.93696, + "grad_norm": 0.352868203688293, + "learning_rate": 2.0774036700093036e-06, + "loss": 0.6127, + "step": 2928 + }, + { + "epoch": 0.93728, + "grad_norm": 0.36784058805691916, + "learning_rate": 2.0564392078839644e-06, + "loss": 0.6259, + "step": 2929 + }, + { + "epoch": 0.9376, + "grad_norm": 0.35661481974517606, + "learning_rate": 2.0355799670730645e-06, + "loss": 0.5455, + "step": 2930 + }, + { + "epoch": 0.93792, + "grad_norm": 0.34117875897031086, + "learning_rate": 2.0148259699857895e-06, + "loss": 0.5877, + "step": 2931 + }, + { + "epoch": 0.93824, + "grad_norm": 0.33847074964350893, + "learning_rate": 1.9941772389182935e-06, + "loss": 0.5019, + "step": 2932 + }, + { + "epoch": 0.93856, + "grad_norm": 0.3325235779600918, + "learning_rate": 1.97363379605362e-06, + "loss": 0.5952, + "step": 2933 + }, + { + "epoch": 0.93888, + "grad_norm": 0.3437097787712463, + "learning_rate": 1.9531956634617044e-06, + "loss": 0.5719, + "step": 2934 + }, + { + "epoch": 0.9392, + "grad_norm": 0.3338308091140957, + "learning_rate": 1.9328628630993386e-06, + "loss": 0.5753, + "step": 2935 + }, + { + "epoch": 0.93952, + "grad_norm": 0.3357846986464912, + "learning_rate": 1.91263541681016e-06, + "loss": 0.5668, + "step": 2936 + }, + { + "epoch": 0.93984, + "grad_norm": 0.36589773617524324, + "learning_rate": 1.8925133463246425e-06, + "loss": 0.5296, + "step": 2937 + }, + { + "epoch": 0.94016, + "grad_norm": 0.34741287859319475, + "learning_rate": 1.872496673260038e-06, + "loss": 0.5665, + "step": 2938 + }, + { + "epoch": 0.94048, + "grad_norm": 0.36330777305164874, + "learning_rate": 1.8525854191203562e-06, + "loss": 0.6009, + "step": 2939 + }, + { + "epoch": 0.9408, + "grad_norm": 0.34104369081316266, + "learning_rate": 1.8327796052963752e-06, + "loss": 0.5838, + "step": 2940 + }, + { + "epoch": 0.94112, + "grad_norm": 0.3558130290045266, + "learning_rate": 1.813079253065597e-06, + "loss": 0.5918, + "step": 2941 + }, + { + "epoch": 0.94144, + "grad_norm": 0.34951431053086685, + "learning_rate": 1.7934843835922144e-06, + "loss": 0.5879, + "step": 2942 + }, + { + "epoch": 0.94176, + "grad_norm": 0.33169653160506357, + "learning_rate": 1.7739950179271103e-06, + "loss": 0.5474, + "step": 2943 + }, + { + "epoch": 0.94208, + "grad_norm": 0.3340890778759615, + "learning_rate": 1.7546111770078144e-06, + "loss": 0.5509, + "step": 2944 + }, + { + "epoch": 0.9424, + "grad_norm": 0.3294918454159986, + "learning_rate": 1.7353328816584913e-06, + "loss": 0.5574, + "step": 2945 + }, + { + "epoch": 0.94272, + "grad_norm": 0.3378355069960122, + "learning_rate": 1.7161601525899407e-06, + "loss": 0.5605, + "step": 2946 + }, + { + "epoch": 0.94304, + "grad_norm": 0.347360388900877, + "learning_rate": 1.6970930103994974e-06, + "loss": 0.6079, + "step": 2947 + }, + { + "epoch": 0.94336, + "grad_norm": 0.5379559421601664, + "learning_rate": 1.6781314755711319e-06, + "loss": 0.5738, + "step": 2948 + }, + { + "epoch": 0.94368, + "grad_norm": 0.35090601722345, + "learning_rate": 1.6592755684753047e-06, + "loss": 0.5876, + "step": 2949 + }, + { + "epoch": 0.944, + "grad_norm": 0.35234845654112756, + "learning_rate": 1.6405253093690343e-06, + "loss": 0.5872, + "step": 2950 + }, + { + "epoch": 0.94432, + "grad_norm": 0.3680415823701676, + "learning_rate": 1.6218807183958295e-06, + "loss": 0.5551, + "step": 2951 + }, + { + "epoch": 0.94464, + "grad_norm": 0.3314338753479147, + "learning_rate": 1.6033418155856794e-06, + "loss": 0.5415, + "step": 2952 + }, + { + "epoch": 0.94496, + "grad_norm": 0.38456158513215843, + "learning_rate": 1.584908620855019e-06, + "loss": 0.5877, + "step": 2953 + }, + { + "epoch": 0.94528, + "grad_norm": 0.35445987277445967, + "learning_rate": 1.5665811540067409e-06, + "loss": 0.5378, + "step": 2954 + }, + { + "epoch": 0.9456, + "grad_norm": 0.34834016754937597, + "learning_rate": 1.54835943473014e-06, + "loss": 0.566, + "step": 2955 + }, + { + "epoch": 0.94592, + "grad_norm": 0.35124995819587745, + "learning_rate": 1.5302434826009349e-06, + "loss": 0.5474, + "step": 2956 + }, + { + "epoch": 0.94624, + "grad_norm": 0.34220984783938785, + "learning_rate": 1.5122333170811576e-06, + "loss": 0.5474, + "step": 2957 + }, + { + "epoch": 0.94656, + "grad_norm": 0.34712881756806974, + "learning_rate": 1.4943289575192421e-06, + "loss": 0.5703, + "step": 2958 + }, + { + "epoch": 0.94688, + "grad_norm": 0.3502499119432055, + "learning_rate": 1.4765304231499578e-06, + "loss": 0.5774, + "step": 2959 + }, + { + "epoch": 0.9472, + "grad_norm": 0.3513076058690715, + "learning_rate": 1.4588377330943536e-06, + "loss": 0.5874, + "step": 2960 + }, + { + "epoch": 0.94752, + "grad_norm": 0.3537578390048591, + "learning_rate": 1.4412509063598034e-06, + "loss": 0.6439, + "step": 2961 + }, + { + "epoch": 0.94784, + "grad_norm": 0.3665668483613586, + "learning_rate": 1.4237699618399048e-06, + "loss": 0.5933, + "step": 2962 + }, + { + "epoch": 0.94816, + "grad_norm": 0.35312769657275656, + "learning_rate": 1.4063949183145463e-06, + "loss": 0.5918, + "step": 2963 + }, + { + "epoch": 0.94848, + "grad_norm": 0.3336561524014225, + "learning_rate": 1.3891257944498416e-06, + "loss": 0.5959, + "step": 2964 + }, + { + "epoch": 0.9488, + "grad_norm": 0.3608145588685783, + "learning_rate": 1.3719626087981052e-06, + "loss": 0.5368, + "step": 2965 + }, + { + "epoch": 0.94912, + "grad_norm": 0.3730366746726397, + "learning_rate": 1.354905379797844e-06, + "loss": 0.5382, + "step": 2966 + }, + { + "epoch": 0.94944, + "grad_norm": 0.32338501285008003, + "learning_rate": 1.3379541257737217e-06, + "loss": 0.5561, + "step": 2967 + }, + { + "epoch": 0.94976, + "grad_norm": 0.3526047077386501, + "learning_rate": 1.321108864936571e-06, + "loss": 0.5963, + "step": 2968 + }, + { + "epoch": 0.95008, + "grad_norm": 0.3960926123725681, + "learning_rate": 1.3043696153833717e-06, + "loss": 0.6461, + "step": 2969 + }, + { + "epoch": 0.9504, + "grad_norm": 0.3421126169413257, + "learning_rate": 1.287736395097161e-06, + "loss": 0.5622, + "step": 2970 + }, + { + "epoch": 0.95072, + "grad_norm": 0.3463639607176593, + "learning_rate": 1.2712092219471227e-06, + "loss": 0.5815, + "step": 2971 + }, + { + "epoch": 0.95104, + "grad_norm": 0.36283792504186074, + "learning_rate": 1.2547881136884654e-06, + "loss": 0.5378, + "step": 2972 + }, + { + "epoch": 0.95136, + "grad_norm": 0.3367440416990651, + "learning_rate": 1.2384730879625106e-06, + "loss": 0.5747, + "step": 2973 + }, + { + "epoch": 0.95168, + "grad_norm": 0.3477123428091511, + "learning_rate": 1.2222641622965604e-06, + "loss": 0.5391, + "step": 2974 + }, + { + "epoch": 0.952, + "grad_norm": 0.34504475093450154, + "learning_rate": 1.206161354103963e-06, + "loss": 0.581, + "step": 2975 + }, + { + "epoch": 0.95232, + "grad_norm": 0.3653522597405551, + "learning_rate": 1.1901646806840471e-06, + "loss": 0.5977, + "step": 2976 + }, + { + "epoch": 0.95264, + "grad_norm": 0.3618678438617388, + "learning_rate": 1.174274159222133e-06, + "loss": 0.6008, + "step": 2977 + }, + { + "epoch": 0.95296, + "grad_norm": 0.3596330442437025, + "learning_rate": 1.1584898067894867e-06, + "loss": 0.6252, + "step": 2978 + }, + { + "epoch": 0.95328, + "grad_norm": 0.3320449626840452, + "learning_rate": 1.1428116403433554e-06, + "loss": 0.5754, + "step": 2979 + }, + { + "epoch": 0.9536, + "grad_norm": 0.33215490910901957, + "learning_rate": 1.1272396767268433e-06, + "loss": 0.5494, + "step": 2980 + }, + { + "epoch": 0.95392, + "grad_norm": 0.34818280951106906, + "learning_rate": 1.1117739326690247e-06, + "loss": 0.5876, + "step": 2981 + }, + { + "epoch": 0.95424, + "grad_norm": 0.347376607901708, + "learning_rate": 1.0964144247848197e-06, + "loss": 0.5342, + "step": 2982 + }, + { + "epoch": 0.95456, + "grad_norm": 0.36345764361124655, + "learning_rate": 1.0811611695750513e-06, + "loss": 0.5983, + "step": 2983 + }, + { + "epoch": 0.95488, + "grad_norm": 0.35966788905303054, + "learning_rate": 1.0660141834263449e-06, + "loss": 0.5971, + "step": 2984 + }, + { + "epoch": 0.9552, + "grad_norm": 0.32790846520908623, + "learning_rate": 1.0509734826112394e-06, + "loss": 0.5089, + "step": 2985 + }, + { + "epoch": 0.95552, + "grad_norm": 0.36823188901468407, + "learning_rate": 1.0360390832879985e-06, + "loss": 0.5776, + "step": 2986 + }, + { + "epoch": 0.95584, + "grad_norm": 0.33943531308732744, + "learning_rate": 1.0212110015007547e-06, + "loss": 0.5786, + "step": 2987 + }, + { + "epoch": 0.95616, + "grad_norm": 0.34875097707546227, + "learning_rate": 1.006489253179388e-06, + "loss": 0.5677, + "step": 2988 + }, + { + "epoch": 0.95648, + "grad_norm": 0.3455504822690956, + "learning_rate": 9.918738541395578e-07, + "loss": 0.5526, + "step": 2989 + }, + { + "epoch": 0.9568, + "grad_norm": 0.4590995078853539, + "learning_rate": 9.773648200826823e-07, + "loss": 0.5361, + "step": 2990 + }, + { + "epoch": 0.95712, + "grad_norm": 0.3287794338622283, + "learning_rate": 9.62962166595882e-07, + "loss": 0.6143, + "step": 2991 + }, + { + "epoch": 0.95744, + "grad_norm": 0.33925374881083975, + "learning_rate": 9.486659091520244e-07, + "loss": 0.5597, + "step": 2992 + }, + { + "epoch": 0.95776, + "grad_norm": 0.43241540955994434, + "learning_rate": 9.344760631096239e-07, + "loss": 0.5561, + "step": 2993 + }, + { + "epoch": 0.95808, + "grad_norm": 0.3891603979596362, + "learning_rate": 9.203926437129528e-07, + "loss": 0.5409, + "step": 2994 + }, + { + "epoch": 0.9584, + "grad_norm": 0.38420498082309823, + "learning_rate": 9.064156660918865e-07, + "loss": 0.6308, + "step": 2995 + }, + { + "epoch": 0.95872, + "grad_norm": 0.36953905631438, + "learning_rate": 8.925451452619693e-07, + "loss": 0.6094, + "step": 2996 + }, + { + "epoch": 0.95904, + "grad_norm": 0.3620247581257889, + "learning_rate": 8.787810961243814e-07, + "loss": 0.6002, + "step": 2997 + }, + { + "epoch": 0.95936, + "grad_norm": 0.33706859882714446, + "learning_rate": 8.651235334659169e-07, + "loss": 0.5974, + "step": 2998 + }, + { + "epoch": 0.95968, + "grad_norm": 0.34774307956372846, + "learning_rate": 8.515724719589835e-07, + "loss": 0.5589, + "step": 2999 + }, + { + "epoch": 0.96, + "grad_norm": 0.334074929758646, + "learning_rate": 8.381279261615471e-07, + "loss": 0.5466, + "step": 3000 + }, + { + "epoch": 0.96032, + "grad_norm": 0.3610137435702167, + "learning_rate": 8.247899105171652e-07, + "loss": 0.5521, + "step": 3001 + }, + { + "epoch": 0.96064, + "grad_norm": 0.36058569173337224, + "learning_rate": 8.115584393549425e-07, + "loss": 0.5818, + "step": 3002 + }, + { + "epoch": 0.96096, + "grad_norm": 0.35159953169862573, + "learning_rate": 7.984335268895193e-07, + "loss": 0.5708, + "step": 3003 + }, + { + "epoch": 0.96128, + "grad_norm": 0.3423249349751101, + "learning_rate": 7.854151872210614e-07, + "loss": 0.6078, + "step": 3004 + }, + { + "epoch": 0.9616, + "grad_norm": 0.3383356696682219, + "learning_rate": 7.725034343352478e-07, + "loss": 0.6123, + "step": 3005 + }, + { + "epoch": 0.96192, + "grad_norm": 0.3323887843584124, + "learning_rate": 7.596982821032494e-07, + "loss": 0.5518, + "step": 3006 + }, + { + "epoch": 0.96224, + "grad_norm": 0.34685414527044406, + "learning_rate": 7.469997442816957e-07, + "loss": 0.6, + "step": 3007 + }, + { + "epoch": 0.96256, + "grad_norm": 0.3376624771473842, + "learning_rate": 7.344078345127292e-07, + "loss": 0.5722, + "step": 3008 + }, + { + "epoch": 0.96288, + "grad_norm": 0.3439905129336635, + "learning_rate": 7.219225663238738e-07, + "loss": 0.5529, + "step": 3009 + }, + { + "epoch": 0.9632, + "grad_norm": 0.3399654458516724, + "learning_rate": 7.095439531281556e-07, + "loss": 0.5803, + "step": 3010 + }, + { + "epoch": 0.96352, + "grad_norm": 0.33972260289665, + "learning_rate": 6.972720082239481e-07, + "loss": 0.5828, + "step": 3011 + }, + { + "epoch": 0.96384, + "grad_norm": 0.32789095864566037, + "learning_rate": 6.851067447951054e-07, + "loss": 0.5288, + "step": 3012 + }, + { + "epoch": 0.96416, + "grad_norm": 0.32753837536877084, + "learning_rate": 6.730481759108287e-07, + "loss": 0.552, + "step": 3013 + }, + { + "epoch": 0.96448, + "grad_norm": 0.3247709604757158, + "learning_rate": 6.610963145256999e-07, + "loss": 0.5195, + "step": 3014 + }, + { + "epoch": 0.9648, + "grad_norm": 0.42050237723938133, + "learning_rate": 6.492511734796702e-07, + "loss": 0.5685, + "step": 3015 + }, + { + "epoch": 0.96512, + "grad_norm": 0.32661375396326997, + "learning_rate": 6.375127654980495e-07, + "loss": 0.5581, + "step": 3016 + }, + { + "epoch": 0.96544, + "grad_norm": 0.38819378922556824, + "learning_rate": 6.258811031914613e-07, + "loss": 0.6018, + "step": 3017 + }, + { + "epoch": 0.96576, + "grad_norm": 0.323920209103656, + "learning_rate": 6.143561990558877e-07, + "loss": 0.5423, + "step": 3018 + }, + { + "epoch": 0.96608, + "grad_norm": 0.3532360252455809, + "learning_rate": 6.029380654725691e-07, + "loss": 0.5779, + "step": 3019 + }, + { + "epoch": 0.9664, + "grad_norm": 0.33802407087018976, + "learning_rate": 5.916267147080934e-07, + "loss": 0.5869, + "step": 3020 + }, + { + "epoch": 0.96672, + "grad_norm": 0.3338179461751494, + "learning_rate": 5.804221589142955e-07, + "loss": 0.5722, + "step": 3021 + }, + { + "epoch": 0.96704, + "grad_norm": 0.3325262151259706, + "learning_rate": 5.693244101282913e-07, + "loss": 0.5459, + "step": 3022 + }, + { + "epoch": 0.96736, + "grad_norm": 0.36110785149431884, + "learning_rate": 5.583334802724661e-07, + "loss": 0.589, + "step": 3023 + }, + { + "epoch": 0.96768, + "grad_norm": 0.34241363205791653, + "learning_rate": 5.474493811544301e-07, + "loss": 0.538, + "step": 3024 + }, + { + "epoch": 0.968, + "grad_norm": 0.3476167965285925, + "learning_rate": 5.366721244670303e-07, + "loss": 0.5898, + "step": 3025 + }, + { + "epoch": 0.96832, + "grad_norm": 0.34763030822207996, + "learning_rate": 5.260017217883273e-07, + "loss": 0.5807, + "step": 3026 + }, + { + "epoch": 0.96864, + "grad_norm": 0.35025748580491395, + "learning_rate": 5.15438184581607e-07, + "loss": 0.5733, + "step": 3027 + }, + { + "epoch": 0.96896, + "grad_norm": 0.32257374496859864, + "learning_rate": 5.04981524195347e-07, + "loss": 0.5356, + "step": 3028 + }, + { + "epoch": 0.96928, + "grad_norm": 0.34391409073006424, + "learning_rate": 4.946317518631616e-07, + "loss": 0.5361, + "step": 3029 + }, + { + "epoch": 0.9696, + "grad_norm": 0.3412407552958236, + "learning_rate": 4.843888787039009e-07, + "loss": 0.5774, + "step": 3030 + }, + { + "epoch": 0.96992, + "grad_norm": 0.35401081087623015, + "learning_rate": 4.742529157215181e-07, + "loss": 0.5344, + "step": 3031 + }, + { + "epoch": 0.97024, + "grad_norm": 0.3424814726842407, + "learning_rate": 4.642238738051474e-07, + "loss": 0.5904, + "step": 3032 + }, + { + "epoch": 0.97056, + "grad_norm": 0.3540963795440962, + "learning_rate": 4.54301763729037e-07, + "loss": 0.5674, + "step": 3033 + }, + { + "epoch": 0.97088, + "grad_norm": 0.35331654650540045, + "learning_rate": 4.4448659615258236e-07, + "loss": 0.5615, + "step": 3034 + }, + { + "epoch": 0.9712, + "grad_norm": 0.3343292615605365, + "learning_rate": 4.3477838162024886e-07, + "loss": 0.5717, + "step": 3035 + }, + { + "epoch": 0.97152, + "grad_norm": 0.3576719247538726, + "learning_rate": 4.251771305616381e-07, + "loss": 0.6138, + "step": 3036 + }, + { + "epoch": 0.97184, + "grad_norm": 0.33256575491615603, + "learning_rate": 4.1568285329143254e-07, + "loss": 0.5943, + "step": 3037 + }, + { + "epoch": 0.97216, + "grad_norm": 0.3574334143465092, + "learning_rate": 4.062955600093732e-07, + "loss": 0.5547, + "step": 3038 + }, + { + "epoch": 0.97248, + "grad_norm": 0.35457194320985824, + "learning_rate": 3.9701526080029304e-07, + "loss": 0.5894, + "step": 3039 + }, + { + "epoch": 0.9728, + "grad_norm": 0.33249794363493285, + "learning_rate": 3.878419656340726e-07, + "loss": 0.5951, + "step": 3040 + }, + { + "epoch": 0.97312, + "grad_norm": 0.3538138734380768, + "learning_rate": 3.7877568436562873e-07, + "loss": 0.574, + "step": 3041 + }, + { + "epoch": 0.97344, + "grad_norm": 0.36469331123021703, + "learning_rate": 3.698164267349036e-07, + "loss": 0.6239, + "step": 3042 + }, + { + "epoch": 0.97376, + "grad_norm": 0.3443538502837614, + "learning_rate": 3.60964202366898e-07, + "loss": 0.5874, + "step": 3043 + }, + { + "epoch": 0.97408, + "grad_norm": 0.3878681209013827, + "learning_rate": 3.522190207716047e-07, + "loss": 0.5756, + "step": 3044 + }, + { + "epoch": 0.9744, + "grad_norm": 0.3472484826445253, + "learning_rate": 3.4358089134400863e-07, + "loss": 0.571, + "step": 3045 + }, + { + "epoch": 0.97472, + "grad_norm": 0.3385241997060716, + "learning_rate": 3.350498233641086e-07, + "loss": 0.5653, + "step": 3046 + }, + { + "epoch": 0.97504, + "grad_norm": 0.34946707608166916, + "learning_rate": 3.266258259968846e-07, + "loss": 0.577, + "step": 3047 + }, + { + "epoch": 0.97536, + "grad_norm": 0.32779632967965444, + "learning_rate": 3.183089082922641e-07, + "loss": 0.5558, + "step": 3048 + }, + { + "epoch": 0.97568, + "grad_norm": 0.38213428342768524, + "learning_rate": 3.1009907918518877e-07, + "loss": 0.5653, + "step": 3049 + }, + { + "epoch": 0.976, + "grad_norm": 0.3277835093048738, + "learning_rate": 3.019963474954923e-07, + "loss": 0.5925, + "step": 3050 + }, + { + "epoch": 0.97632, + "grad_norm": 0.36935821223351273, + "learning_rate": 2.9400072192800054e-07, + "loss": 0.6103, + "step": 3051 + }, + { + "epoch": 0.97664, + "grad_norm": 0.34596844325611154, + "learning_rate": 2.8611221107246455e-07, + "loss": 0.5707, + "step": 3052 + }, + { + "epoch": 0.97696, + "grad_norm": 0.34797252141198665, + "learning_rate": 2.7833082340353867e-07, + "loss": 0.5523, + "step": 3053 + }, + { + "epoch": 0.97728, + "grad_norm": 0.33594949599780155, + "learning_rate": 2.706565672808248e-07, + "loss": 0.5574, + "step": 3054 + }, + { + "epoch": 0.9776, + "grad_norm": 0.3626534604748479, + "learning_rate": 2.630894509488058e-07, + "loss": 0.5947, + "step": 3055 + }, + { + "epoch": 0.97792, + "grad_norm": 0.3393564628713558, + "learning_rate": 2.55629482536901e-07, + "loss": 0.5682, + "step": 3056 + }, + { + "epoch": 0.97824, + "grad_norm": 0.3456214471911362, + "learning_rate": 2.482766700593664e-07, + "loss": 0.5519, + "step": 3057 + }, + { + "epoch": 0.97856, + "grad_norm": 0.41635533270319847, + "learning_rate": 2.4103102141539436e-07, + "loss": 0.5752, + "step": 3058 + }, + { + "epoch": 0.97888, + "grad_norm": 0.3445549388275251, + "learning_rate": 2.3389254438901386e-07, + "loss": 0.5574, + "step": 3059 + }, + { + "epoch": 0.9792, + "grad_norm": 0.33765553083777006, + "learning_rate": 2.2686124664912378e-07, + "loss": 0.6115, + "step": 3060 + }, + { + "epoch": 0.97952, + "grad_norm": 0.3424257364579159, + "learning_rate": 2.199371357495039e-07, + "loss": 0.5889, + "step": 3061 + }, + { + "epoch": 0.97984, + "grad_norm": 0.35676989267410675, + "learning_rate": 2.1312021912875956e-07, + "loss": 0.5679, + "step": 3062 + }, + { + "epoch": 0.98016, + "grad_norm": 0.3605283767238318, + "learning_rate": 2.064105041103326e-07, + "loss": 0.6012, + "step": 3063 + }, + { + "epoch": 0.98048, + "grad_norm": 0.3491057795652527, + "learning_rate": 1.9980799790251247e-07, + "loss": 0.5783, + "step": 3064 + }, + { + "epoch": 0.9808, + "grad_norm": 0.4055076865265235, + "learning_rate": 1.9331270759840313e-07, + "loss": 0.5831, + "step": 3065 + }, + { + "epoch": 0.98112, + "grad_norm": 0.356913503502491, + "learning_rate": 1.8692464017594503e-07, + "loss": 0.6052, + "step": 3066 + }, + { + "epoch": 0.98144, + "grad_norm": 0.36645817945871173, + "learning_rate": 1.8064380249787073e-07, + "loss": 0.5955, + "step": 3067 + }, + { + "epoch": 0.98176, + "grad_norm": 0.33668101057489536, + "learning_rate": 1.744702013117161e-07, + "loss": 0.5877, + "step": 3068 + }, + { + "epoch": 0.98208, + "grad_norm": 0.33554858337596616, + "learning_rate": 1.6840384324980917e-07, + "loss": 0.5149, + "step": 3069 + }, + { + "epoch": 0.9824, + "grad_norm": 0.3517379456116306, + "learning_rate": 1.6244473482929236e-07, + "loss": 0.5725, + "step": 3070 + }, + { + "epoch": 0.98272, + "grad_norm": 0.3542041241590634, + "learning_rate": 1.5659288245204462e-07, + "loss": 0.599, + "step": 3071 + }, + { + "epoch": 0.98304, + "grad_norm": 0.34010650493999023, + "learning_rate": 1.508482924047483e-07, + "loss": 0.545, + "step": 3072 + }, + { + "epoch": 0.98336, + "grad_norm": 0.34643566377072554, + "learning_rate": 1.452109708588667e-07, + "loss": 0.5968, + "step": 3073 + }, + { + "epoch": 0.98368, + "grad_norm": 0.35445576952004687, + "learning_rate": 1.3968092387057763e-07, + "loss": 0.5543, + "step": 3074 + }, + { + "epoch": 0.984, + "grad_norm": 0.35937402681947833, + "learning_rate": 1.342581573808732e-07, + "loss": 0.5788, + "step": 3075 + }, + { + "epoch": 0.98432, + "grad_norm": 0.35937832592448776, + "learning_rate": 1.2894267721543784e-07, + "loss": 0.5749, + "step": 3076 + }, + { + "epoch": 0.98464, + "grad_norm": 0.3362818494664272, + "learning_rate": 1.2373448908473695e-07, + "loss": 0.5743, + "step": 3077 + }, + { + "epoch": 0.98496, + "grad_norm": 0.3718921725009722, + "learning_rate": 1.186335985839393e-07, + "loss": 0.5452, + "step": 3078 + }, + { + "epoch": 0.98528, + "grad_norm": 0.33374310850780264, + "learning_rate": 1.1364001119298362e-07, + "loss": 0.5157, + "step": 3079 + }, + { + "epoch": 0.9856, + "grad_norm": 0.32540387502014295, + "learning_rate": 1.0875373227647866e-07, + "loss": 0.5522, + "step": 3080 + }, + { + "epoch": 0.98592, + "grad_norm": 0.35199700312823773, + "learning_rate": 1.0397476708380315e-07, + "loss": 0.5559, + "step": 3081 + }, + { + "epoch": 0.98624, + "grad_norm": 0.328713406921307, + "learning_rate": 9.930312074902803e-08, + "loss": 0.5604, + "step": 3082 + }, + { + "epoch": 0.98656, + "grad_norm": 0.39051777192109, + "learning_rate": 9.473879829091648e-08, + "loss": 0.5711, + "step": 3083 + }, + { + "epoch": 0.98688, + "grad_norm": 0.3559927552619233, + "learning_rate": 9.02818046129461e-08, + "loss": 0.5886, + "step": 3084 + }, + { + "epoch": 0.9872, + "grad_norm": 0.35322712440811577, + "learning_rate": 8.59321445032979e-08, + "loss": 0.5606, + "step": 3085 + }, + { + "epoch": 0.98752, + "grad_norm": 0.3455829112476088, + "learning_rate": 8.168982263483394e-08, + "loss": 0.5584, + "step": 3086 + }, + { + "epoch": 0.98784, + "grad_norm": 0.3605758720130785, + "learning_rate": 7.755484356509746e-08, + "loss": 0.5659, + "step": 3087 + }, + { + "epoch": 0.98816, + "grad_norm": 0.36406820541827006, + "learning_rate": 7.352721173633504e-08, + "loss": 0.6191, + "step": 3088 + }, + { + "epoch": 0.98848, + "grad_norm": 0.337989920536971, + "learning_rate": 6.960693147542996e-08, + "loss": 0.5625, + "step": 3089 + }, + { + "epoch": 0.9888, + "grad_norm": 0.36405275356887806, + "learning_rate": 6.579400699397998e-08, + "loss": 0.5945, + "step": 3090 + }, + { + "epoch": 0.98912, + "grad_norm": 0.35175253541383983, + "learning_rate": 6.208844238823064e-08, + "loss": 0.534, + "step": 3091 + }, + { + "epoch": 0.98944, + "grad_norm": 0.32270622427841283, + "learning_rate": 5.849024163908645e-08, + "loss": 0.5522, + "step": 3092 + }, + { + "epoch": 0.98976, + "grad_norm": 0.3267607854731609, + "learning_rate": 5.4999408612110834e-08, + "loss": 0.5065, + "step": 3093 + }, + { + "epoch": 0.99008, + "grad_norm": 0.333087602601193, + "learning_rate": 5.161594705753725e-08, + "loss": 0.5683, + "step": 3094 + }, + { + "epoch": 0.9904, + "grad_norm": 0.34012888786014306, + "learning_rate": 4.833986061022477e-08, + "loss": 0.5621, + "step": 3095 + }, + { + "epoch": 0.99072, + "grad_norm": 0.341411041628581, + "learning_rate": 4.517115278969142e-08, + "loss": 0.5392, + "step": 3096 + }, + { + "epoch": 0.99104, + "grad_norm": 0.31678780309177473, + "learning_rate": 4.210982700010302e-08, + "loss": 0.5672, + "step": 3097 + }, + { + "epoch": 0.99136, + "grad_norm": 0.32613069832826314, + "learning_rate": 3.915588653026214e-08, + "loss": 0.5624, + "step": 3098 + }, + { + "epoch": 0.99168, + "grad_norm": 0.3388788089473714, + "learning_rate": 3.6309334553596976e-08, + "loss": 0.5509, + "step": 3099 + }, + { + "epoch": 0.992, + "grad_norm": 0.34094078498765784, + "learning_rate": 3.357017412817243e-08, + "loss": 0.6102, + "step": 3100 + }, + { + "epoch": 0.99232, + "grad_norm": 0.33001286343331304, + "learning_rate": 3.0938408196690136e-08, + "loss": 0.5729, + "step": 3101 + }, + { + "epoch": 0.99264, + "grad_norm": 0.33891470644383886, + "learning_rate": 2.841403958647737e-08, + "loss": 0.536, + "step": 3102 + }, + { + "epoch": 0.99296, + "grad_norm": 0.3534016909798925, + "learning_rate": 2.59970710094537e-08, + "loss": 0.6065, + "step": 3103 + }, + { + "epoch": 0.99328, + "grad_norm": 0.33991779310580084, + "learning_rate": 2.3687505062208738e-08, + "loss": 0.553, + "step": 3104 + }, + { + "epoch": 0.9936, + "grad_norm": 0.3254908024625272, + "learning_rate": 2.1485344225902205e-08, + "loss": 0.532, + "step": 3105 + }, + { + "epoch": 0.99392, + "grad_norm": 0.34983334152372686, + "learning_rate": 1.9390590866341652e-08, + "loss": 0.5995, + "step": 3106 + }, + { + "epoch": 0.99424, + "grad_norm": 0.3537433105206389, + "learning_rate": 1.7403247233926945e-08, + "loss": 0.5993, + "step": 3107 + }, + { + "epoch": 0.99456, + "grad_norm": 0.34247688714640284, + "learning_rate": 1.5523315463672473e-08, + "loss": 0.556, + "step": 3108 + }, + { + "epoch": 0.99488, + "grad_norm": 0.36795006478868114, + "learning_rate": 1.375079757519604e-08, + "loss": 0.6431, + "step": 3109 + }, + { + "epoch": 0.9952, + "grad_norm": 0.3617742501927704, + "learning_rate": 1.2085695472729975e-08, + "loss": 0.5835, + "step": 3110 + }, + { + "epoch": 0.99552, + "grad_norm": 0.35701364706114147, + "learning_rate": 1.0528010945098921e-08, + "loss": 0.6025, + "step": 3111 + }, + { + "epoch": 0.99584, + "grad_norm": 0.3742832929399474, + "learning_rate": 9.077745665730941e-09, + "loss": 0.604, + "step": 3112 + }, + { + "epoch": 0.99616, + "grad_norm": 0.36316691586090294, + "learning_rate": 7.734901192657517e-09, + "loss": 0.5374, + "step": 3113 + }, + { + "epoch": 0.99648, + "grad_norm": 0.3322737274559622, + "learning_rate": 6.499478968502448e-09, + "loss": 0.5394, + "step": 3114 + }, + { + "epoch": 0.9968, + "grad_norm": 0.3594354091921863, + "learning_rate": 5.371480320481848e-09, + "loss": 0.5727, + "step": 3115 + }, + { + "epoch": 0.99712, + "grad_norm": 0.35871374527749056, + "learning_rate": 4.35090646041525e-09, + "loss": 0.5504, + "step": 3116 + }, + { + "epoch": 0.99744, + "grad_norm": 0.38832115662509037, + "learning_rate": 3.437758484714504e-09, + "loss": 0.5861, + "step": 3117 + }, + { + "epoch": 0.99776, + "grad_norm": 0.3561515423446857, + "learning_rate": 2.6320373743837777e-09, + "loss": 0.584, + "step": 3118 + }, + { + "epoch": 0.99808, + "grad_norm": 0.3605520935287755, + "learning_rate": 1.9337439949973502e-09, + "loss": 0.5807, + "step": 3119 + }, + { + "epoch": 0.9984, + "grad_norm": 0.433149334004452, + "learning_rate": 1.3428790967440208e-09, + "loss": 0.6034, + "step": 3120 + }, + { + "epoch": 0.99872, + "grad_norm": 0.39777144062066955, + "learning_rate": 8.594433143938041e-10, + "loss": 0.5864, + "step": 3121 + }, + { + "epoch": 0.99904, + "grad_norm": 0.509715795086994, + "learning_rate": 4.83437167309031e-10, + "loss": 0.5699, + "step": 3122 + }, + { + "epoch": 0.99936, + "grad_norm": 0.33267505866821645, + "learning_rate": 2.148610594221445e-10, + "loss": 0.5725, + "step": 3123 + }, + { + "epoch": 0.99968, + "grad_norm": 0.3562465475370115, + "learning_rate": 5.3715279280108777e-11, + "loss": 0.5751, + "step": 3124 + }, + { + "epoch": 1.0, + "grad_norm": 0.3552379570399888, + "learning_rate": 0.0, + "loss": 0.5645, + "step": 3125 + }, + { + "epoch": 1.0, + "step": 3125, + "total_flos": 2770985822846976.0, + "train_loss": 0.6393652757930756, + "train_runtime": 50013.1059, + "train_samples_per_second": 1.0, + "train_steps_per_second": 0.062 + } + ], + "logging_steps": 1.0, + "max_steps": 3125, + "num_input_tokens_seen": 0, + "num_train_epochs": 1, + "save_steps": 500, + "stateful_callbacks": { + "TrainerControl": { + "args": { + "should_epoch_stop": false, + "should_evaluate": false, + "should_log": false, + "should_save": true, + "should_training_stop": false + }, + "attributes": {} + } + }, + "total_flos": 2770985822846976.0, + "train_batch_size": 8, + "trial_name": null, + "trial_params": null +} diff --git a/single_dataset/img2json/VideoGameBunny_v1_1-Llama-3-8B-V-img2json_dataset_5000_epochs_1_lora/README.md b/single_dataset/img2json/VideoGameBunny_v1_1-Llama-3-8B-V-img2json_dataset_5000_epochs_1_lora/README.md new file mode 100644 index 0000000000000000000000000000000000000000..4f291d8d6fdb82fcff274f83bcc52b834347797c --- /dev/null +++ b/single_dataset/img2json/VideoGameBunny_v1_1-Llama-3-8B-V-img2json_dataset_5000_epochs_1_lora/README.md @@ -0,0 +1,202 @@ +--- +library_name: peft +base_model: ./weights/Bunny-v1_1-Llama-3-8B-V +--- + +# Model Card for Model ID + + + + + +## Model Details + +### Model Description + + + + + +- **Developed by:** [More Information Needed] +- **Funded by [optional]:** [More Information Needed] +- **Shared by [optional]:** [More Information Needed] +- **Model type:** [More Information Needed] +- **Language(s) (NLP):** [More Information Needed] +- **License:** [More Information Needed] +- **Finetuned from model [optional]:** [More Information Needed] + +### Model Sources [optional] + + + +- **Repository:** [More Information Needed] +- **Paper [optional]:** [More Information Needed] +- **Demo [optional]:** [More Information Needed] + +## Uses + + + +### Direct Use + + + +[More Information Needed] + +### Downstream Use [optional] + + + +[More Information Needed] + +### Out-of-Scope Use + + + +[More Information Needed] + +## Bias, Risks, and Limitations + + + +[More Information Needed] + +### Recommendations + + + +Users (both direct and downstream) should be made aware of the risks, biases and limitations of the model. More information needed for further recommendations. + +## How to Get Started with the Model + +Use the code below to get started with the model. + +[More Information Needed] + +## Training Details + +### Training Data + + + +[More Information Needed] + +### Training Procedure + + + +#### Preprocessing [optional] + +[More Information Needed] + + +#### Training Hyperparameters + +- **Training regime:** [More Information Needed] + +#### Speeds, Sizes, Times [optional] + + + +[More Information Needed] + +## Evaluation + + + +### Testing Data, Factors & Metrics + +#### Testing Data + + + +[More Information Needed] + +#### Factors + + + +[More Information Needed] + +#### Metrics + + + +[More Information Needed] + +### Results + +[More Information Needed] + +#### Summary + + + +## Model Examination [optional] + + + +[More Information Needed] + +## Environmental Impact + + + +Carbon emissions can be estimated using the [Machine Learning Impact calculator](https://mlco2.github.io/impact#compute) presented in [Lacoste et al. (2019)](https://arxiv.org/abs/1910.09700). + +- **Hardware Type:** [More Information Needed] +- **Hours used:** [More Information Needed] +- **Cloud Provider:** [More Information Needed] +- **Compute Region:** [More Information Needed] +- **Carbon Emitted:** [More Information Needed] + +## Technical Specifications [optional] + +### Model Architecture and Objective + +[More Information Needed] + +### Compute Infrastructure + +[More Information Needed] + +#### Hardware + +[More Information Needed] + +#### Software + +[More Information Needed] + +## Citation [optional] + + + +**BibTeX:** + +[More Information Needed] + +**APA:** + +[More Information Needed] + +## Glossary [optional] + + + +[More Information Needed] + +## More Information [optional] + +[More Information Needed] + +## Model Card Authors [optional] + +[More Information Needed] + +## Model Card Contact + +[More Information Needed] +### Framework versions + +- PEFT 0.11.1 \ No newline at end of file diff --git a/single_dataset/img2json/VideoGameBunny_v1_1-Llama-3-8B-V-img2json_dataset_5000_epochs_1_lora/adapter_config.json b/single_dataset/img2json/VideoGameBunny_v1_1-Llama-3-8B-V-img2json_dataset_5000_epochs_1_lora/adapter_config.json new file mode 100644 index 0000000000000000000000000000000000000000..932034178d574272add53b52f3aad9cfdef411d0 --- /dev/null +++ b/single_dataset/img2json/VideoGameBunny_v1_1-Llama-3-8B-V-img2json_dataset_5000_epochs_1_lora/adapter_config.json @@ -0,0 +1,34 @@ +{ + "alpha_pattern": {}, + "auto_mapping": null, + "base_model_name_or_path": "./weights/Bunny-v1_1-Llama-3-8B-V", + "bias": "none", + "fan_in_fan_out": false, + "inference_mode": true, + "init_lora_weights": true, + "layer_replication": null, + "layers_pattern": null, + "layers_to_transform": null, + "loftq_config": {}, + "lora_alpha": 256, + "lora_dropout": 0.1, + "megatron_config": null, + "megatron_core": "megatron.core", + "modules_to_save": null, + "peft_type": "LORA", + "r": 128, + "rank_pattern": {}, + "revision": null, + "target_modules": [ + "v_proj", + "up_proj", + "q_proj", + "down_proj", + "o_proj", + "k_proj", + "gate_proj" + ], + "task_type": "CAUSAL_LM", + "use_dora": false, + "use_rslora": false +} \ No newline at end of file diff --git a/single_dataset/img2json/VideoGameBunny_v1_1-Llama-3-8B-V-img2json_dataset_5000_epochs_1_lora/adapter_model.safetensors b/single_dataset/img2json/VideoGameBunny_v1_1-Llama-3-8B-V-img2json_dataset_5000_epochs_1_lora/adapter_model.safetensors new file mode 100644 index 0000000000000000000000000000000000000000..628d6aa4457929c0fb80c641ebfd0ced124e6bf3 --- /dev/null +++ b/single_dataset/img2json/VideoGameBunny_v1_1-Llama-3-8B-V-img2json_dataset_5000_epochs_1_lora/adapter_model.safetensors @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:ae95ba94a4f8f38e236e28a4ee58be7941b804709c60c205490bc088ba9943cb +size 671150064 diff --git a/single_dataset/img2json/VideoGameBunny_v1_1-Llama-3-8B-V-img2json_dataset_5000_epochs_1_lora/config.json b/single_dataset/img2json/VideoGameBunny_v1_1-Llama-3-8B-V-img2json_dataset_5000_epochs_1_lora/config.json new file mode 100644 index 0000000000000000000000000000000000000000..ee7ed86f4ff6f3381a6bb24981934be89aa376e7 --- /dev/null +++ b/single_dataset/img2json/VideoGameBunny_v1_1-Llama-3-8B-V-img2json_dataset_5000_epochs_1_lora/config.json @@ -0,0 +1,45 @@ +{ + "_name_or_path": "./weights/Bunny-v1_1-Llama-3-8B-V", + "architectures": [ + "BunnyLlamaForCausalLM" + ], + "attention_bias": false, + "attention_dropout": 0.0, + "auto_map": { + "AutoConfig": "configuration_bunny_llama.BunnyLlamaConfig", + "AutoModelForCausalLM": "modeling_bunny_llama.BunnyLlamaForCausalLM" + }, + "bos_token_id": 128000, + "continuous_training": false, + "eos_token_id": 128001, + "freeze_mm_mlp_adapter": false, + "hidden_act": "silu", + "hidden_size": 4096, + "image_aspect_ratio": "pad", + "initializer_range": 0.02, + "intermediate_size": 14336, + "max_position_embeddings": 8192, + "mm_hidden_size": 3456, + "mm_projector_lr": null, + "mm_projector_type": "mlp2x_gelu", + "mm_vision_tower": "./weights/siglip-so400m-patch14-384", + "model_type": "bunny-llama", + "num_attention_heads": 32, + "num_hidden_layers": 32, + "num_key_value_heads": 8, + "pretraining_tp": 1, + "rms_norm_eps": 1e-05, + "rope_scaling": null, + "rope_theta": 500000.0, + "tie_word_embeddings": false, + "tokenizer_model_max_length": 2048, + "tokenizer_padding_side": "right", + "torch_dtype": "float16", + "transformers_version": "4.41.2", + "tune_mm_mlp_adapter": false, + "unfreeze_vision_tower": true, + "use_cache": true, + "use_mm_proj": true, + "use_s2": true, + "vocab_size": 128256 +} diff --git a/single_dataset/img2json/VideoGameBunny_v1_1-Llama-3-8B-V-img2json_dataset_5000_epochs_1_lora/non_lora_trainables.bin b/single_dataset/img2json/VideoGameBunny_v1_1-Llama-3-8B-V-img2json_dataset_5000_epochs_1_lora/non_lora_trainables.bin new file mode 100644 index 0000000000000000000000000000000000000000..fd49991c44b4d87bd118b70f160b75978e43b6bb --- /dev/null +++ b/single_dataset/img2json/VideoGameBunny_v1_1-Llama-3-8B-V-img2json_dataset_5000_epochs_1_lora/non_lora_trainables.bin @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:fa722027eae02f64df4708c7015be75ac9446545d1383e5548676926c114f8f1 +size 918507402 diff --git a/single_dataset/img2json/VideoGameBunny_v1_1-Llama-3-8B-V-img2json_dataset_5000_epochs_1_lora/trainer_state.json b/single_dataset/img2json/VideoGameBunny_v1_1-Llama-3-8B-V-img2json_dataset_5000_epochs_1_lora/trainer_state.json new file mode 100644 index 0000000000000000000000000000000000000000..157b51da3e7c5591ff3a4625eb4527f3a1010a02 --- /dev/null +++ b/single_dataset/img2json/VideoGameBunny_v1_1-Llama-3-8B-V-img2json_dataset_5000_epochs_1_lora/trainer_state.json @@ -0,0 +1,2226 @@ +{ + "best_metric": null, + "best_model_checkpoint": null, + "epoch": 0.9984, + "eval_steps": 500, + "global_step": 312, + "is_hyper_param_search": false, + "is_local_process_zero": true, + "is_world_process_zero": true, + "log_history": [ + { + "epoch": 0.0032, + "grad_norm": 1.2798540748627416, + "learning_rate": 2e-05, + "loss": 1.5555, + "step": 1 + }, + { + "epoch": 0.0064, + "grad_norm": 1.1130663775030645, + "learning_rate": 4e-05, + "loss": 1.5566, + "step": 2 + }, + { + "epoch": 0.0096, + "grad_norm": 1.1165879934971559, + "learning_rate": 6e-05, + "loss": 1.4623, + "step": 3 + }, + { + "epoch": 0.0128, + "grad_norm": 1.0058575486770314, + "learning_rate": 8e-05, + "loss": 1.3626, + "step": 4 + }, + { + "epoch": 0.016, + "grad_norm": 1.219011347258107, + "learning_rate": 0.0001, + "loss": 1.2277, + "step": 5 + }, + { + "epoch": 0.0192, + "grad_norm": 1.375374057893773, + "learning_rate": 0.00012, + "loss": 1.0392, + "step": 6 + }, + { + "epoch": 0.0224, + "grad_norm": 0.9163969142049067, + "learning_rate": 0.00014, + "loss": 1.0107, + "step": 7 + }, + { + "epoch": 0.0256, + "grad_norm": 0.7529784242992968, + "learning_rate": 0.00016, + "loss": 0.9169, + "step": 8 + }, + { + "epoch": 0.0288, + "grad_norm": 0.6095581532563064, + "learning_rate": 0.00018, + "loss": 0.8641, + "step": 9 + }, + { + "epoch": 0.032, + "grad_norm": 0.5462052358272893, + "learning_rate": 0.0002, + "loss": 0.8992, + "step": 10 + }, + { + "epoch": 0.0352, + "grad_norm": 0.5157666896476973, + "learning_rate": 0.00019999458931878073, + "loss": 0.9204, + "step": 11 + }, + { + "epoch": 0.0384, + "grad_norm": 0.5002919821711009, + "learning_rate": 0.0001999783578606323, + "loss": 0.8508, + "step": 12 + }, + { + "epoch": 0.0416, + "grad_norm": 0.5584305955660048, + "learning_rate": 0.00019995130738201966, + "loss": 0.8616, + "step": 13 + }, + { + "epoch": 0.0448, + "grad_norm": 0.5118169294900369, + "learning_rate": 0.0001999134408101731, + "loss": 0.8525, + "step": 14 + }, + { + "epoch": 0.048, + "grad_norm": 0.5037775467974861, + "learning_rate": 0.00019986476224277165, + "loss": 0.8293, + "step": 15 + }, + { + "epoch": 0.0512, + "grad_norm": 0.5305323185219055, + "learning_rate": 0.00019980527694749952, + "loss": 0.92, + "step": 16 + }, + { + "epoch": 0.0544, + "grad_norm": 0.49521462897177754, + "learning_rate": 0.00019973499136147606, + "loss": 0.8508, + "step": 17 + }, + { + "epoch": 0.0576, + "grad_norm": 0.8517689323701216, + "learning_rate": 0.0001996539130905593, + "loss": 0.8561, + "step": 18 + }, + { + "epoch": 0.0608, + "grad_norm": 0.462010867053101, + "learning_rate": 0.0001995620509085228, + "loss": 0.8378, + "step": 19 + }, + { + "epoch": 0.064, + "grad_norm": 0.42475383178016946, + "learning_rate": 0.00019945941475610623, + "loss": 0.8345, + "step": 20 + }, + { + "epoch": 0.0672, + "grad_norm": 0.4318454627640549, + "learning_rate": 0.0001993460157399396, + "loss": 0.8076, + "step": 21 + }, + { + "epoch": 0.0704, + "grad_norm": 0.4356633406944935, + "learning_rate": 0.0001992218661313415, + "loss": 0.8393, + "step": 22 + }, + { + "epoch": 0.0736, + "grad_norm": 0.42037201194141616, + "learning_rate": 0.00019908697936499103, + "loss": 0.8468, + "step": 23 + }, + { + "epoch": 0.0768, + "grad_norm": 0.44369315732198766, + "learning_rate": 0.00019894137003747403, + "loss": 0.8319, + "step": 24 + }, + { + "epoch": 0.08, + "grad_norm": 0.5254814555076129, + "learning_rate": 0.00019878505390570362, + "loss": 0.8612, + "step": 25 + }, + { + "epoch": 0.0832, + "grad_norm": 0.4744180205622037, + "learning_rate": 0.00019861804788521493, + "loss": 0.8749, + "step": 26 + }, + { + "epoch": 0.0864, + "grad_norm": 0.46898004305474095, + "learning_rate": 0.00019844037004833473, + "loss": 0.784, + "step": 27 + }, + { + "epoch": 0.0896, + "grad_norm": 0.4623791910998875, + "learning_rate": 0.00019825203962222572, + "loss": 0.84, + "step": 28 + }, + { + "epoch": 0.0928, + "grad_norm": 0.4366697265960226, + "learning_rate": 0.0001980530769868059, + "loss": 0.797, + "step": 29 + }, + { + "epoch": 0.096, + "grad_norm": 0.44423044794779787, + "learning_rate": 0.00019784350367254322, + "loss": 0.8258, + "step": 30 + }, + { + "epoch": 0.0992, + "grad_norm": 0.42626300065856604, + "learning_rate": 0.0001976233423581255, + "loss": 0.8326, + "step": 31 + }, + { + "epoch": 0.1024, + "grad_norm": 0.4193465122697224, + "learning_rate": 0.0001973926168680066, + "loss": 0.8412, + "step": 32 + }, + { + "epoch": 0.1056, + "grad_norm": 0.40681649254496394, + "learning_rate": 0.00019715135216982798, + "loss": 0.7896, + "step": 33 + }, + { + "epoch": 0.1088, + "grad_norm": 0.41612527711854297, + "learning_rate": 0.0001968995743717171, + "loss": 0.778, + "step": 34 + }, + { + "epoch": 0.112, + "grad_norm": 0.39751435632214827, + "learning_rate": 0.00019663731071946206, + "loss": 0.8441, + "step": 35 + }, + { + "epoch": 0.1152, + "grad_norm": 0.4077296296795258, + "learning_rate": 0.00019636458959356316, + "loss": 0.8604, + "step": 36 + }, + { + "epoch": 0.1184, + "grad_norm": 0.4180398678714927, + "learning_rate": 0.0001960814405061619, + "loss": 0.8307, + "step": 37 + }, + { + "epoch": 0.1216, + "grad_norm": 0.41438332644147524, + "learning_rate": 0.00019578789409784727, + "loss": 0.8027, + "step": 38 + }, + { + "epoch": 0.1248, + "grad_norm": 0.4134687870277017, + "learning_rate": 0.00019548398213434007, + "loss": 0.7947, + "step": 39 + }, + { + "epoch": 0.128, + "grad_norm": 0.3843193571282607, + "learning_rate": 0.00019516973750305532, + "loss": 0.7355, + "step": 40 + }, + { + "epoch": 0.1312, + "grad_norm": 0.41552762355305467, + "learning_rate": 0.00019484519420954354, + "loss": 0.8159, + "step": 41 + }, + { + "epoch": 0.1344, + "grad_norm": 0.41784625055790997, + "learning_rate": 0.00019451038737381077, + "loss": 0.7917, + "step": 42 + }, + { + "epoch": 0.1376, + "grad_norm": 0.38524871144404454, + "learning_rate": 0.00019416535322651818, + "loss": 0.762, + "step": 43 + }, + { + "epoch": 0.1408, + "grad_norm": 0.40557699664979796, + "learning_rate": 0.00019381012910506146, + "loss": 0.8135, + "step": 44 + }, + { + "epoch": 0.144, + "grad_norm": 0.40666034746303753, + "learning_rate": 0.00019344475344953012, + "loss": 0.8029, + "step": 45 + }, + { + "epoch": 0.1472, + "grad_norm": 0.39051816926073535, + "learning_rate": 0.00019306926579854821, + "loss": 0.8264, + "step": 46 + }, + { + "epoch": 0.1504, + "grad_norm": 0.37829129722754057, + "learning_rate": 0.00019268370678499533, + "loss": 0.7413, + "step": 47 + }, + { + "epoch": 0.1536, + "grad_norm": 0.4220307185970994, + "learning_rate": 0.0001922881181316097, + "loss": 0.8492, + "step": 48 + }, + { + "epoch": 0.1568, + "grad_norm": 0.3778753770849601, + "learning_rate": 0.00019188254264647337, + "loss": 0.7066, + "step": 49 + }, + { + "epoch": 0.16, + "grad_norm": 0.374888863553464, + "learning_rate": 0.0001914670242183795, + "loss": 0.7368, + "step": 50 + }, + { + "epoch": 0.1632, + "grad_norm": 0.39133935091911604, + "learning_rate": 0.0001910416078120832, + "loss": 0.724, + "step": 51 + }, + { + "epoch": 0.1664, + "grad_norm": 0.40002567971895214, + "learning_rate": 0.0001906063394634356, + "loss": 0.7878, + "step": 52 + }, + { + "epoch": 0.1696, + "grad_norm": 0.42178931968695155, + "learning_rate": 0.00019016126627440237, + "loss": 0.8216, + "step": 53 + }, + { + "epoch": 0.1728, + "grad_norm": 0.40860442858917406, + "learning_rate": 0.00018970643640796642, + "loss": 0.7551, + "step": 54 + }, + { + "epoch": 0.176, + "grad_norm": 0.39852413957814403, + "learning_rate": 0.000189241899082916, + "loss": 0.7644, + "step": 55 + }, + { + "epoch": 0.1792, + "grad_norm": 0.4189572790433841, + "learning_rate": 0.00018876770456851877, + "loss": 0.8023, + "step": 56 + }, + { + "epoch": 0.1824, + "grad_norm": 0.40855887030389637, + "learning_rate": 0.0001882839041790818, + "loss": 0.7531, + "step": 57 + }, + { + "epoch": 0.1856, + "grad_norm": 0.3801608851366451, + "learning_rate": 0.00018779055026839868, + "loss": 0.7723, + "step": 58 + }, + { + "epoch": 0.1888, + "grad_norm": 0.4139301030131194, + "learning_rate": 0.00018728769622408423, + "loss": 0.825, + "step": 59 + }, + { + "epoch": 0.192, + "grad_norm": 0.4017543169947211, + "learning_rate": 0.00018677539646179707, + "loss": 0.8496, + "step": 60 + }, + { + "epoch": 0.1952, + "grad_norm": 0.44378337867320256, + "learning_rate": 0.00018625370641935129, + "loss": 0.8018, + "step": 61 + }, + { + "epoch": 0.1984, + "grad_norm": 0.41300498833124244, + "learning_rate": 0.00018572268255071718, + "loss": 0.7794, + "step": 62 + }, + { + "epoch": 0.2016, + "grad_norm": 0.40657492525865646, + "learning_rate": 0.00018518238231991218, + "loss": 0.7831, + "step": 63 + }, + { + "epoch": 0.2048, + "grad_norm": 0.395465279461956, + "learning_rate": 0.00018463286419478255, + "loss": 0.7721, + "step": 64 + }, + { + "epoch": 0.208, + "grad_norm": 0.42703491839853985, + "learning_rate": 0.00018407418764067627, + "loss": 0.7692, + "step": 65 + }, + { + "epoch": 0.2112, + "grad_norm": 0.38479406258477233, + "learning_rate": 0.00018350641311400812, + "loss": 0.7739, + "step": 66 + }, + { + "epoch": 0.2144, + "grad_norm": 0.4047609139504125, + "learning_rate": 0.0001829296020557174, + "loss": 0.7813, + "step": 67 + }, + { + "epoch": 0.2176, + "grad_norm": 0.4020033275712984, + "learning_rate": 0.00018234381688461942, + "loss": 0.7504, + "step": 68 + }, + { + "epoch": 0.2208, + "grad_norm": 0.3884606066242859, + "learning_rate": 0.0001817491209906506, + "loss": 0.7561, + "step": 69 + }, + { + "epoch": 0.224, + "grad_norm": 0.3934207199977672, + "learning_rate": 0.00018114557872800905, + "loss": 0.7415, + "step": 70 + }, + { + "epoch": 0.2272, + "grad_norm": 0.3873179960204124, + "learning_rate": 0.00018053325540819045, + "loss": 0.7912, + "step": 71 + }, + { + "epoch": 0.2304, + "grad_norm": 0.37693458798587726, + "learning_rate": 0.0001799122172929206, + "loss": 0.7326, + "step": 72 + }, + { + "epoch": 0.2336, + "grad_norm": 0.4061678138034663, + "learning_rate": 0.00017928253158698473, + "loss": 0.7822, + "step": 73 + }, + { + "epoch": 0.2368, + "grad_norm": 0.3892051655151776, + "learning_rate": 0.0001786442664309554, + "loss": 0.7584, + "step": 74 + }, + { + "epoch": 0.24, + "grad_norm": 0.38310012668403887, + "learning_rate": 0.0001779974908938184, + "loss": 0.7867, + "step": 75 + }, + { + "epoch": 0.2432, + "grad_norm": 0.3784062031016005, + "learning_rate": 0.0001773422749654988, + "loss": 0.7316, + "step": 76 + }, + { + "epoch": 0.2464, + "grad_norm": 0.3907279184041235, + "learning_rate": 0.00017667868954928694, + "loss": 0.7596, + "step": 77 + }, + { + "epoch": 0.2496, + "grad_norm": 0.4139029656655114, + "learning_rate": 0.00017600680645416583, + "loss": 0.8192, + "step": 78 + }, + { + "epoch": 0.2528, + "grad_norm": 0.4157507841723872, + "learning_rate": 0.00017532669838704035, + "loss": 0.7086, + "step": 79 + }, + { + "epoch": 0.256, + "grad_norm": 0.39936633784499215, + "learning_rate": 0.00017463843894486937, + "loss": 0.7374, + "step": 80 + }, + { + "epoch": 0.2592, + "grad_norm": 0.4238196965131133, + "learning_rate": 0.0001739421026067017, + "loss": 0.7223, + "step": 81 + }, + { + "epoch": 0.2624, + "grad_norm": 0.3767284979400297, + "learning_rate": 0.00017323776472561627, + "loss": 0.7621, + "step": 82 + }, + { + "epoch": 0.2656, + "grad_norm": 0.40780204061722725, + "learning_rate": 0.00017252550152056795, + "loss": 0.7928, + "step": 83 + }, + { + "epoch": 0.2688, + "grad_norm": 0.3948337750910549, + "learning_rate": 0.0001718053900681397, + "loss": 0.7786, + "step": 84 + }, + { + "epoch": 0.272, + "grad_norm": 0.4082750981486944, + "learning_rate": 0.00017107750829420176, + "loss": 0.7655, + "step": 85 + }, + { + "epoch": 0.2752, + "grad_norm": 0.3817389147753865, + "learning_rate": 0.00017034193496547902, + "loss": 0.7724, + "step": 86 + }, + { + "epoch": 0.2784, + "grad_norm": 0.4008774240859956, + "learning_rate": 0.00016959874968102735, + "loss": 0.7723, + "step": 87 + }, + { + "epoch": 0.2816, + "grad_norm": 0.3788953655261612, + "learning_rate": 0.00016884803286362, + "loss": 0.7652, + "step": 88 + }, + { + "epoch": 0.2848, + "grad_norm": 0.425366550485755, + "learning_rate": 0.00016808986575104465, + "loss": 0.7428, + "step": 89 + }, + { + "epoch": 0.288, + "grad_norm": 0.44329184148884065, + "learning_rate": 0.00016732433038731242, + "loss": 0.7772, + "step": 90 + }, + { + "epoch": 0.2912, + "grad_norm": 0.46025119810711906, + "learning_rate": 0.0001665515096137797, + "loss": 0.7935, + "step": 91 + }, + { + "epoch": 0.2944, + "grad_norm": 0.37737275103783063, + "learning_rate": 0.00016577148706018328, + "loss": 0.6967, + "step": 92 + }, + { + "epoch": 0.2976, + "grad_norm": 0.42706714197880036, + "learning_rate": 0.00016498434713559088, + "loss": 0.7906, + "step": 93 + }, + { + "epoch": 0.3008, + "grad_norm": 0.45563949250899594, + "learning_rate": 0.00016419017501926656, + "loss": 0.7696, + "step": 94 + }, + { + "epoch": 0.304, + "grad_norm": 0.3776185851415028, + "learning_rate": 0.0001633890566514535, + "loss": 0.7572, + "step": 95 + }, + { + "epoch": 0.3072, + "grad_norm": 0.36984631782078564, + "learning_rate": 0.00016258107872407375, + "loss": 0.705, + "step": 96 + }, + { + "epoch": 0.3104, + "grad_norm": 0.4346876784878423, + "learning_rate": 0.0001617663286713474, + "loss": 0.7416, + "step": 97 + }, + { + "epoch": 0.3136, + "grad_norm": 0.36934331041539875, + "learning_rate": 0.00016094489466033043, + "loss": 0.7529, + "step": 98 + }, + { + "epoch": 0.3168, + "grad_norm": 0.4081247719533251, + "learning_rate": 0.00016011686558137448, + "loss": 0.7242, + "step": 99 + }, + { + "epoch": 0.32, + "grad_norm": 0.40855001328336676, + "learning_rate": 0.0001592823310385073, + "loss": 0.7346, + "step": 100 + }, + { + "epoch": 0.3232, + "grad_norm": 0.3864667377219663, + "learning_rate": 0.0001584413813397364, + "loss": 0.7443, + "step": 101 + }, + { + "epoch": 0.3264, + "grad_norm": 0.41280954364426664, + "learning_rate": 0.00015759410748727662, + "loss": 0.7136, + "step": 102 + }, + { + "epoch": 0.3296, + "grad_norm": 0.3634883141450418, + "learning_rate": 0.00015674060116770236, + "loss": 0.7121, + "step": 103 + }, + { + "epoch": 0.3328, + "grad_norm": 0.395695601758825, + "learning_rate": 0.00015588095474202595, + "loss": 0.7455, + "step": 104 + }, + { + "epoch": 0.336, + "grad_norm": 0.3643597037537018, + "learning_rate": 0.00015501526123570277, + "loss": 0.7331, + "step": 105 + }, + { + "epoch": 0.3392, + "grad_norm": 0.38169101775305364, + "learning_rate": 0.00015414361432856475, + "loss": 0.783, + "step": 106 + }, + { + "epoch": 0.3424, + "grad_norm": 0.37909138793115676, + "learning_rate": 0.0001532661083446829, + "loss": 0.7507, + "step": 107 + }, + { + "epoch": 0.3456, + "grad_norm": 0.3687030856036655, + "learning_rate": 0.00015238283824216015, + "loss": 0.784, + "step": 108 + }, + { + "epoch": 0.3488, + "grad_norm": 0.38865913887000614, + "learning_rate": 0.00015149389960285558, + "loss": 0.7235, + "step": 109 + }, + { + "epoch": 0.352, + "grad_norm": 0.38487452507771075, + "learning_rate": 0.00015059938862204127, + "loss": 0.7379, + "step": 110 + }, + { + "epoch": 0.3552, + "grad_norm": 0.3890186367456905, + "learning_rate": 0.00014969940209799248, + "loss": 0.7081, + "step": 111 + }, + { + "epoch": 0.3584, + "grad_norm": 0.39434783916904126, + "learning_rate": 0.00014879403742151283, + "loss": 0.7741, + "step": 112 + }, + { + "epoch": 0.3616, + "grad_norm": 0.4536991410557638, + "learning_rate": 0.00014788339256539544, + "loss": 0.7883, + "step": 113 + }, + { + "epoch": 0.3648, + "grad_norm": 0.3874086143124245, + "learning_rate": 0.0001469675660738206, + "loss": 0.7248, + "step": 114 + }, + { + "epoch": 0.368, + "grad_norm": 0.4140420014704873, + "learning_rate": 0.00014604665705169237, + "loss": 0.7656, + "step": 115 + }, + { + "epoch": 0.3712, + "grad_norm": 0.37306782116254605, + "learning_rate": 0.00014512076515391375, + "loss": 0.7952, + "step": 116 + }, + { + "epoch": 0.3744, + "grad_norm": 0.3707950552859267, + "learning_rate": 0.00014418999057460276, + "loss": 0.7108, + "step": 117 + }, + { + "epoch": 0.3776, + "grad_norm": 0.38928915678585724, + "learning_rate": 0.0001432544340362501, + "loss": 0.7361, + "step": 118 + }, + { + "epoch": 0.3808, + "grad_norm": 0.3608998439607155, + "learning_rate": 0.00014231419677881966, + "loss": 0.683, + "step": 119 + }, + { + "epoch": 0.384, + "grad_norm": 0.39931417035035704, + "learning_rate": 0.00014136938054879283, + "loss": 0.7405, + "step": 120 + }, + { + "epoch": 0.3872, + "grad_norm": 0.3587369475012143, + "learning_rate": 0.00014042008758815818, + "loss": 0.706, + "step": 121 + }, + { + "epoch": 0.3904, + "grad_norm": 0.3750960748163733, + "learning_rate": 0.00013946642062334766, + "loss": 0.754, + "step": 122 + }, + { + "epoch": 0.3936, + "grad_norm": 0.36451783245977326, + "learning_rate": 0.00013850848285411994, + "loss": 0.7736, + "step": 123 + }, + { + "epoch": 0.3968, + "grad_norm": 0.38677483486032344, + "learning_rate": 0.000137546377942393, + "loss": 0.7508, + "step": 124 + }, + { + "epoch": 0.4, + "grad_norm": 0.3744920171166124, + "learning_rate": 0.00013658021000102636, + "loss": 0.7312, + "step": 125 + }, + { + "epoch": 0.4032, + "grad_norm": 0.3644901776487304, + "learning_rate": 0.00013561008358255468, + "loss": 0.7111, + "step": 126 + }, + { + "epoch": 0.4064, + "grad_norm": 0.3661180746712227, + "learning_rate": 0.00013463610366787392, + "loss": 0.6941, + "step": 127 + }, + { + "epoch": 0.4096, + "grad_norm": 0.3850909914727337, + "learning_rate": 0.00013365837565488064, + "loss": 0.6558, + "step": 128 + }, + { + "epoch": 0.4128, + "grad_norm": 0.38408294553109434, + "learning_rate": 0.0001326770053470668, + "loss": 0.7566, + "step": 129 + }, + { + "epoch": 0.416, + "grad_norm": 0.3798445105114034, + "learning_rate": 0.0001316920989420703, + "loss": 0.7327, + "step": 130 + }, + { + "epoch": 0.4192, + "grad_norm": 0.37477045736590797, + "learning_rate": 0.00013070376302018287, + "loss": 0.7466, + "step": 131 + }, + { + "epoch": 0.4224, + "grad_norm": 0.3602184650700641, + "learning_rate": 0.00012971210453281674, + "loss": 0.7361, + "step": 132 + }, + { + "epoch": 0.4256, + "grad_norm": 0.37286154538507166, + "learning_rate": 0.000128717230790931, + "loss": 0.7151, + "step": 133 + }, + { + "epoch": 0.4288, + "grad_norm": 0.36266055772384015, + "learning_rate": 0.00012771924945341906, + "loss": 0.7254, + "step": 134 + }, + { + "epoch": 0.432, + "grad_norm": 0.37012911200493753, + "learning_rate": 0.00012671826851545851, + "loss": 0.7276, + "step": 135 + }, + { + "epoch": 0.4352, + "grad_norm": 0.3611107229784553, + "learning_rate": 0.0001257143962968246, + "loss": 0.7159, + "step": 136 + }, + { + "epoch": 0.4384, + "grad_norm": 0.36198737541944453, + "learning_rate": 0.00012470774143016853, + "loss": 0.6928, + "step": 137 + }, + { + "epoch": 0.4416, + "grad_norm": 0.36055521300792104, + "learning_rate": 0.00012369841284926188, + "loss": 0.6529, + "step": 138 + }, + { + "epoch": 0.4448, + "grad_norm": 0.36079112047603207, + "learning_rate": 0.00012268651977720866, + "loss": 0.7311, + "step": 139 + }, + { + "epoch": 0.448, + "grad_norm": 0.38222230627087117, + "learning_rate": 0.00012167217171462566, + "loss": 0.7788, + "step": 140 + }, + { + "epoch": 0.4512, + "grad_norm": 0.3583196360529785, + "learning_rate": 0.0001206554784277931, + "loss": 0.7411, + "step": 141 + }, + { + "epoch": 0.4544, + "grad_norm": 0.3693156445504609, + "learning_rate": 0.00011963654993677645, + "loss": 0.7223, + "step": 142 + }, + { + "epoch": 0.4576, + "grad_norm": 0.5253433057283227, + "learning_rate": 0.00011861549650352069, + "loss": 0.7243, + "step": 143 + }, + { + "epoch": 0.4608, + "grad_norm": 0.39667834455785345, + "learning_rate": 0.00011759242861991855, + "loss": 0.7103, + "step": 144 + }, + { + "epoch": 0.464, + "grad_norm": 0.3609170146865518, + "learning_rate": 0.00011656745699585371, + "loss": 0.6811, + "step": 145 + }, + { + "epoch": 0.4672, + "grad_norm": 0.36480685243748595, + "learning_rate": 0.00011554069254722051, + "loss": 0.7154, + "step": 146 + }, + { + "epoch": 0.4704, + "grad_norm": 0.34362916664265325, + "learning_rate": 0.00011451224638392129, + "loss": 0.7292, + "step": 147 + }, + { + "epoch": 0.4736, + "grad_norm": 0.3831351941947736, + "learning_rate": 0.00011348222979784289, + "loss": 0.772, + "step": 148 + }, + { + "epoch": 0.4768, + "grad_norm": 0.37678900834449036, + "learning_rate": 0.00011245075425081328, + "loss": 0.7076, + "step": 149 + }, + { + "epoch": 0.48, + "grad_norm": 0.35821903152379814, + "learning_rate": 0.00011141793136253986, + "loss": 0.7138, + "step": 150 + }, + { + "epoch": 0.4832, + "grad_norm": 0.36335736308223704, + "learning_rate": 0.0001103838728985307, + "loss": 0.7016, + "step": 151 + }, + { + "epoch": 0.4864, + "grad_norm": 0.37001387570803446, + "learning_rate": 0.000109348690758, + "loss": 0.7365, + "step": 152 + }, + { + "epoch": 0.4896, + "grad_norm": 0.34053687246539766, + "learning_rate": 0.00010831249696175918, + "loss": 0.7046, + "step": 153 + }, + { + "epoch": 0.4928, + "grad_norm": 0.3480581382643813, + "learning_rate": 0.0001072754036400944, + "loss": 0.7043, + "step": 154 + }, + { + "epoch": 0.496, + "grad_norm": 0.3622932003356663, + "learning_rate": 0.00010623752302063283, + "loss": 0.7204, + "step": 155 + }, + { + "epoch": 0.4992, + "grad_norm": 0.3638401170001878, + "learning_rate": 0.00010519896741619803, + "loss": 0.6955, + "step": 156 + }, + { + "epoch": 0.5024, + "grad_norm": 0.35897583152772905, + "learning_rate": 0.00010415984921265609, + "loss": 0.7112, + "step": 157 + }, + { + "epoch": 0.5056, + "grad_norm": 0.36592350625047265, + "learning_rate": 0.00010312028085675391, + "loss": 0.7081, + "step": 158 + }, + { + "epoch": 0.5088, + "grad_norm": 0.35759204884810725, + "learning_rate": 0.00010208037484395114, + "loss": 0.737, + "step": 159 + }, + { + "epoch": 0.512, + "grad_norm": 0.3629868550931301, + "learning_rate": 0.00010104024370624644, + "loss": 0.7238, + "step": 160 + }, + { + "epoch": 0.5152, + "grad_norm": 0.37705761812761196, + "learning_rate": 0.0001, + "loss": 0.7096, + "step": 161 + }, + { + "epoch": 0.5184, + "grad_norm": 0.36707266251522613, + "learning_rate": 9.895975629375359e-05, + "loss": 0.6896, + "step": 162 + }, + { + "epoch": 0.5216, + "grad_norm": 0.41185970441538455, + "learning_rate": 9.791962515604887e-05, + "loss": 0.7107, + "step": 163 + }, + { + "epoch": 0.5248, + "grad_norm": 0.35197588060842433, + "learning_rate": 9.687971914324607e-05, + "loss": 0.6446, + "step": 164 + }, + { + "epoch": 0.528, + "grad_norm": 0.38227151419378474, + "learning_rate": 9.584015078734395e-05, + "loss": 0.6942, + "step": 165 + }, + { + "epoch": 0.5312, + "grad_norm": 0.370823957452414, + "learning_rate": 9.480103258380198e-05, + "loss": 0.684, + "step": 166 + }, + { + "epoch": 0.5344, + "grad_norm": 0.3626666194137186, + "learning_rate": 9.376247697936719e-05, + "loss": 0.6576, + "step": 167 + }, + { + "epoch": 0.5376, + "grad_norm": 0.39717099148907603, + "learning_rate": 9.272459635990562e-05, + "loss": 0.6711, + "step": 168 + }, + { + "epoch": 0.5408, + "grad_norm": 0.3547429214150075, + "learning_rate": 9.168750303824084e-05, + "loss": 0.6925, + "step": 169 + }, + { + "epoch": 0.544, + "grad_norm": 0.35141948293081565, + "learning_rate": 9.065130924199998e-05, + "loss": 0.728, + "step": 170 + }, + { + "epoch": 0.5472, + "grad_norm": 0.3807425547952564, + "learning_rate": 8.961612710146934e-05, + "loss": 0.7231, + "step": 171 + }, + { + "epoch": 0.5504, + "grad_norm": 0.370750070649178, + "learning_rate": 8.858206863746018e-05, + "loss": 0.7071, + "step": 172 + }, + { + "epoch": 0.5536, + "grad_norm": 0.36587979218630007, + "learning_rate": 8.754924574918675e-05, + "loss": 0.6576, + "step": 173 + }, + { + "epoch": 0.5568, + "grad_norm": 0.3756124782863467, + "learning_rate": 8.651777020215712e-05, + "loss": 0.7395, + "step": 174 + }, + { + "epoch": 0.56, + "grad_norm": 0.3454848482350315, + "learning_rate": 8.548775361607872e-05, + "loss": 0.7006, + "step": 175 + }, + { + "epoch": 0.5632, + "grad_norm": 0.3797633014006375, + "learning_rate": 8.445930745277953e-05, + "loss": 0.7259, + "step": 176 + }, + { + "epoch": 0.5664, + "grad_norm": 0.3707990015793451, + "learning_rate": 8.343254300414628e-05, + "loss": 0.6494, + "step": 177 + }, + { + "epoch": 0.5696, + "grad_norm": 0.3359321829538724, + "learning_rate": 8.240757138008149e-05, + "loss": 0.6849, + "step": 178 + }, + { + "epoch": 0.5728, + "grad_norm": 0.4042981638968137, + "learning_rate": 8.138450349647936e-05, + "loss": 0.681, + "step": 179 + }, + { + "epoch": 0.576, + "grad_norm": 0.3350977200257071, + "learning_rate": 8.036345006322359e-05, + "loss": 0.6663, + "step": 180 + }, + { + "epoch": 0.5792, + "grad_norm": 0.3935415912212897, + "learning_rate": 7.934452157220694e-05, + "loss": 0.7659, + "step": 181 + }, + { + "epoch": 0.5824, + "grad_norm": 0.36189997876991004, + "learning_rate": 7.832782828537437e-05, + "loss": 0.6555, + "step": 182 + }, + { + "epoch": 0.5856, + "grad_norm": 0.35448506517236716, + "learning_rate": 7.731348022279134e-05, + "loss": 0.7298, + "step": 183 + }, + { + "epoch": 0.5888, + "grad_norm": 0.35106530961144455, + "learning_rate": 7.630158715073813e-05, + "loss": 0.6677, + "step": 184 + }, + { + "epoch": 0.592, + "grad_norm": 0.37542595427026854, + "learning_rate": 7.52922585698315e-05, + "loss": 0.6948, + "step": 185 + }, + { + "epoch": 0.5952, + "grad_norm": 0.36209283497638994, + "learning_rate": 7.428560370317542e-05, + "loss": 0.6903, + "step": 186 + }, + { + "epoch": 0.5984, + "grad_norm": 0.3623486443522771, + "learning_rate": 7.328173148454151e-05, + "loss": 0.7069, + "step": 187 + }, + { + "epoch": 0.6016, + "grad_norm": 0.3531878449805604, + "learning_rate": 7.228075054658096e-05, + "loss": 0.7003, + "step": 188 + }, + { + "epoch": 0.6048, + "grad_norm": 0.36007187478065084, + "learning_rate": 7.1282769209069e-05, + "loss": 0.7155, + "step": 189 + }, + { + "epoch": 0.608, + "grad_norm": 0.3526918728137452, + "learning_rate": 7.028789546718326e-05, + "loss": 0.6676, + "step": 190 + }, + { + "epoch": 0.6112, + "grad_norm": 0.3505800871386953, + "learning_rate": 6.929623697981718e-05, + "loss": 0.6715, + "step": 191 + }, + { + "epoch": 0.6144, + "grad_norm": 0.3554299911235001, + "learning_rate": 6.830790105792973e-05, + "loss": 0.6703, + "step": 192 + }, + { + "epoch": 0.6176, + "grad_norm": 0.3627855610891972, + "learning_rate": 6.732299465293322e-05, + "loss": 0.6662, + "step": 193 + }, + { + "epoch": 0.6208, + "grad_norm": 0.3642488950050494, + "learning_rate": 6.63416243451194e-05, + "loss": 0.7045, + "step": 194 + }, + { + "epoch": 0.624, + "grad_norm": 0.3451846156452362, + "learning_rate": 6.536389633212609e-05, + "loss": 0.6755, + "step": 195 + }, + { + "epoch": 0.6272, + "grad_norm": 0.35162369090454626, + "learning_rate": 6.43899164174453e-05, + "loss": 0.7042, + "step": 196 + }, + { + "epoch": 0.6304, + "grad_norm": 0.33039628040857877, + "learning_rate": 6.341978999897365e-05, + "loss": 0.6818, + "step": 197 + }, + { + "epoch": 0.6336, + "grad_norm": 0.36874697217925245, + "learning_rate": 6.245362205760704e-05, + "loss": 0.729, + "step": 198 + }, + { + "epoch": 0.6368, + "grad_norm": 0.35504449153403855, + "learning_rate": 6.149151714588009e-05, + "loss": 0.736, + "step": 199 + }, + { + "epoch": 0.64, + "grad_norm": 0.3505320135825363, + "learning_rate": 6.053357937665237e-05, + "loss": 0.7133, + "step": 200 + }, + { + "epoch": 0.6432, + "grad_norm": 0.3340617641467672, + "learning_rate": 5.957991241184184e-05, + "loss": 0.6806, + "step": 201 + }, + { + "epoch": 0.6464, + "grad_norm": 0.3578847181945257, + "learning_rate": 5.863061945120719e-05, + "loss": 0.6431, + "step": 202 + }, + { + "epoch": 0.6496, + "grad_norm": 0.3489695240441819, + "learning_rate": 5.768580322118034e-05, + "loss": 0.6833, + "step": 203 + }, + { + "epoch": 0.6528, + "grad_norm": 0.3506625777470091, + "learning_rate": 5.6745565963749925e-05, + "loss": 0.7078, + "step": 204 + }, + { + "epoch": 0.656, + "grad_norm": 0.3474445070349821, + "learning_rate": 5.5810009425397294e-05, + "loss": 0.6889, + "step": 205 + }, + { + "epoch": 0.6592, + "grad_norm": 0.32920237707492495, + "learning_rate": 5.487923484608629e-05, + "loss": 0.6646, + "step": 206 + }, + { + "epoch": 0.6624, + "grad_norm": 0.3334480843491693, + "learning_rate": 5.395334294830765e-05, + "loss": 0.6501, + "step": 207 + }, + { + "epoch": 0.6656, + "grad_norm": 0.3505039143876203, + "learning_rate": 5.3032433926179395e-05, + "loss": 0.7206, + "step": 208 + }, + { + "epoch": 0.6688, + "grad_norm": 0.3222047814682131, + "learning_rate": 5.211660743460458e-05, + "loss": 0.6638, + "step": 209 + }, + { + "epoch": 0.672, + "grad_norm": 0.4160786394442177, + "learning_rate": 5.1205962578487155e-05, + "loss": 0.7626, + "step": 210 + }, + { + "epoch": 0.6752, + "grad_norm": 0.34935480168303545, + "learning_rate": 5.030059790200756e-05, + "loss": 0.6544, + "step": 211 + }, + { + "epoch": 0.6784, + "grad_norm": 0.3690962579344299, + "learning_rate": 4.940061137795876e-05, + "loss": 0.7605, + "step": 212 + }, + { + "epoch": 0.6816, + "grad_norm": 0.34372036007065915, + "learning_rate": 4.850610039714444e-05, + "loss": 0.706, + "step": 213 + }, + { + "epoch": 0.6848, + "grad_norm": 0.35913661977043404, + "learning_rate": 4.761716175783989e-05, + "loss": 0.6788, + "step": 214 + }, + { + "epoch": 0.688, + "grad_norm": 0.3726112942467802, + "learning_rate": 4.673389165531714e-05, + "loss": 0.7114, + "step": 215 + }, + { + "epoch": 0.6912, + "grad_norm": 0.33458843319227594, + "learning_rate": 4.585638567143529e-05, + "loss": 0.6373, + "step": 216 + }, + { + "epoch": 0.6944, + "grad_norm": 0.34431466922844606, + "learning_rate": 4.498473876429726e-05, + "loss": 0.6884, + "step": 217 + }, + { + "epoch": 0.6976, + "grad_norm": 0.3478733203429474, + "learning_rate": 4.411904525797408e-05, + "loss": 0.6962, + "step": 218 + }, + { + "epoch": 0.7008, + "grad_norm": 0.33823956689505236, + "learning_rate": 4.325939883229766e-05, + "loss": 0.6658, + "step": 219 + }, + { + "epoch": 0.704, + "grad_norm": 0.35445169110428304, + "learning_rate": 4.240589251272342e-05, + "loss": 0.7168, + "step": 220 + }, + { + "epoch": 0.7072, + "grad_norm": 0.3659639754863401, + "learning_rate": 4.155861866026364e-05, + "loss": 0.6614, + "step": 221 + }, + { + "epoch": 0.7104, + "grad_norm": 0.358233978795873, + "learning_rate": 4.071766896149273e-05, + "loss": 0.6772, + "step": 222 + }, + { + "epoch": 0.7136, + "grad_norm": 0.35303470161619455, + "learning_rate": 3.988313441862553e-05, + "loss": 0.6174, + "step": 223 + }, + { + "epoch": 0.7168, + "grad_norm": 0.3370222333384653, + "learning_rate": 3.9055105339669595e-05, + "loss": 0.6602, + "step": 224 + }, + { + "epoch": 0.72, + "grad_norm": 0.36422659550593695, + "learning_rate": 3.823367132865265e-05, + "loss": 0.7251, + "step": 225 + }, + { + "epoch": 0.7232, + "grad_norm": 0.3834308718279013, + "learning_rate": 3.741892127592625e-05, + "loss": 0.6898, + "step": 226 + }, + { + "epoch": 0.7264, + "grad_norm": 0.3644135006275037, + "learning_rate": 3.6610943348546526e-05, + "loss": 0.6253, + "step": 227 + }, + { + "epoch": 0.7296, + "grad_norm": 0.3370974447420048, + "learning_rate": 3.580982498073344e-05, + "loss": 0.6417, + "step": 228 + }, + { + "epoch": 0.7328, + "grad_norm": 0.34431625767746926, + "learning_rate": 3.501565286440914e-05, + "loss": 0.678, + "step": 229 + }, + { + "epoch": 0.736, + "grad_norm": 0.34551979978352365, + "learning_rate": 3.422851293981676e-05, + "loss": 0.6871, + "step": 230 + }, + { + "epoch": 0.7392, + "grad_norm": 0.3545045862376799, + "learning_rate": 3.3448490386220355e-05, + "loss": 0.6453, + "step": 231 + }, + { + "epoch": 0.7424, + "grad_norm": 0.32066403890900497, + "learning_rate": 3.2675669612687565e-05, + "loss": 0.6707, + "step": 232 + }, + { + "epoch": 0.7456, + "grad_norm": 0.36734608391442103, + "learning_rate": 3.191013424895536e-05, + "loss": 0.6923, + "step": 233 + }, + { + "epoch": 0.7488, + "grad_norm": 0.3547951302678682, + "learning_rate": 3.115196713638e-05, + "loss": 0.6761, + "step": 234 + }, + { + "epoch": 0.752, + "grad_norm": 0.5267474819572611, + "learning_rate": 3.040125031897264e-05, + "loss": 0.6813, + "step": 235 + }, + { + "epoch": 0.7552, + "grad_norm": 0.350204219412763, + "learning_rate": 2.9658065034520978e-05, + "loss": 0.7204, + "step": 236 + }, + { + "epoch": 0.7584, + "grad_norm": 0.346890632229907, + "learning_rate": 2.892249170579826e-05, + "loss": 0.6814, + "step": 237 + }, + { + "epoch": 0.7616, + "grad_norm": 0.3865269459158674, + "learning_rate": 2.8194609931860316e-05, + "loss": 0.6655, + "step": 238 + }, + { + "epoch": 0.7648, + "grad_norm": 0.3564641400714697, + "learning_rate": 2.7474498479432087e-05, + "loss": 0.6694, + "step": 239 + }, + { + "epoch": 0.768, + "grad_norm": 0.34198572845873776, + "learning_rate": 2.6762235274383772e-05, + "loss": 0.7187, + "step": 240 + }, + { + "epoch": 0.7712, + "grad_norm": 0.3558838016655507, + "learning_rate": 2.6057897393298324e-05, + "loss": 0.6815, + "step": 241 + }, + { + "epoch": 0.7744, + "grad_norm": 0.3284662213794393, + "learning_rate": 2.536156105513062e-05, + "loss": 0.6463, + "step": 242 + }, + { + "epoch": 0.7776, + "grad_norm": 0.38476695777550485, + "learning_rate": 2.4673301612959654e-05, + "loss": 0.7077, + "step": 243 + }, + { + "epoch": 0.7808, + "grad_norm": 0.354596469884446, + "learning_rate": 2.399319354583418e-05, + "loss": 0.7029, + "step": 244 + }, + { + "epoch": 0.784, + "grad_norm": 0.3317849510157482, + "learning_rate": 2.3321310450713062e-05, + "loss": 0.6491, + "step": 245 + }, + { + "epoch": 0.7872, + "grad_norm": 0.36083308267005093, + "learning_rate": 2.265772503450122e-05, + "loss": 0.6873, + "step": 246 + }, + { + "epoch": 0.7904, + "grad_norm": 0.3409002629967509, + "learning_rate": 2.2002509106181624e-05, + "loss": 0.6948, + "step": 247 + }, + { + "epoch": 0.7936, + "grad_norm": 0.3427746040952621, + "learning_rate": 2.1355733569044635e-05, + "loss": 0.6543, + "step": 248 + }, + { + "epoch": 0.7968, + "grad_norm": 0.3410742412711408, + "learning_rate": 2.0717468413015283e-05, + "loss": 0.6644, + "step": 249 + }, + { + "epoch": 0.8, + "grad_norm": 0.34794942732730494, + "learning_rate": 2.008778270707944e-05, + "loss": 0.6971, + "step": 250 + }, + { + "epoch": 0.8032, + "grad_norm": 0.3577136892277511, + "learning_rate": 1.946674459180955e-05, + "loss": 0.6797, + "step": 251 + }, + { + "epoch": 0.8064, + "grad_norm": 0.38624710187607797, + "learning_rate": 1.8854421271990964e-05, + "loss": 0.6671, + "step": 252 + }, + { + "epoch": 0.8096, + "grad_norm": 0.3486373264972873, + "learning_rate": 1.8250879009349398e-05, + "loss": 0.7327, + "step": 253 + }, + { + "epoch": 0.8128, + "grad_norm": 0.3387786699029933, + "learning_rate": 1.7656183115380577e-05, + "loss": 0.6671, + "step": 254 + }, + { + "epoch": 0.816, + "grad_norm": 0.36571687228097544, + "learning_rate": 1.707039794428259e-05, + "loss": 0.6798, + "step": 255 + }, + { + "epoch": 0.8192, + "grad_norm": 0.3588033229242822, + "learning_rate": 1.649358688599191e-05, + "loss": 0.7403, + "step": 256 + }, + { + "epoch": 0.8224, + "grad_norm": 0.336709736386362, + "learning_rate": 1.5925812359323745e-05, + "loss": 0.6639, + "step": 257 + }, + { + "epoch": 0.8256, + "grad_norm": 0.391641085443981, + "learning_rate": 1.5367135805217458e-05, + "loss": 0.6474, + "step": 258 + }, + { + "epoch": 0.8288, + "grad_norm": 0.34919556737995416, + "learning_rate": 1.4817617680087825e-05, + "loss": 0.6833, + "step": 259 + }, + { + "epoch": 0.832, + "grad_norm": 0.34360019996565294, + "learning_rate": 1.4277317449282834e-05, + "loss": 0.6711, + "step": 260 + }, + { + "epoch": 0.8352, + "grad_norm": 0.33826189609957946, + "learning_rate": 1.3746293580648717e-05, + "loss": 0.6549, + "step": 261 + }, + { + "epoch": 0.8384, + "grad_norm": 0.33743588149216924, + "learning_rate": 1.3224603538202929e-05, + "loss": 0.658, + "step": 262 + }, + { + "epoch": 0.8416, + "grad_norm": 0.38110422068372724, + "learning_rate": 1.2712303775915802e-05, + "loss": 0.6688, + "step": 263 + }, + { + "epoch": 0.8448, + "grad_norm": 0.3538995730098079, + "learning_rate": 1.220944973160133e-05, + "loss": 0.7222, + "step": 264 + }, + { + "epoch": 0.848, + "grad_norm": 0.3318938752544763, + "learning_rate": 1.1716095820918216e-05, + "loss": 0.6636, + "step": 265 + }, + { + "epoch": 0.8512, + "grad_norm": 0.3447082958846042, + "learning_rate": 1.1232295431481222e-05, + "loss": 0.636, + "step": 266 + }, + { + "epoch": 0.8544, + "grad_norm": 0.35066462511198726, + "learning_rate": 1.0758100917083991e-05, + "loss": 0.6468, + "step": 267 + }, + { + "epoch": 0.8576, + "grad_norm": 0.37196499706566954, + "learning_rate": 1.0293563592033595e-05, + "loss": 0.6735, + "step": 268 + }, + { + "epoch": 0.8608, + "grad_norm": 0.3527260375754371, + "learning_rate": 9.838733725597615e-06, + "loss": 0.6666, + "step": 269 + }, + { + "epoch": 0.864, + "grad_norm": 0.3513942177687219, + "learning_rate": 9.393660536564408e-06, + "loss": 0.6958, + "step": 270 + }, + { + "epoch": 0.8672, + "grad_norm": 0.3217678276961582, + "learning_rate": 8.958392187916841e-06, + "loss": 0.6256, + "step": 271 + }, + { + "epoch": 0.8704, + "grad_norm": 0.32770407204810276, + "learning_rate": 8.532975781620512e-06, + "loss": 0.6491, + "step": 272 + }, + { + "epoch": 0.8736, + "grad_norm": 0.32765559383846565, + "learning_rate": 8.117457353526625e-06, + "loss": 0.6773, + "step": 273 + }, + { + "epoch": 0.8768, + "grad_norm": 0.3794195535065363, + "learning_rate": 7.711881868390291e-06, + "loss": 0.6887, + "step": 274 + }, + { + "epoch": 0.88, + "grad_norm": 0.37106110605162884, + "learning_rate": 7.3162932150046885e-06, + "loss": 0.6741, + "step": 275 + }, + { + "epoch": 0.8832, + "grad_norm": 0.35683941894435456, + "learning_rate": 6.930734201451816e-06, + "loss": 0.6891, + "step": 276 + }, + { + "epoch": 0.8864, + "grad_norm": 0.35203533584308394, + "learning_rate": 6.555246550469907e-06, + "loss": 0.6577, + "step": 277 + }, + { + "epoch": 0.8896, + "grad_norm": 0.337996015194583, + "learning_rate": 6.189870894938587e-06, + "loss": 0.6332, + "step": 278 + }, + { + "epoch": 0.8928, + "grad_norm": 0.35983244647991486, + "learning_rate": 5.834646773481811e-06, + "loss": 0.7085, + "step": 279 + }, + { + "epoch": 0.896, + "grad_norm": 0.36214840887826166, + "learning_rate": 5.489612626189245e-06, + "loss": 0.6921, + "step": 280 + }, + { + "epoch": 0.8992, + "grad_norm": 0.3585103763875682, + "learning_rate": 5.154805790456485e-06, + "loss": 0.7095, + "step": 281 + }, + { + "epoch": 0.9024, + "grad_norm": 0.34719063880984175, + "learning_rate": 4.830262496944693e-06, + "loss": 0.7, + "step": 282 + }, + { + "epoch": 0.9056, + "grad_norm": 0.3950007422079345, + "learning_rate": 4.516017865659949e-06, + "loss": 0.6532, + "step": 283 + }, + { + "epoch": 0.9088, + "grad_norm": 0.3588343725626154, + "learning_rate": 4.21210590215273e-06, + "loss": 0.68, + "step": 284 + }, + { + "epoch": 0.912, + "grad_norm": 0.35916560967802036, + "learning_rate": 3.918559493838114e-06, + "loss": 0.7225, + "step": 285 + }, + { + "epoch": 0.9152, + "grad_norm": 0.3532074162031371, + "learning_rate": 3.6354104064368566e-06, + "loss": 0.6726, + "step": 286 + }, + { + "epoch": 0.9184, + "grad_norm": 0.3684089516670794, + "learning_rate": 3.3626892805379562e-06, + "loss": 0.6947, + "step": 287 + }, + { + "epoch": 0.9216, + "grad_norm": 0.3505598227413042, + "learning_rate": 3.100425628282899e-06, + "loss": 0.6524, + "step": 288 + }, + { + "epoch": 0.9248, + "grad_norm": 0.3480286691290391, + "learning_rate": 2.848647830172024e-06, + "loss": 0.699, + "step": 289 + }, + { + "epoch": 0.928, + "grad_norm": 0.3391814941929011, + "learning_rate": 2.607383131993424e-06, + "loss": 0.6357, + "step": 290 + }, + { + "epoch": 0.9312, + "grad_norm": 0.3335239287966289, + "learning_rate": 2.3766576418745022e-06, + "loss": 0.6366, + "step": 291 + }, + { + "epoch": 0.9344, + "grad_norm": 0.3385790402860968, + "learning_rate": 2.1564963274568027e-06, + "loss": 0.6937, + "step": 292 + }, + { + "epoch": 0.9376, + "grad_norm": 0.3657441278236026, + "learning_rate": 1.9469230131940907e-06, + "loss": 0.6638, + "step": 293 + }, + { + "epoch": 0.9408, + "grad_norm": 0.36281028704902224, + "learning_rate": 1.7479603777742938e-06, + "loss": 0.6588, + "step": 294 + }, + { + "epoch": 0.944, + "grad_norm": 0.3436454356363298, + "learning_rate": 1.559629951665298e-06, + "loss": 0.6811, + "step": 295 + }, + { + "epoch": 0.9472, + "grad_norm": 0.3618953503973711, + "learning_rate": 1.3819521147851123e-06, + "loss": 0.7065, + "step": 296 + }, + { + "epoch": 0.9504, + "grad_norm": 0.35426892334235555, + "learning_rate": 1.2149460942964098e-06, + "loss": 0.6672, + "step": 297 + }, + { + "epoch": 0.9536, + "grad_norm": 0.34139279273687584, + "learning_rate": 1.05862996252597e-06, + "loss": 0.6287, + "step": 298 + }, + { + "epoch": 0.9568, + "grad_norm": 0.3760086821701436, + "learning_rate": 9.130206350089765e-07, + "loss": 0.6999, + "step": 299 + }, + { + "epoch": 0.96, + "grad_norm": 0.3508003120606531, + "learning_rate": 7.781338686584927e-07, + "loss": 0.6651, + "step": 300 + }, + { + "epoch": 0.9632, + "grad_norm": 0.35184452760607765, + "learning_rate": 6.539842600603918e-07, + "loss": 0.6346, + "step": 301 + }, + { + "epoch": 0.9664, + "grad_norm": 0.38870947092895375, + "learning_rate": 5.405852438937764e-07, + "loss": 0.6777, + "step": 302 + }, + { + "epoch": 0.9696, + "grad_norm": 0.330375968306909, + "learning_rate": 4.3794909147720773e-07, + "loss": 0.6571, + "step": 303 + }, + { + "epoch": 0.9728, + "grad_norm": 0.33472574510023134, + "learning_rate": 3.4608690944071263e-07, + "loss": 0.6433, + "step": 304 + }, + { + "epoch": 0.976, + "grad_norm": 0.33577366769732253, + "learning_rate": 2.6500863852395584e-07, + "loss": 0.6649, + "step": 305 + }, + { + "epoch": 0.9792, + "grad_norm": 0.39028037485639394, + "learning_rate": 1.947230525005006e-07, + "loss": 0.657, + "step": 306 + }, + { + "epoch": 0.9824, + "grad_norm": 0.3552212376429523, + "learning_rate": 1.3523775722834587e-07, + "loss": 0.6521, + "step": 307 + }, + { + "epoch": 0.9856, + "grad_norm": 0.3481211606712931, + "learning_rate": 8.655918982689581e-08, + "loss": 0.6883, + "step": 308 + }, + { + "epoch": 0.9888, + "grad_norm": 0.39425813369040086, + "learning_rate": 4.8692617980350406e-08, + "loss": 0.762, + "step": 309 + }, + { + "epoch": 0.992, + "grad_norm": 0.320909110748154, + "learning_rate": 2.164213936770576e-08, + "loss": 0.6094, + "step": 310 + }, + { + "epoch": 0.9952, + "grad_norm": 0.3604948954854283, + "learning_rate": 5.410681219286673e-09, + "loss": 0.6643, + "step": 311 + }, + { + "epoch": 0.9984, + "grad_norm": 0.336643150141937, + "learning_rate": 0.0, + "loss": 0.7235, + "step": 312 + }, + { + "epoch": 0.9984, + "step": 312, + "total_flos": 276143348056064.0, + "train_loss": 0.7382912295751083, + "train_runtime": 4947.3523, + "train_samples_per_second": 1.011, + "train_steps_per_second": 0.063 + } + ], + "logging_steps": 1.0, + "max_steps": 312, + "num_input_tokens_seen": 0, + "num_train_epochs": 1, + "save_steps": 500, + "stateful_callbacks": { + "TrainerControl": { + "args": { + "should_epoch_stop": false, + "should_evaluate": false, + "should_log": false, + "should_save": false, + "should_training_stop": false + }, + "attributes": {} + } + }, + "total_flos": 276143348056064.0, + "train_batch_size": 8, + "trial_name": null, + "trial_params": null +} diff --git a/single_dataset/img2json/VideoGameBunny_v1_1-Llama-3-8B-V-img2json_dataset_60000_epochs_1_lora/README.md b/single_dataset/img2json/VideoGameBunny_v1_1-Llama-3-8B-V-img2json_dataset_60000_epochs_1_lora/README.md new file mode 100644 index 0000000000000000000000000000000000000000..4f291d8d6fdb82fcff274f83bcc52b834347797c --- /dev/null +++ b/single_dataset/img2json/VideoGameBunny_v1_1-Llama-3-8B-V-img2json_dataset_60000_epochs_1_lora/README.md @@ -0,0 +1,202 @@ +--- +library_name: peft +base_model: ./weights/Bunny-v1_1-Llama-3-8B-V +--- + +# Model Card for Model ID + + + + + +## Model Details + +### Model Description + + + + + +- **Developed by:** [More Information Needed] +- **Funded by [optional]:** [More Information Needed] +- **Shared by [optional]:** [More Information Needed] +- **Model type:** [More Information Needed] +- **Language(s) (NLP):** [More Information Needed] +- **License:** [More Information Needed] +- **Finetuned from model [optional]:** [More Information Needed] + +### Model Sources [optional] + + + +- **Repository:** [More Information Needed] +- **Paper [optional]:** [More Information Needed] +- **Demo [optional]:** [More Information Needed] + +## Uses + + + +### Direct Use + + + +[More Information Needed] + +### Downstream Use [optional] + + + +[More Information Needed] + +### Out-of-Scope Use + + + +[More Information Needed] + +## Bias, Risks, and Limitations + + + +[More Information Needed] + +### Recommendations + + + +Users (both direct and downstream) should be made aware of the risks, biases and limitations of the model. More information needed for further recommendations. + +## How to Get Started with the Model + +Use the code below to get started with the model. + +[More Information Needed] + +## Training Details + +### Training Data + + + +[More Information Needed] + +### Training Procedure + + + +#### Preprocessing [optional] + +[More Information Needed] + + +#### Training Hyperparameters + +- **Training regime:** [More Information Needed] + +#### Speeds, Sizes, Times [optional] + + + +[More Information Needed] + +## Evaluation + + + +### Testing Data, Factors & Metrics + +#### Testing Data + + + +[More Information Needed] + +#### Factors + + + +[More Information Needed] + +#### Metrics + + + +[More Information Needed] + +### Results + +[More Information Needed] + +#### Summary + + + +## Model Examination [optional] + + + +[More Information Needed] + +## Environmental Impact + + + +Carbon emissions can be estimated using the [Machine Learning Impact calculator](https://mlco2.github.io/impact#compute) presented in [Lacoste et al. (2019)](https://arxiv.org/abs/1910.09700). + +- **Hardware Type:** [More Information Needed] +- **Hours used:** [More Information Needed] +- **Cloud Provider:** [More Information Needed] +- **Compute Region:** [More Information Needed] +- **Carbon Emitted:** [More Information Needed] + +## Technical Specifications [optional] + +### Model Architecture and Objective + +[More Information Needed] + +### Compute Infrastructure + +[More Information Needed] + +#### Hardware + +[More Information Needed] + +#### Software + +[More Information Needed] + +## Citation [optional] + + + +**BibTeX:** + +[More Information Needed] + +**APA:** + +[More Information Needed] + +## Glossary [optional] + + + +[More Information Needed] + +## More Information [optional] + +[More Information Needed] + +## Model Card Authors [optional] + +[More Information Needed] + +## Model Card Contact + +[More Information Needed] +### Framework versions + +- PEFT 0.11.1 \ No newline at end of file diff --git a/single_dataset/img2json/VideoGameBunny_v1_1-Llama-3-8B-V-img2json_dataset_60000_epochs_1_lora/adapter_config.json b/single_dataset/img2json/VideoGameBunny_v1_1-Llama-3-8B-V-img2json_dataset_60000_epochs_1_lora/adapter_config.json new file mode 100644 index 0000000000000000000000000000000000000000..530a8d474ec8062e65791555dd78697f3fcb36b2 --- /dev/null +++ b/single_dataset/img2json/VideoGameBunny_v1_1-Llama-3-8B-V-img2json_dataset_60000_epochs_1_lora/adapter_config.json @@ -0,0 +1,34 @@ +{ + "alpha_pattern": {}, + "auto_mapping": null, + "base_model_name_or_path": "./weights/Bunny-v1_1-Llama-3-8B-V", + "bias": "none", + "fan_in_fan_out": false, + "inference_mode": true, + "init_lora_weights": true, + "layer_replication": null, + "layers_pattern": null, + "layers_to_transform": null, + "loftq_config": {}, + "lora_alpha": 256, + "lora_dropout": 0.1, + "megatron_config": null, + "megatron_core": "megatron.core", + "modules_to_save": null, + "peft_type": "LORA", + "r": 128, + "rank_pattern": {}, + "revision": null, + "target_modules": [ + "gate_proj", + "v_proj", + "o_proj", + "k_proj", + "q_proj", + "down_proj", + "up_proj" + ], + "task_type": "CAUSAL_LM", + "use_dora": false, + "use_rslora": false +} \ No newline at end of file diff --git a/single_dataset/img2json/VideoGameBunny_v1_1-Llama-3-8B-V-img2json_dataset_60000_epochs_1_lora/adapter_model.safetensors b/single_dataset/img2json/VideoGameBunny_v1_1-Llama-3-8B-V-img2json_dataset_60000_epochs_1_lora/adapter_model.safetensors new file mode 100644 index 0000000000000000000000000000000000000000..e978324e8a0d889186468958049ed44d7862acea --- /dev/null +++ b/single_dataset/img2json/VideoGameBunny_v1_1-Llama-3-8B-V-img2json_dataset_60000_epochs_1_lora/adapter_model.safetensors @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:372f73f96faf018a759b81a84a2a496fead382c96067b1a3e929827d989900dc +size 671150064 diff --git a/single_dataset/img2json/VideoGameBunny_v1_1-Llama-3-8B-V-img2json_dataset_60000_epochs_1_lora/config.json b/single_dataset/img2json/VideoGameBunny_v1_1-Llama-3-8B-V-img2json_dataset_60000_epochs_1_lora/config.json new file mode 100644 index 0000000000000000000000000000000000000000..ee7ed86f4ff6f3381a6bb24981934be89aa376e7 --- /dev/null +++ b/single_dataset/img2json/VideoGameBunny_v1_1-Llama-3-8B-V-img2json_dataset_60000_epochs_1_lora/config.json @@ -0,0 +1,45 @@ +{ + "_name_or_path": "./weights/Bunny-v1_1-Llama-3-8B-V", + "architectures": [ + "BunnyLlamaForCausalLM" + ], + "attention_bias": false, + "attention_dropout": 0.0, + "auto_map": { + "AutoConfig": "configuration_bunny_llama.BunnyLlamaConfig", + "AutoModelForCausalLM": "modeling_bunny_llama.BunnyLlamaForCausalLM" + }, + "bos_token_id": 128000, + "continuous_training": false, + "eos_token_id": 128001, + "freeze_mm_mlp_adapter": false, + "hidden_act": "silu", + "hidden_size": 4096, + "image_aspect_ratio": "pad", + "initializer_range": 0.02, + "intermediate_size": 14336, + "max_position_embeddings": 8192, + "mm_hidden_size": 3456, + "mm_projector_lr": null, + "mm_projector_type": "mlp2x_gelu", + "mm_vision_tower": "./weights/siglip-so400m-patch14-384", + "model_type": "bunny-llama", + "num_attention_heads": 32, + "num_hidden_layers": 32, + "num_key_value_heads": 8, + "pretraining_tp": 1, + "rms_norm_eps": 1e-05, + "rope_scaling": null, + "rope_theta": 500000.0, + "tie_word_embeddings": false, + "tokenizer_model_max_length": 2048, + "tokenizer_padding_side": "right", + "torch_dtype": "float16", + "transformers_version": "4.41.2", + "tune_mm_mlp_adapter": false, + "unfreeze_vision_tower": true, + "use_cache": true, + "use_mm_proj": true, + "use_s2": true, + "vocab_size": 128256 +} diff --git a/single_dataset/img2json/VideoGameBunny_v1_1-Llama-3-8B-V-img2json_dataset_60000_epochs_1_lora/non_lora_trainables.bin b/single_dataset/img2json/VideoGameBunny_v1_1-Llama-3-8B-V-img2json_dataset_60000_epochs_1_lora/non_lora_trainables.bin new file mode 100644 index 0000000000000000000000000000000000000000..b0fcbccdbaca7843a841ca9ed8c3f221aa639ac3 --- /dev/null +++ b/single_dataset/img2json/VideoGameBunny_v1_1-Llama-3-8B-V-img2json_dataset_60000_epochs_1_lora/non_lora_trainables.bin @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:6eedbab52c520cde50cc465fb01eb8045c76bae390d610723aace3f74fe1a71e +size 918507402 diff --git a/single_dataset/img2json/VideoGameBunny_v1_1-Llama-3-8B-V-img2json_dataset_60000_epochs_1_lora/trainer_state.json b/single_dataset/img2json/VideoGameBunny_v1_1-Llama-3-8B-V-img2json_dataset_60000_epochs_1_lora/trainer_state.json new file mode 100644 index 0000000000000000000000000000000000000000..a622ce67291f4181cb2dd5b8c0e769b779173e4b --- /dev/null +++ b/single_dataset/img2json/VideoGameBunny_v1_1-Llama-3-8B-V-img2json_dataset_60000_epochs_1_lora/trainer_state.json @@ -0,0 +1,26292 @@ +{ + "best_metric": null, + "best_model_checkpoint": null, + "epoch": 1.0, + "eval_steps": 500, + "global_step": 3750, + "is_hyper_param_search": false, + "is_local_process_zero": true, + "is_world_process_zero": true, + "log_history": [ + { + "epoch": 0.0002666666666666667, + "grad_norm": 1.264853200757946, + "learning_rate": 1.7699115044247788e-06, + "loss": 1.5974, + "step": 1 + }, + { + "epoch": 0.0005333333333333334, + "grad_norm": 1.217457875450062, + "learning_rate": 3.5398230088495575e-06, + "loss": 1.4732, + "step": 2 + }, + { + "epoch": 0.0008, + "grad_norm": 1.259966139048936, + "learning_rate": 5.3097345132743365e-06, + "loss": 1.5691, + "step": 3 + }, + { + "epoch": 0.0010666666666666667, + "grad_norm": 1.2505240398961883, + "learning_rate": 7.079646017699115e-06, + "loss": 1.5631, + "step": 4 + }, + { + "epoch": 0.0013333333333333333, + "grad_norm": 1.1873132467629952, + "learning_rate": 8.849557522123894e-06, + "loss": 1.5566, + "step": 5 + }, + { + "epoch": 0.0016, + "grad_norm": 1.083024669118219, + "learning_rate": 1.0619469026548673e-05, + "loss": 1.514, + "step": 6 + }, + { + "epoch": 0.0018666666666666666, + "grad_norm": 0.9670086918795727, + "learning_rate": 1.2389380530973452e-05, + "loss": 1.496, + "step": 7 + }, + { + "epoch": 0.0021333333333333334, + "grad_norm": 1.034225839852347, + "learning_rate": 1.415929203539823e-05, + "loss": 1.4379, + "step": 8 + }, + { + "epoch": 0.0024, + "grad_norm": 0.9507313498380748, + "learning_rate": 1.592920353982301e-05, + "loss": 1.3717, + "step": 9 + }, + { + "epoch": 0.0026666666666666666, + "grad_norm": 0.983287200714607, + "learning_rate": 1.7699115044247787e-05, + "loss": 1.3216, + "step": 10 + }, + { + "epoch": 0.0029333333333333334, + "grad_norm": 0.9080214189446323, + "learning_rate": 1.946902654867257e-05, + "loss": 1.2906, + "step": 11 + }, + { + "epoch": 0.0032, + "grad_norm": 0.999121539250498, + "learning_rate": 2.1238938053097346e-05, + "loss": 1.1628, + "step": 12 + }, + { + "epoch": 0.0034666666666666665, + "grad_norm": 0.9694957107789066, + "learning_rate": 2.3008849557522124e-05, + "loss": 1.2058, + "step": 13 + }, + { + "epoch": 0.0037333333333333333, + "grad_norm": 1.2113099834106, + "learning_rate": 2.4778761061946905e-05, + "loss": 1.0821, + "step": 14 + }, + { + "epoch": 0.004, + "grad_norm": 0.8054362701984041, + "learning_rate": 2.6548672566371686e-05, + "loss": 1.0796, + "step": 15 + }, + { + "epoch": 0.004266666666666667, + "grad_norm": 0.8062720230664518, + "learning_rate": 2.831858407079646e-05, + "loss": 1.0231, + "step": 16 + }, + { + "epoch": 0.004533333333333334, + "grad_norm": 0.8626468497912609, + "learning_rate": 3.008849557522124e-05, + "loss": 0.9609, + "step": 17 + }, + { + "epoch": 0.0048, + "grad_norm": 0.7948400258968932, + "learning_rate": 3.185840707964602e-05, + "loss": 1.0358, + "step": 18 + }, + { + "epoch": 0.005066666666666666, + "grad_norm": 0.8202331658253242, + "learning_rate": 3.3628318584070804e-05, + "loss": 0.9203, + "step": 19 + }, + { + "epoch": 0.005333333333333333, + "grad_norm": 0.70349059381211, + "learning_rate": 3.5398230088495574e-05, + "loss": 0.9488, + "step": 20 + }, + { + "epoch": 0.0056, + "grad_norm": 0.6245560185622042, + "learning_rate": 3.716814159292036e-05, + "loss": 0.9239, + "step": 21 + }, + { + "epoch": 0.005866666666666667, + "grad_norm": 0.7032002385338322, + "learning_rate": 3.893805309734514e-05, + "loss": 0.9343, + "step": 22 + }, + { + "epoch": 0.0061333333333333335, + "grad_norm": 0.6138853340560978, + "learning_rate": 4.0707964601769914e-05, + "loss": 0.8944, + "step": 23 + }, + { + "epoch": 0.0064, + "grad_norm": 0.6252956302867735, + "learning_rate": 4.247787610619469e-05, + "loss": 0.9142, + "step": 24 + }, + { + "epoch": 0.006666666666666667, + "grad_norm": 0.5689669502910203, + "learning_rate": 4.4247787610619477e-05, + "loss": 0.8804, + "step": 25 + }, + { + "epoch": 0.006933333333333333, + "grad_norm": 0.6288100311989897, + "learning_rate": 4.601769911504425e-05, + "loss": 0.8975, + "step": 26 + }, + { + "epoch": 0.0072, + "grad_norm": 0.5525965844770411, + "learning_rate": 4.778761061946903e-05, + "loss": 0.8937, + "step": 27 + }, + { + "epoch": 0.007466666666666667, + "grad_norm": 0.5065552530181454, + "learning_rate": 4.955752212389381e-05, + "loss": 0.9136, + "step": 28 + }, + { + "epoch": 0.007733333333333333, + "grad_norm": 0.5201018248818462, + "learning_rate": 5.132743362831859e-05, + "loss": 0.8997, + "step": 29 + }, + { + "epoch": 0.008, + "grad_norm": 0.5524579564998915, + "learning_rate": 5.309734513274337e-05, + "loss": 0.8422, + "step": 30 + }, + { + "epoch": 0.008266666666666667, + "grad_norm": 0.5303079983520723, + "learning_rate": 5.486725663716814e-05, + "loss": 0.8439, + "step": 31 + }, + { + "epoch": 0.008533333333333334, + "grad_norm": 0.5348941333565699, + "learning_rate": 5.663716814159292e-05, + "loss": 0.9265, + "step": 32 + }, + { + "epoch": 0.0088, + "grad_norm": 0.48407338678075934, + "learning_rate": 5.8407079646017705e-05, + "loss": 0.8567, + "step": 33 + }, + { + "epoch": 0.009066666666666667, + "grad_norm": 0.4930009016915646, + "learning_rate": 6.017699115044248e-05, + "loss": 0.8832, + "step": 34 + }, + { + "epoch": 0.009333333333333334, + "grad_norm": 0.5059503896009777, + "learning_rate": 6.194690265486725e-05, + "loss": 0.8384, + "step": 35 + }, + { + "epoch": 0.0096, + "grad_norm": 0.4968383701366383, + "learning_rate": 6.371681415929204e-05, + "loss": 0.8314, + "step": 36 + }, + { + "epoch": 0.009866666666666666, + "grad_norm": 0.5226825692592999, + "learning_rate": 6.548672566371682e-05, + "loss": 0.8601, + "step": 37 + }, + { + "epoch": 0.010133333333333333, + "grad_norm": 0.5045947528163728, + "learning_rate": 6.725663716814161e-05, + "loss": 0.8369, + "step": 38 + }, + { + "epoch": 0.0104, + "grad_norm": 0.5293030836072474, + "learning_rate": 6.902654867256638e-05, + "loss": 0.7889, + "step": 39 + }, + { + "epoch": 0.010666666666666666, + "grad_norm": 0.47708776202713943, + "learning_rate": 7.079646017699115e-05, + "loss": 0.8008, + "step": 40 + }, + { + "epoch": 0.010933333333333333, + "grad_norm": 0.5686752073899266, + "learning_rate": 7.256637168141593e-05, + "loss": 0.8019, + "step": 41 + }, + { + "epoch": 0.0112, + "grad_norm": 0.544125533975138, + "learning_rate": 7.433628318584072e-05, + "loss": 0.8626, + "step": 42 + }, + { + "epoch": 0.011466666666666667, + "grad_norm": 0.4898347653326332, + "learning_rate": 7.610619469026549e-05, + "loss": 0.8613, + "step": 43 + }, + { + "epoch": 0.011733333333333333, + "grad_norm": 0.45839478731360894, + "learning_rate": 7.787610619469027e-05, + "loss": 0.7437, + "step": 44 + }, + { + "epoch": 0.012, + "grad_norm": 0.4901458596712785, + "learning_rate": 7.964601769911504e-05, + "loss": 0.7739, + "step": 45 + }, + { + "epoch": 0.012266666666666667, + "grad_norm": 0.48317563597167756, + "learning_rate": 8.141592920353983e-05, + "loss": 0.8318, + "step": 46 + }, + { + "epoch": 0.012533333333333334, + "grad_norm": 0.4886339551554164, + "learning_rate": 8.31858407079646e-05, + "loss": 0.8458, + "step": 47 + }, + { + "epoch": 0.0128, + "grad_norm": 0.46895080440104325, + "learning_rate": 8.495575221238938e-05, + "loss": 0.8192, + "step": 48 + }, + { + "epoch": 0.013066666666666667, + "grad_norm": 0.45837188999051376, + "learning_rate": 8.672566371681417e-05, + "loss": 0.7587, + "step": 49 + }, + { + "epoch": 0.013333333333333334, + "grad_norm": 0.5384787634584273, + "learning_rate": 8.849557522123895e-05, + "loss": 0.8482, + "step": 50 + }, + { + "epoch": 0.0136, + "grad_norm": 0.6380236500653841, + "learning_rate": 9.026548672566371e-05, + "loss": 0.8591, + "step": 51 + }, + { + "epoch": 0.013866666666666666, + "grad_norm": 0.4719389003968988, + "learning_rate": 9.20353982300885e-05, + "loss": 0.7666, + "step": 52 + }, + { + "epoch": 0.014133333333333333, + "grad_norm": 0.44310129818703503, + "learning_rate": 9.380530973451328e-05, + "loss": 0.8256, + "step": 53 + }, + { + "epoch": 0.0144, + "grad_norm": 0.44888747088563763, + "learning_rate": 9.557522123893806e-05, + "loss": 0.7907, + "step": 54 + }, + { + "epoch": 0.014666666666666666, + "grad_norm": 0.44470853697317947, + "learning_rate": 9.734513274336283e-05, + "loss": 0.7779, + "step": 55 + }, + { + "epoch": 0.014933333333333333, + "grad_norm": 0.4553984167393368, + "learning_rate": 9.911504424778762e-05, + "loss": 0.764, + "step": 56 + }, + { + "epoch": 0.0152, + "grad_norm": 0.4337441525847164, + "learning_rate": 0.00010088495575221239, + "loss": 0.8364, + "step": 57 + }, + { + "epoch": 0.015466666666666667, + "grad_norm": 0.4725986162131283, + "learning_rate": 0.00010265486725663717, + "loss": 0.846, + "step": 58 + }, + { + "epoch": 0.015733333333333332, + "grad_norm": 0.46683580635871025, + "learning_rate": 0.00010442477876106196, + "loss": 0.7912, + "step": 59 + }, + { + "epoch": 0.016, + "grad_norm": 0.4606875357871431, + "learning_rate": 0.00010619469026548674, + "loss": 0.8111, + "step": 60 + }, + { + "epoch": 0.016266666666666665, + "grad_norm": 0.4588149809457491, + "learning_rate": 0.0001079646017699115, + "loss": 0.7577, + "step": 61 + }, + { + "epoch": 0.016533333333333334, + "grad_norm": 0.4319312435830963, + "learning_rate": 0.00010973451327433629, + "loss": 0.7824, + "step": 62 + }, + { + "epoch": 0.0168, + "grad_norm": 0.4865410084194108, + "learning_rate": 0.00011150442477876106, + "loss": 0.731, + "step": 63 + }, + { + "epoch": 0.017066666666666667, + "grad_norm": 0.47980579123461414, + "learning_rate": 0.00011327433628318584, + "loss": 0.807, + "step": 64 + }, + { + "epoch": 0.017333333333333333, + "grad_norm": 0.4482407983036267, + "learning_rate": 0.00011504424778761063, + "loss": 0.7851, + "step": 65 + }, + { + "epoch": 0.0176, + "grad_norm": 0.4728595754373952, + "learning_rate": 0.00011681415929203541, + "loss": 0.7581, + "step": 66 + }, + { + "epoch": 0.017866666666666666, + "grad_norm": 0.4634381144472228, + "learning_rate": 0.0001185840707964602, + "loss": 0.7178, + "step": 67 + }, + { + "epoch": 0.018133333333333335, + "grad_norm": 0.44357198821834165, + "learning_rate": 0.00012035398230088497, + "loss": 0.8162, + "step": 68 + }, + { + "epoch": 0.0184, + "grad_norm": 0.4350869015181407, + "learning_rate": 0.00012212389380530974, + "loss": 0.6976, + "step": 69 + }, + { + "epoch": 0.018666666666666668, + "grad_norm": 0.4468609647095294, + "learning_rate": 0.0001238938053097345, + "loss": 0.8209, + "step": 70 + }, + { + "epoch": 0.018933333333333333, + "grad_norm": 0.46082477622959217, + "learning_rate": 0.0001256637168141593, + "loss": 0.7785, + "step": 71 + }, + { + "epoch": 0.0192, + "grad_norm": 0.4617330853753975, + "learning_rate": 0.00012743362831858408, + "loss": 0.8422, + "step": 72 + }, + { + "epoch": 0.019466666666666667, + "grad_norm": 0.4454983705630943, + "learning_rate": 0.00012920353982300885, + "loss": 0.6729, + "step": 73 + }, + { + "epoch": 0.019733333333333332, + "grad_norm": 0.44214950569023675, + "learning_rate": 0.00013097345132743365, + "loss": 0.8234, + "step": 74 + }, + { + "epoch": 0.02, + "grad_norm": 0.4324575762207759, + "learning_rate": 0.00013274336283185842, + "loss": 0.7741, + "step": 75 + }, + { + "epoch": 0.020266666666666665, + "grad_norm": 0.43153424291430115, + "learning_rate": 0.00013451327433628321, + "loss": 0.764, + "step": 76 + }, + { + "epoch": 0.020533333333333334, + "grad_norm": 0.45034174286548495, + "learning_rate": 0.00013628318584070796, + "loss": 0.8018, + "step": 77 + }, + { + "epoch": 0.0208, + "grad_norm": 0.4464786137625371, + "learning_rate": 0.00013805309734513276, + "loss": 0.7607, + "step": 78 + }, + { + "epoch": 0.021066666666666668, + "grad_norm": 0.4590769909090508, + "learning_rate": 0.00013982300884955753, + "loss": 0.8108, + "step": 79 + }, + { + "epoch": 0.021333333333333333, + "grad_norm": 0.4463713947630006, + "learning_rate": 0.0001415929203539823, + "loss": 0.7346, + "step": 80 + }, + { + "epoch": 0.0216, + "grad_norm": 0.41533645822148124, + "learning_rate": 0.0001433628318584071, + "loss": 0.7438, + "step": 81 + }, + { + "epoch": 0.021866666666666666, + "grad_norm": 0.4740827050643028, + "learning_rate": 0.00014513274336283187, + "loss": 0.7964, + "step": 82 + }, + { + "epoch": 0.022133333333333335, + "grad_norm": 0.44486736334646143, + "learning_rate": 0.00014690265486725664, + "loss": 0.7707, + "step": 83 + }, + { + "epoch": 0.0224, + "grad_norm": 0.4302755386054654, + "learning_rate": 0.00014867256637168144, + "loss": 0.7948, + "step": 84 + }, + { + "epoch": 0.02266666666666667, + "grad_norm": 0.4559094561783215, + "learning_rate": 0.00015044247787610618, + "loss": 0.8243, + "step": 85 + }, + { + "epoch": 0.022933333333333333, + "grad_norm": 0.4900569264486964, + "learning_rate": 0.00015221238938053098, + "loss": 0.755, + "step": 86 + }, + { + "epoch": 0.0232, + "grad_norm": 0.4411264029207737, + "learning_rate": 0.00015398230088495575, + "loss": 0.7284, + "step": 87 + }, + { + "epoch": 0.023466666666666667, + "grad_norm": 0.4380314589116113, + "learning_rate": 0.00015575221238938055, + "loss": 0.758, + "step": 88 + }, + { + "epoch": 0.023733333333333332, + "grad_norm": 0.4464592706350816, + "learning_rate": 0.00015752212389380532, + "loss": 0.7402, + "step": 89 + }, + { + "epoch": 0.024, + "grad_norm": 0.4740249465879794, + "learning_rate": 0.0001592920353982301, + "loss": 0.7877, + "step": 90 + }, + { + "epoch": 0.024266666666666666, + "grad_norm": 0.4454949189834306, + "learning_rate": 0.0001610619469026549, + "loss": 0.7191, + "step": 91 + }, + { + "epoch": 0.024533333333333334, + "grad_norm": 0.5050326165623321, + "learning_rate": 0.00016283185840707966, + "loss": 0.826, + "step": 92 + }, + { + "epoch": 0.0248, + "grad_norm": 0.44453758782037683, + "learning_rate": 0.00016460176991150443, + "loss": 0.7437, + "step": 93 + }, + { + "epoch": 0.025066666666666668, + "grad_norm": 0.4336087295604444, + "learning_rate": 0.0001663716814159292, + "loss": 0.7969, + "step": 94 + }, + { + "epoch": 0.025333333333333333, + "grad_norm": 0.4308296907448977, + "learning_rate": 0.000168141592920354, + "loss": 0.7346, + "step": 95 + }, + { + "epoch": 0.0256, + "grad_norm": 0.4477684697781309, + "learning_rate": 0.00016991150442477877, + "loss": 0.7826, + "step": 96 + }, + { + "epoch": 0.025866666666666666, + "grad_norm": 0.43116255885853144, + "learning_rate": 0.00017168141592920354, + "loss": 0.7633, + "step": 97 + }, + { + "epoch": 0.026133333333333335, + "grad_norm": 0.45304995086487815, + "learning_rate": 0.00017345132743362834, + "loss": 0.7727, + "step": 98 + }, + { + "epoch": 0.0264, + "grad_norm": 0.4082081367581844, + "learning_rate": 0.0001752212389380531, + "loss": 0.741, + "step": 99 + }, + { + "epoch": 0.02666666666666667, + "grad_norm": 0.4286344877289435, + "learning_rate": 0.0001769911504424779, + "loss": 0.7689, + "step": 100 + }, + { + "epoch": 0.026933333333333333, + "grad_norm": 0.432430752273439, + "learning_rate": 0.00017876106194690265, + "loss": 0.8096, + "step": 101 + }, + { + "epoch": 0.0272, + "grad_norm": 0.46785258504213295, + "learning_rate": 0.00018053097345132742, + "loss": 0.7556, + "step": 102 + }, + { + "epoch": 0.027466666666666667, + "grad_norm": 0.4609078171732939, + "learning_rate": 0.00018230088495575222, + "loss": 0.8028, + "step": 103 + }, + { + "epoch": 0.027733333333333332, + "grad_norm": 0.4120994332670511, + "learning_rate": 0.000184070796460177, + "loss": 0.7932, + "step": 104 + }, + { + "epoch": 0.028, + "grad_norm": 0.44499469315040363, + "learning_rate": 0.0001858407079646018, + "loss": 0.7785, + "step": 105 + }, + { + "epoch": 0.028266666666666666, + "grad_norm": 0.46531373423169303, + "learning_rate": 0.00018761061946902656, + "loss": 0.7908, + "step": 106 + }, + { + "epoch": 0.028533333333333334, + "grad_norm": 0.4252555869358924, + "learning_rate": 0.00018938053097345133, + "loss": 0.7737, + "step": 107 + }, + { + "epoch": 0.0288, + "grad_norm": 0.4405067233148738, + "learning_rate": 0.00019115044247787613, + "loss": 0.7763, + "step": 108 + }, + { + "epoch": 0.029066666666666668, + "grad_norm": 0.4079319254410207, + "learning_rate": 0.00019292035398230087, + "loss": 0.7709, + "step": 109 + }, + { + "epoch": 0.029333333333333333, + "grad_norm": 0.4184085086369884, + "learning_rate": 0.00019469026548672567, + "loss": 0.7483, + "step": 110 + }, + { + "epoch": 0.0296, + "grad_norm": 0.45950580294816856, + "learning_rate": 0.00019646017699115044, + "loss": 0.772, + "step": 111 + }, + { + "epoch": 0.029866666666666666, + "grad_norm": 0.44509191760629474, + "learning_rate": 0.00019823008849557524, + "loss": 0.7477, + "step": 112 + }, + { + "epoch": 0.030133333333333335, + "grad_norm": 0.4295996816553574, + "learning_rate": 0.0002, + "loss": 0.7339, + "step": 113 + }, + { + "epoch": 0.0304, + "grad_norm": 0.4364555498847836, + "learning_rate": 0.00019999996269361907, + "loss": 0.7499, + "step": 114 + }, + { + "epoch": 0.030666666666666665, + "grad_norm": 0.45973042163769945, + "learning_rate": 0.00019999985077450406, + "loss": 0.7509, + "step": 115 + }, + { + "epoch": 0.030933333333333334, + "grad_norm": 0.45832747766303494, + "learning_rate": 0.0001999996642427385, + "loss": 0.764, + "step": 116 + }, + { + "epoch": 0.0312, + "grad_norm": 0.4190613732773172, + "learning_rate": 0.00019999940309846159, + "loss": 0.7682, + "step": 117 + }, + { + "epoch": 0.031466666666666664, + "grad_norm": 0.43455127814284006, + "learning_rate": 0.00019999906734186813, + "loss": 0.7423, + "step": 118 + }, + { + "epoch": 0.031733333333333336, + "grad_norm": 0.39579327955490035, + "learning_rate": 0.00019999865697320867, + "loss": 0.7257, + "step": 119 + }, + { + "epoch": 0.032, + "grad_norm": 0.44288727123545557, + "learning_rate": 0.0001999981719927894, + "loss": 0.7668, + "step": 120 + }, + { + "epoch": 0.032266666666666666, + "grad_norm": 0.44031845865834995, + "learning_rate": 0.00019999761240097215, + "loss": 0.8193, + "step": 121 + }, + { + "epoch": 0.03253333333333333, + "grad_norm": 0.42671499637114035, + "learning_rate": 0.00019999697819817448, + "loss": 0.7412, + "step": 122 + }, + { + "epoch": 0.0328, + "grad_norm": 0.43688245149224186, + "learning_rate": 0.00019999626938486956, + "loss": 0.7553, + "step": 123 + }, + { + "epoch": 0.03306666666666667, + "grad_norm": 0.4081876652320984, + "learning_rate": 0.00019999548596158625, + "loss": 0.7363, + "step": 124 + }, + { + "epoch": 0.03333333333333333, + "grad_norm": 0.3949348426439058, + "learning_rate": 0.00019999462792890912, + "loss": 0.7375, + "step": 125 + }, + { + "epoch": 0.0336, + "grad_norm": 0.436949348455344, + "learning_rate": 0.00019999369528747834, + "loss": 0.7542, + "step": 126 + }, + { + "epoch": 0.03386666666666667, + "grad_norm": 0.44127480376228306, + "learning_rate": 0.00019999268803798977, + "loss": 0.8077, + "step": 127 + }, + { + "epoch": 0.034133333333333335, + "grad_norm": 0.4278322129090262, + "learning_rate": 0.000199991606181195, + "loss": 0.7472, + "step": 128 + }, + { + "epoch": 0.0344, + "grad_norm": 0.42654970123056357, + "learning_rate": 0.0001999904497179012, + "loss": 0.7304, + "step": 129 + }, + { + "epoch": 0.034666666666666665, + "grad_norm": 0.49695312757498045, + "learning_rate": 0.00019998921864897121, + "loss": 0.7919, + "step": 130 + }, + { + "epoch": 0.03493333333333333, + "grad_norm": 0.42374704515274303, + "learning_rate": 0.00019998791297532362, + "loss": 0.719, + "step": 131 + }, + { + "epoch": 0.0352, + "grad_norm": 0.43558716586278284, + "learning_rate": 0.00019998653269793256, + "loss": 0.746, + "step": 132 + }, + { + "epoch": 0.03546666666666667, + "grad_norm": 0.40823667620717713, + "learning_rate": 0.000199985077817828, + "loss": 0.7034, + "step": 133 + }, + { + "epoch": 0.03573333333333333, + "grad_norm": 0.44248614365245026, + "learning_rate": 0.00019998354833609537, + "loss": 0.8261, + "step": 134 + }, + { + "epoch": 0.036, + "grad_norm": 0.4615866263452828, + "learning_rate": 0.00019998194425387586, + "loss": 0.7399, + "step": 135 + }, + { + "epoch": 0.03626666666666667, + "grad_norm": 0.43962461064283653, + "learning_rate": 0.00019998026557236636, + "loss": 0.7973, + "step": 136 + }, + { + "epoch": 0.036533333333333334, + "grad_norm": 0.41934926245557447, + "learning_rate": 0.0001999785122928194, + "loss": 0.7024, + "step": 137 + }, + { + "epoch": 0.0368, + "grad_norm": 0.4194356667697614, + "learning_rate": 0.00019997668441654312, + "loss": 0.7685, + "step": 138 + }, + { + "epoch": 0.037066666666666664, + "grad_norm": 0.4182590192400099, + "learning_rate": 0.00019997478194490133, + "loss": 0.7646, + "step": 139 + }, + { + "epoch": 0.037333333333333336, + "grad_norm": 0.42231129891707286, + "learning_rate": 0.00019997280487931356, + "loss": 0.7258, + "step": 140 + }, + { + "epoch": 0.0376, + "grad_norm": 0.44714167228108054, + "learning_rate": 0.00019997075322125492, + "loss": 0.7829, + "step": 141 + }, + { + "epoch": 0.037866666666666667, + "grad_norm": 0.40359445738189087, + "learning_rate": 0.00019996862697225622, + "loss": 0.7325, + "step": 142 + }, + { + "epoch": 0.03813333333333333, + "grad_norm": 0.4064228325069067, + "learning_rate": 0.0001999664261339039, + "loss": 0.727, + "step": 143 + }, + { + "epoch": 0.0384, + "grad_norm": 0.4184207142338632, + "learning_rate": 0.00019996415070784007, + "loss": 0.7871, + "step": 144 + }, + { + "epoch": 0.03866666666666667, + "grad_norm": 0.40278141773965426, + "learning_rate": 0.0001999618006957625, + "loss": 0.7154, + "step": 145 + }, + { + "epoch": 0.038933333333333334, + "grad_norm": 0.4218347867198309, + "learning_rate": 0.00019995937609942462, + "loss": 0.7294, + "step": 146 + }, + { + "epoch": 0.0392, + "grad_norm": 0.4288114460153231, + "learning_rate": 0.00019995687692063544, + "loss": 0.7619, + "step": 147 + }, + { + "epoch": 0.039466666666666664, + "grad_norm": 0.4362730719846335, + "learning_rate": 0.0001999543031612597, + "loss": 0.805, + "step": 148 + }, + { + "epoch": 0.039733333333333336, + "grad_norm": 0.46119959525223236, + "learning_rate": 0.00019995165482321773, + "loss": 0.7192, + "step": 149 + }, + { + "epoch": 0.04, + "grad_norm": 0.4255994275960042, + "learning_rate": 0.00019994893190848555, + "loss": 0.6974, + "step": 150 + }, + { + "epoch": 0.040266666666666666, + "grad_norm": 0.39591345422504165, + "learning_rate": 0.00019994613441909483, + "loss": 0.7242, + "step": 151 + }, + { + "epoch": 0.04053333333333333, + "grad_norm": 0.45645688244270527, + "learning_rate": 0.00019994326235713277, + "loss": 0.7561, + "step": 152 + }, + { + "epoch": 0.0408, + "grad_norm": 0.4238047747089797, + "learning_rate": 0.00019994031572474237, + "loss": 0.75, + "step": 153 + }, + { + "epoch": 0.04106666666666667, + "grad_norm": 0.4163029492160629, + "learning_rate": 0.00019993729452412213, + "loss": 0.7097, + "step": 154 + }, + { + "epoch": 0.04133333333333333, + "grad_norm": 0.4412984717208657, + "learning_rate": 0.00019993419875752633, + "loss": 0.7913, + "step": 155 + }, + { + "epoch": 0.0416, + "grad_norm": 0.44306476010633583, + "learning_rate": 0.00019993102842726473, + "loss": 0.7528, + "step": 156 + }, + { + "epoch": 0.04186666666666667, + "grad_norm": 0.3974985905615024, + "learning_rate": 0.00019992778353570282, + "loss": 0.8091, + "step": 157 + }, + { + "epoch": 0.042133333333333335, + "grad_norm": 0.4347226330417167, + "learning_rate": 0.00019992446408526176, + "loss": 0.7327, + "step": 158 + }, + { + "epoch": 0.0424, + "grad_norm": 0.41402234147777023, + "learning_rate": 0.0001999210700784182, + "loss": 0.7646, + "step": 159 + }, + { + "epoch": 0.042666666666666665, + "grad_norm": 0.40254167313476474, + "learning_rate": 0.00019991760151770457, + "loss": 0.7505, + "step": 160 + }, + { + "epoch": 0.04293333333333333, + "grad_norm": 0.4044806910369341, + "learning_rate": 0.00019991405840570886, + "loss": 0.6412, + "step": 161 + }, + { + "epoch": 0.0432, + "grad_norm": 0.43014238855369424, + "learning_rate": 0.0001999104407450746, + "loss": 0.7195, + "step": 162 + }, + { + "epoch": 0.04346666666666667, + "grad_norm": 0.42138360468041963, + "learning_rate": 0.00019990674853850111, + "loss": 0.7595, + "step": 163 + }, + { + "epoch": 0.04373333333333333, + "grad_norm": 0.42512241780112225, + "learning_rate": 0.00019990298178874322, + "loss": 0.7241, + "step": 164 + }, + { + "epoch": 0.044, + "grad_norm": 0.4329587059045856, + "learning_rate": 0.00019989914049861143, + "loss": 0.7581, + "step": 165 + }, + { + "epoch": 0.04426666666666667, + "grad_norm": 0.41165876976626564, + "learning_rate": 0.00019989522467097178, + "loss": 0.7899, + "step": 166 + }, + { + "epoch": 0.044533333333333334, + "grad_norm": 0.46486524170602445, + "learning_rate": 0.00019989123430874602, + "loss": 0.7256, + "step": 167 + }, + { + "epoch": 0.0448, + "grad_norm": 0.3932375147112573, + "learning_rate": 0.0001998871694149114, + "loss": 0.7006, + "step": 168 + }, + { + "epoch": 0.045066666666666665, + "grad_norm": 0.4065406693697278, + "learning_rate": 0.00019988302999250098, + "loss": 0.7877, + "step": 169 + }, + { + "epoch": 0.04533333333333334, + "grad_norm": 0.41237994290530905, + "learning_rate": 0.0001998788160446032, + "loss": 0.7435, + "step": 170 + }, + { + "epoch": 0.0456, + "grad_norm": 0.41724585168439526, + "learning_rate": 0.0001998745275743622, + "loss": 0.7635, + "step": 171 + }, + { + "epoch": 0.04586666666666667, + "grad_norm": 0.39239738733273327, + "learning_rate": 0.00019987016458497778, + "loss": 0.7213, + "step": 172 + }, + { + "epoch": 0.04613333333333333, + "grad_norm": 0.4153912236878267, + "learning_rate": 0.00019986572707970525, + "loss": 0.6877, + "step": 173 + }, + { + "epoch": 0.0464, + "grad_norm": 0.40897316873652534, + "learning_rate": 0.00019986121506185555, + "loss": 0.765, + "step": 174 + }, + { + "epoch": 0.04666666666666667, + "grad_norm": 0.42636975728495263, + "learning_rate": 0.00019985662853479525, + "loss": 0.7222, + "step": 175 + }, + { + "epoch": 0.046933333333333334, + "grad_norm": 0.4087494019314695, + "learning_rate": 0.00019985196750194647, + "loss": 0.6956, + "step": 176 + }, + { + "epoch": 0.0472, + "grad_norm": 0.40632207244215324, + "learning_rate": 0.0001998472319667869, + "loss": 0.7523, + "step": 177 + }, + { + "epoch": 0.047466666666666664, + "grad_norm": 0.4005990115694852, + "learning_rate": 0.00019984242193284995, + "loss": 0.7262, + "step": 178 + }, + { + "epoch": 0.047733333333333336, + "grad_norm": 0.40921455765118137, + "learning_rate": 0.00019983753740372443, + "loss": 0.7415, + "step": 179 + }, + { + "epoch": 0.048, + "grad_norm": 0.39320300572180766, + "learning_rate": 0.00019983257838305485, + "loss": 0.7473, + "step": 180 + }, + { + "epoch": 0.048266666666666666, + "grad_norm": 0.46521592954335056, + "learning_rate": 0.00019982754487454126, + "loss": 0.7795, + "step": 181 + }, + { + "epoch": 0.04853333333333333, + "grad_norm": 0.420772938160873, + "learning_rate": 0.00019982243688193934, + "loss": 0.7604, + "step": 182 + }, + { + "epoch": 0.0488, + "grad_norm": 0.4159098112445086, + "learning_rate": 0.00019981725440906023, + "loss": 0.7592, + "step": 183 + }, + { + "epoch": 0.04906666666666667, + "grad_norm": 0.4142585530411165, + "learning_rate": 0.0001998119974597708, + "loss": 0.7427, + "step": 184 + }, + { + "epoch": 0.04933333333333333, + "grad_norm": 0.3965954403079215, + "learning_rate": 0.00019980666603799333, + "loss": 0.7122, + "step": 185 + }, + { + "epoch": 0.0496, + "grad_norm": 0.38253711216327274, + "learning_rate": 0.0001998012601477058, + "loss": 0.7303, + "step": 186 + }, + { + "epoch": 0.04986666666666666, + "grad_norm": 0.3819538197346378, + "learning_rate": 0.00019979577979294168, + "loss": 0.7364, + "step": 187 + }, + { + "epoch": 0.050133333333333335, + "grad_norm": 0.4102247800801515, + "learning_rate": 0.00019979022497779002, + "loss": 0.7478, + "step": 188 + }, + { + "epoch": 0.0504, + "grad_norm": 0.39816195365377816, + "learning_rate": 0.0001997845957063954, + "loss": 0.7482, + "step": 189 + }, + { + "epoch": 0.050666666666666665, + "grad_norm": 0.40065497357861385, + "learning_rate": 0.00019977889198295794, + "loss": 0.6791, + "step": 190 + }, + { + "epoch": 0.05093333333333333, + "grad_norm": 0.4185398481679138, + "learning_rate": 0.0001997731138117334, + "loss": 0.7104, + "step": 191 + }, + { + "epoch": 0.0512, + "grad_norm": 0.41532835516077843, + "learning_rate": 0.00019976726119703305, + "loss": 0.6745, + "step": 192 + }, + { + "epoch": 0.05146666666666667, + "grad_norm": 0.4849381478848775, + "learning_rate": 0.00019976133414322366, + "loss": 0.7565, + "step": 193 + }, + { + "epoch": 0.05173333333333333, + "grad_norm": 0.42890622187771793, + "learning_rate": 0.00019975533265472755, + "loss": 0.7601, + "step": 194 + }, + { + "epoch": 0.052, + "grad_norm": 0.40645431891112266, + "learning_rate": 0.0001997492567360226, + "loss": 0.7471, + "step": 195 + }, + { + "epoch": 0.05226666666666667, + "grad_norm": 0.44050504688919007, + "learning_rate": 0.00019974310639164227, + "loss": 0.7172, + "step": 196 + }, + { + "epoch": 0.052533333333333335, + "grad_norm": 0.4175460735073087, + "learning_rate": 0.00019973688162617544, + "loss": 0.7852, + "step": 197 + }, + { + "epoch": 0.0528, + "grad_norm": 0.46387859545026644, + "learning_rate": 0.0001997305824442666, + "loss": 0.7344, + "step": 198 + }, + { + "epoch": 0.053066666666666665, + "grad_norm": 0.43950368771088844, + "learning_rate": 0.00019972420885061576, + "loss": 0.777, + "step": 199 + }, + { + "epoch": 0.05333333333333334, + "grad_norm": 0.47682984929324623, + "learning_rate": 0.00019971776084997842, + "loss": 0.6963, + "step": 200 + }, + { + "epoch": 0.0536, + "grad_norm": 0.4210968151337064, + "learning_rate": 0.00019971123844716562, + "loss": 0.7078, + "step": 201 + }, + { + "epoch": 0.05386666666666667, + "grad_norm": 0.4426667897608301, + "learning_rate": 0.00019970464164704389, + "loss": 0.7714, + "step": 202 + }, + { + "epoch": 0.05413333333333333, + "grad_norm": 0.3943173565109175, + "learning_rate": 0.0001996979704545353, + "loss": 0.7884, + "step": 203 + }, + { + "epoch": 0.0544, + "grad_norm": 0.4071080597282432, + "learning_rate": 0.0001996912248746174, + "loss": 0.7364, + "step": 204 + }, + { + "epoch": 0.05466666666666667, + "grad_norm": 0.39539875302690025, + "learning_rate": 0.00019968440491232326, + "loss": 0.7742, + "step": 205 + }, + { + "epoch": 0.054933333333333334, + "grad_norm": 0.427477962182699, + "learning_rate": 0.00019967751057274144, + "loss": 0.7617, + "step": 206 + }, + { + "epoch": 0.0552, + "grad_norm": 0.40297635868881115, + "learning_rate": 0.00019967054186101598, + "loss": 0.7412, + "step": 207 + }, + { + "epoch": 0.055466666666666664, + "grad_norm": 0.4072669433506653, + "learning_rate": 0.00019966349878234647, + "loss": 0.7117, + "step": 208 + }, + { + "epoch": 0.055733333333333336, + "grad_norm": 0.4236907433036573, + "learning_rate": 0.00019965638134198792, + "loss": 0.7667, + "step": 209 + }, + { + "epoch": 0.056, + "grad_norm": 0.41684491796587203, + "learning_rate": 0.00019964918954525085, + "loss": 0.7525, + "step": 210 + }, + { + "epoch": 0.056266666666666666, + "grad_norm": 0.4196558925944855, + "learning_rate": 0.00019964192339750128, + "loss": 0.7627, + "step": 211 + }, + { + "epoch": 0.05653333333333333, + "grad_norm": 0.4138313180882757, + "learning_rate": 0.00019963458290416064, + "loss": 0.7377, + "step": 212 + }, + { + "epoch": 0.0568, + "grad_norm": 0.3791671739181995, + "learning_rate": 0.0001996271680707059, + "loss": 0.6732, + "step": 213 + }, + { + "epoch": 0.05706666666666667, + "grad_norm": 0.4049548923517151, + "learning_rate": 0.00019961967890266946, + "loss": 0.7683, + "step": 214 + }, + { + "epoch": 0.05733333333333333, + "grad_norm": 0.43085041346392816, + "learning_rate": 0.00019961211540563917, + "loss": 0.7627, + "step": 215 + }, + { + "epoch": 0.0576, + "grad_norm": 0.48306971944769167, + "learning_rate": 0.00019960447758525846, + "loss": 0.7485, + "step": 216 + }, + { + "epoch": 0.057866666666666663, + "grad_norm": 0.4167352089373417, + "learning_rate": 0.00019959676544722602, + "loss": 0.7579, + "step": 217 + }, + { + "epoch": 0.058133333333333335, + "grad_norm": 0.3854266730684992, + "learning_rate": 0.00019958897899729613, + "loss": 0.7072, + "step": 218 + }, + { + "epoch": 0.0584, + "grad_norm": 0.4143261432972768, + "learning_rate": 0.0001995811182412785, + "loss": 0.7479, + "step": 219 + }, + { + "epoch": 0.058666666666666666, + "grad_norm": 0.39807525682944156, + "learning_rate": 0.00019957318318503822, + "loss": 0.7295, + "step": 220 + }, + { + "epoch": 0.05893333333333333, + "grad_norm": 0.42879920347124467, + "learning_rate": 0.00019956517383449583, + "loss": 0.7272, + "step": 221 + }, + { + "epoch": 0.0592, + "grad_norm": 0.3805555655205918, + "learning_rate": 0.00019955709019562742, + "loss": 0.7118, + "step": 222 + }, + { + "epoch": 0.05946666666666667, + "grad_norm": 0.4141547390911347, + "learning_rate": 0.0001995489322744643, + "loss": 0.7557, + "step": 223 + }, + { + "epoch": 0.05973333333333333, + "grad_norm": 0.4297484969084553, + "learning_rate": 0.0001995407000770934, + "loss": 0.7887, + "step": 224 + }, + { + "epoch": 0.06, + "grad_norm": 0.4274267901647024, + "learning_rate": 0.00019953239360965695, + "loss": 0.7769, + "step": 225 + }, + { + "epoch": 0.06026666666666667, + "grad_norm": 0.38182301143404707, + "learning_rate": 0.00019952401287835268, + "loss": 0.6746, + "step": 226 + }, + { + "epoch": 0.060533333333333335, + "grad_norm": 0.40416336228997984, + "learning_rate": 0.00019951555788943364, + "loss": 0.7707, + "step": 227 + }, + { + "epoch": 0.0608, + "grad_norm": 0.4088907253997118, + "learning_rate": 0.00019950702864920836, + "loss": 0.7583, + "step": 228 + }, + { + "epoch": 0.061066666666666665, + "grad_norm": 0.4010022725937896, + "learning_rate": 0.0001994984251640407, + "loss": 0.7512, + "step": 229 + }, + { + "epoch": 0.06133333333333333, + "grad_norm": 0.38961594471098804, + "learning_rate": 0.00019948974744035, + "loss": 0.7066, + "step": 230 + }, + { + "epoch": 0.0616, + "grad_norm": 0.39078296075160246, + "learning_rate": 0.00019948099548461096, + "loss": 0.6713, + "step": 231 + }, + { + "epoch": 0.06186666666666667, + "grad_norm": 0.3881355954199725, + "learning_rate": 0.0001994721693033536, + "loss": 0.6901, + "step": 232 + }, + { + "epoch": 0.06213333333333333, + "grad_norm": 0.39450258580406516, + "learning_rate": 0.00019946326890316345, + "loss": 0.6874, + "step": 233 + }, + { + "epoch": 0.0624, + "grad_norm": 0.3842897115626507, + "learning_rate": 0.00019945429429068127, + "loss": 0.6701, + "step": 234 + }, + { + "epoch": 0.06266666666666666, + "grad_norm": 0.42412588826941533, + "learning_rate": 0.00019944524547260332, + "loss": 0.7897, + "step": 235 + }, + { + "epoch": 0.06293333333333333, + "grad_norm": 0.39660675719735494, + "learning_rate": 0.00019943612245568114, + "loss": 0.7323, + "step": 236 + }, + { + "epoch": 0.0632, + "grad_norm": 0.452702035122642, + "learning_rate": 0.0001994269252467217, + "loss": 0.6731, + "step": 237 + }, + { + "epoch": 0.06346666666666667, + "grad_norm": 0.39058240002897643, + "learning_rate": 0.00019941765385258725, + "loss": 0.7142, + "step": 238 + }, + { + "epoch": 0.06373333333333334, + "grad_norm": 0.403458169929397, + "learning_rate": 0.00019940830828019546, + "loss": 0.7358, + "step": 239 + }, + { + "epoch": 0.064, + "grad_norm": 0.4030035336489462, + "learning_rate": 0.0001993988885365193, + "loss": 0.6907, + "step": 240 + }, + { + "epoch": 0.06426666666666667, + "grad_norm": 0.41307744746815045, + "learning_rate": 0.00019938939462858714, + "loss": 0.7133, + "step": 241 + }, + { + "epoch": 0.06453333333333333, + "grad_norm": 0.4162945649290952, + "learning_rate": 0.0001993798265634826, + "loss": 0.7393, + "step": 242 + }, + { + "epoch": 0.0648, + "grad_norm": 0.44325936934792853, + "learning_rate": 0.0001993701843483447, + "loss": 0.7154, + "step": 243 + }, + { + "epoch": 0.06506666666666666, + "grad_norm": 0.3811590653484472, + "learning_rate": 0.00019936046799036777, + "loss": 0.6902, + "step": 244 + }, + { + "epoch": 0.06533333333333333, + "grad_norm": 0.4170311529581271, + "learning_rate": 0.00019935067749680147, + "loss": 0.6933, + "step": 245 + }, + { + "epoch": 0.0656, + "grad_norm": 0.38738516880995455, + "learning_rate": 0.00019934081287495067, + "loss": 0.7201, + "step": 246 + }, + { + "epoch": 0.06586666666666667, + "grad_norm": 0.4371298094780602, + "learning_rate": 0.00019933087413217575, + "loss": 0.7512, + "step": 247 + }, + { + "epoch": 0.06613333333333334, + "grad_norm": 0.40615564597816506, + "learning_rate": 0.0001993208612758922, + "loss": 0.7315, + "step": 248 + }, + { + "epoch": 0.0664, + "grad_norm": 0.36799596485043146, + "learning_rate": 0.00019931077431357096, + "loss": 0.6731, + "step": 249 + }, + { + "epoch": 0.06666666666666667, + "grad_norm": 0.4075115948919754, + "learning_rate": 0.0001993006132527381, + "loss": 0.7257, + "step": 250 + }, + { + "epoch": 0.06693333333333333, + "grad_norm": 0.41397887078278667, + "learning_rate": 0.00019929037810097514, + "loss": 0.7463, + "step": 251 + }, + { + "epoch": 0.0672, + "grad_norm": 0.39935735721022253, + "learning_rate": 0.0001992800688659188, + "loss": 0.7212, + "step": 252 + }, + { + "epoch": 0.06746666666666666, + "grad_norm": 0.397466312253459, + "learning_rate": 0.00019926968555526107, + "loss": 0.7401, + "step": 253 + }, + { + "epoch": 0.06773333333333334, + "grad_norm": 0.3993745573428489, + "learning_rate": 0.00019925922817674922, + "loss": 0.7534, + "step": 254 + }, + { + "epoch": 0.068, + "grad_norm": 0.4135574441728364, + "learning_rate": 0.00019924869673818577, + "loss": 0.669, + "step": 255 + }, + { + "epoch": 0.06826666666666667, + "grad_norm": 0.3932811852789586, + "learning_rate": 0.00019923809124742858, + "loss": 0.6537, + "step": 256 + }, + { + "epoch": 0.06853333333333333, + "grad_norm": 0.4306549060493546, + "learning_rate": 0.00019922741171239064, + "loss": 0.7526, + "step": 257 + }, + { + "epoch": 0.0688, + "grad_norm": 0.4485140229636536, + "learning_rate": 0.00019921665814104028, + "loss": 0.7535, + "step": 258 + }, + { + "epoch": 0.06906666666666667, + "grad_norm": 0.407145027253977, + "learning_rate": 0.00019920583054140102, + "loss": 0.7281, + "step": 259 + }, + { + "epoch": 0.06933333333333333, + "grad_norm": 0.444754696972281, + "learning_rate": 0.00019919492892155164, + "loss": 0.7493, + "step": 260 + }, + { + "epoch": 0.0696, + "grad_norm": 0.3834940923800163, + "learning_rate": 0.00019918395328962613, + "loss": 0.7663, + "step": 261 + }, + { + "epoch": 0.06986666666666666, + "grad_norm": 0.5135946291558379, + "learning_rate": 0.00019917290365381373, + "loss": 0.7565, + "step": 262 + }, + { + "epoch": 0.07013333333333334, + "grad_norm": 0.41456780545003297, + "learning_rate": 0.00019916178002235885, + "loss": 0.7214, + "step": 263 + }, + { + "epoch": 0.0704, + "grad_norm": 0.3817303462004924, + "learning_rate": 0.00019915058240356118, + "loss": 0.7189, + "step": 264 + }, + { + "epoch": 0.07066666666666667, + "grad_norm": 0.6253105182319767, + "learning_rate": 0.00019913931080577552, + "loss": 0.7168, + "step": 265 + }, + { + "epoch": 0.07093333333333333, + "grad_norm": 0.41538467401381385, + "learning_rate": 0.00019912796523741198, + "loss": 0.7452, + "step": 266 + }, + { + "epoch": 0.0712, + "grad_norm": 0.44820153303680027, + "learning_rate": 0.00019911654570693574, + "loss": 0.6903, + "step": 267 + }, + { + "epoch": 0.07146666666666666, + "grad_norm": 0.42349113398474625, + "learning_rate": 0.0001991050522228673, + "loss": 0.7091, + "step": 268 + }, + { + "epoch": 0.07173333333333333, + "grad_norm": 0.38713460495344987, + "learning_rate": 0.00019909348479378217, + "loss": 0.7084, + "step": 269 + }, + { + "epoch": 0.072, + "grad_norm": 0.42696161551154627, + "learning_rate": 0.0001990818434283112, + "loss": 0.6797, + "step": 270 + }, + { + "epoch": 0.07226666666666667, + "grad_norm": 0.4044084040674422, + "learning_rate": 0.00019907012813514033, + "loss": 0.6659, + "step": 271 + }, + { + "epoch": 0.07253333333333334, + "grad_norm": 0.41744902477227236, + "learning_rate": 0.00019905833892301065, + "loss": 0.7039, + "step": 272 + }, + { + "epoch": 0.0728, + "grad_norm": 0.40150435958851094, + "learning_rate": 0.0001990464758007184, + "loss": 0.7214, + "step": 273 + }, + { + "epoch": 0.07306666666666667, + "grad_norm": 0.4284395338256494, + "learning_rate": 0.000199034538777115, + "loss": 0.7717, + "step": 274 + }, + { + "epoch": 0.07333333333333333, + "grad_norm": 0.383533857037192, + "learning_rate": 0.000199022527861107, + "loss": 0.658, + "step": 275 + }, + { + "epoch": 0.0736, + "grad_norm": 0.3900923018002736, + "learning_rate": 0.00019901044306165606, + "loss": 0.7199, + "step": 276 + }, + { + "epoch": 0.07386666666666666, + "grad_norm": 0.3947161443368059, + "learning_rate": 0.00019899828438777899, + "loss": 0.7204, + "step": 277 + }, + { + "epoch": 0.07413333333333333, + "grad_norm": 0.3763194767049703, + "learning_rate": 0.00019898605184854774, + "loss": 0.682, + "step": 278 + }, + { + "epoch": 0.0744, + "grad_norm": 0.39356463846876866, + "learning_rate": 0.00019897374545308928, + "loss": 0.715, + "step": 279 + }, + { + "epoch": 0.07466666666666667, + "grad_norm": 0.40231244413746337, + "learning_rate": 0.0001989613652105858, + "loss": 0.7184, + "step": 280 + }, + { + "epoch": 0.07493333333333334, + "grad_norm": 0.40646593423145344, + "learning_rate": 0.00019894891113027456, + "loss": 0.6882, + "step": 281 + }, + { + "epoch": 0.0752, + "grad_norm": 0.39043792467687116, + "learning_rate": 0.00019893638322144788, + "loss": 0.7521, + "step": 282 + }, + { + "epoch": 0.07546666666666667, + "grad_norm": 0.3976099281474279, + "learning_rate": 0.0001989237814934531, + "loss": 0.6594, + "step": 283 + }, + { + "epoch": 0.07573333333333333, + "grad_norm": 0.41092639532736164, + "learning_rate": 0.00019891110595569283, + "loss": 0.7229, + "step": 284 + }, + { + "epoch": 0.076, + "grad_norm": 0.3810076008125446, + "learning_rate": 0.00019889835661762457, + "loss": 0.6801, + "step": 285 + }, + { + "epoch": 0.07626666666666666, + "grad_norm": 0.38830977529156335, + "learning_rate": 0.00019888553348876097, + "loss": 0.708, + "step": 286 + }, + { + "epoch": 0.07653333333333333, + "grad_norm": 0.416149543478018, + "learning_rate": 0.00019887263657866972, + "loss": 0.7352, + "step": 287 + }, + { + "epoch": 0.0768, + "grad_norm": 0.4053726072491817, + "learning_rate": 0.0001988596658969736, + "loss": 0.7113, + "step": 288 + }, + { + "epoch": 0.07706666666666667, + "grad_norm": 0.4145930541903273, + "learning_rate": 0.00019884662145335034, + "loss": 0.701, + "step": 289 + }, + { + "epoch": 0.07733333333333334, + "grad_norm": 0.40095414721333383, + "learning_rate": 0.00019883350325753277, + "loss": 0.6995, + "step": 290 + }, + { + "epoch": 0.0776, + "grad_norm": 0.40280416171137895, + "learning_rate": 0.00019882031131930874, + "loss": 0.7844, + "step": 291 + }, + { + "epoch": 0.07786666666666667, + "grad_norm": 0.37286594222117764, + "learning_rate": 0.00019880704564852113, + "loss": 0.7309, + "step": 292 + }, + { + "epoch": 0.07813333333333333, + "grad_norm": 0.39936297044802194, + "learning_rate": 0.0001987937062550678, + "loss": 0.7053, + "step": 293 + }, + { + "epoch": 0.0784, + "grad_norm": 0.39061374120110426, + "learning_rate": 0.0001987802931489017, + "loss": 0.7232, + "step": 294 + }, + { + "epoch": 0.07866666666666666, + "grad_norm": 0.39757752193953794, + "learning_rate": 0.00019876680634003068, + "loss": 0.7429, + "step": 295 + }, + { + "epoch": 0.07893333333333333, + "grad_norm": 0.4104169489448244, + "learning_rate": 0.00019875324583851757, + "loss": 0.7115, + "step": 296 + }, + { + "epoch": 0.0792, + "grad_norm": 0.37368511859538683, + "learning_rate": 0.00019873961165448032, + "loss": 0.7219, + "step": 297 + }, + { + "epoch": 0.07946666666666667, + "grad_norm": 0.4199598853000963, + "learning_rate": 0.00019872590379809172, + "loss": 0.7756, + "step": 298 + }, + { + "epoch": 0.07973333333333334, + "grad_norm": 0.40882562410751905, + "learning_rate": 0.00019871212227957961, + "loss": 0.6989, + "step": 299 + }, + { + "epoch": 0.08, + "grad_norm": 0.4090549309973313, + "learning_rate": 0.00019869826710922675, + "loss": 0.7451, + "step": 300 + }, + { + "epoch": 0.08026666666666667, + "grad_norm": 0.40162365150041546, + "learning_rate": 0.00019868433829737083, + "loss": 0.7506, + "step": 301 + }, + { + "epoch": 0.08053333333333333, + "grad_norm": 0.4094513705677747, + "learning_rate": 0.00019867033585440456, + "loss": 0.7675, + "step": 302 + }, + { + "epoch": 0.0808, + "grad_norm": 0.39038982751168183, + "learning_rate": 0.00019865625979077555, + "loss": 0.7277, + "step": 303 + }, + { + "epoch": 0.08106666666666666, + "grad_norm": 0.41323184742019836, + "learning_rate": 0.00019864211011698634, + "loss": 0.7182, + "step": 304 + }, + { + "epoch": 0.08133333333333333, + "grad_norm": 0.38188370467533733, + "learning_rate": 0.00019862788684359438, + "loss": 0.6865, + "step": 305 + }, + { + "epoch": 0.0816, + "grad_norm": 0.38843917921763027, + "learning_rate": 0.00019861358998121204, + "loss": 0.7148, + "step": 306 + }, + { + "epoch": 0.08186666666666667, + "grad_norm": 0.37220997337943657, + "learning_rate": 0.00019859921954050664, + "loss": 0.7134, + "step": 307 + }, + { + "epoch": 0.08213333333333334, + "grad_norm": 0.3986392957863737, + "learning_rate": 0.00019858477553220033, + "loss": 0.7156, + "step": 308 + }, + { + "epoch": 0.0824, + "grad_norm": 0.3977418558092343, + "learning_rate": 0.0001985702579670702, + "loss": 0.7261, + "step": 309 + }, + { + "epoch": 0.08266666666666667, + "grad_norm": 0.3840774177569215, + "learning_rate": 0.00019855566685594815, + "loss": 0.7901, + "step": 310 + }, + { + "epoch": 0.08293333333333333, + "grad_norm": 0.37988919505690316, + "learning_rate": 0.00019854100220972112, + "loss": 0.7309, + "step": 311 + }, + { + "epoch": 0.0832, + "grad_norm": 0.3707403548715134, + "learning_rate": 0.0001985262640393307, + "loss": 0.704, + "step": 312 + }, + { + "epoch": 0.08346666666666666, + "grad_norm": 0.3808201215582751, + "learning_rate": 0.00019851145235577354, + "loss": 0.6774, + "step": 313 + }, + { + "epoch": 0.08373333333333334, + "grad_norm": 0.4045590430944023, + "learning_rate": 0.00019849656717010094, + "loss": 0.7009, + "step": 314 + }, + { + "epoch": 0.084, + "grad_norm": 0.407890590412557, + "learning_rate": 0.00019848160849341925, + "loss": 0.7121, + "step": 315 + }, + { + "epoch": 0.08426666666666667, + "grad_norm": 0.3998158812732857, + "learning_rate": 0.0001984665763368895, + "loss": 0.7352, + "step": 316 + }, + { + "epoch": 0.08453333333333334, + "grad_norm": 0.4398899486261411, + "learning_rate": 0.00019845147071172759, + "loss": 0.7312, + "step": 317 + }, + { + "epoch": 0.0848, + "grad_norm": 0.39424069500295306, + "learning_rate": 0.00019843629162920426, + "loss": 0.7057, + "step": 318 + }, + { + "epoch": 0.08506666666666667, + "grad_norm": 0.39213572853124057, + "learning_rate": 0.00019842103910064506, + "loss": 0.7003, + "step": 319 + }, + { + "epoch": 0.08533333333333333, + "grad_norm": 0.4088822832005749, + "learning_rate": 0.00019840571313743032, + "loss": 0.7312, + "step": 320 + }, + { + "epoch": 0.0856, + "grad_norm": 0.38220192094973704, + "learning_rate": 0.00019839031375099513, + "loss": 0.6551, + "step": 321 + }, + { + "epoch": 0.08586666666666666, + "grad_norm": 0.4297798800475834, + "learning_rate": 0.00019837484095282942, + "loss": 0.7285, + "step": 322 + }, + { + "epoch": 0.08613333333333334, + "grad_norm": 0.39065133633158505, + "learning_rate": 0.00019835929475447785, + "loss": 0.7, + "step": 323 + }, + { + "epoch": 0.0864, + "grad_norm": 0.3768072519822633, + "learning_rate": 0.0001983436751675399, + "loss": 0.686, + "step": 324 + }, + { + "epoch": 0.08666666666666667, + "grad_norm": 0.4033863317710327, + "learning_rate": 0.00019832798220366978, + "loss": 0.7154, + "step": 325 + }, + { + "epoch": 0.08693333333333333, + "grad_norm": 0.3970458373238512, + "learning_rate": 0.0001983122158745764, + "loss": 0.7407, + "step": 326 + }, + { + "epoch": 0.0872, + "grad_norm": 0.3833499500800447, + "learning_rate": 0.0001982963761920235, + "loss": 0.7371, + "step": 327 + }, + { + "epoch": 0.08746666666666666, + "grad_norm": 0.39051951392278295, + "learning_rate": 0.00019828046316782948, + "loss": 0.7139, + "step": 328 + }, + { + "epoch": 0.08773333333333333, + "grad_norm": 0.416335268121753, + "learning_rate": 0.0001982644768138675, + "loss": 0.7186, + "step": 329 + }, + { + "epoch": 0.088, + "grad_norm": 0.39878078216361823, + "learning_rate": 0.0001982484171420654, + "loss": 0.7223, + "step": 330 + }, + { + "epoch": 0.08826666666666666, + "grad_norm": 0.4258223335265266, + "learning_rate": 0.00019823228416440575, + "loss": 0.6913, + "step": 331 + }, + { + "epoch": 0.08853333333333334, + "grad_norm": 0.36763344196266556, + "learning_rate": 0.00019821607789292583, + "loss": 0.7624, + "step": 332 + }, + { + "epoch": 0.0888, + "grad_norm": 0.3784547523972229, + "learning_rate": 0.00019819979833971755, + "loss": 0.7005, + "step": 333 + }, + { + "epoch": 0.08906666666666667, + "grad_norm": 0.41740648720516116, + "learning_rate": 0.00019818344551692757, + "loss": 0.7039, + "step": 334 + }, + { + "epoch": 0.08933333333333333, + "grad_norm": 0.3669460038794941, + "learning_rate": 0.00019816701943675718, + "loss": 0.7416, + "step": 335 + }, + { + "epoch": 0.0896, + "grad_norm": 0.47643637440544684, + "learning_rate": 0.0001981505201114623, + "loss": 0.7406, + "step": 336 + }, + { + "epoch": 0.08986666666666666, + "grad_norm": 0.4111404653795723, + "learning_rate": 0.0001981339475533536, + "loss": 0.7365, + "step": 337 + }, + { + "epoch": 0.09013333333333333, + "grad_norm": 0.398282186305748, + "learning_rate": 0.00019811730177479625, + "loss": 0.7166, + "step": 338 + }, + { + "epoch": 0.0904, + "grad_norm": 0.41489095670999077, + "learning_rate": 0.00019810058278821015, + "loss": 0.6831, + "step": 339 + }, + { + "epoch": 0.09066666666666667, + "grad_norm": 0.41367677985636275, + "learning_rate": 0.0001980837906060698, + "loss": 0.7237, + "step": 340 + }, + { + "epoch": 0.09093333333333334, + "grad_norm": 0.4118613955842153, + "learning_rate": 0.00019806692524090434, + "loss": 0.7493, + "step": 341 + }, + { + "epoch": 0.0912, + "grad_norm": 0.38049350260478093, + "learning_rate": 0.0001980499867052974, + "loss": 0.7206, + "step": 342 + }, + { + "epoch": 0.09146666666666667, + "grad_norm": 0.363861364552363, + "learning_rate": 0.0001980329750118874, + "loss": 0.6824, + "step": 343 + }, + { + "epoch": 0.09173333333333333, + "grad_norm": 0.36385625278410144, + "learning_rate": 0.00019801589017336715, + "loss": 0.6738, + "step": 344 + }, + { + "epoch": 0.092, + "grad_norm": 0.4084914661595541, + "learning_rate": 0.00019799873220248415, + "loss": 0.7318, + "step": 345 + }, + { + "epoch": 0.09226666666666666, + "grad_norm": 0.38902592796422086, + "learning_rate": 0.00019798150111204047, + "loss": 0.7119, + "step": 346 + }, + { + "epoch": 0.09253333333333333, + "grad_norm": 0.3947885945538274, + "learning_rate": 0.00019796419691489264, + "loss": 0.7115, + "step": 347 + }, + { + "epoch": 0.0928, + "grad_norm": 0.3890436587380582, + "learning_rate": 0.00019794681962395183, + "loss": 0.7276, + "step": 348 + }, + { + "epoch": 0.09306666666666667, + "grad_norm": 0.3926882677435967, + "learning_rate": 0.00019792936925218372, + "loss": 0.6663, + "step": 349 + }, + { + "epoch": 0.09333333333333334, + "grad_norm": 0.36238182009460046, + "learning_rate": 0.00019791184581260848, + "loss": 0.6512, + "step": 350 + }, + { + "epoch": 0.0936, + "grad_norm": 0.3662753465749837, + "learning_rate": 0.00019789424931830087, + "loss": 0.6519, + "step": 351 + }, + { + "epoch": 0.09386666666666667, + "grad_norm": 0.38546599016871463, + "learning_rate": 0.00019787657978239014, + "loss": 0.6919, + "step": 352 + }, + { + "epoch": 0.09413333333333333, + "grad_norm": 0.374980384366503, + "learning_rate": 0.00019785883721806, + "loss": 0.7021, + "step": 353 + }, + { + "epoch": 0.0944, + "grad_norm": 0.3598183990882242, + "learning_rate": 0.00019784102163854862, + "loss": 0.7078, + "step": 354 + }, + { + "epoch": 0.09466666666666666, + "grad_norm": 0.3940805411387451, + "learning_rate": 0.00019782313305714873, + "loss": 0.7336, + "step": 355 + }, + { + "epoch": 0.09493333333333333, + "grad_norm": 0.36953792011424874, + "learning_rate": 0.00019780517148720753, + "loss": 0.6453, + "step": 356 + }, + { + "epoch": 0.0952, + "grad_norm": 0.3890574355800986, + "learning_rate": 0.0001977871369421266, + "loss": 0.6786, + "step": 357 + }, + { + "epoch": 0.09546666666666667, + "grad_norm": 0.4112663862907459, + "learning_rate": 0.000197769029435362, + "loss": 0.7032, + "step": 358 + }, + { + "epoch": 0.09573333333333334, + "grad_norm": 0.37062221805783596, + "learning_rate": 0.00019775084898042427, + "loss": 0.6423, + "step": 359 + }, + { + "epoch": 0.096, + "grad_norm": 0.3825463311199024, + "learning_rate": 0.00019773259559087837, + "loss": 0.6678, + "step": 360 + }, + { + "epoch": 0.09626666666666667, + "grad_norm": 0.3779182311924708, + "learning_rate": 0.0001977142692803436, + "loss": 0.6635, + "step": 361 + }, + { + "epoch": 0.09653333333333333, + "grad_norm": 0.4010129545052071, + "learning_rate": 0.00019769587006249382, + "loss": 0.7255, + "step": 362 + }, + { + "epoch": 0.0968, + "grad_norm": 0.3886084008931301, + "learning_rate": 0.0001976773979510571, + "loss": 0.7252, + "step": 363 + }, + { + "epoch": 0.09706666666666666, + "grad_norm": 0.3919435017107045, + "learning_rate": 0.000197658852959816, + "loss": 0.7018, + "step": 364 + }, + { + "epoch": 0.09733333333333333, + "grad_norm": 0.38254913244976363, + "learning_rate": 0.0001976402351026075, + "loss": 0.7206, + "step": 365 + }, + { + "epoch": 0.0976, + "grad_norm": 0.4438518685757539, + "learning_rate": 0.00019762154439332289, + "loss": 0.6991, + "step": 366 + }, + { + "epoch": 0.09786666666666667, + "grad_norm": 0.41508929267611483, + "learning_rate": 0.00019760278084590777, + "loss": 0.7166, + "step": 367 + }, + { + "epoch": 0.09813333333333334, + "grad_norm": 0.43536758123081454, + "learning_rate": 0.0001975839444743622, + "loss": 0.7193, + "step": 368 + }, + { + "epoch": 0.0984, + "grad_norm": 0.3856740683698884, + "learning_rate": 0.00019756503529274046, + "loss": 0.7189, + "step": 369 + }, + { + "epoch": 0.09866666666666667, + "grad_norm": 0.39611104594234997, + "learning_rate": 0.00019754605331515128, + "loss": 0.7121, + "step": 370 + }, + { + "epoch": 0.09893333333333333, + "grad_norm": 0.37928103167529814, + "learning_rate": 0.0001975269985557576, + "loss": 0.6837, + "step": 371 + }, + { + "epoch": 0.0992, + "grad_norm": 0.3888164429713909, + "learning_rate": 0.00019750787102877673, + "loss": 0.6697, + "step": 372 + }, + { + "epoch": 0.09946666666666666, + "grad_norm": 0.40899344492977824, + "learning_rate": 0.0001974886707484802, + "loss": 0.7001, + "step": 373 + }, + { + "epoch": 0.09973333333333333, + "grad_norm": 0.41039107754431725, + "learning_rate": 0.00019746939772919393, + "loss": 0.7342, + "step": 374 + }, + { + "epoch": 0.1, + "grad_norm": 0.3765992456098516, + "learning_rate": 0.00019745005198529799, + "loss": 0.7021, + "step": 375 + }, + { + "epoch": 0.10026666666666667, + "grad_norm": 0.38566329561207197, + "learning_rate": 0.00019743063353122676, + "loss": 0.7452, + "step": 376 + }, + { + "epoch": 0.10053333333333334, + "grad_norm": 0.3898370403448482, + "learning_rate": 0.00019741114238146899, + "loss": 0.7206, + "step": 377 + }, + { + "epoch": 0.1008, + "grad_norm": 0.39569847947339876, + "learning_rate": 0.00019739157855056747, + "loss": 0.7104, + "step": 378 + }, + { + "epoch": 0.10106666666666667, + "grad_norm": 0.4720163610507204, + "learning_rate": 0.00019737194205311936, + "loss": 0.7244, + "step": 379 + }, + { + "epoch": 0.10133333333333333, + "grad_norm": 0.3860555716947627, + "learning_rate": 0.00019735223290377594, + "loss": 0.67, + "step": 380 + }, + { + "epoch": 0.1016, + "grad_norm": 0.39251393253669836, + "learning_rate": 0.0001973324511172428, + "loss": 0.7087, + "step": 381 + }, + { + "epoch": 0.10186666666666666, + "grad_norm": 0.42593556820946193, + "learning_rate": 0.0001973125967082797, + "loss": 0.6885, + "step": 382 + }, + { + "epoch": 0.10213333333333334, + "grad_norm": 0.40780829567432786, + "learning_rate": 0.00019729266969170049, + "loss": 0.711, + "step": 383 + }, + { + "epoch": 0.1024, + "grad_norm": 0.4056697673225834, + "learning_rate": 0.00019727267008237334, + "loss": 0.7211, + "step": 384 + }, + { + "epoch": 0.10266666666666667, + "grad_norm": 0.39581626367201905, + "learning_rate": 0.00019725259789522045, + "loss": 0.7272, + "step": 385 + }, + { + "epoch": 0.10293333333333334, + "grad_norm": 0.373874531062953, + "learning_rate": 0.00019723245314521827, + "loss": 0.7052, + "step": 386 + }, + { + "epoch": 0.1032, + "grad_norm": 0.4068541260659157, + "learning_rate": 0.00019721223584739735, + "loss": 0.7115, + "step": 387 + }, + { + "epoch": 0.10346666666666667, + "grad_norm": 0.38118981432547044, + "learning_rate": 0.00019719194601684235, + "loss": 0.6784, + "step": 388 + }, + { + "epoch": 0.10373333333333333, + "grad_norm": 0.3881837421417207, + "learning_rate": 0.0001971715836686921, + "loss": 0.7183, + "step": 389 + }, + { + "epoch": 0.104, + "grad_norm": 0.40651980511531366, + "learning_rate": 0.0001971511488181395, + "loss": 0.6528, + "step": 390 + }, + { + "epoch": 0.10426666666666666, + "grad_norm": 0.36871056090016163, + "learning_rate": 0.00019713064148043158, + "loss": 0.7222, + "step": 391 + }, + { + "epoch": 0.10453333333333334, + "grad_norm": 0.3948854284153273, + "learning_rate": 0.00019711006167086938, + "loss": 0.7109, + "step": 392 + }, + { + "epoch": 0.1048, + "grad_norm": 0.4027591206578172, + "learning_rate": 0.0001970894094048081, + "loss": 0.716, + "step": 393 + }, + { + "epoch": 0.10506666666666667, + "grad_norm": 0.36856288639039225, + "learning_rate": 0.00019706868469765695, + "loss": 0.6629, + "step": 394 + }, + { + "epoch": 0.10533333333333333, + "grad_norm": 0.40244812724478196, + "learning_rate": 0.00019704788756487926, + "loss": 0.6666, + "step": 395 + }, + { + "epoch": 0.1056, + "grad_norm": 0.40518224265258496, + "learning_rate": 0.00019702701802199227, + "loss": 0.7695, + "step": 396 + }, + { + "epoch": 0.10586666666666666, + "grad_norm": 0.39128058766537643, + "learning_rate": 0.00019700607608456733, + "loss": 0.7157, + "step": 397 + }, + { + "epoch": 0.10613333333333333, + "grad_norm": 0.3548081988054615, + "learning_rate": 0.00019698506176822988, + "loss": 0.7052, + "step": 398 + }, + { + "epoch": 0.1064, + "grad_norm": 0.37775388733574533, + "learning_rate": 0.00019696397508865918, + "loss": 0.6781, + "step": 399 + }, + { + "epoch": 0.10666666666666667, + "grad_norm": 0.3777432616655833, + "learning_rate": 0.00019694281606158864, + "loss": 0.694, + "step": 400 + }, + { + "epoch": 0.10693333333333334, + "grad_norm": 0.3817320084513074, + "learning_rate": 0.0001969215847028056, + "loss": 0.7158, + "step": 401 + }, + { + "epoch": 0.1072, + "grad_norm": 0.36780205270365646, + "learning_rate": 0.0001969002810281513, + "loss": 0.6478, + "step": 402 + }, + { + "epoch": 0.10746666666666667, + "grad_norm": 0.3575251301932596, + "learning_rate": 0.00019687890505352108, + "loss": 0.6592, + "step": 403 + }, + { + "epoch": 0.10773333333333333, + "grad_norm": 0.36015115408778114, + "learning_rate": 0.0001968574567948641, + "loss": 0.6652, + "step": 404 + }, + { + "epoch": 0.108, + "grad_norm": 0.37514101701633856, + "learning_rate": 0.0001968359362681835, + "loss": 0.7163, + "step": 405 + }, + { + "epoch": 0.10826666666666666, + "grad_norm": 0.35848046775503867, + "learning_rate": 0.00019681434348953636, + "loss": 0.6813, + "step": 406 + }, + { + "epoch": 0.10853333333333333, + "grad_norm": 0.38195265703262476, + "learning_rate": 0.0001967926784750336, + "loss": 0.6695, + "step": 407 + }, + { + "epoch": 0.1088, + "grad_norm": 0.41766714739982413, + "learning_rate": 0.00019677094124084018, + "loss": 0.7335, + "step": 408 + }, + { + "epoch": 0.10906666666666667, + "grad_norm": 0.39122290353640826, + "learning_rate": 0.00019674913180317476, + "loss": 0.6989, + "step": 409 + }, + { + "epoch": 0.10933333333333334, + "grad_norm": 0.3974503199045698, + "learning_rate": 0.00019672725017831, + "loss": 0.6931, + "step": 410 + }, + { + "epoch": 0.1096, + "grad_norm": 0.3735666471227619, + "learning_rate": 0.0001967052963825724, + "loss": 0.6953, + "step": 411 + }, + { + "epoch": 0.10986666666666667, + "grad_norm": 0.4089716436278948, + "learning_rate": 0.00019668327043234225, + "loss": 0.7341, + "step": 412 + }, + { + "epoch": 0.11013333333333333, + "grad_norm": 0.38233747405220225, + "learning_rate": 0.00019666117234405376, + "loss": 0.6879, + "step": 413 + }, + { + "epoch": 0.1104, + "grad_norm": 0.3801004094705498, + "learning_rate": 0.0001966390021341949, + "loss": 0.673, + "step": 414 + }, + { + "epoch": 0.11066666666666666, + "grad_norm": 0.3852979399073058, + "learning_rate": 0.00019661675981930748, + "loss": 0.7, + "step": 415 + }, + { + "epoch": 0.11093333333333333, + "grad_norm": 0.36823664882709845, + "learning_rate": 0.0001965944454159871, + "loss": 0.638, + "step": 416 + }, + { + "epoch": 0.1112, + "grad_norm": 0.39215177508706645, + "learning_rate": 0.0001965720589408832, + "loss": 0.7268, + "step": 417 + }, + { + "epoch": 0.11146666666666667, + "grad_norm": 0.39177098212365996, + "learning_rate": 0.0001965496004106989, + "loss": 0.7345, + "step": 418 + }, + { + "epoch": 0.11173333333333334, + "grad_norm": 0.41500414564582316, + "learning_rate": 0.0001965270698421911, + "loss": 0.6647, + "step": 419 + }, + { + "epoch": 0.112, + "grad_norm": 0.4293857469972005, + "learning_rate": 0.00019650446725217056, + "loss": 0.7198, + "step": 420 + }, + { + "epoch": 0.11226666666666667, + "grad_norm": 0.5469028745667647, + "learning_rate": 0.00019648179265750165, + "loss": 0.7457, + "step": 421 + }, + { + "epoch": 0.11253333333333333, + "grad_norm": 0.389404673712786, + "learning_rate": 0.00019645904607510248, + "loss": 0.7336, + "step": 422 + }, + { + "epoch": 0.1128, + "grad_norm": 0.4632565101273264, + "learning_rate": 0.00019643622752194497, + "loss": 0.6786, + "step": 423 + }, + { + "epoch": 0.11306666666666666, + "grad_norm": 0.39670356476204083, + "learning_rate": 0.00019641333701505463, + "loss": 0.7391, + "step": 424 + }, + { + "epoch": 0.11333333333333333, + "grad_norm": 0.3879711271305278, + "learning_rate": 0.00019639037457151073, + "loss": 0.7339, + "step": 425 + }, + { + "epoch": 0.1136, + "grad_norm": 0.38946149940665215, + "learning_rate": 0.00019636734020844613, + "loss": 0.7366, + "step": 426 + }, + { + "epoch": 0.11386666666666667, + "grad_norm": 0.3769750012390701, + "learning_rate": 0.00019634423394304749, + "loss": 0.6679, + "step": 427 + }, + { + "epoch": 0.11413333333333334, + "grad_norm": 0.3941058498900877, + "learning_rate": 0.00019632105579255496, + "loss": 0.7171, + "step": 428 + }, + { + "epoch": 0.1144, + "grad_norm": 0.3942461239339633, + "learning_rate": 0.00019629780577426243, + "loss": 0.7284, + "step": 429 + }, + { + "epoch": 0.11466666666666667, + "grad_norm": 0.39236431276715966, + "learning_rate": 0.00019627448390551735, + "loss": 0.7804, + "step": 430 + }, + { + "epoch": 0.11493333333333333, + "grad_norm": 0.4247373584191937, + "learning_rate": 0.00019625109020372084, + "loss": 0.7512, + "step": 431 + }, + { + "epoch": 0.1152, + "grad_norm": 0.41224618842272703, + "learning_rate": 0.00019622762468632759, + "loss": 0.7288, + "step": 432 + }, + { + "epoch": 0.11546666666666666, + "grad_norm": 0.38269657954130515, + "learning_rate": 0.00019620408737084586, + "loss": 0.7338, + "step": 433 + }, + { + "epoch": 0.11573333333333333, + "grad_norm": 0.38041705459703384, + "learning_rate": 0.00019618047827483744, + "loss": 0.6806, + "step": 434 + }, + { + "epoch": 0.116, + "grad_norm": 0.3527624598512677, + "learning_rate": 0.00019615679741591784, + "loss": 0.6883, + "step": 435 + }, + { + "epoch": 0.11626666666666667, + "grad_norm": 0.40031145378579214, + "learning_rate": 0.00019613304481175595, + "loss": 0.6551, + "step": 436 + }, + { + "epoch": 0.11653333333333334, + "grad_norm": 0.38611200230215986, + "learning_rate": 0.0001961092204800742, + "loss": 0.7388, + "step": 437 + }, + { + "epoch": 0.1168, + "grad_norm": 0.39393977207600733, + "learning_rate": 0.0001960853244386486, + "loss": 0.7083, + "step": 438 + }, + { + "epoch": 0.11706666666666667, + "grad_norm": 0.4090195353783212, + "learning_rate": 0.00019606135670530872, + "loss": 0.7175, + "step": 439 + }, + { + "epoch": 0.11733333333333333, + "grad_norm": 0.3936963270607546, + "learning_rate": 0.00019603731729793747, + "loss": 0.7228, + "step": 440 + }, + { + "epoch": 0.1176, + "grad_norm": 0.38046986789467907, + "learning_rate": 0.00019601320623447132, + "loss": 0.675, + "step": 441 + }, + { + "epoch": 0.11786666666666666, + "grad_norm": 0.3968625139483729, + "learning_rate": 0.00019598902353290022, + "loss": 0.6765, + "step": 442 + }, + { + "epoch": 0.11813333333333334, + "grad_norm": 0.42326442027237987, + "learning_rate": 0.00019596476921126757, + "loss": 0.7131, + "step": 443 + }, + { + "epoch": 0.1184, + "grad_norm": 0.388244436576597, + "learning_rate": 0.00019594044328767016, + "loss": 0.701, + "step": 444 + }, + { + "epoch": 0.11866666666666667, + "grad_norm": 0.4706560893775817, + "learning_rate": 0.00019591604578025825, + "loss": 0.7012, + "step": 445 + }, + { + "epoch": 0.11893333333333334, + "grad_norm": 0.3712409004617936, + "learning_rate": 0.00019589157670723547, + "loss": 0.7553, + "step": 446 + }, + { + "epoch": 0.1192, + "grad_norm": 0.3878336725075563, + "learning_rate": 0.0001958670360868589, + "loss": 0.7075, + "step": 447 + }, + { + "epoch": 0.11946666666666667, + "grad_norm": 0.37540072132025487, + "learning_rate": 0.00019584242393743897, + "loss": 0.7147, + "step": 448 + }, + { + "epoch": 0.11973333333333333, + "grad_norm": 0.38676224850073415, + "learning_rate": 0.00019581774027733947, + "loss": 0.6761, + "step": 449 + }, + { + "epoch": 0.12, + "grad_norm": 0.3669138850766407, + "learning_rate": 0.00019579298512497758, + "loss": 0.765, + "step": 450 + }, + { + "epoch": 0.12026666666666666, + "grad_norm": 0.3613272862413039, + "learning_rate": 0.00019576815849882377, + "loss": 0.646, + "step": 451 + }, + { + "epoch": 0.12053333333333334, + "grad_norm": 0.38302150751257635, + "learning_rate": 0.0001957432604174019, + "loss": 0.6995, + "step": 452 + }, + { + "epoch": 0.1208, + "grad_norm": 0.43263743750584843, + "learning_rate": 0.0001957182908992891, + "loss": 0.7126, + "step": 453 + }, + { + "epoch": 0.12106666666666667, + "grad_norm": 0.4288227827429822, + "learning_rate": 0.00019569324996311584, + "loss": 0.7298, + "step": 454 + }, + { + "epoch": 0.12133333333333333, + "grad_norm": 0.3738395629251792, + "learning_rate": 0.00019566813762756584, + "loss": 0.7032, + "step": 455 + }, + { + "epoch": 0.1216, + "grad_norm": 0.3985200360659549, + "learning_rate": 0.0001956429539113761, + "loss": 0.6797, + "step": 456 + }, + { + "epoch": 0.12186666666666666, + "grad_norm": 0.3778754358482348, + "learning_rate": 0.00019561769883333688, + "loss": 0.6595, + "step": 457 + }, + { + "epoch": 0.12213333333333333, + "grad_norm": 0.37786666063658525, + "learning_rate": 0.00019559237241229173, + "loss": 0.6953, + "step": 458 + }, + { + "epoch": 0.1224, + "grad_norm": 0.3789340648124684, + "learning_rate": 0.00019556697466713735, + "loss": 0.761, + "step": 459 + }, + { + "epoch": 0.12266666666666666, + "grad_norm": 0.3730473524126068, + "learning_rate": 0.00019554150561682372, + "loss": 0.7201, + "step": 460 + }, + { + "epoch": 0.12293333333333334, + "grad_norm": 0.3769221787949578, + "learning_rate": 0.000195515965280354, + "loss": 0.7414, + "step": 461 + }, + { + "epoch": 0.1232, + "grad_norm": 0.38519664938922243, + "learning_rate": 0.00019549035367678451, + "loss": 0.718, + "step": 462 + }, + { + "epoch": 0.12346666666666667, + "grad_norm": 0.3699775356024406, + "learning_rate": 0.00019546467082522483, + "loss": 0.721, + "step": 463 + }, + { + "epoch": 0.12373333333333333, + "grad_norm": 0.372762708466123, + "learning_rate": 0.00019543891674483766, + "loss": 0.6597, + "step": 464 + }, + { + "epoch": 0.124, + "grad_norm": 0.36824938738061047, + "learning_rate": 0.0001954130914548387, + "loss": 0.709, + "step": 465 + }, + { + "epoch": 0.12426666666666666, + "grad_norm": 0.4107032479970567, + "learning_rate": 0.00019538719497449707, + "loss": 0.7142, + "step": 466 + }, + { + "epoch": 0.12453333333333333, + "grad_norm": 0.4254004625795781, + "learning_rate": 0.00019536122732313475, + "loss": 0.7167, + "step": 467 + }, + { + "epoch": 0.1248, + "grad_norm": 0.40441743219413473, + "learning_rate": 0.00019533518852012693, + "loss": 0.7152, + "step": 468 + }, + { + "epoch": 0.12506666666666666, + "grad_norm": 0.39060382177465347, + "learning_rate": 0.00019530907858490191, + "loss": 0.7042, + "step": 469 + }, + { + "epoch": 0.12533333333333332, + "grad_norm": 0.3817381243273451, + "learning_rate": 0.00019528289753694108, + "loss": 0.7459, + "step": 470 + }, + { + "epoch": 0.1256, + "grad_norm": 0.3967028651412326, + "learning_rate": 0.00019525664539577875, + "loss": 0.6417, + "step": 471 + }, + { + "epoch": 0.12586666666666665, + "grad_norm": 0.3939787020778525, + "learning_rate": 0.0001952303221810024, + "loss": 0.6592, + "step": 472 + }, + { + "epoch": 0.12613333333333332, + "grad_norm": 0.4100587751073264, + "learning_rate": 0.00019520392791225254, + "loss": 0.6638, + "step": 473 + }, + { + "epoch": 0.1264, + "grad_norm": 0.3986654381090219, + "learning_rate": 0.0001951774626092226, + "loss": 0.7116, + "step": 474 + }, + { + "epoch": 0.12666666666666668, + "grad_norm": 0.42074501307535356, + "learning_rate": 0.0001951509262916591, + "loss": 0.7053, + "step": 475 + }, + { + "epoch": 0.12693333333333334, + "grad_norm": 0.38149676264179544, + "learning_rate": 0.00019512431897936156, + "loss": 0.7113, + "step": 476 + }, + { + "epoch": 0.1272, + "grad_norm": 0.4189460603285744, + "learning_rate": 0.0001950976406921824, + "loss": 0.688, + "step": 477 + }, + { + "epoch": 0.12746666666666667, + "grad_norm": 0.399465157569357, + "learning_rate": 0.000195070891450027, + "loss": 0.7806, + "step": 478 + }, + { + "epoch": 0.12773333333333334, + "grad_norm": 0.3874773618021789, + "learning_rate": 0.00019504407127285376, + "loss": 0.7193, + "step": 479 + }, + { + "epoch": 0.128, + "grad_norm": 0.39690339102423877, + "learning_rate": 0.00019501718018067395, + "loss": 0.6713, + "step": 480 + }, + { + "epoch": 0.12826666666666667, + "grad_norm": 0.3912036625403481, + "learning_rate": 0.0001949902181935517, + "loss": 0.7316, + "step": 481 + }, + { + "epoch": 0.12853333333333333, + "grad_norm": 0.39128560548342606, + "learning_rate": 0.0001949631853316041, + "loss": 0.6494, + "step": 482 + }, + { + "epoch": 0.1288, + "grad_norm": 0.37267424886413625, + "learning_rate": 0.0001949360816150012, + "loss": 0.6618, + "step": 483 + }, + { + "epoch": 0.12906666666666666, + "grad_norm": 0.446165209883545, + "learning_rate": 0.00019490890706396575, + "loss": 0.7394, + "step": 484 + }, + { + "epoch": 0.12933333333333333, + "grad_norm": 0.3772600031543454, + "learning_rate": 0.0001948816616987735, + "loss": 0.7313, + "step": 485 + }, + { + "epoch": 0.1296, + "grad_norm": 0.36533180458150905, + "learning_rate": 0.0001948543455397529, + "loss": 0.6811, + "step": 486 + }, + { + "epoch": 0.12986666666666666, + "grad_norm": 0.38475823901985934, + "learning_rate": 0.00019482695860728531, + "loss": 0.6924, + "step": 487 + }, + { + "epoch": 0.13013333333333332, + "grad_norm": 0.3959141517349, + "learning_rate": 0.0001947995009218049, + "loss": 0.6533, + "step": 488 + }, + { + "epoch": 0.1304, + "grad_norm": 0.3615375786062136, + "learning_rate": 0.0001947719725037986, + "loss": 0.6732, + "step": 489 + }, + { + "epoch": 0.13066666666666665, + "grad_norm": 0.38276022814175553, + "learning_rate": 0.00019474437337380608, + "loss": 0.7338, + "step": 490 + }, + { + "epoch": 0.13093333333333335, + "grad_norm": 0.3924131689767105, + "learning_rate": 0.00019471670355241988, + "loss": 0.7005, + "step": 491 + }, + { + "epoch": 0.1312, + "grad_norm": 0.34943802477738917, + "learning_rate": 0.00019468896306028518, + "loss": 0.6368, + "step": 492 + }, + { + "epoch": 0.13146666666666668, + "grad_norm": 0.3811001180617372, + "learning_rate": 0.00019466115191809995, + "loss": 0.6497, + "step": 493 + }, + { + "epoch": 0.13173333333333334, + "grad_norm": 0.39204351851599806, + "learning_rate": 0.00019463327014661484, + "loss": 0.6287, + "step": 494 + }, + { + "epoch": 0.132, + "grad_norm": 0.39848918656273674, + "learning_rate": 0.00019460531776663317, + "loss": 0.7274, + "step": 495 + }, + { + "epoch": 0.13226666666666667, + "grad_norm": 0.3968669072852806, + "learning_rate": 0.00019457729479901103, + "loss": 0.7507, + "step": 496 + }, + { + "epoch": 0.13253333333333334, + "grad_norm": 0.39250779593378327, + "learning_rate": 0.00019454920126465715, + "loss": 0.7157, + "step": 497 + }, + { + "epoch": 0.1328, + "grad_norm": 0.3596616176352458, + "learning_rate": 0.00019452103718453284, + "loss": 0.6984, + "step": 498 + }, + { + "epoch": 0.13306666666666667, + "grad_norm": 0.37092589890152405, + "learning_rate": 0.0001944928025796521, + "loss": 0.6931, + "step": 499 + }, + { + "epoch": 0.13333333333333333, + "grad_norm": 0.3997701138574225, + "learning_rate": 0.0001944644974710816, + "loss": 0.6404, + "step": 500 + }, + { + "epoch": 0.1336, + "grad_norm": 0.39686760159172485, + "learning_rate": 0.00019443612187994053, + "loss": 0.6973, + "step": 501 + }, + { + "epoch": 0.13386666666666666, + "grad_norm": 0.377371434441508, + "learning_rate": 0.00019440767582740067, + "loss": 0.6453, + "step": 502 + }, + { + "epoch": 0.13413333333333333, + "grad_norm": 0.3725684843333112, + "learning_rate": 0.00019437915933468648, + "loss": 0.6715, + "step": 503 + }, + { + "epoch": 0.1344, + "grad_norm": 0.36606500678021225, + "learning_rate": 0.0001943505724230748, + "loss": 0.6707, + "step": 504 + }, + { + "epoch": 0.13466666666666666, + "grad_norm": 0.3661108023226464, + "learning_rate": 0.0001943219151138952, + "loss": 0.6641, + "step": 505 + }, + { + "epoch": 0.13493333333333332, + "grad_norm": 0.3732834449322749, + "learning_rate": 0.00019429318742852968, + "loss": 0.7234, + "step": 506 + }, + { + "epoch": 0.1352, + "grad_norm": 0.3893008992826764, + "learning_rate": 0.00019426438938841277, + "loss": 0.7005, + "step": 507 + }, + { + "epoch": 0.13546666666666668, + "grad_norm": 0.3687076104728676, + "learning_rate": 0.00019423552101503142, + "loss": 0.7207, + "step": 508 + }, + { + "epoch": 0.13573333333333334, + "grad_norm": 0.38796754942956857, + "learning_rate": 0.00019420658232992518, + "loss": 0.7281, + "step": 509 + }, + { + "epoch": 0.136, + "grad_norm": 0.3879548636092806, + "learning_rate": 0.00019417757335468596, + "loss": 0.6998, + "step": 510 + }, + { + "epoch": 0.13626666666666667, + "grad_norm": 0.38507133760254353, + "learning_rate": 0.0001941484941109582, + "loss": 0.6637, + "step": 511 + }, + { + "epoch": 0.13653333333333334, + "grad_norm": 0.38482947061663325, + "learning_rate": 0.00019411934462043872, + "loss": 0.7342, + "step": 512 + }, + { + "epoch": 0.1368, + "grad_norm": 0.3909204374564946, + "learning_rate": 0.00019409012490487668, + "loss": 0.7501, + "step": 513 + }, + { + "epoch": 0.13706666666666667, + "grad_norm": 0.37049107992470565, + "learning_rate": 0.00019406083498607385, + "loss": 0.6687, + "step": 514 + }, + { + "epoch": 0.13733333333333334, + "grad_norm": 0.3730924397110541, + "learning_rate": 0.00019403147488588414, + "loss": 0.7122, + "step": 515 + }, + { + "epoch": 0.1376, + "grad_norm": 0.3541371004502841, + "learning_rate": 0.000194002044626214, + "loss": 0.7239, + "step": 516 + }, + { + "epoch": 0.13786666666666667, + "grad_norm": 0.40978986375192183, + "learning_rate": 0.00019397254422902206, + "loss": 0.7464, + "step": 517 + }, + { + "epoch": 0.13813333333333333, + "grad_norm": 0.41531228006812554, + "learning_rate": 0.00019394297371631952, + "loss": 0.7357, + "step": 518 + }, + { + "epoch": 0.1384, + "grad_norm": 0.3776876049266166, + "learning_rate": 0.00019391333311016967, + "loss": 0.6312, + "step": 519 + }, + { + "epoch": 0.13866666666666666, + "grad_norm": 0.44763687900673493, + "learning_rate": 0.00019388362243268824, + "loss": 0.7614, + "step": 520 + }, + { + "epoch": 0.13893333333333333, + "grad_norm": 0.3722603187197382, + "learning_rate": 0.0001938538417060431, + "loss": 0.7074, + "step": 521 + }, + { + "epoch": 0.1392, + "grad_norm": 0.3672755771023005, + "learning_rate": 0.00019382399095245454, + "loss": 0.7071, + "step": 522 + }, + { + "epoch": 0.13946666666666666, + "grad_norm": 0.3722366282524974, + "learning_rate": 0.000193794070194195, + "loss": 0.6997, + "step": 523 + }, + { + "epoch": 0.13973333333333332, + "grad_norm": 0.3738321181849173, + "learning_rate": 0.0001937640794535892, + "loss": 0.6596, + "step": 524 + }, + { + "epoch": 0.14, + "grad_norm": 0.39904992894995667, + "learning_rate": 0.00019373401875301407, + "loss": 0.6974, + "step": 525 + }, + { + "epoch": 0.14026666666666668, + "grad_norm": 0.37494149466913873, + "learning_rate": 0.00019370388811489872, + "loss": 0.7086, + "step": 526 + }, + { + "epoch": 0.14053333333333334, + "grad_norm": 0.37347478013109475, + "learning_rate": 0.00019367368756172443, + "loss": 0.7236, + "step": 527 + }, + { + "epoch": 0.1408, + "grad_norm": 0.34784329622429616, + "learning_rate": 0.0001936434171160247, + "loss": 0.6935, + "step": 528 + }, + { + "epoch": 0.14106666666666667, + "grad_norm": 0.3750156617703292, + "learning_rate": 0.00019361307680038517, + "loss": 0.739, + "step": 529 + }, + { + "epoch": 0.14133333333333334, + "grad_norm": 0.37232606672390345, + "learning_rate": 0.0001935826666374435, + "loss": 0.6925, + "step": 530 + }, + { + "epoch": 0.1416, + "grad_norm": 0.39762486938390995, + "learning_rate": 0.0001935521866498896, + "loss": 0.7205, + "step": 531 + }, + { + "epoch": 0.14186666666666667, + "grad_norm": 0.38044493859724926, + "learning_rate": 0.00019352163686046545, + "loss": 0.6796, + "step": 532 + }, + { + "epoch": 0.14213333333333333, + "grad_norm": 0.39295855919122624, + "learning_rate": 0.00019349101729196507, + "loss": 0.6914, + "step": 533 + }, + { + "epoch": 0.1424, + "grad_norm": 0.45628995595205124, + "learning_rate": 0.00019346032796723454, + "loss": 0.6757, + "step": 534 + }, + { + "epoch": 0.14266666666666666, + "grad_norm": 0.37460111359597603, + "learning_rate": 0.00019342956890917209, + "loss": 0.7017, + "step": 535 + }, + { + "epoch": 0.14293333333333333, + "grad_norm": 0.38274088967527936, + "learning_rate": 0.00019339874014072782, + "loss": 0.7232, + "step": 536 + }, + { + "epoch": 0.1432, + "grad_norm": 0.3722286636113766, + "learning_rate": 0.00019336784168490396, + "loss": 0.7645, + "step": 537 + }, + { + "epoch": 0.14346666666666666, + "grad_norm": 0.39360083837185894, + "learning_rate": 0.00019333687356475472, + "loss": 0.6773, + "step": 538 + }, + { + "epoch": 0.14373333333333332, + "grad_norm": 0.4105223572181332, + "learning_rate": 0.00019330583580338622, + "loss": 0.7111, + "step": 539 + }, + { + "epoch": 0.144, + "grad_norm": 0.3927114267160294, + "learning_rate": 0.00019327472842395666, + "loss": 0.6867, + "step": 540 + }, + { + "epoch": 0.14426666666666665, + "grad_norm": 0.3767313755097996, + "learning_rate": 0.00019324355144967605, + "loss": 0.7062, + "step": 541 + }, + { + "epoch": 0.14453333333333335, + "grad_norm": 0.3877264118901853, + "learning_rate": 0.00019321230490380642, + "loss": 0.6426, + "step": 542 + }, + { + "epoch": 0.1448, + "grad_norm": 0.3760745762306027, + "learning_rate": 0.00019318098880966172, + "loss": 0.7041, + "step": 543 + }, + { + "epoch": 0.14506666666666668, + "grad_norm": 0.3878799886567483, + "learning_rate": 0.00019314960319060767, + "loss": 0.6825, + "step": 544 + }, + { + "epoch": 0.14533333333333334, + "grad_norm": 0.3798409262367505, + "learning_rate": 0.00019311814807006198, + "loss": 0.7019, + "step": 545 + }, + { + "epoch": 0.1456, + "grad_norm": 0.4760867935296923, + "learning_rate": 0.00019308662347149421, + "loss": 0.7003, + "step": 546 + }, + { + "epoch": 0.14586666666666667, + "grad_norm": 0.36622394318646834, + "learning_rate": 0.00019305502941842573, + "loss": 0.7382, + "step": 547 + }, + { + "epoch": 0.14613333333333334, + "grad_norm": 0.35870682999037357, + "learning_rate": 0.00019302336593442972, + "loss": 0.6733, + "step": 548 + }, + { + "epoch": 0.1464, + "grad_norm": 0.3977432873452541, + "learning_rate": 0.00019299163304313118, + "loss": 0.7252, + "step": 549 + }, + { + "epoch": 0.14666666666666667, + "grad_norm": 0.3818882114328452, + "learning_rate": 0.00019295983076820687, + "loss": 0.6856, + "step": 550 + }, + { + "epoch": 0.14693333333333333, + "grad_norm": 0.39394746748151416, + "learning_rate": 0.00019292795913338542, + "loss": 0.7047, + "step": 551 + }, + { + "epoch": 0.1472, + "grad_norm": 0.36698869326363837, + "learning_rate": 0.00019289601816244707, + "loss": 0.6528, + "step": 552 + }, + { + "epoch": 0.14746666666666666, + "grad_norm": 0.3730057549400165, + "learning_rate": 0.0001928640078792239, + "loss": 0.6872, + "step": 553 + }, + { + "epoch": 0.14773333333333333, + "grad_norm": 0.38174600345017096, + "learning_rate": 0.0001928319283075996, + "loss": 0.6967, + "step": 554 + }, + { + "epoch": 0.148, + "grad_norm": 0.3643558499171911, + "learning_rate": 0.0001927997794715097, + "loss": 0.6972, + "step": 555 + }, + { + "epoch": 0.14826666666666666, + "grad_norm": 0.3871930364023789, + "learning_rate": 0.00019276756139494132, + "loss": 0.6772, + "step": 556 + }, + { + "epoch": 0.14853333333333332, + "grad_norm": 0.4107975484164306, + "learning_rate": 0.00019273527410193324, + "loss": 0.7113, + "step": 557 + }, + { + "epoch": 0.1488, + "grad_norm": 0.3725140955042874, + "learning_rate": 0.0001927029176165759, + "loss": 0.6494, + "step": 558 + }, + { + "epoch": 0.14906666666666665, + "grad_norm": 0.3860544287407251, + "learning_rate": 0.00019267049196301135, + "loss": 0.6827, + "step": 559 + }, + { + "epoch": 0.14933333333333335, + "grad_norm": 0.34947671081582554, + "learning_rate": 0.00019263799716543335, + "loss": 0.6748, + "step": 560 + }, + { + "epoch": 0.1496, + "grad_norm": 0.35548483210574205, + "learning_rate": 0.00019260543324808705, + "loss": 0.7127, + "step": 561 + }, + { + "epoch": 0.14986666666666668, + "grad_norm": 0.3508317019595235, + "learning_rate": 0.00019257280023526936, + "loss": 0.6872, + "step": 562 + }, + { + "epoch": 0.15013333333333334, + "grad_norm": 0.3641977172837662, + "learning_rate": 0.00019254009815132864, + "loss": 0.69, + "step": 563 + }, + { + "epoch": 0.1504, + "grad_norm": 0.3862822763071805, + "learning_rate": 0.00019250732702066488, + "loss": 0.6951, + "step": 564 + }, + { + "epoch": 0.15066666666666667, + "grad_norm": 0.40894636273487134, + "learning_rate": 0.00019247448686772944, + "loss": 0.6514, + "step": 565 + }, + { + "epoch": 0.15093333333333334, + "grad_norm": 0.3693964312769452, + "learning_rate": 0.00019244157771702532, + "loss": 0.6528, + "step": 566 + }, + { + "epoch": 0.1512, + "grad_norm": 0.3728179985163386, + "learning_rate": 0.0001924085995931069, + "loss": 0.6728, + "step": 567 + }, + { + "epoch": 0.15146666666666667, + "grad_norm": 0.37524843842900835, + "learning_rate": 0.00019237555252058015, + "loss": 0.6731, + "step": 568 + }, + { + "epoch": 0.15173333333333333, + "grad_norm": 0.3650361301148371, + "learning_rate": 0.00019234243652410232, + "loss": 0.6677, + "step": 569 + }, + { + "epoch": 0.152, + "grad_norm": 0.38124744091028967, + "learning_rate": 0.0001923092516283822, + "loss": 0.6578, + "step": 570 + }, + { + "epoch": 0.15226666666666666, + "grad_norm": 0.37174150058188726, + "learning_rate": 0.00019227599785817998, + "loss": 0.6614, + "step": 571 + }, + { + "epoch": 0.15253333333333333, + "grad_norm": 0.3658327860581027, + "learning_rate": 0.00019224267523830716, + "loss": 0.7255, + "step": 572 + }, + { + "epoch": 0.1528, + "grad_norm": 0.36609790952934385, + "learning_rate": 0.00019220928379362672, + "loss": 0.7278, + "step": 573 + }, + { + "epoch": 0.15306666666666666, + "grad_norm": 0.37194123384017186, + "learning_rate": 0.00019217582354905295, + "loss": 0.6975, + "step": 574 + }, + { + "epoch": 0.15333333333333332, + "grad_norm": 0.37098255942659086, + "learning_rate": 0.0001921422945295514, + "loss": 0.6824, + "step": 575 + }, + { + "epoch": 0.1536, + "grad_norm": 0.39104178693675107, + "learning_rate": 0.00019210869676013906, + "loss": 0.7462, + "step": 576 + }, + { + "epoch": 0.15386666666666668, + "grad_norm": 0.3647716235710022, + "learning_rate": 0.00019207503026588406, + "loss": 0.7252, + "step": 577 + }, + { + "epoch": 0.15413333333333334, + "grad_norm": 0.3834933785003577, + "learning_rate": 0.00019204129507190604, + "loss": 0.7185, + "step": 578 + }, + { + "epoch": 0.1544, + "grad_norm": 0.37896481990941155, + "learning_rate": 0.00019200749120337567, + "loss": 0.7405, + "step": 579 + }, + { + "epoch": 0.15466666666666667, + "grad_norm": 0.3736305411570718, + "learning_rate": 0.00019197361868551494, + "loss": 0.723, + "step": 580 + }, + { + "epoch": 0.15493333333333334, + "grad_norm": 0.3806871543208302, + "learning_rate": 0.00019193967754359715, + "loss": 0.6891, + "step": 581 + }, + { + "epoch": 0.1552, + "grad_norm": 0.3932025476034644, + "learning_rate": 0.00019190566780294662, + "loss": 0.67, + "step": 582 + }, + { + "epoch": 0.15546666666666667, + "grad_norm": 0.37005181168015094, + "learning_rate": 0.000191871589488939, + "loss": 0.6708, + "step": 583 + }, + { + "epoch": 0.15573333333333333, + "grad_norm": 0.37698964774570576, + "learning_rate": 0.00019183744262700112, + "loss": 0.6982, + "step": 584 + }, + { + "epoch": 0.156, + "grad_norm": 0.39283562495456964, + "learning_rate": 0.00019180322724261082, + "loss": 0.6928, + "step": 585 + }, + { + "epoch": 0.15626666666666666, + "grad_norm": 0.3847267098844669, + "learning_rate": 0.00019176894336129716, + "loss": 0.6919, + "step": 586 + }, + { + "epoch": 0.15653333333333333, + "grad_norm": 0.36784451810982993, + "learning_rate": 0.00019173459100864032, + "loss": 0.6739, + "step": 587 + }, + { + "epoch": 0.1568, + "grad_norm": 0.4208591638419817, + "learning_rate": 0.00019170017021027152, + "loss": 0.744, + "step": 588 + }, + { + "epoch": 0.15706666666666666, + "grad_norm": 0.35266156318845115, + "learning_rate": 0.00019166568099187304, + "loss": 0.6549, + "step": 589 + }, + { + "epoch": 0.15733333333333333, + "grad_norm": 0.38803183828469207, + "learning_rate": 0.0001916311233791783, + "loss": 0.6723, + "step": 590 + }, + { + "epoch": 0.1576, + "grad_norm": 0.3624907180591766, + "learning_rate": 0.00019159649739797162, + "loss": 0.6929, + "step": 591 + }, + { + "epoch": 0.15786666666666666, + "grad_norm": 0.3832990684783281, + "learning_rate": 0.00019156180307408846, + "loss": 0.7006, + "step": 592 + }, + { + "epoch": 0.15813333333333332, + "grad_norm": 0.3849305246144806, + "learning_rate": 0.0001915270404334152, + "loss": 0.6389, + "step": 593 + }, + { + "epoch": 0.1584, + "grad_norm": 0.3846659635172018, + "learning_rate": 0.00019149220950188917, + "loss": 0.7154, + "step": 594 + }, + { + "epoch": 0.15866666666666668, + "grad_norm": 0.39216567763845006, + "learning_rate": 0.0001914573103054987, + "loss": 0.7012, + "step": 595 + }, + { + "epoch": 0.15893333333333334, + "grad_norm": 0.38246961098197996, + "learning_rate": 0.00019142234287028312, + "loss": 0.6521, + "step": 596 + }, + { + "epoch": 0.1592, + "grad_norm": 0.3704981314071925, + "learning_rate": 0.00019138730722233248, + "loss": 0.6531, + "step": 597 + }, + { + "epoch": 0.15946666666666667, + "grad_norm": 0.3899946091619324, + "learning_rate": 0.00019135220338778797, + "loss": 0.693, + "step": 598 + }, + { + "epoch": 0.15973333333333334, + "grad_norm": 0.3672076618385661, + "learning_rate": 0.0001913170313928414, + "loss": 0.6759, + "step": 599 + }, + { + "epoch": 0.16, + "grad_norm": 0.40260111828210965, + "learning_rate": 0.00019128179126373567, + "loss": 0.7136, + "step": 600 + }, + { + "epoch": 0.16026666666666667, + "grad_norm": 0.3959889909912464, + "learning_rate": 0.00019124648302676434, + "loss": 0.7347, + "step": 601 + }, + { + "epoch": 0.16053333333333333, + "grad_norm": 0.37393190727411146, + "learning_rate": 0.00019121110670827193, + "loss": 0.6708, + "step": 602 + }, + { + "epoch": 0.1608, + "grad_norm": 0.37990440438717593, + "learning_rate": 0.00019117566233465362, + "loss": 0.6772, + "step": 603 + }, + { + "epoch": 0.16106666666666666, + "grad_norm": 0.3990101216733403, + "learning_rate": 0.00019114014993235553, + "loss": 0.7077, + "step": 604 + }, + { + "epoch": 0.16133333333333333, + "grad_norm": 0.36806391265789873, + "learning_rate": 0.00019110456952787432, + "loss": 0.6617, + "step": 605 + }, + { + "epoch": 0.1616, + "grad_norm": 0.3680440018563989, + "learning_rate": 0.00019106892114775762, + "loss": 0.6736, + "step": 606 + }, + { + "epoch": 0.16186666666666666, + "grad_norm": 0.3770635583222423, + "learning_rate": 0.0001910332048186036, + "loss": 0.6786, + "step": 607 + }, + { + "epoch": 0.16213333333333332, + "grad_norm": 0.36781987369544633, + "learning_rate": 0.00019099742056706123, + "loss": 0.6597, + "step": 608 + }, + { + "epoch": 0.1624, + "grad_norm": 0.4188436166773392, + "learning_rate": 0.00019096156841983013, + "loss": 0.6706, + "step": 609 + }, + { + "epoch": 0.16266666666666665, + "grad_norm": 0.3974136941701652, + "learning_rate": 0.00019092564840366056, + "loss": 0.7326, + "step": 610 + }, + { + "epoch": 0.16293333333333335, + "grad_norm": 0.3688977129953491, + "learning_rate": 0.0001908896605453535, + "loss": 0.6486, + "step": 611 + }, + { + "epoch": 0.1632, + "grad_norm": 0.36998873237097046, + "learning_rate": 0.00019085360487176037, + "loss": 0.6528, + "step": 612 + }, + { + "epoch": 0.16346666666666668, + "grad_norm": 0.3725068864338746, + "learning_rate": 0.0001908174814097834, + "loss": 0.6985, + "step": 613 + }, + { + "epoch": 0.16373333333333334, + "grad_norm": 0.37511559165531033, + "learning_rate": 0.00019078129018637528, + "loss": 0.6815, + "step": 614 + }, + { + "epoch": 0.164, + "grad_norm": 0.3761365006708317, + "learning_rate": 0.00019074503122853924, + "loss": 0.6695, + "step": 615 + }, + { + "epoch": 0.16426666666666667, + "grad_norm": 0.37067365233924277, + "learning_rate": 0.00019070870456332914, + "loss": 0.6804, + "step": 616 + }, + { + "epoch": 0.16453333333333334, + "grad_norm": 0.33584045515665806, + "learning_rate": 0.00019067231021784929, + "loss": 0.616, + "step": 617 + }, + { + "epoch": 0.1648, + "grad_norm": 0.37880782554632325, + "learning_rate": 0.0001906358482192545, + "loss": 0.6822, + "step": 618 + }, + { + "epoch": 0.16506666666666667, + "grad_norm": 0.36710286954036647, + "learning_rate": 0.00019059931859475012, + "loss": 0.6798, + "step": 619 + }, + { + "epoch": 0.16533333333333333, + "grad_norm": 0.38456640408824266, + "learning_rate": 0.00019056272137159188, + "loss": 0.6911, + "step": 620 + }, + { + "epoch": 0.1656, + "grad_norm": 0.43076307050161555, + "learning_rate": 0.00019052605657708595, + "loss": 0.7397, + "step": 621 + }, + { + "epoch": 0.16586666666666666, + "grad_norm": 0.3843910559308764, + "learning_rate": 0.000190489324238589, + "loss": 0.765, + "step": 622 + }, + { + "epoch": 0.16613333333333333, + "grad_norm": 0.35023931610344894, + "learning_rate": 0.00019045252438350802, + "loss": 0.6891, + "step": 623 + }, + { + "epoch": 0.1664, + "grad_norm": 0.3684146530025913, + "learning_rate": 0.0001904156570393004, + "loss": 0.6723, + "step": 624 + }, + { + "epoch": 0.16666666666666666, + "grad_norm": 0.37507060892962285, + "learning_rate": 0.00019037872223347387, + "loss": 0.7164, + "step": 625 + }, + { + "epoch": 0.16693333333333332, + "grad_norm": 0.3862607727860442, + "learning_rate": 0.00019034171999358655, + "loss": 0.7107, + "step": 626 + }, + { + "epoch": 0.1672, + "grad_norm": 0.39292680630064536, + "learning_rate": 0.00019030465034724676, + "loss": 0.7351, + "step": 627 + }, + { + "epoch": 0.16746666666666668, + "grad_norm": 0.38247952716548084, + "learning_rate": 0.00019026751332211324, + "loss": 0.693, + "step": 628 + }, + { + "epoch": 0.16773333333333335, + "grad_norm": 0.39190391476563474, + "learning_rate": 0.00019023030894589496, + "loss": 0.701, + "step": 629 + }, + { + "epoch": 0.168, + "grad_norm": 0.3685822952964325, + "learning_rate": 0.0001901930372463511, + "loss": 0.697, + "step": 630 + }, + { + "epoch": 0.16826666666666668, + "grad_norm": 0.390305551735311, + "learning_rate": 0.0001901556982512911, + "loss": 0.6797, + "step": 631 + }, + { + "epoch": 0.16853333333333334, + "grad_norm": 0.40394755438945706, + "learning_rate": 0.00019011829198857467, + "loss": 0.6542, + "step": 632 + }, + { + "epoch": 0.1688, + "grad_norm": 0.3641097467415087, + "learning_rate": 0.0001900808184861116, + "loss": 0.6735, + "step": 633 + }, + { + "epoch": 0.16906666666666667, + "grad_norm": 0.38171692349208125, + "learning_rate": 0.00019004327777186192, + "loss": 0.7438, + "step": 634 + }, + { + "epoch": 0.16933333333333334, + "grad_norm": 0.364935435068997, + "learning_rate": 0.00019000566987383583, + "loss": 0.7249, + "step": 635 + }, + { + "epoch": 0.1696, + "grad_norm": 0.3745818024901291, + "learning_rate": 0.00018996799482009352, + "loss": 0.703, + "step": 636 + }, + { + "epoch": 0.16986666666666667, + "grad_norm": 0.3816239881383332, + "learning_rate": 0.0001899302526387455, + "loss": 0.6274, + "step": 637 + }, + { + "epoch": 0.17013333333333333, + "grad_norm": 0.37760949844648883, + "learning_rate": 0.00018989244335795223, + "loss": 0.6202, + "step": 638 + }, + { + "epoch": 0.1704, + "grad_norm": 0.3991502273105771, + "learning_rate": 0.0001898545670059242, + "loss": 0.7343, + "step": 639 + }, + { + "epoch": 0.17066666666666666, + "grad_norm": 0.4644524349583215, + "learning_rate": 0.00018981662361092206, + "loss": 0.7537, + "step": 640 + }, + { + "epoch": 0.17093333333333333, + "grad_norm": 0.3743258046392929, + "learning_rate": 0.0001897786132012564, + "loss": 0.7124, + "step": 641 + }, + { + "epoch": 0.1712, + "grad_norm": 0.37129701740053306, + "learning_rate": 0.00018974053580528784, + "loss": 0.6693, + "step": 642 + }, + { + "epoch": 0.17146666666666666, + "grad_norm": 0.3976736544219594, + "learning_rate": 0.000189702391451427, + "loss": 0.7239, + "step": 643 + }, + { + "epoch": 0.17173333333333332, + "grad_norm": 0.40436774671596665, + "learning_rate": 0.00018966418016813443, + "loss": 0.758, + "step": 644 + }, + { + "epoch": 0.172, + "grad_norm": 0.39558895992871945, + "learning_rate": 0.00018962590198392057, + "loss": 0.7048, + "step": 645 + }, + { + "epoch": 0.17226666666666668, + "grad_norm": 0.4039053861116352, + "learning_rate": 0.0001895875569273459, + "loss": 0.6667, + "step": 646 + }, + { + "epoch": 0.17253333333333334, + "grad_norm": 0.3832393253577056, + "learning_rate": 0.00018954914502702068, + "loss": 0.7354, + "step": 647 + }, + { + "epoch": 0.1728, + "grad_norm": 0.39840124059988224, + "learning_rate": 0.00018951066631160511, + "loss": 0.7019, + "step": 648 + }, + { + "epoch": 0.17306666666666667, + "grad_norm": 0.3821003277970343, + "learning_rate": 0.0001894721208098092, + "loss": 0.655, + "step": 649 + }, + { + "epoch": 0.17333333333333334, + "grad_norm": 0.4031718649940101, + "learning_rate": 0.00018943350855039285, + "loss": 0.6894, + "step": 650 + }, + { + "epoch": 0.1736, + "grad_norm": 0.38452117467402436, + "learning_rate": 0.00018939482956216572, + "loss": 0.7054, + "step": 651 + }, + { + "epoch": 0.17386666666666667, + "grad_norm": 0.37693538774583135, + "learning_rate": 0.00018935608387398727, + "loss": 0.6349, + "step": 652 + }, + { + "epoch": 0.17413333333333333, + "grad_norm": 0.3697648619397049, + "learning_rate": 0.00018931727151476671, + "loss": 0.6857, + "step": 653 + }, + { + "epoch": 0.1744, + "grad_norm": 0.40125499460001507, + "learning_rate": 0.00018927839251346303, + "loss": 0.7577, + "step": 654 + }, + { + "epoch": 0.17466666666666666, + "grad_norm": 0.35724768359663395, + "learning_rate": 0.00018923944689908494, + "loss": 0.6452, + "step": 655 + }, + { + "epoch": 0.17493333333333333, + "grad_norm": 0.3723289881457659, + "learning_rate": 0.00018920043470069077, + "loss": 0.6977, + "step": 656 + }, + { + "epoch": 0.1752, + "grad_norm": 0.37662720192702304, + "learning_rate": 0.0001891613559473887, + "loss": 0.6155, + "step": 657 + }, + { + "epoch": 0.17546666666666666, + "grad_norm": 0.39095978551764027, + "learning_rate": 0.0001891222106683364, + "loss": 0.6831, + "step": 658 + }, + { + "epoch": 0.17573333333333332, + "grad_norm": 0.3953272394697602, + "learning_rate": 0.00018908299889274128, + "loss": 0.7185, + "step": 659 + }, + { + "epoch": 0.176, + "grad_norm": 0.38921264053878885, + "learning_rate": 0.0001890437206498603, + "loss": 0.6773, + "step": 660 + }, + { + "epoch": 0.17626666666666665, + "grad_norm": 0.4363970184028858, + "learning_rate": 0.00018900437596900007, + "loss": 0.7083, + "step": 661 + }, + { + "epoch": 0.17653333333333332, + "grad_norm": 0.35566662636829927, + "learning_rate": 0.0001889649648795167, + "loss": 0.6541, + "step": 662 + }, + { + "epoch": 0.1768, + "grad_norm": 0.35499374433455266, + "learning_rate": 0.00018892548741081592, + "loss": 0.6732, + "step": 663 + }, + { + "epoch": 0.17706666666666668, + "grad_norm": 0.3798882778924792, + "learning_rate": 0.00018888594359235295, + "loss": 0.7193, + "step": 664 + }, + { + "epoch": 0.17733333333333334, + "grad_norm": 0.37463819001129894, + "learning_rate": 0.00018884633345363257, + "loss": 0.7098, + "step": 665 + }, + { + "epoch": 0.1776, + "grad_norm": 0.3707520243831594, + "learning_rate": 0.00018880665702420893, + "loss": 0.6652, + "step": 666 + }, + { + "epoch": 0.17786666666666667, + "grad_norm": 0.38359257838556565, + "learning_rate": 0.00018876691433368577, + "loss": 0.7044, + "step": 667 + }, + { + "epoch": 0.17813333333333334, + "grad_norm": 0.3498781616388173, + "learning_rate": 0.00018872710541171614, + "loss": 0.6999, + "step": 668 + }, + { + "epoch": 0.1784, + "grad_norm": 0.3493419838565732, + "learning_rate": 0.00018868723028800263, + "loss": 0.6425, + "step": 669 + }, + { + "epoch": 0.17866666666666667, + "grad_norm": 0.36423931168153995, + "learning_rate": 0.00018864728899229717, + "loss": 0.6733, + "step": 670 + }, + { + "epoch": 0.17893333333333333, + "grad_norm": 0.36980669313173353, + "learning_rate": 0.00018860728155440106, + "loss": 0.6896, + "step": 671 + }, + { + "epoch": 0.1792, + "grad_norm": 0.3565752863470386, + "learning_rate": 0.00018856720800416494, + "loss": 0.672, + "step": 672 + }, + { + "epoch": 0.17946666666666666, + "grad_norm": 0.35198793277813784, + "learning_rate": 0.0001885270683714888, + "loss": 0.6548, + "step": 673 + }, + { + "epoch": 0.17973333333333333, + "grad_norm": 0.37550168014656987, + "learning_rate": 0.00018848686268632193, + "loss": 0.6914, + "step": 674 + }, + { + "epoch": 0.18, + "grad_norm": 0.37318313956676913, + "learning_rate": 0.0001884465909786629, + "loss": 0.7396, + "step": 675 + }, + { + "epoch": 0.18026666666666666, + "grad_norm": 0.37129209587571155, + "learning_rate": 0.0001884062532785595, + "loss": 0.7052, + "step": 676 + }, + { + "epoch": 0.18053333333333332, + "grad_norm": 0.3756634716439558, + "learning_rate": 0.00018836584961610887, + "loss": 0.6628, + "step": 677 + }, + { + "epoch": 0.1808, + "grad_norm": 0.3685808183977085, + "learning_rate": 0.00018832538002145727, + "loss": 0.6351, + "step": 678 + }, + { + "epoch": 0.18106666666666665, + "grad_norm": 0.3585546951446435, + "learning_rate": 0.00018828484452480023, + "loss": 0.7001, + "step": 679 + }, + { + "epoch": 0.18133333333333335, + "grad_norm": 0.3635651527327575, + "learning_rate": 0.00018824424315638233, + "loss": 0.6577, + "step": 680 + }, + { + "epoch": 0.1816, + "grad_norm": 0.4062132118445034, + "learning_rate": 0.00018820357594649738, + "loss": 0.6699, + "step": 681 + }, + { + "epoch": 0.18186666666666668, + "grad_norm": 0.39259652521345406, + "learning_rate": 0.00018816284292548833, + "loss": 0.688, + "step": 682 + }, + { + "epoch": 0.18213333333333334, + "grad_norm": 0.4034976908204871, + "learning_rate": 0.00018812204412374723, + "loss": 0.6796, + "step": 683 + }, + { + "epoch": 0.1824, + "grad_norm": 0.35753477880743434, + "learning_rate": 0.00018808117957171518, + "loss": 0.6556, + "step": 684 + }, + { + "epoch": 0.18266666666666667, + "grad_norm": 0.3773797963092146, + "learning_rate": 0.00018804024929988233, + "loss": 0.7251, + "step": 685 + }, + { + "epoch": 0.18293333333333334, + "grad_norm": 0.3937096142107768, + "learning_rate": 0.0001879992533387879, + "loss": 0.7266, + "step": 686 + }, + { + "epoch": 0.1832, + "grad_norm": 0.3559842332949686, + "learning_rate": 0.00018795819171902014, + "loss": 0.7054, + "step": 687 + }, + { + "epoch": 0.18346666666666667, + "grad_norm": 0.38416397544317316, + "learning_rate": 0.00018791706447121622, + "loss": 0.6845, + "step": 688 + }, + { + "epoch": 0.18373333333333333, + "grad_norm": 0.37566552918129953, + "learning_rate": 0.00018787587162606231, + "loss": 0.723, + "step": 689 + }, + { + "epoch": 0.184, + "grad_norm": 0.3854055545132654, + "learning_rate": 0.00018783461321429353, + "loss": 0.7013, + "step": 690 + }, + { + "epoch": 0.18426666666666666, + "grad_norm": 0.38311731286896406, + "learning_rate": 0.00018779328926669397, + "loss": 0.7069, + "step": 691 + }, + { + "epoch": 0.18453333333333333, + "grad_norm": 0.3829186839362291, + "learning_rate": 0.00018775189981409652, + "loss": 0.6708, + "step": 692 + }, + { + "epoch": 0.1848, + "grad_norm": 0.38912545616839017, + "learning_rate": 0.00018771044488738299, + "loss": 0.694, + "step": 693 + }, + { + "epoch": 0.18506666666666666, + "grad_norm": 0.3933253628815094, + "learning_rate": 0.00018766892451748407, + "loss": 0.7311, + "step": 694 + }, + { + "epoch": 0.18533333333333332, + "grad_norm": 0.3765880474272644, + "learning_rate": 0.0001876273387353793, + "loss": 0.6928, + "step": 695 + }, + { + "epoch": 0.1856, + "grad_norm": 0.3772820075421581, + "learning_rate": 0.00018758568757209685, + "loss": 0.6824, + "step": 696 + }, + { + "epoch": 0.18586666666666668, + "grad_norm": 0.3729809873038056, + "learning_rate": 0.0001875439710587139, + "loss": 0.6719, + "step": 697 + }, + { + "epoch": 0.18613333333333335, + "grad_norm": 0.37035412471318624, + "learning_rate": 0.00018750218922635633, + "loss": 0.6914, + "step": 698 + }, + { + "epoch": 0.1864, + "grad_norm": 0.3801581932418795, + "learning_rate": 0.0001874603421061986, + "loss": 0.6536, + "step": 699 + }, + { + "epoch": 0.18666666666666668, + "grad_norm": 0.3885577142479478, + "learning_rate": 0.0001874184297294641, + "loss": 0.6715, + "step": 700 + }, + { + "epoch": 0.18693333333333334, + "grad_norm": 0.38220258521124545, + "learning_rate": 0.00018737645212742475, + "loss": 0.7083, + "step": 701 + }, + { + "epoch": 0.1872, + "grad_norm": 0.3618802957433504, + "learning_rate": 0.00018733440933140126, + "loss": 0.7219, + "step": 702 + }, + { + "epoch": 0.18746666666666667, + "grad_norm": 0.3625799662149451, + "learning_rate": 0.00018729230137276285, + "loss": 0.6565, + "step": 703 + }, + { + "epoch": 0.18773333333333334, + "grad_norm": 0.38285072803778125, + "learning_rate": 0.0001872501282829275, + "loss": 0.668, + "step": 704 + }, + { + "epoch": 0.188, + "grad_norm": 0.36021955278262335, + "learning_rate": 0.00018720789009336167, + "loss": 0.6385, + "step": 705 + }, + { + "epoch": 0.18826666666666667, + "grad_norm": 0.3768393533352249, + "learning_rate": 0.00018716558683558044, + "loss": 0.705, + "step": 706 + }, + { + "epoch": 0.18853333333333333, + "grad_norm": 0.37254652608248173, + "learning_rate": 0.00018712321854114748, + "loss": 0.6633, + "step": 707 + }, + { + "epoch": 0.1888, + "grad_norm": 0.36355687961053096, + "learning_rate": 0.00018708078524167488, + "loss": 0.6509, + "step": 708 + }, + { + "epoch": 0.18906666666666666, + "grad_norm": 0.3706284035093737, + "learning_rate": 0.00018703828696882337, + "loss": 0.6912, + "step": 709 + }, + { + "epoch": 0.18933333333333333, + "grad_norm": 0.40500471249350956, + "learning_rate": 0.00018699572375430206, + "loss": 0.6926, + "step": 710 + }, + { + "epoch": 0.1896, + "grad_norm": 0.3908052292451713, + "learning_rate": 0.0001869530956298685, + "loss": 0.719, + "step": 711 + }, + { + "epoch": 0.18986666666666666, + "grad_norm": 0.3741685035500202, + "learning_rate": 0.0001869104026273288, + "loss": 0.6612, + "step": 712 + }, + { + "epoch": 0.19013333333333332, + "grad_norm": 0.371074216203983, + "learning_rate": 0.00018686764477853724, + "loss": 0.7273, + "step": 713 + }, + { + "epoch": 0.1904, + "grad_norm": 0.36509941624145703, + "learning_rate": 0.00018682482211539677, + "loss": 0.6396, + "step": 714 + }, + { + "epoch": 0.19066666666666668, + "grad_norm": 0.3497839006663842, + "learning_rate": 0.0001867819346698585, + "loss": 0.6242, + "step": 715 + }, + { + "epoch": 0.19093333333333334, + "grad_norm": 0.3537148420688285, + "learning_rate": 0.00018673898247392197, + "loss": 0.6216, + "step": 716 + }, + { + "epoch": 0.1912, + "grad_norm": 0.38854314638329096, + "learning_rate": 0.00018669596555963497, + "loss": 0.6639, + "step": 717 + }, + { + "epoch": 0.19146666666666667, + "grad_norm": 0.38526262005188155, + "learning_rate": 0.00018665288395909363, + "loss": 0.6941, + "step": 718 + }, + { + "epoch": 0.19173333333333334, + "grad_norm": 0.3909003225026855, + "learning_rate": 0.00018660973770444228, + "loss": 0.7138, + "step": 719 + }, + { + "epoch": 0.192, + "grad_norm": 0.35323993123136765, + "learning_rate": 0.00018656652682787358, + "loss": 0.6824, + "step": 720 + }, + { + "epoch": 0.19226666666666667, + "grad_norm": 0.37616707591461246, + "learning_rate": 0.00018652325136162833, + "loss": 0.6643, + "step": 721 + }, + { + "epoch": 0.19253333333333333, + "grad_norm": 0.3771200028723444, + "learning_rate": 0.00018647991133799558, + "loss": 0.7281, + "step": 722 + }, + { + "epoch": 0.1928, + "grad_norm": 0.39027530921813985, + "learning_rate": 0.00018643650678931248, + "loss": 0.6906, + "step": 723 + }, + { + "epoch": 0.19306666666666666, + "grad_norm": 0.37809192544625997, + "learning_rate": 0.0001863930377479644, + "loss": 0.7354, + "step": 724 + }, + { + "epoch": 0.19333333333333333, + "grad_norm": 0.3799375259063448, + "learning_rate": 0.0001863495042463848, + "loss": 0.7017, + "step": 725 + }, + { + "epoch": 0.1936, + "grad_norm": 0.3779825934739253, + "learning_rate": 0.00018630590631705512, + "loss": 0.6667, + "step": 726 + }, + { + "epoch": 0.19386666666666666, + "grad_norm": 0.3846767603058412, + "learning_rate": 0.00018626224399250513, + "loss": 0.7023, + "step": 727 + }, + { + "epoch": 0.19413333333333332, + "grad_norm": 0.38845373616066514, + "learning_rate": 0.00018621851730531242, + "loss": 0.672, + "step": 728 + }, + { + "epoch": 0.1944, + "grad_norm": 0.3752092260227272, + "learning_rate": 0.00018617472628810268, + "loss": 0.712, + "step": 729 + }, + { + "epoch": 0.19466666666666665, + "grad_norm": 0.3743270364949635, + "learning_rate": 0.00018613087097354958, + "loss": 0.6662, + "step": 730 + }, + { + "epoch": 0.19493333333333332, + "grad_norm": 0.358376110306125, + "learning_rate": 0.00018608695139437487, + "loss": 0.6291, + "step": 731 + }, + { + "epoch": 0.1952, + "grad_norm": 0.36722400631538576, + "learning_rate": 0.00018604296758334803, + "loss": 0.6954, + "step": 732 + }, + { + "epoch": 0.19546666666666668, + "grad_norm": 0.3794407568863177, + "learning_rate": 0.0001859989195732867, + "loss": 0.7316, + "step": 733 + }, + { + "epoch": 0.19573333333333334, + "grad_norm": 0.3738374961196488, + "learning_rate": 0.00018595480739705628, + "loss": 0.6928, + "step": 734 + }, + { + "epoch": 0.196, + "grad_norm": 0.3742656591480173, + "learning_rate": 0.00018591063108757007, + "loss": 0.7018, + "step": 735 + }, + { + "epoch": 0.19626666666666667, + "grad_norm": 0.41126781845681987, + "learning_rate": 0.00018586639067778924, + "loss": 0.6897, + "step": 736 + }, + { + "epoch": 0.19653333333333334, + "grad_norm": 0.3742048996635612, + "learning_rate": 0.0001858220862007228, + "loss": 0.6677, + "step": 737 + }, + { + "epoch": 0.1968, + "grad_norm": 0.39672258470120586, + "learning_rate": 0.00018577771768942753, + "loss": 0.7297, + "step": 738 + }, + { + "epoch": 0.19706666666666667, + "grad_norm": 0.35983278629271476, + "learning_rate": 0.00018573328517700803, + "loss": 0.6868, + "step": 739 + }, + { + "epoch": 0.19733333333333333, + "grad_norm": 0.360530906892004, + "learning_rate": 0.00018568878869661658, + "loss": 0.7297, + "step": 740 + }, + { + "epoch": 0.1976, + "grad_norm": 0.37679000714248617, + "learning_rate": 0.00018564422828145326, + "loss": 0.6783, + "step": 741 + }, + { + "epoch": 0.19786666666666666, + "grad_norm": 0.38135885735408154, + "learning_rate": 0.00018559960396476578, + "loss": 0.6755, + "step": 742 + }, + { + "epoch": 0.19813333333333333, + "grad_norm": 0.3676255502336289, + "learning_rate": 0.00018555491577984968, + "loss": 0.6701, + "step": 743 + }, + { + "epoch": 0.1984, + "grad_norm": 0.37092160190801005, + "learning_rate": 0.00018551016376004795, + "loss": 0.652, + "step": 744 + }, + { + "epoch": 0.19866666666666666, + "grad_norm": 0.37617860948393356, + "learning_rate": 0.00018546534793875132, + "loss": 0.6609, + "step": 745 + }, + { + "epoch": 0.19893333333333332, + "grad_norm": 0.39949026181909875, + "learning_rate": 0.00018542046834939816, + "loss": 0.6735, + "step": 746 + }, + { + "epoch": 0.1992, + "grad_norm": 0.35456959936277177, + "learning_rate": 0.00018537552502547432, + "loss": 0.6795, + "step": 747 + }, + { + "epoch": 0.19946666666666665, + "grad_norm": 0.3547637684272837, + "learning_rate": 0.00018533051800051332, + "loss": 0.6404, + "step": 748 + }, + { + "epoch": 0.19973333333333335, + "grad_norm": 0.39532200273811835, + "learning_rate": 0.0001852854473080961, + "loss": 0.7151, + "step": 749 + }, + { + "epoch": 0.2, + "grad_norm": 0.3594284858890217, + "learning_rate": 0.0001852403129818511, + "loss": 0.6601, + "step": 750 + }, + { + "epoch": 0.20026666666666668, + "grad_norm": 0.3802212011617842, + "learning_rate": 0.0001851951150554544, + "loss": 0.7347, + "step": 751 + }, + { + "epoch": 0.20053333333333334, + "grad_norm": 0.3616455410835533, + "learning_rate": 0.00018514985356262934, + "loss": 0.6856, + "step": 752 + }, + { + "epoch": 0.2008, + "grad_norm": 0.37844405621281424, + "learning_rate": 0.00018510452853714678, + "loss": 0.6433, + "step": 753 + }, + { + "epoch": 0.20106666666666667, + "grad_norm": 0.3650231986573298, + "learning_rate": 0.000185059140012825, + "loss": 0.6539, + "step": 754 + }, + { + "epoch": 0.20133333333333334, + "grad_norm": 0.3671252126818593, + "learning_rate": 0.00018501368802352957, + "loss": 0.7281, + "step": 755 + }, + { + "epoch": 0.2016, + "grad_norm": 0.3563889043200433, + "learning_rate": 0.0001849681726031736, + "loss": 0.6583, + "step": 756 + }, + { + "epoch": 0.20186666666666667, + "grad_norm": 0.35200170683064536, + "learning_rate": 0.00018492259378571725, + "loss": 0.6707, + "step": 757 + }, + { + "epoch": 0.20213333333333333, + "grad_norm": 0.3636122431240546, + "learning_rate": 0.00018487695160516825, + "loss": 0.711, + "step": 758 + }, + { + "epoch": 0.2024, + "grad_norm": 0.36439355090291375, + "learning_rate": 0.00018483124609558143, + "loss": 0.6849, + "step": 759 + }, + { + "epoch": 0.20266666666666666, + "grad_norm": 0.3735234618741416, + "learning_rate": 0.00018478547729105897, + "loss": 0.6558, + "step": 760 + }, + { + "epoch": 0.20293333333333333, + "grad_norm": 0.3621322005276756, + "learning_rate": 0.0001847396452257502, + "loss": 0.679, + "step": 761 + }, + { + "epoch": 0.2032, + "grad_norm": 0.3663733156867625, + "learning_rate": 0.00018469374993385174, + "loss": 0.6962, + "step": 762 + }, + { + "epoch": 0.20346666666666666, + "grad_norm": 0.38410871208592184, + "learning_rate": 0.00018464779144960726, + "loss": 0.7099, + "step": 763 + }, + { + "epoch": 0.20373333333333332, + "grad_norm": 0.3791658521215643, + "learning_rate": 0.00018460176980730775, + "loss": 0.7021, + "step": 764 + }, + { + "epoch": 0.204, + "grad_norm": 0.35880046589803916, + "learning_rate": 0.00018455568504129115, + "loss": 0.6467, + "step": 765 + }, + { + "epoch": 0.20426666666666668, + "grad_norm": 0.3505270740068604, + "learning_rate": 0.0001845095371859426, + "loss": 0.6661, + "step": 766 + }, + { + "epoch": 0.20453333333333334, + "grad_norm": 0.3620142085417268, + "learning_rate": 0.0001844633262756943, + "loss": 0.7191, + "step": 767 + }, + { + "epoch": 0.2048, + "grad_norm": 0.3864279961960631, + "learning_rate": 0.00018441705234502548, + "loss": 0.6899, + "step": 768 + }, + { + "epoch": 0.20506666666666667, + "grad_norm": 0.37482766104381093, + "learning_rate": 0.0001843707154284624, + "loss": 0.6976, + "step": 769 + }, + { + "epoch": 0.20533333333333334, + "grad_norm": 0.38529960394893153, + "learning_rate": 0.00018432431556057832, + "loss": 0.7183, + "step": 770 + }, + { + "epoch": 0.2056, + "grad_norm": 0.36391481747905546, + "learning_rate": 0.00018427785277599345, + "loss": 0.7219, + "step": 771 + }, + { + "epoch": 0.20586666666666667, + "grad_norm": 0.380908481889315, + "learning_rate": 0.00018423132710937497, + "loss": 0.7165, + "step": 772 + }, + { + "epoch": 0.20613333333333334, + "grad_norm": 0.3779389305458948, + "learning_rate": 0.00018418473859543695, + "loss": 0.7051, + "step": 773 + }, + { + "epoch": 0.2064, + "grad_norm": 0.36318923582328305, + "learning_rate": 0.00018413808726894037, + "loss": 0.6515, + "step": 774 + }, + { + "epoch": 0.20666666666666667, + "grad_norm": 0.365308687784887, + "learning_rate": 0.00018409137316469307, + "loss": 0.6767, + "step": 775 + }, + { + "epoch": 0.20693333333333333, + "grad_norm": 0.3626020322486663, + "learning_rate": 0.00018404459631754974, + "loss": 0.6629, + "step": 776 + }, + { + "epoch": 0.2072, + "grad_norm": 0.3491393349155495, + "learning_rate": 0.0001839977567624119, + "loss": 0.632, + "step": 777 + }, + { + "epoch": 0.20746666666666666, + "grad_norm": 0.3790090586840057, + "learning_rate": 0.0001839508545342278, + "loss": 0.6615, + "step": 778 + }, + { + "epoch": 0.20773333333333333, + "grad_norm": 0.37875087514028966, + "learning_rate": 0.0001839038896679925, + "loss": 0.6749, + "step": 779 + }, + { + "epoch": 0.208, + "grad_norm": 0.37912120811174993, + "learning_rate": 0.0001838568621987478, + "loss": 0.6676, + "step": 780 + }, + { + "epoch": 0.20826666666666666, + "grad_norm": 0.3783780332693869, + "learning_rate": 0.00018380977216158215, + "loss": 0.6592, + "step": 781 + }, + { + "epoch": 0.20853333333333332, + "grad_norm": 0.35583651903348806, + "learning_rate": 0.00018376261959163076, + "loss": 0.6626, + "step": 782 + }, + { + "epoch": 0.2088, + "grad_norm": 0.36447921870025685, + "learning_rate": 0.00018371540452407546, + "loss": 0.7059, + "step": 783 + }, + { + "epoch": 0.20906666666666668, + "grad_norm": 0.37712528689170866, + "learning_rate": 0.00018366812699414475, + "loss": 0.6725, + "step": 784 + }, + { + "epoch": 0.20933333333333334, + "grad_norm": 0.3613627845284323, + "learning_rate": 0.00018362078703711366, + "loss": 0.6447, + "step": 785 + }, + { + "epoch": 0.2096, + "grad_norm": 0.36877410930130333, + "learning_rate": 0.0001835733846883038, + "loss": 0.6449, + "step": 786 + }, + { + "epoch": 0.20986666666666667, + "grad_norm": 0.3569084003684999, + "learning_rate": 0.00018352591998308345, + "loss": 0.6077, + "step": 787 + }, + { + "epoch": 0.21013333333333334, + "grad_norm": 0.37035467939568206, + "learning_rate": 0.00018347839295686732, + "loss": 0.6931, + "step": 788 + }, + { + "epoch": 0.2104, + "grad_norm": 0.4034614433225208, + "learning_rate": 0.0001834308036451166, + "loss": 0.7177, + "step": 789 + }, + { + "epoch": 0.21066666666666667, + "grad_norm": 0.37777579908281395, + "learning_rate": 0.00018338315208333902, + "loss": 0.669, + "step": 790 + }, + { + "epoch": 0.21093333333333333, + "grad_norm": 0.37091469516607756, + "learning_rate": 0.00018333543830708872, + "loss": 0.6635, + "step": 791 + }, + { + "epoch": 0.2112, + "grad_norm": 0.3619506344302214, + "learning_rate": 0.0001832876623519663, + "loss": 0.6571, + "step": 792 + }, + { + "epoch": 0.21146666666666666, + "grad_norm": 0.3913724968303028, + "learning_rate": 0.00018323982425361862, + "loss": 0.6397, + "step": 793 + }, + { + "epoch": 0.21173333333333333, + "grad_norm": 0.35941798327326463, + "learning_rate": 0.0001831919240477391, + "loss": 0.6512, + "step": 794 + }, + { + "epoch": 0.212, + "grad_norm": 0.364866497442862, + "learning_rate": 0.00018314396177006737, + "loss": 0.683, + "step": 795 + }, + { + "epoch": 0.21226666666666666, + "grad_norm": 0.37662644812584084, + "learning_rate": 0.00018309593745638943, + "loss": 0.6618, + "step": 796 + }, + { + "epoch": 0.21253333333333332, + "grad_norm": 0.41794098986117456, + "learning_rate": 0.00018304785114253756, + "loss": 0.7205, + "step": 797 + }, + { + "epoch": 0.2128, + "grad_norm": 0.3565020649378219, + "learning_rate": 0.0001829997028643902, + "loss": 0.6568, + "step": 798 + }, + { + "epoch": 0.21306666666666665, + "grad_norm": 0.38708444261364383, + "learning_rate": 0.00018295149265787222, + "loss": 0.7193, + "step": 799 + }, + { + "epoch": 0.21333333333333335, + "grad_norm": 0.41124485895028245, + "learning_rate": 0.00018290322055895453, + "loss": 0.706, + "step": 800 + }, + { + "epoch": 0.2136, + "grad_norm": 0.37390625327736315, + "learning_rate": 0.0001828548866036543, + "loss": 0.7001, + "step": 801 + }, + { + "epoch": 0.21386666666666668, + "grad_norm": 0.43156584902192524, + "learning_rate": 0.00018280649082803478, + "loss": 0.6869, + "step": 802 + }, + { + "epoch": 0.21413333333333334, + "grad_norm": 0.37331145354869777, + "learning_rate": 0.00018275803326820545, + "loss": 0.6633, + "step": 803 + }, + { + "epoch": 0.2144, + "grad_norm": 0.36755237856747536, + "learning_rate": 0.00018270951396032179, + "loss": 0.7256, + "step": 804 + }, + { + "epoch": 0.21466666666666667, + "grad_norm": 0.3761698123559096, + "learning_rate": 0.00018266093294058542, + "loss": 0.7128, + "step": 805 + }, + { + "epoch": 0.21493333333333334, + "grad_norm": 0.37577680738610936, + "learning_rate": 0.000182612290245244, + "loss": 0.6561, + "step": 806 + }, + { + "epoch": 0.2152, + "grad_norm": 0.3546255870164141, + "learning_rate": 0.00018256358591059116, + "loss": 0.68, + "step": 807 + }, + { + "epoch": 0.21546666666666667, + "grad_norm": 0.4004601400429358, + "learning_rate": 0.00018251481997296653, + "loss": 0.6453, + "step": 808 + }, + { + "epoch": 0.21573333333333333, + "grad_norm": 0.3864264432170919, + "learning_rate": 0.0001824659924687558, + "loss": 0.6829, + "step": 809 + }, + { + "epoch": 0.216, + "grad_norm": 0.3731103874980662, + "learning_rate": 0.00018241710343439043, + "loss": 0.7011, + "step": 810 + }, + { + "epoch": 0.21626666666666666, + "grad_norm": 0.39402952342028597, + "learning_rate": 0.00018236815290634796, + "loss": 0.73, + "step": 811 + }, + { + "epoch": 0.21653333333333333, + "grad_norm": 0.38064391724654933, + "learning_rate": 0.00018231914092115163, + "loss": 0.6975, + "step": 812 + }, + { + "epoch": 0.2168, + "grad_norm": 0.3454030876230318, + "learning_rate": 0.0001822700675153707, + "loss": 0.6306, + "step": 813 + }, + { + "epoch": 0.21706666666666666, + "grad_norm": 0.3757030862071576, + "learning_rate": 0.0001822209327256202, + "loss": 0.6193, + "step": 814 + }, + { + "epoch": 0.21733333333333332, + "grad_norm": 0.34305496517409284, + "learning_rate": 0.00018217173658856097, + "loss": 0.6308, + "step": 815 + }, + { + "epoch": 0.2176, + "grad_norm": 0.37111523510124256, + "learning_rate": 0.00018212247914089954, + "loss": 0.6539, + "step": 816 + }, + { + "epoch": 0.21786666666666665, + "grad_norm": 0.3787404183646755, + "learning_rate": 0.00018207316041938832, + "loss": 0.6763, + "step": 817 + }, + { + "epoch": 0.21813333333333335, + "grad_norm": 0.38946742059341627, + "learning_rate": 0.00018202378046082532, + "loss": 0.6883, + "step": 818 + }, + { + "epoch": 0.2184, + "grad_norm": 0.3771883394030353, + "learning_rate": 0.0001819743393020543, + "loss": 0.6729, + "step": 819 + }, + { + "epoch": 0.21866666666666668, + "grad_norm": 0.3669827885293365, + "learning_rate": 0.00018192483697996472, + "loss": 0.705, + "step": 820 + }, + { + "epoch": 0.21893333333333334, + "grad_norm": 0.38744728666382117, + "learning_rate": 0.00018187527353149158, + "loss": 0.676, + "step": 821 + }, + { + "epoch": 0.2192, + "grad_norm": 0.3702189972702748, + "learning_rate": 0.00018182564899361556, + "loss": 0.6725, + "step": 822 + }, + { + "epoch": 0.21946666666666667, + "grad_norm": 0.37497364351640805, + "learning_rate": 0.00018177596340336288, + "loss": 0.6539, + "step": 823 + }, + { + "epoch": 0.21973333333333334, + "grad_norm": 0.3937422659286144, + "learning_rate": 0.00018172621679780532, + "loss": 0.7364, + "step": 824 + }, + { + "epoch": 0.22, + "grad_norm": 0.3575934371504687, + "learning_rate": 0.00018167640921406023, + "loss": 0.7327, + "step": 825 + }, + { + "epoch": 0.22026666666666667, + "grad_norm": 0.3672174923553296, + "learning_rate": 0.00018162654068929043, + "loss": 0.7038, + "step": 826 + }, + { + "epoch": 0.22053333333333333, + "grad_norm": 0.4161839683300949, + "learning_rate": 0.0001815766112607042, + "loss": 0.7, + "step": 827 + }, + { + "epoch": 0.2208, + "grad_norm": 0.3706453113933822, + "learning_rate": 0.0001815266209655552, + "loss": 0.6571, + "step": 828 + }, + { + "epoch": 0.22106666666666666, + "grad_norm": 0.3670205544473324, + "learning_rate": 0.00018147656984114266, + "loss": 0.6754, + "step": 829 + }, + { + "epoch": 0.22133333333333333, + "grad_norm": 0.3789810792810146, + "learning_rate": 0.00018142645792481107, + "loss": 0.6946, + "step": 830 + }, + { + "epoch": 0.2216, + "grad_norm": 0.40461845415517067, + "learning_rate": 0.0001813762852539503, + "loss": 0.6543, + "step": 831 + }, + { + "epoch": 0.22186666666666666, + "grad_norm": 0.38411984430369106, + "learning_rate": 0.0001813260518659956, + "loss": 0.6681, + "step": 832 + }, + { + "epoch": 0.22213333333333332, + "grad_norm": 0.40885506891758255, + "learning_rate": 0.00018127575779842742, + "loss": 0.6045, + "step": 833 + }, + { + "epoch": 0.2224, + "grad_norm": 0.37170446490494097, + "learning_rate": 0.00018122540308877162, + "loss": 0.7112, + "step": 834 + }, + { + "epoch": 0.22266666666666668, + "grad_norm": 0.39703368312579296, + "learning_rate": 0.00018117498777459924, + "loss": 0.6759, + "step": 835 + }, + { + "epoch": 0.22293333333333334, + "grad_norm": 0.3767130091451189, + "learning_rate": 0.00018112451189352652, + "loss": 0.6686, + "step": 836 + }, + { + "epoch": 0.2232, + "grad_norm": 0.38993338683923956, + "learning_rate": 0.00018107397548321487, + "loss": 0.6136, + "step": 837 + }, + { + "epoch": 0.22346666666666667, + "grad_norm": 0.3855059989989872, + "learning_rate": 0.00018102337858137094, + "loss": 0.6788, + "step": 838 + }, + { + "epoch": 0.22373333333333334, + "grad_norm": 0.3803130749460877, + "learning_rate": 0.0001809727212257465, + "loss": 0.6718, + "step": 839 + }, + { + "epoch": 0.224, + "grad_norm": 0.4033073629746989, + "learning_rate": 0.00018092200345413837, + "loss": 0.6091, + "step": 840 + }, + { + "epoch": 0.22426666666666667, + "grad_norm": 0.3797057141579435, + "learning_rate": 0.00018087122530438846, + "loss": 0.6326, + "step": 841 + }, + { + "epoch": 0.22453333333333333, + "grad_norm": 0.3879992997600297, + "learning_rate": 0.0001808203868143838, + "loss": 0.7559, + "step": 842 + }, + { + "epoch": 0.2248, + "grad_norm": 0.3928624627878775, + "learning_rate": 0.00018076948802205636, + "loss": 0.7095, + "step": 843 + }, + { + "epoch": 0.22506666666666666, + "grad_norm": 0.3938380577687365, + "learning_rate": 0.00018071852896538315, + "loss": 0.6768, + "step": 844 + }, + { + "epoch": 0.22533333333333333, + "grad_norm": 0.35035889321287694, + "learning_rate": 0.0001806675096823861, + "loss": 0.6797, + "step": 845 + }, + { + "epoch": 0.2256, + "grad_norm": 0.36684177650978017, + "learning_rate": 0.00018061643021113216, + "loss": 0.7067, + "step": 846 + }, + { + "epoch": 0.22586666666666666, + "grad_norm": 0.3830949418296738, + "learning_rate": 0.0001805652905897331, + "loss": 0.7198, + "step": 847 + }, + { + "epoch": 0.22613333333333333, + "grad_norm": 0.34357360805127096, + "learning_rate": 0.00018051409085634556, + "loss": 0.6488, + "step": 848 + }, + { + "epoch": 0.2264, + "grad_norm": 0.3702133101019209, + "learning_rate": 0.00018046283104917118, + "loss": 0.6151, + "step": 849 + }, + { + "epoch": 0.22666666666666666, + "grad_norm": 0.3763682241149857, + "learning_rate": 0.0001804115112064562, + "loss": 0.657, + "step": 850 + }, + { + "epoch": 0.22693333333333332, + "grad_norm": 0.37035958266927865, + "learning_rate": 0.00018036013136649186, + "loss": 0.6699, + "step": 851 + }, + { + "epoch": 0.2272, + "grad_norm": 0.3603625279827103, + "learning_rate": 0.00018030869156761403, + "loss": 0.6582, + "step": 852 + }, + { + "epoch": 0.22746666666666668, + "grad_norm": 0.3982720688740039, + "learning_rate": 0.0001802571918482034, + "loss": 0.7482, + "step": 853 + }, + { + "epoch": 0.22773333333333334, + "grad_norm": 0.3758094698939472, + "learning_rate": 0.00018020563224668533, + "loss": 0.6999, + "step": 854 + }, + { + "epoch": 0.228, + "grad_norm": 0.38749675883304113, + "learning_rate": 0.00018015401280152983, + "loss": 0.7056, + "step": 855 + }, + { + "epoch": 0.22826666666666667, + "grad_norm": 0.3915657129303144, + "learning_rate": 0.00018010233355125163, + "loss": 0.6845, + "step": 856 + }, + { + "epoch": 0.22853333333333334, + "grad_norm": 0.37142343215143453, + "learning_rate": 0.00018005059453441002, + "loss": 0.6336, + "step": 857 + }, + { + "epoch": 0.2288, + "grad_norm": 0.3649017025082439, + "learning_rate": 0.00017999879578960889, + "loss": 0.6321, + "step": 858 + }, + { + "epoch": 0.22906666666666667, + "grad_norm": 0.41404793524551164, + "learning_rate": 0.00017994693735549677, + "loss": 0.6768, + "step": 859 + }, + { + "epoch": 0.22933333333333333, + "grad_norm": 0.37170947659867914, + "learning_rate": 0.00017989501927076663, + "loss": 0.6663, + "step": 860 + }, + { + "epoch": 0.2296, + "grad_norm": 0.3995413208923138, + "learning_rate": 0.00017984304157415602, + "loss": 0.6732, + "step": 861 + }, + { + "epoch": 0.22986666666666666, + "grad_norm": 0.37557958255479806, + "learning_rate": 0.0001797910043044469, + "loss": 0.6547, + "step": 862 + }, + { + "epoch": 0.23013333333333333, + "grad_norm": 0.3477869841404091, + "learning_rate": 0.00017973890750046573, + "loss": 0.6151, + "step": 863 + }, + { + "epoch": 0.2304, + "grad_norm": 0.3775952170739762, + "learning_rate": 0.00017968675120108338, + "loss": 0.632, + "step": 864 + }, + { + "epoch": 0.23066666666666666, + "grad_norm": 0.36848209655282005, + "learning_rate": 0.0001796345354452151, + "loss": 0.677, + "step": 865 + }, + { + "epoch": 0.23093333333333332, + "grad_norm": 0.3737195218502748, + "learning_rate": 0.0001795822602718205, + "loss": 0.6683, + "step": 866 + }, + { + "epoch": 0.2312, + "grad_norm": 0.36308433478212737, + "learning_rate": 0.0001795299257199035, + "loss": 0.6572, + "step": 867 + }, + { + "epoch": 0.23146666666666665, + "grad_norm": 0.36289198132546524, + "learning_rate": 0.00017947753182851245, + "loss": 0.6729, + "step": 868 + }, + { + "epoch": 0.23173333333333335, + "grad_norm": 0.3745943466101185, + "learning_rate": 0.0001794250786367398, + "loss": 0.7091, + "step": 869 + }, + { + "epoch": 0.232, + "grad_norm": 0.3523879995591142, + "learning_rate": 0.00017937256618372232, + "loss": 0.6247, + "step": 870 + }, + { + "epoch": 0.23226666666666668, + "grad_norm": 0.3551454603054259, + "learning_rate": 0.00017931999450864104, + "loss": 0.6516, + "step": 871 + }, + { + "epoch": 0.23253333333333334, + "grad_norm": 0.345300826949314, + "learning_rate": 0.00017926736365072115, + "loss": 0.616, + "step": 872 + }, + { + "epoch": 0.2328, + "grad_norm": 0.3719750321170669, + "learning_rate": 0.00017921467364923193, + "loss": 0.7055, + "step": 873 + }, + { + "epoch": 0.23306666666666667, + "grad_norm": 0.3766349813970907, + "learning_rate": 0.00017916192454348688, + "loss": 0.6575, + "step": 874 + }, + { + "epoch": 0.23333333333333334, + "grad_norm": 0.35536148277251234, + "learning_rate": 0.00017910911637284357, + "loss": 0.698, + "step": 875 + }, + { + "epoch": 0.2336, + "grad_norm": 0.3645922686346899, + "learning_rate": 0.00017905624917670366, + "loss": 0.6505, + "step": 876 + }, + { + "epoch": 0.23386666666666667, + "grad_norm": 0.3733496800144498, + "learning_rate": 0.00017900332299451273, + "loss": 0.6472, + "step": 877 + }, + { + "epoch": 0.23413333333333333, + "grad_norm": 0.3855523186295078, + "learning_rate": 0.00017895033786576056, + "loss": 0.6751, + "step": 878 + }, + { + "epoch": 0.2344, + "grad_norm": 0.38341509256938605, + "learning_rate": 0.0001788972938299808, + "loss": 0.6431, + "step": 879 + }, + { + "epoch": 0.23466666666666666, + "grad_norm": 0.37360303338418127, + "learning_rate": 0.00017884419092675105, + "loss": 0.6292, + "step": 880 + }, + { + "epoch": 0.23493333333333333, + "grad_norm": 0.42119666302347647, + "learning_rate": 0.00017879102919569285, + "loss": 0.6444, + "step": 881 + }, + { + "epoch": 0.2352, + "grad_norm": 0.36755331891950127, + "learning_rate": 0.00017873780867647162, + "loss": 0.7022, + "step": 882 + }, + { + "epoch": 0.23546666666666666, + "grad_norm": 0.35717611916899655, + "learning_rate": 0.00017868452940879675, + "loss": 0.6728, + "step": 883 + }, + { + "epoch": 0.23573333333333332, + "grad_norm": 0.36170010813011433, + "learning_rate": 0.00017863119143242124, + "loss": 0.6126, + "step": 884 + }, + { + "epoch": 0.236, + "grad_norm": 0.4115929580021471, + "learning_rate": 0.00017857779478714213, + "loss": 0.6457, + "step": 885 + }, + { + "epoch": 0.23626666666666668, + "grad_norm": 0.35007903341304025, + "learning_rate": 0.0001785243395128001, + "loss": 0.6956, + "step": 886 + }, + { + "epoch": 0.23653333333333335, + "grad_norm": 0.3700423171601644, + "learning_rate": 0.00017847082564927957, + "loss": 0.6552, + "step": 887 + }, + { + "epoch": 0.2368, + "grad_norm": 0.3838509453166939, + "learning_rate": 0.00017841725323650877, + "loss": 0.6744, + "step": 888 + }, + { + "epoch": 0.23706666666666668, + "grad_norm": 0.38170306779665314, + "learning_rate": 0.00017836362231445953, + "loss": 0.6526, + "step": 889 + }, + { + "epoch": 0.23733333333333334, + "grad_norm": 0.38212518819422253, + "learning_rate": 0.00017830993292314737, + "loss": 0.6637, + "step": 890 + }, + { + "epoch": 0.2376, + "grad_norm": 0.39266284643771737, + "learning_rate": 0.00017825618510263142, + "loss": 0.6682, + "step": 891 + }, + { + "epoch": 0.23786666666666667, + "grad_norm": 0.37057321845640795, + "learning_rate": 0.00017820237889301437, + "loss": 0.6751, + "step": 892 + }, + { + "epoch": 0.23813333333333334, + "grad_norm": 0.3525895622782891, + "learning_rate": 0.00017814851433444262, + "loss": 0.635, + "step": 893 + }, + { + "epoch": 0.2384, + "grad_norm": 0.36731955901952146, + "learning_rate": 0.00017809459146710593, + "loss": 0.6296, + "step": 894 + }, + { + "epoch": 0.23866666666666667, + "grad_norm": 0.3776104002873812, + "learning_rate": 0.00017804061033123767, + "loss": 0.7258, + "step": 895 + }, + { + "epoch": 0.23893333333333333, + "grad_norm": 0.3681502417412675, + "learning_rate": 0.00017798657096711464, + "loss": 0.6405, + "step": 896 + }, + { + "epoch": 0.2392, + "grad_norm": 0.36315112764267293, + "learning_rate": 0.0001779324734150571, + "loss": 0.6668, + "step": 897 + }, + { + "epoch": 0.23946666666666666, + "grad_norm": 0.3329708112538578, + "learning_rate": 0.00017787831771542872, + "loss": 0.6053, + "step": 898 + }, + { + "epoch": 0.23973333333333333, + "grad_norm": 0.3597972038634347, + "learning_rate": 0.00017782410390863662, + "loss": 0.6876, + "step": 899 + }, + { + "epoch": 0.24, + "grad_norm": 0.36310599619074185, + "learning_rate": 0.00017776983203513113, + "loss": 0.6229, + "step": 900 + }, + { + "epoch": 0.24026666666666666, + "grad_norm": 0.37462854428278636, + "learning_rate": 0.00017771550213540607, + "loss": 0.7268, + "step": 901 + }, + { + "epoch": 0.24053333333333332, + "grad_norm": 0.35205708604798935, + "learning_rate": 0.00017766111424999842, + "loss": 0.6494, + "step": 902 + }, + { + "epoch": 0.2408, + "grad_norm": 0.3598438745211486, + "learning_rate": 0.00017760666841948856, + "loss": 0.7102, + "step": 903 + }, + { + "epoch": 0.24106666666666668, + "grad_norm": 0.36477586393521, + "learning_rate": 0.00017755216468449996, + "loss": 0.6396, + "step": 904 + }, + { + "epoch": 0.24133333333333334, + "grad_norm": 0.36967086452141024, + "learning_rate": 0.0001774976030856994, + "loss": 0.6866, + "step": 905 + }, + { + "epoch": 0.2416, + "grad_norm": 0.34281380487072416, + "learning_rate": 0.00017744298366379672, + "loss": 0.6625, + "step": 906 + }, + { + "epoch": 0.24186666666666667, + "grad_norm": 0.3727813585799761, + "learning_rate": 0.0001773883064595451, + "loss": 0.6521, + "step": 907 + }, + { + "epoch": 0.24213333333333334, + "grad_norm": 0.3695465129269741, + "learning_rate": 0.0001773335715137406, + "loss": 0.6337, + "step": 908 + }, + { + "epoch": 0.2424, + "grad_norm": 0.3998986207982075, + "learning_rate": 0.00017727877886722257, + "loss": 0.7152, + "step": 909 + }, + { + "epoch": 0.24266666666666667, + "grad_norm": 0.37960333241164557, + "learning_rate": 0.00017722392856087327, + "loss": 0.6853, + "step": 910 + }, + { + "epoch": 0.24293333333333333, + "grad_norm": 0.35251879527239355, + "learning_rate": 0.00017716902063561797, + "loss": 0.6539, + "step": 911 + }, + { + "epoch": 0.2432, + "grad_norm": 0.34915552878152734, + "learning_rate": 0.00017711405513242513, + "loss": 0.6582, + "step": 912 + }, + { + "epoch": 0.24346666666666666, + "grad_norm": 0.3853434496640489, + "learning_rate": 0.0001770590320923059, + "loss": 0.6455, + "step": 913 + }, + { + "epoch": 0.24373333333333333, + "grad_norm": 0.38215275427772366, + "learning_rate": 0.00017700395155631455, + "loss": 0.7082, + "step": 914 + }, + { + "epoch": 0.244, + "grad_norm": 0.3735208392908576, + "learning_rate": 0.00017694881356554817, + "loss": 0.6519, + "step": 915 + }, + { + "epoch": 0.24426666666666666, + "grad_norm": 0.39245973372040327, + "learning_rate": 0.00017689361816114677, + "loss": 0.6559, + "step": 916 + }, + { + "epoch": 0.24453333333333332, + "grad_norm": 0.42824539539470896, + "learning_rate": 0.00017683836538429315, + "loss": 0.6914, + "step": 917 + }, + { + "epoch": 0.2448, + "grad_norm": 0.38039884993754464, + "learning_rate": 0.0001767830552762129, + "loss": 0.6935, + "step": 918 + }, + { + "epoch": 0.24506666666666665, + "grad_norm": 0.4053472220079151, + "learning_rate": 0.00017672768787817443, + "loss": 0.716, + "step": 919 + }, + { + "epoch": 0.24533333333333332, + "grad_norm": 0.3625670965985844, + "learning_rate": 0.00017667226323148894, + "loss": 0.654, + "step": 920 + }, + { + "epoch": 0.2456, + "grad_norm": 0.3846759169631727, + "learning_rate": 0.0001766167813775102, + "loss": 0.7346, + "step": 921 + }, + { + "epoch": 0.24586666666666668, + "grad_norm": 0.387823479061407, + "learning_rate": 0.00017656124235763485, + "loss": 0.6725, + "step": 922 + }, + { + "epoch": 0.24613333333333334, + "grad_norm": 0.37103577961587353, + "learning_rate": 0.000176505646213302, + "loss": 0.6387, + "step": 923 + }, + { + "epoch": 0.2464, + "grad_norm": 0.364084287388942, + "learning_rate": 0.00017644999298599355, + "loss": 0.6325, + "step": 924 + }, + { + "epoch": 0.24666666666666667, + "grad_norm": 0.3719338408947227, + "learning_rate": 0.00017639428271723384, + "loss": 0.6206, + "step": 925 + }, + { + "epoch": 0.24693333333333334, + "grad_norm": 0.5079978985352054, + "learning_rate": 0.00017633851544858988, + "loss": 0.6986, + "step": 926 + }, + { + "epoch": 0.2472, + "grad_norm": 0.3831887933477012, + "learning_rate": 0.00017628269122167115, + "loss": 0.681, + "step": 927 + }, + { + "epoch": 0.24746666666666667, + "grad_norm": 0.3862617901432955, + "learning_rate": 0.00017622681007812963, + "loss": 0.6786, + "step": 928 + }, + { + "epoch": 0.24773333333333333, + "grad_norm": 0.40196095507069857, + "learning_rate": 0.00017617087205965985, + "loss": 0.728, + "step": 929 + }, + { + "epoch": 0.248, + "grad_norm": 0.36036525174309214, + "learning_rate": 0.00017611487720799865, + "loss": 0.662, + "step": 930 + }, + { + "epoch": 0.24826666666666666, + "grad_norm": 0.3450622496118081, + "learning_rate": 0.00017605882556492536, + "loss": 0.6656, + "step": 931 + }, + { + "epoch": 0.24853333333333333, + "grad_norm": 0.3827344319113975, + "learning_rate": 0.00017600271717226165, + "loss": 0.6747, + "step": 932 + }, + { + "epoch": 0.2488, + "grad_norm": 0.35548505635722083, + "learning_rate": 0.00017594655207187157, + "loss": 0.6792, + "step": 933 + }, + { + "epoch": 0.24906666666666666, + "grad_norm": 0.35897227580611457, + "learning_rate": 0.0001758903303056614, + "loss": 0.6459, + "step": 934 + }, + { + "epoch": 0.24933333333333332, + "grad_norm": 0.39868659999654915, + "learning_rate": 0.0001758340519155798, + "loss": 0.6946, + "step": 935 + }, + { + "epoch": 0.2496, + "grad_norm": 0.3733448716299448, + "learning_rate": 0.0001757777169436176, + "loss": 0.708, + "step": 936 + }, + { + "epoch": 0.24986666666666665, + "grad_norm": 0.35651216220935267, + "learning_rate": 0.00017572132543180788, + "loss": 0.6846, + "step": 937 + }, + { + "epoch": 0.2501333333333333, + "grad_norm": 0.382658548425777, + "learning_rate": 0.00017566487742222596, + "loss": 0.6739, + "step": 938 + }, + { + "epoch": 0.2504, + "grad_norm": 0.36169049365474176, + "learning_rate": 0.00017560837295698916, + "loss": 0.6289, + "step": 939 + }, + { + "epoch": 0.25066666666666665, + "grad_norm": 0.3713348766937921, + "learning_rate": 0.0001755518120782571, + "loss": 0.6972, + "step": 940 + }, + { + "epoch": 0.25093333333333334, + "grad_norm": 0.37561670810196857, + "learning_rate": 0.0001754951948282314, + "loss": 0.6828, + "step": 941 + }, + { + "epoch": 0.2512, + "grad_norm": 0.38705806321579894, + "learning_rate": 0.0001754385212491557, + "loss": 0.7119, + "step": 942 + }, + { + "epoch": 0.25146666666666667, + "grad_norm": 0.39562921430988984, + "learning_rate": 0.00017538179138331582, + "loss": 0.6651, + "step": 943 + }, + { + "epoch": 0.2517333333333333, + "grad_norm": 0.41192107043951953, + "learning_rate": 0.00017532500527303938, + "loss": 0.6803, + "step": 944 + }, + { + "epoch": 0.252, + "grad_norm": 0.3858319014901234, + "learning_rate": 0.00017526816296069614, + "loss": 0.7049, + "step": 945 + }, + { + "epoch": 0.25226666666666664, + "grad_norm": 0.37856325845192396, + "learning_rate": 0.00017521126448869772, + "loss": 0.6454, + "step": 946 + }, + { + "epoch": 0.25253333333333333, + "grad_norm": 0.42287906762165467, + "learning_rate": 0.00017515430989949754, + "loss": 0.6724, + "step": 947 + }, + { + "epoch": 0.2528, + "grad_norm": 0.38701881331527244, + "learning_rate": 0.00017509729923559112, + "loss": 0.661, + "step": 948 + }, + { + "epoch": 0.25306666666666666, + "grad_norm": 0.382028694687477, + "learning_rate": 0.00017504023253951562, + "loss": 0.6603, + "step": 949 + }, + { + "epoch": 0.25333333333333335, + "grad_norm": 0.35823239469334595, + "learning_rate": 0.00017498310985385008, + "loss": 0.6681, + "step": 950 + }, + { + "epoch": 0.2536, + "grad_norm": 0.3657986186198803, + "learning_rate": 0.00017492593122121536, + "loss": 0.6195, + "step": 951 + }, + { + "epoch": 0.2538666666666667, + "grad_norm": 0.36008755659015096, + "learning_rate": 0.00017486869668427394, + "loss": 0.6287, + "step": 952 + }, + { + "epoch": 0.2541333333333333, + "grad_norm": 0.36541505898332444, + "learning_rate": 0.00017481140628573016, + "loss": 0.6741, + "step": 953 + }, + { + "epoch": 0.2544, + "grad_norm": 0.37576726204518385, + "learning_rate": 0.00017475406006832995, + "loss": 0.7042, + "step": 954 + }, + { + "epoch": 0.25466666666666665, + "grad_norm": 0.37805162720487145, + "learning_rate": 0.0001746966580748609, + "loss": 0.6494, + "step": 955 + }, + { + "epoch": 0.25493333333333335, + "grad_norm": 0.3522710023320336, + "learning_rate": 0.00017463920034815216, + "loss": 0.6649, + "step": 956 + }, + { + "epoch": 0.2552, + "grad_norm": 0.3560015561765506, + "learning_rate": 0.00017458168693107465, + "loss": 0.6844, + "step": 957 + }, + { + "epoch": 0.2554666666666667, + "grad_norm": 0.4218857153029114, + "learning_rate": 0.00017452411786654062, + "loss": 0.7998, + "step": 958 + }, + { + "epoch": 0.2557333333333333, + "grad_norm": 0.36335816278009564, + "learning_rate": 0.00017446649319750402, + "loss": 0.6404, + "step": 959 + }, + { + "epoch": 0.256, + "grad_norm": 0.34846999568866915, + "learning_rate": 0.0001744088129669601, + "loss": 0.662, + "step": 960 + }, + { + "epoch": 0.25626666666666664, + "grad_norm": 0.349520338294158, + "learning_rate": 0.00017435107721794577, + "loss": 0.6463, + "step": 961 + }, + { + "epoch": 0.25653333333333334, + "grad_norm": 0.38063931231201026, + "learning_rate": 0.00017429328599353924, + "loss": 0.6837, + "step": 962 + }, + { + "epoch": 0.2568, + "grad_norm": 0.3737308226896675, + "learning_rate": 0.00017423543933686012, + "loss": 0.7435, + "step": 963 + }, + { + "epoch": 0.25706666666666667, + "grad_norm": 0.3650735125260079, + "learning_rate": 0.0001741775372910694, + "loss": 0.6597, + "step": 964 + }, + { + "epoch": 0.25733333333333336, + "grad_norm": 0.3398711396353875, + "learning_rate": 0.00017411957989936941, + "loss": 0.6613, + "step": 965 + }, + { + "epoch": 0.2576, + "grad_norm": 0.37537336653979186, + "learning_rate": 0.00017406156720500376, + "loss": 0.6664, + "step": 966 + }, + { + "epoch": 0.2578666666666667, + "grad_norm": 0.37882826754476945, + "learning_rate": 0.00017400349925125733, + "loss": 0.6795, + "step": 967 + }, + { + "epoch": 0.2581333333333333, + "grad_norm": 0.37987112452770816, + "learning_rate": 0.0001739453760814562, + "loss": 0.6681, + "step": 968 + }, + { + "epoch": 0.2584, + "grad_norm": 0.3956517038538305, + "learning_rate": 0.00017388719773896768, + "loss": 0.6895, + "step": 969 + }, + { + "epoch": 0.25866666666666666, + "grad_norm": 0.38632946836606147, + "learning_rate": 0.00017382896426720024, + "loss": 0.6631, + "step": 970 + }, + { + "epoch": 0.25893333333333335, + "grad_norm": 0.36795667973267104, + "learning_rate": 0.00017377067570960352, + "loss": 0.6551, + "step": 971 + }, + { + "epoch": 0.2592, + "grad_norm": 0.3827397268368528, + "learning_rate": 0.00017371233210966816, + "loss": 0.6936, + "step": 972 + }, + { + "epoch": 0.2594666666666667, + "grad_norm": 0.47607575702912713, + "learning_rate": 0.00017365393351092596, + "loss": 0.6075, + "step": 973 + }, + { + "epoch": 0.2597333333333333, + "grad_norm": 0.36037312736055466, + "learning_rate": 0.00017359547995694975, + "loss": 0.6494, + "step": 974 + }, + { + "epoch": 0.26, + "grad_norm": 0.35789956483448954, + "learning_rate": 0.00017353697149135325, + "loss": 0.6267, + "step": 975 + }, + { + "epoch": 0.26026666666666665, + "grad_norm": 0.36342931327877476, + "learning_rate": 0.00017347840815779136, + "loss": 0.7116, + "step": 976 + }, + { + "epoch": 0.26053333333333334, + "grad_norm": 0.36287677779910565, + "learning_rate": 0.00017341978999995975, + "loss": 0.6531, + "step": 977 + }, + { + "epoch": 0.2608, + "grad_norm": 0.34975457415301003, + "learning_rate": 0.00017336111706159506, + "loss": 0.6385, + "step": 978 + }, + { + "epoch": 0.26106666666666667, + "grad_norm": 0.36562474091398467, + "learning_rate": 0.00017330238938647474, + "loss": 0.6516, + "step": 979 + }, + { + "epoch": 0.2613333333333333, + "grad_norm": 0.3539378322118544, + "learning_rate": 0.00017324360701841717, + "loss": 0.6435, + "step": 980 + }, + { + "epoch": 0.2616, + "grad_norm": 0.34563315328494715, + "learning_rate": 0.00017318477000128151, + "loss": 0.6748, + "step": 981 + }, + { + "epoch": 0.2618666666666667, + "grad_norm": 0.3841838483389478, + "learning_rate": 0.00017312587837896767, + "loss": 0.6784, + "step": 982 + }, + { + "epoch": 0.26213333333333333, + "grad_norm": 0.3615563485319042, + "learning_rate": 0.00017306693219541633, + "loss": 0.6782, + "step": 983 + }, + { + "epoch": 0.2624, + "grad_norm": 0.5406080241399385, + "learning_rate": 0.00017300793149460883, + "loss": 0.6646, + "step": 984 + }, + { + "epoch": 0.26266666666666666, + "grad_norm": 0.34457375526611966, + "learning_rate": 0.00017294887632056724, + "loss": 0.6545, + "step": 985 + }, + { + "epoch": 0.26293333333333335, + "grad_norm": 0.35865336747608817, + "learning_rate": 0.00017288976671735426, + "loss": 0.6534, + "step": 986 + }, + { + "epoch": 0.2632, + "grad_norm": 0.37085132674766746, + "learning_rate": 0.0001728306027290732, + "loss": 0.632, + "step": 987 + }, + { + "epoch": 0.2634666666666667, + "grad_norm": 0.36442084375889044, + "learning_rate": 0.00017277138439986795, + "loss": 0.6495, + "step": 988 + }, + { + "epoch": 0.2637333333333333, + "grad_norm": 0.384949504867074, + "learning_rate": 0.00017271211177392296, + "loss": 0.6647, + "step": 989 + }, + { + "epoch": 0.264, + "grad_norm": 0.37650754017425686, + "learning_rate": 0.00017265278489546308, + "loss": 0.6876, + "step": 990 + }, + { + "epoch": 0.26426666666666665, + "grad_norm": 0.37880256580782695, + "learning_rate": 0.00017259340380875384, + "loss": 0.6402, + "step": 991 + }, + { + "epoch": 0.26453333333333334, + "grad_norm": 0.37368116886725655, + "learning_rate": 0.00017253396855810107, + "loss": 0.6552, + "step": 992 + }, + { + "epoch": 0.2648, + "grad_norm": 0.39662226242597787, + "learning_rate": 0.00017247447918785104, + "loss": 0.6766, + "step": 993 + }, + { + "epoch": 0.2650666666666667, + "grad_norm": 0.38645991795238954, + "learning_rate": 0.0001724149357423904, + "loss": 0.6795, + "step": 994 + }, + { + "epoch": 0.2653333333333333, + "grad_norm": 0.3769234981738238, + "learning_rate": 0.0001723553382661462, + "loss": 0.6751, + "step": 995 + }, + { + "epoch": 0.2656, + "grad_norm": 0.35753086326356054, + "learning_rate": 0.00017229568680358575, + "loss": 0.6677, + "step": 996 + }, + { + "epoch": 0.26586666666666664, + "grad_norm": 0.3872739276548074, + "learning_rate": 0.00017223598139921666, + "loss": 0.7158, + "step": 997 + }, + { + "epoch": 0.26613333333333333, + "grad_norm": 0.3460464919380646, + "learning_rate": 0.00017217622209758675, + "loss": 0.6815, + "step": 998 + }, + { + "epoch": 0.2664, + "grad_norm": 0.3732913527461127, + "learning_rate": 0.00017211640894328412, + "loss": 0.7025, + "step": 999 + }, + { + "epoch": 0.26666666666666666, + "grad_norm": 0.3753496382940533, + "learning_rate": 0.00017205654198093696, + "loss": 0.702, + "step": 1000 + }, + { + "epoch": 0.26693333333333336, + "grad_norm": 0.35997945440192164, + "learning_rate": 0.00017199662125521377, + "loss": 0.6728, + "step": 1001 + }, + { + "epoch": 0.2672, + "grad_norm": 0.3574497641196626, + "learning_rate": 0.00017193664681082295, + "loss": 0.6552, + "step": 1002 + }, + { + "epoch": 0.2674666666666667, + "grad_norm": 0.36065811987352553, + "learning_rate": 0.00017187661869251313, + "loss": 0.6631, + "step": 1003 + }, + { + "epoch": 0.2677333333333333, + "grad_norm": 0.3691275127306625, + "learning_rate": 0.00017181653694507297, + "loss": 0.6415, + "step": 1004 + }, + { + "epoch": 0.268, + "grad_norm": 0.3862848020475319, + "learning_rate": 0.0001717564016133311, + "loss": 0.6973, + "step": 1005 + }, + { + "epoch": 0.26826666666666665, + "grad_norm": 0.36563631397392443, + "learning_rate": 0.00017169621274215613, + "loss": 0.6213, + "step": 1006 + }, + { + "epoch": 0.26853333333333335, + "grad_norm": 0.37070552499562903, + "learning_rate": 0.00017163597037645666, + "loss": 0.6463, + "step": 1007 + }, + { + "epoch": 0.2688, + "grad_norm": 0.38403109410280967, + "learning_rate": 0.00017157567456118123, + "loss": 0.7268, + "step": 1008 + }, + { + "epoch": 0.2690666666666667, + "grad_norm": 0.3930852881835335, + "learning_rate": 0.0001715153253413181, + "loss": 0.7423, + "step": 1009 + }, + { + "epoch": 0.2693333333333333, + "grad_norm": 0.36070837973069164, + "learning_rate": 0.00017145492276189562, + "loss": 0.6704, + "step": 1010 + }, + { + "epoch": 0.2696, + "grad_norm": 0.3712880317217717, + "learning_rate": 0.00017139446686798175, + "loss": 0.6619, + "step": 1011 + }, + { + "epoch": 0.26986666666666664, + "grad_norm": 0.38815807584402684, + "learning_rate": 0.0001713339577046843, + "loss": 0.6804, + "step": 1012 + }, + { + "epoch": 0.27013333333333334, + "grad_norm": 0.37497177446629665, + "learning_rate": 0.00017127339531715084, + "loss": 0.6666, + "step": 1013 + }, + { + "epoch": 0.2704, + "grad_norm": 0.36114355099659295, + "learning_rate": 0.00017121277975056865, + "loss": 0.6141, + "step": 1014 + }, + { + "epoch": 0.27066666666666667, + "grad_norm": 0.3683652498766256, + "learning_rate": 0.00017115211105016463, + "loss": 0.673, + "step": 1015 + }, + { + "epoch": 0.27093333333333336, + "grad_norm": 0.39361544308351054, + "learning_rate": 0.00017109138926120547, + "loss": 0.6774, + "step": 1016 + }, + { + "epoch": 0.2712, + "grad_norm": 0.3701280177297978, + "learning_rate": 0.00017103061442899729, + "loss": 0.7574, + "step": 1017 + }, + { + "epoch": 0.2714666666666667, + "grad_norm": 0.36235762561339424, + "learning_rate": 0.00017096978659888586, + "loss": 0.6059, + "step": 1018 + }, + { + "epoch": 0.2717333333333333, + "grad_norm": 0.40450227155709073, + "learning_rate": 0.0001709089058162566, + "loss": 0.6639, + "step": 1019 + }, + { + "epoch": 0.272, + "grad_norm": 0.3709054607836301, + "learning_rate": 0.00017084797212653429, + "loss": 0.6022, + "step": 1020 + }, + { + "epoch": 0.27226666666666666, + "grad_norm": 0.3616505454176225, + "learning_rate": 0.00017078698557518318, + "loss": 0.6562, + "step": 1021 + }, + { + "epoch": 0.27253333333333335, + "grad_norm": 0.3639892418035812, + "learning_rate": 0.0001707259462077071, + "loss": 0.6463, + "step": 1022 + }, + { + "epoch": 0.2728, + "grad_norm": 0.36599974791078516, + "learning_rate": 0.00017066485406964915, + "loss": 0.6535, + "step": 1023 + }, + { + "epoch": 0.2730666666666667, + "grad_norm": 0.4085410536846461, + "learning_rate": 0.0001706037092065919, + "loss": 0.7248, + "step": 1024 + }, + { + "epoch": 0.2733333333333333, + "grad_norm": 0.37112209185505324, + "learning_rate": 0.00017054251166415726, + "loss": 0.6836, + "step": 1025 + }, + { + "epoch": 0.2736, + "grad_norm": 0.3714015553228088, + "learning_rate": 0.00017048126148800635, + "loss": 0.6857, + "step": 1026 + }, + { + "epoch": 0.27386666666666665, + "grad_norm": 0.36393290295019776, + "learning_rate": 0.0001704199587238396, + "loss": 0.7034, + "step": 1027 + }, + { + "epoch": 0.27413333333333334, + "grad_norm": 0.37032237548237024, + "learning_rate": 0.00017035860341739674, + "loss": 0.669, + "step": 1028 + }, + { + "epoch": 0.2744, + "grad_norm": 0.3847663769376323, + "learning_rate": 0.00017029719561445665, + "loss": 0.7103, + "step": 1029 + }, + { + "epoch": 0.27466666666666667, + "grad_norm": 0.36727470895649217, + "learning_rate": 0.00017023573536083735, + "loss": 0.6926, + "step": 1030 + }, + { + "epoch": 0.2749333333333333, + "grad_norm": 0.4119377769662957, + "learning_rate": 0.0001701742227023961, + "loss": 0.6666, + "step": 1031 + }, + { + "epoch": 0.2752, + "grad_norm": 0.3850447201755565, + "learning_rate": 0.00017011265768502912, + "loss": 0.6668, + "step": 1032 + }, + { + "epoch": 0.2754666666666667, + "grad_norm": 0.36228529837308865, + "learning_rate": 0.00017005104035467184, + "loss": 0.6372, + "step": 1033 + }, + { + "epoch": 0.27573333333333333, + "grad_norm": 0.3665351766490015, + "learning_rate": 0.0001699893707572986, + "loss": 0.6906, + "step": 1034 + }, + { + "epoch": 0.276, + "grad_norm": 0.37174418114514696, + "learning_rate": 0.0001699276489389228, + "loss": 0.7004, + "step": 1035 + }, + { + "epoch": 0.27626666666666666, + "grad_norm": 0.3934583187983519, + "learning_rate": 0.00016986587494559682, + "loss": 0.6933, + "step": 1036 + }, + { + "epoch": 0.27653333333333335, + "grad_norm": 0.3703166739240665, + "learning_rate": 0.0001698040488234119, + "loss": 0.705, + "step": 1037 + }, + { + "epoch": 0.2768, + "grad_norm": 0.3744401150164368, + "learning_rate": 0.0001697421706184983, + "loss": 0.6827, + "step": 1038 + }, + { + "epoch": 0.2770666666666667, + "grad_norm": 0.3658276342156474, + "learning_rate": 0.00016968024037702491, + "loss": 0.7223, + "step": 1039 + }, + { + "epoch": 0.2773333333333333, + "grad_norm": 0.35985253347044355, + "learning_rate": 0.00016961825814519976, + "loss": 0.6944, + "step": 1040 + }, + { + "epoch": 0.2776, + "grad_norm": 0.36943852949361683, + "learning_rate": 0.0001695562239692694, + "loss": 0.6679, + "step": 1041 + }, + { + "epoch": 0.27786666666666665, + "grad_norm": 0.36021617629421065, + "learning_rate": 0.00016949413789551924, + "loss": 0.6426, + "step": 1042 + }, + { + "epoch": 0.27813333333333334, + "grad_norm": 0.3927581675847467, + "learning_rate": 0.0001694319999702735, + "loss": 0.6149, + "step": 1043 + }, + { + "epoch": 0.2784, + "grad_norm": 0.38415736834236625, + "learning_rate": 0.0001693698102398949, + "loss": 0.6353, + "step": 1044 + }, + { + "epoch": 0.2786666666666667, + "grad_norm": 0.40727383142638085, + "learning_rate": 0.00016930756875078496, + "loss": 0.6988, + "step": 1045 + }, + { + "epoch": 0.2789333333333333, + "grad_norm": 0.36211847481910026, + "learning_rate": 0.00016924527554938382, + "loss": 0.641, + "step": 1046 + }, + { + "epoch": 0.2792, + "grad_norm": 0.3924335682331854, + "learning_rate": 0.0001691829306821701, + "loss": 0.6694, + "step": 1047 + }, + { + "epoch": 0.27946666666666664, + "grad_norm": 0.3557572253638986, + "learning_rate": 0.000169120534195661, + "loss": 0.6477, + "step": 1048 + }, + { + "epoch": 0.27973333333333333, + "grad_norm": 0.3576945514579245, + "learning_rate": 0.00016905808613641235, + "loss": 0.6571, + "step": 1049 + }, + { + "epoch": 0.28, + "grad_norm": 0.3432926613371049, + "learning_rate": 0.0001689955865510183, + "loss": 0.6387, + "step": 1050 + }, + { + "epoch": 0.28026666666666666, + "grad_norm": 0.36141449663437264, + "learning_rate": 0.00016893303548611152, + "loss": 0.6742, + "step": 1051 + }, + { + "epoch": 0.28053333333333336, + "grad_norm": 0.35812745513977534, + "learning_rate": 0.00016887043298836316, + "loss": 0.6633, + "step": 1052 + }, + { + "epoch": 0.2808, + "grad_norm": 0.3581035720758667, + "learning_rate": 0.0001688077791044826, + "loss": 0.6878, + "step": 1053 + }, + { + "epoch": 0.2810666666666667, + "grad_norm": 0.37736573683962743, + "learning_rate": 0.00016874507388121764, + "loss": 0.7029, + "step": 1054 + }, + { + "epoch": 0.2813333333333333, + "grad_norm": 0.36786599115150365, + "learning_rate": 0.0001686823173653544, + "loss": 0.699, + "step": 1055 + }, + { + "epoch": 0.2816, + "grad_norm": 0.3504468067105522, + "learning_rate": 0.00016861950960371725, + "loss": 0.6571, + "step": 1056 + }, + { + "epoch": 0.28186666666666665, + "grad_norm": 0.37484424032181257, + "learning_rate": 0.00016855665064316878, + "loss": 0.6797, + "step": 1057 + }, + { + "epoch": 0.28213333333333335, + "grad_norm": 0.3619203324005068, + "learning_rate": 0.00016849374053060982, + "loss": 0.6864, + "step": 1058 + }, + { + "epoch": 0.2824, + "grad_norm": 0.3919017460182505, + "learning_rate": 0.00016843077931297931, + "loss": 0.6688, + "step": 1059 + }, + { + "epoch": 0.2826666666666667, + "grad_norm": 0.36523376713834776, + "learning_rate": 0.0001683677670372544, + "loss": 0.6491, + "step": 1060 + }, + { + "epoch": 0.2829333333333333, + "grad_norm": 0.4042315615415427, + "learning_rate": 0.00016830470375045026, + "loss": 0.7029, + "step": 1061 + }, + { + "epoch": 0.2832, + "grad_norm": 0.38412192749748014, + "learning_rate": 0.0001682415894996201, + "loss": 0.7355, + "step": 1062 + }, + { + "epoch": 0.28346666666666664, + "grad_norm": 0.36013514606028135, + "learning_rate": 0.0001681784243318553, + "loss": 0.7206, + "step": 1063 + }, + { + "epoch": 0.28373333333333334, + "grad_norm": 0.3732908816050375, + "learning_rate": 0.0001681152082942851, + "loss": 0.6888, + "step": 1064 + }, + { + "epoch": 0.284, + "grad_norm": 0.36299458065614304, + "learning_rate": 0.0001680519414340767, + "loss": 0.6489, + "step": 1065 + }, + { + "epoch": 0.28426666666666667, + "grad_norm": 0.3572414467072343, + "learning_rate": 0.0001679886237984353, + "loss": 0.6226, + "step": 1066 + }, + { + "epoch": 0.28453333333333336, + "grad_norm": 0.36499664268910215, + "learning_rate": 0.00016792525543460386, + "loss": 0.6657, + "step": 1067 + }, + { + "epoch": 0.2848, + "grad_norm": 0.4036141052244234, + "learning_rate": 0.00016786183638986337, + "loss": 0.6645, + "step": 1068 + }, + { + "epoch": 0.2850666666666667, + "grad_norm": 0.37185305842029137, + "learning_rate": 0.00016779836671153246, + "loss": 0.6739, + "step": 1069 + }, + { + "epoch": 0.2853333333333333, + "grad_norm": 0.35674199303735904, + "learning_rate": 0.00016773484644696764, + "loss": 0.7012, + "step": 1070 + }, + { + "epoch": 0.2856, + "grad_norm": 0.3459982007445063, + "learning_rate": 0.0001676712756435631, + "loss": 0.6454, + "step": 1071 + }, + { + "epoch": 0.28586666666666666, + "grad_norm": 0.36779706444281307, + "learning_rate": 0.0001676076543487508, + "loss": 0.692, + "step": 1072 + }, + { + "epoch": 0.28613333333333335, + "grad_norm": 0.35273160102593354, + "learning_rate": 0.00016754398261000037, + "loss": 0.6622, + "step": 1073 + }, + { + "epoch": 0.2864, + "grad_norm": 0.3453388756501663, + "learning_rate": 0.000167480260474819, + "loss": 0.6803, + "step": 1074 + }, + { + "epoch": 0.2866666666666667, + "grad_norm": 0.3609617910598972, + "learning_rate": 0.00016741648799075158, + "loss": 0.6633, + "step": 1075 + }, + { + "epoch": 0.2869333333333333, + "grad_norm": 0.339521677038658, + "learning_rate": 0.00016735266520538048, + "loss": 0.635, + "step": 1076 + }, + { + "epoch": 0.2872, + "grad_norm": 0.36655723439723775, + "learning_rate": 0.00016728879216632567, + "loss": 0.6184, + "step": 1077 + }, + { + "epoch": 0.28746666666666665, + "grad_norm": 0.36111905308044123, + "learning_rate": 0.00016722486892124455, + "loss": 0.6739, + "step": 1078 + }, + { + "epoch": 0.28773333333333334, + "grad_norm": 0.397710445374459, + "learning_rate": 0.0001671608955178321, + "loss": 0.6854, + "step": 1079 + }, + { + "epoch": 0.288, + "grad_norm": 0.3707630546517936, + "learning_rate": 0.0001670968720038206, + "loss": 0.6103, + "step": 1080 + }, + { + "epoch": 0.28826666666666667, + "grad_norm": 0.3710674802693859, + "learning_rate": 0.00016703279842697973, + "loss": 0.6735, + "step": 1081 + }, + { + "epoch": 0.2885333333333333, + "grad_norm": 0.3643854446717735, + "learning_rate": 0.00016696867483511656, + "loss": 0.5974, + "step": 1082 + }, + { + "epoch": 0.2888, + "grad_norm": 0.35067221992190195, + "learning_rate": 0.00016690450127607553, + "loss": 0.6635, + "step": 1083 + }, + { + "epoch": 0.2890666666666667, + "grad_norm": 0.3621828416846456, + "learning_rate": 0.00016684027779773826, + "loss": 0.6324, + "step": 1084 + }, + { + "epoch": 0.28933333333333333, + "grad_norm": 0.37352912194864135, + "learning_rate": 0.00016677600444802365, + "loss": 0.6739, + "step": 1085 + }, + { + "epoch": 0.2896, + "grad_norm": 0.37039670950564546, + "learning_rate": 0.00016671168127488785, + "loss": 0.6924, + "step": 1086 + }, + { + "epoch": 0.28986666666666666, + "grad_norm": 0.34511970312909246, + "learning_rate": 0.00016664730832632415, + "loss": 0.5947, + "step": 1087 + }, + { + "epoch": 0.29013333333333335, + "grad_norm": 0.35705567958317896, + "learning_rate": 0.000166582885650363, + "loss": 0.6652, + "step": 1088 + }, + { + "epoch": 0.2904, + "grad_norm": 0.3557212822445767, + "learning_rate": 0.0001665184132950719, + "loss": 0.6948, + "step": 1089 + }, + { + "epoch": 0.2906666666666667, + "grad_norm": 0.3273863887034328, + "learning_rate": 0.00016645389130855547, + "loss": 0.6696, + "step": 1090 + }, + { + "epoch": 0.2909333333333333, + "grad_norm": 0.35539875700521045, + "learning_rate": 0.00016638931973895536, + "loss": 0.6458, + "step": 1091 + }, + { + "epoch": 0.2912, + "grad_norm": 0.36460568537688526, + "learning_rate": 0.0001663246986344502, + "loss": 0.657, + "step": 1092 + }, + { + "epoch": 0.29146666666666665, + "grad_norm": 0.36649558584500275, + "learning_rate": 0.00016626002804325557, + "loss": 0.6609, + "step": 1093 + }, + { + "epoch": 0.29173333333333334, + "grad_norm": 0.39106219168027234, + "learning_rate": 0.00016619530801362394, + "loss": 0.6779, + "step": 1094 + }, + { + "epoch": 0.292, + "grad_norm": 0.3491447468740579, + "learning_rate": 0.00016613053859384485, + "loss": 0.6291, + "step": 1095 + }, + { + "epoch": 0.2922666666666667, + "grad_norm": 0.3671723824209041, + "learning_rate": 0.0001660657198322444, + "loss": 0.6181, + "step": 1096 + }, + { + "epoch": 0.2925333333333333, + "grad_norm": 0.3810939764036908, + "learning_rate": 0.00016600085177718573, + "loss": 0.6865, + "step": 1097 + }, + { + "epoch": 0.2928, + "grad_norm": 0.3275163187792605, + "learning_rate": 0.00016593593447706865, + "loss": 0.617, + "step": 1098 + }, + { + "epoch": 0.29306666666666664, + "grad_norm": 0.3519344941542276, + "learning_rate": 0.00016587096798032983, + "loss": 0.6611, + "step": 1099 + }, + { + "epoch": 0.29333333333333333, + "grad_norm": 0.36470694237855455, + "learning_rate": 0.00016580595233544248, + "loss": 0.7194, + "step": 1100 + }, + { + "epoch": 0.2936, + "grad_norm": 0.3651018255813527, + "learning_rate": 0.00016574088759091664, + "loss": 0.6889, + "step": 1101 + }, + { + "epoch": 0.29386666666666666, + "grad_norm": 0.39625683516580695, + "learning_rate": 0.00016567577379529883, + "loss": 0.7292, + "step": 1102 + }, + { + "epoch": 0.29413333333333336, + "grad_norm": 0.37248125147916067, + "learning_rate": 0.00016561061099717235, + "loss": 0.6468, + "step": 1103 + }, + { + "epoch": 0.2944, + "grad_norm": 0.34616853968603206, + "learning_rate": 0.00016554539924515686, + "loss": 0.6406, + "step": 1104 + }, + { + "epoch": 0.2946666666666667, + "grad_norm": 0.36652518870161205, + "learning_rate": 0.00016548013858790875, + "loss": 0.6616, + "step": 1105 + }, + { + "epoch": 0.2949333333333333, + "grad_norm": 0.39987104771015125, + "learning_rate": 0.0001654148290741207, + "loss": 0.7032, + "step": 1106 + }, + { + "epoch": 0.2952, + "grad_norm": 0.3288161117550961, + "learning_rate": 0.00016534947075252203, + "loss": 0.607, + "step": 1107 + }, + { + "epoch": 0.29546666666666666, + "grad_norm": 0.37883166858825185, + "learning_rate": 0.00016528406367187834, + "loss": 0.6274, + "step": 1108 + }, + { + "epoch": 0.29573333333333335, + "grad_norm": 0.38880499843762534, + "learning_rate": 0.00016521860788099165, + "loss": 0.6164, + "step": 1109 + }, + { + "epoch": 0.296, + "grad_norm": 0.37510296895333245, + "learning_rate": 0.0001651531034287004, + "loss": 0.67, + "step": 1110 + }, + { + "epoch": 0.2962666666666667, + "grad_norm": 0.3728715451387841, + "learning_rate": 0.00016508755036387919, + "loss": 0.6589, + "step": 1111 + }, + { + "epoch": 0.2965333333333333, + "grad_norm": 0.36323250640918014, + "learning_rate": 0.000165021948735439, + "loss": 0.61, + "step": 1112 + }, + { + "epoch": 0.2968, + "grad_norm": 0.3870463288023288, + "learning_rate": 0.00016495629859232704, + "loss": 0.6544, + "step": 1113 + }, + { + "epoch": 0.29706666666666665, + "grad_norm": 0.3605244652694021, + "learning_rate": 0.00016489059998352668, + "loss": 0.6255, + "step": 1114 + }, + { + "epoch": 0.29733333333333334, + "grad_norm": 0.38127211315090653, + "learning_rate": 0.00016482485295805748, + "loss": 0.6354, + "step": 1115 + }, + { + "epoch": 0.2976, + "grad_norm": 0.37297391747401476, + "learning_rate": 0.00016475905756497506, + "loss": 0.6344, + "step": 1116 + }, + { + "epoch": 0.29786666666666667, + "grad_norm": 0.36574361802191735, + "learning_rate": 0.00016469321385337123, + "loss": 0.6768, + "step": 1117 + }, + { + "epoch": 0.2981333333333333, + "grad_norm": 0.349897972234985, + "learning_rate": 0.0001646273218723738, + "loss": 0.6278, + "step": 1118 + }, + { + "epoch": 0.2984, + "grad_norm": 0.35673503680421903, + "learning_rate": 0.00016456138167114656, + "loss": 0.663, + "step": 1119 + }, + { + "epoch": 0.2986666666666667, + "grad_norm": 0.3646284802005334, + "learning_rate": 0.00016449539329888935, + "loss": 0.688, + "step": 1120 + }, + { + "epoch": 0.29893333333333333, + "grad_norm": 0.37942823954405064, + "learning_rate": 0.0001644293568048379, + "loss": 0.648, + "step": 1121 + }, + { + "epoch": 0.2992, + "grad_norm": 0.3706131772317103, + "learning_rate": 0.00016436327223826389, + "loss": 0.6872, + "step": 1122 + }, + { + "epoch": 0.29946666666666666, + "grad_norm": 0.363918550349555, + "learning_rate": 0.00016429713964847482, + "loss": 0.6653, + "step": 1123 + }, + { + "epoch": 0.29973333333333335, + "grad_norm": 0.37357927487684806, + "learning_rate": 0.00016423095908481403, + "loss": 0.6309, + "step": 1124 + }, + { + "epoch": 0.3, + "grad_norm": 0.4160864510634148, + "learning_rate": 0.00016416473059666065, + "loss": 0.7171, + "step": 1125 + }, + { + "epoch": 0.3002666666666667, + "grad_norm": 0.3524992252789292, + "learning_rate": 0.00016409845423342968, + "loss": 0.5971, + "step": 1126 + }, + { + "epoch": 0.3005333333333333, + "grad_norm": 0.36655727780622716, + "learning_rate": 0.00016403213004457162, + "loss": 0.6747, + "step": 1127 + }, + { + "epoch": 0.3008, + "grad_norm": 0.36583303447162424, + "learning_rate": 0.00016396575807957285, + "loss": 0.6894, + "step": 1128 + }, + { + "epoch": 0.30106666666666665, + "grad_norm": 0.4067169112555897, + "learning_rate": 0.00016389933838795534, + "loss": 0.7781, + "step": 1129 + }, + { + "epoch": 0.30133333333333334, + "grad_norm": 0.3664677596940745, + "learning_rate": 0.0001638328710192766, + "loss": 0.6704, + "step": 1130 + }, + { + "epoch": 0.3016, + "grad_norm": 0.37859485625377665, + "learning_rate": 0.00016376635602312982, + "loss": 0.6546, + "step": 1131 + }, + { + "epoch": 0.30186666666666667, + "grad_norm": 0.35192579221280346, + "learning_rate": 0.00016369979344914363, + "loss": 0.6297, + "step": 1132 + }, + { + "epoch": 0.3021333333333333, + "grad_norm": 0.34384956160748753, + "learning_rate": 0.00016363318334698223, + "loss": 0.6177, + "step": 1133 + }, + { + "epoch": 0.3024, + "grad_norm": 0.4504626611019618, + "learning_rate": 0.00016356652576634528, + "loss": 0.6942, + "step": 1134 + }, + { + "epoch": 0.30266666666666664, + "grad_norm": 0.39280597953969865, + "learning_rate": 0.0001634998207569678, + "loss": 0.6659, + "step": 1135 + }, + { + "epoch": 0.30293333333333333, + "grad_norm": 0.3679328159231638, + "learning_rate": 0.00016343306836862027, + "loss": 0.7045, + "step": 1136 + }, + { + "epoch": 0.3032, + "grad_norm": 0.35151668026257765, + "learning_rate": 0.00016336626865110843, + "loss": 0.6501, + "step": 1137 + }, + { + "epoch": 0.30346666666666666, + "grad_norm": 0.35385315501283787, + "learning_rate": 0.0001632994216542735, + "loss": 0.6332, + "step": 1138 + }, + { + "epoch": 0.30373333333333336, + "grad_norm": 0.36363587618583665, + "learning_rate": 0.00016323252742799182, + "loss": 0.6315, + "step": 1139 + }, + { + "epoch": 0.304, + "grad_norm": 0.3948592600839174, + "learning_rate": 0.000163165586022175, + "loss": 0.6482, + "step": 1140 + }, + { + "epoch": 0.3042666666666667, + "grad_norm": 0.36302246509769004, + "learning_rate": 0.00016309859748676983, + "loss": 0.6671, + "step": 1141 + }, + { + "epoch": 0.3045333333333333, + "grad_norm": 0.3779194376140945, + "learning_rate": 0.00016303156187175843, + "loss": 0.6871, + "step": 1142 + }, + { + "epoch": 0.3048, + "grad_norm": 0.36192175473327054, + "learning_rate": 0.0001629644792271578, + "loss": 0.6475, + "step": 1143 + }, + { + "epoch": 0.30506666666666665, + "grad_norm": 0.37289607890822235, + "learning_rate": 0.00016289734960302026, + "loss": 0.6949, + "step": 1144 + }, + { + "epoch": 0.30533333333333335, + "grad_norm": 0.3523417041461089, + "learning_rate": 0.00016283017304943295, + "loss": 0.628, + "step": 1145 + }, + { + "epoch": 0.3056, + "grad_norm": 0.3551378752338613, + "learning_rate": 0.0001627629496165183, + "loss": 0.6634, + "step": 1146 + }, + { + "epoch": 0.3058666666666667, + "grad_norm": 0.3472197515319477, + "learning_rate": 0.0001626956793544335, + "loss": 0.6865, + "step": 1147 + }, + { + "epoch": 0.3061333333333333, + "grad_norm": 0.35780980480311225, + "learning_rate": 0.00016262836231337071, + "loss": 0.6342, + "step": 1148 + }, + { + "epoch": 0.3064, + "grad_norm": 0.338062986445356, + "learning_rate": 0.00016256099854355707, + "loss": 0.6323, + "step": 1149 + }, + { + "epoch": 0.30666666666666664, + "grad_norm": 0.3659727126389936, + "learning_rate": 0.00016249358809525456, + "loss": 0.6675, + "step": 1150 + }, + { + "epoch": 0.30693333333333334, + "grad_norm": 0.3795725157114804, + "learning_rate": 0.00016242613101876, + "loss": 0.6663, + "step": 1151 + }, + { + "epoch": 0.3072, + "grad_norm": 0.35794199426970963, + "learning_rate": 0.00016235862736440487, + "loss": 0.6523, + "step": 1152 + }, + { + "epoch": 0.30746666666666667, + "grad_norm": 0.3642950023320711, + "learning_rate": 0.00016229107718255566, + "loss": 0.6538, + "step": 1153 + }, + { + "epoch": 0.30773333333333336, + "grad_norm": 0.36534479278477566, + "learning_rate": 0.00016222348052361333, + "loss": 0.6342, + "step": 1154 + }, + { + "epoch": 0.308, + "grad_norm": 0.3740616755218169, + "learning_rate": 0.0001621558374380136, + "loss": 0.6854, + "step": 1155 + }, + { + "epoch": 0.3082666666666667, + "grad_norm": 0.38679655278656966, + "learning_rate": 0.00016208814797622693, + "loss": 0.6466, + "step": 1156 + }, + { + "epoch": 0.3085333333333333, + "grad_norm": 0.37379508723873567, + "learning_rate": 0.00016202041218875825, + "loss": 0.6745, + "step": 1157 + }, + { + "epoch": 0.3088, + "grad_norm": 0.3506501397117617, + "learning_rate": 0.00016195263012614705, + "loss": 0.6428, + "step": 1158 + }, + { + "epoch": 0.30906666666666666, + "grad_norm": 0.3842823121392134, + "learning_rate": 0.0001618848018389675, + "loss": 0.6984, + "step": 1159 + }, + { + "epoch": 0.30933333333333335, + "grad_norm": 0.3680409119623642, + "learning_rate": 0.00016181692737782808, + "loss": 0.6595, + "step": 1160 + }, + { + "epoch": 0.3096, + "grad_norm": 0.38458618762078806, + "learning_rate": 0.00016174900679337184, + "loss": 0.6032, + "step": 1161 + }, + { + "epoch": 0.3098666666666667, + "grad_norm": 0.3822741148204659, + "learning_rate": 0.0001616810401362762, + "loss": 0.6991, + "step": 1162 + }, + { + "epoch": 0.3101333333333333, + "grad_norm": 0.3335556786759688, + "learning_rate": 0.00016161302745725293, + "loss": 0.6173, + "step": 1163 + }, + { + "epoch": 0.3104, + "grad_norm": 0.33459018128300577, + "learning_rate": 0.00016154496880704819, + "loss": 0.6514, + "step": 1164 + }, + { + "epoch": 0.31066666666666665, + "grad_norm": 0.35310490386718985, + "learning_rate": 0.00016147686423644242, + "loss": 0.6627, + "step": 1165 + }, + { + "epoch": 0.31093333333333334, + "grad_norm": 0.38067503455267376, + "learning_rate": 0.00016140871379625033, + "loss": 0.6253, + "step": 1166 + }, + { + "epoch": 0.3112, + "grad_norm": 0.3959262621372224, + "learning_rate": 0.00016134051753732083, + "loss": 0.6652, + "step": 1167 + }, + { + "epoch": 0.31146666666666667, + "grad_norm": 0.34540358267945126, + "learning_rate": 0.00016127227551053703, + "loss": 0.6386, + "step": 1168 + }, + { + "epoch": 0.3117333333333333, + "grad_norm": 0.34445275213733884, + "learning_rate": 0.0001612039877668162, + "loss": 0.6275, + "step": 1169 + }, + { + "epoch": 0.312, + "grad_norm": 0.3679875177944374, + "learning_rate": 0.00016113565435710972, + "loss": 0.7062, + "step": 1170 + }, + { + "epoch": 0.3122666666666667, + "grad_norm": 0.3822147893078014, + "learning_rate": 0.00016106727533240302, + "loss": 0.6532, + "step": 1171 + }, + { + "epoch": 0.31253333333333333, + "grad_norm": 0.36408722525935105, + "learning_rate": 0.00016099885074371558, + "loss": 0.6316, + "step": 1172 + }, + { + "epoch": 0.3128, + "grad_norm": 0.35163563376622164, + "learning_rate": 0.0001609303806421009, + "loss": 0.6348, + "step": 1173 + }, + { + "epoch": 0.31306666666666666, + "grad_norm": 0.36364646710195, + "learning_rate": 0.00016086186507864635, + "loss": 0.6776, + "step": 1174 + }, + { + "epoch": 0.31333333333333335, + "grad_norm": 0.3333509329366065, + "learning_rate": 0.00016079330410447335, + "loss": 0.6054, + "step": 1175 + }, + { + "epoch": 0.3136, + "grad_norm": 0.351290564829832, + "learning_rate": 0.0001607246977707371, + "loss": 0.6479, + "step": 1176 + }, + { + "epoch": 0.3138666666666667, + "grad_norm": 0.3576339066147463, + "learning_rate": 0.00016065604612862676, + "loss": 0.6779, + "step": 1177 + }, + { + "epoch": 0.3141333333333333, + "grad_norm": 0.3511175504088521, + "learning_rate": 0.00016058734922936506, + "loss": 0.6494, + "step": 1178 + }, + { + "epoch": 0.3144, + "grad_norm": 0.3694486746759354, + "learning_rate": 0.00016051860712420877, + "loss": 0.6912, + "step": 1179 + }, + { + "epoch": 0.31466666666666665, + "grad_norm": 0.36096748149280555, + "learning_rate": 0.00016044981986444826, + "loss": 0.6732, + "step": 1180 + }, + { + "epoch": 0.31493333333333334, + "grad_norm": 0.35219851339863323, + "learning_rate": 0.0001603809875014076, + "loss": 0.6236, + "step": 1181 + }, + { + "epoch": 0.3152, + "grad_norm": 0.3765659705700948, + "learning_rate": 0.00016031211008644448, + "loss": 0.6985, + "step": 1182 + }, + { + "epoch": 0.3154666666666667, + "grad_norm": 0.3695985043120348, + "learning_rate": 0.0001602431876709503, + "loss": 0.7035, + "step": 1183 + }, + { + "epoch": 0.3157333333333333, + "grad_norm": 0.3608009123050179, + "learning_rate": 0.00016017422030634992, + "loss": 0.6399, + "step": 1184 + }, + { + "epoch": 0.316, + "grad_norm": 0.3748073320657708, + "learning_rate": 0.00016010520804410184, + "loss": 0.6675, + "step": 1185 + }, + { + "epoch": 0.31626666666666664, + "grad_norm": 0.3714347092962332, + "learning_rate": 0.000160036150935698, + "loss": 0.6843, + "step": 1186 + }, + { + "epoch": 0.31653333333333333, + "grad_norm": 0.3928369814244889, + "learning_rate": 0.00015996704903266382, + "loss": 0.6792, + "step": 1187 + }, + { + "epoch": 0.3168, + "grad_norm": 0.3710624996331732, + "learning_rate": 0.0001598979023865581, + "loss": 0.6291, + "step": 1188 + }, + { + "epoch": 0.31706666666666666, + "grad_norm": 0.34999745773625557, + "learning_rate": 0.00015982871104897313, + "loss": 0.626, + "step": 1189 + }, + { + "epoch": 0.31733333333333336, + "grad_norm": 0.38243522357450194, + "learning_rate": 0.00015975947507153443, + "loss": 0.6929, + "step": 1190 + }, + { + "epoch": 0.3176, + "grad_norm": 0.3630642104901653, + "learning_rate": 0.00015969019450590087, + "loss": 0.6444, + "step": 1191 + }, + { + "epoch": 0.3178666666666667, + "grad_norm": 0.35883144312578763, + "learning_rate": 0.00015962086940376463, + "loss": 0.6708, + "step": 1192 + }, + { + "epoch": 0.3181333333333333, + "grad_norm": 0.37130043786712597, + "learning_rate": 0.00015955149981685107, + "loss": 0.6878, + "step": 1193 + }, + { + "epoch": 0.3184, + "grad_norm": 0.34879606254394946, + "learning_rate": 0.00015948208579691877, + "loss": 0.6589, + "step": 1194 + }, + { + "epoch": 0.31866666666666665, + "grad_norm": 0.3592033146179873, + "learning_rate": 0.00015941262739575937, + "loss": 0.649, + "step": 1195 + }, + { + "epoch": 0.31893333333333335, + "grad_norm": 0.34879313258280176, + "learning_rate": 0.0001593431246651978, + "loss": 0.6651, + "step": 1196 + }, + { + "epoch": 0.3192, + "grad_norm": 0.3554203164586111, + "learning_rate": 0.0001592735776570919, + "loss": 0.6704, + "step": 1197 + }, + { + "epoch": 0.3194666666666667, + "grad_norm": 0.35190883617460833, + "learning_rate": 0.00015920398642333265, + "loss": 0.6956, + "step": 1198 + }, + { + "epoch": 0.3197333333333333, + "grad_norm": 0.370121617005598, + "learning_rate": 0.00015913435101584398, + "loss": 0.6992, + "step": 1199 + }, + { + "epoch": 0.32, + "grad_norm": 0.3360198400648102, + "learning_rate": 0.0001590646714865828, + "loss": 0.6296, + "step": 1200 + }, + { + "epoch": 0.32026666666666664, + "grad_norm": 0.35623368554828544, + "learning_rate": 0.00015899494788753892, + "loss": 0.6498, + "step": 1201 + }, + { + "epoch": 0.32053333333333334, + "grad_norm": 0.37256055318413156, + "learning_rate": 0.00015892518027073505, + "loss": 0.6367, + "step": 1202 + }, + { + "epoch": 0.3208, + "grad_norm": 0.3627845573248268, + "learning_rate": 0.00015885536868822671, + "loss": 0.6841, + "step": 1203 + }, + { + "epoch": 0.32106666666666667, + "grad_norm": 0.36019386889032073, + "learning_rate": 0.00015878551319210228, + "loss": 0.6096, + "step": 1204 + }, + { + "epoch": 0.32133333333333336, + "grad_norm": 0.3574885966722773, + "learning_rate": 0.00015871561383448286, + "loss": 0.6442, + "step": 1205 + }, + { + "epoch": 0.3216, + "grad_norm": 0.3563774914852631, + "learning_rate": 0.0001586456706675223, + "loss": 0.6508, + "step": 1206 + }, + { + "epoch": 0.3218666666666667, + "grad_norm": 0.3662607386148354, + "learning_rate": 0.00015857568374340713, + "loss": 0.6518, + "step": 1207 + }, + { + "epoch": 0.3221333333333333, + "grad_norm": 0.38425098823181136, + "learning_rate": 0.00015850565311435652, + "loss": 0.6681, + "step": 1208 + }, + { + "epoch": 0.3224, + "grad_norm": 0.3606066742289815, + "learning_rate": 0.00015843557883262225, + "loss": 0.6464, + "step": 1209 + }, + { + "epoch": 0.32266666666666666, + "grad_norm": 0.3485835343050614, + "learning_rate": 0.0001583654609504887, + "loss": 0.6415, + "step": 1210 + }, + { + "epoch": 0.32293333333333335, + "grad_norm": 0.3433252913598194, + "learning_rate": 0.00015829529952027276, + "loss": 0.6472, + "step": 1211 + }, + { + "epoch": 0.3232, + "grad_norm": 0.38180169579767054, + "learning_rate": 0.00015822509459432379, + "loss": 0.6407, + "step": 1212 + }, + { + "epoch": 0.3234666666666667, + "grad_norm": 0.3536578967226547, + "learning_rate": 0.0001581548462250236, + "loss": 0.6363, + "step": 1213 + }, + { + "epoch": 0.3237333333333333, + "grad_norm": 0.36302717425259284, + "learning_rate": 0.00015808455446478646, + "loss": 0.6048, + "step": 1214 + }, + { + "epoch": 0.324, + "grad_norm": 0.38596616795709465, + "learning_rate": 0.00015801421936605903, + "loss": 0.6256, + "step": 1215 + }, + { + "epoch": 0.32426666666666665, + "grad_norm": 0.39008874153124756, + "learning_rate": 0.00015794384098132027, + "loss": 0.6775, + "step": 1216 + }, + { + "epoch": 0.32453333333333334, + "grad_norm": 0.3599031025738248, + "learning_rate": 0.00015787341936308134, + "loss": 0.6753, + "step": 1217 + }, + { + "epoch": 0.3248, + "grad_norm": 0.3666623377024128, + "learning_rate": 0.00015780295456388588, + "loss": 0.6514, + "step": 1218 + }, + { + "epoch": 0.32506666666666667, + "grad_norm": 0.3764398383093878, + "learning_rate": 0.00015773244663630953, + "loss": 0.6318, + "step": 1219 + }, + { + "epoch": 0.3253333333333333, + "grad_norm": 0.3616039006983256, + "learning_rate": 0.00015766189563296029, + "loss": 0.6895, + "step": 1220 + }, + { + "epoch": 0.3256, + "grad_norm": 0.36386024728183697, + "learning_rate": 0.0001575913016064781, + "loss": 0.6189, + "step": 1221 + }, + { + "epoch": 0.3258666666666667, + "grad_norm": 0.37325472624912076, + "learning_rate": 0.0001575206646095352, + "loss": 0.6583, + "step": 1222 + }, + { + "epoch": 0.32613333333333333, + "grad_norm": 0.38291898189334206, + "learning_rate": 0.00015744998469483575, + "loss": 0.6421, + "step": 1223 + }, + { + "epoch": 0.3264, + "grad_norm": 0.3653212094574066, + "learning_rate": 0.00015737926191511606, + "loss": 0.653, + "step": 1224 + }, + { + "epoch": 0.32666666666666666, + "grad_norm": 0.37838881194679863, + "learning_rate": 0.00015730849632314428, + "loss": 0.7042, + "step": 1225 + }, + { + "epoch": 0.32693333333333335, + "grad_norm": 0.3851919203073385, + "learning_rate": 0.0001572376879717206, + "loss": 0.6746, + "step": 1226 + }, + { + "epoch": 0.3272, + "grad_norm": 0.3802774289463264, + "learning_rate": 0.00015716683691367704, + "loss": 0.64, + "step": 1227 + }, + { + "epoch": 0.3274666666666667, + "grad_norm": 0.39424919350285487, + "learning_rate": 0.0001570959432018776, + "loss": 0.7212, + "step": 1228 + }, + { + "epoch": 0.3277333333333333, + "grad_norm": 0.3620130208318971, + "learning_rate": 0.00015702500688921805, + "loss": 0.6527, + "step": 1229 + }, + { + "epoch": 0.328, + "grad_norm": 0.37466286399006393, + "learning_rate": 0.00015695402802862584, + "loss": 0.6102, + "step": 1230 + }, + { + "epoch": 0.32826666666666665, + "grad_norm": 0.345108202972318, + "learning_rate": 0.00015688300667306032, + "loss": 0.6793, + "step": 1231 + }, + { + "epoch": 0.32853333333333334, + "grad_norm": 0.3720730074945737, + "learning_rate": 0.0001568119428755125, + "loss": 0.6619, + "step": 1232 + }, + { + "epoch": 0.3288, + "grad_norm": 0.3569757717493977, + "learning_rate": 0.000156740836689005, + "loss": 0.6378, + "step": 1233 + }, + { + "epoch": 0.3290666666666667, + "grad_norm": 0.3598068042732101, + "learning_rate": 0.0001566696881665921, + "loss": 0.639, + "step": 1234 + }, + { + "epoch": 0.3293333333333333, + "grad_norm": 0.3872322396630404, + "learning_rate": 0.00015659849736135976, + "loss": 0.6437, + "step": 1235 + }, + { + "epoch": 0.3296, + "grad_norm": 0.4061654621552088, + "learning_rate": 0.00015652726432642533, + "loss": 0.7233, + "step": 1236 + }, + { + "epoch": 0.32986666666666664, + "grad_norm": 0.37738913064383833, + "learning_rate": 0.00015645598911493775, + "loss": 0.6551, + "step": 1237 + }, + { + "epoch": 0.33013333333333333, + "grad_norm": 0.3429208761962747, + "learning_rate": 0.00015638467178007742, + "loss": 0.6359, + "step": 1238 + }, + { + "epoch": 0.3304, + "grad_norm": 0.38534787195578357, + "learning_rate": 0.00015631331237505623, + "loss": 0.6529, + "step": 1239 + }, + { + "epoch": 0.33066666666666666, + "grad_norm": 0.3566870640114799, + "learning_rate": 0.00015624191095311735, + "loss": 0.6334, + "step": 1240 + }, + { + "epoch": 0.33093333333333336, + "grad_norm": 0.3507989735290889, + "learning_rate": 0.00015617046756753538, + "loss": 0.6648, + "step": 1241 + }, + { + "epoch": 0.3312, + "grad_norm": 0.3804327588695725, + "learning_rate": 0.00015609898227161617, + "loss": 0.5657, + "step": 1242 + }, + { + "epoch": 0.3314666666666667, + "grad_norm": 0.36782873077642625, + "learning_rate": 0.00015602745511869692, + "loss": 0.6844, + "step": 1243 + }, + { + "epoch": 0.3317333333333333, + "grad_norm": 0.37440910177301917, + "learning_rate": 0.00015595588616214596, + "loss": 0.6117, + "step": 1244 + }, + { + "epoch": 0.332, + "grad_norm": 0.38262043407727425, + "learning_rate": 0.0001558842754553629, + "loss": 0.6493, + "step": 1245 + }, + { + "epoch": 0.33226666666666665, + "grad_norm": 0.34683869042386345, + "learning_rate": 0.00015581262305177846, + "loss": 0.6775, + "step": 1246 + }, + { + "epoch": 0.33253333333333335, + "grad_norm": 0.3519959092207261, + "learning_rate": 0.0001557409290048545, + "loss": 0.6262, + "step": 1247 + }, + { + "epoch": 0.3328, + "grad_norm": 0.35762407123056417, + "learning_rate": 0.00015566919336808388, + "loss": 0.587, + "step": 1248 + }, + { + "epoch": 0.3330666666666667, + "grad_norm": 0.3734952689135648, + "learning_rate": 0.0001555974161949906, + "loss": 0.6658, + "step": 1249 + }, + { + "epoch": 0.3333333333333333, + "grad_norm": 0.36249522777359267, + "learning_rate": 0.00015552559753912953, + "loss": 0.6504, + "step": 1250 + }, + { + "epoch": 0.3336, + "grad_norm": 0.35338497034278515, + "learning_rate": 0.00015545373745408657, + "loss": 0.6346, + "step": 1251 + }, + { + "epoch": 0.33386666666666664, + "grad_norm": 0.3941213271584644, + "learning_rate": 0.00015538183599347853, + "loss": 0.6894, + "step": 1252 + }, + { + "epoch": 0.33413333333333334, + "grad_norm": 0.37980684186528246, + "learning_rate": 0.00015530989321095308, + "loss": 0.6465, + "step": 1253 + }, + { + "epoch": 0.3344, + "grad_norm": 0.3386191752029083, + "learning_rate": 0.0001552379091601887, + "loss": 0.6091, + "step": 1254 + }, + { + "epoch": 0.33466666666666667, + "grad_norm": 0.35499212515740847, + "learning_rate": 0.0001551658838948947, + "loss": 0.6708, + "step": 1255 + }, + { + "epoch": 0.33493333333333336, + "grad_norm": 0.3507762267790256, + "learning_rate": 0.0001550938174688111, + "loss": 0.6653, + "step": 1256 + }, + { + "epoch": 0.3352, + "grad_norm": 0.3509622875990821, + "learning_rate": 0.00015502170993570864, + "loss": 0.6546, + "step": 1257 + }, + { + "epoch": 0.3354666666666667, + "grad_norm": 0.3683119642145877, + "learning_rate": 0.00015494956134938875, + "loss": 0.6768, + "step": 1258 + }, + { + "epoch": 0.33573333333333333, + "grad_norm": 0.33967456657685374, + "learning_rate": 0.00015487737176368352, + "loss": 0.6324, + "step": 1259 + }, + { + "epoch": 0.336, + "grad_norm": 0.3490156070968416, + "learning_rate": 0.00015480514123245555, + "loss": 0.6077, + "step": 1260 + }, + { + "epoch": 0.33626666666666666, + "grad_norm": 0.37065388362132234, + "learning_rate": 0.00015473286980959805, + "loss": 0.6609, + "step": 1261 + }, + { + "epoch": 0.33653333333333335, + "grad_norm": 0.36879952858462517, + "learning_rate": 0.0001546605575490347, + "loss": 0.614, + "step": 1262 + }, + { + "epoch": 0.3368, + "grad_norm": 0.36123447052475727, + "learning_rate": 0.00015458820450471974, + "loss": 0.6546, + "step": 1263 + }, + { + "epoch": 0.3370666666666667, + "grad_norm": 0.35536305457711176, + "learning_rate": 0.0001545158107306377, + "loss": 0.6206, + "step": 1264 + }, + { + "epoch": 0.3373333333333333, + "grad_norm": 0.3953199215685226, + "learning_rate": 0.00015444337628080362, + "loss": 0.6641, + "step": 1265 + }, + { + "epoch": 0.3376, + "grad_norm": 0.3625484098950176, + "learning_rate": 0.00015437090120926284, + "loss": 0.6325, + "step": 1266 + }, + { + "epoch": 0.33786666666666665, + "grad_norm": 0.3558095021789365, + "learning_rate": 0.000154298385570091, + "loss": 0.6268, + "step": 1267 + }, + { + "epoch": 0.33813333333333334, + "grad_norm": 0.36446549265199385, + "learning_rate": 0.00015422582941739397, + "loss": 0.6534, + "step": 1268 + }, + { + "epoch": 0.3384, + "grad_norm": 0.35619854430761033, + "learning_rate": 0.00015415323280530802, + "loss": 0.6651, + "step": 1269 + }, + { + "epoch": 0.33866666666666667, + "grad_norm": 0.35900610187354576, + "learning_rate": 0.0001540805957879994, + "loss": 0.6826, + "step": 1270 + }, + { + "epoch": 0.3389333333333333, + "grad_norm": 0.36417732187548524, + "learning_rate": 0.00015400791841966465, + "loss": 0.6685, + "step": 1271 + }, + { + "epoch": 0.3392, + "grad_norm": 0.38437561051333186, + "learning_rate": 0.00015393520075453028, + "loss": 0.6264, + "step": 1272 + }, + { + "epoch": 0.3394666666666667, + "grad_norm": 0.34756552123783735, + "learning_rate": 0.00015386244284685302, + "loss": 0.6493, + "step": 1273 + }, + { + "epoch": 0.33973333333333333, + "grad_norm": 0.3785874610424342, + "learning_rate": 0.0001537896447509195, + "loss": 0.6908, + "step": 1274 + }, + { + "epoch": 0.34, + "grad_norm": 0.3669786971937423, + "learning_rate": 0.00015371680652104643, + "loss": 0.6365, + "step": 1275 + }, + { + "epoch": 0.34026666666666666, + "grad_norm": 0.36032875005693216, + "learning_rate": 0.00015364392821158043, + "loss": 0.6482, + "step": 1276 + }, + { + "epoch": 0.34053333333333335, + "grad_norm": 0.3599419557811629, + "learning_rate": 0.000153571009876898, + "loss": 0.6258, + "step": 1277 + }, + { + "epoch": 0.3408, + "grad_norm": 0.3479833871068561, + "learning_rate": 0.00015349805157140553, + "loss": 0.6301, + "step": 1278 + }, + { + "epoch": 0.3410666666666667, + "grad_norm": 0.3545384568178658, + "learning_rate": 0.0001534250533495392, + "loss": 0.6587, + "step": 1279 + }, + { + "epoch": 0.3413333333333333, + "grad_norm": 0.3412159189978176, + "learning_rate": 0.00015335201526576507, + "loss": 0.5896, + "step": 1280 + }, + { + "epoch": 0.3416, + "grad_norm": 0.34608949708731057, + "learning_rate": 0.0001532789373745788, + "loss": 0.6747, + "step": 1281 + }, + { + "epoch": 0.34186666666666665, + "grad_norm": 0.3531856889857697, + "learning_rate": 0.00015320581973050587, + "loss": 0.7002, + "step": 1282 + }, + { + "epoch": 0.34213333333333334, + "grad_norm": 0.38405613464458355, + "learning_rate": 0.00015313266238810132, + "loss": 0.6717, + "step": 1283 + }, + { + "epoch": 0.3424, + "grad_norm": 0.38748352533666336, + "learning_rate": 0.0001530594654019499, + "loss": 0.6092, + "step": 1284 + }, + { + "epoch": 0.3426666666666667, + "grad_norm": 0.3713696754854172, + "learning_rate": 0.0001529862288266659, + "loss": 0.6451, + "step": 1285 + }, + { + "epoch": 0.3429333333333333, + "grad_norm": 0.35484028813406376, + "learning_rate": 0.00015291295271689317, + "loss": 0.6284, + "step": 1286 + }, + { + "epoch": 0.3432, + "grad_norm": 0.36492832853935714, + "learning_rate": 0.000152839637127305, + "loss": 0.6306, + "step": 1287 + }, + { + "epoch": 0.34346666666666664, + "grad_norm": 0.380289161842323, + "learning_rate": 0.00015276628211260423, + "loss": 0.6709, + "step": 1288 + }, + { + "epoch": 0.34373333333333334, + "grad_norm": 0.37607487520761307, + "learning_rate": 0.00015269288772752298, + "loss": 0.6409, + "step": 1289 + }, + { + "epoch": 0.344, + "grad_norm": 0.36833212471802784, + "learning_rate": 0.00015261945402682292, + "loss": 0.6925, + "step": 1290 + }, + { + "epoch": 0.34426666666666667, + "grad_norm": 0.3426238169547332, + "learning_rate": 0.0001525459810652949, + "loss": 0.6699, + "step": 1291 + }, + { + "epoch": 0.34453333333333336, + "grad_norm": 0.348530214367812, + "learning_rate": 0.00015247246889775915, + "loss": 0.6038, + "step": 1292 + }, + { + "epoch": 0.3448, + "grad_norm": 0.3630556132797608, + "learning_rate": 0.00015239891757906507, + "loss": 0.6411, + "step": 1293 + }, + { + "epoch": 0.3450666666666667, + "grad_norm": 0.3623795702351125, + "learning_rate": 0.00015232532716409148, + "loss": 0.6337, + "step": 1294 + }, + { + "epoch": 0.3453333333333333, + "grad_norm": 0.355797002464872, + "learning_rate": 0.00015225169770774605, + "loss": 0.6704, + "step": 1295 + }, + { + "epoch": 0.3456, + "grad_norm": 0.3732979701519832, + "learning_rate": 0.00015217802926496585, + "loss": 0.6693, + "step": 1296 + }, + { + "epoch": 0.34586666666666666, + "grad_norm": 0.35294389401419496, + "learning_rate": 0.0001521043218907169, + "loss": 0.6704, + "step": 1297 + }, + { + "epoch": 0.34613333333333335, + "grad_norm": 0.3978208670100884, + "learning_rate": 0.00015203057563999438, + "loss": 0.6745, + "step": 1298 + }, + { + "epoch": 0.3464, + "grad_norm": 0.34707304418306456, + "learning_rate": 0.00015195679056782227, + "loss": 0.6377, + "step": 1299 + }, + { + "epoch": 0.3466666666666667, + "grad_norm": 0.35806543959566406, + "learning_rate": 0.00015188296672925377, + "loss": 0.6831, + "step": 1300 + }, + { + "epoch": 0.3469333333333333, + "grad_norm": 0.35503239038303996, + "learning_rate": 0.00015180910417937084, + "loss": 0.6027, + "step": 1301 + }, + { + "epoch": 0.3472, + "grad_norm": 0.6052900851469462, + "learning_rate": 0.00015173520297328438, + "loss": 0.6019, + "step": 1302 + }, + { + "epoch": 0.34746666666666665, + "grad_norm": 0.3911571636520369, + "learning_rate": 0.0001516612631661341, + "loss": 0.7034, + "step": 1303 + }, + { + "epoch": 0.34773333333333334, + "grad_norm": 0.3592736386518935, + "learning_rate": 0.00015158728481308852, + "loss": 0.6251, + "step": 1304 + }, + { + "epoch": 0.348, + "grad_norm": 0.37374460172904495, + "learning_rate": 0.00015151326796934497, + "loss": 0.666, + "step": 1305 + }, + { + "epoch": 0.34826666666666667, + "grad_norm": 0.34285443302059637, + "learning_rate": 0.0001514392126901295, + "loss": 0.5971, + "step": 1306 + }, + { + "epoch": 0.3485333333333333, + "grad_norm": 0.34531508430173563, + "learning_rate": 0.0001513651190306967, + "loss": 0.6218, + "step": 1307 + }, + { + "epoch": 0.3488, + "grad_norm": 0.35986072901583555, + "learning_rate": 0.00015129098704632995, + "loss": 0.6438, + "step": 1308 + }, + { + "epoch": 0.3490666666666667, + "grad_norm": 0.3553678748811404, + "learning_rate": 0.00015121681679234112, + "loss": 0.6427, + "step": 1309 + }, + { + "epoch": 0.34933333333333333, + "grad_norm": 0.3568476936052233, + "learning_rate": 0.0001511426083240708, + "loss": 0.6456, + "step": 1310 + }, + { + "epoch": 0.3496, + "grad_norm": 0.33901440501681673, + "learning_rate": 0.00015106836169688788, + "loss": 0.6194, + "step": 1311 + }, + { + "epoch": 0.34986666666666666, + "grad_norm": 0.34040480268919593, + "learning_rate": 0.00015099407696618982, + "loss": 0.6477, + "step": 1312 + }, + { + "epoch": 0.35013333333333335, + "grad_norm": 0.3652503847847282, + "learning_rate": 0.00015091975418740256, + "loss": 0.6686, + "step": 1313 + }, + { + "epoch": 0.3504, + "grad_norm": 0.3633498057009687, + "learning_rate": 0.00015084539341598036, + "loss": 0.6258, + "step": 1314 + }, + { + "epoch": 0.3506666666666667, + "grad_norm": 0.3487997690837372, + "learning_rate": 0.00015077099470740582, + "loss": 0.6158, + "step": 1315 + }, + { + "epoch": 0.3509333333333333, + "grad_norm": 0.3756750731507114, + "learning_rate": 0.00015069655811718988, + "loss": 0.6358, + "step": 1316 + }, + { + "epoch": 0.3512, + "grad_norm": 0.3264627050819527, + "learning_rate": 0.00015062208370087178, + "loss": 0.5964, + "step": 1317 + }, + { + "epoch": 0.35146666666666665, + "grad_norm": 0.3787090171757912, + "learning_rate": 0.0001505475715140189, + "loss": 0.6419, + "step": 1318 + }, + { + "epoch": 0.35173333333333334, + "grad_norm": 0.35407632434998715, + "learning_rate": 0.00015047302161222683, + "loss": 0.6605, + "step": 1319 + }, + { + "epoch": 0.352, + "grad_norm": 0.36041955950042137, + "learning_rate": 0.0001503984340511193, + "loss": 0.6904, + "step": 1320 + }, + { + "epoch": 0.3522666666666667, + "grad_norm": 0.3806518021077695, + "learning_rate": 0.0001503238088863482, + "loss": 0.663, + "step": 1321 + }, + { + "epoch": 0.3525333333333333, + "grad_norm": 0.3513632008993957, + "learning_rate": 0.00015024914617359342, + "loss": 0.6354, + "step": 1322 + }, + { + "epoch": 0.3528, + "grad_norm": 0.3598337632424312, + "learning_rate": 0.00015017444596856282, + "loss": 0.6522, + "step": 1323 + }, + { + "epoch": 0.35306666666666664, + "grad_norm": 0.3626859658184959, + "learning_rate": 0.00015009970832699233, + "loss": 0.6079, + "step": 1324 + }, + { + "epoch": 0.35333333333333333, + "grad_norm": 0.3453728474529043, + "learning_rate": 0.0001500249333046458, + "loss": 0.6535, + "step": 1325 + }, + { + "epoch": 0.3536, + "grad_norm": 0.3489585036348258, + "learning_rate": 0.00014995012095731487, + "loss": 0.6851, + "step": 1326 + }, + { + "epoch": 0.35386666666666666, + "grad_norm": 0.33436370252875597, + "learning_rate": 0.0001498752713408191, + "loss": 0.6245, + "step": 1327 + }, + { + "epoch": 0.35413333333333336, + "grad_norm": 0.3627241113623393, + "learning_rate": 0.0001498003845110059, + "loss": 0.6887, + "step": 1328 + }, + { + "epoch": 0.3544, + "grad_norm": 0.33857257502557025, + "learning_rate": 0.0001497254605237504, + "loss": 0.6079, + "step": 1329 + }, + { + "epoch": 0.3546666666666667, + "grad_norm": 0.35886830219258514, + "learning_rate": 0.0001496504994349554, + "loss": 0.6047, + "step": 1330 + }, + { + "epoch": 0.3549333333333333, + "grad_norm": 0.3642384147874268, + "learning_rate": 0.0001495755013005515, + "loss": 0.6125, + "step": 1331 + }, + { + "epoch": 0.3552, + "grad_norm": 0.39103131647299133, + "learning_rate": 0.00014950046617649685, + "loss": 0.6871, + "step": 1332 + }, + { + "epoch": 0.35546666666666665, + "grad_norm": 0.3586087263192577, + "learning_rate": 0.0001494253941187773, + "loss": 0.5879, + "step": 1333 + }, + { + "epoch": 0.35573333333333335, + "grad_norm": 0.3796636077562965, + "learning_rate": 0.00014935028518340602, + "loss": 0.6572, + "step": 1334 + }, + { + "epoch": 0.356, + "grad_norm": 0.3603843501932065, + "learning_rate": 0.000149275139426424, + "loss": 0.6542, + "step": 1335 + }, + { + "epoch": 0.3562666666666667, + "grad_norm": 0.3552313013088614, + "learning_rate": 0.00014919995690389958, + "loss": 0.6769, + "step": 1336 + }, + { + "epoch": 0.3565333333333333, + "grad_norm": 0.3549500257876771, + "learning_rate": 0.00014912473767192841, + "loss": 0.6444, + "step": 1337 + }, + { + "epoch": 0.3568, + "grad_norm": 0.3513257194781214, + "learning_rate": 0.00014904948178663373, + "loss": 0.6595, + "step": 1338 + }, + { + "epoch": 0.35706666666666664, + "grad_norm": 0.3656816659680909, + "learning_rate": 0.00014897418930416597, + "loss": 0.6681, + "step": 1339 + }, + { + "epoch": 0.35733333333333334, + "grad_norm": 0.3437582439392381, + "learning_rate": 0.00014889886028070294, + "loss": 0.6395, + "step": 1340 + }, + { + "epoch": 0.3576, + "grad_norm": 0.3780791131367785, + "learning_rate": 0.00014882349477244976, + "loss": 0.6763, + "step": 1341 + }, + { + "epoch": 0.35786666666666667, + "grad_norm": 0.3442496514562768, + "learning_rate": 0.00014874809283563865, + "loss": 0.6277, + "step": 1342 + }, + { + "epoch": 0.35813333333333336, + "grad_norm": 0.4031109870907002, + "learning_rate": 0.00014867265452652912, + "loss": 0.6977, + "step": 1343 + }, + { + "epoch": 0.3584, + "grad_norm": 0.3638393122097646, + "learning_rate": 0.00014859717990140775, + "loss": 0.5819, + "step": 1344 + }, + { + "epoch": 0.3586666666666667, + "grad_norm": 0.3675883451529548, + "learning_rate": 0.0001485216690165883, + "loss": 0.6986, + "step": 1345 + }, + { + "epoch": 0.3589333333333333, + "grad_norm": 0.3683718766162737, + "learning_rate": 0.00014844612192841143, + "loss": 0.6866, + "step": 1346 + }, + { + "epoch": 0.3592, + "grad_norm": 0.3530276908656251, + "learning_rate": 0.00014837053869324498, + "loss": 0.6869, + "step": 1347 + }, + { + "epoch": 0.35946666666666666, + "grad_norm": 0.36759077783164884, + "learning_rate": 0.00014829491936748369, + "loss": 0.6723, + "step": 1348 + }, + { + "epoch": 0.35973333333333335, + "grad_norm": 0.3484944155661589, + "learning_rate": 0.00014821926400754916, + "loss": 0.6643, + "step": 1349 + }, + { + "epoch": 0.36, + "grad_norm": 0.36260263794094266, + "learning_rate": 0.00014814357266989002, + "loss": 0.6403, + "step": 1350 + }, + { + "epoch": 0.3602666666666667, + "grad_norm": 0.3575160920381425, + "learning_rate": 0.0001480678454109816, + "loss": 0.6605, + "step": 1351 + }, + { + "epoch": 0.3605333333333333, + "grad_norm": 0.34636289465466796, + "learning_rate": 0.0001479920822873262, + "loss": 0.6484, + "step": 1352 + }, + { + "epoch": 0.3608, + "grad_norm": 0.34952268449787954, + "learning_rate": 0.00014791628335545268, + "loss": 0.6018, + "step": 1353 + }, + { + "epoch": 0.36106666666666665, + "grad_norm": 0.4499924583766995, + "learning_rate": 0.00014784044867191675, + "loss": 0.6709, + "step": 1354 + }, + { + "epoch": 0.36133333333333334, + "grad_norm": 0.3520854946799651, + "learning_rate": 0.00014776457829330077, + "loss": 0.6699, + "step": 1355 + }, + { + "epoch": 0.3616, + "grad_norm": 0.38016155586990236, + "learning_rate": 0.00014768867227621374, + "loss": 0.6637, + "step": 1356 + }, + { + "epoch": 0.36186666666666667, + "grad_norm": 0.4071390587483549, + "learning_rate": 0.0001476127306772912, + "loss": 0.713, + "step": 1357 + }, + { + "epoch": 0.3621333333333333, + "grad_norm": 0.3555110502794118, + "learning_rate": 0.00014753675355319527, + "loss": 0.6458, + "step": 1358 + }, + { + "epoch": 0.3624, + "grad_norm": 0.3832452061683596, + "learning_rate": 0.00014746074096061462, + "loss": 0.6454, + "step": 1359 + }, + { + "epoch": 0.3626666666666667, + "grad_norm": 0.3886559081581442, + "learning_rate": 0.00014738469295626433, + "loss": 0.6808, + "step": 1360 + }, + { + "epoch": 0.36293333333333333, + "grad_norm": 0.34133195946817063, + "learning_rate": 0.0001473086095968859, + "loss": 0.6096, + "step": 1361 + }, + { + "epoch": 0.3632, + "grad_norm": 0.37643948172272235, + "learning_rate": 0.00014723249093924725, + "loss": 0.6995, + "step": 1362 + }, + { + "epoch": 0.36346666666666666, + "grad_norm": 0.3483812561565921, + "learning_rate": 0.0001471563370401426, + "loss": 0.6081, + "step": 1363 + }, + { + "epoch": 0.36373333333333335, + "grad_norm": 0.3740810360875961, + "learning_rate": 0.00014708014795639248, + "loss": 0.6489, + "step": 1364 + }, + { + "epoch": 0.364, + "grad_norm": 0.3520107409932921, + "learning_rate": 0.00014700392374484368, + "loss": 0.6199, + "step": 1365 + }, + { + "epoch": 0.3642666666666667, + "grad_norm": 0.33453889622578503, + "learning_rate": 0.00014692766446236914, + "loss": 0.6614, + "step": 1366 + }, + { + "epoch": 0.3645333333333333, + "grad_norm": 0.3646219251187341, + "learning_rate": 0.00014685137016586807, + "loss": 0.6086, + "step": 1367 + }, + { + "epoch": 0.3648, + "grad_norm": 0.3447941866222518, + "learning_rate": 0.00014677504091226574, + "loss": 0.6298, + "step": 1368 + }, + { + "epoch": 0.36506666666666665, + "grad_norm": 0.3441247576904632, + "learning_rate": 0.0001466986767585135, + "loss": 0.6138, + "step": 1369 + }, + { + "epoch": 0.36533333333333334, + "grad_norm": 0.34802284725086, + "learning_rate": 0.00014662227776158877, + "loss": 0.6119, + "step": 1370 + }, + { + "epoch": 0.3656, + "grad_norm": 0.3477747911985361, + "learning_rate": 0.00014654584397849496, + "loss": 0.6699, + "step": 1371 + }, + { + "epoch": 0.3658666666666667, + "grad_norm": 0.3495012839218708, + "learning_rate": 0.00014646937546626142, + "loss": 0.6043, + "step": 1372 + }, + { + "epoch": 0.3661333333333333, + "grad_norm": 0.3508876127684906, + "learning_rate": 0.0001463928722819434, + "loss": 0.6609, + "step": 1373 + }, + { + "epoch": 0.3664, + "grad_norm": 0.34556557992667797, + "learning_rate": 0.000146316334482622, + "loss": 0.5956, + "step": 1374 + }, + { + "epoch": 0.36666666666666664, + "grad_norm": 0.35734406152055626, + "learning_rate": 0.00014623976212540428, + "loss": 0.6377, + "step": 1375 + }, + { + "epoch": 0.36693333333333333, + "grad_norm": 0.37375530548399544, + "learning_rate": 0.00014616315526742296, + "loss": 0.6843, + "step": 1376 + }, + { + "epoch": 0.3672, + "grad_norm": 0.3747004793008935, + "learning_rate": 0.00014608651396583647, + "loss": 0.6834, + "step": 1377 + }, + { + "epoch": 0.36746666666666666, + "grad_norm": 0.3729794084524959, + "learning_rate": 0.0001460098382778291, + "loss": 0.6099, + "step": 1378 + }, + { + "epoch": 0.36773333333333336, + "grad_norm": 0.35700000020842687, + "learning_rate": 0.00014593312826061063, + "loss": 0.6236, + "step": 1379 + }, + { + "epoch": 0.368, + "grad_norm": 0.34834979298561425, + "learning_rate": 0.00014585638397141657, + "loss": 0.6051, + "step": 1380 + }, + { + "epoch": 0.3682666666666667, + "grad_norm": 0.371412487526114, + "learning_rate": 0.00014577960546750788, + "loss": 0.6251, + "step": 1381 + }, + { + "epoch": 0.3685333333333333, + "grad_norm": 0.36117426276351927, + "learning_rate": 0.00014570279280617119, + "loss": 0.6484, + "step": 1382 + }, + { + "epoch": 0.3688, + "grad_norm": 0.3734292620108866, + "learning_rate": 0.0001456259460447185, + "loss": 0.6372, + "step": 1383 + }, + { + "epoch": 0.36906666666666665, + "grad_norm": 0.3676791781546167, + "learning_rate": 0.00014554906524048738, + "loss": 0.6541, + "step": 1384 + }, + { + "epoch": 0.36933333333333335, + "grad_norm": 0.4150364677606285, + "learning_rate": 0.00014547215045084065, + "loss": 0.6629, + "step": 1385 + }, + { + "epoch": 0.3696, + "grad_norm": 0.35893859090415847, + "learning_rate": 0.00014539520173316653, + "loss": 0.6753, + "step": 1386 + }, + { + "epoch": 0.3698666666666667, + "grad_norm": 0.399740268406432, + "learning_rate": 0.0001453182191448787, + "loss": 0.6433, + "step": 1387 + }, + { + "epoch": 0.3701333333333333, + "grad_norm": 0.3652602182332252, + "learning_rate": 0.0001452412027434159, + "loss": 0.6017, + "step": 1388 + }, + { + "epoch": 0.3704, + "grad_norm": 0.360092113518417, + "learning_rate": 0.0001451641525862422, + "loss": 0.656, + "step": 1389 + }, + { + "epoch": 0.37066666666666664, + "grad_norm": 0.3750624238543075, + "learning_rate": 0.0001450870687308469, + "loss": 0.6633, + "step": 1390 + }, + { + "epoch": 0.37093333333333334, + "grad_norm": 0.3737292836860596, + "learning_rate": 0.00014500995123474435, + "loss": 0.6265, + "step": 1391 + }, + { + "epoch": 0.3712, + "grad_norm": 0.37373396390757424, + "learning_rate": 0.00014493280015547407, + "loss": 0.6276, + "step": 1392 + }, + { + "epoch": 0.37146666666666667, + "grad_norm": 0.34821266552666486, + "learning_rate": 0.00014485561555060058, + "loss": 0.6462, + "step": 1393 + }, + { + "epoch": 0.37173333333333336, + "grad_norm": 0.3805440275795302, + "learning_rate": 0.00014477839747771348, + "loss": 0.6576, + "step": 1394 + }, + { + "epoch": 0.372, + "grad_norm": 0.3683730267700658, + "learning_rate": 0.00014470114599442728, + "loss": 0.6267, + "step": 1395 + }, + { + "epoch": 0.3722666666666667, + "grad_norm": 0.366295868060152, + "learning_rate": 0.00014462386115838145, + "loss": 0.6558, + "step": 1396 + }, + { + "epoch": 0.3725333333333333, + "grad_norm": 0.3737742074137285, + "learning_rate": 0.00014454654302724034, + "loss": 0.6286, + "step": 1397 + }, + { + "epoch": 0.3728, + "grad_norm": 0.38652327012546767, + "learning_rate": 0.0001444691916586932, + "loss": 0.6399, + "step": 1398 + }, + { + "epoch": 0.37306666666666666, + "grad_norm": 0.37861229779727024, + "learning_rate": 0.00014439180711045394, + "loss": 0.6238, + "step": 1399 + }, + { + "epoch": 0.37333333333333335, + "grad_norm": 0.3498803221410021, + "learning_rate": 0.00014431438944026133, + "loss": 0.6649, + "step": 1400 + }, + { + "epoch": 0.3736, + "grad_norm": 0.37502056890007396, + "learning_rate": 0.00014423693870587888, + "loss": 0.6662, + "step": 1401 + }, + { + "epoch": 0.3738666666666667, + "grad_norm": 0.37778399886076364, + "learning_rate": 0.00014415945496509464, + "loss": 0.6636, + "step": 1402 + }, + { + "epoch": 0.3741333333333333, + "grad_norm": 0.35723546331134165, + "learning_rate": 0.00014408193827572142, + "loss": 0.6439, + "step": 1403 + }, + { + "epoch": 0.3744, + "grad_norm": 0.3613604726543159, + "learning_rate": 0.00014400438869559658, + "loss": 0.6535, + "step": 1404 + }, + { + "epoch": 0.37466666666666665, + "grad_norm": 0.34558749358937024, + "learning_rate": 0.000143926806282582, + "loss": 0.6533, + "step": 1405 + }, + { + "epoch": 0.37493333333333334, + "grad_norm": 0.359771703788518, + "learning_rate": 0.000143849191094564, + "loss": 0.6538, + "step": 1406 + }, + { + "epoch": 0.3752, + "grad_norm": 0.356332859969955, + "learning_rate": 0.0001437715431894535, + "loss": 0.644, + "step": 1407 + }, + { + "epoch": 0.37546666666666667, + "grad_norm": 0.3333631199903516, + "learning_rate": 0.00014369386262518566, + "loss": 0.6302, + "step": 1408 + }, + { + "epoch": 0.3757333333333333, + "grad_norm": 0.3467244158108116, + "learning_rate": 0.00014361614945972018, + "loss": 0.6311, + "step": 1409 + }, + { + "epoch": 0.376, + "grad_norm": 0.3554862473788559, + "learning_rate": 0.00014353840375104092, + "loss": 0.6689, + "step": 1410 + }, + { + "epoch": 0.3762666666666667, + "grad_norm": 0.36410287380349493, + "learning_rate": 0.0001434606255571562, + "loss": 0.6239, + "step": 1411 + }, + { + "epoch": 0.37653333333333333, + "grad_norm": 0.3539060822945138, + "learning_rate": 0.00014338281493609834, + "loss": 0.66, + "step": 1412 + }, + { + "epoch": 0.3768, + "grad_norm": 0.37360535660889377, + "learning_rate": 0.00014330497194592408, + "loss": 0.6479, + "step": 1413 + }, + { + "epoch": 0.37706666666666666, + "grad_norm": 0.3692522817580998, + "learning_rate": 0.00014322709664471423, + "loss": 0.6611, + "step": 1414 + }, + { + "epoch": 0.37733333333333335, + "grad_norm": 0.3726703888369673, + "learning_rate": 0.0001431491890905737, + "loss": 0.6745, + "step": 1415 + }, + { + "epoch": 0.3776, + "grad_norm": 0.33577272835165656, + "learning_rate": 0.00014307124934163148, + "loss": 0.676, + "step": 1416 + }, + { + "epoch": 0.3778666666666667, + "grad_norm": 0.368907912077969, + "learning_rate": 0.0001429932774560405, + "loss": 0.6885, + "step": 1417 + }, + { + "epoch": 0.3781333333333333, + "grad_norm": 0.3506804423496715, + "learning_rate": 0.00014291527349197779, + "loss": 0.6204, + "step": 1418 + }, + { + "epoch": 0.3784, + "grad_norm": 0.34909536845745265, + "learning_rate": 0.0001428372375076443, + "loss": 0.6724, + "step": 1419 + }, + { + "epoch": 0.37866666666666665, + "grad_norm": 0.35650561973341643, + "learning_rate": 0.00014275916956126475, + "loss": 0.6064, + "step": 1420 + }, + { + "epoch": 0.37893333333333334, + "grad_norm": 0.3457895485290377, + "learning_rate": 0.0001426810697110878, + "loss": 0.6617, + "step": 1421 + }, + { + "epoch": 0.3792, + "grad_norm": 0.345104340905149, + "learning_rate": 0.000142602938015386, + "loss": 0.6187, + "step": 1422 + }, + { + "epoch": 0.3794666666666667, + "grad_norm": 0.37404714389064475, + "learning_rate": 0.00014252477453245544, + "loss": 0.6972, + "step": 1423 + }, + { + "epoch": 0.3797333333333333, + "grad_norm": 0.3767845170610133, + "learning_rate": 0.00014244657932061615, + "loss": 0.6624, + "step": 1424 + }, + { + "epoch": 0.38, + "grad_norm": 0.3747093185175365, + "learning_rate": 0.00014236835243821167, + "loss": 0.6034, + "step": 1425 + }, + { + "epoch": 0.38026666666666664, + "grad_norm": 0.34457907595866566, + "learning_rate": 0.0001422900939436093, + "loss": 0.6059, + "step": 1426 + }, + { + "epoch": 0.38053333333333333, + "grad_norm": 0.34566087020841413, + "learning_rate": 0.00014221180389519984, + "loss": 0.5967, + "step": 1427 + }, + { + "epoch": 0.3808, + "grad_norm": 0.3651092498941689, + "learning_rate": 0.0001421334823513976, + "loss": 0.6375, + "step": 1428 + }, + { + "epoch": 0.38106666666666666, + "grad_norm": 0.37221216779026145, + "learning_rate": 0.00014205512937064054, + "loss": 0.6975, + "step": 1429 + }, + { + "epoch": 0.38133333333333336, + "grad_norm": 0.3379472287858967, + "learning_rate": 0.00014197674501138994, + "loss": 0.6518, + "step": 1430 + }, + { + "epoch": 0.3816, + "grad_norm": 0.3638046101805259, + "learning_rate": 0.0001418983293321305, + "loss": 0.6862, + "step": 1431 + }, + { + "epoch": 0.3818666666666667, + "grad_norm": 0.34576413288436314, + "learning_rate": 0.00014181988239137037, + "loss": 0.6079, + "step": 1432 + }, + { + "epoch": 0.3821333333333333, + "grad_norm": 0.35071377304977486, + "learning_rate": 0.000141741404247641, + "loss": 0.6351, + "step": 1433 + }, + { + "epoch": 0.3824, + "grad_norm": 0.35840835151802136, + "learning_rate": 0.00014166289495949705, + "loss": 0.6665, + "step": 1434 + }, + { + "epoch": 0.38266666666666665, + "grad_norm": 0.35088984607529805, + "learning_rate": 0.00014158435458551649, + "loss": 0.6775, + "step": 1435 + }, + { + "epoch": 0.38293333333333335, + "grad_norm": 0.33604592310546416, + "learning_rate": 0.00014150578318430042, + "loss": 0.6133, + "step": 1436 + }, + { + "epoch": 0.3832, + "grad_norm": 0.34857723776658783, + "learning_rate": 0.00014142718081447324, + "loss": 0.6314, + "step": 1437 + }, + { + "epoch": 0.3834666666666667, + "grad_norm": 0.3663141090467801, + "learning_rate": 0.00014134854753468224, + "loss": 0.6546, + "step": 1438 + }, + { + "epoch": 0.3837333333333333, + "grad_norm": 0.37174836362339136, + "learning_rate": 0.00014126988340359796, + "loss": 0.6718, + "step": 1439 + }, + { + "epoch": 0.384, + "grad_norm": 0.3385548087790671, + "learning_rate": 0.0001411911884799138, + "loss": 0.6074, + "step": 1440 + }, + { + "epoch": 0.38426666666666665, + "grad_norm": 0.33816388270202485, + "learning_rate": 0.00014111246282234624, + "loss": 0.6282, + "step": 1441 + }, + { + "epoch": 0.38453333333333334, + "grad_norm": 0.374631723416128, + "learning_rate": 0.00014103370648963474, + "loss": 0.695, + "step": 1442 + }, + { + "epoch": 0.3848, + "grad_norm": 0.3778845126887572, + "learning_rate": 0.0001409549195405415, + "loss": 0.6493, + "step": 1443 + }, + { + "epoch": 0.38506666666666667, + "grad_norm": 0.35557762969413714, + "learning_rate": 0.00014087610203385168, + "loss": 0.6004, + "step": 1444 + }, + { + "epoch": 0.38533333333333336, + "grad_norm": 0.3283441174659094, + "learning_rate": 0.00014079725402837314, + "loss": 0.6511, + "step": 1445 + }, + { + "epoch": 0.3856, + "grad_norm": 0.35073668316204804, + "learning_rate": 0.00014071837558293662, + "loss": 0.648, + "step": 1446 + }, + { + "epoch": 0.3858666666666667, + "grad_norm": 0.3523652123712978, + "learning_rate": 0.0001406394667563955, + "loss": 0.6253, + "step": 1447 + }, + { + "epoch": 0.38613333333333333, + "grad_norm": 0.3773427908918206, + "learning_rate": 0.00014056052760762577, + "loss": 0.6451, + "step": 1448 + }, + { + "epoch": 0.3864, + "grad_norm": 0.3810824327305423, + "learning_rate": 0.00014048155819552618, + "loss": 0.6456, + "step": 1449 + }, + { + "epoch": 0.38666666666666666, + "grad_norm": 0.35006470475505896, + "learning_rate": 0.00014040255857901798, + "loss": 0.6134, + "step": 1450 + }, + { + "epoch": 0.38693333333333335, + "grad_norm": 0.3734970644646683, + "learning_rate": 0.0001403235288170449, + "loss": 0.6322, + "step": 1451 + }, + { + "epoch": 0.3872, + "grad_norm": 0.3560229088880377, + "learning_rate": 0.0001402444689685733, + "loss": 0.6162, + "step": 1452 + }, + { + "epoch": 0.3874666666666667, + "grad_norm": 0.34700815733093937, + "learning_rate": 0.0001401653790925919, + "loss": 0.6342, + "step": 1453 + }, + { + "epoch": 0.3877333333333333, + "grad_norm": 0.35302883638128907, + "learning_rate": 0.00014008625924811184, + "loss": 0.697, + "step": 1454 + }, + { + "epoch": 0.388, + "grad_norm": 0.33782945603236564, + "learning_rate": 0.00014000710949416663, + "loss": 0.588, + "step": 1455 + }, + { + "epoch": 0.38826666666666665, + "grad_norm": 0.3580034949409188, + "learning_rate": 0.00013992792988981205, + "loss": 0.6369, + "step": 1456 + }, + { + "epoch": 0.38853333333333334, + "grad_norm": 0.3532360774481146, + "learning_rate": 0.00013984872049412623, + "loss": 0.6432, + "step": 1457 + }, + { + "epoch": 0.3888, + "grad_norm": 0.3862133257877399, + "learning_rate": 0.00013976948136620946, + "loss": 0.6698, + "step": 1458 + }, + { + "epoch": 0.38906666666666667, + "grad_norm": 0.3754654686788404, + "learning_rate": 0.00013969021256518424, + "loss": 0.7035, + "step": 1459 + }, + { + "epoch": 0.3893333333333333, + "grad_norm": 0.3443690469182567, + "learning_rate": 0.00013961091415019524, + "loss": 0.6213, + "step": 1460 + }, + { + "epoch": 0.3896, + "grad_norm": 0.33456931578760196, + "learning_rate": 0.00013953158618040917, + "loss": 0.619, + "step": 1461 + }, + { + "epoch": 0.38986666666666664, + "grad_norm": 0.3773389803746547, + "learning_rate": 0.00013945222871501487, + "loss": 0.6321, + "step": 1462 + }, + { + "epoch": 0.39013333333333333, + "grad_norm": 0.3684761345026691, + "learning_rate": 0.00013937284181322307, + "loss": 0.6618, + "step": 1463 + }, + { + "epoch": 0.3904, + "grad_norm": 0.3584326632493809, + "learning_rate": 0.00013929342553426657, + "loss": 0.6411, + "step": 1464 + }, + { + "epoch": 0.39066666666666666, + "grad_norm": 0.32540465460956763, + "learning_rate": 0.0001392139799374, + "loss": 0.6293, + "step": 1465 + }, + { + "epoch": 0.39093333333333335, + "grad_norm": 0.33652689128674673, + "learning_rate": 0.0001391345050819, + "loss": 0.6018, + "step": 1466 + }, + { + "epoch": 0.3912, + "grad_norm": 0.3694480872477037, + "learning_rate": 0.0001390550010270649, + "loss": 0.5906, + "step": 1467 + }, + { + "epoch": 0.3914666666666667, + "grad_norm": 0.3594553565703397, + "learning_rate": 0.00013897546783221484, + "loss": 0.6488, + "step": 1468 + }, + { + "epoch": 0.3917333333333333, + "grad_norm": 0.32334944748128114, + "learning_rate": 0.0001388959055566918, + "loss": 0.6093, + "step": 1469 + }, + { + "epoch": 0.392, + "grad_norm": 0.35605816828535247, + "learning_rate": 0.00013881631425985934, + "loss": 0.6403, + "step": 1470 + }, + { + "epoch": 0.39226666666666665, + "grad_norm": 0.3419202024337364, + "learning_rate": 0.00013873669400110277, + "loss": 0.622, + "step": 1471 + }, + { + "epoch": 0.39253333333333335, + "grad_norm": 0.36731323388180237, + "learning_rate": 0.00013865704483982894, + "loss": 0.6626, + "step": 1472 + }, + { + "epoch": 0.3928, + "grad_norm": 0.3636856419821721, + "learning_rate": 0.0001385773668354663, + "loss": 0.6372, + "step": 1473 + }, + { + "epoch": 0.3930666666666667, + "grad_norm": 0.338388744430327, + "learning_rate": 0.00013849766004746475, + "loss": 0.6249, + "step": 1474 + }, + { + "epoch": 0.3933333333333333, + "grad_norm": 0.3678788088288983, + "learning_rate": 0.00013841792453529581, + "loss": 0.6047, + "step": 1475 + }, + { + "epoch": 0.3936, + "grad_norm": 0.3663416926907222, + "learning_rate": 0.00013833816035845232, + "loss": 0.6465, + "step": 1476 + }, + { + "epoch": 0.39386666666666664, + "grad_norm": 0.3578041769595127, + "learning_rate": 0.00013825836757644852, + "loss": 0.6409, + "step": 1477 + }, + { + "epoch": 0.39413333333333334, + "grad_norm": 0.3563270113134819, + "learning_rate": 0.00013817854624882, + "loss": 0.6554, + "step": 1478 + }, + { + "epoch": 0.3944, + "grad_norm": 0.34830123898977755, + "learning_rate": 0.00013809869643512367, + "loss": 0.6636, + "step": 1479 + }, + { + "epoch": 0.39466666666666667, + "grad_norm": 0.34219361898329487, + "learning_rate": 0.00013801881819493772, + "loss": 0.6484, + "step": 1480 + }, + { + "epoch": 0.39493333333333336, + "grad_norm": 0.35243131988303616, + "learning_rate": 0.00013793891158786148, + "loss": 0.5953, + "step": 1481 + }, + { + "epoch": 0.3952, + "grad_norm": 0.3519520099892265, + "learning_rate": 0.00013785897667351543, + "loss": 0.6255, + "step": 1482 + }, + { + "epoch": 0.3954666666666667, + "grad_norm": 0.3436903877252368, + "learning_rate": 0.0001377790135115413, + "loss": 0.6293, + "step": 1483 + }, + { + "epoch": 0.3957333333333333, + "grad_norm": 0.3377837437362064, + "learning_rate": 0.00013769902216160176, + "loss": 0.6053, + "step": 1484 + }, + { + "epoch": 0.396, + "grad_norm": 0.3711738208922456, + "learning_rate": 0.0001376190026833806, + "loss": 0.6375, + "step": 1485 + }, + { + "epoch": 0.39626666666666666, + "grad_norm": 0.340851902526855, + "learning_rate": 0.0001375389551365825, + "loss": 0.6469, + "step": 1486 + }, + { + "epoch": 0.39653333333333335, + "grad_norm": 0.33360286197581673, + "learning_rate": 0.0001374588795809332, + "loss": 0.6191, + "step": 1487 + }, + { + "epoch": 0.3968, + "grad_norm": 0.36348833255983914, + "learning_rate": 0.00013737877607617927, + "loss": 0.6632, + "step": 1488 + }, + { + "epoch": 0.3970666666666667, + "grad_norm": 0.34114683366302295, + "learning_rate": 0.00013729864468208818, + "loss": 0.6323, + "step": 1489 + }, + { + "epoch": 0.3973333333333333, + "grad_norm": 0.36866921044462253, + "learning_rate": 0.0001372184854584481, + "loss": 0.6287, + "step": 1490 + }, + { + "epoch": 0.3976, + "grad_norm": 0.34049810542946163, + "learning_rate": 0.00013713829846506812, + "loss": 0.6602, + "step": 1491 + }, + { + "epoch": 0.39786666666666665, + "grad_norm": 0.3407484177480634, + "learning_rate": 0.0001370580837617779, + "loss": 0.6597, + "step": 1492 + }, + { + "epoch": 0.39813333333333334, + "grad_norm": 0.37710184555590415, + "learning_rate": 0.00013697784140842794, + "loss": 0.6393, + "step": 1493 + }, + { + "epoch": 0.3984, + "grad_norm": 0.362940222353952, + "learning_rate": 0.00013689757146488916, + "loss": 0.6197, + "step": 1494 + }, + { + "epoch": 0.39866666666666667, + "grad_norm": 0.34600692014793993, + "learning_rate": 0.00013681727399105328, + "loss": 0.6153, + "step": 1495 + }, + { + "epoch": 0.3989333333333333, + "grad_norm": 0.367833759542811, + "learning_rate": 0.0001367369490468324, + "loss": 0.6673, + "step": 1496 + }, + { + "epoch": 0.3992, + "grad_norm": 0.4110824434740751, + "learning_rate": 0.0001366565966921592, + "loss": 0.6873, + "step": 1497 + }, + { + "epoch": 0.3994666666666667, + "grad_norm": 0.351806620600648, + "learning_rate": 0.0001365762169869868, + "loss": 0.6578, + "step": 1498 + }, + { + "epoch": 0.39973333333333333, + "grad_norm": 0.3369270496471882, + "learning_rate": 0.0001364958099912887, + "loss": 0.5992, + "step": 1499 + }, + { + "epoch": 0.4, + "grad_norm": 0.34896761237819673, + "learning_rate": 0.0001364153757650588, + "loss": 0.6477, + "step": 1500 + }, + { + "epoch": 0.40026666666666666, + "grad_norm": 0.34845533390700606, + "learning_rate": 0.00013633491436831132, + "loss": 0.6206, + "step": 1501 + }, + { + "epoch": 0.40053333333333335, + "grad_norm": 0.3363710981780535, + "learning_rate": 0.00013625442586108065, + "loss": 0.6371, + "step": 1502 + }, + { + "epoch": 0.4008, + "grad_norm": 0.35188392032956, + "learning_rate": 0.00013617391030342158, + "loss": 0.6507, + "step": 1503 + }, + { + "epoch": 0.4010666666666667, + "grad_norm": 0.33431656452883624, + "learning_rate": 0.00013609336775540892, + "loss": 0.6344, + "step": 1504 + }, + { + "epoch": 0.4013333333333333, + "grad_norm": 0.3653488328515536, + "learning_rate": 0.00013601279827713772, + "loss": 0.6127, + "step": 1505 + }, + { + "epoch": 0.4016, + "grad_norm": 0.35589419078074647, + "learning_rate": 0.00013593220192872308, + "loss": 0.6474, + "step": 1506 + }, + { + "epoch": 0.40186666666666665, + "grad_norm": 0.35186548872761786, + "learning_rate": 0.0001358515787703002, + "loss": 0.66, + "step": 1507 + }, + { + "epoch": 0.40213333333333334, + "grad_norm": 0.3714718982362881, + "learning_rate": 0.00013577092886202417, + "loss": 0.6468, + "step": 1508 + }, + { + "epoch": 0.4024, + "grad_norm": 0.3453223638264368, + "learning_rate": 0.00013569025226407023, + "loss": 0.6594, + "step": 1509 + }, + { + "epoch": 0.4026666666666667, + "grad_norm": 0.35309142222904594, + "learning_rate": 0.00013560954903663332, + "loss": 0.6915, + "step": 1510 + }, + { + "epoch": 0.4029333333333333, + "grad_norm": 0.37889247533151243, + "learning_rate": 0.00013552881923992839, + "loss": 0.6559, + "step": 1511 + }, + { + "epoch": 0.4032, + "grad_norm": 0.34838037050351023, + "learning_rate": 0.00013544806293419015, + "loss": 0.6356, + "step": 1512 + }, + { + "epoch": 0.40346666666666664, + "grad_norm": 0.3880616920772269, + "learning_rate": 0.00013536728017967312, + "loss": 0.6794, + "step": 1513 + }, + { + "epoch": 0.40373333333333333, + "grad_norm": 0.38773194173789227, + "learning_rate": 0.00013528647103665148, + "loss": 0.656, + "step": 1514 + }, + { + "epoch": 0.404, + "grad_norm": 0.33382700773093865, + "learning_rate": 0.0001352056355654193, + "loss": 0.6348, + "step": 1515 + }, + { + "epoch": 0.40426666666666666, + "grad_norm": 0.34673375611082313, + "learning_rate": 0.00013512477382629008, + "loss": 0.6744, + "step": 1516 + }, + { + "epoch": 0.40453333333333336, + "grad_norm": 0.37090956322641655, + "learning_rate": 0.00013504388587959695, + "loss": 0.6409, + "step": 1517 + }, + { + "epoch": 0.4048, + "grad_norm": 0.35147367112397365, + "learning_rate": 0.00013496297178569274, + "loss": 0.6627, + "step": 1518 + }, + { + "epoch": 0.4050666666666667, + "grad_norm": 0.36441317582458954, + "learning_rate": 0.00013488203160494963, + "loss": 0.5897, + "step": 1519 + }, + { + "epoch": 0.4053333333333333, + "grad_norm": 0.3424548610752302, + "learning_rate": 0.00013480106539775935, + "loss": 0.6124, + "step": 1520 + }, + { + "epoch": 0.4056, + "grad_norm": 0.3646191358554874, + "learning_rate": 0.00013472007322453297, + "loss": 0.6433, + "step": 1521 + }, + { + "epoch": 0.40586666666666665, + "grad_norm": 0.3574308271935125, + "learning_rate": 0.00013463905514570106, + "loss": 0.635, + "step": 1522 + }, + { + "epoch": 0.40613333333333335, + "grad_norm": 0.38219758508015894, + "learning_rate": 0.0001345580112217134, + "loss": 0.6675, + "step": 1523 + }, + { + "epoch": 0.4064, + "grad_norm": 0.3467066500315951, + "learning_rate": 0.0001344769415130391, + "loss": 0.6376, + "step": 1524 + }, + { + "epoch": 0.4066666666666667, + "grad_norm": 0.3461541173882036, + "learning_rate": 0.00013439584608016653, + "loss": 0.6377, + "step": 1525 + }, + { + "epoch": 0.4069333333333333, + "grad_norm": 0.3499686466449366, + "learning_rate": 0.00013431472498360325, + "loss": 0.6604, + "step": 1526 + }, + { + "epoch": 0.4072, + "grad_norm": 0.34119690818259274, + "learning_rate": 0.00013423357828387588, + "loss": 0.6021, + "step": 1527 + }, + { + "epoch": 0.40746666666666664, + "grad_norm": 0.3929815220550607, + "learning_rate": 0.0001341524060415303, + "loss": 0.6554, + "step": 1528 + }, + { + "epoch": 0.40773333333333334, + "grad_norm": 0.34457738716127345, + "learning_rate": 0.0001340712083171313, + "loss": 0.616, + "step": 1529 + }, + { + "epoch": 0.408, + "grad_norm": 0.37599960302688823, + "learning_rate": 0.00013398998517126276, + "loss": 0.6922, + "step": 1530 + }, + { + "epoch": 0.40826666666666667, + "grad_norm": 0.39660022941118506, + "learning_rate": 0.00013390873666452752, + "loss": 0.6033, + "step": 1531 + }, + { + "epoch": 0.40853333333333336, + "grad_norm": 0.3605708887523207, + "learning_rate": 0.00013382746285754734, + "loss": 0.6783, + "step": 1532 + }, + { + "epoch": 0.4088, + "grad_norm": 0.3667495928710032, + "learning_rate": 0.00013374616381096286, + "loss": 0.7042, + "step": 1533 + }, + { + "epoch": 0.4090666666666667, + "grad_norm": 0.3551671483277212, + "learning_rate": 0.0001336648395854335, + "loss": 0.655, + "step": 1534 + }, + { + "epoch": 0.4093333333333333, + "grad_norm": 0.35594835772839595, + "learning_rate": 0.00013358349024163754, + "loss": 0.6357, + "step": 1535 + }, + { + "epoch": 0.4096, + "grad_norm": 0.36026625455622185, + "learning_rate": 0.000133502115840272, + "loss": 0.6777, + "step": 1536 + }, + { + "epoch": 0.40986666666666666, + "grad_norm": 0.3586308590497421, + "learning_rate": 0.00013342071644205253, + "loss": 0.6751, + "step": 1537 + }, + { + "epoch": 0.41013333333333335, + "grad_norm": 0.3560120889098755, + "learning_rate": 0.00013333929210771346, + "loss": 0.6445, + "step": 1538 + }, + { + "epoch": 0.4104, + "grad_norm": 0.35177785685708396, + "learning_rate": 0.00013325784289800775, + "loss": 0.6036, + "step": 1539 + }, + { + "epoch": 0.4106666666666667, + "grad_norm": 0.36247147861875423, + "learning_rate": 0.00013317636887370696, + "loss": 0.655, + "step": 1540 + }, + { + "epoch": 0.4109333333333333, + "grad_norm": 0.380556751452829, + "learning_rate": 0.000133094870095601, + "loss": 0.6579, + "step": 1541 + }, + { + "epoch": 0.4112, + "grad_norm": 0.4100655898876012, + "learning_rate": 0.0001330133466244984, + "loss": 0.6818, + "step": 1542 + }, + { + "epoch": 0.41146666666666665, + "grad_norm": 0.36703019103232193, + "learning_rate": 0.00013293179852122612, + "loss": 0.6771, + "step": 1543 + }, + { + "epoch": 0.41173333333333334, + "grad_norm": 0.357059947248057, + "learning_rate": 0.00013285022584662946, + "loss": 0.6354, + "step": 1544 + }, + { + "epoch": 0.412, + "grad_norm": 0.3497474206223572, + "learning_rate": 0.00013276862866157198, + "loss": 0.6323, + "step": 1545 + }, + { + "epoch": 0.41226666666666667, + "grad_norm": 0.3529645072569777, + "learning_rate": 0.0001326870070269356, + "loss": 0.659, + "step": 1546 + }, + { + "epoch": 0.4125333333333333, + "grad_norm": 0.3436442258301328, + "learning_rate": 0.00013260536100362055, + "loss": 0.6602, + "step": 1547 + }, + { + "epoch": 0.4128, + "grad_norm": 0.3590771813513007, + "learning_rate": 0.00013252369065254511, + "loss": 0.6302, + "step": 1548 + }, + { + "epoch": 0.4130666666666667, + "grad_norm": 0.3559947854838785, + "learning_rate": 0.0001324419960346458, + "loss": 0.6378, + "step": 1549 + }, + { + "epoch": 0.41333333333333333, + "grad_norm": 0.4094994463533802, + "learning_rate": 0.00013236027721087723, + "loss": 0.603, + "step": 1550 + }, + { + "epoch": 0.4136, + "grad_norm": 0.35900635506623746, + "learning_rate": 0.00013227853424221207, + "loss": 0.6815, + "step": 1551 + }, + { + "epoch": 0.41386666666666666, + "grad_norm": 0.3494160339869504, + "learning_rate": 0.00013219676718964103, + "loss": 0.6624, + "step": 1552 + }, + { + "epoch": 0.41413333333333335, + "grad_norm": 0.340149423466505, + "learning_rate": 0.00013211497611417272, + "loss": 0.635, + "step": 1553 + }, + { + "epoch": 0.4144, + "grad_norm": 0.35777327817154914, + "learning_rate": 0.00013203316107683377, + "loss": 0.6357, + "step": 1554 + }, + { + "epoch": 0.4146666666666667, + "grad_norm": 0.35128954100735255, + "learning_rate": 0.00013195132213866866, + "loss": 0.6753, + "step": 1555 + }, + { + "epoch": 0.4149333333333333, + "grad_norm": 0.32580586190412875, + "learning_rate": 0.0001318694593607396, + "loss": 0.6438, + "step": 1556 + }, + { + "epoch": 0.4152, + "grad_norm": 0.345562090642469, + "learning_rate": 0.0001317875728041267, + "loss": 0.6259, + "step": 1557 + }, + { + "epoch": 0.41546666666666665, + "grad_norm": 0.34544027345472283, + "learning_rate": 0.0001317056625299278, + "loss": 0.6492, + "step": 1558 + }, + { + "epoch": 0.41573333333333334, + "grad_norm": 0.367357007877592, + "learning_rate": 0.00013162372859925844, + "loss": 0.6305, + "step": 1559 + }, + { + "epoch": 0.416, + "grad_norm": 0.3432714506183897, + "learning_rate": 0.00013154177107325174, + "loss": 0.6039, + "step": 1560 + }, + { + "epoch": 0.4162666666666667, + "grad_norm": 0.3495804273624954, + "learning_rate": 0.00013145979001305847, + "loss": 0.6296, + "step": 1561 + }, + { + "epoch": 0.4165333333333333, + "grad_norm": 0.3574362943924044, + "learning_rate": 0.00013137778547984703, + "loss": 0.6415, + "step": 1562 + }, + { + "epoch": 0.4168, + "grad_norm": 0.3637672470687809, + "learning_rate": 0.00013129575753480322, + "loss": 0.6242, + "step": 1563 + }, + { + "epoch": 0.41706666666666664, + "grad_norm": 0.3696659262777461, + "learning_rate": 0.0001312137062391303, + "loss": 0.692, + "step": 1564 + }, + { + "epoch": 0.41733333333333333, + "grad_norm": 0.3554003912618743, + "learning_rate": 0.00013113163165404915, + "loss": 0.6368, + "step": 1565 + }, + { + "epoch": 0.4176, + "grad_norm": 0.38875111923163774, + "learning_rate": 0.0001310495338407977, + "loss": 0.704, + "step": 1566 + }, + { + "epoch": 0.41786666666666666, + "grad_norm": 0.3445955597250309, + "learning_rate": 0.00013096741286063162, + "loss": 0.6099, + "step": 1567 + }, + { + "epoch": 0.41813333333333336, + "grad_norm": 0.3637008520152639, + "learning_rate": 0.00013088526877482343, + "loss": 0.6593, + "step": 1568 + }, + { + "epoch": 0.4184, + "grad_norm": 0.35403197489850313, + "learning_rate": 0.0001308031016446632, + "loss": 0.6517, + "step": 1569 + }, + { + "epoch": 0.4186666666666667, + "grad_norm": 0.36730966870295284, + "learning_rate": 0.00013072091153145808, + "loss": 0.6431, + "step": 1570 + }, + { + "epoch": 0.4189333333333333, + "grad_norm": 0.35365863428943584, + "learning_rate": 0.00013063869849653243, + "loss": 0.6652, + "step": 1571 + }, + { + "epoch": 0.4192, + "grad_norm": 0.3570860852158583, + "learning_rate": 0.00013055646260122763, + "loss": 0.6182, + "step": 1572 + }, + { + "epoch": 0.41946666666666665, + "grad_norm": 0.38879671533863996, + "learning_rate": 0.0001304742039069021, + "loss": 0.6896, + "step": 1573 + }, + { + "epoch": 0.41973333333333335, + "grad_norm": 0.43243773122660395, + "learning_rate": 0.0001303919224749314, + "loss": 0.68, + "step": 1574 + }, + { + "epoch": 0.42, + "grad_norm": 0.35840297015870565, + "learning_rate": 0.00013030961836670794, + "loss": 0.6245, + "step": 1575 + }, + { + "epoch": 0.4202666666666667, + "grad_norm": 0.36316311253279926, + "learning_rate": 0.00013022729164364108, + "loss": 0.6142, + "step": 1576 + }, + { + "epoch": 0.4205333333333333, + "grad_norm": 0.351754921269816, + "learning_rate": 0.0001301449423671571, + "loss": 0.6003, + "step": 1577 + }, + { + "epoch": 0.4208, + "grad_norm": 0.37410580813936667, + "learning_rate": 0.00013006257059869906, + "loss": 0.6186, + "step": 1578 + }, + { + "epoch": 0.42106666666666664, + "grad_norm": 0.35361578526631765, + "learning_rate": 0.00012998017639972677, + "loss": 0.615, + "step": 1579 + }, + { + "epoch": 0.42133333333333334, + "grad_norm": 0.36374019695876725, + "learning_rate": 0.00012989775983171688, + "loss": 0.5908, + "step": 1580 + }, + { + "epoch": 0.4216, + "grad_norm": 0.3743848873800401, + "learning_rate": 0.0001298153209561626, + "loss": 0.624, + "step": 1581 + }, + { + "epoch": 0.42186666666666667, + "grad_norm": 0.3618039358338652, + "learning_rate": 0.00012973285983457393, + "loss": 0.6252, + "step": 1582 + }, + { + "epoch": 0.42213333333333336, + "grad_norm": 0.36607735560456356, + "learning_rate": 0.00012965037652847732, + "loss": 0.6422, + "step": 1583 + }, + { + "epoch": 0.4224, + "grad_norm": 0.3412080100607846, + "learning_rate": 0.0001295678710994159, + "loss": 0.6139, + "step": 1584 + }, + { + "epoch": 0.4226666666666667, + "grad_norm": 0.34046334058310873, + "learning_rate": 0.0001294853436089492, + "loss": 0.6726, + "step": 1585 + }, + { + "epoch": 0.42293333333333333, + "grad_norm": 0.3773645149144639, + "learning_rate": 0.00012940279411865327, + "loss": 0.6675, + "step": 1586 + }, + { + "epoch": 0.4232, + "grad_norm": 0.33817104462326586, + "learning_rate": 0.0001293202226901206, + "loss": 0.593, + "step": 1587 + }, + { + "epoch": 0.42346666666666666, + "grad_norm": 0.37190404574939634, + "learning_rate": 0.00012923762938495996, + "loss": 0.6512, + "step": 1588 + }, + { + "epoch": 0.42373333333333335, + "grad_norm": 0.3486381712691507, + "learning_rate": 0.00012915501426479656, + "loss": 0.6603, + "step": 1589 + }, + { + "epoch": 0.424, + "grad_norm": 0.34465044432332115, + "learning_rate": 0.00012907237739127173, + "loss": 0.5976, + "step": 1590 + }, + { + "epoch": 0.4242666666666667, + "grad_norm": 0.35627556293759993, + "learning_rate": 0.00012898971882604324, + "loss": 0.6258, + "step": 1591 + }, + { + "epoch": 0.4245333333333333, + "grad_norm": 0.41447946758644905, + "learning_rate": 0.00012890703863078487, + "loss": 0.6143, + "step": 1592 + }, + { + "epoch": 0.4248, + "grad_norm": 0.3658966979109238, + "learning_rate": 0.00012882433686718656, + "loss": 0.6204, + "step": 1593 + }, + { + "epoch": 0.42506666666666665, + "grad_norm": 0.3663147593448287, + "learning_rate": 0.00012874161359695445, + "loss": 0.6471, + "step": 1594 + }, + { + "epoch": 0.42533333333333334, + "grad_norm": 0.3574959345862566, + "learning_rate": 0.00012865886888181058, + "loss": 0.6515, + "step": 1595 + }, + { + "epoch": 0.4256, + "grad_norm": 0.3333632529687575, + "learning_rate": 0.00012857610278349315, + "loss": 0.6089, + "step": 1596 + }, + { + "epoch": 0.42586666666666667, + "grad_norm": 0.3622629955492807, + "learning_rate": 0.00012849331536375614, + "loss": 0.6408, + "step": 1597 + }, + { + "epoch": 0.4261333333333333, + "grad_norm": 0.35328265411820614, + "learning_rate": 0.00012841050668436964, + "loss": 0.6231, + "step": 1598 + }, + { + "epoch": 0.4264, + "grad_norm": 0.34575833118252797, + "learning_rate": 0.0001283276768071194, + "loss": 0.6461, + "step": 1599 + }, + { + "epoch": 0.4266666666666667, + "grad_norm": 0.3394298671827499, + "learning_rate": 0.00012824482579380716, + "loss": 0.6006, + "step": 1600 + }, + { + "epoch": 0.42693333333333333, + "grad_norm": 0.38092917381766306, + "learning_rate": 0.00012816195370625027, + "loss": 0.6414, + "step": 1601 + }, + { + "epoch": 0.4272, + "grad_norm": 0.4090247505789534, + "learning_rate": 0.00012807906060628192, + "loss": 0.676, + "step": 1602 + }, + { + "epoch": 0.42746666666666666, + "grad_norm": 0.35519065725042875, + "learning_rate": 0.00012799614655575095, + "loss": 0.6573, + "step": 1603 + }, + { + "epoch": 0.42773333333333335, + "grad_norm": 0.3660425425972097, + "learning_rate": 0.00012791321161652178, + "loss": 0.6237, + "step": 1604 + }, + { + "epoch": 0.428, + "grad_norm": 0.38797207014166457, + "learning_rate": 0.00012783025585047452, + "loss": 0.6489, + "step": 1605 + }, + { + "epoch": 0.4282666666666667, + "grad_norm": 0.39178472300057515, + "learning_rate": 0.00012774727931950472, + "loss": 0.6582, + "step": 1606 + }, + { + "epoch": 0.4285333333333333, + "grad_norm": 0.3548602555156648, + "learning_rate": 0.00012766428208552347, + "loss": 0.6397, + "step": 1607 + }, + { + "epoch": 0.4288, + "grad_norm": 0.41160462341490733, + "learning_rate": 0.0001275812642104573, + "loss": 0.6398, + "step": 1608 + }, + { + "epoch": 0.42906666666666665, + "grad_norm": 0.3607476132808952, + "learning_rate": 0.00012749822575624812, + "loss": 0.6268, + "step": 1609 + }, + { + "epoch": 0.42933333333333334, + "grad_norm": 0.3813303178413586, + "learning_rate": 0.0001274151667848533, + "loss": 0.6844, + "step": 1610 + }, + { + "epoch": 0.4296, + "grad_norm": 0.35958579899889676, + "learning_rate": 0.00012733208735824528, + "loss": 0.6903, + "step": 1611 + }, + { + "epoch": 0.4298666666666667, + "grad_norm": 0.36347371465089556, + "learning_rate": 0.00012724898753841205, + "loss": 0.6213, + "step": 1612 + }, + { + "epoch": 0.4301333333333333, + "grad_norm": 0.35423954632673743, + "learning_rate": 0.0001271658673873566, + "loss": 0.6185, + "step": 1613 + }, + { + "epoch": 0.4304, + "grad_norm": 0.34804941527345973, + "learning_rate": 0.0001270827269670972, + "loss": 0.6338, + "step": 1614 + }, + { + "epoch": 0.43066666666666664, + "grad_norm": 0.35797360802324796, + "learning_rate": 0.00012699956633966726, + "loss": 0.6683, + "step": 1615 + }, + { + "epoch": 0.43093333333333333, + "grad_norm": 0.3422332310143983, + "learning_rate": 0.00012691638556711513, + "loss": 0.614, + "step": 1616 + }, + { + "epoch": 0.4312, + "grad_norm": 0.34984801626919965, + "learning_rate": 0.00012683318471150434, + "loss": 0.665, + "step": 1617 + }, + { + "epoch": 0.43146666666666667, + "grad_norm": 0.37070912012570234, + "learning_rate": 0.00012674996383491336, + "loss": 0.6208, + "step": 1618 + }, + { + "epoch": 0.43173333333333336, + "grad_norm": 0.3562232026068743, + "learning_rate": 0.00012666672299943552, + "loss": 0.6212, + "step": 1619 + }, + { + "epoch": 0.432, + "grad_norm": 0.36034379188903254, + "learning_rate": 0.00012658346226717917, + "loss": 0.6134, + "step": 1620 + }, + { + "epoch": 0.4322666666666667, + "grad_norm": 0.3852555846814048, + "learning_rate": 0.0001265001817002674, + "loss": 0.6458, + "step": 1621 + }, + { + "epoch": 0.4325333333333333, + "grad_norm": 0.39639876575903304, + "learning_rate": 0.00012641688136083817, + "loss": 0.6625, + "step": 1622 + }, + { + "epoch": 0.4328, + "grad_norm": 0.3584288109644875, + "learning_rate": 0.00012633356131104415, + "loss": 0.6198, + "step": 1623 + }, + { + "epoch": 0.43306666666666666, + "grad_norm": 0.3764514354130069, + "learning_rate": 0.00012625022161305273, + "loss": 0.5889, + "step": 1624 + }, + { + "epoch": 0.43333333333333335, + "grad_norm": 0.3683177623299117, + "learning_rate": 0.00012616686232904594, + "loss": 0.6223, + "step": 1625 + }, + { + "epoch": 0.4336, + "grad_norm": 0.38144315307484467, + "learning_rate": 0.0001260834835212205, + "loss": 0.6279, + "step": 1626 + }, + { + "epoch": 0.4338666666666667, + "grad_norm": 0.36431404847538273, + "learning_rate": 0.00012600008525178756, + "loss": 0.6178, + "step": 1627 + }, + { + "epoch": 0.4341333333333333, + "grad_norm": 0.40141052098830127, + "learning_rate": 0.00012591666758297296, + "loss": 0.6678, + "step": 1628 + }, + { + "epoch": 0.4344, + "grad_norm": 0.3991071901941075, + "learning_rate": 0.00012583323057701687, + "loss": 0.7102, + "step": 1629 + }, + { + "epoch": 0.43466666666666665, + "grad_norm": 0.35572824812524084, + "learning_rate": 0.000125749774296174, + "loss": 0.6251, + "step": 1630 + }, + { + "epoch": 0.43493333333333334, + "grad_norm": 0.39366789227153004, + "learning_rate": 0.0001256662988027133, + "loss": 0.6191, + "step": 1631 + }, + { + "epoch": 0.4352, + "grad_norm": 0.3403861815415497, + "learning_rate": 0.0001255828041589182, + "loss": 0.642, + "step": 1632 + }, + { + "epoch": 0.43546666666666667, + "grad_norm": 0.37721966964369996, + "learning_rate": 0.00012549929042708638, + "loss": 0.6162, + "step": 1633 + }, + { + "epoch": 0.4357333333333333, + "grad_norm": 0.3639172123188235, + "learning_rate": 0.00012541575766952966, + "loss": 0.5947, + "step": 1634 + }, + { + "epoch": 0.436, + "grad_norm": 0.358168330792411, + "learning_rate": 0.0001253322059485742, + "loss": 0.588, + "step": 1635 + }, + { + "epoch": 0.4362666666666667, + "grad_norm": 0.37188529578213403, + "learning_rate": 0.00012524863532656025, + "loss": 0.6287, + "step": 1636 + }, + { + "epoch": 0.43653333333333333, + "grad_norm": 0.3719258571402577, + "learning_rate": 0.00012516504586584216, + "loss": 0.6721, + "step": 1637 + }, + { + "epoch": 0.4368, + "grad_norm": 0.3781674849224295, + "learning_rate": 0.00012508143762878827, + "loss": 0.6446, + "step": 1638 + }, + { + "epoch": 0.43706666666666666, + "grad_norm": 0.3601080826388201, + "learning_rate": 0.00012499781067778107, + "loss": 0.7061, + "step": 1639 + }, + { + "epoch": 0.43733333333333335, + "grad_norm": 0.37014558370817746, + "learning_rate": 0.00012491416507521693, + "loss": 0.6267, + "step": 1640 + }, + { + "epoch": 0.4376, + "grad_norm": 0.3305540827558495, + "learning_rate": 0.0001248305008835061, + "loss": 0.5939, + "step": 1641 + }, + { + "epoch": 0.4378666666666667, + "grad_norm": 0.366051769668418, + "learning_rate": 0.00012474681816507273, + "loss": 0.6585, + "step": 1642 + }, + { + "epoch": 0.4381333333333333, + "grad_norm": 0.335316699243076, + "learning_rate": 0.0001246631169823549, + "loss": 0.6076, + "step": 1643 + }, + { + "epoch": 0.4384, + "grad_norm": 0.3709645424099733, + "learning_rate": 0.00012457939739780432, + "loss": 0.6129, + "step": 1644 + }, + { + "epoch": 0.43866666666666665, + "grad_norm": 0.34149193135982836, + "learning_rate": 0.00012449565947388652, + "loss": 0.5817, + "step": 1645 + }, + { + "epoch": 0.43893333333333334, + "grad_norm": 0.3508505022569726, + "learning_rate": 0.00012441190327308057, + "loss": 0.6479, + "step": 1646 + }, + { + "epoch": 0.4392, + "grad_norm": 0.3611783217736963, + "learning_rate": 0.00012432812885787938, + "loss": 0.5945, + "step": 1647 + }, + { + "epoch": 0.43946666666666667, + "grad_norm": 0.3485188200809396, + "learning_rate": 0.00012424433629078935, + "loss": 0.6976, + "step": 1648 + }, + { + "epoch": 0.4397333333333333, + "grad_norm": 0.3620575104377565, + "learning_rate": 0.00012416052563433042, + "loss": 0.6358, + "step": 1649 + }, + { + "epoch": 0.44, + "grad_norm": 0.34801802409283183, + "learning_rate": 0.000124076696951036, + "loss": 0.636, + "step": 1650 + }, + { + "epoch": 0.44026666666666664, + "grad_norm": 0.33453043806639926, + "learning_rate": 0.00012399285030345302, + "loss": 0.6469, + "step": 1651 + }, + { + "epoch": 0.44053333333333333, + "grad_norm": 0.3693898167794525, + "learning_rate": 0.00012390898575414177, + "loss": 0.6425, + "step": 1652 + }, + { + "epoch": 0.4408, + "grad_norm": 0.34613870679340925, + "learning_rate": 0.00012382510336567592, + "loss": 0.6445, + "step": 1653 + }, + { + "epoch": 0.44106666666666666, + "grad_norm": 0.35515983656786543, + "learning_rate": 0.00012374120320064242, + "loss": 0.6398, + "step": 1654 + }, + { + "epoch": 0.44133333333333336, + "grad_norm": 0.3549805974412868, + "learning_rate": 0.0001236572853216415, + "loss": 0.6044, + "step": 1655 + }, + { + "epoch": 0.4416, + "grad_norm": 0.35548373229040764, + "learning_rate": 0.0001235733497912866, + "loss": 0.6176, + "step": 1656 + }, + { + "epoch": 0.4418666666666667, + "grad_norm": 0.3595147339678269, + "learning_rate": 0.00012348939667220437, + "loss": 0.5967, + "step": 1657 + }, + { + "epoch": 0.4421333333333333, + "grad_norm": 0.3612244276840959, + "learning_rate": 0.00012340542602703455, + "loss": 0.6253, + "step": 1658 + }, + { + "epoch": 0.4424, + "grad_norm": 0.3588165377072431, + "learning_rate": 0.00012332143791842992, + "loss": 0.6079, + "step": 1659 + }, + { + "epoch": 0.44266666666666665, + "grad_norm": 0.3789856812517794, + "learning_rate": 0.00012323743240905634, + "loss": 0.6567, + "step": 1660 + }, + { + "epoch": 0.44293333333333335, + "grad_norm": 0.43464739139017444, + "learning_rate": 0.00012315340956159265, + "loss": 0.615, + "step": 1661 + }, + { + "epoch": 0.4432, + "grad_norm": 0.3434686435471745, + "learning_rate": 0.0001230693694387306, + "loss": 0.6214, + "step": 1662 + }, + { + "epoch": 0.4434666666666667, + "grad_norm": 0.333891109589993, + "learning_rate": 0.00012298531210317486, + "loss": 0.654, + "step": 1663 + }, + { + "epoch": 0.4437333333333333, + "grad_norm": 0.402981975795202, + "learning_rate": 0.00012290123761764295, + "loss": 0.603, + "step": 1664 + }, + { + "epoch": 0.444, + "grad_norm": 0.33075643690214307, + "learning_rate": 0.0001228171460448652, + "loss": 0.6214, + "step": 1665 + }, + { + "epoch": 0.44426666666666664, + "grad_norm": 0.35849018879941275, + "learning_rate": 0.00012273303744758454, + "loss": 0.6595, + "step": 1666 + }, + { + "epoch": 0.44453333333333334, + "grad_norm": 0.341626725365342, + "learning_rate": 0.00012264891188855677, + "loss": 0.6932, + "step": 1667 + }, + { + "epoch": 0.4448, + "grad_norm": 0.3379362954056055, + "learning_rate": 0.0001225647694305503, + "loss": 0.636, + "step": 1668 + }, + { + "epoch": 0.44506666666666667, + "grad_norm": 0.3624029234465875, + "learning_rate": 0.00012248061013634618, + "loss": 0.6605, + "step": 1669 + }, + { + "epoch": 0.44533333333333336, + "grad_norm": 0.350033178585205, + "learning_rate": 0.00012239643406873792, + "loss": 0.6104, + "step": 1670 + }, + { + "epoch": 0.4456, + "grad_norm": 0.3388761621631631, + "learning_rate": 0.00012231224129053163, + "loss": 0.5925, + "step": 1671 + }, + { + "epoch": 0.4458666666666667, + "grad_norm": 0.3534446396927527, + "learning_rate": 0.0001222280318645459, + "loss": 0.5733, + "step": 1672 + }, + { + "epoch": 0.4461333333333333, + "grad_norm": 0.37402065822247554, + "learning_rate": 0.00012214380585361166, + "loss": 0.6791, + "step": 1673 + }, + { + "epoch": 0.4464, + "grad_norm": 0.3574677235990412, + "learning_rate": 0.0001220595633205723, + "loss": 0.6748, + "step": 1674 + }, + { + "epoch": 0.44666666666666666, + "grad_norm": 0.34601388649427123, + "learning_rate": 0.00012197530432828348, + "loss": 0.6189, + "step": 1675 + }, + { + "epoch": 0.44693333333333335, + "grad_norm": 0.3501706946058701, + "learning_rate": 0.00012189102893961317, + "loss": 0.6495, + "step": 1676 + }, + { + "epoch": 0.4472, + "grad_norm": 0.35359241511647227, + "learning_rate": 0.00012180673721744156, + "loss": 0.6037, + "step": 1677 + }, + { + "epoch": 0.4474666666666667, + "grad_norm": 0.34561338141332965, + "learning_rate": 0.00012172242922466103, + "loss": 0.6455, + "step": 1678 + }, + { + "epoch": 0.4477333333333333, + "grad_norm": 0.35355188513544344, + "learning_rate": 0.00012163810502417611, + "loss": 0.6346, + "step": 1679 + }, + { + "epoch": 0.448, + "grad_norm": 0.3814744660532964, + "learning_rate": 0.0001215537646789034, + "loss": 0.6075, + "step": 1680 + }, + { + "epoch": 0.44826666666666665, + "grad_norm": 0.3579179619374581, + "learning_rate": 0.00012146940825177158, + "loss": 0.623, + "step": 1681 + }, + { + "epoch": 0.44853333333333334, + "grad_norm": 0.3422599240830082, + "learning_rate": 0.0001213850358057213, + "loss": 0.6525, + "step": 1682 + }, + { + "epoch": 0.4488, + "grad_norm": 0.34462088811090014, + "learning_rate": 0.00012130064740370517, + "loss": 0.6662, + "step": 1683 + }, + { + "epoch": 0.44906666666666667, + "grad_norm": 0.3254610992854232, + "learning_rate": 0.00012121624310868773, + "loss": 0.6245, + "step": 1684 + }, + { + "epoch": 0.4493333333333333, + "grad_norm": 0.348062684925965, + "learning_rate": 0.00012113182298364533, + "loss": 0.6503, + "step": 1685 + }, + { + "epoch": 0.4496, + "grad_norm": 0.34043658203295085, + "learning_rate": 0.00012104738709156615, + "loss": 0.6134, + "step": 1686 + }, + { + "epoch": 0.4498666666666667, + "grad_norm": 0.34879711942344566, + "learning_rate": 0.00012096293549545017, + "loss": 0.6362, + "step": 1687 + }, + { + "epoch": 0.45013333333333333, + "grad_norm": 0.34231886384769017, + "learning_rate": 0.00012087846825830902, + "loss": 0.6632, + "step": 1688 + }, + { + "epoch": 0.4504, + "grad_norm": 0.35026029746962084, + "learning_rate": 0.0001207939854431661, + "loss": 0.6335, + "step": 1689 + }, + { + "epoch": 0.45066666666666666, + "grad_norm": 0.350617316823054, + "learning_rate": 0.0001207094871130563, + "loss": 0.6261, + "step": 1690 + }, + { + "epoch": 0.45093333333333335, + "grad_norm": 0.3345279027657184, + "learning_rate": 0.0001206249733310262, + "loss": 0.6488, + "step": 1691 + }, + { + "epoch": 0.4512, + "grad_norm": 0.33078409540811543, + "learning_rate": 0.00012054044416013388, + "loss": 0.6113, + "step": 1692 + }, + { + "epoch": 0.4514666666666667, + "grad_norm": 0.3667361800254136, + "learning_rate": 0.00012045589966344884, + "loss": 0.6929, + "step": 1693 + }, + { + "epoch": 0.4517333333333333, + "grad_norm": 0.34426405973469115, + "learning_rate": 0.00012037133990405209, + "loss": 0.6006, + "step": 1694 + }, + { + "epoch": 0.452, + "grad_norm": 0.35922862308746506, + "learning_rate": 0.00012028676494503602, + "loss": 0.6591, + "step": 1695 + }, + { + "epoch": 0.45226666666666665, + "grad_norm": 0.3718053950465859, + "learning_rate": 0.00012020217484950434, + "loss": 0.623, + "step": 1696 + }, + { + "epoch": 0.45253333333333334, + "grad_norm": 0.3488618846207442, + "learning_rate": 0.00012011756968057202, + "loss": 0.6142, + "step": 1697 + }, + { + "epoch": 0.4528, + "grad_norm": 0.3464279258634417, + "learning_rate": 0.00012003294950136531, + "loss": 0.5909, + "step": 1698 + }, + { + "epoch": 0.4530666666666667, + "grad_norm": 0.3480304555655408, + "learning_rate": 0.00011994831437502173, + "loss": 0.6487, + "step": 1699 + }, + { + "epoch": 0.4533333333333333, + "grad_norm": 0.3572698025800835, + "learning_rate": 0.00011986366436468985, + "loss": 0.6558, + "step": 1700 + }, + { + "epoch": 0.4536, + "grad_norm": 0.3445904125255456, + "learning_rate": 0.00011977899953352935, + "loss": 0.5715, + "step": 1701 + }, + { + "epoch": 0.45386666666666664, + "grad_norm": 0.36293733847573917, + "learning_rate": 0.00011969431994471103, + "loss": 0.6426, + "step": 1702 + }, + { + "epoch": 0.45413333333333333, + "grad_norm": 0.3480435846794133, + "learning_rate": 0.00011960962566141666, + "loss": 0.6391, + "step": 1703 + }, + { + "epoch": 0.4544, + "grad_norm": 0.3445215698334862, + "learning_rate": 0.00011952491674683901, + "loss": 0.6097, + "step": 1704 + }, + { + "epoch": 0.45466666666666666, + "grad_norm": 0.34893762531055356, + "learning_rate": 0.0001194401932641817, + "loss": 0.6539, + "step": 1705 + }, + { + "epoch": 0.45493333333333336, + "grad_norm": 0.3426041167588452, + "learning_rate": 0.00011935545527665928, + "loss": 0.5759, + "step": 1706 + }, + { + "epoch": 0.4552, + "grad_norm": 0.3663819775175573, + "learning_rate": 0.00011927070284749708, + "loss": 0.6282, + "step": 1707 + }, + { + "epoch": 0.4554666666666667, + "grad_norm": 0.34182320319611004, + "learning_rate": 0.0001191859360399313, + "loss": 0.6234, + "step": 1708 + }, + { + "epoch": 0.4557333333333333, + "grad_norm": 0.36838546552480844, + "learning_rate": 0.0001191011549172087, + "loss": 0.627, + "step": 1709 + }, + { + "epoch": 0.456, + "grad_norm": 0.34560021152414283, + "learning_rate": 0.00011901635954258688, + "loss": 0.5874, + "step": 1710 + }, + { + "epoch": 0.45626666666666665, + "grad_norm": 0.3412508047279534, + "learning_rate": 0.00011893154997933398, + "loss": 0.6117, + "step": 1711 + }, + { + "epoch": 0.45653333333333335, + "grad_norm": 0.35030211136931205, + "learning_rate": 0.00011884672629072882, + "loss": 0.6545, + "step": 1712 + }, + { + "epoch": 0.4568, + "grad_norm": 0.4041961222843157, + "learning_rate": 0.0001187618885400606, + "loss": 0.6445, + "step": 1713 + }, + { + "epoch": 0.4570666666666667, + "grad_norm": 0.3469558132961199, + "learning_rate": 0.00011867703679062915, + "loss": 0.623, + "step": 1714 + }, + { + "epoch": 0.4573333333333333, + "grad_norm": 0.3757630011816648, + "learning_rate": 0.00011859217110574475, + "loss": 0.6392, + "step": 1715 + }, + { + "epoch": 0.4576, + "grad_norm": 0.3183113537973486, + "learning_rate": 0.00011850729154872797, + "loss": 0.5802, + "step": 1716 + }, + { + "epoch": 0.45786666666666664, + "grad_norm": 0.3488258154667218, + "learning_rate": 0.0001184223981829098, + "loss": 0.637, + "step": 1717 + }, + { + "epoch": 0.45813333333333334, + "grad_norm": 0.3557434912545583, + "learning_rate": 0.00011833749107163156, + "loss": 0.6552, + "step": 1718 + }, + { + "epoch": 0.4584, + "grad_norm": 0.34513655114089387, + "learning_rate": 0.00011825257027824481, + "loss": 0.6212, + "step": 1719 + }, + { + "epoch": 0.45866666666666667, + "grad_norm": 0.33245413565433296, + "learning_rate": 0.00011816763586611121, + "loss": 0.6141, + "step": 1720 + }, + { + "epoch": 0.45893333333333336, + "grad_norm": 0.3688351242992989, + "learning_rate": 0.00011808268789860273, + "loss": 0.6184, + "step": 1721 + }, + { + "epoch": 0.4592, + "grad_norm": 0.3760740691404871, + "learning_rate": 0.00011799772643910137, + "loss": 0.6183, + "step": 1722 + }, + { + "epoch": 0.4594666666666667, + "grad_norm": 0.3459246535954564, + "learning_rate": 0.00011791275155099928, + "loss": 0.6185, + "step": 1723 + }, + { + "epoch": 0.4597333333333333, + "grad_norm": 0.3794025437279472, + "learning_rate": 0.0001178277632976985, + "loss": 0.634, + "step": 1724 + }, + { + "epoch": 0.46, + "grad_norm": 0.3356725232679871, + "learning_rate": 0.00011774276174261111, + "loss": 0.6101, + "step": 1725 + }, + { + "epoch": 0.46026666666666666, + "grad_norm": 0.3638540939453313, + "learning_rate": 0.00011765774694915917, + "loss": 0.629, + "step": 1726 + }, + { + "epoch": 0.46053333333333335, + "grad_norm": 0.3739281041623599, + "learning_rate": 0.00011757271898077455, + "loss": 0.6349, + "step": 1727 + }, + { + "epoch": 0.4608, + "grad_norm": 0.3506253837609906, + "learning_rate": 0.00011748767790089896, + "loss": 0.6114, + "step": 1728 + }, + { + "epoch": 0.4610666666666667, + "grad_norm": 0.3563669109528651, + "learning_rate": 0.00011740262377298389, + "loss": 0.6348, + "step": 1729 + }, + { + "epoch": 0.4613333333333333, + "grad_norm": 0.34733398880304556, + "learning_rate": 0.00011731755666049059, + "loss": 0.6027, + "step": 1730 + }, + { + "epoch": 0.4616, + "grad_norm": 0.3539341602831934, + "learning_rate": 0.00011723247662688999, + "loss": 0.6052, + "step": 1731 + }, + { + "epoch": 0.46186666666666665, + "grad_norm": 0.3707876295803424, + "learning_rate": 0.00011714738373566261, + "loss": 0.6527, + "step": 1732 + }, + { + "epoch": 0.46213333333333334, + "grad_norm": 0.3646423466641055, + "learning_rate": 0.00011706227805029863, + "loss": 0.6571, + "step": 1733 + }, + { + "epoch": 0.4624, + "grad_norm": 0.33711712350098677, + "learning_rate": 0.00011697715963429777, + "loss": 0.616, + "step": 1734 + }, + { + "epoch": 0.46266666666666667, + "grad_norm": 0.3451068630337151, + "learning_rate": 0.0001168920285511692, + "loss": 0.6447, + "step": 1735 + }, + { + "epoch": 0.4629333333333333, + "grad_norm": 0.3472717885125027, + "learning_rate": 0.00011680688486443161, + "loss": 0.6075, + "step": 1736 + }, + { + "epoch": 0.4632, + "grad_norm": 0.3624518453803178, + "learning_rate": 0.00011672172863761301, + "loss": 0.6025, + "step": 1737 + }, + { + "epoch": 0.4634666666666667, + "grad_norm": 0.3672398998733144, + "learning_rate": 0.00011663655993425086, + "loss": 0.5607, + "step": 1738 + }, + { + "epoch": 0.46373333333333333, + "grad_norm": 0.3498099392730004, + "learning_rate": 0.00011655137881789187, + "loss": 0.6097, + "step": 1739 + }, + { + "epoch": 0.464, + "grad_norm": 0.34031625607024807, + "learning_rate": 0.000116466185352092, + "loss": 0.6012, + "step": 1740 + }, + { + "epoch": 0.46426666666666666, + "grad_norm": 0.3333234723759394, + "learning_rate": 0.00011638097960041646, + "loss": 0.6483, + "step": 1741 + }, + { + "epoch": 0.46453333333333335, + "grad_norm": 0.3742362318618192, + "learning_rate": 0.0001162957616264396, + "loss": 0.6419, + "step": 1742 + }, + { + "epoch": 0.4648, + "grad_norm": 0.36115062026580963, + "learning_rate": 0.00011621053149374492, + "loss": 0.6077, + "step": 1743 + }, + { + "epoch": 0.4650666666666667, + "grad_norm": 0.3961085014676561, + "learning_rate": 0.00011612528926592499, + "loss": 0.6623, + "step": 1744 + }, + { + "epoch": 0.4653333333333333, + "grad_norm": 0.3504639635390535, + "learning_rate": 0.00011604003500658135, + "loss": 0.6324, + "step": 1745 + }, + { + "epoch": 0.4656, + "grad_norm": 0.36082443526527486, + "learning_rate": 0.0001159547687793246, + "loss": 0.5789, + "step": 1746 + }, + { + "epoch": 0.46586666666666665, + "grad_norm": 0.3419926151679195, + "learning_rate": 0.00011586949064777424, + "loss": 0.6101, + "step": 1747 + }, + { + "epoch": 0.46613333333333334, + "grad_norm": 0.34934744773225196, + "learning_rate": 0.0001157842006755586, + "loss": 0.612, + "step": 1748 + }, + { + "epoch": 0.4664, + "grad_norm": 0.34795307189211017, + "learning_rate": 0.00011569889892631487, + "loss": 0.626, + "step": 1749 + }, + { + "epoch": 0.4666666666666667, + "grad_norm": 0.3565926984174023, + "learning_rate": 0.00011561358546368905, + "loss": 0.5923, + "step": 1750 + }, + { + "epoch": 0.4669333333333333, + "grad_norm": 0.3302871232944887, + "learning_rate": 0.00011552826035133594, + "loss": 0.6165, + "step": 1751 + }, + { + "epoch": 0.4672, + "grad_norm": 0.34648386915756985, + "learning_rate": 0.00011544292365291889, + "loss": 0.6059, + "step": 1752 + }, + { + "epoch": 0.46746666666666664, + "grad_norm": 0.3357397199945023, + "learning_rate": 0.00011535757543210995, + "loss": 0.6072, + "step": 1753 + }, + { + "epoch": 0.46773333333333333, + "grad_norm": 0.3811455486863015, + "learning_rate": 0.00011527221575258984, + "loss": 0.631, + "step": 1754 + }, + { + "epoch": 0.468, + "grad_norm": 0.3450542452834387, + "learning_rate": 0.00011518684467804777, + "loss": 0.6007, + "step": 1755 + }, + { + "epoch": 0.46826666666666666, + "grad_norm": 0.3441023171434445, + "learning_rate": 0.00011510146227218141, + "loss": 0.5751, + "step": 1756 + }, + { + "epoch": 0.46853333333333336, + "grad_norm": 0.3700175540134214, + "learning_rate": 0.000115016068598697, + "loss": 0.6554, + "step": 1757 + }, + { + "epoch": 0.4688, + "grad_norm": 0.4034366157451916, + "learning_rate": 0.00011493066372130907, + "loss": 0.6107, + "step": 1758 + }, + { + "epoch": 0.4690666666666667, + "grad_norm": 0.34852860508031075, + "learning_rate": 0.00011484524770374056, + "loss": 0.618, + "step": 1759 + }, + { + "epoch": 0.4693333333333333, + "grad_norm": 0.34762024645445466, + "learning_rate": 0.00011475982060972273, + "loss": 0.6791, + "step": 1760 + }, + { + "epoch": 0.4696, + "grad_norm": 0.36750238109278827, + "learning_rate": 0.00011467438250299509, + "loss": 0.6252, + "step": 1761 + }, + { + "epoch": 0.46986666666666665, + "grad_norm": 0.34093990152993797, + "learning_rate": 0.0001145889334473054, + "loss": 0.6111, + "step": 1762 + }, + { + "epoch": 0.47013333333333335, + "grad_norm": 0.34826964138058425, + "learning_rate": 0.00011450347350640948, + "loss": 0.6058, + "step": 1763 + }, + { + "epoch": 0.4704, + "grad_norm": 0.3488992313334934, + "learning_rate": 0.00011441800274407146, + "loss": 0.5873, + "step": 1764 + }, + { + "epoch": 0.4706666666666667, + "grad_norm": 0.40354408907069983, + "learning_rate": 0.00011433252122406334, + "loss": 0.6472, + "step": 1765 + }, + { + "epoch": 0.4709333333333333, + "grad_norm": 0.3704286016903338, + "learning_rate": 0.00011424702901016533, + "loss": 0.6075, + "step": 1766 + }, + { + "epoch": 0.4712, + "grad_norm": 0.36193346339895377, + "learning_rate": 0.00011416152616616547, + "loss": 0.6206, + "step": 1767 + }, + { + "epoch": 0.47146666666666665, + "grad_norm": 0.35700756996017685, + "learning_rate": 0.00011407601275585981, + "loss": 0.6906, + "step": 1768 + }, + { + "epoch": 0.47173333333333334, + "grad_norm": 0.3574041314252541, + "learning_rate": 0.00011399048884305226, + "loss": 0.6414, + "step": 1769 + }, + { + "epoch": 0.472, + "grad_norm": 0.3520379979890098, + "learning_rate": 0.0001139049544915546, + "loss": 0.6065, + "step": 1770 + }, + { + "epoch": 0.47226666666666667, + "grad_norm": 0.3357214424657432, + "learning_rate": 0.00011381940976518634, + "loss": 0.5982, + "step": 1771 + }, + { + "epoch": 0.47253333333333336, + "grad_norm": 0.33496458510409605, + "learning_rate": 0.00011373385472777478, + "loss": 0.5652, + "step": 1772 + }, + { + "epoch": 0.4728, + "grad_norm": 0.36174083734164647, + "learning_rate": 0.00011364828944315489, + "loss": 0.6335, + "step": 1773 + }, + { + "epoch": 0.4730666666666667, + "grad_norm": 0.34477653403624764, + "learning_rate": 0.0001135627139751693, + "loss": 0.6089, + "step": 1774 + }, + { + "epoch": 0.47333333333333333, + "grad_norm": 0.37113886814844055, + "learning_rate": 0.00011347712838766824, + "loss": 0.6395, + "step": 1775 + }, + { + "epoch": 0.4736, + "grad_norm": 0.3557849084199261, + "learning_rate": 0.00011339153274450945, + "loss": 0.6139, + "step": 1776 + }, + { + "epoch": 0.47386666666666666, + "grad_norm": 0.34053610185585476, + "learning_rate": 0.00011330592710955823, + "loss": 0.5975, + "step": 1777 + }, + { + "epoch": 0.47413333333333335, + "grad_norm": 0.3541955335602447, + "learning_rate": 0.00011322031154668731, + "loss": 0.6165, + "step": 1778 + }, + { + "epoch": 0.4744, + "grad_norm": 0.34643727950645714, + "learning_rate": 0.00011313468611977678, + "loss": 0.656, + "step": 1779 + }, + { + "epoch": 0.4746666666666667, + "grad_norm": 0.3460536062176225, + "learning_rate": 0.00011304905089271418, + "loss": 0.6254, + "step": 1780 + }, + { + "epoch": 0.4749333333333333, + "grad_norm": 0.3336864302507506, + "learning_rate": 0.0001129634059293943, + "loss": 0.6259, + "step": 1781 + }, + { + "epoch": 0.4752, + "grad_norm": 0.35071888371097154, + "learning_rate": 0.00011287775129371925, + "loss": 0.6135, + "step": 1782 + }, + { + "epoch": 0.47546666666666665, + "grad_norm": 0.34251890451142475, + "learning_rate": 0.00011279208704959827, + "loss": 0.5884, + "step": 1783 + }, + { + "epoch": 0.47573333333333334, + "grad_norm": 0.4198573769517053, + "learning_rate": 0.00011270641326094784, + "loss": 0.6486, + "step": 1784 + }, + { + "epoch": 0.476, + "grad_norm": 0.3733782715545972, + "learning_rate": 0.00011262072999169155, + "loss": 0.6619, + "step": 1785 + }, + { + "epoch": 0.47626666666666667, + "grad_norm": 0.3636678351224484, + "learning_rate": 0.00011253503730576005, + "loss": 0.6435, + "step": 1786 + }, + { + "epoch": 0.4765333333333333, + "grad_norm": 0.3722293655153486, + "learning_rate": 0.000112449335267091, + "loss": 0.7063, + "step": 1787 + }, + { + "epoch": 0.4768, + "grad_norm": 0.4045494203049154, + "learning_rate": 0.00011236362393962907, + "loss": 0.6889, + "step": 1788 + }, + { + "epoch": 0.4770666666666667, + "grad_norm": 0.41141364057819396, + "learning_rate": 0.00011227790338732584, + "loss": 0.6454, + "step": 1789 + }, + { + "epoch": 0.47733333333333333, + "grad_norm": 0.35524714004015817, + "learning_rate": 0.00011219217367413979, + "loss": 0.6594, + "step": 1790 + }, + { + "epoch": 0.4776, + "grad_norm": 0.3548249027270856, + "learning_rate": 0.00011210643486403622, + "loss": 0.6848, + "step": 1791 + }, + { + "epoch": 0.47786666666666666, + "grad_norm": 0.34990685689062734, + "learning_rate": 0.00011202068702098725, + "loss": 0.6414, + "step": 1792 + }, + { + "epoch": 0.47813333333333335, + "grad_norm": 0.36062907826450685, + "learning_rate": 0.00011193493020897173, + "loss": 0.7016, + "step": 1793 + }, + { + "epoch": 0.4784, + "grad_norm": 0.3657531065016614, + "learning_rate": 0.00011184916449197509, + "loss": 0.6301, + "step": 1794 + }, + { + "epoch": 0.4786666666666667, + "grad_norm": 0.3427264254470834, + "learning_rate": 0.00011176338993398958, + "loss": 0.6625, + "step": 1795 + }, + { + "epoch": 0.4789333333333333, + "grad_norm": 0.3582835531352709, + "learning_rate": 0.00011167760659901396, + "loss": 0.6304, + "step": 1796 + }, + { + "epoch": 0.4792, + "grad_norm": 0.37796506398892793, + "learning_rate": 0.00011159181455105354, + "loss": 0.638, + "step": 1797 + }, + { + "epoch": 0.47946666666666665, + "grad_norm": 0.36483684177211584, + "learning_rate": 0.0001115060138541201, + "loss": 0.679, + "step": 1798 + }, + { + "epoch": 0.47973333333333334, + "grad_norm": 0.4053911364643288, + "learning_rate": 0.00011142020457223194, + "loss": 0.6475, + "step": 1799 + }, + { + "epoch": 0.48, + "grad_norm": 0.3429016214299657, + "learning_rate": 0.00011133438676941374, + "loss": 0.6388, + "step": 1800 + }, + { + "epoch": 0.4802666666666667, + "grad_norm": 0.3328225323854466, + "learning_rate": 0.00011124856050969656, + "loss": 0.6086, + "step": 1801 + }, + { + "epoch": 0.4805333333333333, + "grad_norm": 0.3759503815104567, + "learning_rate": 0.00011116272585711772, + "loss": 0.6549, + "step": 1802 + }, + { + "epoch": 0.4808, + "grad_norm": 0.3816139449550581, + "learning_rate": 0.00011107688287572075, + "loss": 0.6584, + "step": 1803 + }, + { + "epoch": 0.48106666666666664, + "grad_norm": 0.3488440642535208, + "learning_rate": 0.00011099103162955558, + "loss": 0.6516, + "step": 1804 + }, + { + "epoch": 0.48133333333333334, + "grad_norm": 0.31898300396226115, + "learning_rate": 0.00011090517218267817, + "loss": 0.5368, + "step": 1805 + }, + { + "epoch": 0.4816, + "grad_norm": 0.3304373238663184, + "learning_rate": 0.00011081930459915057, + "loss": 0.5734, + "step": 1806 + }, + { + "epoch": 0.48186666666666667, + "grad_norm": 0.36577036159963267, + "learning_rate": 0.000110733428943041, + "loss": 0.6458, + "step": 1807 + }, + { + "epoch": 0.48213333333333336, + "grad_norm": 0.34418727674948435, + "learning_rate": 0.00011064754527842365, + "loss": 0.6379, + "step": 1808 + }, + { + "epoch": 0.4824, + "grad_norm": 0.3485730493230742, + "learning_rate": 0.00011056165366937868, + "loss": 0.6747, + "step": 1809 + }, + { + "epoch": 0.4826666666666667, + "grad_norm": 0.3579638159305423, + "learning_rate": 0.00011047575417999221, + "loss": 0.6519, + "step": 1810 + }, + { + "epoch": 0.4829333333333333, + "grad_norm": 0.37974716507113593, + "learning_rate": 0.00011038984687435624, + "loss": 0.612, + "step": 1811 + }, + { + "epoch": 0.4832, + "grad_norm": 0.3853979813831866, + "learning_rate": 0.00011030393181656853, + "loss": 0.6636, + "step": 1812 + }, + { + "epoch": 0.48346666666666666, + "grad_norm": 0.3563252982196125, + "learning_rate": 0.00011021800907073274, + "loss": 0.616, + "step": 1813 + }, + { + "epoch": 0.48373333333333335, + "grad_norm": 0.33341699980169726, + "learning_rate": 0.00011013207870095817, + "loss": 0.6165, + "step": 1814 + }, + { + "epoch": 0.484, + "grad_norm": 0.3495071001339214, + "learning_rate": 0.00011004614077135982, + "loss": 0.643, + "step": 1815 + }, + { + "epoch": 0.4842666666666667, + "grad_norm": 0.3562240275221454, + "learning_rate": 0.00010996019534605839, + "loss": 0.6336, + "step": 1816 + }, + { + "epoch": 0.4845333333333333, + "grad_norm": 0.36402598215189086, + "learning_rate": 0.00010987424248918013, + "loss": 0.6811, + "step": 1817 + }, + { + "epoch": 0.4848, + "grad_norm": 0.33606282717664393, + "learning_rate": 0.0001097882822648568, + "loss": 0.603, + "step": 1818 + }, + { + "epoch": 0.48506666666666665, + "grad_norm": 0.3520338401948451, + "learning_rate": 0.00010970231473722576, + "loss": 0.6221, + "step": 1819 + }, + { + "epoch": 0.48533333333333334, + "grad_norm": 0.3395677931780785, + "learning_rate": 0.00010961633997042973, + "loss": 0.6106, + "step": 1820 + }, + { + "epoch": 0.4856, + "grad_norm": 0.3336757458331368, + "learning_rate": 0.00010953035802861686, + "loss": 0.5563, + "step": 1821 + }, + { + "epoch": 0.48586666666666667, + "grad_norm": 0.34310758385784323, + "learning_rate": 0.00010944436897594064, + "loss": 0.5801, + "step": 1822 + }, + { + "epoch": 0.4861333333333333, + "grad_norm": 0.33464032436838764, + "learning_rate": 0.00010935837287655986, + "loss": 0.6048, + "step": 1823 + }, + { + "epoch": 0.4864, + "grad_norm": 0.31986865601205045, + "learning_rate": 0.00010927236979463862, + "loss": 0.5787, + "step": 1824 + }, + { + "epoch": 0.4866666666666667, + "grad_norm": 0.34685655386156217, + "learning_rate": 0.00010918635979434622, + "loss": 0.6008, + "step": 1825 + }, + { + "epoch": 0.48693333333333333, + "grad_norm": 0.3511924912830929, + "learning_rate": 0.00010910034293985701, + "loss": 0.6118, + "step": 1826 + }, + { + "epoch": 0.4872, + "grad_norm": 0.4194661149245277, + "learning_rate": 0.0001090143192953506, + "loss": 0.6495, + "step": 1827 + }, + { + "epoch": 0.48746666666666666, + "grad_norm": 0.3456593638418228, + "learning_rate": 0.00010892828892501161, + "loss": 0.6416, + "step": 1828 + }, + { + "epoch": 0.48773333333333335, + "grad_norm": 0.35914563192748006, + "learning_rate": 0.00010884225189302968, + "loss": 0.6221, + "step": 1829 + }, + { + "epoch": 0.488, + "grad_norm": 0.3420030648648187, + "learning_rate": 0.00010875620826359937, + "loss": 0.5958, + "step": 1830 + }, + { + "epoch": 0.4882666666666667, + "grad_norm": 0.34803579730104184, + "learning_rate": 0.00010867015810092026, + "loss": 0.584, + "step": 1831 + }, + { + "epoch": 0.4885333333333333, + "grad_norm": 0.33939694787610614, + "learning_rate": 0.00010858410146919674, + "loss": 0.6265, + "step": 1832 + }, + { + "epoch": 0.4888, + "grad_norm": 0.35164434152982926, + "learning_rate": 0.00010849803843263802, + "loss": 0.6404, + "step": 1833 + }, + { + "epoch": 0.48906666666666665, + "grad_norm": 0.34503349278235446, + "learning_rate": 0.0001084119690554581, + "loss": 0.6143, + "step": 1834 + }, + { + "epoch": 0.48933333333333334, + "grad_norm": 0.3593120141383835, + "learning_rate": 0.00010832589340187573, + "loss": 0.6367, + "step": 1835 + }, + { + "epoch": 0.4896, + "grad_norm": 0.3435343643175995, + "learning_rate": 0.00010823981153611438, + "loss": 0.586, + "step": 1836 + }, + { + "epoch": 0.4898666666666667, + "grad_norm": 0.3829802288187017, + "learning_rate": 0.00010815372352240203, + "loss": 0.6401, + "step": 1837 + }, + { + "epoch": 0.4901333333333333, + "grad_norm": 0.5504776432734196, + "learning_rate": 0.00010806762942497138, + "loss": 0.6456, + "step": 1838 + }, + { + "epoch": 0.4904, + "grad_norm": 0.3432403237218622, + "learning_rate": 0.00010798152930805958, + "loss": 0.6529, + "step": 1839 + }, + { + "epoch": 0.49066666666666664, + "grad_norm": 0.3362221768777311, + "learning_rate": 0.00010789542323590838, + "loss": 0.579, + "step": 1840 + }, + { + "epoch": 0.49093333333333333, + "grad_norm": 0.3523883313706826, + "learning_rate": 0.00010780931127276379, + "loss": 0.6728, + "step": 1841 + }, + { + "epoch": 0.4912, + "grad_norm": 0.3671914793983141, + "learning_rate": 0.00010772319348287638, + "loss": 0.6451, + "step": 1842 + }, + { + "epoch": 0.49146666666666666, + "grad_norm": 0.34033011515199435, + "learning_rate": 0.000107637069930501, + "loss": 0.645, + "step": 1843 + }, + { + "epoch": 0.49173333333333336, + "grad_norm": 0.3602503510963793, + "learning_rate": 0.00010755094067989684, + "loss": 0.6383, + "step": 1844 + }, + { + "epoch": 0.492, + "grad_norm": 0.35038007368810054, + "learning_rate": 0.00010746480579532727, + "loss": 0.6729, + "step": 1845 + }, + { + "epoch": 0.4922666666666667, + "grad_norm": 0.3473701853281187, + "learning_rate": 0.00010737866534105993, + "loss": 0.5375, + "step": 1846 + }, + { + "epoch": 0.4925333333333333, + "grad_norm": 0.3708167977397802, + "learning_rate": 0.0001072925193813666, + "loss": 0.66, + "step": 1847 + }, + { + "epoch": 0.4928, + "grad_norm": 0.4137343485050253, + "learning_rate": 0.00010720636798052314, + "loss": 0.6487, + "step": 1848 + }, + { + "epoch": 0.49306666666666665, + "grad_norm": 0.35047352190009584, + "learning_rate": 0.0001071202112028095, + "loss": 0.6351, + "step": 1849 + }, + { + "epoch": 0.49333333333333335, + "grad_norm": 0.3428848289584383, + "learning_rate": 0.00010703404911250962, + "loss": 0.6065, + "step": 1850 + }, + { + "epoch": 0.4936, + "grad_norm": 0.3712163179708831, + "learning_rate": 0.00010694788177391145, + "loss": 0.6395, + "step": 1851 + }, + { + "epoch": 0.4938666666666667, + "grad_norm": 0.34258943995828145, + "learning_rate": 0.00010686170925130678, + "loss": 0.5836, + "step": 1852 + }, + { + "epoch": 0.4941333333333333, + "grad_norm": 0.3634496024934811, + "learning_rate": 0.00010677553160899135, + "loss": 0.5987, + "step": 1853 + }, + { + "epoch": 0.4944, + "grad_norm": 0.3494288944374076, + "learning_rate": 0.00010668934891126458, + "loss": 0.6173, + "step": 1854 + }, + { + "epoch": 0.49466666666666664, + "grad_norm": 0.3858477887669424, + "learning_rate": 0.00010660316122242988, + "loss": 0.6689, + "step": 1855 + }, + { + "epoch": 0.49493333333333334, + "grad_norm": 0.3316806723574418, + "learning_rate": 0.00010651696860679425, + "loss": 0.6091, + "step": 1856 + }, + { + "epoch": 0.4952, + "grad_norm": 0.34001195560173475, + "learning_rate": 0.00010643077112866831, + "loss": 0.5693, + "step": 1857 + }, + { + "epoch": 0.49546666666666667, + "grad_norm": 0.3730457710225014, + "learning_rate": 0.00010634456885236643, + "loss": 0.6383, + "step": 1858 + }, + { + "epoch": 0.49573333333333336, + "grad_norm": 0.3529477327900418, + "learning_rate": 0.0001062583618422065, + "loss": 0.6602, + "step": 1859 + }, + { + "epoch": 0.496, + "grad_norm": 0.35275175353116595, + "learning_rate": 0.00010617215016250996, + "loss": 0.585, + "step": 1860 + }, + { + "epoch": 0.4962666666666667, + "grad_norm": 0.3436185227798171, + "learning_rate": 0.00010608593387760171, + "loss": 0.6216, + "step": 1861 + }, + { + "epoch": 0.4965333333333333, + "grad_norm": 0.3403844645528916, + "learning_rate": 0.00010599971305181012, + "loss": 0.6036, + "step": 1862 + }, + { + "epoch": 0.4968, + "grad_norm": 0.3765236264940643, + "learning_rate": 0.00010591348774946687, + "loss": 0.6681, + "step": 1863 + }, + { + "epoch": 0.49706666666666666, + "grad_norm": 0.3739579366865447, + "learning_rate": 0.00010582725803490714, + "loss": 0.6859, + "step": 1864 + }, + { + "epoch": 0.49733333333333335, + "grad_norm": 0.3413576702802035, + "learning_rate": 0.00010574102397246921, + "loss": 0.6093, + "step": 1865 + }, + { + "epoch": 0.4976, + "grad_norm": 0.35345468868149615, + "learning_rate": 0.00010565478562649476, + "loss": 0.6227, + "step": 1866 + }, + { + "epoch": 0.4978666666666667, + "grad_norm": 0.3588273500333482, + "learning_rate": 0.00010556854306132855, + "loss": 0.6199, + "step": 1867 + }, + { + "epoch": 0.4981333333333333, + "grad_norm": 0.3866091359716914, + "learning_rate": 0.00010548229634131858, + "loss": 0.6377, + "step": 1868 + }, + { + "epoch": 0.4984, + "grad_norm": 0.3240242407888893, + "learning_rate": 0.0001053960455308159, + "loss": 0.5541, + "step": 1869 + }, + { + "epoch": 0.49866666666666665, + "grad_norm": 0.3477715307092799, + "learning_rate": 0.00010530979069417461, + "loss": 0.5971, + "step": 1870 + }, + { + "epoch": 0.49893333333333334, + "grad_norm": 0.34168835161930505, + "learning_rate": 0.00010522353189575183, + "loss": 0.6419, + "step": 1871 + }, + { + "epoch": 0.4992, + "grad_norm": 0.3530478591879385, + "learning_rate": 0.00010513726919990762, + "loss": 0.6347, + "step": 1872 + }, + { + "epoch": 0.49946666666666667, + "grad_norm": 0.32991880649694866, + "learning_rate": 0.000105051002671005, + "loss": 0.5562, + "step": 1873 + }, + { + "epoch": 0.4997333333333333, + "grad_norm": 0.340019310969109, + "learning_rate": 0.00010496473237340978, + "loss": 0.6271, + "step": 1874 + }, + { + "epoch": 0.5, + "grad_norm": 0.3403011764625951, + "learning_rate": 0.00010487845837149062, + "loss": 0.5954, + "step": 1875 + }, + { + "epoch": 0.5002666666666666, + "grad_norm": 0.3404496729556034, + "learning_rate": 0.00010479218072961892, + "loss": 0.6603, + "step": 1876 + }, + { + "epoch": 0.5005333333333334, + "grad_norm": 0.34784316782713803, + "learning_rate": 0.00010470589951216882, + "loss": 0.6372, + "step": 1877 + }, + { + "epoch": 0.5008, + "grad_norm": 0.3468042077926638, + "learning_rate": 0.00010461961478351711, + "loss": 0.6162, + "step": 1878 + }, + { + "epoch": 0.5010666666666667, + "grad_norm": 0.3441622189263432, + "learning_rate": 0.00010453332660804327, + "loss": 0.5968, + "step": 1879 + }, + { + "epoch": 0.5013333333333333, + "grad_norm": 0.36661564652960343, + "learning_rate": 0.0001044470350501292, + "loss": 0.6095, + "step": 1880 + }, + { + "epoch": 0.5016, + "grad_norm": 0.3461679275941061, + "learning_rate": 0.00010436074017415947, + "loss": 0.6265, + "step": 1881 + }, + { + "epoch": 0.5018666666666667, + "grad_norm": 0.3409084709412113, + "learning_rate": 0.00010427444204452103, + "loss": 0.5959, + "step": 1882 + }, + { + "epoch": 0.5021333333333333, + "grad_norm": 0.3719425555366416, + "learning_rate": 0.00010418814072560337, + "loss": 0.6138, + "step": 1883 + }, + { + "epoch": 0.5024, + "grad_norm": 0.33504316439542964, + "learning_rate": 0.00010410183628179822, + "loss": 0.6303, + "step": 1884 + }, + { + "epoch": 0.5026666666666667, + "grad_norm": 0.3589984051590842, + "learning_rate": 0.00010401552877749973, + "loss": 0.5878, + "step": 1885 + }, + { + "epoch": 0.5029333333333333, + "grad_norm": 0.3561547910470736, + "learning_rate": 0.00010392921827710432, + "loss": 0.6125, + "step": 1886 + }, + { + "epoch": 0.5032, + "grad_norm": 0.3381142821458892, + "learning_rate": 0.00010384290484501064, + "loss": 0.5954, + "step": 1887 + }, + { + "epoch": 0.5034666666666666, + "grad_norm": 0.3514335414345356, + "learning_rate": 0.00010375658854561952, + "loss": 0.5902, + "step": 1888 + }, + { + "epoch": 0.5037333333333334, + "grad_norm": 0.3316766610681647, + "learning_rate": 0.00010367026944333391, + "loss": 0.6192, + "step": 1889 + }, + { + "epoch": 0.504, + "grad_norm": 0.33754447514834407, + "learning_rate": 0.00010358394760255892, + "loss": 0.6267, + "step": 1890 + }, + { + "epoch": 0.5042666666666666, + "grad_norm": 0.3392906742883365, + "learning_rate": 0.00010349762308770163, + "loss": 0.6284, + "step": 1891 + }, + { + "epoch": 0.5045333333333333, + "grad_norm": 0.3303627343601473, + "learning_rate": 0.00010341129596317114, + "loss": 0.5755, + "step": 1892 + }, + { + "epoch": 0.5048, + "grad_norm": 0.39575470525642015, + "learning_rate": 0.00010332496629337854, + "loss": 0.6271, + "step": 1893 + }, + { + "epoch": 0.5050666666666667, + "grad_norm": 0.3605586831239872, + "learning_rate": 0.00010323863414273674, + "loss": 0.6174, + "step": 1894 + }, + { + "epoch": 0.5053333333333333, + "grad_norm": 0.35256415973273353, + "learning_rate": 0.00010315229957566059, + "loss": 0.615, + "step": 1895 + }, + { + "epoch": 0.5056, + "grad_norm": 0.3424145120737014, + "learning_rate": 0.00010306596265656663, + "loss": 0.6222, + "step": 1896 + }, + { + "epoch": 0.5058666666666667, + "grad_norm": 0.3445887350666713, + "learning_rate": 0.00010297962344987326, + "loss": 0.6396, + "step": 1897 + }, + { + "epoch": 0.5061333333333333, + "grad_norm": 0.3629993823914323, + "learning_rate": 0.00010289328202000055, + "loss": 0.6737, + "step": 1898 + }, + { + "epoch": 0.5064, + "grad_norm": 0.3665736331797972, + "learning_rate": 0.00010280693843137019, + "loss": 0.6367, + "step": 1899 + }, + { + "epoch": 0.5066666666666667, + "grad_norm": 0.34092626926986974, + "learning_rate": 0.00010272059274840555, + "loss": 0.5757, + "step": 1900 + }, + { + "epoch": 0.5069333333333333, + "grad_norm": 0.3412486703966795, + "learning_rate": 0.00010263424503553155, + "loss": 0.5863, + "step": 1901 + }, + { + "epoch": 0.5072, + "grad_norm": 0.3546342606220995, + "learning_rate": 0.00010254789535717456, + "loss": 0.6288, + "step": 1902 + }, + { + "epoch": 0.5074666666666666, + "grad_norm": 0.3469896420351848, + "learning_rate": 0.00010246154377776246, + "loss": 0.5865, + "step": 1903 + }, + { + "epoch": 0.5077333333333334, + "grad_norm": 0.32606735504146694, + "learning_rate": 0.00010237519036172459, + "loss": 0.6053, + "step": 1904 + }, + { + "epoch": 0.508, + "grad_norm": 0.3155514996696917, + "learning_rate": 0.00010228883517349154, + "loss": 0.5819, + "step": 1905 + }, + { + "epoch": 0.5082666666666666, + "grad_norm": 0.36122419725229143, + "learning_rate": 0.0001022024782774954, + "loss": 0.6305, + "step": 1906 + }, + { + "epoch": 0.5085333333333333, + "grad_norm": 0.3758738629272625, + "learning_rate": 0.0001021161197381694, + "loss": 0.5897, + "step": 1907 + }, + { + "epoch": 0.5088, + "grad_norm": 0.3668727531150211, + "learning_rate": 0.00010202975961994798, + "loss": 0.6725, + "step": 1908 + }, + { + "epoch": 0.5090666666666667, + "grad_norm": 0.3683290010864964, + "learning_rate": 0.00010194339798726684, + "loss": 0.6397, + "step": 1909 + }, + { + "epoch": 0.5093333333333333, + "grad_norm": 0.36378539233678114, + "learning_rate": 0.0001018570349045628, + "loss": 0.5792, + "step": 1910 + }, + { + "epoch": 0.5096, + "grad_norm": 0.35361595482702723, + "learning_rate": 0.00010177067043627375, + "loss": 0.6149, + "step": 1911 + }, + { + "epoch": 0.5098666666666667, + "grad_norm": 0.351717486992583, + "learning_rate": 0.00010168430464683856, + "loss": 0.6001, + "step": 1912 + }, + { + "epoch": 0.5101333333333333, + "grad_norm": 0.3599675743587252, + "learning_rate": 0.00010159793760069715, + "loss": 0.5971, + "step": 1913 + }, + { + "epoch": 0.5104, + "grad_norm": 0.3554409815569149, + "learning_rate": 0.0001015115693622904, + "loss": 0.6021, + "step": 1914 + }, + { + "epoch": 0.5106666666666667, + "grad_norm": 0.3653117929623009, + "learning_rate": 0.00010142519999605997, + "loss": 0.6292, + "step": 1915 + }, + { + "epoch": 0.5109333333333334, + "grad_norm": 0.34022426543298023, + "learning_rate": 0.00010133882956644846, + "loss": 0.5937, + "step": 1916 + }, + { + "epoch": 0.5112, + "grad_norm": 0.3498122610654033, + "learning_rate": 0.00010125245813789923, + "loss": 0.6385, + "step": 1917 + }, + { + "epoch": 0.5114666666666666, + "grad_norm": 0.3498140175570313, + "learning_rate": 0.0001011660857748564, + "loss": 0.6576, + "step": 1918 + }, + { + "epoch": 0.5117333333333334, + "grad_norm": 0.36061234115656216, + "learning_rate": 0.00010107971254176475, + "loss": 0.6691, + "step": 1919 + }, + { + "epoch": 0.512, + "grad_norm": 0.36038800741636573, + "learning_rate": 0.00010099333850306977, + "loss": 0.6582, + "step": 1920 + }, + { + "epoch": 0.5122666666666666, + "grad_norm": 0.3186736897666428, + "learning_rate": 0.0001009069637232175, + "loss": 0.5633, + "step": 1921 + }, + { + "epoch": 0.5125333333333333, + "grad_norm": 0.3465212424096694, + "learning_rate": 0.00010082058826665457, + "loss": 0.5948, + "step": 1922 + }, + { + "epoch": 0.5128, + "grad_norm": 0.34003543567676386, + "learning_rate": 0.00010073421219782804, + "loss": 0.6234, + "step": 1923 + }, + { + "epoch": 0.5130666666666667, + "grad_norm": 0.34016776681136945, + "learning_rate": 0.00010064783558118552, + "loss": 0.6, + "step": 1924 + }, + { + "epoch": 0.5133333333333333, + "grad_norm": 0.3581297499766351, + "learning_rate": 0.00010056145848117497, + "loss": 0.6047, + "step": 1925 + }, + { + "epoch": 0.5136, + "grad_norm": 0.3514092449630491, + "learning_rate": 0.00010047508096224476, + "loss": 0.5896, + "step": 1926 + }, + { + "epoch": 0.5138666666666667, + "grad_norm": 0.3242359772997438, + "learning_rate": 0.0001003887030888435, + "loss": 0.6051, + "step": 1927 + }, + { + "epoch": 0.5141333333333333, + "grad_norm": 0.34380597190413853, + "learning_rate": 0.00010030232492542014, + "loss": 0.5986, + "step": 1928 + }, + { + "epoch": 0.5144, + "grad_norm": 0.3431258525131172, + "learning_rate": 0.00010021594653642379, + "loss": 0.6541, + "step": 1929 + }, + { + "epoch": 0.5146666666666667, + "grad_norm": 0.34331857592804965, + "learning_rate": 0.0001001295679863038, + "loss": 0.6492, + "step": 1930 + }, + { + "epoch": 0.5149333333333334, + "grad_norm": 0.33973108768418797, + "learning_rate": 0.00010004318933950953, + "loss": 0.6008, + "step": 1931 + }, + { + "epoch": 0.5152, + "grad_norm": 0.3446940487143982, + "learning_rate": 9.99568106604905e-05, + "loss": 0.6176, + "step": 1932 + }, + { + "epoch": 0.5154666666666666, + "grad_norm": 0.37330428433218943, + "learning_rate": 9.987043201369622e-05, + "loss": 0.6337, + "step": 1933 + }, + { + "epoch": 0.5157333333333334, + "grad_norm": 0.3396927795752247, + "learning_rate": 9.978405346357621e-05, + "loss": 0.621, + "step": 1934 + }, + { + "epoch": 0.516, + "grad_norm": 0.34460019902033295, + "learning_rate": 9.969767507457987e-05, + "loss": 0.5934, + "step": 1935 + }, + { + "epoch": 0.5162666666666667, + "grad_norm": 0.3490440959282949, + "learning_rate": 9.961129691115653e-05, + "loss": 0.6572, + "step": 1936 + }, + { + "epoch": 0.5165333333333333, + "grad_norm": 0.34407504648219744, + "learning_rate": 9.952491903775529e-05, + "loss": 0.581, + "step": 1937 + }, + { + "epoch": 0.5168, + "grad_norm": 0.34365704470410735, + "learning_rate": 9.943854151882505e-05, + "loss": 0.59, + "step": 1938 + }, + { + "epoch": 0.5170666666666667, + "grad_norm": 0.3582436374601057, + "learning_rate": 9.935216441881451e-05, + "loss": 0.6189, + "step": 1939 + }, + { + "epoch": 0.5173333333333333, + "grad_norm": 0.3845217823198548, + "learning_rate": 9.926578780217199e-05, + "loss": 0.6089, + "step": 1940 + }, + { + "epoch": 0.5176, + "grad_norm": 0.32116870741455084, + "learning_rate": 9.917941173334545e-05, + "loss": 0.5844, + "step": 1941 + }, + { + "epoch": 0.5178666666666667, + "grad_norm": 0.34636988073483993, + "learning_rate": 9.909303627678249e-05, + "loss": 0.5766, + "step": 1942 + }, + { + "epoch": 0.5181333333333333, + "grad_norm": 0.35704539747501596, + "learning_rate": 9.900666149693022e-05, + "loss": 0.623, + "step": 1943 + }, + { + "epoch": 0.5184, + "grad_norm": 0.3714349645043707, + "learning_rate": 9.892028745823526e-05, + "loss": 0.6401, + "step": 1944 + }, + { + "epoch": 0.5186666666666667, + "grad_norm": 0.3255639284042641, + "learning_rate": 9.883391422514362e-05, + "loss": 0.5781, + "step": 1945 + }, + { + "epoch": 0.5189333333333334, + "grad_norm": 0.3353133665007348, + "learning_rate": 9.874754186210078e-05, + "loss": 0.612, + "step": 1946 + }, + { + "epoch": 0.5192, + "grad_norm": 0.3669824930961404, + "learning_rate": 9.866117043355156e-05, + "loss": 0.6301, + "step": 1947 + }, + { + "epoch": 0.5194666666666666, + "grad_norm": 0.3342869403864703, + "learning_rate": 9.857480000394006e-05, + "loss": 0.5668, + "step": 1948 + }, + { + "epoch": 0.5197333333333334, + "grad_norm": 0.3921780213818484, + "learning_rate": 9.848843063770962e-05, + "loss": 0.587, + "step": 1949 + }, + { + "epoch": 0.52, + "grad_norm": 0.35479788707660537, + "learning_rate": 9.840206239930286e-05, + "loss": 0.5925, + "step": 1950 + }, + { + "epoch": 0.5202666666666667, + "grad_norm": 0.3401804529833413, + "learning_rate": 9.831569535316144e-05, + "loss": 0.6009, + "step": 1951 + }, + { + "epoch": 0.5205333333333333, + "grad_norm": 0.35423663734328054, + "learning_rate": 9.82293295637263e-05, + "loss": 0.576, + "step": 1952 + }, + { + "epoch": 0.5208, + "grad_norm": 0.3476262381142906, + "learning_rate": 9.814296509543724e-05, + "loss": 0.5986, + "step": 1953 + }, + { + "epoch": 0.5210666666666667, + "grad_norm": 0.38158077018818914, + "learning_rate": 9.805660201273317e-05, + "loss": 0.7074, + "step": 1954 + }, + { + "epoch": 0.5213333333333333, + "grad_norm": 0.32643123888065656, + "learning_rate": 9.797024038005204e-05, + "loss": 0.5948, + "step": 1955 + }, + { + "epoch": 0.5216, + "grad_norm": 0.34154734922813573, + "learning_rate": 9.788388026183063e-05, + "loss": 0.6104, + "step": 1956 + }, + { + "epoch": 0.5218666666666667, + "grad_norm": 0.33051183124715117, + "learning_rate": 9.779752172250461e-05, + "loss": 0.5731, + "step": 1957 + }, + { + "epoch": 0.5221333333333333, + "grad_norm": 0.3345259852415471, + "learning_rate": 9.771116482650844e-05, + "loss": 0.5393, + "step": 1958 + }, + { + "epoch": 0.5224, + "grad_norm": 0.3403340455917885, + "learning_rate": 9.762480963827546e-05, + "loss": 0.5962, + "step": 1959 + }, + { + "epoch": 0.5226666666666666, + "grad_norm": 0.34059588784734357, + "learning_rate": 9.753845622223757e-05, + "loss": 0.6118, + "step": 1960 + }, + { + "epoch": 0.5229333333333334, + "grad_norm": 0.3486927682382407, + "learning_rate": 9.745210464282548e-05, + "loss": 0.6159, + "step": 1961 + }, + { + "epoch": 0.5232, + "grad_norm": 0.34613355506330856, + "learning_rate": 9.736575496446848e-05, + "loss": 0.593, + "step": 1962 + }, + { + "epoch": 0.5234666666666666, + "grad_norm": 0.3269411928179518, + "learning_rate": 9.727940725159446e-05, + "loss": 0.5678, + "step": 1963 + }, + { + "epoch": 0.5237333333333334, + "grad_norm": 0.3321455675862325, + "learning_rate": 9.719306156862982e-05, + "loss": 0.5968, + "step": 1964 + }, + { + "epoch": 0.524, + "grad_norm": 0.33771013399845196, + "learning_rate": 9.710671797999946e-05, + "loss": 0.6129, + "step": 1965 + }, + { + "epoch": 0.5242666666666667, + "grad_norm": 0.3460757106477492, + "learning_rate": 9.702037655012675e-05, + "loss": 0.6263, + "step": 1966 + }, + { + "epoch": 0.5245333333333333, + "grad_norm": 0.33148582820188244, + "learning_rate": 9.693403734343342e-05, + "loss": 0.5713, + "step": 1967 + }, + { + "epoch": 0.5248, + "grad_norm": 0.349995296297823, + "learning_rate": 9.684770042433946e-05, + "loss": 0.6109, + "step": 1968 + }, + { + "epoch": 0.5250666666666667, + "grad_norm": 0.3618880675955362, + "learning_rate": 9.676136585726328e-05, + "loss": 0.6668, + "step": 1969 + }, + { + "epoch": 0.5253333333333333, + "grad_norm": 0.3560408216584042, + "learning_rate": 9.667503370662148e-05, + "loss": 0.6268, + "step": 1970 + }, + { + "epoch": 0.5256, + "grad_norm": 0.3340972286533265, + "learning_rate": 9.658870403682888e-05, + "loss": 0.5956, + "step": 1971 + }, + { + "epoch": 0.5258666666666667, + "grad_norm": 0.3556410848409868, + "learning_rate": 9.65023769122984e-05, + "loss": 0.6361, + "step": 1972 + }, + { + "epoch": 0.5261333333333333, + "grad_norm": 0.36388608450533694, + "learning_rate": 9.64160523974411e-05, + "loss": 0.6372, + "step": 1973 + }, + { + "epoch": 0.5264, + "grad_norm": 0.3608893800242963, + "learning_rate": 9.632973055666611e-05, + "loss": 0.6066, + "step": 1974 + }, + { + "epoch": 0.5266666666666666, + "grad_norm": 0.3542234429582837, + "learning_rate": 9.624341145438053e-05, + "loss": 0.6021, + "step": 1975 + }, + { + "epoch": 0.5269333333333334, + "grad_norm": 0.3646966851994873, + "learning_rate": 9.615709515498939e-05, + "loss": 0.6431, + "step": 1976 + }, + { + "epoch": 0.5272, + "grad_norm": 0.3567920875456539, + "learning_rate": 9.607078172289569e-05, + "loss": 0.6043, + "step": 1977 + }, + { + "epoch": 0.5274666666666666, + "grad_norm": 0.3455903642761181, + "learning_rate": 9.598447122250029e-05, + "loss": 0.5914, + "step": 1978 + }, + { + "epoch": 0.5277333333333334, + "grad_norm": 0.3350469040038476, + "learning_rate": 9.589816371820179e-05, + "loss": 0.6445, + "step": 1979 + }, + { + "epoch": 0.528, + "grad_norm": 0.33382522064086734, + "learning_rate": 9.581185927439665e-05, + "loss": 0.6278, + "step": 1980 + }, + { + "epoch": 0.5282666666666667, + "grad_norm": 0.34817099587173167, + "learning_rate": 9.572555795547896e-05, + "loss": 0.5998, + "step": 1981 + }, + { + "epoch": 0.5285333333333333, + "grad_norm": 0.34045618454381377, + "learning_rate": 9.563925982584054e-05, + "loss": 0.6376, + "step": 1982 + }, + { + "epoch": 0.5288, + "grad_norm": 0.37164786998146804, + "learning_rate": 9.555296494987083e-05, + "loss": 0.6673, + "step": 1983 + }, + { + "epoch": 0.5290666666666667, + "grad_norm": 0.34117451599058546, + "learning_rate": 9.546667339195678e-05, + "loss": 0.6478, + "step": 1984 + }, + { + "epoch": 0.5293333333333333, + "grad_norm": 0.32393108934758247, + "learning_rate": 9.53803852164829e-05, + "loss": 0.6027, + "step": 1985 + }, + { + "epoch": 0.5296, + "grad_norm": 0.3563116464426639, + "learning_rate": 9.529410048783119e-05, + "loss": 0.6435, + "step": 1986 + }, + { + "epoch": 0.5298666666666667, + "grad_norm": 0.33473710068350665, + "learning_rate": 9.520781927038111e-05, + "loss": 0.6069, + "step": 1987 + }, + { + "epoch": 0.5301333333333333, + "grad_norm": 0.3528834640270041, + "learning_rate": 9.51215416285094e-05, + "loss": 0.6205, + "step": 1988 + }, + { + "epoch": 0.5304, + "grad_norm": 0.3465769571711552, + "learning_rate": 9.503526762659023e-05, + "loss": 0.6531, + "step": 1989 + }, + { + "epoch": 0.5306666666666666, + "grad_norm": 0.33870023059505733, + "learning_rate": 9.4948997328995e-05, + "loss": 0.5928, + "step": 1990 + }, + { + "epoch": 0.5309333333333334, + "grad_norm": 0.3408493516549377, + "learning_rate": 9.486273080009238e-05, + "loss": 0.5853, + "step": 1991 + }, + { + "epoch": 0.5312, + "grad_norm": 0.3429692651375918, + "learning_rate": 9.47764681042482e-05, + "loss": 0.63, + "step": 1992 + }, + { + "epoch": 0.5314666666666666, + "grad_norm": 0.3318651483214718, + "learning_rate": 9.46902093058254e-05, + "loss": 0.5924, + "step": 1993 + }, + { + "epoch": 0.5317333333333333, + "grad_norm": 0.3586569129780557, + "learning_rate": 9.460395446918412e-05, + "loss": 0.6612, + "step": 1994 + }, + { + "epoch": 0.532, + "grad_norm": 0.36558670340223703, + "learning_rate": 9.451770365868143e-05, + "loss": 0.6199, + "step": 1995 + }, + { + "epoch": 0.5322666666666667, + "grad_norm": 0.3570630351766155, + "learning_rate": 9.443145693867145e-05, + "loss": 0.612, + "step": 1996 + }, + { + "epoch": 0.5325333333333333, + "grad_norm": 0.3442960505629333, + "learning_rate": 9.434521437350525e-05, + "loss": 0.5844, + "step": 1997 + }, + { + "epoch": 0.5328, + "grad_norm": 0.3410095445432704, + "learning_rate": 9.425897602753083e-05, + "loss": 0.5924, + "step": 1998 + }, + { + "epoch": 0.5330666666666667, + "grad_norm": 0.34181677698600293, + "learning_rate": 9.417274196509289e-05, + "loss": 0.6081, + "step": 1999 + }, + { + "epoch": 0.5333333333333333, + "grad_norm": 0.33956914009237654, + "learning_rate": 9.408651225053314e-05, + "loss": 0.5726, + "step": 2000 + }, + { + "epoch": 0.5336, + "grad_norm": 0.3298280497269895, + "learning_rate": 9.400028694818992e-05, + "loss": 0.6169, + "step": 2001 + }, + { + "epoch": 0.5338666666666667, + "grad_norm": 0.344816811209219, + "learning_rate": 9.39140661223983e-05, + "loss": 0.5984, + "step": 2002 + }, + { + "epoch": 0.5341333333333333, + "grad_norm": 0.3372514333415848, + "learning_rate": 9.382784983749005e-05, + "loss": 0.5868, + "step": 2003 + }, + { + "epoch": 0.5344, + "grad_norm": 0.3474693000685869, + "learning_rate": 9.37416381577935e-05, + "loss": 0.5806, + "step": 2004 + }, + { + "epoch": 0.5346666666666666, + "grad_norm": 0.35926899358715614, + "learning_rate": 9.365543114763357e-05, + "loss": 0.6578, + "step": 2005 + }, + { + "epoch": 0.5349333333333334, + "grad_norm": 0.3271872494616578, + "learning_rate": 9.356922887133173e-05, + "loss": 0.6303, + "step": 2006 + }, + { + "epoch": 0.5352, + "grad_norm": 0.33368318359185056, + "learning_rate": 9.34830313932058e-05, + "loss": 0.5801, + "step": 2007 + }, + { + "epoch": 0.5354666666666666, + "grad_norm": 0.3394449024892054, + "learning_rate": 9.339683877757014e-05, + "loss": 0.6069, + "step": 2008 + }, + { + "epoch": 0.5357333333333333, + "grad_norm": 0.340512022100245, + "learning_rate": 9.331065108873543e-05, + "loss": 0.6031, + "step": 2009 + }, + { + "epoch": 0.536, + "grad_norm": 0.3544515933008161, + "learning_rate": 9.322446839100869e-05, + "loss": 0.6796, + "step": 2010 + }, + { + "epoch": 0.5362666666666667, + "grad_norm": 0.34520332137560916, + "learning_rate": 9.313829074869323e-05, + "loss": 0.6136, + "step": 2011 + }, + { + "epoch": 0.5365333333333333, + "grad_norm": 0.3278816100917621, + "learning_rate": 9.305211822608856e-05, + "loss": 0.5916, + "step": 2012 + }, + { + "epoch": 0.5368, + "grad_norm": 0.3388663985330351, + "learning_rate": 9.296595088749036e-05, + "loss": 0.6183, + "step": 2013 + }, + { + "epoch": 0.5370666666666667, + "grad_norm": 0.4020491790596033, + "learning_rate": 9.287978879719053e-05, + "loss": 0.6291, + "step": 2014 + }, + { + "epoch": 0.5373333333333333, + "grad_norm": 0.5624815817379968, + "learning_rate": 9.279363201947689e-05, + "loss": 0.6057, + "step": 2015 + }, + { + "epoch": 0.5376, + "grad_norm": 0.35953229762593675, + "learning_rate": 9.270748061863344e-05, + "loss": 0.6232, + "step": 2016 + }, + { + "epoch": 0.5378666666666667, + "grad_norm": 0.32891363978155613, + "learning_rate": 9.262133465894009e-05, + "loss": 0.593, + "step": 2017 + }, + { + "epoch": 0.5381333333333334, + "grad_norm": 0.3518251086921895, + "learning_rate": 9.253519420467275e-05, + "loss": 0.622, + "step": 2018 + }, + { + "epoch": 0.5384, + "grad_norm": 0.3643982126011602, + "learning_rate": 9.244905932010319e-05, + "loss": 0.6277, + "step": 2019 + }, + { + "epoch": 0.5386666666666666, + "grad_norm": 0.3413844994342637, + "learning_rate": 9.2362930069499e-05, + "loss": 0.6549, + "step": 2020 + }, + { + "epoch": 0.5389333333333334, + "grad_norm": 0.33484175894642043, + "learning_rate": 9.227680651712362e-05, + "loss": 0.5621, + "step": 2021 + }, + { + "epoch": 0.5392, + "grad_norm": 0.3308715436992413, + "learning_rate": 9.219068872723625e-05, + "loss": 0.5836, + "step": 2022 + }, + { + "epoch": 0.5394666666666666, + "grad_norm": 0.35692259824731815, + "learning_rate": 9.210457676409167e-05, + "loss": 0.6251, + "step": 2023 + }, + { + "epoch": 0.5397333333333333, + "grad_norm": 0.3375423674679303, + "learning_rate": 9.201847069194043e-05, + "loss": 0.6206, + "step": 2024 + }, + { + "epoch": 0.54, + "grad_norm": 0.41866905304563634, + "learning_rate": 9.193237057502864e-05, + "loss": 0.6342, + "step": 2025 + }, + { + "epoch": 0.5402666666666667, + "grad_norm": 0.334719998891011, + "learning_rate": 9.184627647759799e-05, + "loss": 0.6084, + "step": 2026 + }, + { + "epoch": 0.5405333333333333, + "grad_norm": 0.3385689320503021, + "learning_rate": 9.176018846388565e-05, + "loss": 0.6281, + "step": 2027 + }, + { + "epoch": 0.5408, + "grad_norm": 0.36206757130750844, + "learning_rate": 9.167410659812428e-05, + "loss": 0.6346, + "step": 2028 + }, + { + "epoch": 0.5410666666666667, + "grad_norm": 0.3541779180060866, + "learning_rate": 9.158803094454192e-05, + "loss": 0.5989, + "step": 2029 + }, + { + "epoch": 0.5413333333333333, + "grad_norm": 0.3251690401987799, + "learning_rate": 9.150196156736203e-05, + "loss": 0.6215, + "step": 2030 + }, + { + "epoch": 0.5416, + "grad_norm": 0.3453497082769123, + "learning_rate": 9.14158985308033e-05, + "loss": 0.6428, + "step": 2031 + }, + { + "epoch": 0.5418666666666667, + "grad_norm": 0.3165127728007757, + "learning_rate": 9.132984189907975e-05, + "loss": 0.5871, + "step": 2032 + }, + { + "epoch": 0.5421333333333334, + "grad_norm": 0.37383934724822754, + "learning_rate": 9.124379173640064e-05, + "loss": 0.5954, + "step": 2033 + }, + { + "epoch": 0.5424, + "grad_norm": 0.343389799835649, + "learning_rate": 9.115774810697034e-05, + "loss": 0.5961, + "step": 2034 + }, + { + "epoch": 0.5426666666666666, + "grad_norm": 0.3576745888356298, + "learning_rate": 9.107171107498838e-05, + "loss": 0.6831, + "step": 2035 + }, + { + "epoch": 0.5429333333333334, + "grad_norm": 0.3600900883948002, + "learning_rate": 9.09856807046494e-05, + "loss": 0.6257, + "step": 2036 + }, + { + "epoch": 0.5432, + "grad_norm": 0.36104787294082813, + "learning_rate": 9.089965706014301e-05, + "loss": 0.5985, + "step": 2037 + }, + { + "epoch": 0.5434666666666667, + "grad_norm": 0.3541612746079401, + "learning_rate": 9.081364020565383e-05, + "loss": 0.5988, + "step": 2038 + }, + { + "epoch": 0.5437333333333333, + "grad_norm": 0.37535187068876136, + "learning_rate": 9.07276302053614e-05, + "loss": 0.6278, + "step": 2039 + }, + { + "epoch": 0.544, + "grad_norm": 0.3421339200864777, + "learning_rate": 9.064162712344015e-05, + "loss": 0.5847, + "step": 2040 + }, + { + "epoch": 0.5442666666666667, + "grad_norm": 0.3403665462535796, + "learning_rate": 9.05556310240594e-05, + "loss": 0.619, + "step": 2041 + }, + { + "epoch": 0.5445333333333333, + "grad_norm": 0.3393418599217958, + "learning_rate": 9.046964197138316e-05, + "loss": 0.6417, + "step": 2042 + }, + { + "epoch": 0.5448, + "grad_norm": 0.33337811114109767, + "learning_rate": 9.038366002957028e-05, + "loss": 0.566, + "step": 2043 + }, + { + "epoch": 0.5450666666666667, + "grad_norm": 0.33068332423472757, + "learning_rate": 9.029768526277424e-05, + "loss": 0.6178, + "step": 2044 + }, + { + "epoch": 0.5453333333333333, + "grad_norm": 0.34597647082344846, + "learning_rate": 9.02117177351432e-05, + "loss": 0.6075, + "step": 2045 + }, + { + "epoch": 0.5456, + "grad_norm": 0.343550031589001, + "learning_rate": 9.012575751081991e-05, + "loss": 0.6398, + "step": 2046 + }, + { + "epoch": 0.5458666666666666, + "grad_norm": 0.33303193389503644, + "learning_rate": 9.003980465394165e-05, + "loss": 0.5492, + "step": 2047 + }, + { + "epoch": 0.5461333333333334, + "grad_norm": 0.3638930536720473, + "learning_rate": 8.995385922864021e-05, + "loss": 0.5737, + "step": 2048 + }, + { + "epoch": 0.5464, + "grad_norm": 0.3469533174425106, + "learning_rate": 8.986792129904186e-05, + "loss": 0.6275, + "step": 2049 + }, + { + "epoch": 0.5466666666666666, + "grad_norm": 0.3377209067699381, + "learning_rate": 8.978199092926727e-05, + "loss": 0.5673, + "step": 2050 + }, + { + "epoch": 0.5469333333333334, + "grad_norm": 0.34722544607728356, + "learning_rate": 8.969606818343147e-05, + "loss": 0.6335, + "step": 2051 + }, + { + "epoch": 0.5472, + "grad_norm": 0.31719714612185756, + "learning_rate": 8.961015312564377e-05, + "loss": 0.5459, + "step": 2052 + }, + { + "epoch": 0.5474666666666667, + "grad_norm": 0.3330824647927296, + "learning_rate": 8.952424582000783e-05, + "loss": 0.6243, + "step": 2053 + }, + { + "epoch": 0.5477333333333333, + "grad_norm": 0.347092057709546, + "learning_rate": 8.943834633062136e-05, + "loss": 0.6217, + "step": 2054 + }, + { + "epoch": 0.548, + "grad_norm": 0.3284139348614484, + "learning_rate": 8.935245472157639e-05, + "loss": 0.602, + "step": 2055 + }, + { + "epoch": 0.5482666666666667, + "grad_norm": 0.3402004932097726, + "learning_rate": 8.926657105695903e-05, + "loss": 0.5998, + "step": 2056 + }, + { + "epoch": 0.5485333333333333, + "grad_norm": 0.315831128436594, + "learning_rate": 8.918069540084946e-05, + "loss": 0.5496, + "step": 2057 + }, + { + "epoch": 0.5488, + "grad_norm": 0.3376938423670618, + "learning_rate": 8.909482781732186e-05, + "loss": 0.6063, + "step": 2058 + }, + { + "epoch": 0.5490666666666667, + "grad_norm": 0.39995440990181685, + "learning_rate": 8.900896837044442e-05, + "loss": 0.6427, + "step": 2059 + }, + { + "epoch": 0.5493333333333333, + "grad_norm": 0.33405138270560814, + "learning_rate": 8.892311712427923e-05, + "loss": 0.59, + "step": 2060 + }, + { + "epoch": 0.5496, + "grad_norm": 0.37310467131968905, + "learning_rate": 8.883727414288235e-05, + "loss": 0.5921, + "step": 2061 + }, + { + "epoch": 0.5498666666666666, + "grad_norm": 0.34440676481003174, + "learning_rate": 8.875143949030346e-05, + "loss": 0.6549, + "step": 2062 + }, + { + "epoch": 0.5501333333333334, + "grad_norm": 0.34814487185459875, + "learning_rate": 8.866561323058627e-05, + "loss": 0.5869, + "step": 2063 + }, + { + "epoch": 0.5504, + "grad_norm": 0.35022252482919664, + "learning_rate": 8.857979542776808e-05, + "loss": 0.5877, + "step": 2064 + }, + { + "epoch": 0.5506666666666666, + "grad_norm": 0.37179646007477163, + "learning_rate": 8.849398614587993e-05, + "loss": 0.6344, + "step": 2065 + }, + { + "epoch": 0.5509333333333334, + "grad_norm": 0.33317276543374524, + "learning_rate": 8.840818544894648e-05, + "loss": 0.5623, + "step": 2066 + }, + { + "epoch": 0.5512, + "grad_norm": 0.37191598469618325, + "learning_rate": 8.832239340098605e-05, + "loss": 0.6293, + "step": 2067 + }, + { + "epoch": 0.5514666666666667, + "grad_norm": 0.3789732328180415, + "learning_rate": 8.823661006601042e-05, + "loss": 0.6597, + "step": 2068 + }, + { + "epoch": 0.5517333333333333, + "grad_norm": 0.36215763819533425, + "learning_rate": 8.815083550802495e-05, + "loss": 0.5935, + "step": 2069 + }, + { + "epoch": 0.552, + "grad_norm": 0.32754612704059777, + "learning_rate": 8.806506979102834e-05, + "loss": 0.6207, + "step": 2070 + }, + { + "epoch": 0.5522666666666667, + "grad_norm": 0.3499504458622124, + "learning_rate": 8.797931297901276e-05, + "loss": 0.6029, + "step": 2071 + }, + { + "epoch": 0.5525333333333333, + "grad_norm": 0.32985846583801026, + "learning_rate": 8.789356513596379e-05, + "loss": 0.5792, + "step": 2072 + }, + { + "epoch": 0.5528, + "grad_norm": 0.35501097287953565, + "learning_rate": 8.780782632586023e-05, + "loss": 0.6215, + "step": 2073 + }, + { + "epoch": 0.5530666666666667, + "grad_norm": 0.3641781892214901, + "learning_rate": 8.772209661267418e-05, + "loss": 0.5936, + "step": 2074 + }, + { + "epoch": 0.5533333333333333, + "grad_norm": 0.3497094810195038, + "learning_rate": 8.763637606037097e-05, + "loss": 0.5721, + "step": 2075 + }, + { + "epoch": 0.5536, + "grad_norm": 0.35925238926527614, + "learning_rate": 8.755066473290904e-05, + "loss": 0.6171, + "step": 2076 + }, + { + "epoch": 0.5538666666666666, + "grad_norm": 0.37807311052864306, + "learning_rate": 8.746496269423999e-05, + "loss": 0.6154, + "step": 2077 + }, + { + "epoch": 0.5541333333333334, + "grad_norm": 0.327883030793197, + "learning_rate": 8.737927000830848e-05, + "loss": 0.5958, + "step": 2078 + }, + { + "epoch": 0.5544, + "grad_norm": 0.36752250218691057, + "learning_rate": 8.729358673905218e-05, + "loss": 0.6579, + "step": 2079 + }, + { + "epoch": 0.5546666666666666, + "grad_norm": 0.33357704669776383, + "learning_rate": 8.720791295040175e-05, + "loss": 0.5738, + "step": 2080 + }, + { + "epoch": 0.5549333333333333, + "grad_norm": 0.35818551778045515, + "learning_rate": 8.712224870628077e-05, + "loss": 0.6508, + "step": 2081 + }, + { + "epoch": 0.5552, + "grad_norm": 0.3289981692128152, + "learning_rate": 8.703659407060571e-05, + "loss": 0.5745, + "step": 2082 + }, + { + "epoch": 0.5554666666666667, + "grad_norm": 0.3341807345759168, + "learning_rate": 8.695094910728583e-05, + "loss": 0.5978, + "step": 2083 + }, + { + "epoch": 0.5557333333333333, + "grad_norm": 0.351362014077988, + "learning_rate": 8.686531388022325e-05, + "loss": 0.6684, + "step": 2084 + }, + { + "epoch": 0.556, + "grad_norm": 0.3325468561360648, + "learning_rate": 8.677968845331274e-05, + "loss": 0.5998, + "step": 2085 + }, + { + "epoch": 0.5562666666666667, + "grad_norm": 0.35532846758096376, + "learning_rate": 8.66940728904418e-05, + "loss": 0.582, + "step": 2086 + }, + { + "epoch": 0.5565333333333333, + "grad_norm": 0.34063513501893017, + "learning_rate": 8.660846725549056e-05, + "loss": 0.621, + "step": 2087 + }, + { + "epoch": 0.5568, + "grad_norm": 0.33280622979296764, + "learning_rate": 8.652287161233178e-05, + "loss": 0.5744, + "step": 2088 + }, + { + "epoch": 0.5570666666666667, + "grad_norm": 0.3347037433758992, + "learning_rate": 8.64372860248307e-05, + "loss": 0.6376, + "step": 2089 + }, + { + "epoch": 0.5573333333333333, + "grad_norm": 0.3507905705327085, + "learning_rate": 8.635171055684511e-05, + "loss": 0.6011, + "step": 2090 + }, + { + "epoch": 0.5576, + "grad_norm": 0.3438632021902035, + "learning_rate": 8.626614527222523e-05, + "loss": 0.604, + "step": 2091 + }, + { + "epoch": 0.5578666666666666, + "grad_norm": 0.3424938685333037, + "learning_rate": 8.618059023481368e-05, + "loss": 0.6155, + "step": 2092 + }, + { + "epoch": 0.5581333333333334, + "grad_norm": 0.32896797668196776, + "learning_rate": 8.609504550844542e-05, + "loss": 0.5793, + "step": 2093 + }, + { + "epoch": 0.5584, + "grad_norm": 0.33911151122399524, + "learning_rate": 8.600951115694775e-05, + "loss": 0.601, + "step": 2094 + }, + { + "epoch": 0.5586666666666666, + "grad_norm": 0.3334465272246177, + "learning_rate": 8.592398724414021e-05, + "loss": 0.5915, + "step": 2095 + }, + { + "epoch": 0.5589333333333333, + "grad_norm": 0.3460845438683898, + "learning_rate": 8.583847383383454e-05, + "loss": 0.6313, + "step": 2096 + }, + { + "epoch": 0.5592, + "grad_norm": 0.33897001285361567, + "learning_rate": 8.575297098983468e-05, + "loss": 0.5973, + "step": 2097 + }, + { + "epoch": 0.5594666666666667, + "grad_norm": 0.3377652251865536, + "learning_rate": 8.566747877593665e-05, + "loss": 0.5772, + "step": 2098 + }, + { + "epoch": 0.5597333333333333, + "grad_norm": 0.3509326998534721, + "learning_rate": 8.558199725592855e-05, + "loss": 0.6192, + "step": 2099 + }, + { + "epoch": 0.56, + "grad_norm": 0.3350008388768927, + "learning_rate": 8.549652649359053e-05, + "loss": 0.6165, + "step": 2100 + }, + { + "epoch": 0.5602666666666667, + "grad_norm": 0.35185785923586005, + "learning_rate": 8.541106655269464e-05, + "loss": 0.6589, + "step": 2101 + }, + { + "epoch": 0.5605333333333333, + "grad_norm": 0.3641117264412489, + "learning_rate": 8.532561749700493e-05, + "loss": 0.6001, + "step": 2102 + }, + { + "epoch": 0.5608, + "grad_norm": 0.3624688666585318, + "learning_rate": 8.524017939027728e-05, + "loss": 0.6389, + "step": 2103 + }, + { + "epoch": 0.5610666666666667, + "grad_norm": 0.3367140167577722, + "learning_rate": 8.515475229625946e-05, + "loss": 0.6106, + "step": 2104 + }, + { + "epoch": 0.5613333333333334, + "grad_norm": 0.36035269416188537, + "learning_rate": 8.506933627869095e-05, + "loss": 0.598, + "step": 2105 + }, + { + "epoch": 0.5616, + "grad_norm": 0.3704014589770776, + "learning_rate": 8.4983931401303e-05, + "loss": 0.5981, + "step": 2106 + }, + { + "epoch": 0.5618666666666666, + "grad_norm": 0.35576048619854, + "learning_rate": 8.489853772781857e-05, + "loss": 0.6635, + "step": 2107 + }, + { + "epoch": 0.5621333333333334, + "grad_norm": 0.3411775664349861, + "learning_rate": 8.481315532195227e-05, + "loss": 0.5762, + "step": 2108 + }, + { + "epoch": 0.5624, + "grad_norm": 0.361268949194407, + "learning_rate": 8.47277842474102e-05, + "loss": 0.6241, + "step": 2109 + }, + { + "epoch": 0.5626666666666666, + "grad_norm": 0.3383370950428697, + "learning_rate": 8.464242456789009e-05, + "loss": 0.6006, + "step": 2110 + }, + { + "epoch": 0.5629333333333333, + "grad_norm": 0.33067600513284773, + "learning_rate": 8.455707634708115e-05, + "loss": 0.5979, + "step": 2111 + }, + { + "epoch": 0.5632, + "grad_norm": 0.3372671832014587, + "learning_rate": 8.447173964866408e-05, + "loss": 0.6467, + "step": 2112 + }, + { + "epoch": 0.5634666666666667, + "grad_norm": 0.3517759381980163, + "learning_rate": 8.438641453631093e-05, + "loss": 0.6049, + "step": 2113 + }, + { + "epoch": 0.5637333333333333, + "grad_norm": 0.33766488973058173, + "learning_rate": 8.430110107368513e-05, + "loss": 0.6178, + "step": 2114 + }, + { + "epoch": 0.564, + "grad_norm": 0.33149697066517503, + "learning_rate": 8.421579932444145e-05, + "loss": 0.5679, + "step": 2115 + }, + { + "epoch": 0.5642666666666667, + "grad_norm": 0.35216142432794484, + "learning_rate": 8.41305093522258e-05, + "loss": 0.6352, + "step": 2116 + }, + { + "epoch": 0.5645333333333333, + "grad_norm": 0.33587153980527745, + "learning_rate": 8.40452312206754e-05, + "loss": 0.6041, + "step": 2117 + }, + { + "epoch": 0.5648, + "grad_norm": 0.4621106400250287, + "learning_rate": 8.395996499341866e-05, + "loss": 0.6434, + "step": 2118 + }, + { + "epoch": 0.5650666666666667, + "grad_norm": 0.34685449138666086, + "learning_rate": 8.387471073407503e-05, + "loss": 0.627, + "step": 2119 + }, + { + "epoch": 0.5653333333333334, + "grad_norm": 0.3250697411138274, + "learning_rate": 8.378946850625509e-05, + "loss": 0.5911, + "step": 2120 + }, + { + "epoch": 0.5656, + "grad_norm": 0.33521477785009896, + "learning_rate": 8.37042383735604e-05, + "loss": 0.616, + "step": 2121 + }, + { + "epoch": 0.5658666666666666, + "grad_norm": 0.3556731304369497, + "learning_rate": 8.361902039958355e-05, + "loss": 0.626, + "step": 2122 + }, + { + "epoch": 0.5661333333333334, + "grad_norm": 0.35989902149630715, + "learning_rate": 8.353381464790805e-05, + "loss": 0.6547, + "step": 2123 + }, + { + "epoch": 0.5664, + "grad_norm": 0.3633470323367122, + "learning_rate": 8.344862118210817e-05, + "loss": 0.6093, + "step": 2124 + }, + { + "epoch": 0.5666666666666667, + "grad_norm": 0.3643828983267928, + "learning_rate": 8.336344006574916e-05, + "loss": 0.6388, + "step": 2125 + }, + { + "epoch": 0.5669333333333333, + "grad_norm": 0.36020589860079016, + "learning_rate": 8.3278271362387e-05, + "loss": 0.5954, + "step": 2126 + }, + { + "epoch": 0.5672, + "grad_norm": 0.36464990739190356, + "learning_rate": 8.319311513556841e-05, + "loss": 0.6031, + "step": 2127 + }, + { + "epoch": 0.5674666666666667, + "grad_norm": 0.3459771593329154, + "learning_rate": 8.310797144883082e-05, + "loss": 0.6408, + "step": 2128 + }, + { + "epoch": 0.5677333333333333, + "grad_norm": 0.34126907606757023, + "learning_rate": 8.302284036570224e-05, + "loss": 0.591, + "step": 2129 + }, + { + "epoch": 0.568, + "grad_norm": 0.34335754276681585, + "learning_rate": 8.293772194970138e-05, + "loss": 0.5769, + "step": 2130 + }, + { + "epoch": 0.5682666666666667, + "grad_norm": 0.3357509163461733, + "learning_rate": 8.285261626433742e-05, + "loss": 0.5576, + "step": 2131 + }, + { + "epoch": 0.5685333333333333, + "grad_norm": 0.3680848040216543, + "learning_rate": 8.276752337311006e-05, + "loss": 0.6542, + "step": 2132 + }, + { + "epoch": 0.5688, + "grad_norm": 0.3587450751389266, + "learning_rate": 8.268244333950942e-05, + "loss": 0.5724, + "step": 2133 + }, + { + "epoch": 0.5690666666666667, + "grad_norm": 0.32975765110137656, + "learning_rate": 8.259737622701613e-05, + "loss": 0.5774, + "step": 2134 + }, + { + "epoch": 0.5693333333333334, + "grad_norm": 0.35280175254086077, + "learning_rate": 8.251232209910105e-05, + "loss": 0.5782, + "step": 2135 + }, + { + "epoch": 0.5696, + "grad_norm": 0.32817665526899037, + "learning_rate": 8.242728101922547e-05, + "loss": 0.551, + "step": 2136 + }, + { + "epoch": 0.5698666666666666, + "grad_norm": 0.34550670703489517, + "learning_rate": 8.234225305084084e-05, + "loss": 0.5816, + "step": 2137 + }, + { + "epoch": 0.5701333333333334, + "grad_norm": 0.33027750539426437, + "learning_rate": 8.22572382573889e-05, + "loss": 0.6058, + "step": 2138 + }, + { + "epoch": 0.5704, + "grad_norm": 0.34543802263874496, + "learning_rate": 8.217223670230157e-05, + "loss": 0.6288, + "step": 2139 + }, + { + "epoch": 0.5706666666666667, + "grad_norm": 0.3164432690107782, + "learning_rate": 8.208724844900078e-05, + "loss": 0.5525, + "step": 2140 + }, + { + "epoch": 0.5709333333333333, + "grad_norm": 0.3591546450447273, + "learning_rate": 8.200227356089864e-05, + "loss": 0.6046, + "step": 2141 + }, + { + "epoch": 0.5712, + "grad_norm": 0.3461150712033162, + "learning_rate": 8.19173121013973e-05, + "loss": 0.6221, + "step": 2142 + }, + { + "epoch": 0.5714666666666667, + "grad_norm": 0.3540330483209408, + "learning_rate": 8.183236413388881e-05, + "loss": 0.5995, + "step": 2143 + }, + { + "epoch": 0.5717333333333333, + "grad_norm": 0.3434326905690918, + "learning_rate": 8.174742972175522e-05, + "loss": 0.558, + "step": 2144 + }, + { + "epoch": 0.572, + "grad_norm": 0.3546826651709987, + "learning_rate": 8.166250892836842e-05, + "loss": 0.6195, + "step": 2145 + }, + { + "epoch": 0.5722666666666667, + "grad_norm": 0.3438081514830633, + "learning_rate": 8.157760181709018e-05, + "loss": 0.5784, + "step": 2146 + }, + { + "epoch": 0.5725333333333333, + "grad_norm": 0.36915836141721475, + "learning_rate": 8.149270845127205e-05, + "loss": 0.6277, + "step": 2147 + }, + { + "epoch": 0.5728, + "grad_norm": 0.34105981741670754, + "learning_rate": 8.140782889425526e-05, + "loss": 0.6143, + "step": 2148 + }, + { + "epoch": 0.5730666666666666, + "grad_norm": 0.3540664120500646, + "learning_rate": 8.132296320937086e-05, + "loss": 0.6192, + "step": 2149 + }, + { + "epoch": 0.5733333333333334, + "grad_norm": 0.3532922710017726, + "learning_rate": 8.123811145993942e-05, + "loss": 0.6151, + "step": 2150 + }, + { + "epoch": 0.5736, + "grad_norm": 0.3370465509535198, + "learning_rate": 8.115327370927122e-05, + "loss": 0.5744, + "step": 2151 + }, + { + "epoch": 0.5738666666666666, + "grad_norm": 0.34363340810194093, + "learning_rate": 8.106845002066603e-05, + "loss": 0.5826, + "step": 2152 + }, + { + "epoch": 0.5741333333333334, + "grad_norm": 0.34154527633045134, + "learning_rate": 8.098364045741313e-05, + "loss": 0.5349, + "step": 2153 + }, + { + "epoch": 0.5744, + "grad_norm": 0.3488485033367598, + "learning_rate": 8.089884508279135e-05, + "loss": 0.5748, + "step": 2154 + }, + { + "epoch": 0.5746666666666667, + "grad_norm": 0.3464583346288359, + "learning_rate": 8.081406396006877e-05, + "loss": 0.5931, + "step": 2155 + }, + { + "epoch": 0.5749333333333333, + "grad_norm": 0.3430607709714394, + "learning_rate": 8.072929715250293e-05, + "loss": 0.6386, + "step": 2156 + }, + { + "epoch": 0.5752, + "grad_norm": 0.4560599387658039, + "learning_rate": 8.064454472334076e-05, + "loss": 0.5888, + "step": 2157 + }, + { + "epoch": 0.5754666666666667, + "grad_norm": 0.3496810619046592, + "learning_rate": 8.055980673581832e-05, + "loss": 0.6148, + "step": 2158 + }, + { + "epoch": 0.5757333333333333, + "grad_norm": 0.3494353089823202, + "learning_rate": 8.047508325316102e-05, + "loss": 0.5859, + "step": 2159 + }, + { + "epoch": 0.576, + "grad_norm": 0.3651351507114001, + "learning_rate": 8.039037433858335e-05, + "loss": 0.6171, + "step": 2160 + }, + { + "epoch": 0.5762666666666667, + "grad_norm": 0.35926794960992114, + "learning_rate": 8.030568005528898e-05, + "loss": 0.5852, + "step": 2161 + }, + { + "epoch": 0.5765333333333333, + "grad_norm": 0.3622318625788015, + "learning_rate": 8.02210004664707e-05, + "loss": 0.5518, + "step": 2162 + }, + { + "epoch": 0.5768, + "grad_norm": 0.334224128949714, + "learning_rate": 8.01363356353102e-05, + "loss": 0.5938, + "step": 2163 + }, + { + "epoch": 0.5770666666666666, + "grad_norm": 0.3597879159381735, + "learning_rate": 8.00516856249783e-05, + "loss": 0.5784, + "step": 2164 + }, + { + "epoch": 0.5773333333333334, + "grad_norm": 0.3518818743418102, + "learning_rate": 7.996705049863471e-05, + "loss": 0.6022, + "step": 2165 + }, + { + "epoch": 0.5776, + "grad_norm": 0.4021362296104498, + "learning_rate": 7.9882430319428e-05, + "loss": 0.5951, + "step": 2166 + }, + { + "epoch": 0.5778666666666666, + "grad_norm": 0.3488767362508129, + "learning_rate": 7.979782515049567e-05, + "loss": 0.5988, + "step": 2167 + }, + { + "epoch": 0.5781333333333334, + "grad_norm": 0.35626717974817695, + "learning_rate": 7.971323505496398e-05, + "loss": 0.622, + "step": 2168 + }, + { + "epoch": 0.5784, + "grad_norm": 0.3382006285897414, + "learning_rate": 7.96286600959479e-05, + "loss": 0.5864, + "step": 2169 + }, + { + "epoch": 0.5786666666666667, + "grad_norm": 0.34574066685301846, + "learning_rate": 7.95441003365512e-05, + "loss": 0.5302, + "step": 2170 + }, + { + "epoch": 0.5789333333333333, + "grad_norm": 0.3403052469166525, + "learning_rate": 7.945955583986617e-05, + "loss": 0.6384, + "step": 2171 + }, + { + "epoch": 0.5792, + "grad_norm": 0.3612333352239432, + "learning_rate": 7.937502666897382e-05, + "loss": 0.6007, + "step": 2172 + }, + { + "epoch": 0.5794666666666667, + "grad_norm": 0.3389929214890273, + "learning_rate": 7.929051288694374e-05, + "loss": 0.5707, + "step": 2173 + }, + { + "epoch": 0.5797333333333333, + "grad_norm": 0.3398935765805274, + "learning_rate": 7.920601455683394e-05, + "loss": 0.5797, + "step": 2174 + }, + { + "epoch": 0.58, + "grad_norm": 0.36536918064987484, + "learning_rate": 7.912153174169099e-05, + "loss": 0.6521, + "step": 2175 + }, + { + "epoch": 0.5802666666666667, + "grad_norm": 0.3435878045685125, + "learning_rate": 7.903706450454986e-05, + "loss": 0.5548, + "step": 2176 + }, + { + "epoch": 0.5805333333333333, + "grad_norm": 0.363125914272866, + "learning_rate": 7.895261290843386e-05, + "loss": 0.6277, + "step": 2177 + }, + { + "epoch": 0.5808, + "grad_norm": 0.3328225446299371, + "learning_rate": 7.886817701635472e-05, + "loss": 0.5689, + "step": 2178 + }, + { + "epoch": 0.5810666666666666, + "grad_norm": 0.3475643192861449, + "learning_rate": 7.878375689131232e-05, + "loss": 0.619, + "step": 2179 + }, + { + "epoch": 0.5813333333333334, + "grad_norm": 0.351111945046791, + "learning_rate": 7.869935259629485e-05, + "loss": 0.5851, + "step": 2180 + }, + { + "epoch": 0.5816, + "grad_norm": 0.3874561839635163, + "learning_rate": 7.861496419427872e-05, + "loss": 0.6405, + "step": 2181 + }, + { + "epoch": 0.5818666666666666, + "grad_norm": 0.3528461362308666, + "learning_rate": 7.853059174822844e-05, + "loss": 0.6063, + "step": 2182 + }, + { + "epoch": 0.5821333333333333, + "grad_norm": 0.3368603956170215, + "learning_rate": 7.844623532109662e-05, + "loss": 0.6353, + "step": 2183 + }, + { + "epoch": 0.5824, + "grad_norm": 0.36051823714180337, + "learning_rate": 7.836189497582391e-05, + "loss": 0.5983, + "step": 2184 + }, + { + "epoch": 0.5826666666666667, + "grad_norm": 0.3574779139123841, + "learning_rate": 7.827757077533899e-05, + "loss": 0.5696, + "step": 2185 + }, + { + "epoch": 0.5829333333333333, + "grad_norm": 0.3647603543025058, + "learning_rate": 7.819326278255848e-05, + "loss": 0.6073, + "step": 2186 + }, + { + "epoch": 0.5832, + "grad_norm": 0.32516747922540085, + "learning_rate": 7.810897106038686e-05, + "loss": 0.56, + "step": 2187 + }, + { + "epoch": 0.5834666666666667, + "grad_norm": 0.343663514568444, + "learning_rate": 7.802469567171655e-05, + "loss": 0.574, + "step": 2188 + }, + { + "epoch": 0.5837333333333333, + "grad_norm": 0.5104045087718205, + "learning_rate": 7.794043667942771e-05, + "loss": 0.6023, + "step": 2189 + }, + { + "epoch": 0.584, + "grad_norm": 0.3855855411473251, + "learning_rate": 7.785619414638835e-05, + "loss": 0.6187, + "step": 2190 + }, + { + "epoch": 0.5842666666666667, + "grad_norm": 0.3827575440516394, + "learning_rate": 7.777196813545413e-05, + "loss": 0.6275, + "step": 2191 + }, + { + "epoch": 0.5845333333333333, + "grad_norm": 0.3516594315954644, + "learning_rate": 7.768775870946837e-05, + "loss": 0.5734, + "step": 2192 + }, + { + "epoch": 0.5848, + "grad_norm": 0.3658062669534862, + "learning_rate": 7.760356593126211e-05, + "loss": 0.6167, + "step": 2193 + }, + { + "epoch": 0.5850666666666666, + "grad_norm": 0.35939595039550526, + "learning_rate": 7.751938986365385e-05, + "loss": 0.6175, + "step": 2194 + }, + { + "epoch": 0.5853333333333334, + "grad_norm": 0.3600678656083925, + "learning_rate": 7.743523056944972e-05, + "loss": 0.6298, + "step": 2195 + }, + { + "epoch": 0.5856, + "grad_norm": 0.3414781212657749, + "learning_rate": 7.735108811144326e-05, + "loss": 0.6289, + "step": 2196 + }, + { + "epoch": 0.5858666666666666, + "grad_norm": 0.33506940917704947, + "learning_rate": 7.72669625524155e-05, + "loss": 0.5857, + "step": 2197 + }, + { + "epoch": 0.5861333333333333, + "grad_norm": 0.35039664016573224, + "learning_rate": 7.718285395513484e-05, + "loss": 0.6238, + "step": 2198 + }, + { + "epoch": 0.5864, + "grad_norm": 0.34707107508845275, + "learning_rate": 7.709876238235703e-05, + "loss": 0.621, + "step": 2199 + }, + { + "epoch": 0.5866666666666667, + "grad_norm": 0.34285251310935566, + "learning_rate": 7.70146878968251e-05, + "loss": 0.588, + "step": 2200 + }, + { + "epoch": 0.5869333333333333, + "grad_norm": 0.3602041822763401, + "learning_rate": 7.693063056126942e-05, + "loss": 0.5894, + "step": 2201 + }, + { + "epoch": 0.5872, + "grad_norm": 0.38263370361777826, + "learning_rate": 7.684659043840737e-05, + "loss": 0.6241, + "step": 2202 + }, + { + "epoch": 0.5874666666666667, + "grad_norm": 0.33442803356311723, + "learning_rate": 7.67625675909437e-05, + "loss": 0.6041, + "step": 2203 + }, + { + "epoch": 0.5877333333333333, + "grad_norm": 0.34271316136905655, + "learning_rate": 7.66785620815701e-05, + "loss": 0.5695, + "step": 2204 + }, + { + "epoch": 0.588, + "grad_norm": 0.34271844060551837, + "learning_rate": 7.659457397296548e-05, + "loss": 0.5719, + "step": 2205 + }, + { + "epoch": 0.5882666666666667, + "grad_norm": 0.34074197334029466, + "learning_rate": 7.651060332779563e-05, + "loss": 0.5978, + "step": 2206 + }, + { + "epoch": 0.5885333333333334, + "grad_norm": 0.35728991493505907, + "learning_rate": 7.642665020871338e-05, + "loss": 0.6017, + "step": 2207 + }, + { + "epoch": 0.5888, + "grad_norm": 0.34520399217730874, + "learning_rate": 7.634271467835851e-05, + "loss": 0.5912, + "step": 2208 + }, + { + "epoch": 0.5890666666666666, + "grad_norm": 0.348476666315004, + "learning_rate": 7.625879679935763e-05, + "loss": 0.6316, + "step": 2209 + }, + { + "epoch": 0.5893333333333334, + "grad_norm": 0.36361657121293595, + "learning_rate": 7.617489663432413e-05, + "loss": 0.6447, + "step": 2210 + }, + { + "epoch": 0.5896, + "grad_norm": 0.3597637793311185, + "learning_rate": 7.609101424585825e-05, + "loss": 0.5939, + "step": 2211 + }, + { + "epoch": 0.5898666666666667, + "grad_norm": 0.3539296488483343, + "learning_rate": 7.6007149696547e-05, + "loss": 0.5899, + "step": 2212 + }, + { + "epoch": 0.5901333333333333, + "grad_norm": 0.34309453582511523, + "learning_rate": 7.592330304896403e-05, + "loss": 0.6185, + "step": 2213 + }, + { + "epoch": 0.5904, + "grad_norm": 0.3453739169812385, + "learning_rate": 7.58394743656696e-05, + "loss": 0.6203, + "step": 2214 + }, + { + "epoch": 0.5906666666666667, + "grad_norm": 0.35424522174748785, + "learning_rate": 7.575566370921066e-05, + "loss": 0.6111, + "step": 2215 + }, + { + "epoch": 0.5909333333333333, + "grad_norm": 0.35348887127339396, + "learning_rate": 7.567187114212061e-05, + "loss": 0.5487, + "step": 2216 + }, + { + "epoch": 0.5912, + "grad_norm": 0.3378762669998194, + "learning_rate": 7.558809672691947e-05, + "loss": 0.5828, + "step": 2217 + }, + { + "epoch": 0.5914666666666667, + "grad_norm": 0.33797924960148396, + "learning_rate": 7.550434052611355e-05, + "loss": 0.5678, + "step": 2218 + }, + { + "epoch": 0.5917333333333333, + "grad_norm": 0.3622053544874109, + "learning_rate": 7.54206026021957e-05, + "loss": 0.6029, + "step": 2219 + }, + { + "epoch": 0.592, + "grad_norm": 0.3487309719253174, + "learning_rate": 7.533688301764511e-05, + "loss": 0.618, + "step": 2220 + }, + { + "epoch": 0.5922666666666667, + "grad_norm": 0.36616511813036934, + "learning_rate": 7.525318183492726e-05, + "loss": 0.6237, + "step": 2221 + }, + { + "epoch": 0.5925333333333334, + "grad_norm": 0.4141903251524446, + "learning_rate": 7.516949911649391e-05, + "loss": 0.6252, + "step": 2222 + }, + { + "epoch": 0.5928, + "grad_norm": 0.34990643988554515, + "learning_rate": 7.50858349247831e-05, + "loss": 0.5596, + "step": 2223 + }, + { + "epoch": 0.5930666666666666, + "grad_norm": 0.3456055692611483, + "learning_rate": 7.500218932221892e-05, + "loss": 0.5629, + "step": 2224 + }, + { + "epoch": 0.5933333333333334, + "grad_norm": 0.34487246313600933, + "learning_rate": 7.491856237121175e-05, + "loss": 0.5487, + "step": 2225 + }, + { + "epoch": 0.5936, + "grad_norm": 0.34839659077446355, + "learning_rate": 7.483495413415788e-05, + "loss": 0.6425, + "step": 2226 + }, + { + "epoch": 0.5938666666666667, + "grad_norm": 0.33947982540845745, + "learning_rate": 7.475136467343978e-05, + "loss": 0.5738, + "step": 2227 + }, + { + "epoch": 0.5941333333333333, + "grad_norm": 0.3747753029085932, + "learning_rate": 7.46677940514258e-05, + "loss": 0.5928, + "step": 2228 + }, + { + "epoch": 0.5944, + "grad_norm": 0.33938024616010914, + "learning_rate": 7.458424233047036e-05, + "loss": 0.5793, + "step": 2229 + }, + { + "epoch": 0.5946666666666667, + "grad_norm": 0.33322286846915244, + "learning_rate": 7.450070957291366e-05, + "loss": 0.5961, + "step": 2230 + }, + { + "epoch": 0.5949333333333333, + "grad_norm": 0.33252742935298535, + "learning_rate": 7.441719584108181e-05, + "loss": 0.5845, + "step": 2231 + }, + { + "epoch": 0.5952, + "grad_norm": 0.3271666683868001, + "learning_rate": 7.433370119728673e-05, + "loss": 0.5775, + "step": 2232 + }, + { + "epoch": 0.5954666666666667, + "grad_norm": 0.36118602544458317, + "learning_rate": 7.425022570382605e-05, + "loss": 0.6596, + "step": 2233 + }, + { + "epoch": 0.5957333333333333, + "grad_norm": 0.3277043497038271, + "learning_rate": 7.416676942298314e-05, + "loss": 0.5679, + "step": 2234 + }, + { + "epoch": 0.596, + "grad_norm": 0.3274622490086294, + "learning_rate": 7.408333241702705e-05, + "loss": 0.5747, + "step": 2235 + }, + { + "epoch": 0.5962666666666666, + "grad_norm": 0.3506365359340622, + "learning_rate": 7.399991474821243e-05, + "loss": 0.6231, + "step": 2236 + }, + { + "epoch": 0.5965333333333334, + "grad_norm": 0.32990538416292725, + "learning_rate": 7.391651647877953e-05, + "loss": 0.5954, + "step": 2237 + }, + { + "epoch": 0.5968, + "grad_norm": 0.33803132565287664, + "learning_rate": 7.383313767095407e-05, + "loss": 0.5949, + "step": 2238 + }, + { + "epoch": 0.5970666666666666, + "grad_norm": 0.357174157769741, + "learning_rate": 7.374977838694729e-05, + "loss": 0.6293, + "step": 2239 + }, + { + "epoch": 0.5973333333333334, + "grad_norm": 0.3316359422424292, + "learning_rate": 7.366643868895588e-05, + "loss": 0.6013, + "step": 2240 + }, + { + "epoch": 0.5976, + "grad_norm": 0.31772140620864137, + "learning_rate": 7.358311863916187e-05, + "loss": 0.573, + "step": 2241 + }, + { + "epoch": 0.5978666666666667, + "grad_norm": 0.3404401326089692, + "learning_rate": 7.349981829973263e-05, + "loss": 0.59, + "step": 2242 + }, + { + "epoch": 0.5981333333333333, + "grad_norm": 0.38114877266967306, + "learning_rate": 7.341653773282085e-05, + "loss": 0.5803, + "step": 2243 + }, + { + "epoch": 0.5984, + "grad_norm": 0.3354450653736003, + "learning_rate": 7.333327700056449e-05, + "loss": 0.5496, + "step": 2244 + }, + { + "epoch": 0.5986666666666667, + "grad_norm": 0.3446261131581236, + "learning_rate": 7.325003616508666e-05, + "loss": 0.5994, + "step": 2245 + }, + { + "epoch": 0.5989333333333333, + "grad_norm": 0.35349347990876445, + "learning_rate": 7.316681528849566e-05, + "loss": 0.5803, + "step": 2246 + }, + { + "epoch": 0.5992, + "grad_norm": 0.3824222967935294, + "learning_rate": 7.308361443288488e-05, + "loss": 0.5958, + "step": 2247 + }, + { + "epoch": 0.5994666666666667, + "grad_norm": 0.3437914532851528, + "learning_rate": 7.300043366033278e-05, + "loss": 0.5816, + "step": 2248 + }, + { + "epoch": 0.5997333333333333, + "grad_norm": 0.36128960166606017, + "learning_rate": 7.29172730329028e-05, + "loss": 0.6084, + "step": 2249 + }, + { + "epoch": 0.6, + "grad_norm": 0.34836834337392336, + "learning_rate": 7.283413261264342e-05, + "loss": 0.6035, + "step": 2250 + }, + { + "epoch": 0.6002666666666666, + "grad_norm": 0.3329353260047149, + "learning_rate": 7.275101246158798e-05, + "loss": 0.6099, + "step": 2251 + }, + { + "epoch": 0.6005333333333334, + "grad_norm": 0.3844371500453326, + "learning_rate": 7.266791264175473e-05, + "loss": 0.6055, + "step": 2252 + }, + { + "epoch": 0.6008, + "grad_norm": 0.35335087867196224, + "learning_rate": 7.258483321514673e-05, + "loss": 0.575, + "step": 2253 + }, + { + "epoch": 0.6010666666666666, + "grad_norm": 0.42308983932303557, + "learning_rate": 7.250177424375186e-05, + "loss": 0.5996, + "step": 2254 + }, + { + "epoch": 0.6013333333333334, + "grad_norm": 0.32822661451514334, + "learning_rate": 7.241873578954271e-05, + "loss": 0.5797, + "step": 2255 + }, + { + "epoch": 0.6016, + "grad_norm": 0.35055550159750304, + "learning_rate": 7.233571791447656e-05, + "loss": 0.6495, + "step": 2256 + }, + { + "epoch": 0.6018666666666667, + "grad_norm": 0.3666828839852818, + "learning_rate": 7.225272068049531e-05, + "loss": 0.6483, + "step": 2257 + }, + { + "epoch": 0.6021333333333333, + "grad_norm": 0.35008777305166217, + "learning_rate": 7.216974414952551e-05, + "loss": 0.6263, + "step": 2258 + }, + { + "epoch": 0.6024, + "grad_norm": 0.3392558823037353, + "learning_rate": 7.208678838347824e-05, + "loss": 0.5879, + "step": 2259 + }, + { + "epoch": 0.6026666666666667, + "grad_norm": 0.34162997908650017, + "learning_rate": 7.200385344424908e-05, + "loss": 0.613, + "step": 2260 + }, + { + "epoch": 0.6029333333333333, + "grad_norm": 0.35444806707422305, + "learning_rate": 7.19209393937181e-05, + "loss": 0.6412, + "step": 2261 + }, + { + "epoch": 0.6032, + "grad_norm": 0.3529659060309609, + "learning_rate": 7.183804629374974e-05, + "loss": 0.6089, + "step": 2262 + }, + { + "epoch": 0.6034666666666667, + "grad_norm": 0.3399626933463322, + "learning_rate": 7.175517420619287e-05, + "loss": 0.5967, + "step": 2263 + }, + { + "epoch": 0.6037333333333333, + "grad_norm": 0.35725601136877355, + "learning_rate": 7.167232319288063e-05, + "loss": 0.632, + "step": 2264 + }, + { + "epoch": 0.604, + "grad_norm": 0.32530161909559985, + "learning_rate": 7.15894933156304e-05, + "loss": 0.5508, + "step": 2265 + }, + { + "epoch": 0.6042666666666666, + "grad_norm": 0.34345519645921707, + "learning_rate": 7.150668463624389e-05, + "loss": 0.6176, + "step": 2266 + }, + { + "epoch": 0.6045333333333334, + "grad_norm": 0.3209910978630537, + "learning_rate": 7.142389721650688e-05, + "loss": 0.5813, + "step": 2267 + }, + { + "epoch": 0.6048, + "grad_norm": 0.3396483762825362, + "learning_rate": 7.134113111818943e-05, + "loss": 0.572, + "step": 2268 + }, + { + "epoch": 0.6050666666666666, + "grad_norm": 0.3586371413833368, + "learning_rate": 7.125838640304559e-05, + "loss": 0.6457, + "step": 2269 + }, + { + "epoch": 0.6053333333333333, + "grad_norm": 0.3735000740871753, + "learning_rate": 7.117566313281345e-05, + "loss": 0.6003, + "step": 2270 + }, + { + "epoch": 0.6056, + "grad_norm": 0.35323876792442155, + "learning_rate": 7.109296136921515e-05, + "loss": 0.5995, + "step": 2271 + }, + { + "epoch": 0.6058666666666667, + "grad_norm": 0.33637128275139416, + "learning_rate": 7.101028117395681e-05, + "loss": 0.5906, + "step": 2272 + }, + { + "epoch": 0.6061333333333333, + "grad_norm": 0.3701141598599618, + "learning_rate": 7.092762260872828e-05, + "loss": 0.6305, + "step": 2273 + }, + { + "epoch": 0.6064, + "grad_norm": 0.38931078042726064, + "learning_rate": 7.084498573520348e-05, + "loss": 0.6676, + "step": 2274 + }, + { + "epoch": 0.6066666666666667, + "grad_norm": 0.3586114052701087, + "learning_rate": 7.076237061504007e-05, + "loss": 0.652, + "step": 2275 + }, + { + "epoch": 0.6069333333333333, + "grad_norm": 0.3336436269440386, + "learning_rate": 7.067977730987942e-05, + "loss": 0.6056, + "step": 2276 + }, + { + "epoch": 0.6072, + "grad_norm": 0.3484378648642681, + "learning_rate": 7.059720588134672e-05, + "loss": 0.5668, + "step": 2277 + }, + { + "epoch": 0.6074666666666667, + "grad_norm": 0.34757392437088325, + "learning_rate": 7.05146563910508e-05, + "loss": 0.5414, + "step": 2278 + }, + { + "epoch": 0.6077333333333333, + "grad_norm": 0.3451030747030425, + "learning_rate": 7.043212890058416e-05, + "loss": 0.5766, + "step": 2279 + }, + { + "epoch": 0.608, + "grad_norm": 0.336361264043929, + "learning_rate": 7.03496234715227e-05, + "loss": 0.5947, + "step": 2280 + }, + { + "epoch": 0.6082666666666666, + "grad_norm": 0.36027530875664027, + "learning_rate": 7.026714016542611e-05, + "loss": 0.5956, + "step": 2281 + }, + { + "epoch": 0.6085333333333334, + "grad_norm": 0.3652789805976303, + "learning_rate": 7.018467904383741e-05, + "loss": 0.6146, + "step": 2282 + }, + { + "epoch": 0.6088, + "grad_norm": 0.3397631443093309, + "learning_rate": 7.010224016828316e-05, + "loss": 0.5734, + "step": 2283 + }, + { + "epoch": 0.6090666666666666, + "grad_norm": 0.3496392467834162, + "learning_rate": 7.001982360027324e-05, + "loss": 0.6544, + "step": 2284 + }, + { + "epoch": 0.6093333333333333, + "grad_norm": 0.34710771111578964, + "learning_rate": 6.993742940130097e-05, + "loss": 0.5631, + "step": 2285 + }, + { + "epoch": 0.6096, + "grad_norm": 0.3282172916460941, + "learning_rate": 6.98550576328429e-05, + "loss": 0.6248, + "step": 2286 + }, + { + "epoch": 0.6098666666666667, + "grad_norm": 0.34740083714067665, + "learning_rate": 6.977270835635894e-05, + "loss": 0.6039, + "step": 2287 + }, + { + "epoch": 0.6101333333333333, + "grad_norm": 0.3450369928481761, + "learning_rate": 6.969038163329208e-05, + "loss": 0.5721, + "step": 2288 + }, + { + "epoch": 0.6104, + "grad_norm": 0.32838753095150724, + "learning_rate": 6.960807752506864e-05, + "loss": 0.5916, + "step": 2289 + }, + { + "epoch": 0.6106666666666667, + "grad_norm": 0.36201429084135534, + "learning_rate": 6.952579609309793e-05, + "loss": 0.597, + "step": 2290 + }, + { + "epoch": 0.6109333333333333, + "grad_norm": 0.3286486703288118, + "learning_rate": 6.94435373987724e-05, + "loss": 0.5988, + "step": 2291 + }, + { + "epoch": 0.6112, + "grad_norm": 0.3415838874239514, + "learning_rate": 6.936130150346758e-05, + "loss": 0.615, + "step": 2292 + }, + { + "epoch": 0.6114666666666667, + "grad_norm": 0.37671731762232635, + "learning_rate": 6.92790884685419e-05, + "loss": 0.5637, + "step": 2293 + }, + { + "epoch": 0.6117333333333334, + "grad_norm": 0.3337898209689772, + "learning_rate": 6.919689835533681e-05, + "loss": 0.5977, + "step": 2294 + }, + { + "epoch": 0.612, + "grad_norm": 0.3596518136966967, + "learning_rate": 6.91147312251766e-05, + "loss": 0.6102, + "step": 2295 + }, + { + "epoch": 0.6122666666666666, + "grad_norm": 0.34362279762458187, + "learning_rate": 6.903258713936843e-05, + "loss": 0.6262, + "step": 2296 + }, + { + "epoch": 0.6125333333333334, + "grad_norm": 0.3464009451685017, + "learning_rate": 6.895046615920229e-05, + "loss": 0.5993, + "step": 2297 + }, + { + "epoch": 0.6128, + "grad_norm": 0.35826322775097214, + "learning_rate": 6.88683683459509e-05, + "loss": 0.5708, + "step": 2298 + }, + { + "epoch": 0.6130666666666666, + "grad_norm": 0.3713496698997063, + "learning_rate": 6.878629376086969e-05, + "loss": 0.6001, + "step": 2299 + }, + { + "epoch": 0.6133333333333333, + "grad_norm": 0.3451019877473833, + "learning_rate": 6.870424246519682e-05, + "loss": 0.5785, + "step": 2300 + }, + { + "epoch": 0.6136, + "grad_norm": 0.3220426975261808, + "learning_rate": 6.862221452015298e-05, + "loss": 0.5986, + "step": 2301 + }, + { + "epoch": 0.6138666666666667, + "grad_norm": 0.3763352966281929, + "learning_rate": 6.854020998694152e-05, + "loss": 0.6406, + "step": 2302 + }, + { + "epoch": 0.6141333333333333, + "grad_norm": 0.34134138605415293, + "learning_rate": 6.845822892674829e-05, + "loss": 0.5792, + "step": 2303 + }, + { + "epoch": 0.6144, + "grad_norm": 0.38388924190673807, + "learning_rate": 6.837627140074159e-05, + "loss": 0.6398, + "step": 2304 + }, + { + "epoch": 0.6146666666666667, + "grad_norm": 0.34419290228058447, + "learning_rate": 6.829433747007221e-05, + "loss": 0.6127, + "step": 2305 + }, + { + "epoch": 0.6149333333333333, + "grad_norm": 0.3619163626067995, + "learning_rate": 6.821242719587331e-05, + "loss": 0.6138, + "step": 2306 + }, + { + "epoch": 0.6152, + "grad_norm": 0.33745554573796577, + "learning_rate": 6.813054063926044e-05, + "loss": 0.6227, + "step": 2307 + }, + { + "epoch": 0.6154666666666667, + "grad_norm": 0.33848523185725055, + "learning_rate": 6.804867786133137e-05, + "loss": 0.5876, + "step": 2308 + }, + { + "epoch": 0.6157333333333334, + "grad_norm": 0.34346243670470383, + "learning_rate": 6.796683892316623e-05, + "loss": 0.637, + "step": 2309 + }, + { + "epoch": 0.616, + "grad_norm": 0.36269220774445005, + "learning_rate": 6.788502388582727e-05, + "loss": 0.6377, + "step": 2310 + }, + { + "epoch": 0.6162666666666666, + "grad_norm": 0.33404604813329786, + "learning_rate": 6.780323281035903e-05, + "loss": 0.59, + "step": 2311 + }, + { + "epoch": 0.6165333333333334, + "grad_norm": 0.349935025702179, + "learning_rate": 6.772146575778795e-05, + "loss": 0.6061, + "step": 2312 + }, + { + "epoch": 0.6168, + "grad_norm": 0.34072266601086976, + "learning_rate": 6.76397227891228e-05, + "loss": 0.6095, + "step": 2313 + }, + { + "epoch": 0.6170666666666667, + "grad_norm": 0.3419996604542034, + "learning_rate": 6.755800396535423e-05, + "loss": 0.5907, + "step": 2314 + }, + { + "epoch": 0.6173333333333333, + "grad_norm": 0.3357700024502719, + "learning_rate": 6.747630934745491e-05, + "loss": 0.6371, + "step": 2315 + }, + { + "epoch": 0.6176, + "grad_norm": 0.3528200296628358, + "learning_rate": 6.739463899637945e-05, + "loss": 0.597, + "step": 2316 + }, + { + "epoch": 0.6178666666666667, + "grad_norm": 0.3504407344694739, + "learning_rate": 6.731299297306436e-05, + "loss": 0.6013, + "step": 2317 + }, + { + "epoch": 0.6181333333333333, + "grad_norm": 0.3408528689710479, + "learning_rate": 6.723137133842805e-05, + "loss": 0.6198, + "step": 2318 + }, + { + "epoch": 0.6184, + "grad_norm": 0.316655494416179, + "learning_rate": 6.714977415337058e-05, + "loss": 0.5578, + "step": 2319 + }, + { + "epoch": 0.6186666666666667, + "grad_norm": 0.34469447758237826, + "learning_rate": 6.706820147877388e-05, + "loss": 0.6225, + "step": 2320 + }, + { + "epoch": 0.6189333333333333, + "grad_norm": 0.31891601740953823, + "learning_rate": 6.698665337550161e-05, + "loss": 0.5802, + "step": 2321 + }, + { + "epoch": 0.6192, + "grad_norm": 0.34104339043253923, + "learning_rate": 6.690512990439902e-05, + "loss": 0.5901, + "step": 2322 + }, + { + "epoch": 0.6194666666666667, + "grad_norm": 0.34402374532188523, + "learning_rate": 6.682363112629308e-05, + "loss": 0.5689, + "step": 2323 + }, + { + "epoch": 0.6197333333333334, + "grad_norm": 0.347212354553933, + "learning_rate": 6.674215710199226e-05, + "loss": 0.6309, + "step": 2324 + }, + { + "epoch": 0.62, + "grad_norm": 0.3370658855896269, + "learning_rate": 6.666070789228655e-05, + "loss": 0.5521, + "step": 2325 + }, + { + "epoch": 0.6202666666666666, + "grad_norm": 0.3397399840731556, + "learning_rate": 6.657928355794752e-05, + "loss": 0.5665, + "step": 2326 + }, + { + "epoch": 0.6205333333333334, + "grad_norm": 0.35362450046341737, + "learning_rate": 6.649788415972804e-05, + "loss": 0.6259, + "step": 2327 + }, + { + "epoch": 0.6208, + "grad_norm": 0.37135044171802783, + "learning_rate": 6.641650975836248e-05, + "loss": 0.5936, + "step": 2328 + }, + { + "epoch": 0.6210666666666667, + "grad_norm": 0.3574490093025898, + "learning_rate": 6.633516041456654e-05, + "loss": 0.5939, + "step": 2329 + }, + { + "epoch": 0.6213333333333333, + "grad_norm": 0.34538946609485643, + "learning_rate": 6.625383618903718e-05, + "loss": 0.6016, + "step": 2330 + }, + { + "epoch": 0.6216, + "grad_norm": 0.3388382630145427, + "learning_rate": 6.617253714245268e-05, + "loss": 0.5736, + "step": 2331 + }, + { + "epoch": 0.6218666666666667, + "grad_norm": 0.3549451875907997, + "learning_rate": 6.609126333547249e-05, + "loss": 0.587, + "step": 2332 + }, + { + "epoch": 0.6221333333333333, + "grad_norm": 0.3698487517284332, + "learning_rate": 6.601001482873724e-05, + "loss": 0.6586, + "step": 2333 + }, + { + "epoch": 0.6224, + "grad_norm": 0.35090441455602456, + "learning_rate": 6.592879168286874e-05, + "loss": 0.6151, + "step": 2334 + }, + { + "epoch": 0.6226666666666667, + "grad_norm": 0.33661192496141673, + "learning_rate": 6.584759395846974e-05, + "loss": 0.5856, + "step": 2335 + }, + { + "epoch": 0.6229333333333333, + "grad_norm": 0.34836025792196795, + "learning_rate": 6.576642171612413e-05, + "loss": 0.5628, + "step": 2336 + }, + { + "epoch": 0.6232, + "grad_norm": 0.34069574890406146, + "learning_rate": 6.568527501639679e-05, + "loss": 0.6179, + "step": 2337 + }, + { + "epoch": 0.6234666666666666, + "grad_norm": 0.33665880063585896, + "learning_rate": 6.560415391983348e-05, + "loss": 0.5873, + "step": 2338 + }, + { + "epoch": 0.6237333333333334, + "grad_norm": 0.3583256944706876, + "learning_rate": 6.552305848696092e-05, + "loss": 0.6082, + "step": 2339 + }, + { + "epoch": 0.624, + "grad_norm": 0.33452834970612355, + "learning_rate": 6.544198877828662e-05, + "loss": 0.6091, + "step": 2340 + }, + { + "epoch": 0.6242666666666666, + "grad_norm": 0.3446645871496446, + "learning_rate": 6.536094485429897e-05, + "loss": 0.5962, + "step": 2341 + }, + { + "epoch": 0.6245333333333334, + "grad_norm": 0.3413265050364765, + "learning_rate": 6.527992677546706e-05, + "loss": 0.6085, + "step": 2342 + }, + { + "epoch": 0.6248, + "grad_norm": 0.31934875720828215, + "learning_rate": 6.51989346022407e-05, + "loss": 0.5955, + "step": 2343 + }, + { + "epoch": 0.6250666666666667, + "grad_norm": 0.36026896123073415, + "learning_rate": 6.51179683950504e-05, + "loss": 0.6107, + "step": 2344 + }, + { + "epoch": 0.6253333333333333, + "grad_norm": 0.35697524084038335, + "learning_rate": 6.503702821430728e-05, + "loss": 0.5941, + "step": 2345 + }, + { + "epoch": 0.6256, + "grad_norm": 0.35827259454318255, + "learning_rate": 6.495611412040306e-05, + "loss": 0.6597, + "step": 2346 + }, + { + "epoch": 0.6258666666666667, + "grad_norm": 0.3754157539262379, + "learning_rate": 6.487522617370996e-05, + "loss": 0.6106, + "step": 2347 + }, + { + "epoch": 0.6261333333333333, + "grad_norm": 0.33984543828264, + "learning_rate": 6.479436443458072e-05, + "loss": 0.5775, + "step": 2348 + }, + { + "epoch": 0.6264, + "grad_norm": 0.3281350978505545, + "learning_rate": 6.471352896334851e-05, + "loss": 0.571, + "step": 2349 + }, + { + "epoch": 0.6266666666666667, + "grad_norm": 0.32423939957163334, + "learning_rate": 6.463271982032695e-05, + "loss": 0.6119, + "step": 2350 + }, + { + "epoch": 0.6269333333333333, + "grad_norm": 0.36810359773927115, + "learning_rate": 6.45519370658099e-05, + "loss": 0.6534, + "step": 2351 + }, + { + "epoch": 0.6272, + "grad_norm": 0.3195658184081652, + "learning_rate": 6.447118076007165e-05, + "loss": 0.5435, + "step": 2352 + }, + { + "epoch": 0.6274666666666666, + "grad_norm": 0.33663837766452803, + "learning_rate": 6.43904509633667e-05, + "loss": 0.5974, + "step": 2353 + }, + { + "epoch": 0.6277333333333334, + "grad_norm": 0.34826678764415414, + "learning_rate": 6.43097477359298e-05, + "loss": 0.5899, + "step": 2354 + }, + { + "epoch": 0.628, + "grad_norm": 0.32103005144329616, + "learning_rate": 6.422907113797581e-05, + "loss": 0.6155, + "step": 2355 + }, + { + "epoch": 0.6282666666666666, + "grad_norm": 0.35378196416816504, + "learning_rate": 6.414842122969981e-05, + "loss": 0.6276, + "step": 2356 + }, + { + "epoch": 0.6285333333333334, + "grad_norm": 0.3810926164409607, + "learning_rate": 6.406779807127695e-05, + "loss": 0.5984, + "step": 2357 + }, + { + "epoch": 0.6288, + "grad_norm": 0.35133122231985336, + "learning_rate": 6.398720172286231e-05, + "loss": 0.6007, + "step": 2358 + }, + { + "epoch": 0.6290666666666667, + "grad_norm": 0.33961515311091645, + "learning_rate": 6.390663224459111e-05, + "loss": 0.608, + "step": 2359 + }, + { + "epoch": 0.6293333333333333, + "grad_norm": 0.34796322280463593, + "learning_rate": 6.382608969657846e-05, + "loss": 0.6124, + "step": 2360 + }, + { + "epoch": 0.6296, + "grad_norm": 0.3526323514364444, + "learning_rate": 6.374557413891936e-05, + "loss": 0.6259, + "step": 2361 + }, + { + "epoch": 0.6298666666666667, + "grad_norm": 0.33417535778932184, + "learning_rate": 6.36650856316887e-05, + "loss": 0.5916, + "step": 2362 + }, + { + "epoch": 0.6301333333333333, + "grad_norm": 0.32604666321789155, + "learning_rate": 6.35846242349412e-05, + "loss": 0.5737, + "step": 2363 + }, + { + "epoch": 0.6304, + "grad_norm": 0.32609304597025857, + "learning_rate": 6.350419000871129e-05, + "loss": 0.599, + "step": 2364 + }, + { + "epoch": 0.6306666666666667, + "grad_norm": 0.3449149497273829, + "learning_rate": 6.342378301301324e-05, + "loss": 0.5899, + "step": 2365 + }, + { + "epoch": 0.6309333333333333, + "grad_norm": 0.3589645806850102, + "learning_rate": 6.334340330784083e-05, + "loss": 0.6403, + "step": 2366 + }, + { + "epoch": 0.6312, + "grad_norm": 0.33156136343929354, + "learning_rate": 6.326305095316762e-05, + "loss": 0.5803, + "step": 2367 + }, + { + "epoch": 0.6314666666666666, + "grad_norm": 0.3275785959556299, + "learning_rate": 6.318272600894675e-05, + "loss": 0.5368, + "step": 2368 + }, + { + "epoch": 0.6317333333333334, + "grad_norm": 0.3601974862755085, + "learning_rate": 6.310242853511083e-05, + "loss": 0.633, + "step": 2369 + }, + { + "epoch": 0.632, + "grad_norm": 0.3422910202857035, + "learning_rate": 6.302215859157208e-05, + "loss": 0.562, + "step": 2370 + }, + { + "epoch": 0.6322666666666666, + "grad_norm": 0.38307555025222, + "learning_rate": 6.294191623822207e-05, + "loss": 0.6391, + "step": 2371 + }, + { + "epoch": 0.6325333333333333, + "grad_norm": 0.36061672282013274, + "learning_rate": 6.286170153493188e-05, + "loss": 0.687, + "step": 2372 + }, + { + "epoch": 0.6328, + "grad_norm": 0.35955909782102774, + "learning_rate": 6.278151454155192e-05, + "loss": 0.5953, + "step": 2373 + }, + { + "epoch": 0.6330666666666667, + "grad_norm": 0.3530964829173126, + "learning_rate": 6.270135531791187e-05, + "loss": 0.5876, + "step": 2374 + }, + { + "epoch": 0.6333333333333333, + "grad_norm": 0.3391126742582884, + "learning_rate": 6.262122392382075e-05, + "loss": 0.5696, + "step": 2375 + }, + { + "epoch": 0.6336, + "grad_norm": 0.36073389005568296, + "learning_rate": 6.254112041906683e-05, + "loss": 0.5911, + "step": 2376 + }, + { + "epoch": 0.6338666666666667, + "grad_norm": 0.3980083373201319, + "learning_rate": 6.246104486341753e-05, + "loss": 0.6094, + "step": 2377 + }, + { + "epoch": 0.6341333333333333, + "grad_norm": 0.32413698778242334, + "learning_rate": 6.238099731661942e-05, + "loss": 0.5743, + "step": 2378 + }, + { + "epoch": 0.6344, + "grad_norm": 0.3382939342418333, + "learning_rate": 6.230097783839825e-05, + "loss": 0.5994, + "step": 2379 + }, + { + "epoch": 0.6346666666666667, + "grad_norm": 0.3393203304566258, + "learning_rate": 6.22209864884587e-05, + "loss": 0.5709, + "step": 2380 + }, + { + "epoch": 0.6349333333333333, + "grad_norm": 0.33839393026082126, + "learning_rate": 6.21410233264846e-05, + "loss": 0.6083, + "step": 2381 + }, + { + "epoch": 0.6352, + "grad_norm": 0.3711960839308339, + "learning_rate": 6.206108841213856e-05, + "loss": 0.6469, + "step": 2382 + }, + { + "epoch": 0.6354666666666666, + "grad_norm": 0.341931410734462, + "learning_rate": 6.19811818050623e-05, + "loss": 0.5604, + "step": 2383 + }, + { + "epoch": 0.6357333333333334, + "grad_norm": 0.3247703605715169, + "learning_rate": 6.190130356487634e-05, + "loss": 0.5812, + "step": 2384 + }, + { + "epoch": 0.636, + "grad_norm": 0.35434578547863294, + "learning_rate": 6.182145375118002e-05, + "loss": 0.5554, + "step": 2385 + }, + { + "epoch": 0.6362666666666666, + "grad_norm": 0.3556056044213876, + "learning_rate": 6.17416324235515e-05, + "loss": 0.6131, + "step": 2386 + }, + { + "epoch": 0.6365333333333333, + "grad_norm": 0.3724091841904112, + "learning_rate": 6.16618396415477e-05, + "loss": 0.6079, + "step": 2387 + }, + { + "epoch": 0.6368, + "grad_norm": 0.34070475795863037, + "learning_rate": 6.158207546470421e-05, + "loss": 0.6456, + "step": 2388 + }, + { + "epoch": 0.6370666666666667, + "grad_norm": 0.33656276566350446, + "learning_rate": 6.150233995253527e-05, + "loss": 0.608, + "step": 2389 + }, + { + "epoch": 0.6373333333333333, + "grad_norm": 0.3605240523953494, + "learning_rate": 6.142263316453377e-05, + "loss": 0.5943, + "step": 2390 + }, + { + "epoch": 0.6376, + "grad_norm": 0.3453808744153834, + "learning_rate": 6.134295516017109e-05, + "loss": 0.6191, + "step": 2391 + }, + { + "epoch": 0.6378666666666667, + "grad_norm": 0.3117371998839096, + "learning_rate": 6.126330599889724e-05, + "loss": 0.5439, + "step": 2392 + }, + { + "epoch": 0.6381333333333333, + "grad_norm": 0.31898691886780756, + "learning_rate": 6.118368574014066e-05, + "loss": 0.5318, + "step": 2393 + }, + { + "epoch": 0.6384, + "grad_norm": 0.36728965219198434, + "learning_rate": 6.110409444330823e-05, + "loss": 0.608, + "step": 2394 + }, + { + "epoch": 0.6386666666666667, + "grad_norm": 0.35233809111531966, + "learning_rate": 6.1024532167785164e-05, + "loss": 0.5759, + "step": 2395 + }, + { + "epoch": 0.6389333333333334, + "grad_norm": 0.3344103431990208, + "learning_rate": 6.094499897293515e-05, + "loss": 0.5932, + "step": 2396 + }, + { + "epoch": 0.6392, + "grad_norm": 0.3337176651337381, + "learning_rate": 6.086549491810003e-05, + "loss": 0.5967, + "step": 2397 + }, + { + "epoch": 0.6394666666666666, + "grad_norm": 0.33263926634069596, + "learning_rate": 6.0786020062600016e-05, + "loss": 0.605, + "step": 2398 + }, + { + "epoch": 0.6397333333333334, + "grad_norm": 0.3923339162462857, + "learning_rate": 6.070657446573347e-05, + "loss": 0.6399, + "step": 2399 + }, + { + "epoch": 0.64, + "grad_norm": 0.33958658057735047, + "learning_rate": 6.062715818677696e-05, + "loss": 0.5974, + "step": 2400 + }, + { + "epoch": 0.6402666666666667, + "grad_norm": 0.3300127156212101, + "learning_rate": 6.054777128498515e-05, + "loss": 0.5568, + "step": 2401 + }, + { + "epoch": 0.6405333333333333, + "grad_norm": 0.3581778039589509, + "learning_rate": 6.046841381959082e-05, + "loss": 0.5886, + "step": 2402 + }, + { + "epoch": 0.6408, + "grad_norm": 0.32294944624189975, + "learning_rate": 6.038908584980476e-05, + "loss": 0.5943, + "step": 2403 + }, + { + "epoch": 0.6410666666666667, + "grad_norm": 0.348186094768716, + "learning_rate": 6.030978743481578e-05, + "loss": 0.6176, + "step": 2404 + }, + { + "epoch": 0.6413333333333333, + "grad_norm": 0.37498759929633807, + "learning_rate": 6.023051863379057e-05, + "loss": 0.6029, + "step": 2405 + }, + { + "epoch": 0.6416, + "grad_norm": 0.32591324421105833, + "learning_rate": 6.01512795058738e-05, + "loss": 0.5429, + "step": 2406 + }, + { + "epoch": 0.6418666666666667, + "grad_norm": 0.360212112950924, + "learning_rate": 6.007207011018796e-05, + "loss": 0.578, + "step": 2407 + }, + { + "epoch": 0.6421333333333333, + "grad_norm": 0.35859697448639555, + "learning_rate": 5.999289050583339e-05, + "loss": 0.5891, + "step": 2408 + }, + { + "epoch": 0.6424, + "grad_norm": 0.3415563326990654, + "learning_rate": 5.991374075188816e-05, + "loss": 0.5936, + "step": 2409 + }, + { + "epoch": 0.6426666666666667, + "grad_norm": 0.36459265659956647, + "learning_rate": 5.98346209074081e-05, + "loss": 0.6104, + "step": 2410 + }, + { + "epoch": 0.6429333333333334, + "grad_norm": 0.34660026757714923, + "learning_rate": 5.975553103142669e-05, + "loss": 0.6186, + "step": 2411 + }, + { + "epoch": 0.6432, + "grad_norm": 0.3522029968780649, + "learning_rate": 5.9676471182955116e-05, + "loss": 0.6096, + "step": 2412 + }, + { + "epoch": 0.6434666666666666, + "grad_norm": 0.34374410390207266, + "learning_rate": 5.959744142098207e-05, + "loss": 0.6051, + "step": 2413 + }, + { + "epoch": 0.6437333333333334, + "grad_norm": 0.34667109670639695, + "learning_rate": 5.9518441804473846e-05, + "loss": 0.6021, + "step": 2414 + }, + { + "epoch": 0.644, + "grad_norm": 0.358636275303948, + "learning_rate": 5.943947239237424e-05, + "loss": 0.5781, + "step": 2415 + }, + { + "epoch": 0.6442666666666667, + "grad_norm": 0.32347244785117024, + "learning_rate": 5.936053324360453e-05, + "loss": 0.5727, + "step": 2416 + }, + { + "epoch": 0.6445333333333333, + "grad_norm": 0.33767847752157737, + "learning_rate": 5.9281624417063395e-05, + "loss": 0.6232, + "step": 2417 + }, + { + "epoch": 0.6448, + "grad_norm": 0.32149237569745887, + "learning_rate": 5.9202745971626864e-05, + "loss": 0.5228, + "step": 2418 + }, + { + "epoch": 0.6450666666666667, + "grad_norm": 0.35127631413085253, + "learning_rate": 5.912389796614835e-05, + "loss": 0.6093, + "step": 2419 + }, + { + "epoch": 0.6453333333333333, + "grad_norm": 0.35284551550376864, + "learning_rate": 5.9045080459458535e-05, + "loss": 0.6011, + "step": 2420 + }, + { + "epoch": 0.6456, + "grad_norm": 0.342214804035069, + "learning_rate": 5.89662935103653e-05, + "loss": 0.5861, + "step": 2421 + }, + { + "epoch": 0.6458666666666667, + "grad_norm": 0.3308244044650585, + "learning_rate": 5.8887537177653786e-05, + "loss": 0.5929, + "step": 2422 + }, + { + "epoch": 0.6461333333333333, + "grad_norm": 0.3241516134333082, + "learning_rate": 5.880881152008623e-05, + "loss": 0.5774, + "step": 2423 + }, + { + "epoch": 0.6464, + "grad_norm": 0.3346715824571503, + "learning_rate": 5.8730116596402084e-05, + "loss": 0.6211, + "step": 2424 + }, + { + "epoch": 0.6466666666666666, + "grad_norm": 0.3364017085713504, + "learning_rate": 5.865145246531776e-05, + "loss": 0.5951, + "step": 2425 + }, + { + "epoch": 0.6469333333333334, + "grad_norm": 0.350056816015172, + "learning_rate": 5.857281918552677e-05, + "loss": 0.5903, + "step": 2426 + }, + { + "epoch": 0.6472, + "grad_norm": 0.32984344774870605, + "learning_rate": 5.8494216815699556e-05, + "loss": 0.5915, + "step": 2427 + }, + { + "epoch": 0.6474666666666666, + "grad_norm": 0.7645698228696151, + "learning_rate": 5.841564541448356e-05, + "loss": 0.5994, + "step": 2428 + }, + { + "epoch": 0.6477333333333334, + "grad_norm": 0.36233641363172175, + "learning_rate": 5.833710504050298e-05, + "loss": 0.5436, + "step": 2429 + }, + { + "epoch": 0.648, + "grad_norm": 0.38125683813856825, + "learning_rate": 5.8258595752359036e-05, + "loss": 0.6347, + "step": 2430 + }, + { + "epoch": 0.6482666666666667, + "grad_norm": 0.3514245491403789, + "learning_rate": 5.8180117608629645e-05, + "loss": 0.577, + "step": 2431 + }, + { + "epoch": 0.6485333333333333, + "grad_norm": 0.342700705782899, + "learning_rate": 5.810167066786951e-05, + "loss": 0.6108, + "step": 2432 + }, + { + "epoch": 0.6488, + "grad_norm": 0.3386868299620465, + "learning_rate": 5.80232549886101e-05, + "loss": 0.5576, + "step": 2433 + }, + { + "epoch": 0.6490666666666667, + "grad_norm": 0.35904758414658006, + "learning_rate": 5.794487062935948e-05, + "loss": 0.6069, + "step": 2434 + }, + { + "epoch": 0.6493333333333333, + "grad_norm": 0.32993308622412953, + "learning_rate": 5.78665176486024e-05, + "loss": 0.5499, + "step": 2435 + }, + { + "epoch": 0.6496, + "grad_norm": 0.34450413336212, + "learning_rate": 5.7788196104800194e-05, + "loss": 0.6022, + "step": 2436 + }, + { + "epoch": 0.6498666666666667, + "grad_norm": 0.3397398179716159, + "learning_rate": 5.770990605639071e-05, + "loss": 0.5983, + "step": 2437 + }, + { + "epoch": 0.6501333333333333, + "grad_norm": 0.33207919165370103, + "learning_rate": 5.763164756178833e-05, + "loss": 0.5554, + "step": 2438 + }, + { + "epoch": 0.6504, + "grad_norm": 0.36498725108404595, + "learning_rate": 5.755342067938386e-05, + "loss": 0.6021, + "step": 2439 + }, + { + "epoch": 0.6506666666666666, + "grad_norm": 0.33331872270019547, + "learning_rate": 5.747522546754456e-05, + "loss": 0.6, + "step": 2440 + }, + { + "epoch": 0.6509333333333334, + "grad_norm": 0.3388025965934976, + "learning_rate": 5.739706198461402e-05, + "loss": 0.6461, + "step": 2441 + }, + { + "epoch": 0.6512, + "grad_norm": 0.3345424369891375, + "learning_rate": 5.731893028891218e-05, + "loss": 0.6068, + "step": 2442 + }, + { + "epoch": 0.6514666666666666, + "grad_norm": 0.34908972711079245, + "learning_rate": 5.7240830438735295e-05, + "loss": 0.5962, + "step": 2443 + }, + { + "epoch": 0.6517333333333334, + "grad_norm": 0.34496049207322027, + "learning_rate": 5.7162762492355746e-05, + "loss": 0.5779, + "step": 2444 + }, + { + "epoch": 0.652, + "grad_norm": 0.3438736478434742, + "learning_rate": 5.708472650802221e-05, + "loss": 0.5931, + "step": 2445 + }, + { + "epoch": 0.6522666666666667, + "grad_norm": 0.4031791703967563, + "learning_rate": 5.7006722543959515e-05, + "loss": 0.6027, + "step": 2446 + }, + { + "epoch": 0.6525333333333333, + "grad_norm": 0.34634120686413566, + "learning_rate": 5.6928750658368555e-05, + "loss": 0.5972, + "step": 2447 + }, + { + "epoch": 0.6528, + "grad_norm": 0.33949998651994945, + "learning_rate": 5.68508109094263e-05, + "loss": 0.6313, + "step": 2448 + }, + { + "epoch": 0.6530666666666667, + "grad_norm": 0.34180683935263556, + "learning_rate": 5.6772903355285755e-05, + "loss": 0.5996, + "step": 2449 + }, + { + "epoch": 0.6533333333333333, + "grad_norm": 0.3621528767223928, + "learning_rate": 5.669502805407591e-05, + "loss": 0.574, + "step": 2450 + }, + { + "epoch": 0.6536, + "grad_norm": 0.34832301863962545, + "learning_rate": 5.6617185063901714e-05, + "loss": 0.5673, + "step": 2451 + }, + { + "epoch": 0.6538666666666667, + "grad_norm": 0.346287044119997, + "learning_rate": 5.653937444284389e-05, + "loss": 0.6045, + "step": 2452 + }, + { + "epoch": 0.6541333333333333, + "grad_norm": 0.3566316323533672, + "learning_rate": 5.6461596248959115e-05, + "loss": 0.602, + "step": 2453 + }, + { + "epoch": 0.6544, + "grad_norm": 0.3287837448539727, + "learning_rate": 5.638385054027987e-05, + "loss": 0.5895, + "step": 2454 + }, + { + "epoch": 0.6546666666666666, + "grad_norm": 0.3613129767717555, + "learning_rate": 5.6306137374814363e-05, + "loss": 0.6258, + "step": 2455 + }, + { + "epoch": 0.6549333333333334, + "grad_norm": 0.3566901495801631, + "learning_rate": 5.622845681054651e-05, + "loss": 0.63, + "step": 2456 + }, + { + "epoch": 0.6552, + "grad_norm": 0.35772424480405757, + "learning_rate": 5.6150808905435984e-05, + "loss": 0.6003, + "step": 2457 + }, + { + "epoch": 0.6554666666666666, + "grad_norm": 0.34052185284268577, + "learning_rate": 5.607319371741799e-05, + "loss": 0.6489, + "step": 2458 + }, + { + "epoch": 0.6557333333333333, + "grad_norm": 0.347116530437614, + "learning_rate": 5.599561130440343e-05, + "loss": 0.5959, + "step": 2459 + }, + { + "epoch": 0.656, + "grad_norm": 0.33518404133193597, + "learning_rate": 5.5918061724278584e-05, + "loss": 0.5906, + "step": 2460 + }, + { + "epoch": 0.6562666666666667, + "grad_norm": 0.3476084147046415, + "learning_rate": 5.5840545034905365e-05, + "loss": 0.6216, + "step": 2461 + }, + { + "epoch": 0.6565333333333333, + "grad_norm": 0.3338795967471878, + "learning_rate": 5.5763061294121154e-05, + "loss": 0.5856, + "step": 2462 + }, + { + "epoch": 0.6568, + "grad_norm": 0.36357983418745365, + "learning_rate": 5.568561055973868e-05, + "loss": 0.5714, + "step": 2463 + }, + { + "epoch": 0.6570666666666667, + "grad_norm": 0.3508803528152843, + "learning_rate": 5.5608192889546085e-05, + "loss": 0.5932, + "step": 2464 + }, + { + "epoch": 0.6573333333333333, + "grad_norm": 0.46003479213026277, + "learning_rate": 5.553080834130682e-05, + "loss": 0.6168, + "step": 2465 + }, + { + "epoch": 0.6576, + "grad_norm": 0.3397964668589436, + "learning_rate": 5.545345697275964e-05, + "loss": 0.5933, + "step": 2466 + }, + { + "epoch": 0.6578666666666667, + "grad_norm": 0.3671332812649807, + "learning_rate": 5.537613884161859e-05, + "loss": 0.5557, + "step": 2467 + }, + { + "epoch": 0.6581333333333333, + "grad_norm": 0.3381807353666364, + "learning_rate": 5.529885400557277e-05, + "loss": 0.5397, + "step": 2468 + }, + { + "epoch": 0.6584, + "grad_norm": 0.353728469181082, + "learning_rate": 5.5221602522286565e-05, + "loss": 0.5924, + "step": 2469 + }, + { + "epoch": 0.6586666666666666, + "grad_norm": 0.36028533391105544, + "learning_rate": 5.5144384449399466e-05, + "loss": 0.5982, + "step": 2470 + }, + { + "epoch": 0.6589333333333334, + "grad_norm": 0.3682945885183188, + "learning_rate": 5.506719984452597e-05, + "loss": 0.6169, + "step": 2471 + }, + { + "epoch": 0.6592, + "grad_norm": 0.35033546794801473, + "learning_rate": 5.499004876525569e-05, + "loss": 0.5766, + "step": 2472 + }, + { + "epoch": 0.6594666666666666, + "grad_norm": 0.3589300885290531, + "learning_rate": 5.4912931269153134e-05, + "loss": 0.6766, + "step": 2473 + }, + { + "epoch": 0.6597333333333333, + "grad_norm": 0.34090204625039444, + "learning_rate": 5.483584741375781e-05, + "loss": 0.5639, + "step": 2474 + }, + { + "epoch": 0.66, + "grad_norm": 0.3686975922347045, + "learning_rate": 5.475879725658413e-05, + "loss": 0.6005, + "step": 2475 + }, + { + "epoch": 0.6602666666666667, + "grad_norm": 0.3944519721443153, + "learning_rate": 5.468178085512132e-05, + "loss": 0.6252, + "step": 2476 + }, + { + "epoch": 0.6605333333333333, + "grad_norm": 0.3999240027948932, + "learning_rate": 5.4604798266833455e-05, + "loss": 0.6019, + "step": 2477 + }, + { + "epoch": 0.6608, + "grad_norm": 0.3295366763050033, + "learning_rate": 5.452784954915937e-05, + "loss": 0.5929, + "step": 2478 + }, + { + "epoch": 0.6610666666666667, + "grad_norm": 0.37837611354732187, + "learning_rate": 5.445093475951263e-05, + "loss": 0.6029, + "step": 2479 + }, + { + "epoch": 0.6613333333333333, + "grad_norm": 0.4533398937731497, + "learning_rate": 5.437405395528148e-05, + "loss": 0.5427, + "step": 2480 + }, + { + "epoch": 0.6616, + "grad_norm": 0.3436047315121984, + "learning_rate": 5.4297207193828804e-05, + "loss": 0.555, + "step": 2481 + }, + { + "epoch": 0.6618666666666667, + "grad_norm": 0.34018489822301473, + "learning_rate": 5.422039453249216e-05, + "loss": 0.5574, + "step": 2482 + }, + { + "epoch": 0.6621333333333334, + "grad_norm": 0.33501399145007826, + "learning_rate": 5.414361602858349e-05, + "loss": 0.5746, + "step": 2483 + }, + { + "epoch": 0.6624, + "grad_norm": 0.369499967322875, + "learning_rate": 5.40668717393894e-05, + "loss": 0.6149, + "step": 2484 + }, + { + "epoch": 0.6626666666666666, + "grad_norm": 0.35805023715592077, + "learning_rate": 5.399016172217093e-05, + "loss": 0.6133, + "step": 2485 + }, + { + "epoch": 0.6629333333333334, + "grad_norm": 0.32037798114227906, + "learning_rate": 5.391348603416353e-05, + "loss": 0.5607, + "step": 2486 + }, + { + "epoch": 0.6632, + "grad_norm": 0.3544221484186545, + "learning_rate": 5.383684473257707e-05, + "loss": 0.6019, + "step": 2487 + }, + { + "epoch": 0.6634666666666666, + "grad_norm": 0.3544589771151001, + "learning_rate": 5.376023787459574e-05, + "loss": 0.6019, + "step": 2488 + }, + { + "epoch": 0.6637333333333333, + "grad_norm": 0.3374719310464669, + "learning_rate": 5.3683665517378004e-05, + "loss": 0.6061, + "step": 2489 + }, + { + "epoch": 0.664, + "grad_norm": 0.3468952660891746, + "learning_rate": 5.3607127718056695e-05, + "loss": 0.5899, + "step": 2490 + }, + { + "epoch": 0.6642666666666667, + "grad_norm": 0.3407278819614768, + "learning_rate": 5.353062453373862e-05, + "loss": 0.5522, + "step": 2491 + }, + { + "epoch": 0.6645333333333333, + "grad_norm": 0.33739507650594036, + "learning_rate": 5.3454156021505055e-05, + "loss": 0.5983, + "step": 2492 + }, + { + "epoch": 0.6648, + "grad_norm": 0.33262547013987365, + "learning_rate": 5.337772223841122e-05, + "loss": 0.5613, + "step": 2493 + }, + { + "epoch": 0.6650666666666667, + "grad_norm": 0.3648979827617922, + "learning_rate": 5.330132324148649e-05, + "loss": 0.5623, + "step": 2494 + }, + { + "epoch": 0.6653333333333333, + "grad_norm": 0.32865851853796996, + "learning_rate": 5.3224959087734264e-05, + "loss": 0.6187, + "step": 2495 + }, + { + "epoch": 0.6656, + "grad_norm": 0.3983097493320104, + "learning_rate": 5.3148629834131925e-05, + "loss": 0.5927, + "step": 2496 + }, + { + "epoch": 0.6658666666666667, + "grad_norm": 0.3664175826588646, + "learning_rate": 5.3072335537630845e-05, + "loss": 0.5455, + "step": 2497 + }, + { + "epoch": 0.6661333333333334, + "grad_norm": 0.31722078152711936, + "learning_rate": 5.299607625515637e-05, + "loss": 0.5379, + "step": 2498 + }, + { + "epoch": 0.6664, + "grad_norm": 0.3360887776404759, + "learning_rate": 5.291985204360754e-05, + "loss": 0.5521, + "step": 2499 + }, + { + "epoch": 0.6666666666666666, + "grad_norm": 0.34364499649892183, + "learning_rate": 5.284366295985741e-05, + "loss": 0.6079, + "step": 2500 + }, + { + "epoch": 0.6669333333333334, + "grad_norm": 0.33762813065562547, + "learning_rate": 5.2767509060752764e-05, + "loss": 0.5667, + "step": 2501 + }, + { + "epoch": 0.6672, + "grad_norm": 0.3616585325541724, + "learning_rate": 5.269139040311411e-05, + "loss": 0.6244, + "step": 2502 + }, + { + "epoch": 0.6674666666666667, + "grad_norm": 0.3443235724588427, + "learning_rate": 5.2615307043735676e-05, + "loss": 0.5713, + "step": 2503 + }, + { + "epoch": 0.6677333333333333, + "grad_norm": 0.3438457858788958, + "learning_rate": 5.253925903938538e-05, + "loss": 0.5999, + "step": 2504 + }, + { + "epoch": 0.668, + "grad_norm": 0.37071917277564054, + "learning_rate": 5.2463246446804725e-05, + "loss": 0.643, + "step": 2505 + }, + { + "epoch": 0.6682666666666667, + "grad_norm": 0.3488194676574523, + "learning_rate": 5.2387269322708854e-05, + "loss": 0.5656, + "step": 2506 + }, + { + "epoch": 0.6685333333333333, + "grad_norm": 0.32135902286096674, + "learning_rate": 5.231132772378631e-05, + "loss": 0.5599, + "step": 2507 + }, + { + "epoch": 0.6688, + "grad_norm": 0.3587053415055037, + "learning_rate": 5.223542170669926e-05, + "loss": 0.6155, + "step": 2508 + }, + { + "epoch": 0.6690666666666667, + "grad_norm": 0.33425986489072557, + "learning_rate": 5.215955132808328e-05, + "loss": 0.5712, + "step": 2509 + }, + { + "epoch": 0.6693333333333333, + "grad_norm": 0.3519724619626593, + "learning_rate": 5.2083716644547364e-05, + "loss": 0.5983, + "step": 2510 + }, + { + "epoch": 0.6696, + "grad_norm": 0.35133495988947727, + "learning_rate": 5.200791771267384e-05, + "loss": 0.574, + "step": 2511 + }, + { + "epoch": 0.6698666666666667, + "grad_norm": 0.3456925346287904, + "learning_rate": 5.193215458901837e-05, + "loss": 0.6037, + "step": 2512 + }, + { + "epoch": 0.6701333333333334, + "grad_norm": 0.34366282667650583, + "learning_rate": 5.1856427330110025e-05, + "loss": 0.5836, + "step": 2513 + }, + { + "epoch": 0.6704, + "grad_norm": 0.3367418436781026, + "learning_rate": 5.1780735992450865e-05, + "loss": 0.6221, + "step": 2514 + }, + { + "epoch": 0.6706666666666666, + "grad_norm": 0.3490656912191057, + "learning_rate": 5.170508063251636e-05, + "loss": 0.6183, + "step": 2515 + }, + { + "epoch": 0.6709333333333334, + "grad_norm": 0.35253745610459775, + "learning_rate": 5.162946130675503e-05, + "loss": 0.596, + "step": 2516 + }, + { + "epoch": 0.6712, + "grad_norm": 0.31980301046040727, + "learning_rate": 5.1553878071588576e-05, + "loss": 0.5479, + "step": 2517 + }, + { + "epoch": 0.6714666666666667, + "grad_norm": 0.3404904477153669, + "learning_rate": 5.147833098341173e-05, + "loss": 0.5365, + "step": 2518 + }, + { + "epoch": 0.6717333333333333, + "grad_norm": 0.3535567861550599, + "learning_rate": 5.140282009859224e-05, + "loss": 0.5345, + "step": 2519 + }, + { + "epoch": 0.672, + "grad_norm": 0.35489481564989434, + "learning_rate": 5.132734547347088e-05, + "loss": 0.5914, + "step": 2520 + }, + { + "epoch": 0.6722666666666667, + "grad_norm": 0.3475996798308405, + "learning_rate": 5.125190716436139e-05, + "loss": 0.5836, + "step": 2521 + }, + { + "epoch": 0.6725333333333333, + "grad_norm": 0.3450635115813747, + "learning_rate": 5.1176505227550286e-05, + "loss": 0.6197, + "step": 2522 + }, + { + "epoch": 0.6728, + "grad_norm": 0.37723384466315196, + "learning_rate": 5.110113971929708e-05, + "loss": 0.5709, + "step": 2523 + }, + { + "epoch": 0.6730666666666667, + "grad_norm": 0.32562501966448837, + "learning_rate": 5.102581069583407e-05, + "loss": 0.5501, + "step": 2524 + }, + { + "epoch": 0.6733333333333333, + "grad_norm": 0.31776203077361176, + "learning_rate": 5.0950518213366314e-05, + "loss": 0.5482, + "step": 2525 + }, + { + "epoch": 0.6736, + "grad_norm": 0.33030440418162826, + "learning_rate": 5.08752623280716e-05, + "loss": 0.5357, + "step": 2526 + }, + { + "epoch": 0.6738666666666666, + "grad_norm": 0.33907909419975807, + "learning_rate": 5.080004309610045e-05, + "loss": 0.5887, + "step": 2527 + }, + { + "epoch": 0.6741333333333334, + "grad_norm": 0.33793583554471157, + "learning_rate": 5.0724860573575994e-05, + "loss": 0.592, + "step": 2528 + }, + { + "epoch": 0.6744, + "grad_norm": 0.32993594798479337, + "learning_rate": 5.064971481659399e-05, + "loss": 0.555, + "step": 2529 + }, + { + "epoch": 0.6746666666666666, + "grad_norm": 0.3275825963157326, + "learning_rate": 5.057460588122276e-05, + "loss": 0.5264, + "step": 2530 + }, + { + "epoch": 0.6749333333333334, + "grad_norm": 0.3433011176300641, + "learning_rate": 5.049953382350314e-05, + "loss": 0.5878, + "step": 2531 + }, + { + "epoch": 0.6752, + "grad_norm": 0.3698003650110036, + "learning_rate": 5.042449869944851e-05, + "loss": 0.5604, + "step": 2532 + }, + { + "epoch": 0.6754666666666667, + "grad_norm": 0.32268863726411673, + "learning_rate": 5.03495005650446e-05, + "loss": 0.5561, + "step": 2533 + }, + { + "epoch": 0.6757333333333333, + "grad_norm": 0.33428107397463697, + "learning_rate": 5.027453947624963e-05, + "loss": 0.5495, + "step": 2534 + }, + { + "epoch": 0.676, + "grad_norm": 0.3446895338822114, + "learning_rate": 5.01996154889941e-05, + "loss": 0.5431, + "step": 2535 + }, + { + "epoch": 0.6762666666666667, + "grad_norm": 0.4679149445936216, + "learning_rate": 5.0124728659180895e-05, + "loss": 0.6033, + "step": 2536 + }, + { + "epoch": 0.6765333333333333, + "grad_norm": 0.3724348593690452, + "learning_rate": 5.004987904268519e-05, + "loss": 0.6005, + "step": 2537 + }, + { + "epoch": 0.6768, + "grad_norm": 0.39101113395957166, + "learning_rate": 4.9975066695354245e-05, + "loss": 0.5781, + "step": 2538 + }, + { + "epoch": 0.6770666666666667, + "grad_norm": 0.37284538949464013, + "learning_rate": 4.990029167300767e-05, + "loss": 0.5508, + "step": 2539 + }, + { + "epoch": 0.6773333333333333, + "grad_norm": 0.35232556208516896, + "learning_rate": 4.9825554031437194e-05, + "loss": 0.6228, + "step": 2540 + }, + { + "epoch": 0.6776, + "grad_norm": 0.3454147641906744, + "learning_rate": 4.975085382640661e-05, + "loss": 0.5943, + "step": 2541 + }, + { + "epoch": 0.6778666666666666, + "grad_norm": 0.34753345272013797, + "learning_rate": 4.9676191113651804e-05, + "loss": 0.5403, + "step": 2542 + }, + { + "epoch": 0.6781333333333334, + "grad_norm": 0.3567171906792844, + "learning_rate": 4.9601565948880704e-05, + "loss": 0.6441, + "step": 2543 + }, + { + "epoch": 0.6784, + "grad_norm": 0.3556237187996497, + "learning_rate": 4.9526978387773196e-05, + "loss": 0.598, + "step": 2544 + }, + { + "epoch": 0.6786666666666666, + "grad_norm": 0.40240655144839815, + "learning_rate": 4.945242848598116e-05, + "loss": 0.641, + "step": 2545 + }, + { + "epoch": 0.6789333333333334, + "grad_norm": 0.3370996868151834, + "learning_rate": 4.9377916299128226e-05, + "loss": 0.6046, + "step": 2546 + }, + { + "epoch": 0.6792, + "grad_norm": 0.37809508207870274, + "learning_rate": 4.9303441882810106e-05, + "loss": 0.5886, + "step": 2547 + }, + { + "epoch": 0.6794666666666667, + "grad_norm": 0.3350394277163161, + "learning_rate": 4.9229005292594175e-05, + "loss": 0.5931, + "step": 2548 + }, + { + "epoch": 0.6797333333333333, + "grad_norm": 0.3410077041033582, + "learning_rate": 4.9154606584019644e-05, + "loss": 0.5892, + "step": 2549 + }, + { + "epoch": 0.68, + "grad_norm": 0.3578754267506827, + "learning_rate": 4.9080245812597434e-05, + "loss": 0.5984, + "step": 2550 + }, + { + "epoch": 0.6802666666666667, + "grad_norm": 0.32861571746387214, + "learning_rate": 4.900592303381016e-05, + "loss": 0.5818, + "step": 2551 + }, + { + "epoch": 0.6805333333333333, + "grad_norm": 0.4002809734898077, + "learning_rate": 4.893163830311216e-05, + "loss": 0.5982, + "step": 2552 + }, + { + "epoch": 0.6808, + "grad_norm": 0.3393118343346211, + "learning_rate": 4.885739167592923e-05, + "loss": 0.5679, + "step": 2553 + }, + { + "epoch": 0.6810666666666667, + "grad_norm": 0.335548330799039, + "learning_rate": 4.878318320765888e-05, + "loss": 0.5837, + "step": 2554 + }, + { + "epoch": 0.6813333333333333, + "grad_norm": 0.32554726875807616, + "learning_rate": 4.8709012953670096e-05, + "loss": 0.5746, + "step": 2555 + }, + { + "epoch": 0.6816, + "grad_norm": 0.348240853733335, + "learning_rate": 4.863488096930333e-05, + "loss": 0.5905, + "step": 2556 + }, + { + "epoch": 0.6818666666666666, + "grad_norm": 0.3227300220745892, + "learning_rate": 4.856078730987054e-05, + "loss": 0.5835, + "step": 2557 + }, + { + "epoch": 0.6821333333333334, + "grad_norm": 0.33548187568536797, + "learning_rate": 4.848673203065502e-05, + "loss": 0.5162, + "step": 2558 + }, + { + "epoch": 0.6824, + "grad_norm": 0.3403186240533581, + "learning_rate": 4.841271518691149e-05, + "loss": 0.5509, + "step": 2559 + }, + { + "epoch": 0.6826666666666666, + "grad_norm": 0.3750547649216263, + "learning_rate": 4.833873683386596e-05, + "loss": 0.6344, + "step": 2560 + }, + { + "epoch": 0.6829333333333333, + "grad_norm": 0.3230688623238985, + "learning_rate": 4.826479702671567e-05, + "loss": 0.5731, + "step": 2561 + }, + { + "epoch": 0.6832, + "grad_norm": 0.32793200127882455, + "learning_rate": 4.81908958206292e-05, + "loss": 0.5442, + "step": 2562 + }, + { + "epoch": 0.6834666666666667, + "grad_norm": 0.35651353869835617, + "learning_rate": 4.811703327074626e-05, + "loss": 0.6099, + "step": 2563 + }, + { + "epoch": 0.6837333333333333, + "grad_norm": 0.35507964722126156, + "learning_rate": 4.804320943217775e-05, + "loss": 0.5938, + "step": 2564 + }, + { + "epoch": 0.684, + "grad_norm": 0.34955903687334955, + "learning_rate": 4.796942436000568e-05, + "loss": 0.5815, + "step": 2565 + }, + { + "epoch": 0.6842666666666667, + "grad_norm": 0.34118412145094035, + "learning_rate": 4.7895678109283116e-05, + "loss": 0.5531, + "step": 2566 + }, + { + "epoch": 0.6845333333333333, + "grad_norm": 0.3489463603151044, + "learning_rate": 4.782197073503414e-05, + "loss": 0.5722, + "step": 2567 + }, + { + "epoch": 0.6848, + "grad_norm": 0.34257297902736417, + "learning_rate": 4.774830229225398e-05, + "loss": 0.5662, + "step": 2568 + }, + { + "epoch": 0.6850666666666667, + "grad_norm": 0.33510491412642573, + "learning_rate": 4.767467283590856e-05, + "loss": 0.5366, + "step": 2569 + }, + { + "epoch": 0.6853333333333333, + "grad_norm": 0.3559397833904609, + "learning_rate": 4.7601082420934925e-05, + "loss": 0.6124, + "step": 2570 + }, + { + "epoch": 0.6856, + "grad_norm": 0.4037254310154087, + "learning_rate": 4.752753110224089e-05, + "loss": 0.6123, + "step": 2571 + }, + { + "epoch": 0.6858666666666666, + "grad_norm": 0.3478290984169354, + "learning_rate": 4.7454018934705126e-05, + "loss": 0.598, + "step": 2572 + }, + { + "epoch": 0.6861333333333334, + "grad_norm": 0.33529657369376437, + "learning_rate": 4.7380545973177107e-05, + "loss": 0.5934, + "step": 2573 + }, + { + "epoch": 0.6864, + "grad_norm": 0.34014238865796187, + "learning_rate": 4.730711227247703e-05, + "loss": 0.5591, + "step": 2574 + }, + { + "epoch": 0.6866666666666666, + "grad_norm": 0.3372149416578562, + "learning_rate": 4.72337178873958e-05, + "loss": 0.6029, + "step": 2575 + }, + { + "epoch": 0.6869333333333333, + "grad_norm": 0.3865273943149658, + "learning_rate": 4.716036287269504e-05, + "loss": 0.6013, + "step": 2576 + }, + { + "epoch": 0.6872, + "grad_norm": 0.37867397996445734, + "learning_rate": 4.708704728310688e-05, + "loss": 0.6054, + "step": 2577 + }, + { + "epoch": 0.6874666666666667, + "grad_norm": 0.33300064292744663, + "learning_rate": 4.701377117333413e-05, + "loss": 0.5746, + "step": 2578 + }, + { + "epoch": 0.6877333333333333, + "grad_norm": 0.3412960539150721, + "learning_rate": 4.6940534598050135e-05, + "loss": 0.5783, + "step": 2579 + }, + { + "epoch": 0.688, + "grad_norm": 0.34868200147958184, + "learning_rate": 4.686733761189872e-05, + "loss": 0.619, + "step": 2580 + }, + { + "epoch": 0.6882666666666667, + "grad_norm": 0.38928620427649707, + "learning_rate": 4.679418026949418e-05, + "loss": 0.5549, + "step": 2581 + }, + { + "epoch": 0.6885333333333333, + "grad_norm": 0.4056262048080404, + "learning_rate": 4.672106262542123e-05, + "loss": 0.6121, + "step": 2582 + }, + { + "epoch": 0.6888, + "grad_norm": 0.34796455707915097, + "learning_rate": 4.664798473423496e-05, + "loss": 0.5963, + "step": 2583 + }, + { + "epoch": 0.6890666666666667, + "grad_norm": 0.34420643683848945, + "learning_rate": 4.6574946650460804e-05, + "loss": 0.5468, + "step": 2584 + }, + { + "epoch": 0.6893333333333334, + "grad_norm": 0.3603720102963772, + "learning_rate": 4.650194842859449e-05, + "loss": 0.6162, + "step": 2585 + }, + { + "epoch": 0.6896, + "grad_norm": 0.3720903972014294, + "learning_rate": 4.6428990123102014e-05, + "loss": 0.5746, + "step": 2586 + }, + { + "epoch": 0.6898666666666666, + "grad_norm": 0.3372115846788491, + "learning_rate": 4.6356071788419584e-05, + "loss": 0.566, + "step": 2587 + }, + { + "epoch": 0.6901333333333334, + "grad_norm": 0.39191337422988803, + "learning_rate": 4.6283193478953566e-05, + "loss": 0.5419, + "step": 2588 + }, + { + "epoch": 0.6904, + "grad_norm": 0.3548730373313539, + "learning_rate": 4.6210355249080505e-05, + "loss": 0.6079, + "step": 2589 + }, + { + "epoch": 0.6906666666666667, + "grad_norm": 0.35360031311505496, + "learning_rate": 4.6137557153147005e-05, + "loss": 0.588, + "step": 2590 + }, + { + "epoch": 0.6909333333333333, + "grad_norm": 0.33920376307324135, + "learning_rate": 4.606479924546977e-05, + "loss": 0.5963, + "step": 2591 + }, + { + "epoch": 0.6912, + "grad_norm": 0.3606990133244874, + "learning_rate": 4.599208158033541e-05, + "loss": 0.6096, + "step": 2592 + }, + { + "epoch": 0.6914666666666667, + "grad_norm": 0.3423405811888361, + "learning_rate": 4.5919404212000614e-05, + "loss": 0.5978, + "step": 2593 + }, + { + "epoch": 0.6917333333333333, + "grad_norm": 0.32687460140451796, + "learning_rate": 4.5846767194692e-05, + "loss": 0.5976, + "step": 2594 + }, + { + "epoch": 0.692, + "grad_norm": 0.32446028355999235, + "learning_rate": 4.577417058260602e-05, + "loss": 0.5903, + "step": 2595 + }, + { + "epoch": 0.6922666666666667, + "grad_norm": 0.33956085172623224, + "learning_rate": 4.570161442990903e-05, + "loss": 0.5678, + "step": 2596 + }, + { + "epoch": 0.6925333333333333, + "grad_norm": 0.35040416393857554, + "learning_rate": 4.562909879073719e-05, + "loss": 0.6134, + "step": 2597 + }, + { + "epoch": 0.6928, + "grad_norm": 0.33152674715543834, + "learning_rate": 4.555662371919639e-05, + "loss": 0.5591, + "step": 2598 + }, + { + "epoch": 0.6930666666666667, + "grad_norm": 0.4203000284758059, + "learning_rate": 4.548418926936234e-05, + "loss": 0.5617, + "step": 2599 + }, + { + "epoch": 0.6933333333333334, + "grad_norm": 0.3518319436988351, + "learning_rate": 4.541179549528032e-05, + "loss": 0.5946, + "step": 2600 + }, + { + "epoch": 0.6936, + "grad_norm": 0.3490569973230447, + "learning_rate": 4.533944245096533e-05, + "loss": 0.6086, + "step": 2601 + }, + { + "epoch": 0.6938666666666666, + "grad_norm": 0.34871934309748615, + "learning_rate": 4.526713019040196e-05, + "loss": 0.6161, + "step": 2602 + }, + { + "epoch": 0.6941333333333334, + "grad_norm": 0.3338480599616353, + "learning_rate": 4.519485876754446e-05, + "loss": 0.5946, + "step": 2603 + }, + { + "epoch": 0.6944, + "grad_norm": 0.43003597600927673, + "learning_rate": 4.512262823631648e-05, + "loss": 0.5936, + "step": 2604 + }, + { + "epoch": 0.6946666666666667, + "grad_norm": 0.34478712680790746, + "learning_rate": 4.505043865061124e-05, + "loss": 0.5458, + "step": 2605 + }, + { + "epoch": 0.6949333333333333, + "grad_norm": 0.3397244100583301, + "learning_rate": 4.497829006429136e-05, + "loss": 0.5522, + "step": 2606 + }, + { + "epoch": 0.6952, + "grad_norm": 0.3565949685173397, + "learning_rate": 4.490618253118895e-05, + "loss": 0.5763, + "step": 2607 + }, + { + "epoch": 0.6954666666666667, + "grad_norm": 0.32851689301954007, + "learning_rate": 4.4834116105105325e-05, + "loss": 0.5663, + "step": 2608 + }, + { + "epoch": 0.6957333333333333, + "grad_norm": 0.3464260479874418, + "learning_rate": 4.476209083981131e-05, + "loss": 0.5604, + "step": 2609 + }, + { + "epoch": 0.696, + "grad_norm": 0.35022616692552616, + "learning_rate": 4.469010678904694e-05, + "loss": 0.6044, + "step": 2610 + }, + { + "epoch": 0.6962666666666667, + "grad_norm": 0.32363297510402633, + "learning_rate": 4.461816400652148e-05, + "loss": 0.6145, + "step": 2611 + }, + { + "epoch": 0.6965333333333333, + "grad_norm": 0.3456997575243997, + "learning_rate": 4.454626254591344e-05, + "loss": 0.6622, + "step": 2612 + }, + { + "epoch": 0.6968, + "grad_norm": 0.3903246172040829, + "learning_rate": 4.447440246087049e-05, + "loss": 0.6144, + "step": 2613 + }, + { + "epoch": 0.6970666666666666, + "grad_norm": 0.34037631740940844, + "learning_rate": 4.440258380500942e-05, + "loss": 0.5891, + "step": 2614 + }, + { + "epoch": 0.6973333333333334, + "grad_norm": 0.34431456055421017, + "learning_rate": 4.433080663191615e-05, + "loss": 0.5682, + "step": 2615 + }, + { + "epoch": 0.6976, + "grad_norm": 0.3474196486075847, + "learning_rate": 4.4259070995145544e-05, + "loss": 0.6116, + "step": 2616 + }, + { + "epoch": 0.6978666666666666, + "grad_norm": 0.3560638134351646, + "learning_rate": 4.418737694822156e-05, + "loss": 0.6119, + "step": 2617 + }, + { + "epoch": 0.6981333333333334, + "grad_norm": 0.34518385346353603, + "learning_rate": 4.4115724544637124e-05, + "loss": 0.6239, + "step": 2618 + }, + { + "epoch": 0.6984, + "grad_norm": 0.3732215335042505, + "learning_rate": 4.4044113837854074e-05, + "loss": 0.6278, + "step": 2619 + }, + { + "epoch": 0.6986666666666667, + "grad_norm": 0.334373039075218, + "learning_rate": 4.397254488130312e-05, + "loss": 0.5811, + "step": 2620 + }, + { + "epoch": 0.6989333333333333, + "grad_norm": 0.36055205242008437, + "learning_rate": 4.390101772838385e-05, + "loss": 0.6168, + "step": 2621 + }, + { + "epoch": 0.6992, + "grad_norm": 0.34268108833532807, + "learning_rate": 4.382953243246465e-05, + "loss": 0.5514, + "step": 2622 + }, + { + "epoch": 0.6994666666666667, + "grad_norm": 0.3522581780460182, + "learning_rate": 4.3758089046882664e-05, + "loss": 0.6305, + "step": 2623 + }, + { + "epoch": 0.6997333333333333, + "grad_norm": 0.3190086924293713, + "learning_rate": 4.368668762494379e-05, + "loss": 0.5638, + "step": 2624 + }, + { + "epoch": 0.7, + "grad_norm": 0.3781872459672398, + "learning_rate": 4.361532821992258e-05, + "loss": 0.5819, + "step": 2625 + }, + { + "epoch": 0.7002666666666667, + "grad_norm": 0.3292043824104552, + "learning_rate": 4.354401088506227e-05, + "loss": 0.5732, + "step": 2626 + }, + { + "epoch": 0.7005333333333333, + "grad_norm": 0.3438481638598106, + "learning_rate": 4.347273567357469e-05, + "loss": 0.5682, + "step": 2627 + }, + { + "epoch": 0.7008, + "grad_norm": 0.3655178951282547, + "learning_rate": 4.340150263864024e-05, + "loss": 0.5838, + "step": 2628 + }, + { + "epoch": 0.7010666666666666, + "grad_norm": 0.3214380053326501, + "learning_rate": 4.333031183340788e-05, + "loss": 0.5686, + "step": 2629 + }, + { + "epoch": 0.7013333333333334, + "grad_norm": 0.3504043643306748, + "learning_rate": 4.3259163310995e-05, + "loss": 0.5956, + "step": 2630 + }, + { + "epoch": 0.7016, + "grad_norm": 0.3202911831798422, + "learning_rate": 4.3188057124487534e-05, + "loss": 0.5668, + "step": 2631 + }, + { + "epoch": 0.7018666666666666, + "grad_norm": 0.34591908224035883, + "learning_rate": 4.31169933269397e-05, + "loss": 0.6084, + "step": 2632 + }, + { + "epoch": 0.7021333333333334, + "grad_norm": 0.32118578511432455, + "learning_rate": 4.304597197137419e-05, + "loss": 0.5919, + "step": 2633 + }, + { + "epoch": 0.7024, + "grad_norm": 0.3365449968041733, + "learning_rate": 4.297499311078199e-05, + "loss": 0.5793, + "step": 2634 + }, + { + "epoch": 0.7026666666666667, + "grad_norm": 0.35972867136611175, + "learning_rate": 4.2904056798122406e-05, + "loss": 0.5699, + "step": 2635 + }, + { + "epoch": 0.7029333333333333, + "grad_norm": 0.3528624894925796, + "learning_rate": 4.283316308632297e-05, + "loss": 0.58, + "step": 2636 + }, + { + "epoch": 0.7032, + "grad_norm": 0.33220790004422346, + "learning_rate": 4.276231202827944e-05, + "loss": 0.5745, + "step": 2637 + }, + { + "epoch": 0.7034666666666667, + "grad_norm": 0.32672276059322863, + "learning_rate": 4.269150367685575e-05, + "loss": 0.5872, + "step": 2638 + }, + { + "epoch": 0.7037333333333333, + "grad_norm": 0.341027566678748, + "learning_rate": 4.262073808488397e-05, + "loss": 0.5641, + "step": 2639 + }, + { + "epoch": 0.704, + "grad_norm": 0.3386046463377781, + "learning_rate": 4.2550015305164245e-05, + "loss": 0.5892, + "step": 2640 + }, + { + "epoch": 0.7042666666666667, + "grad_norm": 0.4366696668404764, + "learning_rate": 4.2479335390464815e-05, + "loss": 0.5479, + "step": 2641 + }, + { + "epoch": 0.7045333333333333, + "grad_norm": 0.31959445453441143, + "learning_rate": 4.2408698393521906e-05, + "loss": 0.5856, + "step": 2642 + }, + { + "epoch": 0.7048, + "grad_norm": 0.3471296666761095, + "learning_rate": 4.233810436703973e-05, + "loss": 0.5676, + "step": 2643 + }, + { + "epoch": 0.7050666666666666, + "grad_norm": 0.34275353742735265, + "learning_rate": 4.226755336369046e-05, + "loss": 0.591, + "step": 2644 + }, + { + "epoch": 0.7053333333333334, + "grad_norm": 0.3456530260951439, + "learning_rate": 4.219704543611412e-05, + "loss": 0.6233, + "step": 2645 + }, + { + "epoch": 0.7056, + "grad_norm": 0.3378058528363393, + "learning_rate": 4.212658063691867e-05, + "loss": 0.5735, + "step": 2646 + }, + { + "epoch": 0.7058666666666666, + "grad_norm": 0.3417206932496116, + "learning_rate": 4.2056159018679774e-05, + "loss": 0.6103, + "step": 2647 + }, + { + "epoch": 0.7061333333333333, + "grad_norm": 0.3231082122786425, + "learning_rate": 4.1985780633940985e-05, + "loss": 0.5614, + "step": 2648 + }, + { + "epoch": 0.7064, + "grad_norm": 0.7189689071660238, + "learning_rate": 4.191544553521355e-05, + "loss": 0.5381, + "step": 2649 + }, + { + "epoch": 0.7066666666666667, + "grad_norm": 0.36053623768076987, + "learning_rate": 4.184515377497643e-05, + "loss": 0.6074, + "step": 2650 + }, + { + "epoch": 0.7069333333333333, + "grad_norm": 0.3486997937899941, + "learning_rate": 4.177490540567626e-05, + "loss": 0.6239, + "step": 2651 + }, + { + "epoch": 0.7072, + "grad_norm": 0.34005580084293835, + "learning_rate": 4.170470047972727e-05, + "loss": 0.5706, + "step": 2652 + }, + { + "epoch": 0.7074666666666667, + "grad_norm": 0.3506047926327882, + "learning_rate": 4.16345390495113e-05, + "loss": 0.5555, + "step": 2653 + }, + { + "epoch": 0.7077333333333333, + "grad_norm": 0.332909996257306, + "learning_rate": 4.1564421167377785e-05, + "loss": 0.5827, + "step": 2654 + }, + { + "epoch": 0.708, + "grad_norm": 0.3283646986773106, + "learning_rate": 4.149434688564352e-05, + "loss": 0.5683, + "step": 2655 + }, + { + "epoch": 0.7082666666666667, + "grad_norm": 0.3417714108882349, + "learning_rate": 4.142431625659291e-05, + "loss": 0.5635, + "step": 2656 + }, + { + "epoch": 0.7085333333333333, + "grad_norm": 0.34258674126155625, + "learning_rate": 4.13543293324777e-05, + "loss": 0.5373, + "step": 2657 + }, + { + "epoch": 0.7088, + "grad_norm": 0.3593930282034007, + "learning_rate": 4.128438616551714e-05, + "loss": 0.6112, + "step": 2658 + }, + { + "epoch": 0.7090666666666666, + "grad_norm": 0.3402599240319362, + "learning_rate": 4.1214486807897726e-05, + "loss": 0.6286, + "step": 2659 + }, + { + "epoch": 0.7093333333333334, + "grad_norm": 0.348463798971244, + "learning_rate": 4.1144631311773296e-05, + "loss": 0.5619, + "step": 2660 + }, + { + "epoch": 0.7096, + "grad_norm": 0.365445806915749, + "learning_rate": 4.1074819729264956e-05, + "loss": 0.6243, + "step": 2661 + }, + { + "epoch": 0.7098666666666666, + "grad_norm": 0.3546416460696886, + "learning_rate": 4.100505211246111e-05, + "loss": 0.5552, + "step": 2662 + }, + { + "epoch": 0.7101333333333333, + "grad_norm": 0.3322707356375658, + "learning_rate": 4.093532851341723e-05, + "loss": 0.6049, + "step": 2663 + }, + { + "epoch": 0.7104, + "grad_norm": 0.35282429367718965, + "learning_rate": 4.0865648984156037e-05, + "loss": 0.599, + "step": 2664 + }, + { + "epoch": 0.7106666666666667, + "grad_norm": 0.34663001757399187, + "learning_rate": 4.079601357666736e-05, + "loss": 0.6044, + "step": 2665 + }, + { + "epoch": 0.7109333333333333, + "grad_norm": 0.37580705308182843, + "learning_rate": 4.072642234290811e-05, + "loss": 0.6294, + "step": 2666 + }, + { + "epoch": 0.7112, + "grad_norm": 0.35212344160202014, + "learning_rate": 4.065687533480221e-05, + "loss": 0.5757, + "step": 2667 + }, + { + "epoch": 0.7114666666666667, + "grad_norm": 0.3430637332492065, + "learning_rate": 4.058737260424062e-05, + "loss": 0.5762, + "step": 2668 + }, + { + "epoch": 0.7117333333333333, + "grad_norm": 0.31636201051778423, + "learning_rate": 4.051791420308125e-05, + "loss": 0.5675, + "step": 2669 + }, + { + "epoch": 0.712, + "grad_norm": 0.33345047326619004, + "learning_rate": 4.0448500183148965e-05, + "loss": 0.5725, + "step": 2670 + }, + { + "epoch": 0.7122666666666667, + "grad_norm": 0.3429815277484449, + "learning_rate": 4.037913059623539e-05, + "loss": 0.5731, + "step": 2671 + }, + { + "epoch": 0.7125333333333334, + "grad_norm": 0.3500791897144474, + "learning_rate": 4.030980549409915e-05, + "loss": 0.5358, + "step": 2672 + }, + { + "epoch": 0.7128, + "grad_norm": 0.32081388886237383, + "learning_rate": 4.02405249284656e-05, + "loss": 0.5809, + "step": 2673 + }, + { + "epoch": 0.7130666666666666, + "grad_norm": 0.3308509347928257, + "learning_rate": 4.0171288951026896e-05, + "loss": 0.5189, + "step": 2674 + }, + { + "epoch": 0.7133333333333334, + "grad_norm": 0.3394308605459211, + "learning_rate": 4.0102097613441916e-05, + "loss": 0.5596, + "step": 2675 + }, + { + "epoch": 0.7136, + "grad_norm": 0.34126220943791, + "learning_rate": 4.0032950967336214e-05, + "loss": 0.5916, + "step": 2676 + }, + { + "epoch": 0.7138666666666666, + "grad_norm": 0.3381538436086013, + "learning_rate": 3.996384906430202e-05, + "loss": 0.5934, + "step": 2677 + }, + { + "epoch": 0.7141333333333333, + "grad_norm": 0.3380942162126229, + "learning_rate": 3.989479195589817e-05, + "loss": 0.5583, + "step": 2678 + }, + { + "epoch": 0.7144, + "grad_norm": 0.33669057744214026, + "learning_rate": 3.9825779693650076e-05, + "loss": 0.5652, + "step": 2679 + }, + { + "epoch": 0.7146666666666667, + "grad_norm": 0.3357000295249724, + "learning_rate": 3.9756812329049706e-05, + "loss": 0.5441, + "step": 2680 + }, + { + "epoch": 0.7149333333333333, + "grad_norm": 0.3210033657002956, + "learning_rate": 3.968788991355552e-05, + "loss": 0.629, + "step": 2681 + }, + { + "epoch": 0.7152, + "grad_norm": 0.34486798191198575, + "learning_rate": 3.961901249859241e-05, + "loss": 0.5862, + "step": 2682 + }, + { + "epoch": 0.7154666666666667, + "grad_norm": 0.3351316950217347, + "learning_rate": 3.955018013555174e-05, + "loss": 0.5364, + "step": 2683 + }, + { + "epoch": 0.7157333333333333, + "grad_norm": 0.3179747319531608, + "learning_rate": 3.948139287579122e-05, + "loss": 0.5639, + "step": 2684 + }, + { + "epoch": 0.716, + "grad_norm": 0.3359153652544578, + "learning_rate": 3.941265077063497e-05, + "loss": 0.5775, + "step": 2685 + }, + { + "epoch": 0.7162666666666667, + "grad_norm": 0.3499153314057488, + "learning_rate": 3.9343953871373306e-05, + "loss": 0.6228, + "step": 2686 + }, + { + "epoch": 0.7165333333333334, + "grad_norm": 0.333132020596246, + "learning_rate": 3.927530222926291e-05, + "loss": 0.5853, + "step": 2687 + }, + { + "epoch": 0.7168, + "grad_norm": 0.3410323919520696, + "learning_rate": 3.9206695895526666e-05, + "loss": 0.5663, + "step": 2688 + }, + { + "epoch": 0.7170666666666666, + "grad_norm": 0.3453388074042059, + "learning_rate": 3.913813492135366e-05, + "loss": 0.5768, + "step": 2689 + }, + { + "epoch": 0.7173333333333334, + "grad_norm": 0.3379574720399497, + "learning_rate": 3.9069619357899137e-05, + "loss": 0.5388, + "step": 2690 + }, + { + "epoch": 0.7176, + "grad_norm": 0.3321920020102771, + "learning_rate": 3.900114925628443e-05, + "loss": 0.5331, + "step": 2691 + }, + { + "epoch": 0.7178666666666667, + "grad_norm": 0.3423234332914018, + "learning_rate": 3.8932724667597e-05, + "loss": 0.5675, + "step": 2692 + }, + { + "epoch": 0.7181333333333333, + "grad_norm": 0.34627067332119427, + "learning_rate": 3.8864345642890285e-05, + "loss": 0.5866, + "step": 2693 + }, + { + "epoch": 0.7184, + "grad_norm": 0.3481216742300727, + "learning_rate": 3.879601223318381e-05, + "loss": 0.541, + "step": 2694 + }, + { + "epoch": 0.7186666666666667, + "grad_norm": 0.3533724827916215, + "learning_rate": 3.872772448946298e-05, + "loss": 0.5586, + "step": 2695 + }, + { + "epoch": 0.7189333333333333, + "grad_norm": 0.3409493010942156, + "learning_rate": 3.8659482462679186e-05, + "loss": 0.5874, + "step": 2696 + }, + { + "epoch": 0.7192, + "grad_norm": 0.33703755634795984, + "learning_rate": 3.8591286203749675e-05, + "loss": 0.6389, + "step": 2697 + }, + { + "epoch": 0.7194666666666667, + "grad_norm": 0.33642926964243336, + "learning_rate": 3.8523135763557586e-05, + "loss": 0.5694, + "step": 2698 + }, + { + "epoch": 0.7197333333333333, + "grad_norm": 0.36002017706901657, + "learning_rate": 3.845503119295182e-05, + "loss": 0.5922, + "step": 2699 + }, + { + "epoch": 0.72, + "grad_norm": 0.33680011289406897, + "learning_rate": 3.838697254274708e-05, + "loss": 0.6074, + "step": 2700 + }, + { + "epoch": 0.7202666666666667, + "grad_norm": 0.3551819088159809, + "learning_rate": 3.8318959863723845e-05, + "loss": 0.6345, + "step": 2701 + }, + { + "epoch": 0.7205333333333334, + "grad_norm": 0.339625447035895, + "learning_rate": 3.8250993206628196e-05, + "loss": 0.6164, + "step": 2702 + }, + { + "epoch": 0.7208, + "grad_norm": 0.33428656437795423, + "learning_rate": 3.8183072622171945e-05, + "loss": 0.5761, + "step": 2703 + }, + { + "epoch": 0.7210666666666666, + "grad_norm": 0.33766082620220056, + "learning_rate": 3.811519816103253e-05, + "loss": 0.5506, + "step": 2704 + }, + { + "epoch": 0.7213333333333334, + "grad_norm": 0.32835872325437937, + "learning_rate": 3.804736987385296e-05, + "loss": 0.5544, + "step": 2705 + }, + { + "epoch": 0.7216, + "grad_norm": 0.32836417353156705, + "learning_rate": 3.7979587811241776e-05, + "loss": 0.5823, + "step": 2706 + }, + { + "epoch": 0.7218666666666667, + "grad_norm": 0.3247183003730969, + "learning_rate": 3.791185202377308e-05, + "loss": 0.5952, + "step": 2707 + }, + { + "epoch": 0.7221333333333333, + "grad_norm": 0.34706684943085486, + "learning_rate": 3.7844162561986386e-05, + "loss": 0.5767, + "step": 2708 + }, + { + "epoch": 0.7224, + "grad_norm": 0.36355256307156913, + "learning_rate": 3.777651947638672e-05, + "loss": 0.5756, + "step": 2709 + }, + { + "epoch": 0.7226666666666667, + "grad_norm": 0.34411426320034616, + "learning_rate": 3.770892281744438e-05, + "loss": 0.589, + "step": 2710 + }, + { + "epoch": 0.7229333333333333, + "grad_norm": 0.32949175673002024, + "learning_rate": 3.764137263559514e-05, + "loss": 0.5701, + "step": 2711 + }, + { + "epoch": 0.7232, + "grad_norm": 0.3386470837306449, + "learning_rate": 3.7573868981240055e-05, + "loss": 0.5772, + "step": 2712 + }, + { + "epoch": 0.7234666666666667, + "grad_norm": 0.328897677315379, + "learning_rate": 3.750641190474543e-05, + "loss": 0.5869, + "step": 2713 + }, + { + "epoch": 0.7237333333333333, + "grad_norm": 0.3441502977221793, + "learning_rate": 3.743900145644292e-05, + "loss": 0.5845, + "step": 2714 + }, + { + "epoch": 0.724, + "grad_norm": 0.3245188830355559, + "learning_rate": 3.737163768662929e-05, + "loss": 0.6009, + "step": 2715 + }, + { + "epoch": 0.7242666666666666, + "grad_norm": 0.368042509101508, + "learning_rate": 3.730432064556655e-05, + "loss": 0.5984, + "step": 2716 + }, + { + "epoch": 0.7245333333333334, + "grad_norm": 0.3092323120645295, + "learning_rate": 3.723705038348172e-05, + "loss": 0.5397, + "step": 2717 + }, + { + "epoch": 0.7248, + "grad_norm": 0.35887390833634286, + "learning_rate": 3.716982695056705e-05, + "loss": 0.5779, + "step": 2718 + }, + { + "epoch": 0.7250666666666666, + "grad_norm": 0.36867587051516315, + "learning_rate": 3.7102650396979775e-05, + "loss": 0.603, + "step": 2719 + }, + { + "epoch": 0.7253333333333334, + "grad_norm": 0.3504788968425937, + "learning_rate": 3.7035520772842215e-05, + "loss": 0.6275, + "step": 2720 + }, + { + "epoch": 0.7256, + "grad_norm": 0.35438892006742084, + "learning_rate": 3.69684381282416e-05, + "loss": 0.5881, + "step": 2721 + }, + { + "epoch": 0.7258666666666667, + "grad_norm": 0.34325493506533183, + "learning_rate": 3.6901402513230176e-05, + "loss": 0.5949, + "step": 2722 + }, + { + "epoch": 0.7261333333333333, + "grad_norm": 0.3180607835872949, + "learning_rate": 3.683441397782504e-05, + "loss": 0.5316, + "step": 2723 + }, + { + "epoch": 0.7264, + "grad_norm": 0.3371022609370693, + "learning_rate": 3.676747257200824e-05, + "loss": 0.6321, + "step": 2724 + }, + { + "epoch": 0.7266666666666667, + "grad_norm": 0.34294720467767303, + "learning_rate": 3.670057834572653e-05, + "loss": 0.6166, + "step": 2725 + }, + { + "epoch": 0.7269333333333333, + "grad_norm": 0.34601852533763955, + "learning_rate": 3.6633731348891576e-05, + "loss": 0.5642, + "step": 2726 + }, + { + "epoch": 0.7272, + "grad_norm": 0.3291346370127747, + "learning_rate": 3.656693163137978e-05, + "loss": 0.5935, + "step": 2727 + }, + { + "epoch": 0.7274666666666667, + "grad_norm": 0.3482549678259148, + "learning_rate": 3.650017924303223e-05, + "loss": 0.5638, + "step": 2728 + }, + { + "epoch": 0.7277333333333333, + "grad_norm": 0.35577092272633354, + "learning_rate": 3.6433474233654755e-05, + "loss": 0.5621, + "step": 2729 + }, + { + "epoch": 0.728, + "grad_norm": 0.3500587318617331, + "learning_rate": 3.636681665301779e-05, + "loss": 0.6056, + "step": 2730 + }, + { + "epoch": 0.7282666666666666, + "grad_norm": 0.3259015848550714, + "learning_rate": 3.630020655085638e-05, + "loss": 0.5866, + "step": 2731 + }, + { + "epoch": 0.7285333333333334, + "grad_norm": 0.3421158499942335, + "learning_rate": 3.623364397687021e-05, + "loss": 0.5913, + "step": 2732 + }, + { + "epoch": 0.7288, + "grad_norm": 0.3527844139938932, + "learning_rate": 3.616712898072341e-05, + "loss": 0.6093, + "step": 2733 + }, + { + "epoch": 0.7290666666666666, + "grad_norm": 0.3540040986675042, + "learning_rate": 3.6100661612044674e-05, + "loss": 0.6377, + "step": 2734 + }, + { + "epoch": 0.7293333333333333, + "grad_norm": 0.33951088152217124, + "learning_rate": 3.6034241920427146e-05, + "loss": 0.586, + "step": 2735 + }, + { + "epoch": 0.7296, + "grad_norm": 0.36040966386816387, + "learning_rate": 3.596786995542838e-05, + "loss": 0.575, + "step": 2736 + }, + { + "epoch": 0.7298666666666667, + "grad_norm": 0.35036495557156944, + "learning_rate": 3.590154576657033e-05, + "loss": 0.5708, + "step": 2737 + }, + { + "epoch": 0.7301333333333333, + "grad_norm": 0.3402095274506431, + "learning_rate": 3.583526940333932e-05, + "loss": 0.5944, + "step": 2738 + }, + { + "epoch": 0.7304, + "grad_norm": 0.31256515109998495, + "learning_rate": 3.576904091518597e-05, + "loss": 0.5227, + "step": 2739 + }, + { + "epoch": 0.7306666666666667, + "grad_norm": 0.3341459090386711, + "learning_rate": 3.5702860351525216e-05, + "loss": 0.6042, + "step": 2740 + }, + { + "epoch": 0.7309333333333333, + "grad_norm": 0.321061397417268, + "learning_rate": 3.563672776173613e-05, + "loss": 0.559, + "step": 2741 + }, + { + "epoch": 0.7312, + "grad_norm": 0.3302288284779727, + "learning_rate": 3.557064319516211e-05, + "loss": 0.5772, + "step": 2742 + }, + { + "epoch": 0.7314666666666667, + "grad_norm": 0.3587254813301968, + "learning_rate": 3.5504606701110674e-05, + "loss": 0.5616, + "step": 2743 + }, + { + "epoch": 0.7317333333333333, + "grad_norm": 0.33612652088475176, + "learning_rate": 3.5438618328853466e-05, + "loss": 0.5686, + "step": 2744 + }, + { + "epoch": 0.732, + "grad_norm": 0.32337776100716026, + "learning_rate": 3.5372678127626236e-05, + "loss": 0.5622, + "step": 2745 + }, + { + "epoch": 0.7322666666666666, + "grad_norm": 0.31930456103097127, + "learning_rate": 3.5306786146628803e-05, + "loss": 0.5552, + "step": 2746 + }, + { + "epoch": 0.7325333333333334, + "grad_norm": 0.3533114791890799, + "learning_rate": 3.524094243502497e-05, + "loss": 0.5946, + "step": 2747 + }, + { + "epoch": 0.7328, + "grad_norm": 0.3349334612823938, + "learning_rate": 3.517514704194256e-05, + "loss": 0.5825, + "step": 2748 + }, + { + "epoch": 0.7330666666666666, + "grad_norm": 0.35261757290925116, + "learning_rate": 3.510940001647334e-05, + "loss": 0.6046, + "step": 2749 + }, + { + "epoch": 0.7333333333333333, + "grad_norm": 0.3373037734584713, + "learning_rate": 3.504370140767297e-05, + "loss": 0.6056, + "step": 2750 + }, + { + "epoch": 0.7336, + "grad_norm": 0.3353647503081956, + "learning_rate": 3.4978051264561e-05, + "loss": 0.5732, + "step": 2751 + }, + { + "epoch": 0.7338666666666667, + "grad_norm": 0.3520589217906225, + "learning_rate": 3.491244963612082e-05, + "loss": 0.5711, + "step": 2752 + }, + { + "epoch": 0.7341333333333333, + "grad_norm": 0.3543274292844667, + "learning_rate": 3.4846896571299615e-05, + "loss": 0.6096, + "step": 2753 + }, + { + "epoch": 0.7344, + "grad_norm": 0.36631220619876814, + "learning_rate": 3.478139211900833e-05, + "loss": 0.6253, + "step": 2754 + }, + { + "epoch": 0.7346666666666667, + "grad_norm": 0.5896318232010979, + "learning_rate": 3.471593632812169e-05, + "loss": 0.5979, + "step": 2755 + }, + { + "epoch": 0.7349333333333333, + "grad_norm": 0.34658342927525737, + "learning_rate": 3.465052924747799e-05, + "loss": 0.6239, + "step": 2756 + }, + { + "epoch": 0.7352, + "grad_norm": 0.33573335206497024, + "learning_rate": 3.458517092587931e-05, + "loss": 0.5616, + "step": 2757 + }, + { + "epoch": 0.7354666666666667, + "grad_norm": 0.3322460637850616, + "learning_rate": 3.451986141209128e-05, + "loss": 0.6083, + "step": 2758 + }, + { + "epoch": 0.7357333333333334, + "grad_norm": 0.3606929380552638, + "learning_rate": 3.445460075484315e-05, + "loss": 0.6021, + "step": 2759 + }, + { + "epoch": 0.736, + "grad_norm": 0.33386289228713206, + "learning_rate": 3.438938900282768e-05, + "loss": 0.5762, + "step": 2760 + }, + { + "epoch": 0.7362666666666666, + "grad_norm": 0.3754182581468279, + "learning_rate": 3.432422620470117e-05, + "loss": 0.5654, + "step": 2761 + }, + { + "epoch": 0.7365333333333334, + "grad_norm": 0.3580448255914008, + "learning_rate": 3.425911240908338e-05, + "loss": 0.538, + "step": 2762 + }, + { + "epoch": 0.7368, + "grad_norm": 0.348729716974533, + "learning_rate": 3.419404766455755e-05, + "loss": 0.5977, + "step": 2763 + }, + { + "epoch": 0.7370666666666666, + "grad_norm": 0.36504385827184765, + "learning_rate": 3.412903201967022e-05, + "loss": 0.5955, + "step": 2764 + }, + { + "epoch": 0.7373333333333333, + "grad_norm": 0.36453391448094125, + "learning_rate": 3.4064065522931364e-05, + "loss": 0.5864, + "step": 2765 + }, + { + "epoch": 0.7376, + "grad_norm": 0.34426344395579583, + "learning_rate": 3.3999148222814316e-05, + "loss": 0.5753, + "step": 2766 + }, + { + "epoch": 0.7378666666666667, + "grad_norm": 0.3516850748631634, + "learning_rate": 3.393428016775565e-05, + "loss": 0.5759, + "step": 2767 + }, + { + "epoch": 0.7381333333333333, + "grad_norm": 0.3628796219858617, + "learning_rate": 3.386946140615517e-05, + "loss": 0.6167, + "step": 2768 + }, + { + "epoch": 0.7384, + "grad_norm": 0.33271508301085917, + "learning_rate": 3.3804691986376034e-05, + "loss": 0.5604, + "step": 2769 + }, + { + "epoch": 0.7386666666666667, + "grad_norm": 0.34987970715201805, + "learning_rate": 3.373997195674444e-05, + "loss": 0.5855, + "step": 2770 + }, + { + "epoch": 0.7389333333333333, + "grad_norm": 0.4009078897492756, + "learning_rate": 3.367530136554984e-05, + "loss": 0.675, + "step": 2771 + }, + { + "epoch": 0.7392, + "grad_norm": 0.355531287267289, + "learning_rate": 3.361068026104466e-05, + "loss": 0.5919, + "step": 2772 + }, + { + "epoch": 0.7394666666666667, + "grad_norm": 0.36164773264924877, + "learning_rate": 3.3546108691444544e-05, + "loss": 0.5489, + "step": 2773 + }, + { + "epoch": 0.7397333333333334, + "grad_norm": 0.35484373258109436, + "learning_rate": 3.3481586704928123e-05, + "loss": 0.575, + "step": 2774 + }, + { + "epoch": 0.74, + "grad_norm": 0.3603856426092299, + "learning_rate": 3.341711434963703e-05, + "loss": 0.5583, + "step": 2775 + }, + { + "epoch": 0.7402666666666666, + "grad_norm": 0.35102099841871875, + "learning_rate": 3.335269167367586e-05, + "loss": 0.5972, + "step": 2776 + }, + { + "epoch": 0.7405333333333334, + "grad_norm": 0.35954375592491183, + "learning_rate": 3.328831872511216e-05, + "loss": 0.6176, + "step": 2777 + }, + { + "epoch": 0.7408, + "grad_norm": 0.3203867630802601, + "learning_rate": 3.3223995551976364e-05, + "loss": 0.5894, + "step": 2778 + }, + { + "epoch": 0.7410666666666667, + "grad_norm": 0.3418286289943729, + "learning_rate": 3.315972220226179e-05, + "loss": 0.5756, + "step": 2779 + }, + { + "epoch": 0.7413333333333333, + "grad_norm": 0.3400885588179178, + "learning_rate": 3.309549872392451e-05, + "loss": 0.5612, + "step": 2780 + }, + { + "epoch": 0.7416, + "grad_norm": 0.3588257717229326, + "learning_rate": 3.3031325164883466e-05, + "loss": 0.5906, + "step": 2781 + }, + { + "epoch": 0.7418666666666667, + "grad_norm": 0.339092122833708, + "learning_rate": 3.296720157302031e-05, + "loss": 0.5604, + "step": 2782 + }, + { + "epoch": 0.7421333333333333, + "grad_norm": 0.3808127497745609, + "learning_rate": 3.290312799617944e-05, + "loss": 0.6332, + "step": 2783 + }, + { + "epoch": 0.7424, + "grad_norm": 0.3393912738713724, + "learning_rate": 3.2839104482167914e-05, + "loss": 0.5602, + "step": 2784 + }, + { + "epoch": 0.7426666666666667, + "grad_norm": 0.3683834269616866, + "learning_rate": 3.277513107875544e-05, + "loss": 0.6171, + "step": 2785 + }, + { + "epoch": 0.7429333333333333, + "grad_norm": 0.3310670819530194, + "learning_rate": 3.271120783367435e-05, + "loss": 0.5278, + "step": 2786 + }, + { + "epoch": 0.7432, + "grad_norm": 0.3445808893867133, + "learning_rate": 3.264733479461953e-05, + "loss": 0.577, + "step": 2787 + }, + { + "epoch": 0.7434666666666667, + "grad_norm": 0.36301230687627056, + "learning_rate": 3.258351200924844e-05, + "loss": 0.6084, + "step": 2788 + }, + { + "epoch": 0.7437333333333334, + "grad_norm": 0.3449804768380338, + "learning_rate": 3.2519739525181007e-05, + "loss": 0.5664, + "step": 2789 + }, + { + "epoch": 0.744, + "grad_norm": 0.34825749563458047, + "learning_rate": 3.245601738999964e-05, + "loss": 0.5736, + "step": 2790 + }, + { + "epoch": 0.7442666666666666, + "grad_norm": 0.34240059510136145, + "learning_rate": 3.239234565124919e-05, + "loss": 0.5492, + "step": 2791 + }, + { + "epoch": 0.7445333333333334, + "grad_norm": 0.3562829720005136, + "learning_rate": 3.23287243564369e-05, + "loss": 0.5861, + "step": 2792 + }, + { + "epoch": 0.7448, + "grad_norm": 0.3223001854588283, + "learning_rate": 3.226515355303237e-05, + "loss": 0.5926, + "step": 2793 + }, + { + "epoch": 0.7450666666666667, + "grad_norm": 0.3206364442570673, + "learning_rate": 3.220163328846757e-05, + "loss": 0.5691, + "step": 2794 + }, + { + "epoch": 0.7453333333333333, + "grad_norm": 0.3571916770068726, + "learning_rate": 3.2138163610136665e-05, + "loss": 0.6011, + "step": 2795 + }, + { + "epoch": 0.7456, + "grad_norm": 0.3550758504410085, + "learning_rate": 3.207474456539615e-05, + "loss": 0.5645, + "step": 2796 + }, + { + "epoch": 0.7458666666666667, + "grad_norm": 0.3429306121438903, + "learning_rate": 3.201137620156475e-05, + "loss": 0.5494, + "step": 2797 + }, + { + "epoch": 0.7461333333333333, + "grad_norm": 0.3578267104696251, + "learning_rate": 3.1948058565923324e-05, + "loss": 0.6124, + "step": 2798 + }, + { + "epoch": 0.7464, + "grad_norm": 0.35810356477423677, + "learning_rate": 3.188479170571493e-05, + "loss": 0.5364, + "step": 2799 + }, + { + "epoch": 0.7466666666666667, + "grad_norm": 0.3433763452975484, + "learning_rate": 3.182157566814471e-05, + "loss": 0.5747, + "step": 2800 + }, + { + "epoch": 0.7469333333333333, + "grad_norm": 0.3419644464151213, + "learning_rate": 3.17584105003799e-05, + "loss": 0.5917, + "step": 2801 + }, + { + "epoch": 0.7472, + "grad_norm": 0.32329900358403585, + "learning_rate": 3.16952962495498e-05, + "loss": 0.5778, + "step": 2802 + }, + { + "epoch": 0.7474666666666666, + "grad_norm": 0.32813276016351867, + "learning_rate": 3.163223296274561e-05, + "loss": 0.5294, + "step": 2803 + }, + { + "epoch": 0.7477333333333334, + "grad_norm": 0.34099670470381055, + "learning_rate": 3.1569220687020675e-05, + "loss": 0.5689, + "step": 2804 + }, + { + "epoch": 0.748, + "grad_norm": 0.35336381033368786, + "learning_rate": 3.1506259469390173e-05, + "loss": 0.6004, + "step": 2805 + }, + { + "epoch": 0.7482666666666666, + "grad_norm": 0.4062749773569371, + "learning_rate": 3.144334935683121e-05, + "loss": 0.5738, + "step": 2806 + }, + { + "epoch": 0.7485333333333334, + "grad_norm": 0.32575080749610674, + "learning_rate": 3.138049039628273e-05, + "loss": 0.5714, + "step": 2807 + }, + { + "epoch": 0.7488, + "grad_norm": 0.3572483766931299, + "learning_rate": 3.1317682634645586e-05, + "loss": 0.6132, + "step": 2808 + }, + { + "epoch": 0.7490666666666667, + "grad_norm": 0.3327769902717611, + "learning_rate": 3.1254926118782346e-05, + "loss": 0.57, + "step": 2809 + }, + { + "epoch": 0.7493333333333333, + "grad_norm": 0.32477417536389414, + "learning_rate": 3.119222089551743e-05, + "loss": 0.5952, + "step": 2810 + }, + { + "epoch": 0.7496, + "grad_norm": 0.4144458561520922, + "learning_rate": 3.1129567011636875e-05, + "loss": 0.5897, + "step": 2811 + }, + { + "epoch": 0.7498666666666667, + "grad_norm": 0.33359936621612046, + "learning_rate": 3.1066964513888486e-05, + "loss": 0.5574, + "step": 2812 + }, + { + "epoch": 0.7501333333333333, + "grad_norm": 0.34162445655538076, + "learning_rate": 3.1004413448981726e-05, + "loss": 0.5775, + "step": 2813 + }, + { + "epoch": 0.7504, + "grad_norm": 0.3745261818377158, + "learning_rate": 3.094191386358768e-05, + "loss": 0.6403, + "step": 2814 + }, + { + "epoch": 0.7506666666666667, + "grad_norm": 0.3381915213786633, + "learning_rate": 3.0879465804339016e-05, + "loss": 0.5716, + "step": 2815 + }, + { + "epoch": 0.7509333333333333, + "grad_norm": 0.3553817135977034, + "learning_rate": 3.081706931782994e-05, + "loss": 0.5936, + "step": 2816 + }, + { + "epoch": 0.7512, + "grad_norm": 0.3392212116679093, + "learning_rate": 3.07547244506162e-05, + "loss": 0.5675, + "step": 2817 + }, + { + "epoch": 0.7514666666666666, + "grad_norm": 0.3419158329222798, + "learning_rate": 3.069243124921507e-05, + "loss": 0.5904, + "step": 2818 + }, + { + "epoch": 0.7517333333333334, + "grad_norm": 0.362236353952747, + "learning_rate": 3.063018976010514e-05, + "loss": 0.6215, + "step": 2819 + }, + { + "epoch": 0.752, + "grad_norm": 0.32366659489489924, + "learning_rate": 3.056800002972655e-05, + "loss": 0.6029, + "step": 2820 + }, + { + "epoch": 0.7522666666666666, + "grad_norm": 0.3327667473773849, + "learning_rate": 3.0505862104480787e-05, + "loss": 0.5427, + "step": 2821 + }, + { + "epoch": 0.7525333333333334, + "grad_norm": 0.333461734242426, + "learning_rate": 3.0443776030730653e-05, + "loss": 0.5664, + "step": 2822 + }, + { + "epoch": 0.7528, + "grad_norm": 0.3528611971703985, + "learning_rate": 3.0381741854800283e-05, + "loss": 0.5916, + "step": 2823 + }, + { + "epoch": 0.7530666666666667, + "grad_norm": 0.3432974171601048, + "learning_rate": 3.0319759622975062e-05, + "loss": 0.5779, + "step": 2824 + }, + { + "epoch": 0.7533333333333333, + "grad_norm": 0.3359747156313507, + "learning_rate": 3.0257829381501725e-05, + "loss": 0.5797, + "step": 2825 + }, + { + "epoch": 0.7536, + "grad_norm": 0.33007529909848426, + "learning_rate": 3.019595117658811e-05, + "loss": 0.5848, + "step": 2826 + }, + { + "epoch": 0.7538666666666667, + "grad_norm": 0.33089153643306335, + "learning_rate": 3.0134125054403207e-05, + "loss": 0.6071, + "step": 2827 + }, + { + "epoch": 0.7541333333333333, + "grad_norm": 0.35421218210284616, + "learning_rate": 3.0072351061077208e-05, + "loss": 0.6004, + "step": 2828 + }, + { + "epoch": 0.7544, + "grad_norm": 0.34489451227933937, + "learning_rate": 3.0010629242701417e-05, + "loss": 0.5913, + "step": 2829 + }, + { + "epoch": 0.7546666666666667, + "grad_norm": 0.3467533873523663, + "learning_rate": 2.9948959645328177e-05, + "loss": 0.5669, + "step": 2830 + }, + { + "epoch": 0.7549333333333333, + "grad_norm": 0.3348711053997264, + "learning_rate": 2.9887342314970878e-05, + "loss": 0.5327, + "step": 2831 + }, + { + "epoch": 0.7552, + "grad_norm": 0.33545208915497077, + "learning_rate": 2.982577729760392e-05, + "loss": 0.5621, + "step": 2832 + }, + { + "epoch": 0.7554666666666666, + "grad_norm": 0.3596487109307188, + "learning_rate": 2.9764264639162677e-05, + "loss": 0.6048, + "step": 2833 + }, + { + "epoch": 0.7557333333333334, + "grad_norm": 0.4636999773283181, + "learning_rate": 2.970280438554339e-05, + "loss": 0.6043, + "step": 2834 + }, + { + "epoch": 0.756, + "grad_norm": 0.34265894916271833, + "learning_rate": 2.9641396582603288e-05, + "loss": 0.5726, + "step": 2835 + }, + { + "epoch": 0.7562666666666666, + "grad_norm": 0.34947764799702663, + "learning_rate": 2.958004127616042e-05, + "loss": 0.5698, + "step": 2836 + }, + { + "epoch": 0.7565333333333333, + "grad_norm": 0.3363454553598191, + "learning_rate": 2.9518738511993683e-05, + "loss": 0.5546, + "step": 2837 + }, + { + "epoch": 0.7568, + "grad_norm": 0.3735086429816798, + "learning_rate": 2.9457488335842754e-05, + "loss": 0.5699, + "step": 2838 + }, + { + "epoch": 0.7570666666666667, + "grad_norm": 0.35097422993851685, + "learning_rate": 2.939629079340809e-05, + "loss": 0.5462, + "step": 2839 + }, + { + "epoch": 0.7573333333333333, + "grad_norm": 0.33511453700242655, + "learning_rate": 2.9335145930350848e-05, + "loss": 0.5861, + "step": 2840 + }, + { + "epoch": 0.7576, + "grad_norm": 0.3448492542930926, + "learning_rate": 2.927405379229292e-05, + "loss": 0.5544, + "step": 2841 + }, + { + "epoch": 0.7578666666666667, + "grad_norm": 0.3407360241982918, + "learning_rate": 2.9213014424816843e-05, + "loss": 0.5725, + "step": 2842 + }, + { + "epoch": 0.7581333333333333, + "grad_norm": 0.34268534583443067, + "learning_rate": 2.9152027873465747e-05, + "loss": 0.5569, + "step": 2843 + }, + { + "epoch": 0.7584, + "grad_norm": 0.35369337878939944, + "learning_rate": 2.9091094183743405e-05, + "loss": 0.5904, + "step": 2844 + }, + { + "epoch": 0.7586666666666667, + "grad_norm": 0.3544851327470538, + "learning_rate": 2.9030213401114127e-05, + "loss": 0.568, + "step": 2845 + }, + { + "epoch": 0.7589333333333333, + "grad_norm": 0.3664544687067838, + "learning_rate": 2.8969385571002728e-05, + "loss": 0.591, + "step": 2846 + }, + { + "epoch": 0.7592, + "grad_norm": 0.3366042280225982, + "learning_rate": 2.890861073879454e-05, + "loss": 0.5307, + "step": 2847 + }, + { + "epoch": 0.7594666666666666, + "grad_norm": 0.36078579440370795, + "learning_rate": 2.8847888949835357e-05, + "loss": 0.6049, + "step": 2848 + }, + { + "epoch": 0.7597333333333334, + "grad_norm": 0.33135072310074737, + "learning_rate": 2.878722024943139e-05, + "loss": 0.5966, + "step": 2849 + }, + { + "epoch": 0.76, + "grad_norm": 0.3446854577131219, + "learning_rate": 2.872660468284919e-05, + "loss": 0.5407, + "step": 2850 + }, + { + "epoch": 0.7602666666666666, + "grad_norm": 0.3452087501658542, + "learning_rate": 2.866604229531573e-05, + "loss": 0.5547, + "step": 2851 + }, + { + "epoch": 0.7605333333333333, + "grad_norm": 0.35852385022308303, + "learning_rate": 2.860553313201828e-05, + "loss": 0.6115, + "step": 2852 + }, + { + "epoch": 0.7608, + "grad_norm": 0.33154218446520856, + "learning_rate": 2.854507723810439e-05, + "loss": 0.6102, + "step": 2853 + }, + { + "epoch": 0.7610666666666667, + "grad_norm": 0.33192405887435444, + "learning_rate": 2.8484674658681887e-05, + "loss": 0.551, + "step": 2854 + }, + { + "epoch": 0.7613333333333333, + "grad_norm": 0.36128934018441156, + "learning_rate": 2.8424325438818798e-05, + "loss": 0.5719, + "step": 2855 + }, + { + "epoch": 0.7616, + "grad_norm": 0.3482992086305832, + "learning_rate": 2.8364029623543342e-05, + "loss": 0.5652, + "step": 2856 + }, + { + "epoch": 0.7618666666666667, + "grad_norm": 0.35876474657284396, + "learning_rate": 2.8303787257843917e-05, + "loss": 0.5911, + "step": 2857 + }, + { + "epoch": 0.7621333333333333, + "grad_norm": 0.33025684531166216, + "learning_rate": 2.8243598386668924e-05, + "loss": 0.5706, + "step": 2858 + }, + { + "epoch": 0.7624, + "grad_norm": 0.3370857793562149, + "learning_rate": 2.8183463054927052e-05, + "loss": 0.5878, + "step": 2859 + }, + { + "epoch": 0.7626666666666667, + "grad_norm": 0.3416894819537489, + "learning_rate": 2.8123381307486872e-05, + "loss": 0.5711, + "step": 2860 + }, + { + "epoch": 0.7629333333333334, + "grad_norm": 0.3341183247416704, + "learning_rate": 2.8063353189177065e-05, + "loss": 0.5497, + "step": 2861 + }, + { + "epoch": 0.7632, + "grad_norm": 0.35025357015705827, + "learning_rate": 2.8003378744786245e-05, + "loss": 0.5404, + "step": 2862 + }, + { + "epoch": 0.7634666666666666, + "grad_norm": 0.3406201446929645, + "learning_rate": 2.7943458019063018e-05, + "loss": 0.5852, + "step": 2863 + }, + { + "epoch": 0.7637333333333334, + "grad_norm": 0.35764609510961687, + "learning_rate": 2.7883591056715887e-05, + "loss": 0.568, + "step": 2864 + }, + { + "epoch": 0.764, + "grad_norm": 0.3555753793060286, + "learning_rate": 2.7823777902413272e-05, + "loss": 0.5547, + "step": 2865 + }, + { + "epoch": 0.7642666666666666, + "grad_norm": 0.3417393649148862, + "learning_rate": 2.776401860078337e-05, + "loss": 0.5791, + "step": 2866 + }, + { + "epoch": 0.7645333333333333, + "grad_norm": 0.4187043470260362, + "learning_rate": 2.7704313196414266e-05, + "loss": 0.6243, + "step": 2867 + }, + { + "epoch": 0.7648, + "grad_norm": 0.3367323662310071, + "learning_rate": 2.7644661733853804e-05, + "loss": 0.5751, + "step": 2868 + }, + { + "epoch": 0.7650666666666667, + "grad_norm": 0.33716347882532866, + "learning_rate": 2.7585064257609607e-05, + "loss": 0.5587, + "step": 2869 + }, + { + "epoch": 0.7653333333333333, + "grad_norm": 0.3488560446831909, + "learning_rate": 2.7525520812148987e-05, + "loss": 0.6056, + "step": 2870 + }, + { + "epoch": 0.7656, + "grad_norm": 0.3501423965078887, + "learning_rate": 2.7466031441898955e-05, + "loss": 0.604, + "step": 2871 + }, + { + "epoch": 0.7658666666666667, + "grad_norm": 0.35556419708512693, + "learning_rate": 2.7406596191246204e-05, + "loss": 0.5783, + "step": 2872 + }, + { + "epoch": 0.7661333333333333, + "grad_norm": 0.3336847500086423, + "learning_rate": 2.734721510453695e-05, + "loss": 0.5423, + "step": 2873 + }, + { + "epoch": 0.7664, + "grad_norm": 0.34399136002504593, + "learning_rate": 2.7287888226077106e-05, + "loss": 0.572, + "step": 2874 + }, + { + "epoch": 0.7666666666666667, + "grad_norm": 0.35364208377429673, + "learning_rate": 2.722861560013208e-05, + "loss": 0.6032, + "step": 2875 + }, + { + "epoch": 0.7669333333333334, + "grad_norm": 0.34528890196523426, + "learning_rate": 2.716939727092682e-05, + "loss": 0.5994, + "step": 2876 + }, + { + "epoch": 0.7672, + "grad_norm": 0.37059081484690026, + "learning_rate": 2.7110233282645757e-05, + "loss": 0.5842, + "step": 2877 + }, + { + "epoch": 0.7674666666666666, + "grad_norm": 0.33942610536200135, + "learning_rate": 2.7051123679432776e-05, + "loss": 0.596, + "step": 2878 + }, + { + "epoch": 0.7677333333333334, + "grad_norm": 0.34570312362908784, + "learning_rate": 2.6992068505391198e-05, + "loss": 0.5324, + "step": 2879 + }, + { + "epoch": 0.768, + "grad_norm": 0.37058252090350524, + "learning_rate": 2.693306780458369e-05, + "loss": 0.6221, + "step": 2880 + }, + { + "epoch": 0.7682666666666667, + "grad_norm": 0.3479922325256641, + "learning_rate": 2.6874121621032334e-05, + "loss": 0.5957, + "step": 2881 + }, + { + "epoch": 0.7685333333333333, + "grad_norm": 0.34677840493325257, + "learning_rate": 2.6815229998718492e-05, + "loss": 0.6364, + "step": 2882 + }, + { + "epoch": 0.7688, + "grad_norm": 0.3397310545189047, + "learning_rate": 2.6756392981582835e-05, + "loss": 0.6255, + "step": 2883 + }, + { + "epoch": 0.7690666666666667, + "grad_norm": 0.3542887275124184, + "learning_rate": 2.669761061352527e-05, + "loss": 0.6087, + "step": 2884 + }, + { + "epoch": 0.7693333333333333, + "grad_norm": 0.34850317816204396, + "learning_rate": 2.6638882938404964e-05, + "loss": 0.5781, + "step": 2885 + }, + { + "epoch": 0.7696, + "grad_norm": 0.32586211658065056, + "learning_rate": 2.6580210000040252e-05, + "loss": 0.5624, + "step": 2886 + }, + { + "epoch": 0.7698666666666667, + "grad_norm": 0.3262727265793471, + "learning_rate": 2.6521591842208636e-05, + "loss": 0.6027, + "step": 2887 + }, + { + "epoch": 0.7701333333333333, + "grad_norm": 0.3805206490097053, + "learning_rate": 2.646302850864677e-05, + "loss": 0.5928, + "step": 2888 + }, + { + "epoch": 0.7704, + "grad_norm": 0.4774418088688516, + "learning_rate": 2.6404520043050316e-05, + "loss": 0.5621, + "step": 2889 + }, + { + "epoch": 0.7706666666666667, + "grad_norm": 0.3590649841143849, + "learning_rate": 2.6346066489074085e-05, + "loss": 0.6163, + "step": 2890 + }, + { + "epoch": 0.7709333333333334, + "grad_norm": 0.34624724191296896, + "learning_rate": 2.628766789033188e-05, + "loss": 0.592, + "step": 2891 + }, + { + "epoch": 0.7712, + "grad_norm": 0.35398341996013827, + "learning_rate": 2.6229324290396517e-05, + "loss": 0.5896, + "step": 2892 + }, + { + "epoch": 0.7714666666666666, + "grad_norm": 0.34491124969305736, + "learning_rate": 2.6171035732799766e-05, + "loss": 0.5893, + "step": 2893 + }, + { + "epoch": 0.7717333333333334, + "grad_norm": 0.3273373885550423, + "learning_rate": 2.6112802261032333e-05, + "loss": 0.5547, + "step": 2894 + }, + { + "epoch": 0.772, + "grad_norm": 0.35819953877359534, + "learning_rate": 2.6054623918543818e-05, + "loss": 0.5951, + "step": 2895 + }, + { + "epoch": 0.7722666666666667, + "grad_norm": 0.3593055667000119, + "learning_rate": 2.5996500748742693e-05, + "loss": 0.5758, + "step": 2896 + }, + { + "epoch": 0.7725333333333333, + "grad_norm": 0.3500869697127036, + "learning_rate": 2.5938432794996247e-05, + "loss": 0.5674, + "step": 2897 + }, + { + "epoch": 0.7728, + "grad_norm": 0.3337593031098043, + "learning_rate": 2.58804201006306e-05, + "loss": 0.5221, + "step": 2898 + }, + { + "epoch": 0.7730666666666667, + "grad_norm": 0.31726377157544783, + "learning_rate": 2.5822462708930607e-05, + "loss": 0.5316, + "step": 2899 + }, + { + "epoch": 0.7733333333333333, + "grad_norm": 0.38309907455470354, + "learning_rate": 2.5764560663139893e-05, + "loss": 0.6158, + "step": 2900 + }, + { + "epoch": 0.7736, + "grad_norm": 0.34890471898909503, + "learning_rate": 2.5706714006460775e-05, + "loss": 0.5539, + "step": 2901 + }, + { + "epoch": 0.7738666666666667, + "grad_norm": 0.3217838755652875, + "learning_rate": 2.564892278205423e-05, + "loss": 0.5345, + "step": 2902 + }, + { + "epoch": 0.7741333333333333, + "grad_norm": 0.36001095560304475, + "learning_rate": 2.55911870330399e-05, + "loss": 0.5807, + "step": 2903 + }, + { + "epoch": 0.7744, + "grad_norm": 0.3198848270430251, + "learning_rate": 2.553350680249603e-05, + "loss": 0.5438, + "step": 2904 + }, + { + "epoch": 0.7746666666666666, + "grad_norm": 0.34487127863747363, + "learning_rate": 2.5475882133459404e-05, + "loss": 0.5775, + "step": 2905 + }, + { + "epoch": 0.7749333333333334, + "grad_norm": 0.3411786328623317, + "learning_rate": 2.541831306892538e-05, + "loss": 0.5427, + "step": 2906 + }, + { + "epoch": 0.7752, + "grad_norm": 0.3375887356805181, + "learning_rate": 2.5360799651847855e-05, + "loss": 0.5573, + "step": 2907 + }, + { + "epoch": 0.7754666666666666, + "grad_norm": 0.33238618803649345, + "learning_rate": 2.5303341925139157e-05, + "loss": 0.5826, + "step": 2908 + }, + { + "epoch": 0.7757333333333334, + "grad_norm": 0.35801528194447835, + "learning_rate": 2.524593993167008e-05, + "loss": 0.5746, + "step": 2909 + }, + { + "epoch": 0.776, + "grad_norm": 0.3280498033236464, + "learning_rate": 2.518859371426985e-05, + "loss": 0.5456, + "step": 2910 + }, + { + "epoch": 0.7762666666666667, + "grad_norm": 0.3814129159093021, + "learning_rate": 2.5131303315726096e-05, + "loss": 0.577, + "step": 2911 + }, + { + "epoch": 0.7765333333333333, + "grad_norm": 0.3665650846255579, + "learning_rate": 2.5074068778784687e-05, + "loss": 0.5574, + "step": 2912 + }, + { + "epoch": 0.7768, + "grad_norm": 0.3282645759624906, + "learning_rate": 2.501689014614995e-05, + "loss": 0.5534, + "step": 2913 + }, + { + "epoch": 0.7770666666666667, + "grad_norm": 0.3658811470179827, + "learning_rate": 2.4959767460484384e-05, + "loss": 0.6137, + "step": 2914 + }, + { + "epoch": 0.7773333333333333, + "grad_norm": 0.3623977835828062, + "learning_rate": 2.4902700764408883e-05, + "loss": 0.587, + "step": 2915 + }, + { + "epoch": 0.7776, + "grad_norm": 0.3772563853133515, + "learning_rate": 2.484569010050244e-05, + "loss": 0.609, + "step": 2916 + }, + { + "epoch": 0.7778666666666667, + "grad_norm": 0.3392553477959139, + "learning_rate": 2.4788735511302295e-05, + "loss": 0.5657, + "step": 2917 + }, + { + "epoch": 0.7781333333333333, + "grad_norm": 0.3618047663803316, + "learning_rate": 2.473183703930384e-05, + "loss": 0.5901, + "step": 2918 + }, + { + "epoch": 0.7784, + "grad_norm": 0.35053861270716696, + "learning_rate": 2.4674994726960633e-05, + "loss": 0.5734, + "step": 2919 + }, + { + "epoch": 0.7786666666666666, + "grad_norm": 0.3594647809109259, + "learning_rate": 2.4618208616684214e-05, + "loss": 0.6153, + "step": 2920 + }, + { + "epoch": 0.7789333333333334, + "grad_norm": 0.32647511916596544, + "learning_rate": 2.45614787508443e-05, + "loss": 0.5601, + "step": 2921 + }, + { + "epoch": 0.7792, + "grad_norm": 0.334028580235852, + "learning_rate": 2.4504805171768642e-05, + "loss": 0.5409, + "step": 2922 + }, + { + "epoch": 0.7794666666666666, + "grad_norm": 0.38378756268180986, + "learning_rate": 2.4448187921742925e-05, + "loss": 0.5765, + "step": 2923 + }, + { + "epoch": 0.7797333333333333, + "grad_norm": 0.32324283794676945, + "learning_rate": 2.4391627043010855e-05, + "loss": 0.5333, + "step": 2924 + }, + { + "epoch": 0.78, + "grad_norm": 0.34944296392694285, + "learning_rate": 2.4335122577774072e-05, + "loss": 0.5413, + "step": 2925 + }, + { + "epoch": 0.7802666666666667, + "grad_norm": 0.3508850592798124, + "learning_rate": 2.4278674568192128e-05, + "loss": 0.5625, + "step": 2926 + }, + { + "epoch": 0.7805333333333333, + "grad_norm": 0.3341152152889283, + "learning_rate": 2.4222283056382444e-05, + "loss": 0.607, + "step": 2927 + }, + { + "epoch": 0.7808, + "grad_norm": 0.33756049390002807, + "learning_rate": 2.4165948084420243e-05, + "loss": 0.5327, + "step": 2928 + }, + { + "epoch": 0.7810666666666667, + "grad_norm": 0.312321732118843, + "learning_rate": 2.4109669694338632e-05, + "loss": 0.5466, + "step": 2929 + }, + { + "epoch": 0.7813333333333333, + "grad_norm": 0.3975393449590267, + "learning_rate": 2.405344792812847e-05, + "loss": 0.5452, + "step": 2930 + }, + { + "epoch": 0.7816, + "grad_norm": 0.3265068424958377, + "learning_rate": 2.3997282827738366e-05, + "loss": 0.5504, + "step": 2931 + }, + { + "epoch": 0.7818666666666667, + "grad_norm": 0.3452517475991024, + "learning_rate": 2.3941174435074654e-05, + "loss": 0.56, + "step": 2932 + }, + { + "epoch": 0.7821333333333333, + "grad_norm": 0.33782500403876303, + "learning_rate": 2.388512279200137e-05, + "loss": 0.6001, + "step": 2933 + }, + { + "epoch": 0.7824, + "grad_norm": 0.3370854004151688, + "learning_rate": 2.3829127940340168e-05, + "loss": 0.5563, + "step": 2934 + }, + { + "epoch": 0.7826666666666666, + "grad_norm": 0.33451810783263564, + "learning_rate": 2.3773189921870376e-05, + "loss": 0.5826, + "step": 2935 + }, + { + "epoch": 0.7829333333333334, + "grad_norm": 0.333585780842901, + "learning_rate": 2.3717308778328874e-05, + "loss": 0.5488, + "step": 2936 + }, + { + "epoch": 0.7832, + "grad_norm": 0.37089414369526985, + "learning_rate": 2.366148455141014e-05, + "loss": 0.593, + "step": 2937 + }, + { + "epoch": 0.7834666666666666, + "grad_norm": 0.37176979525298176, + "learning_rate": 2.360571728276617e-05, + "loss": 0.5659, + "step": 2938 + }, + { + "epoch": 0.7837333333333333, + "grad_norm": 0.34479276230369466, + "learning_rate": 2.355000701400647e-05, + "loss": 0.5139, + "step": 2939 + }, + { + "epoch": 0.784, + "grad_norm": 0.3251565084630508, + "learning_rate": 2.3494353786698e-05, + "loss": 0.5794, + "step": 2940 + }, + { + "epoch": 0.7842666666666667, + "grad_norm": 0.34830553401586795, + "learning_rate": 2.343875764236516e-05, + "loss": 0.6183, + "step": 2941 + }, + { + "epoch": 0.7845333333333333, + "grad_norm": 0.34745467283869724, + "learning_rate": 2.3383218622489787e-05, + "loss": 0.5944, + "step": 2942 + }, + { + "epoch": 0.7848, + "grad_norm": 0.3358164507661162, + "learning_rate": 2.3327736768511098e-05, + "loss": 0.6149, + "step": 2943 + }, + { + "epoch": 0.7850666666666667, + "grad_norm": 0.33637899727272086, + "learning_rate": 2.327231212182559e-05, + "loss": 0.5858, + "step": 2944 + }, + { + "epoch": 0.7853333333333333, + "grad_norm": 0.3821470768425901, + "learning_rate": 2.3216944723787138e-05, + "loss": 0.6128, + "step": 2945 + }, + { + "epoch": 0.7856, + "grad_norm": 0.3497881291500303, + "learning_rate": 2.3161634615706896e-05, + "loss": 0.5769, + "step": 2946 + }, + { + "epoch": 0.7858666666666667, + "grad_norm": 0.34137102419980003, + "learning_rate": 2.3106381838853253e-05, + "loss": 0.5735, + "step": 2947 + }, + { + "epoch": 0.7861333333333334, + "grad_norm": 0.34202923118737844, + "learning_rate": 2.3051186434451834e-05, + "loss": 0.5866, + "step": 2948 + }, + { + "epoch": 0.7864, + "grad_norm": 0.3629232532167221, + "learning_rate": 2.299604844368547e-05, + "loss": 0.6433, + "step": 2949 + }, + { + "epoch": 0.7866666666666666, + "grad_norm": 0.3545712525574004, + "learning_rate": 2.2940967907694112e-05, + "loss": 0.5332, + "step": 2950 + }, + { + "epoch": 0.7869333333333334, + "grad_norm": 0.3579347732370336, + "learning_rate": 2.2885944867574893e-05, + "loss": 0.5836, + "step": 2951 + }, + { + "epoch": 0.7872, + "grad_norm": 0.3566100557157306, + "learning_rate": 2.2830979364382022e-05, + "loss": 0.6296, + "step": 2952 + }, + { + "epoch": 0.7874666666666666, + "grad_norm": 0.34699427364241003, + "learning_rate": 2.2776071439126757e-05, + "loss": 0.6025, + "step": 2953 + }, + { + "epoch": 0.7877333333333333, + "grad_norm": 0.3465231209142456, + "learning_rate": 2.272122113277744e-05, + "loss": 0.594, + "step": 2954 + }, + { + "epoch": 0.788, + "grad_norm": 0.3670368858269258, + "learning_rate": 2.2666428486259382e-05, + "loss": 0.5329, + "step": 2955 + }, + { + "epoch": 0.7882666666666667, + "grad_norm": 0.32928263393429363, + "learning_rate": 2.2611693540454915e-05, + "loss": 0.5269, + "step": 2956 + }, + { + "epoch": 0.7885333333333333, + "grad_norm": 0.35814072641520267, + "learning_rate": 2.2557016336203262e-05, + "loss": 0.5712, + "step": 2957 + }, + { + "epoch": 0.7888, + "grad_norm": 0.3331818850374455, + "learning_rate": 2.250239691430065e-05, + "loss": 0.5664, + "step": 2958 + }, + { + "epoch": 0.7890666666666667, + "grad_norm": 0.4036037057308087, + "learning_rate": 2.2447835315500065e-05, + "loss": 0.5958, + "step": 2959 + }, + { + "epoch": 0.7893333333333333, + "grad_norm": 0.3449568080902489, + "learning_rate": 2.239333158051147e-05, + "loss": 0.5638, + "step": 2960 + }, + { + "epoch": 0.7896, + "grad_norm": 0.3496415127045761, + "learning_rate": 2.2338885750001582e-05, + "loss": 0.6358, + "step": 2961 + }, + { + "epoch": 0.7898666666666667, + "grad_norm": 0.3501481408186804, + "learning_rate": 2.2284497864593944e-05, + "loss": 0.5972, + "step": 2962 + }, + { + "epoch": 0.7901333333333334, + "grad_norm": 0.33463969993135484, + "learning_rate": 2.2230167964868877e-05, + "loss": 0.5849, + "step": 2963 + }, + { + "epoch": 0.7904, + "grad_norm": 0.34851629426992603, + "learning_rate": 2.2175896091363414e-05, + "loss": 0.5715, + "step": 2964 + }, + { + "epoch": 0.7906666666666666, + "grad_norm": 0.36633196278627644, + "learning_rate": 2.212168228457129e-05, + "loss": 0.5733, + "step": 2965 + }, + { + "epoch": 0.7909333333333334, + "grad_norm": 0.35028472232384256, + "learning_rate": 2.2067526584942945e-05, + "loss": 0.6005, + "step": 2966 + }, + { + "epoch": 0.7912, + "grad_norm": 0.33031300973996097, + "learning_rate": 2.201342903288541e-05, + "loss": 0.5594, + "step": 2967 + }, + { + "epoch": 0.7914666666666667, + "grad_norm": 0.3601428153644341, + "learning_rate": 2.1959389668762377e-05, + "loss": 0.6124, + "step": 2968 + }, + { + "epoch": 0.7917333333333333, + "grad_norm": 0.3711474349441613, + "learning_rate": 2.19054085328941e-05, + "loss": 0.6243, + "step": 2969 + }, + { + "epoch": 0.792, + "grad_norm": 0.3414346088394343, + "learning_rate": 2.185148566555738e-05, + "loss": 0.5813, + "step": 2970 + }, + { + "epoch": 0.7922666666666667, + "grad_norm": 0.33354638034797607, + "learning_rate": 2.179762110698561e-05, + "loss": 0.5896, + "step": 2971 + }, + { + "epoch": 0.7925333333333333, + "grad_norm": 0.36501966154817117, + "learning_rate": 2.1743814897368597e-05, + "loss": 0.6131, + "step": 2972 + }, + { + "epoch": 0.7928, + "grad_norm": 0.3581623925022744, + "learning_rate": 2.1690067076852638e-05, + "loss": 0.5856, + "step": 2973 + }, + { + "epoch": 0.7930666666666667, + "grad_norm": 0.39654581731492705, + "learning_rate": 2.1636377685540487e-05, + "loss": 0.6308, + "step": 2974 + }, + { + "epoch": 0.7933333333333333, + "grad_norm": 0.3436151254193697, + "learning_rate": 2.1582746763491245e-05, + "loss": 0.5673, + "step": 2975 + }, + { + "epoch": 0.7936, + "grad_norm": 0.35044512361342534, + "learning_rate": 2.152917435072044e-05, + "loss": 0.574, + "step": 2976 + }, + { + "epoch": 0.7938666666666667, + "grad_norm": 0.3391148517034316, + "learning_rate": 2.1475660487199933e-05, + "loss": 0.5721, + "step": 2977 + }, + { + "epoch": 0.7941333333333334, + "grad_norm": 0.3462516980294206, + "learning_rate": 2.1422205212857892e-05, + "loss": 0.5762, + "step": 2978 + }, + { + "epoch": 0.7944, + "grad_norm": 0.32617739058869694, + "learning_rate": 2.136880856757877e-05, + "loss": 0.5532, + "step": 2979 + }, + { + "epoch": 0.7946666666666666, + "grad_norm": 0.3353370888440073, + "learning_rate": 2.131547059120329e-05, + "loss": 0.5525, + "step": 2980 + }, + { + "epoch": 0.7949333333333334, + "grad_norm": 0.35423787000879425, + "learning_rate": 2.1262191323528368e-05, + "loss": 0.5734, + "step": 2981 + }, + { + "epoch": 0.7952, + "grad_norm": 0.35902716894319314, + "learning_rate": 2.1208970804307194e-05, + "loss": 0.5975, + "step": 2982 + }, + { + "epoch": 0.7954666666666667, + "grad_norm": 0.32231984985583706, + "learning_rate": 2.115580907324899e-05, + "loss": 0.577, + "step": 2983 + }, + { + "epoch": 0.7957333333333333, + "grad_norm": 0.36113582567234853, + "learning_rate": 2.110270617001924e-05, + "loss": 0.6316, + "step": 2984 + }, + { + "epoch": 0.796, + "grad_norm": 0.3800356684757358, + "learning_rate": 2.1049662134239457e-05, + "loss": 0.5898, + "step": 2985 + }, + { + "epoch": 0.7962666666666667, + "grad_norm": 0.3841705838592886, + "learning_rate": 2.0996677005487285e-05, + "loss": 0.5688, + "step": 2986 + }, + { + "epoch": 0.7965333333333333, + "grad_norm": 0.3372205092230364, + "learning_rate": 2.094375082329638e-05, + "loss": 0.6191, + "step": 2987 + }, + { + "epoch": 0.7968, + "grad_norm": 0.3410194847506707, + "learning_rate": 2.0890883627156442e-05, + "loss": 0.502, + "step": 2988 + }, + { + "epoch": 0.7970666666666667, + "grad_norm": 0.37071420339438005, + "learning_rate": 2.0838075456513128e-05, + "loss": 0.5506, + "step": 2989 + }, + { + "epoch": 0.7973333333333333, + "grad_norm": 0.32982977459433915, + "learning_rate": 2.0785326350768087e-05, + "loss": 0.5405, + "step": 2990 + }, + { + "epoch": 0.7976, + "grad_norm": 0.3481684852598814, + "learning_rate": 2.0732636349278878e-05, + "loss": 0.5954, + "step": 2991 + }, + { + "epoch": 0.7978666666666666, + "grad_norm": 0.3285778523524745, + "learning_rate": 2.0680005491358967e-05, + "loss": 0.5085, + "step": 2992 + }, + { + "epoch": 0.7981333333333334, + "grad_norm": 0.3305428040598721, + "learning_rate": 2.0627433816277684e-05, + "loss": 0.5469, + "step": 2993 + }, + { + "epoch": 0.7984, + "grad_norm": 0.3524810485184063, + "learning_rate": 2.0574921363260226e-05, + "loss": 0.5656, + "step": 2994 + }, + { + "epoch": 0.7986666666666666, + "grad_norm": 0.3304612675564754, + "learning_rate": 2.0522468171487564e-05, + "loss": 0.5605, + "step": 2995 + }, + { + "epoch": 0.7989333333333334, + "grad_norm": 0.33560023661076915, + "learning_rate": 2.0470074280096484e-05, + "loss": 0.5665, + "step": 2996 + }, + { + "epoch": 0.7992, + "grad_norm": 0.3420571509222942, + "learning_rate": 2.041773972817954e-05, + "loss": 0.5548, + "step": 2997 + }, + { + "epoch": 0.7994666666666667, + "grad_norm": 0.36316095570694745, + "learning_rate": 2.0365464554784942e-05, + "loss": 0.5229, + "step": 2998 + }, + { + "epoch": 0.7997333333333333, + "grad_norm": 0.3831509790868416, + "learning_rate": 2.031324879891664e-05, + "loss": 0.6059, + "step": 2999 + }, + { + "epoch": 0.8, + "grad_norm": 0.3468139295168862, + "learning_rate": 2.0261092499534285e-05, + "loss": 0.585, + "step": 3000 + }, + { + "epoch": 0.8002666666666667, + "grad_norm": 0.3283080311199373, + "learning_rate": 2.020899569555311e-05, + "loss": 0.5392, + "step": 3001 + }, + { + "epoch": 0.8005333333333333, + "grad_norm": 0.34129602356471256, + "learning_rate": 2.0156958425843987e-05, + "loss": 0.5572, + "step": 3002 + }, + { + "epoch": 0.8008, + "grad_norm": 0.338405424984137, + "learning_rate": 2.0104980729233368e-05, + "loss": 0.6059, + "step": 3003 + }, + { + "epoch": 0.8010666666666667, + "grad_norm": 0.3543978323110512, + "learning_rate": 2.0053062644503228e-05, + "loss": 0.5315, + "step": 3004 + }, + { + "epoch": 0.8013333333333333, + "grad_norm": 0.3596869803007064, + "learning_rate": 2.000120421039111e-05, + "loss": 0.6161, + "step": 3005 + }, + { + "epoch": 0.8016, + "grad_norm": 0.3310055304869842, + "learning_rate": 1.994940546559e-05, + "loss": 0.527, + "step": 3006 + }, + { + "epoch": 0.8018666666666666, + "grad_norm": 0.3679947211080711, + "learning_rate": 1.9897666448748387e-05, + "loss": 0.5777, + "step": 3007 + }, + { + "epoch": 0.8021333333333334, + "grad_norm": 0.3365881418478606, + "learning_rate": 1.9845987198470174e-05, + "loss": 0.5343, + "step": 3008 + }, + { + "epoch": 0.8024, + "grad_norm": 0.3641471750427538, + "learning_rate": 1.979436775331468e-05, + "loss": 0.566, + "step": 3009 + }, + { + "epoch": 0.8026666666666666, + "grad_norm": 0.35879713886753284, + "learning_rate": 1.9742808151796587e-05, + "loss": 0.6098, + "step": 3010 + }, + { + "epoch": 0.8029333333333334, + "grad_norm": 0.3450226540112181, + "learning_rate": 1.9691308432385956e-05, + "loss": 0.5947, + "step": 3011 + }, + { + "epoch": 0.8032, + "grad_norm": 0.3983602084144142, + "learning_rate": 1.963986863350814e-05, + "loss": 0.5772, + "step": 3012 + }, + { + "epoch": 0.8034666666666667, + "grad_norm": 0.3811759945389708, + "learning_rate": 1.9588488793543824e-05, + "loss": 0.5106, + "step": 3013 + }, + { + "epoch": 0.8037333333333333, + "grad_norm": 0.3206806892853088, + "learning_rate": 1.9537168950828875e-05, + "loss": 0.5384, + "step": 3014 + }, + { + "epoch": 0.804, + "grad_norm": 0.3497959505990501, + "learning_rate": 1.9485909143654457e-05, + "loss": 0.5166, + "step": 3015 + }, + { + "epoch": 0.8042666666666667, + "grad_norm": 0.3437040874983381, + "learning_rate": 1.943470941026695e-05, + "loss": 0.5812, + "step": 3016 + }, + { + "epoch": 0.8045333333333333, + "grad_norm": 0.36824913600742964, + "learning_rate": 1.9383569788867873e-05, + "loss": 0.5694, + "step": 3017 + }, + { + "epoch": 0.8048, + "grad_norm": 0.3623494389131158, + "learning_rate": 1.9332490317613904e-05, + "loss": 0.568, + "step": 3018 + }, + { + "epoch": 0.8050666666666667, + "grad_norm": 0.3424569572169921, + "learning_rate": 1.928147103461687e-05, + "loss": 0.565, + "step": 3019 + }, + { + "epoch": 0.8053333333333333, + "grad_norm": 0.3455387317412677, + "learning_rate": 1.9230511977943643e-05, + "loss": 0.5987, + "step": 3020 + }, + { + "epoch": 0.8056, + "grad_norm": 0.32797040924539606, + "learning_rate": 1.917961318561623e-05, + "loss": 0.5476, + "step": 3021 + }, + { + "epoch": 0.8058666666666666, + "grad_norm": 0.347167277213583, + "learning_rate": 1.9128774695611562e-05, + "loss": 0.5688, + "step": 3022 + }, + { + "epoch": 0.8061333333333334, + "grad_norm": 0.3498274783989771, + "learning_rate": 1.9077996545861677e-05, + "loss": 0.5418, + "step": 3023 + }, + { + "epoch": 0.8064, + "grad_norm": 0.3325721718081461, + "learning_rate": 1.902727877425353e-05, + "loss": 0.5323, + "step": 3024 + }, + { + "epoch": 0.8066666666666666, + "grad_norm": 0.3582061127968383, + "learning_rate": 1.8976621418629047e-05, + "loss": 0.6011, + "step": 3025 + }, + { + "epoch": 0.8069333333333333, + "grad_norm": 0.35025751466836996, + "learning_rate": 1.8926024516785135e-05, + "loss": 0.564, + "step": 3026 + }, + { + "epoch": 0.8072, + "grad_norm": 0.3263800107252902, + "learning_rate": 1.8875488106473495e-05, + "loss": 0.5499, + "step": 3027 + }, + { + "epoch": 0.8074666666666667, + "grad_norm": 0.33237138627982893, + "learning_rate": 1.8825012225400752e-05, + "loss": 0.5275, + "step": 3028 + }, + { + "epoch": 0.8077333333333333, + "grad_norm": 0.3722351293796434, + "learning_rate": 1.8774596911228382e-05, + "loss": 0.5893, + "step": 3029 + }, + { + "epoch": 0.808, + "grad_norm": 0.4122538209596308, + "learning_rate": 1.8724242201572585e-05, + "loss": 0.5907, + "step": 3030 + }, + { + "epoch": 0.8082666666666667, + "grad_norm": 0.358682437899776, + "learning_rate": 1.8673948134004426e-05, + "loss": 0.5877, + "step": 3031 + }, + { + "epoch": 0.8085333333333333, + "grad_norm": 0.34031307693105944, + "learning_rate": 1.8623714746049704e-05, + "loss": 0.5761, + "step": 3032 + }, + { + "epoch": 0.8088, + "grad_norm": 0.3246738900676092, + "learning_rate": 1.8573542075188932e-05, + "loss": 0.5657, + "step": 3033 + }, + { + "epoch": 0.8090666666666667, + "grad_norm": 0.3433072148820531, + "learning_rate": 1.8523430158857334e-05, + "loss": 0.5646, + "step": 3034 + }, + { + "epoch": 0.8093333333333333, + "grad_norm": 0.3387495616516081, + "learning_rate": 1.8473379034444782e-05, + "loss": 0.5758, + "step": 3035 + }, + { + "epoch": 0.8096, + "grad_norm": 0.34529923052733885, + "learning_rate": 1.8423388739295833e-05, + "loss": 0.588, + "step": 3036 + }, + { + "epoch": 0.8098666666666666, + "grad_norm": 0.3466341485690237, + "learning_rate": 1.8373459310709585e-05, + "loss": 0.574, + "step": 3037 + }, + { + "epoch": 0.8101333333333334, + "grad_norm": 0.34573166808366856, + "learning_rate": 1.832359078593977e-05, + "loss": 0.5896, + "step": 3038 + }, + { + "epoch": 0.8104, + "grad_norm": 0.348255522853395, + "learning_rate": 1.8273783202194694e-05, + "loss": 0.5553, + "step": 3039 + }, + { + "epoch": 0.8106666666666666, + "grad_norm": 0.33829183891849923, + "learning_rate": 1.8224036596637152e-05, + "loss": 0.5427, + "step": 3040 + }, + { + "epoch": 0.8109333333333333, + "grad_norm": 0.3417128184368856, + "learning_rate": 1.8174351006384473e-05, + "loss": 0.5617, + "step": 3041 + }, + { + "epoch": 0.8112, + "grad_norm": 0.3597667027802103, + "learning_rate": 1.8124726468508435e-05, + "loss": 0.5516, + "step": 3042 + }, + { + "epoch": 0.8114666666666667, + "grad_norm": 0.31211275175926995, + "learning_rate": 1.8075163020035292e-05, + "loss": 0.5016, + "step": 3043 + }, + { + "epoch": 0.8117333333333333, + "grad_norm": 0.3674318252568378, + "learning_rate": 1.802566069794569e-05, + "loss": 0.5979, + "step": 3044 + }, + { + "epoch": 0.812, + "grad_norm": 0.36490718474560413, + "learning_rate": 1.7976219539174687e-05, + "loss": 0.5629, + "step": 3045 + }, + { + "epoch": 0.8122666666666667, + "grad_norm": 0.3482359713706896, + "learning_rate": 1.79268395806117e-05, + "loss": 0.565, + "step": 3046 + }, + { + "epoch": 0.8125333333333333, + "grad_norm": 0.355305787392465, + "learning_rate": 1.787752085910046e-05, + "loss": 0.6048, + "step": 3047 + }, + { + "epoch": 0.8128, + "grad_norm": 0.37025822305093725, + "learning_rate": 1.782826341143904e-05, + "loss": 0.597, + "step": 3048 + }, + { + "epoch": 0.8130666666666667, + "grad_norm": 0.349070178251286, + "learning_rate": 1.777906727437979e-05, + "loss": 0.5988, + "step": 3049 + }, + { + "epoch": 0.8133333333333334, + "grad_norm": 0.3271595322401396, + "learning_rate": 1.7729932484629296e-05, + "loss": 0.5395, + "step": 3050 + }, + { + "epoch": 0.8136, + "grad_norm": 0.3321691289751727, + "learning_rate": 1.7680859078848376e-05, + "loss": 0.555, + "step": 3051 + }, + { + "epoch": 0.8138666666666666, + "grad_norm": 0.354746522010819, + "learning_rate": 1.7631847093652098e-05, + "loss": 0.6291, + "step": 3052 + }, + { + "epoch": 0.8141333333333334, + "grad_norm": 0.4154338098869775, + "learning_rate": 1.7582896565609598e-05, + "loss": 0.5762, + "step": 3053 + }, + { + "epoch": 0.8144, + "grad_norm": 0.343550579976311, + "learning_rate": 1.7534007531244236e-05, + "loss": 0.5542, + "step": 3054 + }, + { + "epoch": 0.8146666666666667, + "grad_norm": 0.34442071345670167, + "learning_rate": 1.7485180027033475e-05, + "loss": 0.5822, + "step": 3055 + }, + { + "epoch": 0.8149333333333333, + "grad_norm": 0.3454538987912449, + "learning_rate": 1.743641408940886e-05, + "loss": 0.6414, + "step": 3056 + }, + { + "epoch": 0.8152, + "grad_norm": 0.35737450665453296, + "learning_rate": 1.738770975475602e-05, + "loss": 0.5582, + "step": 3057 + }, + { + "epoch": 0.8154666666666667, + "grad_norm": 0.3406958021802569, + "learning_rate": 1.7339067059414582e-05, + "loss": 0.5394, + "step": 3058 + }, + { + "epoch": 0.8157333333333333, + "grad_norm": 0.3540868724474538, + "learning_rate": 1.7290486039678223e-05, + "loss": 0.5779, + "step": 3059 + }, + { + "epoch": 0.816, + "grad_norm": 0.3356834500889169, + "learning_rate": 1.7241966731794578e-05, + "loss": 0.5499, + "step": 3060 + }, + { + "epoch": 0.8162666666666667, + "grad_norm": 0.34861717312040424, + "learning_rate": 1.7193509171965237e-05, + "loss": 0.5576, + "step": 3061 + }, + { + "epoch": 0.8165333333333333, + "grad_norm": 0.31299222278930233, + "learning_rate": 1.7145113396345725e-05, + "loss": 0.5303, + "step": 3062 + }, + { + "epoch": 0.8168, + "grad_norm": 0.3464489477861352, + "learning_rate": 1.7096779441045473e-05, + "loss": 0.5702, + "step": 3063 + }, + { + "epoch": 0.8170666666666667, + "grad_norm": 0.34110702062407466, + "learning_rate": 1.7048507342127785e-05, + "loss": 0.5945, + "step": 3064 + }, + { + "epoch": 0.8173333333333334, + "grad_norm": 0.33369153933547685, + "learning_rate": 1.7000297135609787e-05, + "loss": 0.5718, + "step": 3065 + }, + { + "epoch": 0.8176, + "grad_norm": 0.3264318555296828, + "learning_rate": 1.695214885746246e-05, + "loss": 0.5202, + "step": 3066 + }, + { + "epoch": 0.8178666666666666, + "grad_norm": 0.3606842064423147, + "learning_rate": 1.6904062543610556e-05, + "loss": 0.5559, + "step": 3067 + }, + { + "epoch": 0.8181333333333334, + "grad_norm": 0.33856709350332637, + "learning_rate": 1.6856038229932636e-05, + "loss": 0.6019, + "step": 3068 + }, + { + "epoch": 0.8184, + "grad_norm": 0.33504660943360826, + "learning_rate": 1.6808075952260915e-05, + "loss": 0.5454, + "step": 3069 + }, + { + "epoch": 0.8186666666666667, + "grad_norm": 0.3286890067049924, + "learning_rate": 1.6760175746381402e-05, + "loss": 0.5569, + "step": 3070 + }, + { + "epoch": 0.8189333333333333, + "grad_norm": 0.35798914123973774, + "learning_rate": 1.6712337648033748e-05, + "loss": 0.585, + "step": 3071 + }, + { + "epoch": 0.8192, + "grad_norm": 0.33909398079926073, + "learning_rate": 1.6664561692911284e-05, + "loss": 0.516, + "step": 3072 + }, + { + "epoch": 0.8194666666666667, + "grad_norm": 0.3385164586449421, + "learning_rate": 1.6616847916660992e-05, + "loss": 0.5987, + "step": 3073 + }, + { + "epoch": 0.8197333333333333, + "grad_norm": 0.3308690950640168, + "learning_rate": 1.656919635488341e-05, + "loss": 0.5328, + "step": 3074 + }, + { + "epoch": 0.82, + "grad_norm": 0.35880878727539284, + "learning_rate": 1.6521607043132714e-05, + "loss": 0.5534, + "step": 3075 + }, + { + "epoch": 0.8202666666666667, + "grad_norm": 0.34203552628716377, + "learning_rate": 1.647408001691657e-05, + "loss": 0.5585, + "step": 3076 + }, + { + "epoch": 0.8205333333333333, + "grad_norm": 0.34149842048265244, + "learning_rate": 1.6426615311696226e-05, + "loss": 0.5883, + "step": 3077 + }, + { + "epoch": 0.8208, + "grad_norm": 0.3280354312415396, + "learning_rate": 1.6379212962886394e-05, + "loss": 0.5225, + "step": 3078 + }, + { + "epoch": 0.8210666666666666, + "grad_norm": 0.34276041276084807, + "learning_rate": 1.633187300585528e-05, + "loss": 0.5613, + "step": 3079 + }, + { + "epoch": 0.8213333333333334, + "grad_norm": 0.37355750210778077, + "learning_rate": 1.6284595475924546e-05, + "loss": 0.5901, + "step": 3080 + }, + { + "epoch": 0.8216, + "grad_norm": 0.3451587348565714, + "learning_rate": 1.623738040836923e-05, + "loss": 0.5749, + "step": 3081 + }, + { + "epoch": 0.8218666666666666, + "grad_norm": 0.3430810898241322, + "learning_rate": 1.619022783841785e-05, + "loss": 0.5427, + "step": 3082 + }, + { + "epoch": 0.8221333333333334, + "grad_norm": 0.317669618085047, + "learning_rate": 1.614313780125224e-05, + "loss": 0.5284, + "step": 3083 + }, + { + "epoch": 0.8224, + "grad_norm": 0.35776214539323364, + "learning_rate": 1.609611033200752e-05, + "loss": 0.5722, + "step": 3084 + }, + { + "epoch": 0.8226666666666667, + "grad_norm": 0.33701294207980914, + "learning_rate": 1.6049145465772218e-05, + "loss": 0.5844, + "step": 3085 + }, + { + "epoch": 0.8229333333333333, + "grad_norm": 0.37266954199545144, + "learning_rate": 1.6002243237588112e-05, + "loss": 0.6142, + "step": 3086 + }, + { + "epoch": 0.8232, + "grad_norm": 0.36180311121312675, + "learning_rate": 1.5955403682450252e-05, + "loss": 0.6758, + "step": 3087 + }, + { + "epoch": 0.8234666666666667, + "grad_norm": 0.3291685064163525, + "learning_rate": 1.5908626835306938e-05, + "loss": 0.567, + "step": 3088 + }, + { + "epoch": 0.8237333333333333, + "grad_norm": 0.37160550945805754, + "learning_rate": 1.5861912731059636e-05, + "loss": 0.5546, + "step": 3089 + }, + { + "epoch": 0.824, + "grad_norm": 0.3482485423047536, + "learning_rate": 1.5815261404563065e-05, + "loss": 0.5991, + "step": 3090 + }, + { + "epoch": 0.8242666666666667, + "grad_norm": 0.36364254800404955, + "learning_rate": 1.5768672890625058e-05, + "loss": 0.5516, + "step": 3091 + }, + { + "epoch": 0.8245333333333333, + "grad_norm": 0.32491513929243804, + "learning_rate": 1.5722147224006565e-05, + "loss": 0.5893, + "step": 3092 + }, + { + "epoch": 0.8248, + "grad_norm": 0.35602654794975985, + "learning_rate": 1.5675684439421702e-05, + "loss": 0.5694, + "step": 3093 + }, + { + "epoch": 0.8250666666666666, + "grad_norm": 0.4167272938154404, + "learning_rate": 1.5629284571537618e-05, + "loss": 0.5651, + "step": 3094 + }, + { + "epoch": 0.8253333333333334, + "grad_norm": 0.34858193964545053, + "learning_rate": 1.5582947654974533e-05, + "loss": 0.5524, + "step": 3095 + }, + { + "epoch": 0.8256, + "grad_norm": 0.3468123358100999, + "learning_rate": 1.5536673724305716e-05, + "loss": 0.563, + "step": 3096 + }, + { + "epoch": 0.8258666666666666, + "grad_norm": 0.34670637513527003, + "learning_rate": 1.5490462814057415e-05, + "loss": 0.5575, + "step": 3097 + }, + { + "epoch": 0.8261333333333334, + "grad_norm": 0.3184868533646417, + "learning_rate": 1.5444314958708873e-05, + "loss": 0.5443, + "step": 3098 + }, + { + "epoch": 0.8264, + "grad_norm": 0.32110216001052383, + "learning_rate": 1.5398230192692277e-05, + "loss": 0.5201, + "step": 3099 + }, + { + "epoch": 0.8266666666666667, + "grad_norm": 0.3462876778150794, + "learning_rate": 1.5352208550392743e-05, + "loss": 0.5931, + "step": 3100 + }, + { + "epoch": 0.8269333333333333, + "grad_norm": 0.3312189465926998, + "learning_rate": 1.5306250066148285e-05, + "loss": 0.5589, + "step": 3101 + }, + { + "epoch": 0.8272, + "grad_norm": 0.32952532243402916, + "learning_rate": 1.5260354774249806e-05, + "loss": 0.5665, + "step": 3102 + }, + { + "epoch": 0.8274666666666667, + "grad_norm": 0.3593564091826446, + "learning_rate": 1.5214522708941037e-05, + "loss": 0.5549, + "step": 3103 + }, + { + "epoch": 0.8277333333333333, + "grad_norm": 0.3609291592066639, + "learning_rate": 1.5168753904418565e-05, + "loss": 0.6086, + "step": 3104 + }, + { + "epoch": 0.828, + "grad_norm": 0.342609030376073, + "learning_rate": 1.512304839483175e-05, + "loss": 0.5591, + "step": 3105 + }, + { + "epoch": 0.8282666666666667, + "grad_norm": 0.34918690776952505, + "learning_rate": 1.5077406214282741e-05, + "loss": 0.6121, + "step": 3106 + }, + { + "epoch": 0.8285333333333333, + "grad_norm": 0.36463903701050193, + "learning_rate": 1.5031827396826448e-05, + "loss": 0.5781, + "step": 3107 + }, + { + "epoch": 0.8288, + "grad_norm": 0.36561929772298407, + "learning_rate": 1.4986311976470425e-05, + "loss": 0.5783, + "step": 3108 + }, + { + "epoch": 0.8290666666666666, + "grad_norm": 0.33928718604895186, + "learning_rate": 1.4940859987175037e-05, + "loss": 0.56, + "step": 3109 + }, + { + "epoch": 0.8293333333333334, + "grad_norm": 0.34876984697828656, + "learning_rate": 1.489547146285325e-05, + "loss": 0.544, + "step": 3110 + }, + { + "epoch": 0.8296, + "grad_norm": 0.32426992058441134, + "learning_rate": 1.4850146437370693e-05, + "loss": 0.5392, + "step": 3111 + }, + { + "epoch": 0.8298666666666666, + "grad_norm": 0.3311752958283469, + "learning_rate": 1.4804884944545627e-05, + "loss": 0.5549, + "step": 3112 + }, + { + "epoch": 0.8301333333333333, + "grad_norm": 0.33526699517919134, + "learning_rate": 1.4759687018148894e-05, + "loss": 0.5834, + "step": 3113 + }, + { + "epoch": 0.8304, + "grad_norm": 0.34289504616230815, + "learning_rate": 1.471455269190396e-05, + "loss": 0.5532, + "step": 3114 + }, + { + "epoch": 0.8306666666666667, + "grad_norm": 0.3554178233628037, + "learning_rate": 1.466948199948669e-05, + "loss": 0.4929, + "step": 3115 + }, + { + "epoch": 0.8309333333333333, + "grad_norm": 0.35488489073814056, + "learning_rate": 1.462447497452567e-05, + "loss": 0.5835, + "step": 3116 + }, + { + "epoch": 0.8312, + "grad_norm": 0.3388100258171893, + "learning_rate": 1.4579531650601853e-05, + "loss": 0.5861, + "step": 3117 + }, + { + "epoch": 0.8314666666666667, + "grad_norm": 0.3252292502494152, + "learning_rate": 1.4534652061248677e-05, + "loss": 0.5506, + "step": 3118 + }, + { + "epoch": 0.8317333333333333, + "grad_norm": 0.332497505901698, + "learning_rate": 1.448983623995207e-05, + "loss": 0.5403, + "step": 3119 + }, + { + "epoch": 0.832, + "grad_norm": 0.3538993481602382, + "learning_rate": 1.444508422015034e-05, + "loss": 0.5781, + "step": 3120 + }, + { + "epoch": 0.8322666666666667, + "grad_norm": 0.3330130248940368, + "learning_rate": 1.4400396035234198e-05, + "loss": 0.5494, + "step": 3121 + }, + { + "epoch": 0.8325333333333333, + "grad_norm": 0.35722297271831394, + "learning_rate": 1.4355771718546773e-05, + "loss": 0.53, + "step": 3122 + }, + { + "epoch": 0.8328, + "grad_norm": 0.33146304474566984, + "learning_rate": 1.4311211303383442e-05, + "loss": 0.5723, + "step": 3123 + }, + { + "epoch": 0.8330666666666666, + "grad_norm": 0.34921100976654335, + "learning_rate": 1.4266714822991989e-05, + "loss": 0.5399, + "step": 3124 + }, + { + "epoch": 0.8333333333333334, + "grad_norm": 0.34187059882469273, + "learning_rate": 1.4222282310572465e-05, + "loss": 0.5828, + "step": 3125 + }, + { + "epoch": 0.8336, + "grad_norm": 0.3508506691671756, + "learning_rate": 1.4177913799277198e-05, + "loss": 0.5611, + "step": 3126 + }, + { + "epoch": 0.8338666666666666, + "grad_norm": 0.34778415638058724, + "learning_rate": 1.4133609322210762e-05, + "loss": 0.4985, + "step": 3127 + }, + { + "epoch": 0.8341333333333333, + "grad_norm": 0.3249220339834162, + "learning_rate": 1.4089368912429945e-05, + "loss": 0.5186, + "step": 3128 + }, + { + "epoch": 0.8344, + "grad_norm": 0.35513881664832403, + "learning_rate": 1.4045192602943736e-05, + "loss": 0.5931, + "step": 3129 + }, + { + "epoch": 0.8346666666666667, + "grad_norm": 0.32643230736436546, + "learning_rate": 1.4001080426713331e-05, + "loss": 0.5443, + "step": 3130 + }, + { + "epoch": 0.8349333333333333, + "grad_norm": 0.3459611623277745, + "learning_rate": 1.3957032416651983e-05, + "loss": 0.5734, + "step": 3131 + }, + { + "epoch": 0.8352, + "grad_norm": 0.34682337803813185, + "learning_rate": 1.3913048605625168e-05, + "loss": 0.6112, + "step": 3132 + }, + { + "epoch": 0.8354666666666667, + "grad_norm": 0.3545936607288065, + "learning_rate": 1.3869129026450423e-05, + "loss": 0.6057, + "step": 3133 + }, + { + "epoch": 0.8357333333333333, + "grad_norm": 0.33269372663317176, + "learning_rate": 1.3825273711897347e-05, + "loss": 0.5584, + "step": 3134 + }, + { + "epoch": 0.836, + "grad_norm": 0.3305187222747315, + "learning_rate": 1.3781482694687598e-05, + "loss": 0.5426, + "step": 3135 + }, + { + "epoch": 0.8362666666666667, + "grad_norm": 0.3333210069668065, + "learning_rate": 1.3737756007494861e-05, + "loss": 0.5235, + "step": 3136 + }, + { + "epoch": 0.8365333333333334, + "grad_norm": 0.3598372953989674, + "learning_rate": 1.3694093682944853e-05, + "loss": 0.5473, + "step": 3137 + }, + { + "epoch": 0.8368, + "grad_norm": 0.3521392383401204, + "learning_rate": 1.3650495753615244e-05, + "loss": 0.5894, + "step": 3138 + }, + { + "epoch": 0.8370666666666666, + "grad_norm": 0.34076214918151726, + "learning_rate": 1.3606962252035615e-05, + "loss": 0.5638, + "step": 3139 + }, + { + "epoch": 0.8373333333333334, + "grad_norm": 0.3672019914215645, + "learning_rate": 1.3563493210687529e-05, + "loss": 0.5914, + "step": 3140 + }, + { + "epoch": 0.8376, + "grad_norm": 0.34307137890350625, + "learning_rate": 1.3520088662004438e-05, + "loss": 0.5526, + "step": 3141 + }, + { + "epoch": 0.8378666666666666, + "grad_norm": 0.32295057140138955, + "learning_rate": 1.3476748638371672e-05, + "loss": 0.5504, + "step": 3142 + }, + { + "epoch": 0.8381333333333333, + "grad_norm": 0.33721794235393426, + "learning_rate": 1.3433473172126431e-05, + "loss": 0.6044, + "step": 3143 + }, + { + "epoch": 0.8384, + "grad_norm": 0.33626786453773044, + "learning_rate": 1.3390262295557731e-05, + "loss": 0.5821, + "step": 3144 + }, + { + "epoch": 0.8386666666666667, + "grad_norm": 0.3619784898673849, + "learning_rate": 1.3347116040906394e-05, + "loss": 0.5621, + "step": 3145 + }, + { + "epoch": 0.8389333333333333, + "grad_norm": 0.3226946715887109, + "learning_rate": 1.3304034440365065e-05, + "loss": 0.5414, + "step": 3146 + }, + { + "epoch": 0.8392, + "grad_norm": 0.3381053345806699, + "learning_rate": 1.3261017526078057e-05, + "loss": 0.5414, + "step": 3147 + }, + { + "epoch": 0.8394666666666667, + "grad_norm": 0.3600032170298199, + "learning_rate": 1.3218065330141515e-05, + "loss": 0.5783, + "step": 3148 + }, + { + "epoch": 0.8397333333333333, + "grad_norm": 0.36358285934799534, + "learning_rate": 1.3175177884603252e-05, + "loss": 0.582, + "step": 3149 + }, + { + "epoch": 0.84, + "grad_norm": 0.33877387393008823, + "learning_rate": 1.3132355221462778e-05, + "loss": 0.5736, + "step": 3150 + }, + { + "epoch": 0.8402666666666667, + "grad_norm": 0.34036159221005535, + "learning_rate": 1.3089597372671259e-05, + "loss": 0.532, + "step": 3151 + }, + { + "epoch": 0.8405333333333334, + "grad_norm": 0.32721215150383337, + "learning_rate": 1.3046904370131507e-05, + "loss": 0.5634, + "step": 3152 + }, + { + "epoch": 0.8408, + "grad_norm": 0.35704536091702926, + "learning_rate": 1.3004276245697955e-05, + "loss": 0.5458, + "step": 3153 + }, + { + "epoch": 0.8410666666666666, + "grad_norm": 0.3514385322166996, + "learning_rate": 1.2961713031176625e-05, + "loss": 0.5493, + "step": 3154 + }, + { + "epoch": 0.8413333333333334, + "grad_norm": 0.38800781784510435, + "learning_rate": 1.2919214758325104e-05, + "loss": 0.5645, + "step": 3155 + }, + { + "epoch": 0.8416, + "grad_norm": 0.3366959556255024, + "learning_rate": 1.2876781458852538e-05, + "loss": 0.5916, + "step": 3156 + }, + { + "epoch": 0.8418666666666667, + "grad_norm": 0.33977567455723934, + "learning_rate": 1.2834413164419567e-05, + "loss": 0.5643, + "step": 3157 + }, + { + "epoch": 0.8421333333333333, + "grad_norm": 0.34141280336233987, + "learning_rate": 1.279210990663835e-05, + "loss": 0.5696, + "step": 3158 + }, + { + "epoch": 0.8424, + "grad_norm": 0.35339066196754426, + "learning_rate": 1.2749871717072515e-05, + "loss": 0.5755, + "step": 3159 + }, + { + "epoch": 0.8426666666666667, + "grad_norm": 0.32868857625428677, + "learning_rate": 1.2707698627237152e-05, + "loss": 0.5742, + "step": 3160 + }, + { + "epoch": 0.8429333333333333, + "grad_norm": 0.36450120748671516, + "learning_rate": 1.2665590668598781e-05, + "loss": 0.5495, + "step": 3161 + }, + { + "epoch": 0.8432, + "grad_norm": 0.3459870745887521, + "learning_rate": 1.262354787257527e-05, + "loss": 0.5821, + "step": 3162 + }, + { + "epoch": 0.8434666666666667, + "grad_norm": 0.32624892871447686, + "learning_rate": 1.2581570270535924e-05, + "loss": 0.5407, + "step": 3163 + }, + { + "epoch": 0.8437333333333333, + "grad_norm": 0.32909254573737806, + "learning_rate": 1.2539657893801416e-05, + "loss": 0.5457, + "step": 3164 + }, + { + "epoch": 0.844, + "grad_norm": 0.327849396654827, + "learning_rate": 1.2497810773643704e-05, + "loss": 0.5587, + "step": 3165 + }, + { + "epoch": 0.8442666666666667, + "grad_norm": 0.32594205283464733, + "learning_rate": 1.245602894128609e-05, + "loss": 0.5615, + "step": 3166 + }, + { + "epoch": 0.8445333333333334, + "grad_norm": 0.350382500476646, + "learning_rate": 1.2414312427903152e-05, + "loss": 0.6224, + "step": 3167 + }, + { + "epoch": 0.8448, + "grad_norm": 0.33571184143014543, + "learning_rate": 1.2372661264620744e-05, + "loss": 0.5729, + "step": 3168 + }, + { + "epoch": 0.8450666666666666, + "grad_norm": 0.325950602377954, + "learning_rate": 1.2331075482515942e-05, + "loss": 0.5496, + "step": 3169 + }, + { + "epoch": 0.8453333333333334, + "grad_norm": 0.34304794355718793, + "learning_rate": 1.2289555112617024e-05, + "loss": 0.5685, + "step": 3170 + }, + { + "epoch": 0.8456, + "grad_norm": 0.3423742386766403, + "learning_rate": 1.2248100185903488e-05, + "loss": 0.5635, + "step": 3171 + }, + { + "epoch": 0.8458666666666667, + "grad_norm": 0.3389473443874955, + "learning_rate": 1.2206710733306037e-05, + "loss": 0.5444, + "step": 3172 + }, + { + "epoch": 0.8461333333333333, + "grad_norm": 0.34594767698427004, + "learning_rate": 1.2165386785706456e-05, + "loss": 0.5874, + "step": 3173 + }, + { + "epoch": 0.8464, + "grad_norm": 0.3705801891599974, + "learning_rate": 1.2124128373937693e-05, + "loss": 0.5543, + "step": 3174 + }, + { + "epoch": 0.8466666666666667, + "grad_norm": 0.35331391348330077, + "learning_rate": 1.208293552878379e-05, + "loss": 0.5895, + "step": 3175 + }, + { + "epoch": 0.8469333333333333, + "grad_norm": 0.32947477169748096, + "learning_rate": 1.204180828097986e-05, + "loss": 0.54, + "step": 3176 + }, + { + "epoch": 0.8472, + "grad_norm": 0.35936722997120535, + "learning_rate": 1.2000746661212104e-05, + "loss": 0.5686, + "step": 3177 + }, + { + "epoch": 0.8474666666666667, + "grad_norm": 0.3552943797490246, + "learning_rate": 1.1959750700117678e-05, + "loss": 0.6426, + "step": 3178 + }, + { + "epoch": 0.8477333333333333, + "grad_norm": 0.37181025722058303, + "learning_rate": 1.1918820428284839e-05, + "loss": 0.6171, + "step": 3179 + }, + { + "epoch": 0.848, + "grad_norm": 0.34126260558190114, + "learning_rate": 1.1877955876252778e-05, + "loss": 0.5775, + "step": 3180 + }, + { + "epoch": 0.8482666666666666, + "grad_norm": 0.348535537282637, + "learning_rate": 1.1837157074511674e-05, + "loss": 0.5831, + "step": 3181 + }, + { + "epoch": 0.8485333333333334, + "grad_norm": 0.3335099676123647, + "learning_rate": 1.1796424053502641e-05, + "loss": 0.5215, + "step": 3182 + }, + { + "epoch": 0.8488, + "grad_norm": 0.322370643651146, + "learning_rate": 1.1755756843617705e-05, + "loss": 0.5369, + "step": 3183 + }, + { + "epoch": 0.8490666666666666, + "grad_norm": 0.3343027320626063, + "learning_rate": 1.1715155475199791e-05, + "loss": 0.5893, + "step": 3184 + }, + { + "epoch": 0.8493333333333334, + "grad_norm": 0.3398381708915825, + "learning_rate": 1.1674619978542734e-05, + "loss": 0.5598, + "step": 3185 + }, + { + "epoch": 0.8496, + "grad_norm": 0.3504585743785552, + "learning_rate": 1.1634150383891152e-05, + "loss": 0.5265, + "step": 3186 + }, + { + "epoch": 0.8498666666666667, + "grad_norm": 0.34953786866756165, + "learning_rate": 1.1593746721440524e-05, + "loss": 0.5738, + "step": 3187 + }, + { + "epoch": 0.8501333333333333, + "grad_norm": 0.3644657263291906, + "learning_rate": 1.1553409021337148e-05, + "loss": 0.5444, + "step": 3188 + }, + { + "epoch": 0.8504, + "grad_norm": 0.3800849986514281, + "learning_rate": 1.1513137313678113e-05, + "loss": 0.5533, + "step": 3189 + }, + { + "epoch": 0.8506666666666667, + "grad_norm": 0.35258202623158125, + "learning_rate": 1.147293162851123e-05, + "loss": 0.5148, + "step": 3190 + }, + { + "epoch": 0.8509333333333333, + "grad_norm": 0.33044009416194836, + "learning_rate": 1.143279199583508e-05, + "loss": 0.5482, + "step": 3191 + }, + { + "epoch": 0.8512, + "grad_norm": 0.3530916489213741, + "learning_rate": 1.1392718445598949e-05, + "loss": 0.5825, + "step": 3192 + }, + { + "epoch": 0.8514666666666667, + "grad_norm": 0.3508665335390478, + "learning_rate": 1.1352711007702832e-05, + "loss": 0.5579, + "step": 3193 + }, + { + "epoch": 0.8517333333333333, + "grad_norm": 0.3466998416938436, + "learning_rate": 1.1312769711997362e-05, + "loss": 0.5432, + "step": 3194 + }, + { + "epoch": 0.852, + "grad_norm": 0.3411622791986878, + "learning_rate": 1.1272894588283867e-05, + "loss": 0.5772, + "step": 3195 + }, + { + "epoch": 0.8522666666666666, + "grad_norm": 0.36269813098005116, + "learning_rate": 1.1233085666314258e-05, + "loss": 0.5779, + "step": 3196 + }, + { + "epoch": 0.8525333333333334, + "grad_norm": 0.3578020137917265, + "learning_rate": 1.1193342975791076e-05, + "loss": 0.5465, + "step": 3197 + }, + { + "epoch": 0.8528, + "grad_norm": 0.3553736712478192, + "learning_rate": 1.1153666546367447e-05, + "loss": 0.5898, + "step": 3198 + }, + { + "epoch": 0.8530666666666666, + "grad_norm": 0.3277443264332803, + "learning_rate": 1.1114056407647044e-05, + "loss": 0.5554, + "step": 3199 + }, + { + "epoch": 0.8533333333333334, + "grad_norm": 0.3364657624990799, + "learning_rate": 1.1074512589184105e-05, + "loss": 0.5442, + "step": 3200 + }, + { + "epoch": 0.8536, + "grad_norm": 0.32918260501145835, + "learning_rate": 1.1035035120483328e-05, + "loss": 0.527, + "step": 3201 + }, + { + "epoch": 0.8538666666666667, + "grad_norm": 0.3295555408946516, + "learning_rate": 1.0995624030999974e-05, + "loss": 0.5934, + "step": 3202 + }, + { + "epoch": 0.8541333333333333, + "grad_norm": 0.35935900137617577, + "learning_rate": 1.095627935013972e-05, + "loss": 0.6097, + "step": 3203 + }, + { + "epoch": 0.8544, + "grad_norm": 0.3401256039563951, + "learning_rate": 1.091700110725874e-05, + "loss": 0.5766, + "step": 3204 + }, + { + "epoch": 0.8546666666666667, + "grad_norm": 0.34740509250637935, + "learning_rate": 1.0877789331663612e-05, + "loss": 0.619, + "step": 3205 + }, + { + "epoch": 0.8549333333333333, + "grad_norm": 0.34838854795588037, + "learning_rate": 1.0838644052611314e-05, + "loss": 0.6076, + "step": 3206 + }, + { + "epoch": 0.8552, + "grad_norm": 0.34881942738001387, + "learning_rate": 1.0799565299309233e-05, + "loss": 0.5899, + "step": 3207 + }, + { + "epoch": 0.8554666666666667, + "grad_norm": 0.33809558751613183, + "learning_rate": 1.0760553100915093e-05, + "loss": 0.5903, + "step": 3208 + }, + { + "epoch": 0.8557333333333333, + "grad_norm": 0.3503737839708749, + "learning_rate": 1.0721607486536989e-05, + "loss": 0.5604, + "step": 3209 + }, + { + "epoch": 0.856, + "grad_norm": 0.3510536872947679, + "learning_rate": 1.0682728485233307e-05, + "loss": 0.5201, + "step": 3210 + }, + { + "epoch": 0.8562666666666666, + "grad_norm": 0.34266434087244585, + "learning_rate": 1.0643916126012755e-05, + "loss": 0.5804, + "step": 3211 + }, + { + "epoch": 0.8565333333333334, + "grad_norm": 0.35739511854891026, + "learning_rate": 1.060517043783429e-05, + "loss": 0.6024, + "step": 3212 + }, + { + "epoch": 0.8568, + "grad_norm": 0.33487402596688814, + "learning_rate": 1.0566491449607152e-05, + "loss": 0.5815, + "step": 3213 + }, + { + "epoch": 0.8570666666666666, + "grad_norm": 0.3337753554192245, + "learning_rate": 1.0527879190190793e-05, + "loss": 0.5135, + "step": 3214 + }, + { + "epoch": 0.8573333333333333, + "grad_norm": 0.34646176413437607, + "learning_rate": 1.0489333688394898e-05, + "loss": 0.563, + "step": 3215 + }, + { + "epoch": 0.8576, + "grad_norm": 0.35282410195835445, + "learning_rate": 1.0450854972979351e-05, + "loss": 0.5998, + "step": 3216 + }, + { + "epoch": 0.8578666666666667, + "grad_norm": 0.38319147117092245, + "learning_rate": 1.0412443072654132e-05, + "loss": 0.5739, + "step": 3217 + }, + { + "epoch": 0.8581333333333333, + "grad_norm": 0.32483721109258223, + "learning_rate": 1.0374098016079447e-05, + "loss": 0.5491, + "step": 3218 + }, + { + "epoch": 0.8584, + "grad_norm": 0.3608205263968364, + "learning_rate": 1.0335819831865601e-05, + "loss": 0.5895, + "step": 3219 + }, + { + "epoch": 0.8586666666666667, + "grad_norm": 0.33152552478422187, + "learning_rate": 1.0297608548573e-05, + "loss": 0.5543, + "step": 3220 + }, + { + "epoch": 0.8589333333333333, + "grad_norm": 0.3977908002600204, + "learning_rate": 1.0259464194712153e-05, + "loss": 0.582, + "step": 3221 + }, + { + "epoch": 0.8592, + "grad_norm": 0.35248138235149407, + "learning_rate": 1.0221386798743604e-05, + "loss": 0.5868, + "step": 3222 + }, + { + "epoch": 0.8594666666666667, + "grad_norm": 0.35197958026965886, + "learning_rate": 1.0183376389077948e-05, + "loss": 0.5722, + "step": 3223 + }, + { + "epoch": 0.8597333333333333, + "grad_norm": 0.34013751184786606, + "learning_rate": 1.0145432994075833e-05, + "loss": 0.5646, + "step": 3224 + }, + { + "epoch": 0.86, + "grad_norm": 0.34994347886653926, + "learning_rate": 1.010755664204781e-05, + "loss": 0.5807, + "step": 3225 + }, + { + "epoch": 0.8602666666666666, + "grad_norm": 0.34550206003348827, + "learning_rate": 1.00697473612545e-05, + "loss": 0.5491, + "step": 3226 + }, + { + "epoch": 0.8605333333333334, + "grad_norm": 0.33916558306614775, + "learning_rate": 1.0032005179906478e-05, + "loss": 0.547, + "step": 3227 + }, + { + "epoch": 0.8608, + "grad_norm": 0.3456662667542057, + "learning_rate": 9.994330126164208e-06, + "loss": 0.5791, + "step": 3228 + }, + { + "epoch": 0.8610666666666666, + "grad_norm": 0.3421016826860681, + "learning_rate": 9.956722228138083e-06, + "loss": 0.5647, + "step": 3229 + }, + { + "epoch": 0.8613333333333333, + "grad_norm": 0.35431801656339623, + "learning_rate": 9.919181513888409e-06, + "loss": 0.59, + "step": 3230 + }, + { + "epoch": 0.8616, + "grad_norm": 0.33044793847886744, + "learning_rate": 9.88170801142536e-06, + "loss": 0.5511, + "step": 3231 + }, + { + "epoch": 0.8618666666666667, + "grad_norm": 0.3696311999080211, + "learning_rate": 9.844301748708906e-06, + "loss": 0.5627, + "step": 3232 + }, + { + "epoch": 0.8621333333333333, + "grad_norm": 0.3315578626872966, + "learning_rate": 9.806962753648929e-06, + "loss": 0.5554, + "step": 3233 + }, + { + "epoch": 0.8624, + "grad_norm": 0.3272825363008379, + "learning_rate": 9.769691054105067e-06, + "loss": 0.5253, + "step": 3234 + }, + { + "epoch": 0.8626666666666667, + "grad_norm": 0.35656411744864613, + "learning_rate": 9.732486677886777e-06, + "loss": 0.5846, + "step": 3235 + }, + { + "epoch": 0.8629333333333333, + "grad_norm": 0.3433762363590593, + "learning_rate": 9.69534965275326e-06, + "loss": 0.5875, + "step": 3236 + }, + { + "epoch": 0.8632, + "grad_norm": 0.4143839401623141, + "learning_rate": 9.658280006413488e-06, + "loss": 0.601, + "step": 3237 + }, + { + "epoch": 0.8634666666666667, + "grad_norm": 0.3615864490544622, + "learning_rate": 9.621277766526138e-06, + "loss": 0.5731, + "step": 3238 + }, + { + "epoch": 0.8637333333333334, + "grad_norm": 0.3680894124977018, + "learning_rate": 9.584342960699633e-06, + "loss": 0.6064, + "step": 3239 + }, + { + "epoch": 0.864, + "grad_norm": 0.37612550367805253, + "learning_rate": 9.547475616492007e-06, + "loss": 0.5787, + "step": 3240 + }, + { + "epoch": 0.8642666666666666, + "grad_norm": 0.3678757434369968, + "learning_rate": 9.510675761411015e-06, + "loss": 0.5789, + "step": 3241 + }, + { + "epoch": 0.8645333333333334, + "grad_norm": 0.3394426309988805, + "learning_rate": 9.473943422914067e-06, + "loss": 0.5397, + "step": 3242 + }, + { + "epoch": 0.8648, + "grad_norm": 0.369978562311377, + "learning_rate": 9.437278628408153e-06, + "loss": 0.5215, + "step": 3243 + }, + { + "epoch": 0.8650666666666667, + "grad_norm": 0.3349487137594821, + "learning_rate": 9.4006814052499e-06, + "loss": 0.5357, + "step": 3244 + }, + { + "epoch": 0.8653333333333333, + "grad_norm": 0.37528542341620424, + "learning_rate": 9.364151780745501e-06, + "loss": 0.5653, + "step": 3245 + }, + { + "epoch": 0.8656, + "grad_norm": 0.34429230510042047, + "learning_rate": 9.327689782150729e-06, + "loss": 0.5621, + "step": 3246 + }, + { + "epoch": 0.8658666666666667, + "grad_norm": 0.33909386793917956, + "learning_rate": 9.291295436670877e-06, + "loss": 0.545, + "step": 3247 + }, + { + "epoch": 0.8661333333333333, + "grad_norm": 0.3449815903588223, + "learning_rate": 9.25496877146077e-06, + "loss": 0.5679, + "step": 3248 + }, + { + "epoch": 0.8664, + "grad_norm": 0.34378056093082454, + "learning_rate": 9.218709813624748e-06, + "loss": 0.5891, + "step": 3249 + }, + { + "epoch": 0.8666666666666667, + "grad_norm": 0.35957566797279594, + "learning_rate": 9.182518590216615e-06, + "loss": 0.5928, + "step": 3250 + }, + { + "epoch": 0.8669333333333333, + "grad_norm": 0.328655898207611, + "learning_rate": 9.146395128239637e-06, + "loss": 0.5625, + "step": 3251 + }, + { + "epoch": 0.8672, + "grad_norm": 0.3518279321720355, + "learning_rate": 9.110339454646532e-06, + "loss": 0.6103, + "step": 3252 + }, + { + "epoch": 0.8674666666666667, + "grad_norm": 0.557246452495665, + "learning_rate": 9.074351596339437e-06, + "loss": 0.5913, + "step": 3253 + }, + { + "epoch": 0.8677333333333334, + "grad_norm": 0.40350832276323767, + "learning_rate": 9.03843158016987e-06, + "loss": 0.5946, + "step": 3254 + }, + { + "epoch": 0.868, + "grad_norm": 0.33970570901158936, + "learning_rate": 9.002579432938795e-06, + "loss": 0.5538, + "step": 3255 + }, + { + "epoch": 0.8682666666666666, + "grad_norm": 0.3461920017211161, + "learning_rate": 8.966795181396425e-06, + "loss": 0.5388, + "step": 3256 + }, + { + "epoch": 0.8685333333333334, + "grad_norm": 0.34190066830255655, + "learning_rate": 8.931078852242413e-06, + "loss": 0.5864, + "step": 3257 + }, + { + "epoch": 0.8688, + "grad_norm": 0.3356230809666767, + "learning_rate": 8.895430472125687e-06, + "loss": 0.5621, + "step": 3258 + }, + { + "epoch": 0.8690666666666667, + "grad_norm": 0.345759208794225, + "learning_rate": 8.859850067644505e-06, + "loss": 0.6045, + "step": 3259 + }, + { + "epoch": 0.8693333333333333, + "grad_norm": 0.33861681655206805, + "learning_rate": 8.824337665346371e-06, + "loss": 0.5832, + "step": 3260 + }, + { + "epoch": 0.8696, + "grad_norm": 0.362354139084554, + "learning_rate": 8.788893291728083e-06, + "loss": 0.5989, + "step": 3261 + }, + { + "epoch": 0.8698666666666667, + "grad_norm": 0.3530734766136594, + "learning_rate": 8.753516973235654e-06, + "loss": 0.5652, + "step": 3262 + }, + { + "epoch": 0.8701333333333333, + "grad_norm": 0.44759324956005603, + "learning_rate": 8.718208736264344e-06, + "loss": 0.5075, + "step": 3263 + }, + { + "epoch": 0.8704, + "grad_norm": 0.33063703542549155, + "learning_rate": 8.682968607158604e-06, + "loss": 0.5579, + "step": 3264 + }, + { + "epoch": 0.8706666666666667, + "grad_norm": 0.37433577562887554, + "learning_rate": 8.647796612212056e-06, + "loss": 0.6016, + "step": 3265 + }, + { + "epoch": 0.8709333333333333, + "grad_norm": 0.3337620533484374, + "learning_rate": 8.612692777667498e-06, + "loss": 0.5545, + "step": 3266 + }, + { + "epoch": 0.8712, + "grad_norm": 0.31873766786679025, + "learning_rate": 8.577657129716887e-06, + "loss": 0.5149, + "step": 3267 + }, + { + "epoch": 0.8714666666666666, + "grad_norm": 0.33669869623011145, + "learning_rate": 8.542689694501272e-06, + "loss": 0.5696, + "step": 3268 + }, + { + "epoch": 0.8717333333333334, + "grad_norm": 0.3644123820171066, + "learning_rate": 8.507790498110824e-06, + "loss": 0.5294, + "step": 3269 + }, + { + "epoch": 0.872, + "grad_norm": 0.32859115143557605, + "learning_rate": 8.472959566584804e-06, + "loss": 0.5524, + "step": 3270 + }, + { + "epoch": 0.8722666666666666, + "grad_norm": 0.3649837198560756, + "learning_rate": 8.438196925911546e-06, + "loss": 0.6044, + "step": 3271 + }, + { + "epoch": 0.8725333333333334, + "grad_norm": 0.353299286740774, + "learning_rate": 8.40350260202838e-06, + "loss": 0.5793, + "step": 3272 + }, + { + "epoch": 0.8728, + "grad_norm": 0.35775039633801486, + "learning_rate": 8.36887662082172e-06, + "loss": 0.5644, + "step": 3273 + }, + { + "epoch": 0.8730666666666667, + "grad_norm": 0.3461767398297513, + "learning_rate": 8.334319008126967e-06, + "loss": 0.5772, + "step": 3274 + }, + { + "epoch": 0.8733333333333333, + "grad_norm": 0.33164862832406716, + "learning_rate": 8.299829789728498e-06, + "loss": 0.5075, + "step": 3275 + }, + { + "epoch": 0.8736, + "grad_norm": 0.36987386787045756, + "learning_rate": 8.265408991359691e-06, + "loss": 0.5705, + "step": 3276 + }, + { + "epoch": 0.8738666666666667, + "grad_norm": 0.35301635551176375, + "learning_rate": 8.231056638702839e-06, + "loss": 0.6106, + "step": 3277 + }, + { + "epoch": 0.8741333333333333, + "grad_norm": 0.3462908486757754, + "learning_rate": 8.196772757389203e-06, + "loss": 0.5562, + "step": 3278 + }, + { + "epoch": 0.8744, + "grad_norm": 0.3328326226517987, + "learning_rate": 8.162557372998913e-06, + "loss": 0.591, + "step": 3279 + }, + { + "epoch": 0.8746666666666667, + "grad_norm": 0.3388622961992753, + "learning_rate": 8.128410511061002e-06, + "loss": 0.5528, + "step": 3280 + }, + { + "epoch": 0.8749333333333333, + "grad_norm": 0.37635543294128826, + "learning_rate": 8.094332197053412e-06, + "loss": 0.6112, + "step": 3281 + }, + { + "epoch": 0.8752, + "grad_norm": 0.3799775151648602, + "learning_rate": 8.06032245640288e-06, + "loss": 0.6067, + "step": 3282 + }, + { + "epoch": 0.8754666666666666, + "grad_norm": 0.35906077347323107, + "learning_rate": 8.026381314485054e-06, + "loss": 0.5215, + "step": 3283 + }, + { + "epoch": 0.8757333333333334, + "grad_norm": 0.3287536923184325, + "learning_rate": 7.992508796624343e-06, + "loss": 0.5261, + "step": 3284 + }, + { + "epoch": 0.876, + "grad_norm": 0.327492567496066, + "learning_rate": 7.958704928093963e-06, + "loss": 0.4939, + "step": 3285 + }, + { + "epoch": 0.8762666666666666, + "grad_norm": 0.36053509661078054, + "learning_rate": 7.924969734115928e-06, + "loss": 0.5629, + "step": 3286 + }, + { + "epoch": 0.8765333333333334, + "grad_norm": 0.3363783491727698, + "learning_rate": 7.89130323986098e-06, + "loss": 0.5794, + "step": 3287 + }, + { + "epoch": 0.8768, + "grad_norm": 0.34796371043350693, + "learning_rate": 7.857705470448617e-06, + "loss": 0.5444, + "step": 3288 + }, + { + "epoch": 0.8770666666666667, + "grad_norm": 0.3184679776396525, + "learning_rate": 7.824176450947075e-06, + "loss": 0.5546, + "step": 3289 + }, + { + "epoch": 0.8773333333333333, + "grad_norm": 0.34421400841504457, + "learning_rate": 7.790716206373283e-06, + "loss": 0.5599, + "step": 3290 + }, + { + "epoch": 0.8776, + "grad_norm": 0.3532589077353095, + "learning_rate": 7.757324761692841e-06, + "loss": 0.6085, + "step": 3291 + }, + { + "epoch": 0.8778666666666667, + "grad_norm": 0.3448776749614105, + "learning_rate": 7.72400214182004e-06, + "loss": 0.5196, + "step": 3292 + }, + { + "epoch": 0.8781333333333333, + "grad_norm": 0.3350082273654233, + "learning_rate": 7.690748371617806e-06, + "loss": 0.5373, + "step": 3293 + }, + { + "epoch": 0.8784, + "grad_norm": 0.34487808315604784, + "learning_rate": 7.657563475897711e-06, + "loss": 0.575, + "step": 3294 + }, + { + "epoch": 0.8786666666666667, + "grad_norm": 0.35237549556478437, + "learning_rate": 7.624447479419883e-06, + "loss": 0.6269, + "step": 3295 + }, + { + "epoch": 0.8789333333333333, + "grad_norm": 0.33164028359496506, + "learning_rate": 7.591400406893101e-06, + "loss": 0.5488, + "step": 3296 + }, + { + "epoch": 0.8792, + "grad_norm": 0.3542185727173985, + "learning_rate": 7.558422282974708e-06, + "loss": 0.5764, + "step": 3297 + }, + { + "epoch": 0.8794666666666666, + "grad_norm": 0.3611010524609047, + "learning_rate": 7.525513132270579e-06, + "loss": 0.5811, + "step": 3298 + }, + { + "epoch": 0.8797333333333334, + "grad_norm": 0.36001884298028736, + "learning_rate": 7.492672979335147e-06, + "loss": 0.6253, + "step": 3299 + }, + { + "epoch": 0.88, + "grad_norm": 0.36426856748370834, + "learning_rate": 7.459901848671347e-06, + "loss": 0.5798, + "step": 3300 + }, + { + "epoch": 0.8802666666666666, + "grad_norm": 0.35830557996569656, + "learning_rate": 7.4271997647306415e-06, + "loss": 0.5523, + "step": 3301 + }, + { + "epoch": 0.8805333333333333, + "grad_norm": 0.34978906934982534, + "learning_rate": 7.394566751912957e-06, + "loss": 0.5263, + "step": 3302 + }, + { + "epoch": 0.8808, + "grad_norm": 0.35198915069184167, + "learning_rate": 7.3620028345666726e-06, + "loss": 0.5833, + "step": 3303 + }, + { + "epoch": 0.8810666666666667, + "grad_norm": 0.35568275541387856, + "learning_rate": 7.329508036988641e-06, + "loss": 0.5811, + "step": 3304 + }, + { + "epoch": 0.8813333333333333, + "grad_norm": 0.35916953034639415, + "learning_rate": 7.297082383424115e-06, + "loss": 0.643, + "step": 3305 + }, + { + "epoch": 0.8816, + "grad_norm": 0.34236865114061354, + "learning_rate": 7.2647258980667706e-06, + "loss": 0.5967, + "step": 3306 + }, + { + "epoch": 0.8818666666666667, + "grad_norm": 0.34567493944293115, + "learning_rate": 7.232438605058689e-06, + "loss": 0.5553, + "step": 3307 + }, + { + "epoch": 0.8821333333333333, + "grad_norm": 1.3100793124503591, + "learning_rate": 7.200220528490298e-06, + "loss": 0.591, + "step": 3308 + }, + { + "epoch": 0.8824, + "grad_norm": 0.3639618549404457, + "learning_rate": 7.168071692400402e-06, + "loss": 0.549, + "step": 3309 + }, + { + "epoch": 0.8826666666666667, + "grad_norm": 0.33472015991256804, + "learning_rate": 7.1359921207761585e-06, + "loss": 0.5326, + "step": 3310 + }, + { + "epoch": 0.8829333333333333, + "grad_norm": 0.34037985197760295, + "learning_rate": 7.1039818375529644e-06, + "loss": 0.5831, + "step": 3311 + }, + { + "epoch": 0.8832, + "grad_norm": 0.33021966945560177, + "learning_rate": 7.072040866614616e-06, + "loss": 0.5317, + "step": 3312 + }, + { + "epoch": 0.8834666666666666, + "grad_norm": 0.331136993799541, + "learning_rate": 7.040169231793137e-06, + "loss": 0.5506, + "step": 3313 + }, + { + "epoch": 0.8837333333333334, + "grad_norm": 0.3486234207619461, + "learning_rate": 7.0083669568688505e-06, + "loss": 0.6035, + "step": 3314 + }, + { + "epoch": 0.884, + "grad_norm": 0.3638037317700521, + "learning_rate": 6.976634065570309e-06, + "loss": 0.5656, + "step": 3315 + }, + { + "epoch": 0.8842666666666666, + "grad_norm": 0.3421952045586565, + "learning_rate": 6.944970581574284e-06, + "loss": 0.5403, + "step": 3316 + }, + { + "epoch": 0.8845333333333333, + "grad_norm": 0.33440102904543934, + "learning_rate": 6.913376528505799e-06, + "loss": 0.5521, + "step": 3317 + }, + { + "epoch": 0.8848, + "grad_norm": 0.3765112715460031, + "learning_rate": 6.881851929938021e-06, + "loss": 0.5992, + "step": 3318 + }, + { + "epoch": 0.8850666666666667, + "grad_norm": 0.3630694747416555, + "learning_rate": 6.850396809392356e-06, + "loss": 0.5965, + "step": 3319 + }, + { + "epoch": 0.8853333333333333, + "grad_norm": 0.3522758941752851, + "learning_rate": 6.819011190338309e-06, + "loss": 0.5636, + "step": 3320 + }, + { + "epoch": 0.8856, + "grad_norm": 0.3385246380739454, + "learning_rate": 6.78769509619358e-06, + "loss": 0.5607, + "step": 3321 + }, + { + "epoch": 0.8858666666666667, + "grad_norm": 0.32551809370357543, + "learning_rate": 6.7564485503239574e-06, + "loss": 0.5295, + "step": 3322 + }, + { + "epoch": 0.8861333333333333, + "grad_norm": 0.35627144264962807, + "learning_rate": 6.725271576043346e-06, + "loss": 0.5737, + "step": 3323 + }, + { + "epoch": 0.8864, + "grad_norm": 0.36115897784736484, + "learning_rate": 6.694164196613772e-06, + "loss": 0.5558, + "step": 3324 + }, + { + "epoch": 0.8866666666666667, + "grad_norm": 0.3341930188039213, + "learning_rate": 6.663126435245304e-06, + "loss": 0.5433, + "step": 3325 + }, + { + "epoch": 0.8869333333333334, + "grad_norm": 0.3810960423378213, + "learning_rate": 6.63215831509606e-06, + "loss": 0.5879, + "step": 3326 + }, + { + "epoch": 0.8872, + "grad_norm": 0.33993605890660894, + "learning_rate": 6.601259859272202e-06, + "loss": 0.586, + "step": 3327 + }, + { + "epoch": 0.8874666666666666, + "grad_norm": 0.35004036929817195, + "learning_rate": 6.570431090827944e-06, + "loss": 0.6266, + "step": 3328 + }, + { + "epoch": 0.8877333333333334, + "grad_norm": 0.3717503938383483, + "learning_rate": 6.539672032765465e-06, + "loss": 0.6555, + "step": 3329 + }, + { + "epoch": 0.888, + "grad_norm": 0.3445225343826448, + "learning_rate": 6.508982708034961e-06, + "loss": 0.5866, + "step": 3330 + }, + { + "epoch": 0.8882666666666666, + "grad_norm": 0.35045801232569795, + "learning_rate": 6.478363139534571e-06, + "loss": 0.5893, + "step": 3331 + }, + { + "epoch": 0.8885333333333333, + "grad_norm": 0.35154649913330455, + "learning_rate": 6.44781335011041e-06, + "loss": 0.5901, + "step": 3332 + }, + { + "epoch": 0.8888, + "grad_norm": 0.35292744534969683, + "learning_rate": 6.417333362556532e-06, + "loss": 0.5655, + "step": 3333 + }, + { + "epoch": 0.8890666666666667, + "grad_norm": 0.3456544766542196, + "learning_rate": 6.3869231996148695e-06, + "loss": 0.5373, + "step": 3334 + }, + { + "epoch": 0.8893333333333333, + "grad_norm": 0.3524234625854795, + "learning_rate": 6.3565828839753035e-06, + "loss": 0.5855, + "step": 3335 + }, + { + "epoch": 0.8896, + "grad_norm": 0.39272882262439995, + "learning_rate": 6.326312438275572e-06, + "loss": 0.6213, + "step": 3336 + }, + { + "epoch": 0.8898666666666667, + "grad_norm": 0.3448382217704263, + "learning_rate": 6.296111885101297e-06, + "loss": 0.5956, + "step": 3337 + }, + { + "epoch": 0.8901333333333333, + "grad_norm": 0.3513853832014005, + "learning_rate": 6.265981246985919e-06, + "loss": 0.5351, + "step": 3338 + }, + { + "epoch": 0.8904, + "grad_norm": 0.34568456451112917, + "learning_rate": 6.2359205464107895e-06, + "loss": 0.5505, + "step": 3339 + }, + { + "epoch": 0.8906666666666667, + "grad_norm": 0.48686243022011005, + "learning_rate": 6.2059298058049995e-06, + "loss": 0.5287, + "step": 3340 + }, + { + "epoch": 0.8909333333333334, + "grad_norm": 0.3460418659531681, + "learning_rate": 6.1760090475454834e-06, + "loss": 0.5458, + "step": 3341 + }, + { + "epoch": 0.8912, + "grad_norm": 0.36103118976956017, + "learning_rate": 6.146158293956927e-06, + "loss": 0.5813, + "step": 3342 + }, + { + "epoch": 0.8914666666666666, + "grad_norm": 0.3461758153050274, + "learning_rate": 6.116377567311793e-06, + "loss": 0.5976, + "step": 3343 + }, + { + "epoch": 0.8917333333333334, + "grad_norm": 0.3581011010382332, + "learning_rate": 6.086666889830328e-06, + "loss": 0.5854, + "step": 3344 + }, + { + "epoch": 0.892, + "grad_norm": 0.33804936136371316, + "learning_rate": 6.057026283680478e-06, + "loss": 0.5853, + "step": 3345 + }, + { + "epoch": 0.8922666666666667, + "grad_norm": 0.3340592069086362, + "learning_rate": 6.02745577097793e-06, + "loss": 0.5712, + "step": 3346 + }, + { + "epoch": 0.8925333333333333, + "grad_norm": 0.34941985204684334, + "learning_rate": 5.997955373786035e-06, + "loss": 0.5976, + "step": 3347 + }, + { + "epoch": 0.8928, + "grad_norm": 0.3643799347137535, + "learning_rate": 5.968525114115875e-06, + "loss": 0.5341, + "step": 3348 + }, + { + "epoch": 0.8930666666666667, + "grad_norm": 0.32646250831254653, + "learning_rate": 5.939165013926196e-06, + "loss": 0.5547, + "step": 3349 + }, + { + "epoch": 0.8933333333333333, + "grad_norm": 0.3454219699314769, + "learning_rate": 5.90987509512333e-06, + "loss": 0.5632, + "step": 3350 + }, + { + "epoch": 0.8936, + "grad_norm": 0.33942577975764904, + "learning_rate": 5.880655379561328e-06, + "loss": 0.5884, + "step": 3351 + }, + { + "epoch": 0.8938666666666667, + "grad_norm": 0.36689672439472804, + "learning_rate": 5.851505889041819e-06, + "loss": 0.5489, + "step": 3352 + }, + { + "epoch": 0.8941333333333333, + "grad_norm": 0.3383970330327614, + "learning_rate": 5.82242664531405e-06, + "loss": 0.5603, + "step": 3353 + }, + { + "epoch": 0.8944, + "grad_norm": 0.3449391882849117, + "learning_rate": 5.793417670074841e-06, + "loss": 0.5395, + "step": 3354 + }, + { + "epoch": 0.8946666666666667, + "grad_norm": 0.3655048693945037, + "learning_rate": 5.764478984968591e-06, + "loss": 0.6243, + "step": 3355 + }, + { + "epoch": 0.8949333333333334, + "grad_norm": 0.3619076553017383, + "learning_rate": 5.73561061158725e-06, + "loss": 0.5677, + "step": 3356 + }, + { + "epoch": 0.8952, + "grad_norm": 0.35851992046288833, + "learning_rate": 5.70681257147031e-06, + "loss": 0.5882, + "step": 3357 + }, + { + "epoch": 0.8954666666666666, + "grad_norm": 0.35672187751638573, + "learning_rate": 5.678084886104795e-06, + "loss": 0.5336, + "step": 3358 + }, + { + "epoch": 0.8957333333333334, + "grad_norm": 0.3324540489987115, + "learning_rate": 5.649427576925204e-06, + "loss": 0.5633, + "step": 3359 + }, + { + "epoch": 0.896, + "grad_norm": 0.31751610760453475, + "learning_rate": 5.620840665313554e-06, + "loss": 0.5377, + "step": 3360 + }, + { + "epoch": 0.8962666666666667, + "grad_norm": 0.3369519572095269, + "learning_rate": 5.59232417259935e-06, + "loss": 0.542, + "step": 3361 + }, + { + "epoch": 0.8965333333333333, + "grad_norm": 0.32943862031547017, + "learning_rate": 5.563878120059507e-06, + "loss": 0.5257, + "step": 3362 + }, + { + "epoch": 0.8968, + "grad_norm": 0.3545512427860248, + "learning_rate": 5.535502528918413e-06, + "loss": 0.5273, + "step": 3363 + }, + { + "epoch": 0.8970666666666667, + "grad_norm": 0.34791790302858805, + "learning_rate": 5.507197420347909e-06, + "loss": 0.5435, + "step": 3364 + }, + { + "epoch": 0.8973333333333333, + "grad_norm": 0.3672933208616374, + "learning_rate": 5.478962815467193e-06, + "loss": 0.5796, + "step": 3365 + }, + { + "epoch": 0.8976, + "grad_norm": 0.357506143076227, + "learning_rate": 5.450798735342877e-06, + "loss": 0.5818, + "step": 3366 + }, + { + "epoch": 0.8978666666666667, + "grad_norm": 0.3431617037447951, + "learning_rate": 5.422705200988975e-06, + "loss": 0.5708, + "step": 3367 + }, + { + "epoch": 0.8981333333333333, + "grad_norm": 0.3435081114997157, + "learning_rate": 5.394682233366844e-06, + "loss": 0.5992, + "step": 3368 + }, + { + "epoch": 0.8984, + "grad_norm": 0.3449471231427674, + "learning_rate": 5.366729853385189e-06, + "loss": 0.5831, + "step": 3369 + }, + { + "epoch": 0.8986666666666666, + "grad_norm": 0.3515242856874825, + "learning_rate": 5.3388480819000604e-06, + "loss": 0.5426, + "step": 3370 + }, + { + "epoch": 0.8989333333333334, + "grad_norm": 0.3278263290498706, + "learning_rate": 5.3110369397148195e-06, + "loss": 0.5698, + "step": 3371 + }, + { + "epoch": 0.8992, + "grad_norm": 0.4205046184387036, + "learning_rate": 5.283296447580121e-06, + "loss": 0.5119, + "step": 3372 + }, + { + "epoch": 0.8994666666666666, + "grad_norm": 0.33514078064743896, + "learning_rate": 5.25562662619391e-06, + "loss": 0.5361, + "step": 3373 + }, + { + "epoch": 0.8997333333333334, + "grad_norm": 0.3651376001740104, + "learning_rate": 5.2280274962014155e-06, + "loss": 0.5825, + "step": 3374 + }, + { + "epoch": 0.9, + "grad_norm": 0.34651899022009525, + "learning_rate": 5.200499078195109e-06, + "loss": 0.5548, + "step": 3375 + }, + { + "epoch": 0.9002666666666667, + "grad_norm": 0.3492079567788359, + "learning_rate": 5.173041392714695e-06, + "loss": 0.5131, + "step": 3376 + }, + { + "epoch": 0.9005333333333333, + "grad_norm": 0.3324962445554178, + "learning_rate": 5.145654460247107e-06, + "loss": 0.5635, + "step": 3377 + }, + { + "epoch": 0.9008, + "grad_norm": 0.3629628773480166, + "learning_rate": 5.118338301226511e-06, + "loss": 0.5467, + "step": 3378 + }, + { + "epoch": 0.9010666666666667, + "grad_norm": 0.3646730658641418, + "learning_rate": 5.091092936034225e-06, + "loss": 0.6, + "step": 3379 + }, + { + "epoch": 0.9013333333333333, + "grad_norm": 0.3283987708264201, + "learning_rate": 5.063918384998801e-06, + "loss": 0.5338, + "step": 3380 + }, + { + "epoch": 0.9016, + "grad_norm": 0.3581215745826271, + "learning_rate": 5.036814668395884e-06, + "loss": 0.5509, + "step": 3381 + }, + { + "epoch": 0.9018666666666667, + "grad_norm": 0.35746200480286305, + "learning_rate": 5.009781806448321e-06, + "loss": 0.5639, + "step": 3382 + }, + { + "epoch": 0.9021333333333333, + "grad_norm": 0.35833296131263564, + "learning_rate": 4.982819819326079e-06, + "loss": 0.5061, + "step": 3383 + }, + { + "epoch": 0.9024, + "grad_norm": 0.34649362525522315, + "learning_rate": 4.955928727146242e-06, + "loss": 0.5869, + "step": 3384 + }, + { + "epoch": 0.9026666666666666, + "grad_norm": 0.32997879256129825, + "learning_rate": 4.929108549972994e-06, + "loss": 0.531, + "step": 3385 + }, + { + "epoch": 0.9029333333333334, + "grad_norm": 0.3371744799059837, + "learning_rate": 4.902359307817617e-06, + "loss": 0.5512, + "step": 3386 + }, + { + "epoch": 0.9032, + "grad_norm": 0.3588750402700034, + "learning_rate": 4.875681020638445e-06, + "loss": 0.5737, + "step": 3387 + }, + { + "epoch": 0.9034666666666666, + "grad_norm": 0.3356688343085048, + "learning_rate": 4.849073708340912e-06, + "loss": 0.5524, + "step": 3388 + }, + { + "epoch": 0.9037333333333334, + "grad_norm": 0.6348790138534877, + "learning_rate": 4.822537390777438e-06, + "loss": 0.5448, + "step": 3389 + }, + { + "epoch": 0.904, + "grad_norm": 0.33941149381075175, + "learning_rate": 4.7960720877475055e-06, + "loss": 0.5512, + "step": 3390 + }, + { + "epoch": 0.9042666666666667, + "grad_norm": 0.3507717175658764, + "learning_rate": 4.76967781899762e-06, + "loss": 0.6007, + "step": 3391 + }, + { + "epoch": 0.9045333333333333, + "grad_norm": 0.334190670152143, + "learning_rate": 4.743354604221273e-06, + "loss": 0.5823, + "step": 3392 + }, + { + "epoch": 0.9048, + "grad_norm": 0.3720034306023298, + "learning_rate": 4.717102463058931e-06, + "loss": 0.5726, + "step": 3393 + }, + { + "epoch": 0.9050666666666667, + "grad_norm": 0.33913944176271804, + "learning_rate": 4.69092141509806e-06, + "loss": 0.5291, + "step": 3394 + }, + { + "epoch": 0.9053333333333333, + "grad_norm": 0.34026829187386864, + "learning_rate": 4.664811479873066e-06, + "loss": 0.5714, + "step": 3395 + }, + { + "epoch": 0.9056, + "grad_norm": 0.35689347000342464, + "learning_rate": 4.638772676865266e-06, + "loss": 0.5847, + "step": 3396 + }, + { + "epoch": 0.9058666666666667, + "grad_norm": 0.33553290888199866, + "learning_rate": 4.612805025502953e-06, + "loss": 0.5218, + "step": 3397 + }, + { + "epoch": 0.9061333333333333, + "grad_norm": 0.33773884578643815, + "learning_rate": 4.586908545161295e-06, + "loss": 0.5765, + "step": 3398 + }, + { + "epoch": 0.9064, + "grad_norm": 0.3350389585714146, + "learning_rate": 4.56108325516238e-06, + "loss": 0.5661, + "step": 3399 + }, + { + "epoch": 0.9066666666666666, + "grad_norm": 0.33419233333684584, + "learning_rate": 4.53532917477516e-06, + "loss": 0.5645, + "step": 3400 + }, + { + "epoch": 0.9069333333333334, + "grad_norm": 0.33021951009146866, + "learning_rate": 4.509646323215477e-06, + "loss": 0.5626, + "step": 3401 + }, + { + "epoch": 0.9072, + "grad_norm": 0.3390888722057864, + "learning_rate": 4.484034719646013e-06, + "loss": 0.5613, + "step": 3402 + }, + { + "epoch": 0.9074666666666666, + "grad_norm": 0.34782883901387635, + "learning_rate": 4.458494383176292e-06, + "loss": 0.5736, + "step": 3403 + }, + { + "epoch": 0.9077333333333333, + "grad_norm": 0.4285254476188629, + "learning_rate": 4.433025332862661e-06, + "loss": 0.5891, + "step": 3404 + }, + { + "epoch": 0.908, + "grad_norm": 0.35685185292864363, + "learning_rate": 4.407627587708285e-06, + "loss": 0.5484, + "step": 3405 + }, + { + "epoch": 0.9082666666666667, + "grad_norm": 0.33162199278586796, + "learning_rate": 4.3823011666631274e-06, + "loss": 0.5282, + "step": 3406 + }, + { + "epoch": 0.9085333333333333, + "grad_norm": 0.3353837709097986, + "learning_rate": 4.357046088623917e-06, + "loss": 0.5775, + "step": 3407 + }, + { + "epoch": 0.9088, + "grad_norm": 0.3422129663229781, + "learning_rate": 4.331862372434181e-06, + "loss": 0.5558, + "step": 3408 + }, + { + "epoch": 0.9090666666666667, + "grad_norm": 0.34790986516491507, + "learning_rate": 4.3067500368841665e-06, + "loss": 0.58, + "step": 3409 + }, + { + "epoch": 0.9093333333333333, + "grad_norm": 0.3601399418613975, + "learning_rate": 4.281709100710907e-06, + "loss": 0.5967, + "step": 3410 + }, + { + "epoch": 0.9096, + "grad_norm": 0.34061847847607335, + "learning_rate": 4.256739582598113e-06, + "loss": 0.5325, + "step": 3411 + }, + { + "epoch": 0.9098666666666667, + "grad_norm": 0.3474707757087644, + "learning_rate": 4.231841501176237e-06, + "loss": 0.5569, + "step": 3412 + }, + { + "epoch": 0.9101333333333333, + "grad_norm": 0.33413575214733837, + "learning_rate": 4.207014875022442e-06, + "loss": 0.5739, + "step": 3413 + }, + { + "epoch": 0.9104, + "grad_norm": 0.37447508497973897, + "learning_rate": 4.182259722660531e-06, + "loss": 0.6287, + "step": 3414 + }, + { + "epoch": 0.9106666666666666, + "grad_norm": 0.34966345445978925, + "learning_rate": 4.15757606256103e-06, + "loss": 0.5605, + "step": 3415 + }, + { + "epoch": 0.9109333333333334, + "grad_norm": 0.3569638808577305, + "learning_rate": 4.132963913141097e-06, + "loss": 0.5752, + "step": 3416 + }, + { + "epoch": 0.9112, + "grad_norm": 0.3828216812747884, + "learning_rate": 4.108423292764529e-06, + "loss": 0.5446, + "step": 3417 + }, + { + "epoch": 0.9114666666666666, + "grad_norm": 0.3426205356289075, + "learning_rate": 4.083954219741759e-06, + "loss": 0.5346, + "step": 3418 + }, + { + "epoch": 0.9117333333333333, + "grad_norm": 0.3729100291429102, + "learning_rate": 4.059556712329849e-06, + "loss": 0.5793, + "step": 3419 + }, + { + "epoch": 0.912, + "grad_norm": 0.3800997619534846, + "learning_rate": 4.035230788732447e-06, + "loss": 0.5434, + "step": 3420 + }, + { + "epoch": 0.9122666666666667, + "grad_norm": 0.3548511885515272, + "learning_rate": 4.010976467099781e-06, + "loss": 0.5415, + "step": 3421 + }, + { + "epoch": 0.9125333333333333, + "grad_norm": 0.34714705832546566, + "learning_rate": 3.986793765528696e-06, + "loss": 0.553, + "step": 3422 + }, + { + "epoch": 0.9128, + "grad_norm": 0.3201284214399404, + "learning_rate": 3.962682702062559e-06, + "loss": 0.5349, + "step": 3423 + }, + { + "epoch": 0.9130666666666667, + "grad_norm": 0.3361315237513386, + "learning_rate": 3.938643294691302e-06, + "loss": 0.5383, + "step": 3424 + }, + { + "epoch": 0.9133333333333333, + "grad_norm": 0.4288977807737274, + "learning_rate": 3.9146755613514e-06, + "loss": 0.5554, + "step": 3425 + }, + { + "epoch": 0.9136, + "grad_norm": 0.37507120678643363, + "learning_rate": 3.890779519925825e-06, + "loss": 0.6013, + "step": 3426 + }, + { + "epoch": 0.9138666666666667, + "grad_norm": 0.3352797784670776, + "learning_rate": 3.866955188244092e-06, + "loss": 0.5972, + "step": 3427 + }, + { + "epoch": 0.9141333333333334, + "grad_norm": 0.36911711740443326, + "learning_rate": 3.843202584082161e-06, + "loss": 0.5588, + "step": 3428 + }, + { + "epoch": 0.9144, + "grad_norm": 0.3357896324309692, + "learning_rate": 3.819521725162545e-06, + "loss": 0.5694, + "step": 3429 + }, + { + "epoch": 0.9146666666666666, + "grad_norm": 0.33953496253329435, + "learning_rate": 3.7959126291541637e-06, + "loss": 0.5681, + "step": 3430 + }, + { + "epoch": 0.9149333333333334, + "grad_norm": 0.37423766467800623, + "learning_rate": 3.772375313672427e-06, + "loss": 0.585, + "step": 3431 + }, + { + "epoch": 0.9152, + "grad_norm": 0.32183862915826045, + "learning_rate": 3.7489097962791653e-06, + "loss": 0.5933, + "step": 3432 + }, + { + "epoch": 0.9154666666666667, + "grad_norm": 0.32908688690867044, + "learning_rate": 3.7255160944826617e-06, + "loss": 0.5889, + "step": 3433 + }, + { + "epoch": 0.9157333333333333, + "grad_norm": 0.3885315153699735, + "learning_rate": 3.7021942257375984e-06, + "loss": 0.5675, + "step": 3434 + }, + { + "epoch": 0.916, + "grad_norm": 0.3437928236652113, + "learning_rate": 3.6789442074450565e-06, + "loss": 0.5688, + "step": 3435 + }, + { + "epoch": 0.9162666666666667, + "grad_norm": 0.33830839924802564, + "learning_rate": 3.6557660569525253e-06, + "loss": 0.595, + "step": 3436 + }, + { + "epoch": 0.9165333333333333, + "grad_norm": 0.3668143129682812, + "learning_rate": 3.6326597915538608e-06, + "loss": 0.5922, + "step": 3437 + }, + { + "epoch": 0.9168, + "grad_norm": 0.3359897994039373, + "learning_rate": 3.6096254284892827e-06, + "loss": 0.5386, + "step": 3438 + }, + { + "epoch": 0.9170666666666667, + "grad_norm": 0.3452835638356792, + "learning_rate": 3.586662984945377e-06, + "loss": 0.5725, + "step": 3439 + }, + { + "epoch": 0.9173333333333333, + "grad_norm": 0.3319391224743454, + "learning_rate": 3.5637724780550385e-06, + "loss": 0.5487, + "step": 3440 + }, + { + "epoch": 0.9176, + "grad_norm": 0.3653469881855482, + "learning_rate": 3.5409539248975278e-06, + "loss": 0.5393, + "step": 3441 + }, + { + "epoch": 0.9178666666666667, + "grad_norm": 0.3591275259862968, + "learning_rate": 3.518207342498392e-06, + "loss": 0.5637, + "step": 3442 + }, + { + "epoch": 0.9181333333333334, + "grad_norm": 0.3500003388293157, + "learning_rate": 3.4955327478294665e-06, + "loss": 0.5789, + "step": 3443 + }, + { + "epoch": 0.9184, + "grad_norm": 0.35556080567542436, + "learning_rate": 3.472930157808907e-06, + "loss": 0.5455, + "step": 3444 + }, + { + "epoch": 0.9186666666666666, + "grad_norm": 0.4724456682388256, + "learning_rate": 3.4503995893011343e-06, + "loss": 0.5744, + "step": 3445 + }, + { + "epoch": 0.9189333333333334, + "grad_norm": 0.3525346173380394, + "learning_rate": 3.427941059116824e-06, + "loss": 0.5434, + "step": 3446 + }, + { + "epoch": 0.9192, + "grad_norm": 0.35242879127390236, + "learning_rate": 3.405554584012893e-06, + "loss": 0.5665, + "step": 3447 + }, + { + "epoch": 0.9194666666666667, + "grad_norm": 0.3660036809239432, + "learning_rate": 3.3832401806925262e-06, + "loss": 0.5814, + "step": 3448 + }, + { + "epoch": 0.9197333333333333, + "grad_norm": 0.32708289034734217, + "learning_rate": 3.3609978658051043e-06, + "loss": 0.5563, + "step": 3449 + }, + { + "epoch": 0.92, + "grad_norm": 0.36244401931388587, + "learning_rate": 3.338827655946253e-06, + "loss": 0.5744, + "step": 3450 + }, + { + "epoch": 0.9202666666666667, + "grad_norm": 0.32825296256281666, + "learning_rate": 3.3167295676577505e-06, + "loss": 0.6007, + "step": 3451 + }, + { + "epoch": 0.9205333333333333, + "grad_norm": 0.3298422305151278, + "learning_rate": 3.294703617427608e-06, + "loss": 0.5179, + "step": 3452 + }, + { + "epoch": 0.9208, + "grad_norm": 0.34209999603948027, + "learning_rate": 3.272749821689991e-06, + "loss": 0.534, + "step": 3453 + }, + { + "epoch": 0.9210666666666667, + "grad_norm": 0.3387623866217589, + "learning_rate": 3.250868196825241e-06, + "loss": 0.55, + "step": 3454 + }, + { + "epoch": 0.9213333333333333, + "grad_norm": 0.35384094049418513, + "learning_rate": 3.22905875915982e-06, + "loss": 0.595, + "step": 3455 + }, + { + "epoch": 0.9216, + "grad_norm": 0.34834502979965165, + "learning_rate": 3.207321524966378e-06, + "loss": 0.5903, + "step": 3456 + }, + { + "epoch": 0.9218666666666666, + "grad_norm": 0.3466414436596609, + "learning_rate": 3.1856565104636415e-06, + "loss": 0.5848, + "step": 3457 + }, + { + "epoch": 0.9221333333333334, + "grad_norm": 0.34434766048466514, + "learning_rate": 3.1640637318165132e-06, + "loss": 0.5975, + "step": 3458 + }, + { + "epoch": 0.9224, + "grad_norm": 0.3352189395887659, + "learning_rate": 3.1425432051359173e-06, + "loss": 0.5592, + "step": 3459 + }, + { + "epoch": 0.9226666666666666, + "grad_norm": 0.33372040012220905, + "learning_rate": 3.121094946478942e-06, + "loss": 0.5199, + "step": 3460 + }, + { + "epoch": 0.9229333333333334, + "grad_norm": 0.32843513845379646, + "learning_rate": 3.0997189718487084e-06, + "loss": 0.5518, + "step": 3461 + }, + { + "epoch": 0.9232, + "grad_norm": 0.35533330705449157, + "learning_rate": 3.0784152971944368e-06, + "loss": 0.56, + "step": 3462 + }, + { + "epoch": 0.9234666666666667, + "grad_norm": 0.3474625912154599, + "learning_rate": 3.0571839384113786e-06, + "loss": 0.5805, + "step": 3463 + }, + { + "epoch": 0.9237333333333333, + "grad_norm": 0.3477743232208824, + "learning_rate": 3.036024911340829e-06, + "loss": 0.5119, + "step": 3464 + }, + { + "epoch": 0.924, + "grad_norm": 0.3436521509737728, + "learning_rate": 3.0149382317701368e-06, + "loss": 0.5543, + "step": 3465 + }, + { + "epoch": 0.9242666666666667, + "grad_norm": 0.3658580228195742, + "learning_rate": 2.9939239154326613e-06, + "loss": 0.5663, + "step": 3466 + }, + { + "epoch": 0.9245333333333333, + "grad_norm": 0.3405350481687344, + "learning_rate": 2.9729819780077493e-06, + "loss": 0.5602, + "step": 3467 + }, + { + "epoch": 0.9248, + "grad_norm": 0.3223376987274596, + "learning_rate": 2.9521124351207572e-06, + "loss": 0.5723, + "step": 3468 + }, + { + "epoch": 0.9250666666666667, + "grad_norm": 0.35046144031485543, + "learning_rate": 2.9313153023430407e-06, + "loss": 0.5565, + "step": 3469 + }, + { + "epoch": 0.9253333333333333, + "grad_norm": 0.35771389343211585, + "learning_rate": 2.910590595191898e-06, + "loss": 0.6168, + "step": 3470 + }, + { + "epoch": 0.9256, + "grad_norm": 0.3463009825761227, + "learning_rate": 2.8899383291306257e-06, + "loss": 0.568, + "step": 3471 + }, + { + "epoch": 0.9258666666666666, + "grad_norm": 0.36613794154936613, + "learning_rate": 2.8693585195684324e-06, + "loss": 0.5589, + "step": 3472 + }, + { + "epoch": 0.9261333333333334, + "grad_norm": 0.31227164196314305, + "learning_rate": 2.8488511818605124e-06, + "loss": 0.5427, + "step": 3473 + }, + { + "epoch": 0.9264, + "grad_norm": 0.3359697325227988, + "learning_rate": 2.8284163313079146e-06, + "loss": 0.5235, + "step": 3474 + }, + { + "epoch": 0.9266666666666666, + "grad_norm": 0.34299822845409184, + "learning_rate": 2.8080539831576658e-06, + "loss": 0.5861, + "step": 3475 + }, + { + "epoch": 0.9269333333333334, + "grad_norm": 0.3273141045992717, + "learning_rate": 2.7877641526026785e-06, + "loss": 0.5558, + "step": 3476 + }, + { + "epoch": 0.9272, + "grad_norm": 0.3263508634726766, + "learning_rate": 2.767546854781744e-06, + "loss": 0.5396, + "step": 3477 + }, + { + "epoch": 0.9274666666666667, + "grad_norm": 0.33473440855531894, + "learning_rate": 2.747402104779562e-06, + "loss": 0.5447, + "step": 3478 + }, + { + "epoch": 0.9277333333333333, + "grad_norm": 0.342052373684546, + "learning_rate": 2.7273299176266863e-06, + "loss": 0.5933, + "step": 3479 + }, + { + "epoch": 0.928, + "grad_norm": 0.3533721872760103, + "learning_rate": 2.707330308299516e-06, + "loss": 0.5574, + "step": 3480 + }, + { + "epoch": 0.9282666666666667, + "grad_norm": 0.3549852413582081, + "learning_rate": 2.687403291720325e-06, + "loss": 0.5559, + "step": 3481 + }, + { + "epoch": 0.9285333333333333, + "grad_norm": 0.32512889753609175, + "learning_rate": 2.6675488827572093e-06, + "loss": 0.5529, + "step": 3482 + }, + { + "epoch": 0.9288, + "grad_norm": 0.34735416023315446, + "learning_rate": 2.647767096224063e-06, + "loss": 0.5616, + "step": 3483 + }, + { + "epoch": 0.9290666666666667, + "grad_norm": 0.3631690754339679, + "learning_rate": 2.6280579468806686e-06, + "loss": 0.6174, + "step": 3484 + }, + { + "epoch": 0.9293333333333333, + "grad_norm": 0.3216061865308382, + "learning_rate": 2.6084214494325523e-06, + "loss": 0.5526, + "step": 3485 + }, + { + "epoch": 0.9296, + "grad_norm": 0.3464061009823028, + "learning_rate": 2.5888576185310267e-06, + "loss": 0.5491, + "step": 3486 + }, + { + "epoch": 0.9298666666666666, + "grad_norm": 0.3342680712508537, + "learning_rate": 2.5693664687732266e-06, + "loss": 0.5373, + "step": 3487 + }, + { + "epoch": 0.9301333333333334, + "grad_norm": 0.3859268952964619, + "learning_rate": 2.5499480147020305e-06, + "loss": 0.5305, + "step": 3488 + }, + { + "epoch": 0.9304, + "grad_norm": 0.34716274196260805, + "learning_rate": 2.530602270806104e-06, + "loss": 0.5974, + "step": 3489 + }, + { + "epoch": 0.9306666666666666, + "grad_norm": 0.3504451834424654, + "learning_rate": 2.5113292515198007e-06, + "loss": 0.5384, + "step": 3490 + }, + { + "epoch": 0.9309333333333333, + "grad_norm": 0.372714929701282, + "learning_rate": 2.4921289712232842e-06, + "loss": 0.5627, + "step": 3491 + }, + { + "epoch": 0.9312, + "grad_norm": 0.3503620334766579, + "learning_rate": 2.4730014442423954e-06, + "loss": 0.5518, + "step": 3492 + }, + { + "epoch": 0.9314666666666667, + "grad_norm": 0.3414162857666363, + "learning_rate": 2.453946684848718e-06, + "loss": 0.5636, + "step": 3493 + }, + { + "epoch": 0.9317333333333333, + "grad_norm": 0.34387143909760715, + "learning_rate": 2.434964707259535e-06, + "loss": 0.555, + "step": 3494 + }, + { + "epoch": 0.932, + "grad_norm": 0.3280599696909218, + "learning_rate": 2.416055525637828e-06, + "loss": 0.544, + "step": 3495 + }, + { + "epoch": 0.9322666666666667, + "grad_norm": 0.3454103759731847, + "learning_rate": 2.397219154092245e-06, + "loss": 0.5963, + "step": 3496 + }, + { + "epoch": 0.9325333333333333, + "grad_norm": 0.34073031334530307, + "learning_rate": 2.3784556066771544e-06, + "loss": 0.5648, + "step": 3497 + }, + { + "epoch": 0.9328, + "grad_norm": 0.34052111005645935, + "learning_rate": 2.3597648973925137e-06, + "loss": 0.5897, + "step": 3498 + }, + { + "epoch": 0.9330666666666667, + "grad_norm": 0.32664114860937943, + "learning_rate": 2.3411470401840106e-06, + "loss": 0.5764, + "step": 3499 + }, + { + "epoch": 0.9333333333333333, + "grad_norm": 0.32474442746561094, + "learning_rate": 2.3226020489429232e-06, + "loss": 0.5717, + "step": 3500 + }, + { + "epoch": 0.9336, + "grad_norm": 0.3249231252319353, + "learning_rate": 2.3041299375062053e-06, + "loss": 0.551, + "step": 3501 + }, + { + "epoch": 0.9338666666666666, + "grad_norm": 0.37846352234449726, + "learning_rate": 2.285730719656376e-06, + "loss": 0.562, + "step": 3502 + }, + { + "epoch": 0.9341333333333334, + "grad_norm": 0.4046561955417867, + "learning_rate": 2.2674044091216317e-06, + "loss": 0.5555, + "step": 3503 + }, + { + "epoch": 0.9344, + "grad_norm": 0.34427098431840325, + "learning_rate": 2.2491510195757125e-06, + "loss": 0.5589, + "step": 3504 + }, + { + "epoch": 0.9346666666666666, + "grad_norm": 0.3429797432890164, + "learning_rate": 2.230970564638002e-06, + "loss": 0.5307, + "step": 3505 + }, + { + "epoch": 0.9349333333333333, + "grad_norm": 0.33760232775558985, + "learning_rate": 2.2128630578734156e-06, + "loss": 0.5706, + "step": 3506 + }, + { + "epoch": 0.9352, + "grad_norm": 0.34535329393130076, + "learning_rate": 2.1948285127924906e-06, + "loss": 0.5573, + "step": 3507 + }, + { + "epoch": 0.9354666666666667, + "grad_norm": 0.3639024855554393, + "learning_rate": 2.1768669428512745e-06, + "loss": 0.6062, + "step": 3508 + }, + { + "epoch": 0.9357333333333333, + "grad_norm": 0.33958797306627986, + "learning_rate": 2.1589783614513912e-06, + "loss": 0.5891, + "step": 3509 + }, + { + "epoch": 0.936, + "grad_norm": 0.3485710628549526, + "learning_rate": 2.1411627819400316e-06, + "loss": 0.5294, + "step": 3510 + }, + { + "epoch": 0.9362666666666667, + "grad_norm": 0.3560571091250747, + "learning_rate": 2.123420217609862e-06, + "loss": 0.5774, + "step": 3511 + }, + { + "epoch": 0.9365333333333333, + "grad_norm": 0.34546401188489917, + "learning_rate": 2.1057506816991257e-06, + "loss": 0.6323, + "step": 3512 + }, + { + "epoch": 0.9368, + "grad_norm": 0.332337997169067, + "learning_rate": 2.0881541873915335e-06, + "loss": 0.5474, + "step": 3513 + }, + { + "epoch": 0.9370666666666667, + "grad_norm": 0.3486177808451853, + "learning_rate": 2.0706307478163157e-06, + "loss": 0.5797, + "step": 3514 + }, + { + "epoch": 0.9373333333333334, + "grad_norm": 0.35016015941121903, + "learning_rate": 2.0531803760482026e-06, + "loss": 0.5521, + "step": 3515 + }, + { + "epoch": 0.9376, + "grad_norm": 0.32121514868022516, + "learning_rate": 2.0358030851073908e-06, + "loss": 0.5464, + "step": 3516 + }, + { + "epoch": 0.9378666666666666, + "grad_norm": 0.31798881260197637, + "learning_rate": 2.0184988879595635e-06, + "loss": 0.5308, + "step": 3517 + }, + { + "epoch": 0.9381333333333334, + "grad_norm": 0.3293406600497602, + "learning_rate": 2.0012677975158488e-06, + "loss": 0.5381, + "step": 3518 + }, + { + "epoch": 0.9384, + "grad_norm": 0.3176534896163629, + "learning_rate": 1.984109826632863e-06, + "loss": 0.5321, + "step": 3519 + }, + { + "epoch": 0.9386666666666666, + "grad_norm": 0.3311629912277586, + "learning_rate": 1.9670249881126202e-06, + "loss": 0.5084, + "step": 3520 + }, + { + "epoch": 0.9389333333333333, + "grad_norm": 0.3470416917579143, + "learning_rate": 1.9500132947026017e-06, + "loss": 0.5165, + "step": 3521 + }, + { + "epoch": 0.9392, + "grad_norm": 0.3631737789780499, + "learning_rate": 1.933074759095688e-06, + "loss": 0.6042, + "step": 3522 + }, + { + "epoch": 0.9394666666666667, + "grad_norm": 0.3613474421439833, + "learning_rate": 1.916209393930202e-06, + "loss": 0.601, + "step": 3523 + }, + { + "epoch": 0.9397333333333333, + "grad_norm": 0.3202828824288851, + "learning_rate": 1.8994172117898557e-06, + "loss": 0.5336, + "step": 3524 + }, + { + "epoch": 0.94, + "grad_norm": 0.3630967793339104, + "learning_rate": 1.8826982252037606e-06, + "loss": 0.6005, + "step": 3525 + }, + { + "epoch": 0.9402666666666667, + "grad_norm": 0.3467614977155515, + "learning_rate": 1.8660524466464158e-06, + "loss": 0.559, + "step": 3526 + }, + { + "epoch": 0.9405333333333333, + "grad_norm": 0.31526517066045545, + "learning_rate": 1.8494798885376863e-06, + "loss": 0.572, + "step": 3527 + }, + { + "epoch": 0.9408, + "grad_norm": 0.32889037817569244, + "learning_rate": 1.8329805632428255e-06, + "loss": 0.5506, + "step": 3528 + }, + { + "epoch": 0.9410666666666667, + "grad_norm": 0.3332969614301865, + "learning_rate": 1.81655448307243e-06, + "loss": 0.5272, + "step": 3529 + }, + { + "epoch": 0.9413333333333334, + "grad_norm": 0.3361715357953492, + "learning_rate": 1.8002016602824635e-06, + "loss": 0.5445, + "step": 3530 + }, + { + "epoch": 0.9416, + "grad_norm": 0.4313126658557049, + "learning_rate": 1.7839221070741984e-06, + "loss": 0.561, + "step": 3531 + }, + { + "epoch": 0.9418666666666666, + "grad_norm": 0.3418637138104556, + "learning_rate": 1.7677158355942635e-06, + "loss": 0.5436, + "step": 3532 + }, + { + "epoch": 0.9421333333333334, + "grad_norm": 0.3355872413959369, + "learning_rate": 1.7515828579346194e-06, + "loss": 0.5624, + "step": 3533 + }, + { + "epoch": 0.9424, + "grad_norm": 0.3245741371298666, + "learning_rate": 1.7355231861325261e-06, + "loss": 0.5463, + "step": 3534 + }, + { + "epoch": 0.9426666666666667, + "grad_norm": 0.3290236800807073, + "learning_rate": 1.7195368321705319e-06, + "loss": 0.5435, + "step": 3535 + }, + { + "epoch": 0.9429333333333333, + "grad_norm": 0.3513771202722227, + "learning_rate": 1.7036238079765178e-06, + "loss": 0.5709, + "step": 3536 + }, + { + "epoch": 0.9432, + "grad_norm": 0.3497099468242933, + "learning_rate": 1.6877841254236082e-06, + "loss": 0.5571, + "step": 3537 + }, + { + "epoch": 0.9434666666666667, + "grad_norm": 0.35291069476649944, + "learning_rate": 1.6720177963302497e-06, + "loss": 0.5829, + "step": 3538 + }, + { + "epoch": 0.9437333333333333, + "grad_norm": 0.3223102149956902, + "learning_rate": 1.6563248324600988e-06, + "loss": 0.5155, + "step": 3539 + }, + { + "epoch": 0.944, + "grad_norm": 0.3335795074723978, + "learning_rate": 1.640705245522156e-06, + "loss": 0.5325, + "step": 3540 + }, + { + "epoch": 0.9442666666666667, + "grad_norm": 0.33954714328334745, + "learning_rate": 1.6251590471705991e-06, + "loss": 0.5601, + "step": 3541 + }, + { + "epoch": 0.9445333333333333, + "grad_norm": 0.3448281605435717, + "learning_rate": 1.6096862490048935e-06, + "loss": 0.5373, + "step": 3542 + }, + { + "epoch": 0.9448, + "grad_norm": 0.3274949838020495, + "learning_rate": 1.594286862569694e-06, + "loss": 0.554, + "step": 3543 + }, + { + "epoch": 0.9450666666666667, + "grad_norm": 0.3307476735958091, + "learning_rate": 1.5789608993549421e-06, + "loss": 0.5821, + "step": 3544 + }, + { + "epoch": 0.9453333333333334, + "grad_norm": 0.3378389240492942, + "learning_rate": 1.5637083707957356e-06, + "loss": 0.5666, + "step": 3545 + }, + { + "epoch": 0.9456, + "grad_norm": 0.3563248431366006, + "learning_rate": 1.5485292882724156e-06, + "loss": 0.5548, + "step": 3546 + }, + { + "epoch": 0.9458666666666666, + "grad_norm": 0.32814676407780136, + "learning_rate": 1.5334236631105225e-06, + "loss": 0.5759, + "step": 3547 + }, + { + "epoch": 0.9461333333333334, + "grad_norm": 0.32902095074114185, + "learning_rate": 1.518391506580763e-06, + "loss": 0.5635, + "step": 3548 + }, + { + "epoch": 0.9464, + "grad_norm": 0.3405503610898305, + "learning_rate": 1.5034328298990652e-06, + "loss": 0.5489, + "step": 3549 + }, + { + "epoch": 0.9466666666666667, + "grad_norm": 0.3600760563968869, + "learning_rate": 1.48854764422649e-06, + "loss": 0.5474, + "step": 3550 + }, + { + "epoch": 0.9469333333333333, + "grad_norm": 0.34742946224123245, + "learning_rate": 1.473735960669309e-06, + "loss": 0.5635, + "step": 3551 + }, + { + "epoch": 0.9472, + "grad_norm": 0.33209614763239237, + "learning_rate": 1.4589977902789042e-06, + "loss": 0.5433, + "step": 3552 + }, + { + "epoch": 0.9474666666666667, + "grad_norm": 0.3457489799683696, + "learning_rate": 1.4443331440518459e-06, + "loss": 0.6113, + "step": 3553 + }, + { + "epoch": 0.9477333333333333, + "grad_norm": 0.3835404413535953, + "learning_rate": 1.4297420329298372e-06, + "loss": 0.6201, + "step": 3554 + }, + { + "epoch": 0.948, + "grad_norm": 0.3641691198464094, + "learning_rate": 1.4152244677996918e-06, + "loss": 0.571, + "step": 3555 + }, + { + "epoch": 0.9482666666666667, + "grad_norm": 0.3322347513336434, + "learning_rate": 1.4007804594933672e-06, + "loss": 0.5121, + "step": 3556 + }, + { + "epoch": 0.9485333333333333, + "grad_norm": 0.3321959483135998, + "learning_rate": 1.3864100187879536e-06, + "loss": 0.5651, + "step": 3557 + }, + { + "epoch": 0.9488, + "grad_norm": 0.32269460844323217, + "learning_rate": 1.37211315640563e-06, + "loss": 0.5429, + "step": 3558 + }, + { + "epoch": 0.9490666666666666, + "grad_norm": 0.34910224704439924, + "learning_rate": 1.3578898830136633e-06, + "loss": 0.5747, + "step": 3559 + }, + { + "epoch": 0.9493333333333334, + "grad_norm": 0.3486414831020094, + "learning_rate": 1.3437402092244534e-06, + "loss": 0.5945, + "step": 3560 + }, + { + "epoch": 0.9496, + "grad_norm": 0.4003486975347935, + "learning_rate": 1.3296641455954438e-06, + "loss": 0.5657, + "step": 3561 + }, + { + "epoch": 0.9498666666666666, + "grad_norm": 0.3603519837752374, + "learning_rate": 1.3156617026291783e-06, + "loss": 0.5804, + "step": 3562 + }, + { + "epoch": 0.9501333333333334, + "grad_norm": 0.3484500566141622, + "learning_rate": 1.3017328907732774e-06, + "loss": 0.6287, + "step": 3563 + }, + { + "epoch": 0.9504, + "grad_norm": 0.3534093927288817, + "learning_rate": 1.2878777204204052e-06, + "loss": 0.5885, + "step": 3564 + }, + { + "epoch": 0.9506666666666667, + "grad_norm": 0.3324863923939078, + "learning_rate": 1.2740962019082814e-06, + "loss": 0.5441, + "step": 3565 + }, + { + "epoch": 0.9509333333333333, + "grad_norm": 0.34486810729696377, + "learning_rate": 1.2603883455196918e-06, + "loss": 0.5297, + "step": 3566 + }, + { + "epoch": 0.9512, + "grad_norm": 0.35007800836673886, + "learning_rate": 1.246754161482433e-06, + "loss": 0.604, + "step": 3567 + }, + { + "epoch": 0.9514666666666667, + "grad_norm": 0.3568450938297453, + "learning_rate": 1.233193659969356e-06, + "loss": 0.557, + "step": 3568 + }, + { + "epoch": 0.9517333333333333, + "grad_norm": 0.35780339545413115, + "learning_rate": 1.2197068510983123e-06, + "loss": 0.5756, + "step": 3569 + }, + { + "epoch": 0.952, + "grad_norm": 0.34446415802344327, + "learning_rate": 1.2062937449321853e-06, + "loss": 0.5437, + "step": 3570 + }, + { + "epoch": 0.9522666666666667, + "grad_norm": 0.34101359780530865, + "learning_rate": 1.192954351478881e-06, + "loss": 0.6003, + "step": 3571 + }, + { + "epoch": 0.9525333333333333, + "grad_norm": 0.3516004308518511, + "learning_rate": 1.1796886806912711e-06, + "loss": 0.5662, + "step": 3572 + }, + { + "epoch": 0.9528, + "grad_norm": 0.33560129676716577, + "learning_rate": 1.166496742467249e-06, + "loss": 0.6255, + "step": 3573 + }, + { + "epoch": 0.9530666666666666, + "grad_norm": 0.34565720758360824, + "learning_rate": 1.153378546649686e-06, + "loss": 0.5983, + "step": 3574 + }, + { + "epoch": 0.9533333333333334, + "grad_norm": 0.35761641574607006, + "learning_rate": 1.1403341030264192e-06, + "loss": 0.5413, + "step": 3575 + }, + { + "epoch": 0.9536, + "grad_norm": 0.33527998229172135, + "learning_rate": 1.1273634213302742e-06, + "loss": 0.5339, + "step": 3576 + }, + { + "epoch": 0.9538666666666666, + "grad_norm": 0.34193888567715813, + "learning_rate": 1.1144665112390317e-06, + "loss": 0.5899, + "step": 3577 + }, + { + "epoch": 0.9541333333333334, + "grad_norm": 0.3377647101175997, + "learning_rate": 1.101643382375439e-06, + "loss": 0.5485, + "step": 3578 + }, + { + "epoch": 0.9544, + "grad_norm": 0.34807809101284953, + "learning_rate": 1.088894044307176e-06, + "loss": 0.531, + "step": 3579 + }, + { + "epoch": 0.9546666666666667, + "grad_norm": 0.3483662790485872, + "learning_rate": 1.076218506546889e-06, + "loss": 0.603, + "step": 3580 + }, + { + "epoch": 0.9549333333333333, + "grad_norm": 0.3388854005197344, + "learning_rate": 1.0636167785521456e-06, + "loss": 0.5688, + "step": 3581 + }, + { + "epoch": 0.9552, + "grad_norm": 0.35006377953284007, + "learning_rate": 1.0510888697254362e-06, + "loss": 0.5441, + "step": 3582 + }, + { + "epoch": 0.9554666666666667, + "grad_norm": 0.35650236259787543, + "learning_rate": 1.0386347894141834e-06, + "loss": 0.56, + "step": 3583 + }, + { + "epoch": 0.9557333333333333, + "grad_norm": 0.35339656611204234, + "learning_rate": 1.026254546910721e-06, + "loss": 0.5895, + "step": 3584 + }, + { + "epoch": 0.956, + "grad_norm": 0.35522206248276794, + "learning_rate": 1.013948151452282e-06, + "loss": 0.5541, + "step": 3585 + }, + { + "epoch": 0.9562666666666667, + "grad_norm": 0.45822286253643324, + "learning_rate": 1.0017156122210103e-06, + "loss": 0.5335, + "step": 3586 + }, + { + "epoch": 0.9565333333333333, + "grad_norm": 0.36208808271064635, + "learning_rate": 9.895569383439497e-07, + "loss": 0.5345, + "step": 3587 + }, + { + "epoch": 0.9568, + "grad_norm": 0.35193403727049133, + "learning_rate": 9.774721388930208e-07, + "loss": 0.5733, + "step": 3588 + }, + { + "epoch": 0.9570666666666666, + "grad_norm": 0.3439579316664456, + "learning_rate": 9.654612228850112e-07, + "loss": 0.6056, + "step": 3589 + }, + { + "epoch": 0.9573333333333334, + "grad_norm": 0.33881712346131376, + "learning_rate": 9.53524199281619e-07, + "loss": 0.5533, + "step": 3590 + }, + { + "epoch": 0.9576, + "grad_norm": 0.33976941180649134, + "learning_rate": 9.416610769893863e-07, + "loss": 0.5525, + "step": 3591 + }, + { + "epoch": 0.9578666666666666, + "grad_norm": 0.32874693623032547, + "learning_rate": 9.298718648596882e-07, + "loss": 0.5839, + "step": 3592 + }, + { + "epoch": 0.9581333333333333, + "grad_norm": 0.3587153259638571, + "learning_rate": 9.181565716888108e-07, + "loss": 0.5308, + "step": 3593 + }, + { + "epoch": 0.9584, + "grad_norm": 0.34000172209033314, + "learning_rate": 9.065152062178394e-07, + "loss": 0.5948, + "step": 3594 + }, + { + "epoch": 0.9586666666666667, + "grad_norm": 0.332819201446939, + "learning_rate": 8.949477771327375e-07, + "loss": 0.5709, + "step": 3595 + }, + { + "epoch": 0.9589333333333333, + "grad_norm": 0.36653112439664776, + "learning_rate": 8.834542930642564e-07, + "loss": 0.5342, + "step": 3596 + }, + { + "epoch": 0.9592, + "grad_norm": 0.35649140485076625, + "learning_rate": 8.720347625880365e-07, + "loss": 0.5857, + "step": 3597 + }, + { + "epoch": 0.9594666666666667, + "grad_norm": 0.3382152692497552, + "learning_rate": 8.606891942244843e-07, + "loss": 0.542, + "step": 3598 + }, + { + "epoch": 0.9597333333333333, + "grad_norm": 0.3592859145050308, + "learning_rate": 8.494175964388285e-07, + "loss": 0.565, + "step": 3599 + }, + { + "epoch": 0.96, + "grad_norm": 0.3347940304545129, + "learning_rate": 8.382199776411526e-07, + "loss": 0.5408, + "step": 3600 + }, + { + "epoch": 0.9602666666666667, + "grad_norm": 0.3432807253611525, + "learning_rate": 8.270963461862735e-07, + "loss": 0.5235, + "step": 3601 + }, + { + "epoch": 0.9605333333333334, + "grad_norm": 0.3292055268567841, + "learning_rate": 8.160467103738744e-07, + "loss": 0.5691, + "step": 3602 + }, + { + "epoch": 0.9608, + "grad_norm": 0.37226392552353654, + "learning_rate": 8.050710784483606e-07, + "loss": 0.5794, + "step": 3603 + }, + { + "epoch": 0.9610666666666666, + "grad_norm": 0.3502730790953156, + "learning_rate": 7.941694585989812e-07, + "loss": 0.5478, + "step": 3604 + }, + { + "epoch": 0.9613333333333334, + "grad_norm": 0.3510793166853264, + "learning_rate": 7.833418589597297e-07, + "loss": 0.5409, + "step": 3605 + }, + { + "epoch": 0.9616, + "grad_norm": 0.35033417150917406, + "learning_rate": 7.72588287609366e-07, + "loss": 0.5365, + "step": 3606 + }, + { + "epoch": 0.9618666666666666, + "grad_norm": 0.36602968817644155, + "learning_rate": 7.619087525714385e-07, + "loss": 0.5996, + "step": 3607 + }, + { + "epoch": 0.9621333333333333, + "grad_norm": 0.34394524029132817, + "learning_rate": 7.51303261814229e-07, + "loss": 0.58, + "step": 3608 + }, + { + "epoch": 0.9624, + "grad_norm": 0.3646799709785778, + "learning_rate": 7.407718232508077e-07, + "loss": 0.6156, + "step": 3609 + }, + { + "epoch": 0.9626666666666667, + "grad_norm": 0.344240762525615, + "learning_rate": 7.303144447389554e-07, + "loss": 0.5627, + "step": 3610 + }, + { + "epoch": 0.9629333333333333, + "grad_norm": 0.33817371267390517, + "learning_rate": 7.199311340812087e-07, + "loss": 0.52, + "step": 3611 + }, + { + "epoch": 0.9632, + "grad_norm": 0.3405187485431618, + "learning_rate": 7.096218990248593e-07, + "loss": 0.5541, + "step": 3612 + }, + { + "epoch": 0.9634666666666667, + "grad_norm": 0.33714832689249224, + "learning_rate": 6.993867472618987e-07, + "loss": 0.5505, + "step": 3613 + }, + { + "epoch": 0.9637333333333333, + "grad_norm": 0.36486171535355555, + "learning_rate": 6.892256864290625e-07, + "loss": 0.5901, + "step": 3614 + }, + { + "epoch": 0.964, + "grad_norm": 0.3504758719446265, + "learning_rate": 6.791387241077973e-07, + "loss": 0.5875, + "step": 3615 + }, + { + "epoch": 0.9642666666666667, + "grad_norm": 0.34338427679710004, + "learning_rate": 6.691258678242607e-07, + "loss": 0.5188, + "step": 3616 + }, + { + "epoch": 0.9645333333333334, + "grad_norm": 0.3550936875702556, + "learning_rate": 6.591871250493209e-07, + "loss": 0.5492, + "step": 3617 + }, + { + "epoch": 0.9648, + "grad_norm": 0.3607325230619524, + "learning_rate": 6.493225031985573e-07, + "loss": 0.6269, + "step": 3618 + }, + { + "epoch": 0.9650666666666666, + "grad_norm": 0.3477495884654401, + "learning_rate": 6.395320096322266e-07, + "loss": 0.5934, + "step": 3619 + }, + { + "epoch": 0.9653333333333334, + "grad_norm": 0.3547980526668001, + "learning_rate": 6.298156516552967e-07, + "loss": 0.5718, + "step": 3620 + }, + { + "epoch": 0.9656, + "grad_norm": 0.34244665874709795, + "learning_rate": 6.201734365174017e-07, + "loss": 0.5463, + "step": 3621 + }, + { + "epoch": 0.9658666666666667, + "grad_norm": 0.34720102470117736, + "learning_rate": 6.106053714128757e-07, + "loss": 0.5417, + "step": 3622 + }, + { + "epoch": 0.9661333333333333, + "grad_norm": 0.340002079828364, + "learning_rate": 6.011114634807081e-07, + "loss": 0.5143, + "step": 3623 + }, + { + "epoch": 0.9664, + "grad_norm": 0.34122659002576333, + "learning_rate": 5.916917198045546e-07, + "loss": 0.5506, + "step": 3624 + }, + { + "epoch": 0.9666666666666667, + "grad_norm": 0.3595142029953279, + "learning_rate": 5.8234614741276e-07, + "loss": 0.6081, + "step": 3625 + }, + { + "epoch": 0.9669333333333333, + "grad_norm": 0.3379322322573059, + "learning_rate": 5.730747532783243e-07, + "loss": 0.5523, + "step": 3626 + }, + { + "epoch": 0.9672, + "grad_norm": 0.3296154389395013, + "learning_rate": 5.638775443188693e-07, + "loss": 0.5483, + "step": 3627 + }, + { + "epoch": 0.9674666666666667, + "grad_norm": 0.32713820569558444, + "learning_rate": 5.547545273966947e-07, + "loss": 0.5632, + "step": 3628 + }, + { + "epoch": 0.9677333333333333, + "grad_norm": 0.3482877643571333, + "learning_rate": 5.457057093187334e-07, + "loss": 0.5874, + "step": 3629 + }, + { + "epoch": 0.968, + "grad_norm": 0.3478564773327796, + "learning_rate": 5.367310968365624e-07, + "loss": 0.5422, + "step": 3630 + }, + { + "epoch": 0.9682666666666667, + "grad_norm": 0.33276209947772445, + "learning_rate": 5.278306966463919e-07, + "loss": 0.5686, + "step": 3631 + }, + { + "epoch": 0.9685333333333334, + "grad_norm": 0.33313289588087, + "learning_rate": 5.190045153890433e-07, + "loss": 0.5761, + "step": 3632 + }, + { + "epoch": 0.9688, + "grad_norm": 0.35060308437282434, + "learning_rate": 5.102525596499929e-07, + "loss": 0.6527, + "step": 3633 + }, + { + "epoch": 0.9690666666666666, + "grad_norm": 0.36335554292409006, + "learning_rate": 5.015748359592953e-07, + "loss": 0.6145, + "step": 3634 + }, + { + "epoch": 0.9693333333333334, + "grad_norm": 0.33928078851547144, + "learning_rate": 4.9297135079166e-07, + "loss": 0.6172, + "step": 3635 + }, + { + "epoch": 0.9696, + "grad_norm": 0.38826307780126884, + "learning_rate": 4.844421105663743e-07, + "loss": 0.5942, + "step": 3636 + }, + { + "epoch": 0.9698666666666667, + "grad_norm": 0.3525869725339327, + "learning_rate": 4.759871216473366e-07, + "loss": 0.552, + "step": 3637 + }, + { + "epoch": 0.9701333333333333, + "grad_norm": 0.34299810530404984, + "learning_rate": 4.676063903430561e-07, + "loss": 0.5806, + "step": 3638 + }, + { + "epoch": 0.9704, + "grad_norm": 0.3267379743822501, + "learning_rate": 4.5929992290661973e-07, + "loss": 0.5448, + "step": 3639 + }, + { + "epoch": 0.9706666666666667, + "grad_norm": 0.34606923723750227, + "learning_rate": 4.510677255357143e-07, + "loss": 0.5355, + "step": 3640 + }, + { + "epoch": 0.9709333333333333, + "grad_norm": 0.5412897534496282, + "learning_rate": 4.429098043726154e-07, + "loss": 0.5515, + "step": 3641 + }, + { + "epoch": 0.9712, + "grad_norm": 0.3628427434950643, + "learning_rate": 4.3482616550416523e-07, + "loss": 0.5952, + "step": 3642 + }, + { + "epoch": 0.9714666666666667, + "grad_norm": 0.6060764455488781, + "learning_rate": 4.2681681496179457e-07, + "loss": 0.542, + "step": 3643 + }, + { + "epoch": 0.9717333333333333, + "grad_norm": 0.3617796677527885, + "learning_rate": 4.188817587215121e-07, + "loss": 0.545, + "step": 3644 + }, + { + "epoch": 0.972, + "grad_norm": 0.34276116956772457, + "learning_rate": 4.110210027038597e-07, + "loss": 0.5529, + "step": 3645 + }, + { + "epoch": 0.9722666666666666, + "grad_norm": 0.3453367869788142, + "learning_rate": 4.0323455277397894e-07, + "loss": 0.5487, + "step": 3646 + }, + { + "epoch": 0.9725333333333334, + "grad_norm": 0.3601716163110895, + "learning_rate": 3.955224147415559e-07, + "loss": 0.5822, + "step": 3647 + }, + { + "epoch": 0.9728, + "grad_norm": 0.34331308511554115, + "learning_rate": 3.8788459436082115e-07, + "loss": 0.5282, + "step": 3648 + }, + { + "epoch": 0.9730666666666666, + "grad_norm": 0.3657599969727808, + "learning_rate": 3.803210973305715e-07, + "loss": 0.5712, + "step": 3649 + }, + { + "epoch": 0.9733333333333334, + "grad_norm": 0.34918736921749277, + "learning_rate": 3.7283192929412626e-07, + "loss": 0.577, + "step": 3650 + }, + { + "epoch": 0.9736, + "grad_norm": 0.3415858611957605, + "learning_rate": 3.654170958393821e-07, + "loss": 0.608, + "step": 3651 + }, + { + "epoch": 0.9738666666666667, + "grad_norm": 0.3464244819210767, + "learning_rate": 3.5807660249873585e-07, + "loss": 0.5926, + "step": 3652 + }, + { + "epoch": 0.9741333333333333, + "grad_norm": 0.35190125691584373, + "learning_rate": 3.508104547491509e-07, + "loss": 0.5972, + "step": 3653 + }, + { + "epoch": 0.9744, + "grad_norm": 0.3345522118761501, + "learning_rate": 3.436186580120793e-07, + "loss": 0.5646, + "step": 3654 + }, + { + "epoch": 0.9746666666666667, + "grad_norm": 0.33590588885090034, + "learning_rate": 3.365012176535287e-07, + "loss": 0.5649, + "step": 3655 + }, + { + "epoch": 0.9749333333333333, + "grad_norm": 0.34206677560213433, + "learning_rate": 3.294581389840068e-07, + "loss": 0.5486, + "step": 3656 + }, + { + "epoch": 0.9752, + "grad_norm": 0.3459035480155328, + "learning_rate": 3.2248942725856545e-07, + "loss": 0.5584, + "step": 3657 + }, + { + "epoch": 0.9754666666666667, + "grad_norm": 0.35073761499048756, + "learning_rate": 3.155950876767455e-07, + "loss": 0.5758, + "step": 3658 + }, + { + "epoch": 0.9757333333333333, + "grad_norm": 0.32697923909304105, + "learning_rate": 3.087751253826099e-07, + "loss": 0.5099, + "step": 3659 + }, + { + "epoch": 0.976, + "grad_norm": 0.3504979255702691, + "learning_rate": 3.020295454647104e-07, + "loss": 0.5235, + "step": 3660 + }, + { + "epoch": 0.9762666666666666, + "grad_norm": 0.3601831993531229, + "learning_rate": 2.95358352956121e-07, + "loss": 0.5357, + "step": 3661 + }, + { + "epoch": 0.9765333333333334, + "grad_norm": 0.35176994725296307, + "learning_rate": 2.8876155283440455e-07, + "loss": 0.5917, + "step": 3662 + }, + { + "epoch": 0.9768, + "grad_norm": 0.33656028844879343, + "learning_rate": 2.822391500215904e-07, + "loss": 0.5416, + "step": 3663 + }, + { + "epoch": 0.9770666666666666, + "grad_norm": 0.33395575999512844, + "learning_rate": 2.757911493842524e-07, + "loss": 0.5674, + "step": 3664 + }, + { + "epoch": 0.9773333333333334, + "grad_norm": 0.3551203866045012, + "learning_rate": 2.694175557334089e-07, + "loss": 0.5939, + "step": 3665 + }, + { + "epoch": 0.9776, + "grad_norm": 0.3379185779211055, + "learning_rate": 2.631183738245779e-07, + "loss": 0.5867, + "step": 3666 + }, + { + "epoch": 0.9778666666666667, + "grad_norm": 0.3464942105968554, + "learning_rate": 2.5689360835775557e-07, + "loss": 0.571, + "step": 3667 + }, + { + "epoch": 0.9781333333333333, + "grad_norm": 0.32517939912996274, + "learning_rate": 2.5074326397740435e-07, + "loss": 0.6034, + "step": 3668 + }, + { + "epoch": 0.9784, + "grad_norm": 0.3398040745586794, + "learning_rate": 2.446673452724646e-07, + "loss": 0.564, + "step": 3669 + }, + { + "epoch": 0.9786666666666667, + "grad_norm": 0.3491270842941653, + "learning_rate": 2.3866585677635446e-07, + "loss": 0.558, + "step": 3670 + }, + { + "epoch": 0.9789333333333333, + "grad_norm": 0.346518542501218, + "learning_rate": 2.327388029669586e-07, + "loss": 0.5784, + "step": 3671 + }, + { + "epoch": 0.9792, + "grad_norm": 0.33509570506283576, + "learning_rate": 2.2688618826659513e-07, + "loss": 0.5618, + "step": 3672 + }, + { + "epoch": 0.9794666666666667, + "grad_norm": 0.34748912641619945, + "learning_rate": 2.2110801704207097e-07, + "loss": 0.5719, + "step": 3673 + }, + { + "epoch": 0.9797333333333333, + "grad_norm": 0.35738002745102626, + "learning_rate": 2.1540429360463744e-07, + "loss": 0.5603, + "step": 3674 + }, + { + "epoch": 0.98, + "grad_norm": 0.3400461056379536, + "learning_rate": 2.0977502221000145e-07, + "loss": 0.575, + "step": 3675 + }, + { + "epoch": 0.9802666666666666, + "grad_norm": 0.337193682785471, + "learning_rate": 2.0422020705832544e-07, + "loss": 0.5735, + "step": 3676 + }, + { + "epoch": 0.9805333333333334, + "grad_norm": 0.35670374797821, + "learning_rate": 1.9873985229419411e-07, + "loss": 0.5511, + "step": 3677 + }, + { + "epoch": 0.9808, + "grad_norm": 0.36498213833039267, + "learning_rate": 1.9333396200666988e-07, + "loss": 0.5674, + "step": 3678 + }, + { + "epoch": 0.9810666666666666, + "grad_norm": 0.34443444689075975, + "learning_rate": 1.8800254022922624e-07, + "loss": 0.5369, + "step": 3679 + }, + { + "epoch": 0.9813333333333333, + "grad_norm": 0.3397360982084764, + "learning_rate": 1.8274559093978126e-07, + "loss": 0.5746, + "step": 3680 + }, + { + "epoch": 0.9816, + "grad_norm": 0.3361284184682215, + "learning_rate": 1.7756311806069737e-07, + "loss": 0.5549, + "step": 3681 + }, + { + "epoch": 0.9818666666666667, + "grad_norm": 0.4169688985782199, + "learning_rate": 1.724551254587481e-07, + "loss": 0.549, + "step": 3682 + }, + { + "epoch": 0.9821333333333333, + "grad_norm": 0.34284336851101704, + "learning_rate": 1.6742161694516257e-07, + "loss": 0.5828, + "step": 3683 + }, + { + "epoch": 0.9824, + "grad_norm": 0.36003515900908467, + "learning_rate": 1.62462596275581e-07, + "loss": 0.5616, + "step": 3684 + }, + { + "epoch": 0.9826666666666667, + "grad_norm": 0.34646733660146567, + "learning_rate": 1.5757806715005487e-07, + "loss": 0.5276, + "step": 3685 + }, + { + "epoch": 0.9829333333333333, + "grad_norm": 0.33333869172869685, + "learning_rate": 1.5276803321307987e-07, + "loss": 0.5425, + "step": 3686 + }, + { + "epoch": 0.9832, + "grad_norm": 0.3305183354420717, + "learning_rate": 1.480324980535408e-07, + "loss": 0.5382, + "step": 3687 + }, + { + "epoch": 0.9834666666666667, + "grad_norm": 0.32742540141707405, + "learning_rate": 1.4337146520475575e-07, + "loss": 0.5509, + "step": 3688 + }, + { + "epoch": 0.9837333333333333, + "grad_norm": 0.3715852882148017, + "learning_rate": 1.3878493814445392e-07, + "loss": 0.5766, + "step": 3689 + }, + { + "epoch": 0.984, + "grad_norm": 0.43519776841736296, + "learning_rate": 1.3427292029476458e-07, + "loss": 0.553, + "step": 3690 + }, + { + "epoch": 0.9842666666666666, + "grad_norm": 0.3547770909363482, + "learning_rate": 1.2983541502222807e-07, + "loss": 0.5794, + "step": 3691 + }, + { + "epoch": 0.9845333333333334, + "grad_norm": 0.35048216490954837, + "learning_rate": 1.2547242563780703e-07, + "loss": 0.5625, + "step": 3692 + }, + { + "epoch": 0.9848, + "grad_norm": 0.33134426730204997, + "learning_rate": 1.211839553968197e-07, + "loss": 0.5413, + "step": 3693 + }, + { + "epoch": 0.9850666666666666, + "grad_norm": 0.3427775147266325, + "learning_rate": 1.1697000749903986e-07, + "loss": 0.5914, + "step": 3694 + }, + { + "epoch": 0.9853333333333333, + "grad_norm": 0.36056732951662795, + "learning_rate": 1.1283058508858579e-07, + "loss": 0.5827, + "step": 3695 + }, + { + "epoch": 0.9856, + "grad_norm": 0.3569851450856535, + "learning_rate": 1.0876569125400915e-07, + "loss": 0.5374, + "step": 3696 + }, + { + "epoch": 0.9858666666666667, + "grad_norm": 0.363256035234427, + "learning_rate": 1.0477532902823939e-07, + "loss": 0.5724, + "step": 3697 + }, + { + "epoch": 0.9861333333333333, + "grad_norm": 0.33085860463381517, + "learning_rate": 1.008595013885949e-07, + "loss": 0.5255, + "step": 3698 + }, + { + "epoch": 0.9864, + "grad_norm": 0.33548418337483943, + "learning_rate": 9.701821125678301e-08, + "loss": 0.564, + "step": 3699 + }, + { + "epoch": 0.9866666666666667, + "grad_norm": 0.3503141815481428, + "learning_rate": 9.325146149888886e-08, + "loss": 0.5855, + "step": 3700 + }, + { + "epoch": 0.9869333333333333, + "grad_norm": 0.33196088931707374, + "learning_rate": 8.955925492539763e-08, + "loss": 0.5484, + "step": 3701 + }, + { + "epoch": 0.9872, + "grad_norm": 0.352459366831373, + "learning_rate": 8.594159429117233e-08, + "loss": 0.5872, + "step": 3702 + }, + { + "epoch": 0.9874666666666667, + "grad_norm": 0.37196689024942703, + "learning_rate": 8.239848229543156e-08, + "loss": 0.5822, + "step": 3703 + }, + { + "epoch": 0.9877333333333334, + "grad_norm": 0.33349634027677055, + "learning_rate": 7.892992158179401e-08, + "loss": 0.5348, + "step": 3704 + }, + { + "epoch": 0.988, + "grad_norm": 0.36103265082571734, + "learning_rate": 7.553591473825617e-08, + "loss": 0.5741, + "step": 3705 + }, + { + "epoch": 0.9882666666666666, + "grad_norm": 0.3456989224033128, + "learning_rate": 7.221646429718121e-08, + "loss": 0.5438, + "step": 3706 + }, + { + "epoch": 0.9885333333333334, + "grad_norm": 0.3572267712364149, + "learning_rate": 6.897157273528798e-08, + "loss": 0.6035, + "step": 3707 + }, + { + "epoch": 0.9888, + "grad_norm": 0.32692865411323563, + "learning_rate": 6.580124247370644e-08, + "loss": 0.5382, + "step": 3708 + }, + { + "epoch": 0.9890666666666666, + "grad_norm": 0.3572657522968857, + "learning_rate": 6.270547587787778e-08, + "loss": 0.5513, + "step": 3709 + }, + { + "epoch": 0.9893333333333333, + "grad_norm": 0.33975055889845546, + "learning_rate": 5.968427525765429e-08, + "loss": 0.5614, + "step": 3710 + }, + { + "epoch": 0.9896, + "grad_norm": 0.35863665738117934, + "learning_rate": 5.673764286724392e-08, + "loss": 0.5273, + "step": 3711 + }, + { + "epoch": 0.9898666666666667, + "grad_norm": 0.36455552879504405, + "learning_rate": 5.3865580905188005e-08, + "loss": 0.6047, + "step": 3712 + }, + { + "epoch": 0.9901333333333333, + "grad_norm": 0.3439122183945789, + "learning_rate": 5.106809151443903e-08, + "loss": 0.5605, + "step": 3713 + }, + { + "epoch": 0.9904, + "grad_norm": 0.33039237994533466, + "learning_rate": 4.834517678226069e-08, + "loss": 0.5595, + "step": 3714 + }, + { + "epoch": 0.9906666666666667, + "grad_norm": 0.3207122880754927, + "learning_rate": 4.569683874029451e-08, + "loss": 0.5578, + "step": 3715 + }, + { + "epoch": 0.9909333333333333, + "grad_norm": 0.3372882212816138, + "learning_rate": 4.3123079364559834e-08, + "loss": 0.5269, + "step": 3716 + }, + { + "epoch": 0.9912, + "grad_norm": 0.33359267062609543, + "learning_rate": 4.062390057538723e-08, + "loss": 0.5489, + "step": 3717 + }, + { + "epoch": 0.9914666666666667, + "grad_norm": 0.375741265348412, + "learning_rate": 3.819930423749618e-08, + "loss": 0.5574, + "step": 3718 + }, + { + "epoch": 0.9917333333333334, + "grad_norm": 0.3383230058068102, + "learning_rate": 3.5849292159928495e-08, + "loss": 0.5764, + "step": 3719 + }, + { + "epoch": 0.992, + "grad_norm": 0.3599320619740979, + "learning_rate": 3.3573866096114903e-08, + "loss": 0.6167, + "step": 3720 + }, + { + "epoch": 0.9922666666666666, + "grad_norm": 0.3268065053039836, + "learning_rate": 3.137302774379736e-08, + "loss": 0.5484, + "step": 3721 + }, + { + "epoch": 0.9925333333333334, + "grad_norm": 0.332945731946259, + "learning_rate": 2.9246778745095628e-08, + "loss": 0.556, + "step": 3722 + }, + { + "epoch": 0.9928, + "grad_norm": 0.3592303437059356, + "learning_rate": 2.7195120686451804e-08, + "loss": 0.5138, + "step": 3723 + }, + { + "epoch": 0.9930666666666667, + "grad_norm": 0.4313976678293077, + "learning_rate": 2.5218055098663594e-08, + "loss": 0.5876, + "step": 3724 + }, + { + "epoch": 0.9933333333333333, + "grad_norm": 0.3447874235855294, + "learning_rate": 2.331558345688434e-08, + "loss": 0.5888, + "step": 3725 + }, + { + "epoch": 0.9936, + "grad_norm": 0.3290418583840294, + "learning_rate": 2.1487707180589677e-08, + "loss": 0.525, + "step": 3726 + }, + { + "epoch": 0.9938666666666667, + "grad_norm": 0.3790787818656205, + "learning_rate": 1.9734427633621987e-08, + "loss": 0.5406, + "step": 3727 + }, + { + "epoch": 0.9941333333333333, + "grad_norm": 0.33915403439916647, + "learning_rate": 1.8055746124134854e-08, + "loss": 0.5448, + "step": 3728 + }, + { + "epoch": 0.9944, + "grad_norm": 0.3571290661160332, + "learning_rate": 1.6451663904648584e-08, + "loss": 0.5222, + "step": 3729 + }, + { + "epoch": 0.9946666666666667, + "grad_norm": 0.3424379993643115, + "learning_rate": 1.4922182172016908e-08, + "loss": 0.5817, + "step": 3730 + }, + { + "epoch": 0.9949333333333333, + "grad_norm": 0.3461349164703394, + "learning_rate": 1.3467302067426969e-08, + "loss": 0.5614, + "step": 3731 + }, + { + "epoch": 0.9952, + "grad_norm": 0.35538227732488453, + "learning_rate": 1.2087024676388226e-08, + "loss": 0.5674, + "step": 3732 + }, + { + "epoch": 0.9954666666666667, + "grad_norm": 0.3822033804860934, + "learning_rate": 1.0781351028787967e-08, + "loss": 0.5911, + "step": 3733 + }, + { + "epoch": 0.9957333333333334, + "grad_norm": 0.32801843446391854, + "learning_rate": 9.55028209881359e-09, + "loss": 0.5777, + "step": 3734 + }, + { + "epoch": 0.996, + "grad_norm": 0.34182533513928925, + "learning_rate": 8.393818804997012e-09, + "loss": 0.575, + "step": 3735 + }, + { + "epoch": 0.9962666666666666, + "grad_norm": 0.3425173015154349, + "learning_rate": 7.311962010214668e-09, + "loss": 0.5494, + "step": 3736 + }, + { + "epoch": 0.9965333333333334, + "grad_norm": 0.33321485466971074, + "learning_rate": 6.304712521665312e-09, + "loss": 0.5913, + "step": 3737 + }, + { + "epoch": 0.9968, + "grad_norm": 0.48496049359812166, + "learning_rate": 5.372071090892217e-09, + "loss": 0.582, + "step": 3738 + }, + { + "epoch": 0.9970666666666667, + "grad_norm": 0.3419797651284938, + "learning_rate": 4.514038413749866e-09, + "loss": 0.5868, + "step": 3739 + }, + { + "epoch": 0.9973333333333333, + "grad_norm": 0.3450179455877578, + "learning_rate": 3.730615130448367e-09, + "loss": 0.5628, + "step": 3740 + }, + { + "epoch": 0.9976, + "grad_norm": 0.3369608810674822, + "learning_rate": 3.0218018255312454e-09, + "loss": 0.5876, + "step": 3741 + }, + { + "epoch": 0.9978666666666667, + "grad_norm": 0.3213782094523305, + "learning_rate": 2.387599027853238e-09, + "loss": 0.536, + "step": 3742 + }, + { + "epoch": 0.9981333333333333, + "grad_norm": 0.3373379710366741, + "learning_rate": 1.8280072106025003e-09, + "loss": 0.5671, + "step": 3743 + }, + { + "epoch": 0.9984, + "grad_norm": 0.347081184033703, + "learning_rate": 1.3430267913228101e-09, + "loss": 0.6081, + "step": 3744 + }, + { + "epoch": 0.9986666666666667, + "grad_norm": 0.37614554963750674, + "learning_rate": 9.326581318691574e-10, + "loss": 0.6076, + "step": 3745 + }, + { + "epoch": 0.9989333333333333, + "grad_norm": 0.3391325586916799, + "learning_rate": 5.969015384188481e-10, + "loss": 0.5597, + "step": 3746 + }, + { + "epoch": 0.9992, + "grad_norm": 0.33087837723988733, + "learning_rate": 3.357572614937077e-10, + "loss": 0.5337, + "step": 3747 + }, + { + "epoch": 0.9994666666666666, + "grad_norm": 0.37229213635787656, + "learning_rate": 1.4922549594897916e-10, + "loss": 0.6559, + "step": 3748 + }, + { + "epoch": 0.9997333333333334, + "grad_norm": 0.3555346325789637, + "learning_rate": 3.7306380940016484e-11, + "loss": 0.5591, + "step": 3749 + }, + { + "epoch": 1.0, + "grad_norm": 0.3498437473308086, + "learning_rate": 0.0, + "loss": 0.6484, + "step": 3750 + }, + { + "epoch": 1.0, + "step": 3750, + "total_flos": 3338724519444480.0, + "train_loss": 0.6330405312856039, + "train_runtime": 59473.6136, + "train_samples_per_second": 1.009, + "train_steps_per_second": 0.063 + } + ], + "logging_steps": 1.0, + "max_steps": 3750, + "num_input_tokens_seen": 0, + "num_train_epochs": 1, + "save_steps": 500, + "stateful_callbacks": { + "TrainerControl": { + "args": { + "should_epoch_stop": false, + "should_evaluate": false, + "should_log": false, + "should_save": true, + "should_training_stop": false + }, + "attributes": {} + } + }, + "total_flos": 3338724519444480.0, + "train_batch_size": 8, + "trial_name": null, + "trial_params": null +} diff --git a/single_dataset/img2json/VideoGameBunny_v1_1-Llama-3-8B-V-img2json_dataset_90000_epochs_1_lora/README.md b/single_dataset/img2json/VideoGameBunny_v1_1-Llama-3-8B-V-img2json_dataset_90000_epochs_1_lora/README.md new file mode 100644 index 0000000000000000000000000000000000000000..4f291d8d6fdb82fcff274f83bcc52b834347797c --- /dev/null +++ b/single_dataset/img2json/VideoGameBunny_v1_1-Llama-3-8B-V-img2json_dataset_90000_epochs_1_lora/README.md @@ -0,0 +1,202 @@ +--- +library_name: peft +base_model: ./weights/Bunny-v1_1-Llama-3-8B-V +--- + +# Model Card for Model ID + + + + + +## Model Details + +### Model Description + + + + + +- **Developed by:** [More Information Needed] +- **Funded by [optional]:** [More Information Needed] +- **Shared by [optional]:** [More Information Needed] +- **Model type:** [More Information Needed] +- **Language(s) (NLP):** [More Information Needed] +- **License:** [More Information Needed] +- **Finetuned from model [optional]:** [More Information Needed] + +### Model Sources [optional] + + + +- **Repository:** [More Information Needed] +- **Paper [optional]:** [More Information Needed] +- **Demo [optional]:** [More Information Needed] + +## Uses + + + +### Direct Use + + + +[More Information Needed] + +### Downstream Use [optional] + + + +[More Information Needed] + +### Out-of-Scope Use + + + +[More Information Needed] + +## Bias, Risks, and Limitations + + + +[More Information Needed] + +### Recommendations + + + +Users (both direct and downstream) should be made aware of the risks, biases and limitations of the model. More information needed for further recommendations. + +## How to Get Started with the Model + +Use the code below to get started with the model. + +[More Information Needed] + +## Training Details + +### Training Data + + + +[More Information Needed] + +### Training Procedure + + + +#### Preprocessing [optional] + +[More Information Needed] + + +#### Training Hyperparameters + +- **Training regime:** [More Information Needed] + +#### Speeds, Sizes, Times [optional] + + + +[More Information Needed] + +## Evaluation + + + +### Testing Data, Factors & Metrics + +#### Testing Data + + + +[More Information Needed] + +#### Factors + + + +[More Information Needed] + +#### Metrics + + + +[More Information Needed] + +### Results + +[More Information Needed] + +#### Summary + + + +## Model Examination [optional] + + + +[More Information Needed] + +## Environmental Impact + + + +Carbon emissions can be estimated using the [Machine Learning Impact calculator](https://mlco2.github.io/impact#compute) presented in [Lacoste et al. (2019)](https://arxiv.org/abs/1910.09700). + +- **Hardware Type:** [More Information Needed] +- **Hours used:** [More Information Needed] +- **Cloud Provider:** [More Information Needed] +- **Compute Region:** [More Information Needed] +- **Carbon Emitted:** [More Information Needed] + +## Technical Specifications [optional] + +### Model Architecture and Objective + +[More Information Needed] + +### Compute Infrastructure + +[More Information Needed] + +#### Hardware + +[More Information Needed] + +#### Software + +[More Information Needed] + +## Citation [optional] + + + +**BibTeX:** + +[More Information Needed] + +**APA:** + +[More Information Needed] + +## Glossary [optional] + + + +[More Information Needed] + +## More Information [optional] + +[More Information Needed] + +## Model Card Authors [optional] + +[More Information Needed] + +## Model Card Contact + +[More Information Needed] +### Framework versions + +- PEFT 0.11.1 \ No newline at end of file diff --git a/single_dataset/img2json/VideoGameBunny_v1_1-Llama-3-8B-V-img2json_dataset_90000_epochs_1_lora/adapter_config.json b/single_dataset/img2json/VideoGameBunny_v1_1-Llama-3-8B-V-img2json_dataset_90000_epochs_1_lora/adapter_config.json new file mode 100644 index 0000000000000000000000000000000000000000..a434797959fc324c5714ca1d99639fb0015e769e --- /dev/null +++ b/single_dataset/img2json/VideoGameBunny_v1_1-Llama-3-8B-V-img2json_dataset_90000_epochs_1_lora/adapter_config.json @@ -0,0 +1,34 @@ +{ + "alpha_pattern": {}, + "auto_mapping": null, + "base_model_name_or_path": "./weights/Bunny-v1_1-Llama-3-8B-V", + "bias": "none", + "fan_in_fan_out": false, + "inference_mode": true, + "init_lora_weights": true, + "layer_replication": null, + "layers_pattern": null, + "layers_to_transform": null, + "loftq_config": {}, + "lora_alpha": 256, + "lora_dropout": 0.1, + "megatron_config": null, + "megatron_core": "megatron.core", + "modules_to_save": null, + "peft_type": "LORA", + "r": 128, + "rank_pattern": {}, + "revision": null, + "target_modules": [ + "up_proj", + "v_proj", + "gate_proj", + "k_proj", + "down_proj", + "q_proj", + "o_proj" + ], + "task_type": "CAUSAL_LM", + "use_dora": false, + "use_rslora": false +} \ No newline at end of file diff --git a/single_dataset/img2json/VideoGameBunny_v1_1-Llama-3-8B-V-img2json_dataset_90000_epochs_1_lora/adapter_model.safetensors b/single_dataset/img2json/VideoGameBunny_v1_1-Llama-3-8B-V-img2json_dataset_90000_epochs_1_lora/adapter_model.safetensors new file mode 100644 index 0000000000000000000000000000000000000000..aff946595aedb16a0c7784ed5884c193c6cec4fa --- /dev/null +++ b/single_dataset/img2json/VideoGameBunny_v1_1-Llama-3-8B-V-img2json_dataset_90000_epochs_1_lora/adapter_model.safetensors @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:6d04187663e5e91ab625841d2987affdd8dd038cc1ac8842ac0022817f7edebf +size 671150064 diff --git a/single_dataset/img2json/VideoGameBunny_v1_1-Llama-3-8B-V-img2json_dataset_90000_epochs_1_lora/config.json b/single_dataset/img2json/VideoGameBunny_v1_1-Llama-3-8B-V-img2json_dataset_90000_epochs_1_lora/config.json new file mode 100644 index 0000000000000000000000000000000000000000..ee7ed86f4ff6f3381a6bb24981934be89aa376e7 --- /dev/null +++ b/single_dataset/img2json/VideoGameBunny_v1_1-Llama-3-8B-V-img2json_dataset_90000_epochs_1_lora/config.json @@ -0,0 +1,45 @@ +{ + "_name_or_path": "./weights/Bunny-v1_1-Llama-3-8B-V", + "architectures": [ + "BunnyLlamaForCausalLM" + ], + "attention_bias": false, + "attention_dropout": 0.0, + "auto_map": { + "AutoConfig": "configuration_bunny_llama.BunnyLlamaConfig", + "AutoModelForCausalLM": "modeling_bunny_llama.BunnyLlamaForCausalLM" + }, + "bos_token_id": 128000, + "continuous_training": false, + "eos_token_id": 128001, + "freeze_mm_mlp_adapter": false, + "hidden_act": "silu", + "hidden_size": 4096, + "image_aspect_ratio": "pad", + "initializer_range": 0.02, + "intermediate_size": 14336, + "max_position_embeddings": 8192, + "mm_hidden_size": 3456, + "mm_projector_lr": null, + "mm_projector_type": "mlp2x_gelu", + "mm_vision_tower": "./weights/siglip-so400m-patch14-384", + "model_type": "bunny-llama", + "num_attention_heads": 32, + "num_hidden_layers": 32, + "num_key_value_heads": 8, + "pretraining_tp": 1, + "rms_norm_eps": 1e-05, + "rope_scaling": null, + "rope_theta": 500000.0, + "tie_word_embeddings": false, + "tokenizer_model_max_length": 2048, + "tokenizer_padding_side": "right", + "torch_dtype": "float16", + "transformers_version": "4.41.2", + "tune_mm_mlp_adapter": false, + "unfreeze_vision_tower": true, + "use_cache": true, + "use_mm_proj": true, + "use_s2": true, + "vocab_size": 128256 +} diff --git a/single_dataset/img2json/VideoGameBunny_v1_1-Llama-3-8B-V-img2json_dataset_90000_epochs_1_lora/non_lora_trainables.bin b/single_dataset/img2json/VideoGameBunny_v1_1-Llama-3-8B-V-img2json_dataset_90000_epochs_1_lora/non_lora_trainables.bin new file mode 100644 index 0000000000000000000000000000000000000000..61c6483d87895fb78ce096b61dc5269425f053c8 --- /dev/null +++ b/single_dataset/img2json/VideoGameBunny_v1_1-Llama-3-8B-V-img2json_dataset_90000_epochs_1_lora/non_lora_trainables.bin @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:a5816ea9154f9cc287eeaf1955021709fdb1804c697d177601df873dc08573c5 +size 918507402 diff --git a/single_dataset/img2json/VideoGameBunny_v1_1-Llama-3-8B-V-img2json_dataset_90000_epochs_1_lora/trainer_state.json b/single_dataset/img2json/VideoGameBunny_v1_1-Llama-3-8B-V-img2json_dataset_90000_epochs_1_lora/trainer_state.json new file mode 100644 index 0000000000000000000000000000000000000000..42a63251765c58d2b0f1161b83917dbffd8c9bfc --- /dev/null +++ b/single_dataset/img2json/VideoGameBunny_v1_1-Llama-3-8B-V-img2json_dataset_90000_epochs_1_lora/trainer_state.json @@ -0,0 +1,39417 @@ +{ + "best_metric": null, + "best_model_checkpoint": null, + "epoch": 1.0, + "eval_steps": 500, + "global_step": 5625, + "is_hyper_param_search": false, + "is_local_process_zero": true, + "is_world_process_zero": true, + "log_history": [ + { + "epoch": 0.00017777777777777779, + "grad_norm": 1.258857068322721, + "learning_rate": 1.183431952662722e-06, + "loss": 1.5465, + "step": 1 + }, + { + "epoch": 0.00035555555555555557, + "grad_norm": 1.1499228169913636, + "learning_rate": 2.366863905325444e-06, + "loss": 1.58, + "step": 2 + }, + { + "epoch": 0.0005333333333333334, + "grad_norm": 1.1777685176179222, + "learning_rate": 3.550295857988166e-06, + "loss": 1.5768, + "step": 3 + }, + { + "epoch": 0.0007111111111111111, + "grad_norm": 1.135862332672188, + "learning_rate": 4.733727810650888e-06, + "loss": 1.5312, + "step": 4 + }, + { + "epoch": 0.0008888888888888889, + "grad_norm": 1.1356684044513992, + "learning_rate": 5.917159763313609e-06, + "loss": 1.6033, + "step": 5 + }, + { + "epoch": 0.0010666666666666667, + "grad_norm": 1.069577260780715, + "learning_rate": 7.100591715976332e-06, + "loss": 1.5259, + "step": 6 + }, + { + "epoch": 0.0012444444444444445, + "grad_norm": 1.1342025491596588, + "learning_rate": 8.284023668639054e-06, + "loss": 1.5418, + "step": 7 + }, + { + "epoch": 0.0014222222222222223, + "grad_norm": 0.9560259415069341, + "learning_rate": 9.467455621301776e-06, + "loss": 1.4498, + "step": 8 + }, + { + "epoch": 0.0016, + "grad_norm": 0.9288261561394549, + "learning_rate": 1.0650887573964498e-05, + "loss": 1.4003, + "step": 9 + }, + { + "epoch": 0.0017777777777777779, + "grad_norm": 0.9685459518549487, + "learning_rate": 1.1834319526627219e-05, + "loss": 1.4169, + "step": 10 + }, + { + "epoch": 0.0019555555555555554, + "grad_norm": 0.9747858026297386, + "learning_rate": 1.3017751479289941e-05, + "loss": 1.3656, + "step": 11 + }, + { + "epoch": 0.0021333333333333334, + "grad_norm": 0.9844283145066611, + "learning_rate": 1.4201183431952663e-05, + "loss": 1.2732, + "step": 12 + }, + { + "epoch": 0.002311111111111111, + "grad_norm": 1.0346465291991604, + "learning_rate": 1.5384615384615387e-05, + "loss": 1.2578, + "step": 13 + }, + { + "epoch": 0.002488888888888889, + "grad_norm": 0.9148693827208039, + "learning_rate": 1.6568047337278108e-05, + "loss": 1.1807, + "step": 14 + }, + { + "epoch": 0.0026666666666666666, + "grad_norm": 0.9965068572355378, + "learning_rate": 1.7751479289940828e-05, + "loss": 1.1374, + "step": 15 + }, + { + "epoch": 0.0028444444444444446, + "grad_norm": 1.2870154033513308, + "learning_rate": 1.8934911242603552e-05, + "loss": 1.1152, + "step": 16 + }, + { + "epoch": 0.003022222222222222, + "grad_norm": 0.9480513594305368, + "learning_rate": 2.0118343195266273e-05, + "loss": 1.1463, + "step": 17 + }, + { + "epoch": 0.0032, + "grad_norm": 0.8331421966271232, + "learning_rate": 2.1301775147928997e-05, + "loss": 1.0336, + "step": 18 + }, + { + "epoch": 0.0033777777777777777, + "grad_norm": 0.8228833984417278, + "learning_rate": 2.2485207100591717e-05, + "loss": 0.9945, + "step": 19 + }, + { + "epoch": 0.0035555555555555557, + "grad_norm": 0.8554826793933942, + "learning_rate": 2.3668639053254438e-05, + "loss": 0.9625, + "step": 20 + }, + { + "epoch": 0.0037333333333333333, + "grad_norm": 0.836981544556156, + "learning_rate": 2.485207100591716e-05, + "loss": 0.9352, + "step": 21 + }, + { + "epoch": 0.003911111111111111, + "grad_norm": 0.7442301067026529, + "learning_rate": 2.6035502958579882e-05, + "loss": 0.9722, + "step": 22 + }, + { + "epoch": 0.004088888888888889, + "grad_norm": 0.7223817516900877, + "learning_rate": 2.7218934911242606e-05, + "loss": 0.957, + "step": 23 + }, + { + "epoch": 0.004266666666666667, + "grad_norm": 0.6851216084244861, + "learning_rate": 2.8402366863905327e-05, + "loss": 0.9225, + "step": 24 + }, + { + "epoch": 0.0044444444444444444, + "grad_norm": 0.6173778339952876, + "learning_rate": 2.958579881656805e-05, + "loss": 0.9219, + "step": 25 + }, + { + "epoch": 0.004622222222222222, + "grad_norm": 0.6145931846767859, + "learning_rate": 3.0769230769230774e-05, + "loss": 0.89, + "step": 26 + }, + { + "epoch": 0.0048, + "grad_norm": 0.5595801943750954, + "learning_rate": 3.195266272189349e-05, + "loss": 0.8878, + "step": 27 + }, + { + "epoch": 0.004977777777777778, + "grad_norm": 0.5900905880559347, + "learning_rate": 3.3136094674556215e-05, + "loss": 0.8746, + "step": 28 + }, + { + "epoch": 0.005155555555555556, + "grad_norm": 0.5450578628301268, + "learning_rate": 3.431952662721893e-05, + "loss": 0.925, + "step": 29 + }, + { + "epoch": 0.005333333333333333, + "grad_norm": 0.6261420506557391, + "learning_rate": 3.5502958579881656e-05, + "loss": 0.927, + "step": 30 + }, + { + "epoch": 0.005511111111111111, + "grad_norm": 0.5369435958949663, + "learning_rate": 3.668639053254438e-05, + "loss": 0.8643, + "step": 31 + }, + { + "epoch": 0.005688888888888889, + "grad_norm": 0.55770743392252, + "learning_rate": 3.7869822485207104e-05, + "loss": 0.8997, + "step": 32 + }, + { + "epoch": 0.005866666666666667, + "grad_norm": 0.5585332480788119, + "learning_rate": 3.905325443786982e-05, + "loss": 0.9353, + "step": 33 + }, + { + "epoch": 0.006044444444444444, + "grad_norm": 0.5315619567883817, + "learning_rate": 4.0236686390532545e-05, + "loss": 0.8873, + "step": 34 + }, + { + "epoch": 0.006222222222222222, + "grad_norm": 0.5151717959806205, + "learning_rate": 4.142011834319527e-05, + "loss": 0.8322, + "step": 35 + }, + { + "epoch": 0.0064, + "grad_norm": 0.5274177891190261, + "learning_rate": 4.260355029585799e-05, + "loss": 0.8908, + "step": 36 + }, + { + "epoch": 0.006577777777777778, + "grad_norm": 0.4889475833535269, + "learning_rate": 4.378698224852072e-05, + "loss": 0.8288, + "step": 37 + }, + { + "epoch": 0.0067555555555555554, + "grad_norm": 0.5042479504468477, + "learning_rate": 4.4970414201183434e-05, + "loss": 0.8606, + "step": 38 + }, + { + "epoch": 0.006933333333333333, + "grad_norm": 0.5168725063826504, + "learning_rate": 4.615384615384616e-05, + "loss": 0.8608, + "step": 39 + }, + { + "epoch": 0.0071111111111111115, + "grad_norm": 0.49628507591003773, + "learning_rate": 4.7337278106508875e-05, + "loss": 0.8632, + "step": 40 + }, + { + "epoch": 0.007288888888888889, + "grad_norm": 0.501678094318914, + "learning_rate": 4.85207100591716e-05, + "loss": 0.8007, + "step": 41 + }, + { + "epoch": 0.007466666666666667, + "grad_norm": 0.49661664456150134, + "learning_rate": 4.970414201183432e-05, + "loss": 0.8357, + "step": 42 + }, + { + "epoch": 0.007644444444444444, + "grad_norm": 1.7448635336781273, + "learning_rate": 5.088757396449705e-05, + "loss": 0.786, + "step": 43 + }, + { + "epoch": 0.007822222222222222, + "grad_norm": 0.5126282855810396, + "learning_rate": 5.2071005917159764e-05, + "loss": 0.8559, + "step": 44 + }, + { + "epoch": 0.008, + "grad_norm": 0.4681780693228782, + "learning_rate": 5.3254437869822495e-05, + "loss": 0.8096, + "step": 45 + }, + { + "epoch": 0.008177777777777779, + "grad_norm": 0.4830587733085102, + "learning_rate": 5.443786982248521e-05, + "loss": 0.7422, + "step": 46 + }, + { + "epoch": 0.008355555555555555, + "grad_norm": 0.49739529758924983, + "learning_rate": 5.562130177514793e-05, + "loss": 0.8457, + "step": 47 + }, + { + "epoch": 0.008533333333333334, + "grad_norm": 0.49131119990466243, + "learning_rate": 5.680473372781065e-05, + "loss": 0.8032, + "step": 48 + }, + { + "epoch": 0.00871111111111111, + "grad_norm": 0.5181650685005122, + "learning_rate": 5.798816568047337e-05, + "loss": 0.8473, + "step": 49 + }, + { + "epoch": 0.008888888888888889, + "grad_norm": 0.5587762171778651, + "learning_rate": 5.91715976331361e-05, + "loss": 0.7942, + "step": 50 + }, + { + "epoch": 0.009066666666666667, + "grad_norm": 0.5089348340184763, + "learning_rate": 6.035502958579882e-05, + "loss": 0.8363, + "step": 51 + }, + { + "epoch": 0.009244444444444444, + "grad_norm": 0.507227980739972, + "learning_rate": 6.153846153846155e-05, + "loss": 0.8142, + "step": 52 + }, + { + "epoch": 0.009422222222222222, + "grad_norm": 0.47958342316458724, + "learning_rate": 6.272189349112427e-05, + "loss": 0.7735, + "step": 53 + }, + { + "epoch": 0.0096, + "grad_norm": 0.4722450857881361, + "learning_rate": 6.390532544378698e-05, + "loss": 0.7868, + "step": 54 + }, + { + "epoch": 0.009777777777777778, + "grad_norm": 0.5322210858958987, + "learning_rate": 6.50887573964497e-05, + "loss": 0.7873, + "step": 55 + }, + { + "epoch": 0.009955555555555556, + "grad_norm": 0.49645778251385503, + "learning_rate": 6.627218934911243e-05, + "loss": 0.8828, + "step": 56 + }, + { + "epoch": 0.010133333333333333, + "grad_norm": 0.5080150556428977, + "learning_rate": 6.745562130177515e-05, + "loss": 0.8993, + "step": 57 + }, + { + "epoch": 0.010311111111111111, + "grad_norm": 0.4899449502743537, + "learning_rate": 6.863905325443787e-05, + "loss": 0.7542, + "step": 58 + }, + { + "epoch": 0.01048888888888889, + "grad_norm": 0.5204418152838295, + "learning_rate": 6.98224852071006e-05, + "loss": 0.7403, + "step": 59 + }, + { + "epoch": 0.010666666666666666, + "grad_norm": 0.4859982853059265, + "learning_rate": 7.100591715976331e-05, + "loss": 0.7863, + "step": 60 + }, + { + "epoch": 0.010844444444444445, + "grad_norm": 0.5183938626677759, + "learning_rate": 7.218934911242604e-05, + "loss": 0.8175, + "step": 61 + }, + { + "epoch": 0.011022222222222221, + "grad_norm": 0.49081102741303423, + "learning_rate": 7.337278106508876e-05, + "loss": 0.8055, + "step": 62 + }, + { + "epoch": 0.0112, + "grad_norm": 0.4925073336413384, + "learning_rate": 7.455621301775149e-05, + "loss": 0.8165, + "step": 63 + }, + { + "epoch": 0.011377777777777778, + "grad_norm": 0.46534238114233867, + "learning_rate": 7.573964497041421e-05, + "loss": 0.7796, + "step": 64 + }, + { + "epoch": 0.011555555555555555, + "grad_norm": 0.49748346282405465, + "learning_rate": 7.692307692307693e-05, + "loss": 0.8165, + "step": 65 + }, + { + "epoch": 0.011733333333333333, + "grad_norm": 0.4535062510331349, + "learning_rate": 7.810650887573964e-05, + "loss": 0.7805, + "step": 66 + }, + { + "epoch": 0.011911111111111112, + "grad_norm": 0.4406028466683985, + "learning_rate": 7.928994082840237e-05, + "loss": 0.7858, + "step": 67 + }, + { + "epoch": 0.012088888888888889, + "grad_norm": 0.46990100693978115, + "learning_rate": 8.047337278106509e-05, + "loss": 0.7743, + "step": 68 + }, + { + "epoch": 0.012266666666666667, + "grad_norm": 0.4637431264987931, + "learning_rate": 8.165680473372781e-05, + "loss": 0.8224, + "step": 69 + }, + { + "epoch": 0.012444444444444444, + "grad_norm": 0.46791205044756595, + "learning_rate": 8.284023668639054e-05, + "loss": 0.813, + "step": 70 + }, + { + "epoch": 0.012622222222222222, + "grad_norm": 0.4630866830977058, + "learning_rate": 8.402366863905326e-05, + "loss": 0.7565, + "step": 71 + }, + { + "epoch": 0.0128, + "grad_norm": 0.4754904662743993, + "learning_rate": 8.520710059171599e-05, + "loss": 0.8047, + "step": 72 + }, + { + "epoch": 0.012977777777777777, + "grad_norm": 0.4929498834422392, + "learning_rate": 8.63905325443787e-05, + "loss": 0.7989, + "step": 73 + }, + { + "epoch": 0.013155555555555556, + "grad_norm": 0.46023194279728535, + "learning_rate": 8.757396449704143e-05, + "loss": 0.822, + "step": 74 + }, + { + "epoch": 0.013333333333333334, + "grad_norm": 0.4677139906887722, + "learning_rate": 8.875739644970414e-05, + "loss": 0.8203, + "step": 75 + }, + { + "epoch": 0.013511111111111111, + "grad_norm": 0.46126163647133056, + "learning_rate": 8.994082840236687e-05, + "loss": 0.744, + "step": 76 + }, + { + "epoch": 0.01368888888888889, + "grad_norm": 0.4670292874892004, + "learning_rate": 9.112426035502959e-05, + "loss": 0.8402, + "step": 77 + }, + { + "epoch": 0.013866666666666666, + "grad_norm": 0.5125656023075831, + "learning_rate": 9.230769230769232e-05, + "loss": 0.7985, + "step": 78 + }, + { + "epoch": 0.014044444444444444, + "grad_norm": 0.4775962168591392, + "learning_rate": 9.349112426035503e-05, + "loss": 0.723, + "step": 79 + }, + { + "epoch": 0.014222222222222223, + "grad_norm": 0.4901539088108059, + "learning_rate": 9.467455621301775e-05, + "loss": 0.7958, + "step": 80 + }, + { + "epoch": 0.0144, + "grad_norm": 0.4813658150580067, + "learning_rate": 9.585798816568048e-05, + "loss": 0.8529, + "step": 81 + }, + { + "epoch": 0.014577777777777778, + "grad_norm": 0.46073059515040743, + "learning_rate": 9.70414201183432e-05, + "loss": 0.7186, + "step": 82 + }, + { + "epoch": 0.014755555555555555, + "grad_norm": 0.43371134510796305, + "learning_rate": 9.822485207100593e-05, + "loss": 0.7143, + "step": 83 + }, + { + "epoch": 0.014933333333333333, + "grad_norm": 0.48419715159203247, + "learning_rate": 9.940828402366865e-05, + "loss": 0.8002, + "step": 84 + }, + { + "epoch": 0.015111111111111112, + "grad_norm": 0.4866592780490299, + "learning_rate": 0.00010059171597633136, + "loss": 0.7745, + "step": 85 + }, + { + "epoch": 0.015288888888888888, + "grad_norm": 0.44829557184744384, + "learning_rate": 0.0001017751479289941, + "loss": 0.7724, + "step": 86 + }, + { + "epoch": 0.015466666666666667, + "grad_norm": 0.48930248103919116, + "learning_rate": 0.0001029585798816568, + "loss": 0.7675, + "step": 87 + }, + { + "epoch": 0.015644444444444443, + "grad_norm": 0.44046392908694915, + "learning_rate": 0.00010414201183431953, + "loss": 0.8224, + "step": 88 + }, + { + "epoch": 0.015822222222222224, + "grad_norm": 0.4785048069123892, + "learning_rate": 0.00010532544378698226, + "loss": 0.7931, + "step": 89 + }, + { + "epoch": 0.016, + "grad_norm": 0.44280483377299734, + "learning_rate": 0.00010650887573964499, + "loss": 0.7943, + "step": 90 + }, + { + "epoch": 0.016177777777777777, + "grad_norm": 0.4284224951394098, + "learning_rate": 0.0001076923076923077, + "loss": 0.7845, + "step": 91 + }, + { + "epoch": 0.016355555555555557, + "grad_norm": 0.4561417148963684, + "learning_rate": 0.00010887573964497042, + "loss": 0.872, + "step": 92 + }, + { + "epoch": 0.016533333333333334, + "grad_norm": 0.4292856846799266, + "learning_rate": 0.00011005917159763315, + "loss": 0.7981, + "step": 93 + }, + { + "epoch": 0.01671111111111111, + "grad_norm": 0.468177963284816, + "learning_rate": 0.00011124260355029586, + "loss": 0.7708, + "step": 94 + }, + { + "epoch": 0.016888888888888887, + "grad_norm": 0.46163788094220537, + "learning_rate": 0.00011242603550295858, + "loss": 0.7934, + "step": 95 + }, + { + "epoch": 0.017066666666666667, + "grad_norm": 0.5379963430741206, + "learning_rate": 0.0001136094674556213, + "loss": 0.7567, + "step": 96 + }, + { + "epoch": 0.017244444444444444, + "grad_norm": 0.4711512215552466, + "learning_rate": 0.00011479289940828404, + "loss": 0.8325, + "step": 97 + }, + { + "epoch": 0.01742222222222222, + "grad_norm": 0.4680378040138463, + "learning_rate": 0.00011597633136094674, + "loss": 0.836, + "step": 98 + }, + { + "epoch": 0.0176, + "grad_norm": 0.41731652982019085, + "learning_rate": 0.00011715976331360947, + "loss": 0.7679, + "step": 99 + }, + { + "epoch": 0.017777777777777778, + "grad_norm": 0.435358473615698, + "learning_rate": 0.0001183431952662722, + "loss": 0.7828, + "step": 100 + }, + { + "epoch": 0.017955555555555554, + "grad_norm": 0.42495852158320324, + "learning_rate": 0.00011952662721893493, + "loss": 0.7832, + "step": 101 + }, + { + "epoch": 0.018133333333333335, + "grad_norm": 0.44184873982043654, + "learning_rate": 0.00012071005917159764, + "loss": 0.7487, + "step": 102 + }, + { + "epoch": 0.01831111111111111, + "grad_norm": 0.4271562627615471, + "learning_rate": 0.00012189349112426037, + "loss": 0.7724, + "step": 103 + }, + { + "epoch": 0.018488888888888888, + "grad_norm": 0.44780279081120256, + "learning_rate": 0.0001230769230769231, + "loss": 0.7949, + "step": 104 + }, + { + "epoch": 0.018666666666666668, + "grad_norm": 0.45176823188045945, + "learning_rate": 0.0001242603550295858, + "loss": 0.7299, + "step": 105 + }, + { + "epoch": 0.018844444444444445, + "grad_norm": 0.4234941506791437, + "learning_rate": 0.00012544378698224853, + "loss": 0.7299, + "step": 106 + }, + { + "epoch": 0.01902222222222222, + "grad_norm": 0.46699578697864047, + "learning_rate": 0.00012662721893491125, + "loss": 0.7915, + "step": 107 + }, + { + "epoch": 0.0192, + "grad_norm": 0.47310759599204344, + "learning_rate": 0.00012781065088757397, + "loss": 0.7832, + "step": 108 + }, + { + "epoch": 0.01937777777777778, + "grad_norm": 0.461824405614306, + "learning_rate": 0.00012899408284023668, + "loss": 0.7689, + "step": 109 + }, + { + "epoch": 0.019555555555555555, + "grad_norm": 0.47505870701737424, + "learning_rate": 0.0001301775147928994, + "loss": 0.8128, + "step": 110 + }, + { + "epoch": 0.019733333333333332, + "grad_norm": 0.45568262964730294, + "learning_rate": 0.00013136094674556214, + "loss": 0.7069, + "step": 111 + }, + { + "epoch": 0.019911111111111112, + "grad_norm": 0.44379721917882264, + "learning_rate": 0.00013254437869822486, + "loss": 0.741, + "step": 112 + }, + { + "epoch": 0.02008888888888889, + "grad_norm": 0.45064494937433286, + "learning_rate": 0.00013372781065088758, + "loss": 0.7739, + "step": 113 + }, + { + "epoch": 0.020266666666666665, + "grad_norm": 0.43217563863386493, + "learning_rate": 0.0001349112426035503, + "loss": 0.7458, + "step": 114 + }, + { + "epoch": 0.020444444444444446, + "grad_norm": 0.4198571594139149, + "learning_rate": 0.00013609467455621304, + "loss": 0.6837, + "step": 115 + }, + { + "epoch": 0.020622222222222222, + "grad_norm": 0.5378757543769198, + "learning_rate": 0.00013727810650887573, + "loss": 0.7717, + "step": 116 + }, + { + "epoch": 0.0208, + "grad_norm": 0.45219385363053144, + "learning_rate": 0.00013846153846153847, + "loss": 0.7895, + "step": 117 + }, + { + "epoch": 0.02097777777777778, + "grad_norm": 0.43258201378316113, + "learning_rate": 0.0001396449704142012, + "loss": 0.8123, + "step": 118 + }, + { + "epoch": 0.021155555555555556, + "grad_norm": 0.44382187341418, + "learning_rate": 0.0001408284023668639, + "loss": 0.7333, + "step": 119 + }, + { + "epoch": 0.021333333333333333, + "grad_norm": 0.45541998194172073, + "learning_rate": 0.00014201183431952663, + "loss": 0.8149, + "step": 120 + }, + { + "epoch": 0.021511111111111113, + "grad_norm": 0.40917327672060544, + "learning_rate": 0.00014319526627218934, + "loss": 0.7622, + "step": 121 + }, + { + "epoch": 0.02168888888888889, + "grad_norm": 0.4129317001299444, + "learning_rate": 0.0001443786982248521, + "loss": 0.7548, + "step": 122 + }, + { + "epoch": 0.021866666666666666, + "grad_norm": 0.46956772253344264, + "learning_rate": 0.0001455621301775148, + "loss": 0.7787, + "step": 123 + }, + { + "epoch": 0.022044444444444443, + "grad_norm": 0.4411855560722539, + "learning_rate": 0.00014674556213017752, + "loss": 0.708, + "step": 124 + }, + { + "epoch": 0.022222222222222223, + "grad_norm": 0.4542310213696035, + "learning_rate": 0.00014792899408284024, + "loss": 0.746, + "step": 125 + }, + { + "epoch": 0.0224, + "grad_norm": 0.4410051734654425, + "learning_rate": 0.00014911242603550298, + "loss": 0.6443, + "step": 126 + }, + { + "epoch": 0.022577777777777776, + "grad_norm": 0.44538375957173787, + "learning_rate": 0.00015029585798816567, + "loss": 0.7591, + "step": 127 + }, + { + "epoch": 0.022755555555555557, + "grad_norm": 0.4768115952145477, + "learning_rate": 0.00015147928994082842, + "loss": 0.7634, + "step": 128 + }, + { + "epoch": 0.022933333333333333, + "grad_norm": 0.4310921521908967, + "learning_rate": 0.00015266272189349113, + "loss": 0.7354, + "step": 129 + }, + { + "epoch": 0.02311111111111111, + "grad_norm": 0.4219097353849593, + "learning_rate": 0.00015384615384615385, + "loss": 0.7576, + "step": 130 + }, + { + "epoch": 0.02328888888888889, + "grad_norm": 0.4493492763866972, + "learning_rate": 0.00015502958579881657, + "loss": 0.7398, + "step": 131 + }, + { + "epoch": 0.023466666666666667, + "grad_norm": 0.44449137191974447, + "learning_rate": 0.00015621301775147929, + "loss": 0.7274, + "step": 132 + }, + { + "epoch": 0.023644444444444444, + "grad_norm": 0.4437401335874507, + "learning_rate": 0.00015739644970414203, + "loss": 0.7174, + "step": 133 + }, + { + "epoch": 0.023822222222222224, + "grad_norm": 0.4601812889711194, + "learning_rate": 0.00015857988165680475, + "loss": 0.7531, + "step": 134 + }, + { + "epoch": 0.024, + "grad_norm": 0.4128429759868119, + "learning_rate": 0.00015976331360946746, + "loss": 0.7219, + "step": 135 + }, + { + "epoch": 0.024177777777777777, + "grad_norm": 0.4376573688898078, + "learning_rate": 0.00016094674556213018, + "loss": 0.7796, + "step": 136 + }, + { + "epoch": 0.024355555555555554, + "grad_norm": 0.42645604923791425, + "learning_rate": 0.00016213017751479293, + "loss": 0.725, + "step": 137 + }, + { + "epoch": 0.024533333333333334, + "grad_norm": 0.44839644907574444, + "learning_rate": 0.00016331360946745562, + "loss": 0.7674, + "step": 138 + }, + { + "epoch": 0.02471111111111111, + "grad_norm": 0.44763816117785193, + "learning_rate": 0.00016449704142011836, + "loss": 0.7916, + "step": 139 + }, + { + "epoch": 0.024888888888888887, + "grad_norm": 0.4762683216146698, + "learning_rate": 0.00016568047337278108, + "loss": 0.7129, + "step": 140 + }, + { + "epoch": 0.025066666666666668, + "grad_norm": 0.4974991361000172, + "learning_rate": 0.0001668639053254438, + "loss": 0.7364, + "step": 141 + }, + { + "epoch": 0.025244444444444444, + "grad_norm": 0.4176301140459096, + "learning_rate": 0.0001680473372781065, + "loss": 0.7523, + "step": 142 + }, + { + "epoch": 0.02542222222222222, + "grad_norm": 0.4468545379435327, + "learning_rate": 0.00016923076923076923, + "loss": 0.7662, + "step": 143 + }, + { + "epoch": 0.0256, + "grad_norm": 0.43655264981596725, + "learning_rate": 0.00017041420118343197, + "loss": 0.7323, + "step": 144 + }, + { + "epoch": 0.025777777777777778, + "grad_norm": 0.45957397871169015, + "learning_rate": 0.0001715976331360947, + "loss": 0.7618, + "step": 145 + }, + { + "epoch": 0.025955555555555555, + "grad_norm": 0.4282793392026257, + "learning_rate": 0.0001727810650887574, + "loss": 0.8245, + "step": 146 + }, + { + "epoch": 0.026133333333333335, + "grad_norm": 0.4365734245351505, + "learning_rate": 0.00017396449704142012, + "loss": 0.7666, + "step": 147 + }, + { + "epoch": 0.02631111111111111, + "grad_norm": 0.4110531659484318, + "learning_rate": 0.00017514792899408287, + "loss": 0.7695, + "step": 148 + }, + { + "epoch": 0.026488888888888888, + "grad_norm": 0.4548179603502384, + "learning_rate": 0.00017633136094674556, + "loss": 0.7664, + "step": 149 + }, + { + "epoch": 0.02666666666666667, + "grad_norm": 0.4684089371665446, + "learning_rate": 0.00017751479289940828, + "loss": 0.7695, + "step": 150 + }, + { + "epoch": 0.026844444444444445, + "grad_norm": 0.4667143348446463, + "learning_rate": 0.00017869822485207102, + "loss": 0.7943, + "step": 151 + }, + { + "epoch": 0.027022222222222222, + "grad_norm": 1.7980695513854181, + "learning_rate": 0.00017988165680473374, + "loss": 0.7294, + "step": 152 + }, + { + "epoch": 0.0272, + "grad_norm": 0.42960716648493547, + "learning_rate": 0.00018106508875739645, + "loss": 0.7794, + "step": 153 + }, + { + "epoch": 0.02737777777777778, + "grad_norm": 0.4449433292223409, + "learning_rate": 0.00018224852071005917, + "loss": 0.7892, + "step": 154 + }, + { + "epoch": 0.027555555555555555, + "grad_norm": 0.4372015682071353, + "learning_rate": 0.00018343195266272192, + "loss": 0.7472, + "step": 155 + }, + { + "epoch": 0.027733333333333332, + "grad_norm": 0.43865271321098903, + "learning_rate": 0.00018461538461538463, + "loss": 0.7408, + "step": 156 + }, + { + "epoch": 0.027911111111111112, + "grad_norm": 0.45771557496816895, + "learning_rate": 0.00018579881656804735, + "loss": 0.7857, + "step": 157 + }, + { + "epoch": 0.02808888888888889, + "grad_norm": 0.4319156816172843, + "learning_rate": 0.00018698224852071007, + "loss": 0.7963, + "step": 158 + }, + { + "epoch": 0.028266666666666666, + "grad_norm": 0.4525884936465943, + "learning_rate": 0.00018816568047337278, + "loss": 0.7295, + "step": 159 + }, + { + "epoch": 0.028444444444444446, + "grad_norm": 0.39884357895024963, + "learning_rate": 0.0001893491124260355, + "loss": 0.7267, + "step": 160 + }, + { + "epoch": 0.028622222222222223, + "grad_norm": 0.42876394034800636, + "learning_rate": 0.00019053254437869822, + "loss": 0.7522, + "step": 161 + }, + { + "epoch": 0.0288, + "grad_norm": 0.4462597385837125, + "learning_rate": 0.00019171597633136096, + "loss": 0.7429, + "step": 162 + }, + { + "epoch": 0.02897777777777778, + "grad_norm": 0.48819492924245217, + "learning_rate": 0.00019289940828402368, + "loss": 0.8142, + "step": 163 + }, + { + "epoch": 0.029155555555555556, + "grad_norm": 0.47465656582797483, + "learning_rate": 0.0001940828402366864, + "loss": 0.8171, + "step": 164 + }, + { + "epoch": 0.029333333333333333, + "grad_norm": 0.42346675558380087, + "learning_rate": 0.00019526627218934911, + "loss": 0.7243, + "step": 165 + }, + { + "epoch": 0.02951111111111111, + "grad_norm": 0.396978468291565, + "learning_rate": 0.00019644970414201186, + "loss": 0.726, + "step": 166 + }, + { + "epoch": 0.02968888888888889, + "grad_norm": 0.4340506416543326, + "learning_rate": 0.00019763313609467458, + "loss": 0.7948, + "step": 167 + }, + { + "epoch": 0.029866666666666666, + "grad_norm": 0.44736208770371383, + "learning_rate": 0.0001988165680473373, + "loss": 0.7493, + "step": 168 + }, + { + "epoch": 0.030044444444444443, + "grad_norm": 0.44842977629625974, + "learning_rate": 0.0002, + "loss": 0.7384, + "step": 169 + }, + { + "epoch": 0.030222222222222223, + "grad_norm": 0.4377904907138219, + "learning_rate": 0.00019999998342242452, + "loss": 0.7333, + "step": 170 + }, + { + "epoch": 0.0304, + "grad_norm": 0.4471493498256661, + "learning_rate": 0.0001999999336897035, + "loss": 0.7827, + "step": 171 + }, + { + "epoch": 0.030577777777777777, + "grad_norm": 0.44748572645868845, + "learning_rate": 0.0001999998508018535, + "loss": 0.7772, + "step": 172 + }, + { + "epoch": 0.030755555555555557, + "grad_norm": 0.4096593349436675, + "learning_rate": 0.000199999734758902, + "loss": 0.7464, + "step": 173 + }, + { + "epoch": 0.030933333333333334, + "grad_norm": 0.41697363668435844, + "learning_rate": 0.0001999995855608874, + "loss": 0.7035, + "step": 174 + }, + { + "epoch": 0.03111111111111111, + "grad_norm": 0.4256245750203348, + "learning_rate": 0.00019999940320785924, + "loss": 0.7817, + "step": 175 + }, + { + "epoch": 0.03128888888888889, + "grad_norm": 0.4249773552544686, + "learning_rate": 0.00019999918769987796, + "loss": 0.7573, + "step": 176 + }, + { + "epoch": 0.031466666666666664, + "grad_norm": 0.4310933153219558, + "learning_rate": 0.00019999893903701498, + "loss": 0.7791, + "step": 177 + }, + { + "epoch": 0.03164444444444445, + "grad_norm": 0.4227463269176802, + "learning_rate": 0.0001999986572193528, + "loss": 0.7597, + "step": 178 + }, + { + "epoch": 0.031822222222222224, + "grad_norm": 0.40501337189453185, + "learning_rate": 0.0001999983422469848, + "loss": 0.7852, + "step": 179 + }, + { + "epoch": 0.032, + "grad_norm": 0.41866672247549885, + "learning_rate": 0.00019999799412001546, + "loss": 0.7271, + "step": 180 + }, + { + "epoch": 0.03217777777777778, + "grad_norm": 0.4180055203499516, + "learning_rate": 0.00019999761283856016, + "loss": 0.6782, + "step": 181 + }, + { + "epoch": 0.032355555555555554, + "grad_norm": 0.42387302787166425, + "learning_rate": 0.00019999719840274534, + "loss": 0.7202, + "step": 182 + }, + { + "epoch": 0.03253333333333333, + "grad_norm": 0.415592966763154, + "learning_rate": 0.0001999967508127084, + "loss": 0.7348, + "step": 183 + }, + { + "epoch": 0.032711111111111114, + "grad_norm": 0.4231487626538851, + "learning_rate": 0.00019999627006859775, + "loss": 0.7418, + "step": 184 + }, + { + "epoch": 0.03288888888888889, + "grad_norm": 0.43838224153476424, + "learning_rate": 0.00019999575617057276, + "loss": 0.7352, + "step": 185 + }, + { + "epoch": 0.03306666666666667, + "grad_norm": 0.436577132526167, + "learning_rate": 0.00019999520911880383, + "loss": 0.7903, + "step": 186 + }, + { + "epoch": 0.033244444444444445, + "grad_norm": 0.4396741594040606, + "learning_rate": 0.00019999462891347235, + "loss": 0.7124, + "step": 187 + }, + { + "epoch": 0.03342222222222222, + "grad_norm": 0.42165424177243394, + "learning_rate": 0.00019999401555477063, + "loss": 0.7505, + "step": 188 + }, + { + "epoch": 0.0336, + "grad_norm": 0.4216086489577599, + "learning_rate": 0.00019999336904290207, + "loss": 0.7577, + "step": 189 + }, + { + "epoch": 0.033777777777777775, + "grad_norm": 0.4189770102010155, + "learning_rate": 0.00019999268937808103, + "loss": 0.6997, + "step": 190 + }, + { + "epoch": 0.03395555555555556, + "grad_norm": 0.4340902191803675, + "learning_rate": 0.00019999197656053288, + "loss": 0.7419, + "step": 191 + }, + { + "epoch": 0.034133333333333335, + "grad_norm": 0.44127383729299113, + "learning_rate": 0.0001999912305904939, + "loss": 0.7438, + "step": 192 + }, + { + "epoch": 0.03431111111111111, + "grad_norm": 0.4175352351319076, + "learning_rate": 0.0001999904514682114, + "loss": 0.7364, + "step": 193 + }, + { + "epoch": 0.03448888888888889, + "grad_norm": 0.4188165320146356, + "learning_rate": 0.00019998963919394376, + "loss": 0.7177, + "step": 194 + }, + { + "epoch": 0.034666666666666665, + "grad_norm": 0.4387485892400164, + "learning_rate": 0.00019998879376796028, + "loss": 0.7802, + "step": 195 + }, + { + "epoch": 0.03484444444444444, + "grad_norm": 0.4102698316318348, + "learning_rate": 0.00019998791519054127, + "loss": 0.7439, + "step": 196 + }, + { + "epoch": 0.035022222222222225, + "grad_norm": 0.4477879080743453, + "learning_rate": 0.00019998700346197796, + "loss": 0.7799, + "step": 197 + }, + { + "epoch": 0.0352, + "grad_norm": 0.43547432937232106, + "learning_rate": 0.0001999860585825727, + "loss": 0.8031, + "step": 198 + }, + { + "epoch": 0.03537777777777778, + "grad_norm": 0.40701515352068407, + "learning_rate": 0.00019998508055263874, + "loss": 0.7322, + "step": 199 + }, + { + "epoch": 0.035555555555555556, + "grad_norm": 0.4055655633697622, + "learning_rate": 0.00019998406937250034, + "loss": 0.7683, + "step": 200 + }, + { + "epoch": 0.03573333333333333, + "grad_norm": 0.3941214398957608, + "learning_rate": 0.00019998302504249278, + "loss": 0.7272, + "step": 201 + }, + { + "epoch": 0.03591111111111111, + "grad_norm": 0.4208078679013026, + "learning_rate": 0.0001999819475629623, + "loss": 0.7143, + "step": 202 + }, + { + "epoch": 0.036088888888888886, + "grad_norm": 0.4339666746633097, + "learning_rate": 0.00019998083693426616, + "loss": 0.752, + "step": 203 + }, + { + "epoch": 0.03626666666666667, + "grad_norm": 0.40483388640974854, + "learning_rate": 0.00019997969315677252, + "loss": 0.6768, + "step": 204 + }, + { + "epoch": 0.036444444444444446, + "grad_norm": 0.41543278835837655, + "learning_rate": 0.00019997851623086067, + "loss": 0.7508, + "step": 205 + }, + { + "epoch": 0.03662222222222222, + "grad_norm": 0.4095256756764992, + "learning_rate": 0.00019997730615692083, + "loss": 0.7039, + "step": 206 + }, + { + "epoch": 0.0368, + "grad_norm": 0.4124476756263938, + "learning_rate": 0.00019997606293535415, + "loss": 0.7822, + "step": 207 + }, + { + "epoch": 0.036977777777777776, + "grad_norm": 0.42234578290270847, + "learning_rate": 0.00019997478656657287, + "loss": 0.7744, + "step": 208 + }, + { + "epoch": 0.03715555555555555, + "grad_norm": 0.42601138572474845, + "learning_rate": 0.00019997347705100015, + "loss": 0.77, + "step": 209 + }, + { + "epoch": 0.037333333333333336, + "grad_norm": 0.41680075142207174, + "learning_rate": 0.00019997213438907013, + "loss": 0.7534, + "step": 210 + }, + { + "epoch": 0.03751111111111111, + "grad_norm": 0.42595370919896436, + "learning_rate": 0.000199970758581228, + "loss": 0.7783, + "step": 211 + }, + { + "epoch": 0.03768888888888889, + "grad_norm": 0.41078007123190574, + "learning_rate": 0.00019996934962792994, + "loss": 0.7515, + "step": 212 + }, + { + "epoch": 0.037866666666666667, + "grad_norm": 0.4009652243031692, + "learning_rate": 0.00019996790752964305, + "loss": 0.7023, + "step": 213 + }, + { + "epoch": 0.03804444444444444, + "grad_norm": 0.4293457117206223, + "learning_rate": 0.0001999664322868455, + "loss": 0.7782, + "step": 214 + }, + { + "epoch": 0.03822222222222222, + "grad_norm": 0.4132962337752529, + "learning_rate": 0.00019996492390002635, + "loss": 0.7793, + "step": 215 + }, + { + "epoch": 0.0384, + "grad_norm": 0.3826851911935954, + "learning_rate": 0.00019996338236968574, + "loss": 0.7082, + "step": 216 + }, + { + "epoch": 0.03857777777777778, + "grad_norm": 0.3920900648286519, + "learning_rate": 0.0001999618076963348, + "loss": 0.7182, + "step": 217 + }, + { + "epoch": 0.03875555555555556, + "grad_norm": 0.42976393460604073, + "learning_rate": 0.00019996019988049554, + "loss": 0.7998, + "step": 218 + }, + { + "epoch": 0.038933333333333334, + "grad_norm": 0.38007212973087107, + "learning_rate": 0.0001999585589227011, + "loss": 0.689, + "step": 219 + }, + { + "epoch": 0.03911111111111111, + "grad_norm": 0.41540261813365553, + "learning_rate": 0.00019995688482349553, + "loss": 0.7089, + "step": 220 + }, + { + "epoch": 0.03928888888888889, + "grad_norm": 0.39321320573098695, + "learning_rate": 0.00019995517758343386, + "loss": 0.694, + "step": 221 + }, + { + "epoch": 0.039466666666666664, + "grad_norm": 0.4037515322280404, + "learning_rate": 0.00019995343720308212, + "loss": 0.7356, + "step": 222 + }, + { + "epoch": 0.03964444444444445, + "grad_norm": 0.42011262289834633, + "learning_rate": 0.00019995166368301734, + "loss": 0.717, + "step": 223 + }, + { + "epoch": 0.039822222222222224, + "grad_norm": 0.42873694748103836, + "learning_rate": 0.00019994985702382758, + "loss": 0.7633, + "step": 224 + }, + { + "epoch": 0.04, + "grad_norm": 0.3961804083832185, + "learning_rate": 0.00019994801722611182, + "loss": 0.6979, + "step": 225 + }, + { + "epoch": 0.04017777777777778, + "grad_norm": 0.4172279974998365, + "learning_rate": 0.00019994614429047998, + "loss": 0.706, + "step": 226 + }, + { + "epoch": 0.040355555555555554, + "grad_norm": 0.42688994383848117, + "learning_rate": 0.00019994423821755313, + "loss": 0.7777, + "step": 227 + }, + { + "epoch": 0.04053333333333333, + "grad_norm": 0.4250817717428973, + "learning_rate": 0.00019994229900796318, + "loss": 0.7273, + "step": 228 + }, + { + "epoch": 0.040711111111111115, + "grad_norm": 0.4136562214162894, + "learning_rate": 0.00019994032666235308, + "loss": 0.7799, + "step": 229 + }, + { + "epoch": 0.04088888888888889, + "grad_norm": 0.4214659029090264, + "learning_rate": 0.00019993832118137678, + "loss": 0.7233, + "step": 230 + }, + { + "epoch": 0.04106666666666667, + "grad_norm": 0.40643440979476614, + "learning_rate": 0.0001999362825656992, + "loss": 0.7083, + "step": 231 + }, + { + "epoch": 0.041244444444444445, + "grad_norm": 0.41358356997274226, + "learning_rate": 0.0001999342108159962, + "loss": 0.7279, + "step": 232 + }, + { + "epoch": 0.04142222222222222, + "grad_norm": 0.4319374532419072, + "learning_rate": 0.00019993210593295473, + "loss": 0.7574, + "step": 233 + }, + { + "epoch": 0.0416, + "grad_norm": 0.4307115281367592, + "learning_rate": 0.00019992996791727267, + "loss": 0.7422, + "step": 234 + }, + { + "epoch": 0.041777777777777775, + "grad_norm": 0.4042382399897502, + "learning_rate": 0.00019992779676965885, + "loss": 0.7391, + "step": 235 + }, + { + "epoch": 0.04195555555555556, + "grad_norm": 0.4588002095047031, + "learning_rate": 0.0001999255924908331, + "loss": 0.7262, + "step": 236 + }, + { + "epoch": 0.042133333333333335, + "grad_norm": 0.42467507977251784, + "learning_rate": 0.00019992335508152632, + "loss": 0.7982, + "step": 237 + }, + { + "epoch": 0.04231111111111111, + "grad_norm": 0.4105715522868043, + "learning_rate": 0.00019992108454248023, + "loss": 0.7324, + "step": 238 + }, + { + "epoch": 0.04248888888888889, + "grad_norm": 0.43549057774407846, + "learning_rate": 0.00019991878087444772, + "loss": 0.7302, + "step": 239 + }, + { + "epoch": 0.042666666666666665, + "grad_norm": 0.3885617399762617, + "learning_rate": 0.00019991644407819256, + "loss": 0.7575, + "step": 240 + }, + { + "epoch": 0.04284444444444444, + "grad_norm": 0.408647891930264, + "learning_rate": 0.00019991407415448947, + "loss": 0.7108, + "step": 241 + }, + { + "epoch": 0.043022222222222226, + "grad_norm": 0.40004449768082634, + "learning_rate": 0.00019991167110412422, + "loss": 0.7199, + "step": 242 + }, + { + "epoch": 0.0432, + "grad_norm": 0.4222611035054969, + "learning_rate": 0.00019990923492789359, + "loss": 0.7717, + "step": 243 + }, + { + "epoch": 0.04337777777777778, + "grad_norm": 0.41420806798533705, + "learning_rate": 0.00019990676562660524, + "loss": 0.778, + "step": 244 + }, + { + "epoch": 0.043555555555555556, + "grad_norm": 0.39284232850398737, + "learning_rate": 0.00019990426320107792, + "loss": 0.7049, + "step": 245 + }, + { + "epoch": 0.04373333333333333, + "grad_norm": 0.4041020419763259, + "learning_rate": 0.00019990172765214128, + "loss": 0.7466, + "step": 246 + }, + { + "epoch": 0.04391111111111111, + "grad_norm": 0.4260355274023658, + "learning_rate": 0.00019989915898063597, + "loss": 0.738, + "step": 247 + }, + { + "epoch": 0.044088888888888886, + "grad_norm": 0.4374038023499603, + "learning_rate": 0.00019989655718741366, + "loss": 0.6973, + "step": 248 + }, + { + "epoch": 0.04426666666666667, + "grad_norm": 0.43049231829727097, + "learning_rate": 0.000199893922273337, + "loss": 0.7792, + "step": 249 + }, + { + "epoch": 0.044444444444444446, + "grad_norm": 0.4176026714765864, + "learning_rate": 0.00019989125423927956, + "loss": 0.6943, + "step": 250 + }, + { + "epoch": 0.04462222222222222, + "grad_norm": 0.4059797676151001, + "learning_rate": 0.00019988855308612595, + "loss": 0.7218, + "step": 251 + }, + { + "epoch": 0.0448, + "grad_norm": 0.4158187293131045, + "learning_rate": 0.00019988581881477172, + "loss": 0.7276, + "step": 252 + }, + { + "epoch": 0.044977777777777776, + "grad_norm": 0.40237034388512766, + "learning_rate": 0.0001998830514261235, + "loss": 0.7587, + "step": 253 + }, + { + "epoch": 0.04515555555555555, + "grad_norm": 0.41066420743110094, + "learning_rate": 0.0001998802509210987, + "loss": 0.7792, + "step": 254 + }, + { + "epoch": 0.04533333333333334, + "grad_norm": 0.4076981317466958, + "learning_rate": 0.00019987741730062594, + "loss": 0.786, + "step": 255 + }, + { + "epoch": 0.04551111111111111, + "grad_norm": 0.4103971911062201, + "learning_rate": 0.00019987455056564462, + "loss": 0.7598, + "step": 256 + }, + { + "epoch": 0.04568888888888889, + "grad_norm": 0.41647228585767193, + "learning_rate": 0.00019987165071710527, + "loss": 0.7664, + "step": 257 + }, + { + "epoch": 0.04586666666666667, + "grad_norm": 0.4101745908088802, + "learning_rate": 0.00019986871775596937, + "loss": 0.8, + "step": 258 + }, + { + "epoch": 0.04604444444444444, + "grad_norm": 0.42983877689293504, + "learning_rate": 0.00019986575168320925, + "loss": 0.6814, + "step": 259 + }, + { + "epoch": 0.04622222222222222, + "grad_norm": 0.4057948279672016, + "learning_rate": 0.0001998627524998084, + "loss": 0.708, + "step": 260 + }, + { + "epoch": 0.0464, + "grad_norm": 0.3991751185185169, + "learning_rate": 0.00019985972020676116, + "loss": 0.7156, + "step": 261 + }, + { + "epoch": 0.04657777777777778, + "grad_norm": 0.41581351090637353, + "learning_rate": 0.0001998566548050729, + "loss": 0.6909, + "step": 262 + }, + { + "epoch": 0.04675555555555556, + "grad_norm": 0.4080322012204585, + "learning_rate": 0.00019985355629575997, + "loss": 0.7608, + "step": 263 + }, + { + "epoch": 0.046933333333333334, + "grad_norm": 0.42162201325708426, + "learning_rate": 0.00019985042467984967, + "loss": 0.7319, + "step": 264 + }, + { + "epoch": 0.04711111111111111, + "grad_norm": 0.39020769390029697, + "learning_rate": 0.00019984725995838033, + "loss": 0.7121, + "step": 265 + }, + { + "epoch": 0.04728888888888889, + "grad_norm": 0.38763501861251853, + "learning_rate": 0.00019984406213240113, + "loss": 0.74, + "step": 266 + }, + { + "epoch": 0.047466666666666664, + "grad_norm": 0.40881310999170967, + "learning_rate": 0.0001998408312029724, + "loss": 0.7483, + "step": 267 + }, + { + "epoch": 0.04764444444444445, + "grad_norm": 0.4768937828755657, + "learning_rate": 0.00019983756717116536, + "loss": 0.7128, + "step": 268 + }, + { + "epoch": 0.047822222222222224, + "grad_norm": 0.3789670433847438, + "learning_rate": 0.00019983427003806214, + "loss": 0.732, + "step": 269 + }, + { + "epoch": 0.048, + "grad_norm": 0.40260640659117697, + "learning_rate": 0.00019983093980475598, + "loss": 0.6963, + "step": 270 + }, + { + "epoch": 0.04817777777777778, + "grad_norm": 0.3860090317689809, + "learning_rate": 0.00019982757647235094, + "loss": 0.7063, + "step": 271 + }, + { + "epoch": 0.048355555555555554, + "grad_norm": 0.4326493194592787, + "learning_rate": 0.00019982418004196224, + "loss": 0.7303, + "step": 272 + }, + { + "epoch": 0.04853333333333333, + "grad_norm": 0.40472326762665517, + "learning_rate": 0.00019982075051471588, + "loss": 0.747, + "step": 273 + }, + { + "epoch": 0.04871111111111111, + "grad_norm": 0.41569159761813573, + "learning_rate": 0.000199817287891749, + "loss": 0.7279, + "step": 274 + }, + { + "epoch": 0.04888888888888889, + "grad_norm": 0.3965281745477917, + "learning_rate": 0.00019981379217420958, + "loss": 0.7107, + "step": 275 + }, + { + "epoch": 0.04906666666666667, + "grad_norm": 0.403196072301094, + "learning_rate": 0.00019981026336325663, + "loss": 0.7621, + "step": 276 + }, + { + "epoch": 0.049244444444444445, + "grad_norm": 0.41927919266095026, + "learning_rate": 0.0001998067014600602, + "loss": 0.7661, + "step": 277 + }, + { + "epoch": 0.04942222222222222, + "grad_norm": 0.37584221032027115, + "learning_rate": 0.00019980310646580115, + "loss": 0.6696, + "step": 278 + }, + { + "epoch": 0.0496, + "grad_norm": 0.42389289080408377, + "learning_rate": 0.0001997994783816715, + "loss": 0.7832, + "step": 279 + }, + { + "epoch": 0.049777777777777775, + "grad_norm": 0.38512553433136487, + "learning_rate": 0.0001997958172088741, + "loss": 0.7167, + "step": 280 + }, + { + "epoch": 0.04995555555555556, + "grad_norm": 0.3859382813533775, + "learning_rate": 0.0001997921229486228, + "loss": 0.7171, + "step": 281 + }, + { + "epoch": 0.050133333333333335, + "grad_norm": 0.4158323653798766, + "learning_rate": 0.00019978839560214247, + "loss": 0.7713, + "step": 282 + }, + { + "epoch": 0.05031111111111111, + "grad_norm": 0.38860124681480823, + "learning_rate": 0.00019978463517066888, + "loss": 0.727, + "step": 283 + }, + { + "epoch": 0.05048888888888889, + "grad_norm": 0.40108690695421373, + "learning_rate": 0.00019978084165544883, + "loss": 0.7273, + "step": 284 + }, + { + "epoch": 0.050666666666666665, + "grad_norm": 0.415268620449402, + "learning_rate": 0.0001997770150577401, + "loss": 0.7439, + "step": 285 + }, + { + "epoch": 0.05084444444444444, + "grad_norm": 0.4282600774927427, + "learning_rate": 0.00019977315537881137, + "loss": 0.8217, + "step": 286 + }, + { + "epoch": 0.05102222222222222, + "grad_norm": 0.3961562991634904, + "learning_rate": 0.0001997692626199423, + "loss": 0.7108, + "step": 287 + }, + { + "epoch": 0.0512, + "grad_norm": 0.4064635607969968, + "learning_rate": 0.00019976533678242359, + "loss": 0.702, + "step": 288 + }, + { + "epoch": 0.05137777777777778, + "grad_norm": 0.42569958908513883, + "learning_rate": 0.0001997613778675568, + "loss": 0.7546, + "step": 289 + }, + { + "epoch": 0.051555555555555556, + "grad_norm": 0.4269333671719804, + "learning_rate": 0.00019975738587665456, + "loss": 0.7386, + "step": 290 + }, + { + "epoch": 0.05173333333333333, + "grad_norm": 0.41759105568698235, + "learning_rate": 0.00019975336081104038, + "loss": 0.7442, + "step": 291 + }, + { + "epoch": 0.05191111111111111, + "grad_norm": 0.45511212746981194, + "learning_rate": 0.00019974930267204884, + "loss": 0.7446, + "step": 292 + }, + { + "epoch": 0.052088888888888886, + "grad_norm": 0.3927148245282799, + "learning_rate": 0.00019974521146102537, + "loss": 0.7311, + "step": 293 + }, + { + "epoch": 0.05226666666666667, + "grad_norm": 0.4345782139011403, + "learning_rate": 0.00019974108717932642, + "loss": 0.8004, + "step": 294 + }, + { + "epoch": 0.052444444444444446, + "grad_norm": 0.3956155061076544, + "learning_rate": 0.00019973692982831943, + "loss": 0.7173, + "step": 295 + }, + { + "epoch": 0.05262222222222222, + "grad_norm": 0.3860379553602568, + "learning_rate": 0.00019973273940938275, + "loss": 0.7478, + "step": 296 + }, + { + "epoch": 0.0528, + "grad_norm": 0.40023460444830405, + "learning_rate": 0.00019972851592390574, + "loss": 0.7539, + "step": 297 + }, + { + "epoch": 0.052977777777777776, + "grad_norm": 0.3857506700815517, + "learning_rate": 0.0001997242593732887, + "loss": 0.7852, + "step": 298 + }, + { + "epoch": 0.05315555555555555, + "grad_norm": 0.40928687740427483, + "learning_rate": 0.00019971996975894286, + "loss": 0.8074, + "step": 299 + }, + { + "epoch": 0.05333333333333334, + "grad_norm": 0.39557341515474426, + "learning_rate": 0.00019971564708229047, + "loss": 0.7319, + "step": 300 + }, + { + "epoch": 0.05351111111111111, + "grad_norm": 0.3987251405952196, + "learning_rate": 0.00019971129134476473, + "loss": 0.7554, + "step": 301 + }, + { + "epoch": 0.05368888888888889, + "grad_norm": 0.3864495178514433, + "learning_rate": 0.0001997069025478098, + "loss": 0.6832, + "step": 302 + }, + { + "epoch": 0.05386666666666667, + "grad_norm": 0.427056921901087, + "learning_rate": 0.0001997024806928808, + "loss": 0.7821, + "step": 303 + }, + { + "epoch": 0.054044444444444444, + "grad_norm": 0.37317816435074624, + "learning_rate": 0.00019969802578144376, + "loss": 0.6802, + "step": 304 + }, + { + "epoch": 0.05422222222222222, + "grad_norm": 0.4078039396305031, + "learning_rate": 0.00019969353781497574, + "loss": 0.7069, + "step": 305 + }, + { + "epoch": 0.0544, + "grad_norm": 0.3943567927022663, + "learning_rate": 0.00019968901679496472, + "loss": 0.6958, + "step": 306 + }, + { + "epoch": 0.05457777777777778, + "grad_norm": 0.42094678424076726, + "learning_rate": 0.00019968446272290968, + "loss": 0.7056, + "step": 307 + }, + { + "epoch": 0.05475555555555556, + "grad_norm": 0.5245417072250762, + "learning_rate": 0.0001996798756003205, + "loss": 0.7259, + "step": 308 + }, + { + "epoch": 0.054933333333333334, + "grad_norm": 0.41411973399556323, + "learning_rate": 0.00019967525542871804, + "loss": 0.755, + "step": 309 + }, + { + "epoch": 0.05511111111111111, + "grad_norm": 0.3972680177444909, + "learning_rate": 0.00019967060220963415, + "loss": 0.7137, + "step": 310 + }, + { + "epoch": 0.05528888888888889, + "grad_norm": 0.41521443791808565, + "learning_rate": 0.00019966591594461157, + "loss": 0.7777, + "step": 311 + }, + { + "epoch": 0.055466666666666664, + "grad_norm": 0.3916336077331353, + "learning_rate": 0.00019966119663520412, + "loss": 0.6849, + "step": 312 + }, + { + "epoch": 0.05564444444444445, + "grad_norm": 0.42052246462401904, + "learning_rate": 0.00019965644428297642, + "loss": 0.7569, + "step": 313 + }, + { + "epoch": 0.055822222222222224, + "grad_norm": 0.4110369457873654, + "learning_rate": 0.00019965165888950414, + "loss": 0.6966, + "step": 314 + }, + { + "epoch": 0.056, + "grad_norm": 0.4041355832408566, + "learning_rate": 0.00019964684045637387, + "loss": 0.7634, + "step": 315 + }, + { + "epoch": 0.05617777777777778, + "grad_norm": 0.40942487697009206, + "learning_rate": 0.00019964198898518324, + "loss": 0.6984, + "step": 316 + }, + { + "epoch": 0.056355555555555555, + "grad_norm": 0.4027425524934776, + "learning_rate": 0.00019963710447754065, + "loss": 0.694, + "step": 317 + }, + { + "epoch": 0.05653333333333333, + "grad_norm": 0.40719980235720055, + "learning_rate": 0.00019963218693506564, + "loss": 0.6956, + "step": 318 + }, + { + "epoch": 0.05671111111111111, + "grad_norm": 0.398804474465064, + "learning_rate": 0.00019962723635938865, + "loss": 0.7416, + "step": 319 + }, + { + "epoch": 0.05688888888888889, + "grad_norm": 0.41684825603507647, + "learning_rate": 0.000199622252752151, + "loss": 0.7722, + "step": 320 + }, + { + "epoch": 0.05706666666666667, + "grad_norm": 0.39520916999673306, + "learning_rate": 0.000199617236115005, + "loss": 0.7179, + "step": 321 + }, + { + "epoch": 0.057244444444444445, + "grad_norm": 0.3985887905211269, + "learning_rate": 0.00019961218644961397, + "loss": 0.6911, + "step": 322 + }, + { + "epoch": 0.05742222222222222, + "grad_norm": 0.41078307916742446, + "learning_rate": 0.0001996071037576521, + "loss": 0.7297, + "step": 323 + }, + { + "epoch": 0.0576, + "grad_norm": 0.4003687552320492, + "learning_rate": 0.0001996019880408046, + "loss": 0.6995, + "step": 324 + }, + { + "epoch": 0.057777777777777775, + "grad_norm": 0.41374382086838146, + "learning_rate": 0.00019959683930076758, + "loss": 0.6852, + "step": 325 + }, + { + "epoch": 0.05795555555555556, + "grad_norm": 0.38767911144070144, + "learning_rate": 0.00019959165753924806, + "loss": 0.6603, + "step": 326 + }, + { + "epoch": 0.058133333333333335, + "grad_norm": 0.4146713131575808, + "learning_rate": 0.00019958644275796416, + "loss": 0.7482, + "step": 327 + }, + { + "epoch": 0.05831111111111111, + "grad_norm": 0.40282409427512333, + "learning_rate": 0.00019958119495864477, + "loss": 0.7503, + "step": 328 + }, + { + "epoch": 0.05848888888888889, + "grad_norm": 0.42306846587640345, + "learning_rate": 0.00019957591414302984, + "loss": 0.6998, + "step": 329 + }, + { + "epoch": 0.058666666666666666, + "grad_norm": 0.4277605462336533, + "learning_rate": 0.0001995706003128702, + "loss": 0.7508, + "step": 330 + }, + { + "epoch": 0.05884444444444444, + "grad_norm": 0.3853085682494387, + "learning_rate": 0.00019956525346992768, + "loss": 0.6927, + "step": 331 + }, + { + "epoch": 0.05902222222222222, + "grad_norm": 0.391673017668163, + "learning_rate": 0.00019955987361597506, + "loss": 0.7491, + "step": 332 + }, + { + "epoch": 0.0592, + "grad_norm": 0.41754786645076236, + "learning_rate": 0.000199554460752796, + "loss": 0.8008, + "step": 333 + }, + { + "epoch": 0.05937777777777778, + "grad_norm": 0.3849602871243546, + "learning_rate": 0.00019954901488218515, + "loss": 0.7321, + "step": 334 + }, + { + "epoch": 0.059555555555555556, + "grad_norm": 0.3950166024985762, + "learning_rate": 0.00019954353600594812, + "loss": 0.7144, + "step": 335 + }, + { + "epoch": 0.05973333333333333, + "grad_norm": 0.4140578900735401, + "learning_rate": 0.00019953802412590142, + "loss": 0.709, + "step": 336 + }, + { + "epoch": 0.05991111111111111, + "grad_norm": 0.3870858564154028, + "learning_rate": 0.00019953247924387252, + "loss": 0.7185, + "step": 337 + }, + { + "epoch": 0.060088888888888886, + "grad_norm": 0.41320328866404843, + "learning_rate": 0.00019952690136169985, + "loss": 0.738, + "step": 338 + }, + { + "epoch": 0.06026666666666667, + "grad_norm": 0.44243060101910286, + "learning_rate": 0.00019952129048123274, + "loss": 0.7415, + "step": 339 + }, + { + "epoch": 0.060444444444444446, + "grad_norm": 0.42840659815007515, + "learning_rate": 0.0001995156466043315, + "loss": 0.7424, + "step": 340 + }, + { + "epoch": 0.06062222222222222, + "grad_norm": 0.39827708227458203, + "learning_rate": 0.0001995099697328674, + "loss": 0.7403, + "step": 341 + }, + { + "epoch": 0.0608, + "grad_norm": 0.3955191883883136, + "learning_rate": 0.00019950425986872255, + "loss": 0.7187, + "step": 342 + }, + { + "epoch": 0.06097777777777778, + "grad_norm": 0.4226085872385566, + "learning_rate": 0.0001994985170137901, + "loss": 0.7091, + "step": 343 + }, + { + "epoch": 0.06115555555555555, + "grad_norm": 0.3847284881018638, + "learning_rate": 0.00019949274116997406, + "loss": 0.7337, + "step": 344 + }, + { + "epoch": 0.06133333333333333, + "grad_norm": 0.41103657174701047, + "learning_rate": 0.00019948693233918952, + "loss": 0.7142, + "step": 345 + }, + { + "epoch": 0.061511111111111114, + "grad_norm": 0.39510712866799935, + "learning_rate": 0.00019948109052336232, + "loss": 0.6515, + "step": 346 + }, + { + "epoch": 0.06168888888888889, + "grad_norm": 0.3973478546306339, + "learning_rate": 0.00019947521572442935, + "loss": 0.7053, + "step": 347 + }, + { + "epoch": 0.06186666666666667, + "grad_norm": 0.39265961824794854, + "learning_rate": 0.0001994693079443384, + "loss": 0.7317, + "step": 348 + }, + { + "epoch": 0.062044444444444444, + "grad_norm": 0.40022808878239136, + "learning_rate": 0.00019946336718504822, + "loss": 0.7467, + "step": 349 + }, + { + "epoch": 0.06222222222222222, + "grad_norm": 0.4060392673229292, + "learning_rate": 0.00019945739344852848, + "loss": 0.691, + "step": 350 + }, + { + "epoch": 0.0624, + "grad_norm": 0.38163889363721626, + "learning_rate": 0.00019945138673675973, + "loss": 0.7025, + "step": 351 + }, + { + "epoch": 0.06257777777777777, + "grad_norm": 0.3848990756785906, + "learning_rate": 0.00019944534705173354, + "loss": 0.7874, + "step": 352 + }, + { + "epoch": 0.06275555555555555, + "grad_norm": 0.40265559565058934, + "learning_rate": 0.00019943927439545242, + "loss": 0.6937, + "step": 353 + }, + { + "epoch": 0.06293333333333333, + "grad_norm": 0.3924397444910792, + "learning_rate": 0.0001994331687699297, + "loss": 0.741, + "step": 354 + }, + { + "epoch": 0.06311111111111112, + "grad_norm": 0.37538364201713864, + "learning_rate": 0.00019942703017718975, + "loss": 0.7035, + "step": 355 + }, + { + "epoch": 0.0632888888888889, + "grad_norm": 0.38866448405000675, + "learning_rate": 0.0001994208586192678, + "loss": 0.6894, + "step": 356 + }, + { + "epoch": 0.06346666666666667, + "grad_norm": 0.3903914056587825, + "learning_rate": 0.00019941465409821008, + "loss": 0.6418, + "step": 357 + }, + { + "epoch": 0.06364444444444445, + "grad_norm": 0.43251756600847535, + "learning_rate": 0.00019940841661607366, + "loss": 0.7657, + "step": 358 + }, + { + "epoch": 0.06382222222222222, + "grad_norm": 0.40484522087785463, + "learning_rate": 0.0001994021461749266, + "loss": 0.7733, + "step": 359 + }, + { + "epoch": 0.064, + "grad_norm": 0.41265312921015945, + "learning_rate": 0.0001993958427768479, + "loss": 0.7472, + "step": 360 + }, + { + "epoch": 0.06417777777777778, + "grad_norm": 0.42083144145833196, + "learning_rate": 0.00019938950642392746, + "loss": 0.7835, + "step": 361 + }, + { + "epoch": 0.06435555555555555, + "grad_norm": 0.38595338979089616, + "learning_rate": 0.0001993831371182661, + "loss": 0.6785, + "step": 362 + }, + { + "epoch": 0.06453333333333333, + "grad_norm": 0.3908370405376001, + "learning_rate": 0.00019937673486197555, + "loss": 0.705, + "step": 363 + }, + { + "epoch": 0.06471111111111111, + "grad_norm": 0.3893853168603376, + "learning_rate": 0.0001993702996571785, + "loss": 0.7022, + "step": 364 + }, + { + "epoch": 0.06488888888888888, + "grad_norm": 0.3883852863717747, + "learning_rate": 0.00019936383150600856, + "loss": 0.6847, + "step": 365 + }, + { + "epoch": 0.06506666666666666, + "grad_norm": 0.4089026341869207, + "learning_rate": 0.00019935733041061027, + "loss": 0.7262, + "step": 366 + }, + { + "epoch": 0.06524444444444444, + "grad_norm": 0.3987094795708502, + "learning_rate": 0.00019935079637313906, + "loss": 0.7346, + "step": 367 + }, + { + "epoch": 0.06542222222222223, + "grad_norm": 0.4165262629169865, + "learning_rate": 0.00019934422939576124, + "loss": 0.717, + "step": 368 + }, + { + "epoch": 0.0656, + "grad_norm": 0.38509780963736684, + "learning_rate": 0.0001993376294806542, + "loss": 0.7156, + "step": 369 + }, + { + "epoch": 0.06577777777777778, + "grad_norm": 0.3814004299512465, + "learning_rate": 0.00019933099663000615, + "loss": 0.7497, + "step": 370 + }, + { + "epoch": 0.06595555555555556, + "grad_norm": 0.4238705344332552, + "learning_rate": 0.00019932433084601613, + "loss": 0.7415, + "step": 371 + }, + { + "epoch": 0.06613333333333334, + "grad_norm": 0.3989956611610803, + "learning_rate": 0.00019931763213089428, + "loss": 0.6942, + "step": 372 + }, + { + "epoch": 0.06631111111111111, + "grad_norm": 0.3907659131921682, + "learning_rate": 0.00019931090048686152, + "loss": 0.7014, + "step": 373 + }, + { + "epoch": 0.06648888888888889, + "grad_norm": 0.41059339211380236, + "learning_rate": 0.00019930413591614973, + "loss": 0.7995, + "step": 374 + }, + { + "epoch": 0.06666666666666667, + "grad_norm": 0.38682872150951947, + "learning_rate": 0.00019929733842100178, + "loss": 0.7056, + "step": 375 + }, + { + "epoch": 0.06684444444444444, + "grad_norm": 0.38589179904857857, + "learning_rate": 0.0001992905080036713, + "loss": 0.703, + "step": 376 + }, + { + "epoch": 0.06702222222222222, + "grad_norm": 0.38276868000179953, + "learning_rate": 0.000199283644666423, + "loss": 0.7129, + "step": 377 + }, + { + "epoch": 0.0672, + "grad_norm": 0.390499334097345, + "learning_rate": 0.00019927674841153237, + "loss": 0.7303, + "step": 378 + }, + { + "epoch": 0.06737777777777777, + "grad_norm": 0.39012598805979254, + "learning_rate": 0.00019926981924128594, + "loss": 0.7514, + "step": 379 + }, + { + "epoch": 0.06755555555555555, + "grad_norm": 0.41551698281869964, + "learning_rate": 0.000199262857157981, + "loss": 0.7134, + "step": 380 + }, + { + "epoch": 0.06773333333333334, + "grad_norm": 0.4005173535103667, + "learning_rate": 0.00019925586216392596, + "loss": 0.6885, + "step": 381 + }, + { + "epoch": 0.06791111111111112, + "grad_norm": 0.4399692154042501, + "learning_rate": 0.0001992488342614399, + "loss": 0.7621, + "step": 382 + }, + { + "epoch": 0.0680888888888889, + "grad_norm": 0.41808656118741233, + "learning_rate": 0.00019924177345285297, + "loss": 0.7132, + "step": 383 + }, + { + "epoch": 0.06826666666666667, + "grad_norm": 0.40175559844683045, + "learning_rate": 0.00019923467974050622, + "loss": 0.7174, + "step": 384 + }, + { + "epoch": 0.06844444444444445, + "grad_norm": 0.39959976325332075, + "learning_rate": 0.00019922755312675158, + "loss": 0.6822, + "step": 385 + }, + { + "epoch": 0.06862222222222222, + "grad_norm": 0.4103944462406291, + "learning_rate": 0.00019922039361395185, + "loss": 0.711, + "step": 386 + }, + { + "epoch": 0.0688, + "grad_norm": 0.4316320714005958, + "learning_rate": 0.00019921320120448082, + "loss": 0.6684, + "step": 387 + }, + { + "epoch": 0.06897777777777778, + "grad_norm": 0.4087047790614361, + "learning_rate": 0.00019920597590072312, + "loss": 0.7171, + "step": 388 + }, + { + "epoch": 0.06915555555555555, + "grad_norm": 0.4121802353249436, + "learning_rate": 0.0001991987177050743, + "loss": 0.7022, + "step": 389 + }, + { + "epoch": 0.06933333333333333, + "grad_norm": 0.4057843281805508, + "learning_rate": 0.00019919142661994088, + "loss": 0.7377, + "step": 390 + }, + { + "epoch": 0.0695111111111111, + "grad_norm": 0.4324857831612135, + "learning_rate": 0.00019918410264774017, + "loss": 0.7145, + "step": 391 + }, + { + "epoch": 0.06968888888888888, + "grad_norm": 0.3964608162666485, + "learning_rate": 0.00019917674579090044, + "loss": 0.7708, + "step": 392 + }, + { + "epoch": 0.06986666666666666, + "grad_norm": 0.3991648435068707, + "learning_rate": 0.00019916935605186092, + "loss": 0.7295, + "step": 393 + }, + { + "epoch": 0.07004444444444445, + "grad_norm": 0.38554816477711273, + "learning_rate": 0.00019916193343307167, + "loss": 0.7213, + "step": 394 + }, + { + "epoch": 0.07022222222222223, + "grad_norm": 0.39011803598670747, + "learning_rate": 0.00019915447793699364, + "loss": 0.7541, + "step": 395 + }, + { + "epoch": 0.0704, + "grad_norm": 0.38368000820172654, + "learning_rate": 0.00019914698956609875, + "loss": 0.6776, + "step": 396 + }, + { + "epoch": 0.07057777777777778, + "grad_norm": 0.3949702277822226, + "learning_rate": 0.00019913946832286975, + "loss": 0.7903, + "step": 397 + }, + { + "epoch": 0.07075555555555556, + "grad_norm": 0.3927135416997767, + "learning_rate": 0.00019913191420980033, + "loss": 0.7177, + "step": 398 + }, + { + "epoch": 0.07093333333333333, + "grad_norm": 0.37457130403302863, + "learning_rate": 0.0001991243272293951, + "loss": 0.6492, + "step": 399 + }, + { + "epoch": 0.07111111111111111, + "grad_norm": 0.39570709246601554, + "learning_rate": 0.00019911670738416947, + "loss": 0.7204, + "step": 400 + }, + { + "epoch": 0.07128888888888889, + "grad_norm": 0.3960753830839489, + "learning_rate": 0.00019910905467664987, + "loss": 0.7064, + "step": 401 + }, + { + "epoch": 0.07146666666666666, + "grad_norm": 0.36061511883345115, + "learning_rate": 0.00019910136910937355, + "loss": 0.6778, + "step": 402 + }, + { + "epoch": 0.07164444444444444, + "grad_norm": 0.3871093380105151, + "learning_rate": 0.00019909365068488863, + "loss": 0.6609, + "step": 403 + }, + { + "epoch": 0.07182222222222222, + "grad_norm": 0.3783251036727788, + "learning_rate": 0.00019908589940575424, + "loss": 0.6438, + "step": 404 + }, + { + "epoch": 0.072, + "grad_norm": 0.37978204780797625, + "learning_rate": 0.0001990781152745403, + "loss": 0.6639, + "step": 405 + }, + { + "epoch": 0.07217777777777777, + "grad_norm": 0.4093861988747356, + "learning_rate": 0.00019907029829382758, + "loss": 0.7211, + "step": 406 + }, + { + "epoch": 0.07235555555555556, + "grad_norm": 0.4047677468631361, + "learning_rate": 0.0001990624484662079, + "loss": 0.6769, + "step": 407 + }, + { + "epoch": 0.07253333333333334, + "grad_norm": 0.4070877326042871, + "learning_rate": 0.00019905456579428384, + "loss": 0.7264, + "step": 408 + }, + { + "epoch": 0.07271111111111112, + "grad_norm": 0.4071589982711702, + "learning_rate": 0.00019904665028066894, + "loss": 0.7118, + "step": 409 + }, + { + "epoch": 0.07288888888888889, + "grad_norm": 0.3874607524608504, + "learning_rate": 0.00019903870192798762, + "loss": 0.6854, + "step": 410 + }, + { + "epoch": 0.07306666666666667, + "grad_norm": 0.3784816688798273, + "learning_rate": 0.00019903072073887507, + "loss": 0.6938, + "step": 411 + }, + { + "epoch": 0.07324444444444445, + "grad_norm": 0.38133597104549394, + "learning_rate": 0.00019902270671597757, + "loss": 0.7136, + "step": 412 + }, + { + "epoch": 0.07342222222222222, + "grad_norm": 0.3969531446889939, + "learning_rate": 0.00019901465986195212, + "loss": 0.6739, + "step": 413 + }, + { + "epoch": 0.0736, + "grad_norm": 0.4001476164167259, + "learning_rate": 0.00019900658017946672, + "loss": 0.7022, + "step": 414 + }, + { + "epoch": 0.07377777777777778, + "grad_norm": 0.3839686474510531, + "learning_rate": 0.00019899846767120014, + "loss": 0.7036, + "step": 415 + }, + { + "epoch": 0.07395555555555555, + "grad_norm": 0.40393162454141374, + "learning_rate": 0.00019899032233984215, + "loss": 0.7466, + "step": 416 + }, + { + "epoch": 0.07413333333333333, + "grad_norm": 0.41546775914410533, + "learning_rate": 0.0001989821441880933, + "loss": 0.6945, + "step": 417 + }, + { + "epoch": 0.0743111111111111, + "grad_norm": 0.4024131794448322, + "learning_rate": 0.00019897393321866507, + "loss": 0.7575, + "step": 418 + }, + { + "epoch": 0.07448888888888888, + "grad_norm": 0.43695009201329216, + "learning_rate": 0.00019896568943427988, + "loss": 0.8218, + "step": 419 + }, + { + "epoch": 0.07466666666666667, + "grad_norm": 0.37219480229165597, + "learning_rate": 0.00019895741283767085, + "loss": 0.683, + "step": 420 + }, + { + "epoch": 0.07484444444444445, + "grad_norm": 0.3963728867326916, + "learning_rate": 0.00019894910343158225, + "loss": 0.7472, + "step": 421 + }, + { + "epoch": 0.07502222222222223, + "grad_norm": 0.401321228712814, + "learning_rate": 0.000198940761218769, + "loss": 0.6606, + "step": 422 + }, + { + "epoch": 0.0752, + "grad_norm": 0.4166752215215881, + "learning_rate": 0.00019893238620199692, + "loss": 0.718, + "step": 423 + }, + { + "epoch": 0.07537777777777778, + "grad_norm": 0.4082404968393422, + "learning_rate": 0.00019892397838404286, + "loss": 0.684, + "step": 424 + }, + { + "epoch": 0.07555555555555556, + "grad_norm": 0.4002102640032932, + "learning_rate": 0.0001989155377676944, + "loss": 0.7701, + "step": 425 + }, + { + "epoch": 0.07573333333333333, + "grad_norm": 0.3962086195289424, + "learning_rate": 0.00019890706435574996, + "loss": 0.7124, + "step": 426 + }, + { + "epoch": 0.07591111111111111, + "grad_norm": 0.376472674399961, + "learning_rate": 0.0001988985581510191, + "loss": 0.7016, + "step": 427 + }, + { + "epoch": 0.07608888888888889, + "grad_norm": 0.3815542765291551, + "learning_rate": 0.0001988900191563219, + "loss": 0.6804, + "step": 428 + }, + { + "epoch": 0.07626666666666666, + "grad_norm": 0.39731788096853865, + "learning_rate": 0.00019888144737448951, + "loss": 0.7337, + "step": 429 + }, + { + "epoch": 0.07644444444444444, + "grad_norm": 0.40619310393560626, + "learning_rate": 0.00019887284280836398, + "loss": 0.6892, + "step": 430 + }, + { + "epoch": 0.07662222222222222, + "grad_norm": 0.4705324077268298, + "learning_rate": 0.0001988642054607981, + "loss": 0.7338, + "step": 431 + }, + { + "epoch": 0.0768, + "grad_norm": 0.39482001590724675, + "learning_rate": 0.00019885553533465565, + "loss": 0.7368, + "step": 432 + }, + { + "epoch": 0.07697777777777778, + "grad_norm": 0.4141309101913045, + "learning_rate": 0.00019884683243281116, + "loss": 0.6901, + "step": 433 + }, + { + "epoch": 0.07715555555555556, + "grad_norm": 0.38605885392346095, + "learning_rate": 0.00019883809675815014, + "loss": 0.7277, + "step": 434 + }, + { + "epoch": 0.07733333333333334, + "grad_norm": 0.39914498904299706, + "learning_rate": 0.00019882932831356888, + "loss": 0.6728, + "step": 435 + }, + { + "epoch": 0.07751111111111111, + "grad_norm": 0.6338942533284905, + "learning_rate": 0.00019882052710197461, + "loss": 0.7376, + "step": 436 + }, + { + "epoch": 0.07768888888888889, + "grad_norm": 0.40094631631964806, + "learning_rate": 0.0001988116931262854, + "loss": 0.7385, + "step": 437 + }, + { + "epoch": 0.07786666666666667, + "grad_norm": 0.416096651605623, + "learning_rate": 0.0001988028263894301, + "loss": 0.741, + "step": 438 + }, + { + "epoch": 0.07804444444444444, + "grad_norm": 0.3996699967882852, + "learning_rate": 0.00019879392689434852, + "loss": 0.708, + "step": 439 + }, + { + "epoch": 0.07822222222222222, + "grad_norm": 0.3986240266716566, + "learning_rate": 0.0001987849946439913, + "loss": 0.7422, + "step": 440 + }, + { + "epoch": 0.0784, + "grad_norm": 0.3837916756105452, + "learning_rate": 0.00019877602964131995, + "loss": 0.6942, + "step": 441 + }, + { + "epoch": 0.07857777777777777, + "grad_norm": 0.4106748207779132, + "learning_rate": 0.00019876703188930684, + "loss": 0.8022, + "step": 442 + }, + { + "epoch": 0.07875555555555555, + "grad_norm": 0.3867554407368537, + "learning_rate": 0.0001987580013909352, + "loss": 0.7392, + "step": 443 + }, + { + "epoch": 0.07893333333333333, + "grad_norm": 0.3899890604107621, + "learning_rate": 0.00019874893814919906, + "loss": 0.7282, + "step": 444 + }, + { + "epoch": 0.0791111111111111, + "grad_norm": 0.3909450016766374, + "learning_rate": 0.00019873984216710336, + "loss": 0.727, + "step": 445 + }, + { + "epoch": 0.0792888888888889, + "grad_norm": 0.3896511609914778, + "learning_rate": 0.0001987307134476639, + "loss": 0.7288, + "step": 446 + }, + { + "epoch": 0.07946666666666667, + "grad_norm": 0.37251349168789144, + "learning_rate": 0.00019872155199390733, + "loss": 0.6931, + "step": 447 + }, + { + "epoch": 0.07964444444444445, + "grad_norm": 0.3777509520845078, + "learning_rate": 0.00019871235780887113, + "loss": 0.7308, + "step": 448 + }, + { + "epoch": 0.07982222222222222, + "grad_norm": 0.36277517948573573, + "learning_rate": 0.00019870313089560365, + "loss": 0.6848, + "step": 449 + }, + { + "epoch": 0.08, + "grad_norm": 0.3919062953118981, + "learning_rate": 0.00019869387125716407, + "loss": 0.6851, + "step": 450 + }, + { + "epoch": 0.08017777777777778, + "grad_norm": 0.38761819854795565, + "learning_rate": 0.00019868457889662248, + "loss": 0.6781, + "step": 451 + }, + { + "epoch": 0.08035555555555556, + "grad_norm": 0.3788510612813003, + "learning_rate": 0.00019867525381705973, + "loss": 0.7273, + "step": 452 + }, + { + "epoch": 0.08053333333333333, + "grad_norm": 0.39016542366868956, + "learning_rate": 0.0001986658960215676, + "loss": 0.765, + "step": 453 + }, + { + "epoch": 0.08071111111111111, + "grad_norm": 0.4266466801889542, + "learning_rate": 0.00019865650551324866, + "loss": 0.7666, + "step": 454 + }, + { + "epoch": 0.08088888888888889, + "grad_norm": 0.3834137034047898, + "learning_rate": 0.00019864708229521636, + "loss": 0.6838, + "step": 455 + }, + { + "epoch": 0.08106666666666666, + "grad_norm": 0.3836031334531388, + "learning_rate": 0.00019863762637059495, + "loss": 0.669, + "step": 456 + }, + { + "epoch": 0.08124444444444444, + "grad_norm": 0.3753655698179046, + "learning_rate": 0.0001986281377425196, + "loss": 0.7294, + "step": 457 + }, + { + "epoch": 0.08142222222222223, + "grad_norm": 0.3699776675881974, + "learning_rate": 0.00019861861641413625, + "loss": 0.6958, + "step": 458 + }, + { + "epoch": 0.0816, + "grad_norm": 0.40947612279335743, + "learning_rate": 0.0001986090623886017, + "loss": 0.777, + "step": 459 + }, + { + "epoch": 0.08177777777777778, + "grad_norm": 0.3789962009920692, + "learning_rate": 0.00019859947566908364, + "loss": 0.7214, + "step": 460 + }, + { + "epoch": 0.08195555555555556, + "grad_norm": 0.36659064268536445, + "learning_rate": 0.00019858985625876056, + "loss": 0.7546, + "step": 461 + }, + { + "epoch": 0.08213333333333334, + "grad_norm": 0.41380546777912214, + "learning_rate": 0.00019858020416082178, + "loss": 0.7473, + "step": 462 + }, + { + "epoch": 0.08231111111111111, + "grad_norm": 0.38297506018788485, + "learning_rate": 0.00019857051937846744, + "loss": 0.7117, + "step": 463 + }, + { + "epoch": 0.08248888888888889, + "grad_norm": 0.3786590318115611, + "learning_rate": 0.00019856080191490858, + "loss": 0.7133, + "step": 464 + }, + { + "epoch": 0.08266666666666667, + "grad_norm": 0.39191032864838593, + "learning_rate": 0.00019855105177336702, + "loss": 0.6793, + "step": 465 + }, + { + "epoch": 0.08284444444444444, + "grad_norm": 0.40403356412990415, + "learning_rate": 0.0001985412689570754, + "loss": 0.674, + "step": 466 + }, + { + "epoch": 0.08302222222222222, + "grad_norm": 0.42610295285771455, + "learning_rate": 0.00019853145346927732, + "loss": 0.7042, + "step": 467 + }, + { + "epoch": 0.0832, + "grad_norm": 0.3781768741886325, + "learning_rate": 0.00019852160531322707, + "loss": 0.6943, + "step": 468 + }, + { + "epoch": 0.08337777777777777, + "grad_norm": 0.41533041027846673, + "learning_rate": 0.00019851172449218978, + "loss": 0.6883, + "step": 469 + }, + { + "epoch": 0.08355555555555555, + "grad_norm": 0.4167632213029769, + "learning_rate": 0.0001985018110094415, + "loss": 0.7761, + "step": 470 + }, + { + "epoch": 0.08373333333333334, + "grad_norm": 0.40252436993266555, + "learning_rate": 0.00019849186486826906, + "loss": 0.7222, + "step": 471 + }, + { + "epoch": 0.08391111111111112, + "grad_norm": 0.3719893170262049, + "learning_rate": 0.00019848188607197008, + "loss": 0.7024, + "step": 472 + }, + { + "epoch": 0.0840888888888889, + "grad_norm": 0.3798984521916213, + "learning_rate": 0.0001984718746238531, + "loss": 0.679, + "step": 473 + }, + { + "epoch": 0.08426666666666667, + "grad_norm": 0.40158972712404656, + "learning_rate": 0.00019846183052723738, + "loss": 0.7276, + "step": 474 + }, + { + "epoch": 0.08444444444444445, + "grad_norm": 0.38624497311934824, + "learning_rate": 0.0001984517537854531, + "loss": 0.7185, + "step": 475 + }, + { + "epoch": 0.08462222222222222, + "grad_norm": 0.3735580173669137, + "learning_rate": 0.00019844164440184118, + "loss": 0.6791, + "step": 476 + }, + { + "epoch": 0.0848, + "grad_norm": 0.3984033532195548, + "learning_rate": 0.00019843150237975344, + "loss": 0.7067, + "step": 477 + }, + { + "epoch": 0.08497777777777778, + "grad_norm": 0.46071691804757686, + "learning_rate": 0.00019842132772255244, + "loss": 0.6828, + "step": 478 + }, + { + "epoch": 0.08515555555555555, + "grad_norm": 0.3859545760101646, + "learning_rate": 0.0001984111204336116, + "loss": 0.6721, + "step": 479 + }, + { + "epoch": 0.08533333333333333, + "grad_norm": 0.3985767473496741, + "learning_rate": 0.00019840088051631524, + "loss": 0.682, + "step": 480 + }, + { + "epoch": 0.08551111111111111, + "grad_norm": 0.3710666445505344, + "learning_rate": 0.00019839060797405833, + "loss": 0.6725, + "step": 481 + }, + { + "epoch": 0.08568888888888888, + "grad_norm": 0.38819738107860136, + "learning_rate": 0.0001983803028102468, + "loss": 0.7446, + "step": 482 + }, + { + "epoch": 0.08586666666666666, + "grad_norm": 0.3747206310678951, + "learning_rate": 0.00019836996502829731, + "loss": 0.7149, + "step": 483 + }, + { + "epoch": 0.08604444444444445, + "grad_norm": 0.3960843239759529, + "learning_rate": 0.0001983595946316374, + "loss": 0.6744, + "step": 484 + }, + { + "epoch": 0.08622222222222223, + "grad_norm": 0.3860936100139274, + "learning_rate": 0.00019834919162370538, + "loss": 0.7233, + "step": 485 + }, + { + "epoch": 0.0864, + "grad_norm": 0.36889033128684595, + "learning_rate": 0.00019833875600795036, + "loss": 0.719, + "step": 486 + }, + { + "epoch": 0.08657777777777778, + "grad_norm": 0.40055611052787216, + "learning_rate": 0.0001983282877878323, + "loss": 0.7299, + "step": 487 + }, + { + "epoch": 0.08675555555555556, + "grad_norm": 0.39221882653240764, + "learning_rate": 0.00019831778696682194, + "loss": 0.685, + "step": 488 + }, + { + "epoch": 0.08693333333333333, + "grad_norm": 0.38394915717190387, + "learning_rate": 0.00019830725354840089, + "loss": 0.7292, + "step": 489 + }, + { + "epoch": 0.08711111111111111, + "grad_norm": 0.3898539840510562, + "learning_rate": 0.00019829668753606146, + "loss": 0.7667, + "step": 490 + }, + { + "epoch": 0.08728888888888889, + "grad_norm": 0.3885663815836747, + "learning_rate": 0.0001982860889333069, + "loss": 0.7478, + "step": 491 + }, + { + "epoch": 0.08746666666666666, + "grad_norm": 0.36612102259144147, + "learning_rate": 0.0001982754577436511, + "loss": 0.6771, + "step": 492 + }, + { + "epoch": 0.08764444444444444, + "grad_norm": 0.3987280709247257, + "learning_rate": 0.00019826479397061893, + "loss": 0.709, + "step": 493 + }, + { + "epoch": 0.08782222222222222, + "grad_norm": 0.4043564451495452, + "learning_rate": 0.00019825409761774592, + "loss": 0.7026, + "step": 494 + }, + { + "epoch": 0.088, + "grad_norm": 0.37926429061357136, + "learning_rate": 0.00019824336868857852, + "loss": 0.6648, + "step": 495 + }, + { + "epoch": 0.08817777777777777, + "grad_norm": 0.36360149442951356, + "learning_rate": 0.00019823260718667386, + "loss": 0.6901, + "step": 496 + }, + { + "epoch": 0.08835555555555556, + "grad_norm": 0.39119767824550433, + "learning_rate": 0.00019822181311559994, + "loss": 0.6665, + "step": 497 + }, + { + "epoch": 0.08853333333333334, + "grad_norm": 0.3802362637359548, + "learning_rate": 0.0001982109864789356, + "loss": 0.7356, + "step": 498 + }, + { + "epoch": 0.08871111111111112, + "grad_norm": 0.39089413244118015, + "learning_rate": 0.00019820012728027044, + "loss": 0.7005, + "step": 499 + }, + { + "epoch": 0.08888888888888889, + "grad_norm": 0.38359311480973846, + "learning_rate": 0.00019818923552320476, + "loss": 0.67, + "step": 500 + }, + { + "epoch": 0.08906666666666667, + "grad_norm": 0.3887716612794801, + "learning_rate": 0.0001981783112113498, + "loss": 0.7046, + "step": 501 + }, + { + "epoch": 0.08924444444444445, + "grad_norm": 0.41701466560451533, + "learning_rate": 0.00019816735434832752, + "loss": 0.7471, + "step": 502 + }, + { + "epoch": 0.08942222222222222, + "grad_norm": 0.3953806694358842, + "learning_rate": 0.00019815636493777063, + "loss": 0.7136, + "step": 503 + }, + { + "epoch": 0.0896, + "grad_norm": 0.3961402619781841, + "learning_rate": 0.00019814534298332278, + "loss": 0.6828, + "step": 504 + }, + { + "epoch": 0.08977777777777778, + "grad_norm": 0.3672594146424926, + "learning_rate": 0.00019813428848863826, + "loss": 0.6584, + "step": 505 + }, + { + "epoch": 0.08995555555555555, + "grad_norm": 0.3850643115863915, + "learning_rate": 0.00019812320145738224, + "loss": 0.7869, + "step": 506 + }, + { + "epoch": 0.09013333333333333, + "grad_norm": 0.37077478089251176, + "learning_rate": 0.00019811208189323058, + "loss": 0.6796, + "step": 507 + }, + { + "epoch": 0.0903111111111111, + "grad_norm": 0.4078763220472397, + "learning_rate": 0.00019810092979987006, + "loss": 0.7009, + "step": 508 + }, + { + "epoch": 0.09048888888888888, + "grad_norm": 0.41811556406244266, + "learning_rate": 0.00019808974518099813, + "loss": 0.7427, + "step": 509 + }, + { + "epoch": 0.09066666666666667, + "grad_norm": 0.37551449192697767, + "learning_rate": 0.00019807852804032305, + "loss": 0.7324, + "step": 510 + }, + { + "epoch": 0.09084444444444445, + "grad_norm": 0.38812220151818816, + "learning_rate": 0.00019806727838156393, + "loss": 0.7076, + "step": 511 + }, + { + "epoch": 0.09102222222222223, + "grad_norm": 0.39630617978651994, + "learning_rate": 0.0001980559962084506, + "loss": 0.7518, + "step": 512 + }, + { + "epoch": 0.0912, + "grad_norm": 0.39678213342399604, + "learning_rate": 0.00019804468152472362, + "loss": 0.6769, + "step": 513 + }, + { + "epoch": 0.09137777777777778, + "grad_norm": 0.4162672008944878, + "learning_rate": 0.00019803333433413448, + "loss": 0.7087, + "step": 514 + }, + { + "epoch": 0.09155555555555556, + "grad_norm": 0.4009749516627074, + "learning_rate": 0.0001980219546404453, + "loss": 0.6927, + "step": 515 + }, + { + "epoch": 0.09173333333333333, + "grad_norm": 0.4075419407703099, + "learning_rate": 0.00019801054244742908, + "loss": 0.7545, + "step": 516 + }, + { + "epoch": 0.09191111111111111, + "grad_norm": 0.3889320504824982, + "learning_rate": 0.0001979990977588695, + "loss": 0.6547, + "step": 517 + }, + { + "epoch": 0.09208888888888889, + "grad_norm": 0.38812102304168816, + "learning_rate": 0.0001979876205785611, + "loss": 0.6839, + "step": 518 + }, + { + "epoch": 0.09226666666666666, + "grad_norm": 0.3955334347617926, + "learning_rate": 0.0001979761109103091, + "loss": 0.783, + "step": 519 + }, + { + "epoch": 0.09244444444444444, + "grad_norm": 0.37131327605098874, + "learning_rate": 0.00019796456875792963, + "loss": 0.733, + "step": 520 + }, + { + "epoch": 0.09262222222222222, + "grad_norm": 0.37768671406181936, + "learning_rate": 0.00019795299412524945, + "loss": 0.6978, + "step": 521 + }, + { + "epoch": 0.0928, + "grad_norm": 0.4046767262774699, + "learning_rate": 0.00019794138701610618, + "loss": 0.7884, + "step": 522 + }, + { + "epoch": 0.09297777777777778, + "grad_norm": 0.39433272676675823, + "learning_rate": 0.00019792974743434815, + "loss": 0.7499, + "step": 523 + }, + { + "epoch": 0.09315555555555556, + "grad_norm": 0.39129637695326064, + "learning_rate": 0.0001979180753838345, + "loss": 0.7109, + "step": 524 + }, + { + "epoch": 0.09333333333333334, + "grad_norm": 0.4035082219548878, + "learning_rate": 0.0001979063708684351, + "loss": 0.7287, + "step": 525 + }, + { + "epoch": 0.09351111111111111, + "grad_norm": 0.3942976843456556, + "learning_rate": 0.00019789463389203064, + "loss": 0.7476, + "step": 526 + }, + { + "epoch": 0.09368888888888889, + "grad_norm": 0.4523195035487732, + "learning_rate": 0.00019788286445851245, + "loss": 0.7001, + "step": 527 + }, + { + "epoch": 0.09386666666666667, + "grad_norm": 0.38904772680540983, + "learning_rate": 0.00019787106257178276, + "loss": 0.6834, + "step": 528 + }, + { + "epoch": 0.09404444444444444, + "grad_norm": 0.4040685501074588, + "learning_rate": 0.00019785922823575448, + "loss": 0.7488, + "step": 529 + }, + { + "epoch": 0.09422222222222222, + "grad_norm": 0.37001074183000726, + "learning_rate": 0.00019784736145435136, + "loss": 0.6912, + "step": 530 + }, + { + "epoch": 0.0944, + "grad_norm": 0.37605916188227884, + "learning_rate": 0.0001978354622315078, + "loss": 0.6769, + "step": 531 + }, + { + "epoch": 0.09457777777777777, + "grad_norm": 0.3863731577389576, + "learning_rate": 0.000197823530571169, + "loss": 0.7146, + "step": 532 + }, + { + "epoch": 0.09475555555555555, + "grad_norm": 0.38784027680178806, + "learning_rate": 0.00019781156647729093, + "loss": 0.7091, + "step": 533 + }, + { + "epoch": 0.09493333333333333, + "grad_norm": 0.39479264286450233, + "learning_rate": 0.00019779956995384033, + "loss": 0.7202, + "step": 534 + }, + { + "epoch": 0.0951111111111111, + "grad_norm": 0.388132039918643, + "learning_rate": 0.0001977875410047946, + "loss": 0.6781, + "step": 535 + }, + { + "epoch": 0.0952888888888889, + "grad_norm": 0.39178644436478455, + "learning_rate": 0.000197775479634142, + "loss": 0.7615, + "step": 536 + }, + { + "epoch": 0.09546666666666667, + "grad_norm": 0.37566978094263886, + "learning_rate": 0.00019776338584588153, + "loss": 0.6683, + "step": 537 + }, + { + "epoch": 0.09564444444444445, + "grad_norm": 0.40332078693756196, + "learning_rate": 0.00019775125964402283, + "loss": 0.7358, + "step": 538 + }, + { + "epoch": 0.09582222222222223, + "grad_norm": 0.39465881375258033, + "learning_rate": 0.0001977391010325864, + "loss": 0.7199, + "step": 539 + }, + { + "epoch": 0.096, + "grad_norm": 0.39739941388198086, + "learning_rate": 0.0001977269100156035, + "loss": 0.7152, + "step": 540 + }, + { + "epoch": 0.09617777777777778, + "grad_norm": 0.40349461005481413, + "learning_rate": 0.00019771468659711595, + "loss": 0.7405, + "step": 541 + }, + { + "epoch": 0.09635555555555556, + "grad_norm": 0.38314651002370476, + "learning_rate": 0.00019770243078117656, + "loss": 0.7485, + "step": 542 + }, + { + "epoch": 0.09653333333333333, + "grad_norm": 0.37249511714783334, + "learning_rate": 0.0001976901425718487, + "loss": 0.7154, + "step": 543 + }, + { + "epoch": 0.09671111111111111, + "grad_norm": 0.43362403261005483, + "learning_rate": 0.0001976778219732066, + "loss": 0.6861, + "step": 544 + }, + { + "epoch": 0.09688888888888889, + "grad_norm": 0.38982482782879585, + "learning_rate": 0.00019766546898933508, + "loss": 0.7075, + "step": 545 + }, + { + "epoch": 0.09706666666666666, + "grad_norm": 0.3949764018822734, + "learning_rate": 0.00019765308362432987, + "loss": 0.7055, + "step": 546 + }, + { + "epoch": 0.09724444444444444, + "grad_norm": 0.3691251495122129, + "learning_rate": 0.00019764066588229734, + "loss": 0.7227, + "step": 547 + }, + { + "epoch": 0.09742222222222222, + "grad_norm": 0.42663727490235365, + "learning_rate": 0.00019762821576735463, + "loss": 0.7703, + "step": 548 + }, + { + "epoch": 0.0976, + "grad_norm": 0.3800437550005673, + "learning_rate": 0.00019761573328362953, + "loss": 0.676, + "step": 549 + }, + { + "epoch": 0.09777777777777778, + "grad_norm": 0.4583794656278647, + "learning_rate": 0.0001976032184352607, + "loss": 0.6876, + "step": 550 + }, + { + "epoch": 0.09795555555555556, + "grad_norm": 0.4920240888261474, + "learning_rate": 0.00019759067122639742, + "loss": 0.7439, + "step": 551 + }, + { + "epoch": 0.09813333333333334, + "grad_norm": 0.37811029285341824, + "learning_rate": 0.0001975780916611997, + "loss": 0.6923, + "step": 552 + }, + { + "epoch": 0.09831111111111111, + "grad_norm": 0.3841859644376975, + "learning_rate": 0.0001975654797438384, + "loss": 0.6769, + "step": 553 + }, + { + "epoch": 0.09848888888888889, + "grad_norm": 0.37061538644462133, + "learning_rate": 0.00019755283547849494, + "loss": 0.6762, + "step": 554 + }, + { + "epoch": 0.09866666666666667, + "grad_norm": 0.3760153276671012, + "learning_rate": 0.0001975401588693616, + "loss": 0.6798, + "step": 555 + }, + { + "epoch": 0.09884444444444444, + "grad_norm": 0.3912361835185172, + "learning_rate": 0.0001975274499206413, + "loss": 0.6902, + "step": 556 + }, + { + "epoch": 0.09902222222222222, + "grad_norm": 0.4283050030644034, + "learning_rate": 0.00019751470863654772, + "loss": 0.7364, + "step": 557 + }, + { + "epoch": 0.0992, + "grad_norm": 0.3853017088387168, + "learning_rate": 0.00019750193502130525, + "loss": 0.6964, + "step": 558 + }, + { + "epoch": 0.09937777777777777, + "grad_norm": 0.3991612722825895, + "learning_rate": 0.000197489129079149, + "loss": 0.6922, + "step": 559 + }, + { + "epoch": 0.09955555555555555, + "grad_norm": 0.3960774506964915, + "learning_rate": 0.0001974762908143248, + "loss": 0.666, + "step": 560 + }, + { + "epoch": 0.09973333333333333, + "grad_norm": 0.3841384622364195, + "learning_rate": 0.0001974634202310892, + "loss": 0.713, + "step": 561 + }, + { + "epoch": 0.09991111111111112, + "grad_norm": 0.39415126927713723, + "learning_rate": 0.00019745051733370948, + "loss": 0.6728, + "step": 562 + }, + { + "epoch": 0.1000888888888889, + "grad_norm": 0.39297294397494387, + "learning_rate": 0.00019743758212646358, + "loss": 0.7126, + "step": 563 + }, + { + "epoch": 0.10026666666666667, + "grad_norm": 0.3666371431502856, + "learning_rate": 0.00019742461461364017, + "loss": 0.6904, + "step": 564 + }, + { + "epoch": 0.10044444444444445, + "grad_norm": 0.4048054635796692, + "learning_rate": 0.0001974116147995387, + "loss": 0.6987, + "step": 565 + }, + { + "epoch": 0.10062222222222222, + "grad_norm": 0.39857663269149735, + "learning_rate": 0.00019739858268846928, + "loss": 0.6878, + "step": 566 + }, + { + "epoch": 0.1008, + "grad_norm": 0.3841995053260552, + "learning_rate": 0.0001973855182847527, + "loss": 0.7525, + "step": 567 + }, + { + "epoch": 0.10097777777777778, + "grad_norm": 0.39437695649325905, + "learning_rate": 0.00019737242159272047, + "loss": 0.7416, + "step": 568 + }, + { + "epoch": 0.10115555555555555, + "grad_norm": 0.4253403948667029, + "learning_rate": 0.00019735929261671485, + "loss": 0.7424, + "step": 569 + }, + { + "epoch": 0.10133333333333333, + "grad_norm": 0.3726733760212832, + "learning_rate": 0.00019734613136108875, + "loss": 0.7197, + "step": 570 + }, + { + "epoch": 0.10151111111111111, + "grad_norm": 0.39698138722245324, + "learning_rate": 0.0001973329378302058, + "loss": 0.714, + "step": 571 + }, + { + "epoch": 0.10168888888888888, + "grad_norm": 0.3974100754920948, + "learning_rate": 0.00019731971202844036, + "loss": 0.7117, + "step": 572 + }, + { + "epoch": 0.10186666666666666, + "grad_norm": 0.377398414120431, + "learning_rate": 0.00019730645396017743, + "loss": 0.6748, + "step": 573 + }, + { + "epoch": 0.10204444444444444, + "grad_norm": 0.38139272096193433, + "learning_rate": 0.0001972931636298128, + "loss": 0.6772, + "step": 574 + }, + { + "epoch": 0.10222222222222223, + "grad_norm": 0.3869568852013764, + "learning_rate": 0.00019727984104175284, + "loss": 0.7045, + "step": 575 + }, + { + "epoch": 0.1024, + "grad_norm": 0.3882846699122299, + "learning_rate": 0.00019726648620041468, + "loss": 0.6897, + "step": 576 + }, + { + "epoch": 0.10257777777777778, + "grad_norm": 0.3996476684381144, + "learning_rate": 0.00019725309911022617, + "loss": 0.735, + "step": 577 + }, + { + "epoch": 0.10275555555555556, + "grad_norm": 0.38720619170224296, + "learning_rate": 0.00019723967977562583, + "loss": 0.6308, + "step": 578 + }, + { + "epoch": 0.10293333333333334, + "grad_norm": 0.39794186627128897, + "learning_rate": 0.0001972262282010628, + "loss": 0.6839, + "step": 579 + }, + { + "epoch": 0.10311111111111111, + "grad_norm": 0.4167802324572103, + "learning_rate": 0.00019721274439099703, + "loss": 0.6449, + "step": 580 + }, + { + "epoch": 0.10328888888888889, + "grad_norm": 0.39156390031788213, + "learning_rate": 0.00019719922834989906, + "loss": 0.6392, + "step": 581 + }, + { + "epoch": 0.10346666666666667, + "grad_norm": 0.39394650573347684, + "learning_rate": 0.00019718568008225015, + "loss": 0.713, + "step": 582 + }, + { + "epoch": 0.10364444444444444, + "grad_norm": 0.43288158874449467, + "learning_rate": 0.0001971720995925423, + "loss": 0.7239, + "step": 583 + }, + { + "epoch": 0.10382222222222222, + "grad_norm": 0.43425034558482667, + "learning_rate": 0.0001971584868852781, + "loss": 0.7546, + "step": 584 + }, + { + "epoch": 0.104, + "grad_norm": 0.36768709439661523, + "learning_rate": 0.00019714484196497084, + "loss": 0.6806, + "step": 585 + }, + { + "epoch": 0.10417777777777777, + "grad_norm": 0.39288106509619114, + "learning_rate": 0.00019713116483614456, + "loss": 0.7649, + "step": 586 + }, + { + "epoch": 0.10435555555555555, + "grad_norm": 0.38031087043482165, + "learning_rate": 0.0001971174555033339, + "loss": 0.7131, + "step": 587 + }, + { + "epoch": 0.10453333333333334, + "grad_norm": 0.39189997310837155, + "learning_rate": 0.00019710371397108425, + "loss": 0.6849, + "step": 588 + }, + { + "epoch": 0.10471111111111112, + "grad_norm": 0.36646347039971455, + "learning_rate": 0.0001970899402439516, + "loss": 0.6606, + "step": 589 + }, + { + "epoch": 0.10488888888888889, + "grad_norm": 0.3833475667981473, + "learning_rate": 0.0001970761343265027, + "loss": 0.7094, + "step": 590 + }, + { + "epoch": 0.10506666666666667, + "grad_norm": 0.3915767845271623, + "learning_rate": 0.00019706229622331486, + "loss": 0.6902, + "step": 591 + }, + { + "epoch": 0.10524444444444445, + "grad_norm": 0.3841717252940577, + "learning_rate": 0.00019704842593897613, + "loss": 0.6678, + "step": 592 + }, + { + "epoch": 0.10542222222222222, + "grad_norm": 0.3925414218590898, + "learning_rate": 0.00019703452347808527, + "loss": 0.6842, + "step": 593 + }, + { + "epoch": 0.1056, + "grad_norm": 0.3924214417851129, + "learning_rate": 0.00019702058884525162, + "loss": 0.668, + "step": 594 + }, + { + "epoch": 0.10577777777777778, + "grad_norm": 0.378592348409656, + "learning_rate": 0.00019700662204509523, + "loss": 0.7097, + "step": 595 + }, + { + "epoch": 0.10595555555555555, + "grad_norm": 0.3948263616879013, + "learning_rate": 0.00019699262308224688, + "loss": 0.6849, + "step": 596 + }, + { + "epoch": 0.10613333333333333, + "grad_norm": 0.42679657208352684, + "learning_rate": 0.00019697859196134786, + "loss": 0.6851, + "step": 597 + }, + { + "epoch": 0.1063111111111111, + "grad_norm": 0.42252954161449646, + "learning_rate": 0.00019696452868705024, + "loss": 0.7239, + "step": 598 + }, + { + "epoch": 0.10648888888888888, + "grad_norm": 0.4034722794383333, + "learning_rate": 0.00019695043326401672, + "loss": 0.7172, + "step": 599 + }, + { + "epoch": 0.10666666666666667, + "grad_norm": 0.3730100807720244, + "learning_rate": 0.00019693630569692067, + "loss": 0.7048, + "step": 600 + }, + { + "epoch": 0.10684444444444445, + "grad_norm": 0.40381360093897134, + "learning_rate": 0.0001969221459904461, + "loss": 0.6368, + "step": 601 + }, + { + "epoch": 0.10702222222222223, + "grad_norm": 0.373362612896415, + "learning_rate": 0.0001969079541492877, + "loss": 0.6612, + "step": 602 + }, + { + "epoch": 0.1072, + "grad_norm": 0.3961177828699534, + "learning_rate": 0.00019689373017815073, + "loss": 0.7093, + "step": 603 + }, + { + "epoch": 0.10737777777777778, + "grad_norm": 0.38292537668712084, + "learning_rate": 0.00019687947408175127, + "loss": 0.666, + "step": 604 + }, + { + "epoch": 0.10755555555555556, + "grad_norm": 0.3962220505027578, + "learning_rate": 0.00019686518586481587, + "loss": 0.7275, + "step": 605 + }, + { + "epoch": 0.10773333333333333, + "grad_norm": 0.384398611311203, + "learning_rate": 0.00019685086553208184, + "loss": 0.7117, + "step": 606 + }, + { + "epoch": 0.10791111111111111, + "grad_norm": 0.3863803568444556, + "learning_rate": 0.0001968365130882971, + "loss": 0.6865, + "step": 607 + }, + { + "epoch": 0.10808888888888889, + "grad_norm": 0.38113676782357997, + "learning_rate": 0.00019682212853822022, + "loss": 0.6771, + "step": 608 + }, + { + "epoch": 0.10826666666666666, + "grad_norm": 0.390287641585076, + "learning_rate": 0.00019680771188662044, + "loss": 0.7175, + "step": 609 + }, + { + "epoch": 0.10844444444444444, + "grad_norm": 0.3960456752266383, + "learning_rate": 0.00019679326313827762, + "loss": 0.7158, + "step": 610 + }, + { + "epoch": 0.10862222222222222, + "grad_norm": 0.423204349538445, + "learning_rate": 0.00019677878229798224, + "loss": 0.6754, + "step": 611 + }, + { + "epoch": 0.1088, + "grad_norm": 0.405886325521896, + "learning_rate": 0.00019676426937053547, + "loss": 0.6894, + "step": 612 + }, + { + "epoch": 0.10897777777777778, + "grad_norm": 0.38614058748178565, + "learning_rate": 0.00019674972436074906, + "loss": 0.6714, + "step": 613 + }, + { + "epoch": 0.10915555555555556, + "grad_norm": 0.40135755079444874, + "learning_rate": 0.00019673514727344547, + "loss": 0.7115, + "step": 614 + }, + { + "epoch": 0.10933333333333334, + "grad_norm": 0.3767696205104445, + "learning_rate": 0.00019672053811345774, + "loss": 0.6975, + "step": 615 + }, + { + "epoch": 0.10951111111111111, + "grad_norm": 0.3707750636427583, + "learning_rate": 0.00019670589688562955, + "loss": 0.7226, + "step": 616 + }, + { + "epoch": 0.10968888888888889, + "grad_norm": 0.3557441208059503, + "learning_rate": 0.00019669122359481525, + "loss": 0.6843, + "step": 617 + }, + { + "epoch": 0.10986666666666667, + "grad_norm": 0.3587624777182352, + "learning_rate": 0.00019667651824587976, + "loss": 0.6983, + "step": 618 + }, + { + "epoch": 0.11004444444444444, + "grad_norm": 0.3868661344532939, + "learning_rate": 0.00019666178084369867, + "loss": 0.6805, + "step": 619 + }, + { + "epoch": 0.11022222222222222, + "grad_norm": 0.38114079818793906, + "learning_rate": 0.0001966470113931582, + "loss": 0.7049, + "step": 620 + }, + { + "epoch": 0.1104, + "grad_norm": 0.3785693962728146, + "learning_rate": 0.00019663220989915513, + "loss": 0.6708, + "step": 621 + }, + { + "epoch": 0.11057777777777777, + "grad_norm": 0.3920112018906953, + "learning_rate": 0.00019661737636659696, + "loss": 0.6892, + "step": 622 + }, + { + "epoch": 0.11075555555555555, + "grad_norm": 0.3884063373305085, + "learning_rate": 0.0001966025108004018, + "loss": 0.6566, + "step": 623 + }, + { + "epoch": 0.11093333333333333, + "grad_norm": 0.41569158134959805, + "learning_rate": 0.00019658761320549833, + "loss": 0.7504, + "step": 624 + }, + { + "epoch": 0.1111111111111111, + "grad_norm": 0.40388249896819356, + "learning_rate": 0.00019657268358682584, + "loss": 0.7174, + "step": 625 + }, + { + "epoch": 0.1112888888888889, + "grad_norm": 0.3738884454707474, + "learning_rate": 0.00019655772194933428, + "loss": 0.6927, + "step": 626 + }, + { + "epoch": 0.11146666666666667, + "grad_norm": 0.39642178791794436, + "learning_rate": 0.00019654272829798425, + "loss": 0.7324, + "step": 627 + }, + { + "epoch": 0.11164444444444445, + "grad_norm": 0.3647370336208459, + "learning_rate": 0.0001965277026377468, + "loss": 0.6447, + "step": 628 + }, + { + "epoch": 0.11182222222222223, + "grad_norm": 0.40055350302841236, + "learning_rate": 0.00019651264497360388, + "loss": 0.719, + "step": 629 + }, + { + "epoch": 0.112, + "grad_norm": 0.468614952268159, + "learning_rate": 0.00019649755531054777, + "loss": 0.7535, + "step": 630 + }, + { + "epoch": 0.11217777777777778, + "grad_norm": 0.385594063029447, + "learning_rate": 0.00019648243365358146, + "loss": 0.6687, + "step": 631 + }, + { + "epoch": 0.11235555555555556, + "grad_norm": 0.3991226740325378, + "learning_rate": 0.00019646728000771862, + "loss": 0.7024, + "step": 632 + }, + { + "epoch": 0.11253333333333333, + "grad_norm": 0.40191609381625604, + "learning_rate": 0.0001964520943779834, + "loss": 0.6457, + "step": 633 + }, + { + "epoch": 0.11271111111111111, + "grad_norm": 0.37586566848670044, + "learning_rate": 0.00019643687676941068, + "loss": 0.7147, + "step": 634 + }, + { + "epoch": 0.11288888888888889, + "grad_norm": 0.40400999422691786, + "learning_rate": 0.00019642162718704585, + "loss": 0.6804, + "step": 635 + }, + { + "epoch": 0.11306666666666666, + "grad_norm": 0.3709453489046353, + "learning_rate": 0.00019640634563594496, + "loss": 0.6882, + "step": 636 + }, + { + "epoch": 0.11324444444444444, + "grad_norm": 0.4134633703381982, + "learning_rate": 0.0001963910321211746, + "loss": 0.7011, + "step": 637 + }, + { + "epoch": 0.11342222222222222, + "grad_norm": 0.40052956636237336, + "learning_rate": 0.00019637568664781195, + "loss": 0.6928, + "step": 638 + }, + { + "epoch": 0.1136, + "grad_norm": 0.38200631677661545, + "learning_rate": 0.0001963603092209449, + "loss": 0.7548, + "step": 639 + }, + { + "epoch": 0.11377777777777778, + "grad_norm": 0.41164011775950954, + "learning_rate": 0.00019634489984567184, + "loss": 0.7364, + "step": 640 + }, + { + "epoch": 0.11395555555555556, + "grad_norm": 0.3838618376968505, + "learning_rate": 0.00019632945852710173, + "loss": 0.6849, + "step": 641 + }, + { + "epoch": 0.11413333333333334, + "grad_norm": 0.38168638484642287, + "learning_rate": 0.00019631398527035422, + "loss": 0.7166, + "step": 642 + }, + { + "epoch": 0.11431111111111111, + "grad_norm": 0.4015902414905251, + "learning_rate": 0.00019629848008055948, + "loss": 0.6678, + "step": 643 + }, + { + "epoch": 0.11448888888888889, + "grad_norm": 0.38283172732806114, + "learning_rate": 0.00019628294296285823, + "loss": 0.6168, + "step": 644 + }, + { + "epoch": 0.11466666666666667, + "grad_norm": 0.39527449961864114, + "learning_rate": 0.00019626737392240188, + "loss": 0.7112, + "step": 645 + }, + { + "epoch": 0.11484444444444444, + "grad_norm": 0.3983478136997403, + "learning_rate": 0.00019625177296435234, + "loss": 0.7185, + "step": 646 + }, + { + "epoch": 0.11502222222222222, + "grad_norm": 0.40437207416312704, + "learning_rate": 0.00019623614009388216, + "loss": 0.7417, + "step": 647 + }, + { + "epoch": 0.1152, + "grad_norm": 0.39653682733883383, + "learning_rate": 0.0001962204753161744, + "loss": 0.7166, + "step": 648 + }, + { + "epoch": 0.11537777777777777, + "grad_norm": 0.392990450268175, + "learning_rate": 0.00019620477863642276, + "loss": 0.6733, + "step": 649 + }, + { + "epoch": 0.11555555555555555, + "grad_norm": 0.3929897861104475, + "learning_rate": 0.0001961890500598315, + "loss": 0.7326, + "step": 650 + }, + { + "epoch": 0.11573333333333333, + "grad_norm": 0.41879292758556386, + "learning_rate": 0.0001961732895916155, + "loss": 0.729, + "step": 651 + }, + { + "epoch": 0.11591111111111112, + "grad_norm": 0.4046234724717051, + "learning_rate": 0.00019615749723700008, + "loss": 0.7067, + "step": 652 + }, + { + "epoch": 0.1160888888888889, + "grad_norm": 0.4189170353654842, + "learning_rate": 0.00019614167300122126, + "loss": 0.6833, + "step": 653 + }, + { + "epoch": 0.11626666666666667, + "grad_norm": 0.40659796045052804, + "learning_rate": 0.0001961258168895256, + "loss": 0.7268, + "step": 654 + }, + { + "epoch": 0.11644444444444445, + "grad_norm": 0.42805021198864035, + "learning_rate": 0.00019610992890717018, + "loss": 0.7046, + "step": 655 + }, + { + "epoch": 0.11662222222222222, + "grad_norm": 0.3790159028051678, + "learning_rate": 0.00019609400905942274, + "loss": 0.6683, + "step": 656 + }, + { + "epoch": 0.1168, + "grad_norm": 0.3990134364124909, + "learning_rate": 0.0001960780573515615, + "loss": 0.7048, + "step": 657 + }, + { + "epoch": 0.11697777777777778, + "grad_norm": 0.399216113387533, + "learning_rate": 0.00019606207378887523, + "loss": 0.727, + "step": 658 + }, + { + "epoch": 0.11715555555555555, + "grad_norm": 0.4001163088053081, + "learning_rate": 0.0001960460583766634, + "loss": 0.719, + "step": 659 + }, + { + "epoch": 0.11733333333333333, + "grad_norm": 0.40635720899063404, + "learning_rate": 0.0001960300111202359, + "loss": 0.7201, + "step": 660 + }, + { + "epoch": 0.11751111111111111, + "grad_norm": 0.4302614248547872, + "learning_rate": 0.00019601393202491315, + "loss": 0.6858, + "step": 661 + }, + { + "epoch": 0.11768888888888888, + "grad_norm": 0.3632270023803551, + "learning_rate": 0.00019599782109602632, + "loss": 0.6455, + "step": 662 + }, + { + "epoch": 0.11786666666666666, + "grad_norm": 0.39515467618320493, + "learning_rate": 0.00019598167833891693, + "loss": 0.7083, + "step": 663 + }, + { + "epoch": 0.11804444444444444, + "grad_norm": 0.41578117914046, + "learning_rate": 0.0001959655037589372, + "loss": 0.7265, + "step": 664 + }, + { + "epoch": 0.11822222222222223, + "grad_norm": 0.4151397514822464, + "learning_rate": 0.00019594929736144976, + "loss": 0.717, + "step": 665 + }, + { + "epoch": 0.1184, + "grad_norm": 0.39651625885923303, + "learning_rate": 0.00019593305915182788, + "loss": 0.7298, + "step": 666 + }, + { + "epoch": 0.11857777777777778, + "grad_norm": 0.4206811369156842, + "learning_rate": 0.00019591678913545544, + "loss": 0.7382, + "step": 667 + }, + { + "epoch": 0.11875555555555556, + "grad_norm": 0.38427741776445884, + "learning_rate": 0.0001959004873177267, + "loss": 0.7221, + "step": 668 + }, + { + "epoch": 0.11893333333333334, + "grad_norm": 0.36996537792351825, + "learning_rate": 0.0001958841537040466, + "loss": 0.6857, + "step": 669 + }, + { + "epoch": 0.11911111111111111, + "grad_norm": 0.3930460323318283, + "learning_rate": 0.00019586778829983054, + "loss": 0.7034, + "step": 670 + }, + { + "epoch": 0.11928888888888889, + "grad_norm": 0.37366446401002124, + "learning_rate": 0.00019585139111050453, + "loss": 0.7149, + "step": 671 + }, + { + "epoch": 0.11946666666666667, + "grad_norm": 0.3736414488492699, + "learning_rate": 0.00019583496214150507, + "loss": 0.6984, + "step": 672 + }, + { + "epoch": 0.11964444444444444, + "grad_norm": 0.38685954327606714, + "learning_rate": 0.0001958185013982792, + "loss": 0.676, + "step": 673 + }, + { + "epoch": 0.11982222222222222, + "grad_norm": 0.3796635393623509, + "learning_rate": 0.00019580200888628452, + "loss": 0.6818, + "step": 674 + }, + { + "epoch": 0.12, + "grad_norm": 0.4034613468508447, + "learning_rate": 0.00019578548461098914, + "loss": 0.6914, + "step": 675 + }, + { + "epoch": 0.12017777777777777, + "grad_norm": 0.38839898897411546, + "learning_rate": 0.0001957689285778717, + "loss": 0.7468, + "step": 676 + }, + { + "epoch": 0.12035555555555555, + "grad_norm": 0.380472166037475, + "learning_rate": 0.00019575234079242143, + "loss": 0.692, + "step": 677 + }, + { + "epoch": 0.12053333333333334, + "grad_norm": 0.38844141026547785, + "learning_rate": 0.00019573572126013793, + "loss": 0.6743, + "step": 678 + }, + { + "epoch": 0.12071111111111112, + "grad_norm": 0.3810917695249503, + "learning_rate": 0.00019571906998653148, + "loss": 0.7074, + "step": 679 + }, + { + "epoch": 0.12088888888888889, + "grad_norm": 0.40683239492813295, + "learning_rate": 0.0001957023869771229, + "loss": 0.7602, + "step": 680 + }, + { + "epoch": 0.12106666666666667, + "grad_norm": 0.3725691518485805, + "learning_rate": 0.00019568567223744339, + "loss": 0.6327, + "step": 681 + }, + { + "epoch": 0.12124444444444445, + "grad_norm": 0.39774666940776254, + "learning_rate": 0.00019566892577303478, + "loss": 0.7107, + "step": 682 + }, + { + "epoch": 0.12142222222222222, + "grad_norm": 0.39312447206654133, + "learning_rate": 0.00019565214758944936, + "loss": 0.686, + "step": 683 + }, + { + "epoch": 0.1216, + "grad_norm": 0.3957977816573584, + "learning_rate": 0.00019563533769225, + "loss": 0.6977, + "step": 684 + }, + { + "epoch": 0.12177777777777778, + "grad_norm": 0.37847264259650193, + "learning_rate": 0.00019561849608700998, + "loss": 0.6631, + "step": 685 + }, + { + "epoch": 0.12195555555555555, + "grad_norm": 0.437690221060892, + "learning_rate": 0.00019560162277931325, + "loss": 0.7385, + "step": 686 + }, + { + "epoch": 0.12213333333333333, + "grad_norm": 0.417906191495558, + "learning_rate": 0.00019558471777475413, + "loss": 0.6765, + "step": 687 + }, + { + "epoch": 0.1223111111111111, + "grad_norm": 0.3843418399254885, + "learning_rate": 0.00019556778107893748, + "loss": 0.6687, + "step": 688 + }, + { + "epoch": 0.12248888888888888, + "grad_norm": 0.39276975611926834, + "learning_rate": 0.00019555081269747877, + "loss": 0.6949, + "step": 689 + }, + { + "epoch": 0.12266666666666666, + "grad_norm": 0.38514665606398174, + "learning_rate": 0.00019553381263600378, + "loss": 0.6814, + "step": 690 + }, + { + "epoch": 0.12284444444444445, + "grad_norm": 0.39437780132172745, + "learning_rate": 0.00019551678090014898, + "loss": 0.7068, + "step": 691 + }, + { + "epoch": 0.12302222222222223, + "grad_norm": 0.3895435528336699, + "learning_rate": 0.00019549971749556125, + "loss": 0.6872, + "step": 692 + }, + { + "epoch": 0.1232, + "grad_norm": 0.40143956438050515, + "learning_rate": 0.00019548262242789796, + "loss": 0.6738, + "step": 693 + }, + { + "epoch": 0.12337777777777778, + "grad_norm": 0.3868980566707207, + "learning_rate": 0.00019546549570282707, + "loss": 0.7085, + "step": 694 + }, + { + "epoch": 0.12355555555555556, + "grad_norm": 0.37660475618434147, + "learning_rate": 0.00019544833732602692, + "loss": 0.6794, + "step": 695 + }, + { + "epoch": 0.12373333333333333, + "grad_norm": 0.38183188794054557, + "learning_rate": 0.0001954311473031864, + "loss": 0.7068, + "step": 696 + }, + { + "epoch": 0.12391111111111111, + "grad_norm": 0.4095031622494095, + "learning_rate": 0.00019541392564000488, + "loss": 0.7735, + "step": 697 + }, + { + "epoch": 0.12408888888888889, + "grad_norm": 0.3655706106021032, + "learning_rate": 0.00019539667234219228, + "loss": 0.702, + "step": 698 + }, + { + "epoch": 0.12426666666666666, + "grad_norm": 0.3785239408015017, + "learning_rate": 0.0001953793874154689, + "loss": 0.7534, + "step": 699 + }, + { + "epoch": 0.12444444444444444, + "grad_norm": 0.3727241968243882, + "learning_rate": 0.00019536207086556564, + "loss": 0.6917, + "step": 700 + }, + { + "epoch": 0.12462222222222222, + "grad_norm": 0.37439415888648636, + "learning_rate": 0.00019534472269822377, + "loss": 0.6591, + "step": 701 + }, + { + "epoch": 0.1248, + "grad_norm": 0.4015846498161762, + "learning_rate": 0.00019532734291919512, + "loss": 0.7426, + "step": 702 + }, + { + "epoch": 0.12497777777777777, + "grad_norm": 0.40124234056468516, + "learning_rate": 0.00019530993153424198, + "loss": 0.6943, + "step": 703 + }, + { + "epoch": 0.12515555555555555, + "grad_norm": 0.3850891059208508, + "learning_rate": 0.00019529248854913714, + "loss": 0.6583, + "step": 704 + }, + { + "epoch": 0.12533333333333332, + "grad_norm": 0.36339471960397257, + "learning_rate": 0.00019527501396966382, + "loss": 0.6505, + "step": 705 + }, + { + "epoch": 0.1255111111111111, + "grad_norm": 0.3676211036995528, + "learning_rate": 0.0001952575078016158, + "loss": 0.6937, + "step": 706 + }, + { + "epoch": 0.12568888888888888, + "grad_norm": 0.3773537720277207, + "learning_rate": 0.0001952399700507972, + "loss": 0.7101, + "step": 707 + }, + { + "epoch": 0.12586666666666665, + "grad_norm": 0.35930125331897794, + "learning_rate": 0.00019522240072302274, + "loss": 0.6799, + "step": 708 + }, + { + "epoch": 0.12604444444444443, + "grad_norm": 0.40235758973764196, + "learning_rate": 0.00019520479982411754, + "loss": 0.7322, + "step": 709 + }, + { + "epoch": 0.12622222222222224, + "grad_norm": 0.3927794035142052, + "learning_rate": 0.0001951871673599172, + "loss": 0.7367, + "step": 710 + }, + { + "epoch": 0.1264, + "grad_norm": 0.4009191490683467, + "learning_rate": 0.0001951695033362678, + "loss": 0.7104, + "step": 711 + }, + { + "epoch": 0.1265777777777778, + "grad_norm": 0.40597785126656877, + "learning_rate": 0.00019515180775902586, + "loss": 0.7197, + "step": 712 + }, + { + "epoch": 0.12675555555555557, + "grad_norm": 0.3744733301622407, + "learning_rate": 0.00019513408063405837, + "loss": 0.662, + "step": 713 + }, + { + "epoch": 0.12693333333333334, + "grad_norm": 0.40420665526701016, + "learning_rate": 0.00019511632196724286, + "loss": 0.74, + "step": 714 + }, + { + "epoch": 0.12711111111111112, + "grad_norm": 0.37712067427193463, + "learning_rate": 0.00019509853176446712, + "loss": 0.7177, + "step": 715 + }, + { + "epoch": 0.1272888888888889, + "grad_norm": 0.3814008249906852, + "learning_rate": 0.0001950807100316296, + "loss": 0.6833, + "step": 716 + }, + { + "epoch": 0.12746666666666667, + "grad_norm": 0.37943427228223975, + "learning_rate": 0.0001950628567746391, + "loss": 0.6702, + "step": 717 + }, + { + "epoch": 0.12764444444444445, + "grad_norm": 0.4034065670836774, + "learning_rate": 0.00019504497199941491, + "loss": 0.6401, + "step": 718 + }, + { + "epoch": 0.12782222222222223, + "grad_norm": 0.4310192209484912, + "learning_rate": 0.00019502705571188672, + "loss": 0.6937, + "step": 719 + }, + { + "epoch": 0.128, + "grad_norm": 0.3838845895439845, + "learning_rate": 0.00019500910791799475, + "loss": 0.7432, + "step": 720 + }, + { + "epoch": 0.12817777777777778, + "grad_norm": 0.40288754937412263, + "learning_rate": 0.0001949911286236896, + "loss": 0.676, + "step": 721 + }, + { + "epoch": 0.12835555555555556, + "grad_norm": 0.9668915824422284, + "learning_rate": 0.0001949731178349323, + "loss": 0.6898, + "step": 722 + }, + { + "epoch": 0.12853333333333333, + "grad_norm": 0.37327470703590615, + "learning_rate": 0.0001949550755576944, + "loss": 0.6726, + "step": 723 + }, + { + "epoch": 0.1287111111111111, + "grad_norm": 0.38951041324497787, + "learning_rate": 0.00019493700179795779, + "loss": 0.7065, + "step": 724 + }, + { + "epoch": 0.1288888888888889, + "grad_norm": 0.3592578967476502, + "learning_rate": 0.0001949188965617149, + "loss": 0.6778, + "step": 725 + }, + { + "epoch": 0.12906666666666666, + "grad_norm": 0.4192139955894857, + "learning_rate": 0.00019490075985496857, + "loss": 0.6891, + "step": 726 + }, + { + "epoch": 0.12924444444444444, + "grad_norm": 0.38679690663668653, + "learning_rate": 0.00019488259168373197, + "loss": 0.753, + "step": 727 + }, + { + "epoch": 0.12942222222222222, + "grad_norm": 0.37928518286891644, + "learning_rate": 0.00019486439205402886, + "loss": 0.687, + "step": 728 + }, + { + "epoch": 0.1296, + "grad_norm": 0.38521234140186406, + "learning_rate": 0.0001948461609718933, + "loss": 0.6821, + "step": 729 + }, + { + "epoch": 0.12977777777777777, + "grad_norm": 0.3811765521477349, + "learning_rate": 0.0001948278984433699, + "loss": 0.6682, + "step": 730 + }, + { + "epoch": 0.12995555555555555, + "grad_norm": 0.3853373344282037, + "learning_rate": 0.00019480960447451352, + "loss": 0.7021, + "step": 731 + }, + { + "epoch": 0.13013333333333332, + "grad_norm": 0.3818534579985387, + "learning_rate": 0.00019479127907138968, + "loss": 0.6748, + "step": 732 + }, + { + "epoch": 0.1303111111111111, + "grad_norm": 0.3736128958052664, + "learning_rate": 0.0001947729222400741, + "loss": 0.6285, + "step": 733 + }, + { + "epoch": 0.13048888888888888, + "grad_norm": 0.4158871767940913, + "learning_rate": 0.00019475453398665307, + "loss": 0.7613, + "step": 734 + }, + { + "epoch": 0.13066666666666665, + "grad_norm": 0.4057020120832801, + "learning_rate": 0.0001947361143172232, + "loss": 0.7343, + "step": 735 + }, + { + "epoch": 0.13084444444444446, + "grad_norm": 0.4042423094083696, + "learning_rate": 0.00019471766323789162, + "loss": 0.6852, + "step": 736 + }, + { + "epoch": 0.13102222222222223, + "grad_norm": 0.4065871322352284, + "learning_rate": 0.00019469918075477573, + "loss": 0.753, + "step": 737 + }, + { + "epoch": 0.1312, + "grad_norm": 0.4049602445920588, + "learning_rate": 0.0001946806668740035, + "loss": 0.7165, + "step": 738 + }, + { + "epoch": 0.1313777777777778, + "grad_norm": 0.36581838614638695, + "learning_rate": 0.00019466212160171322, + "loss": 0.7057, + "step": 739 + }, + { + "epoch": 0.13155555555555556, + "grad_norm": 0.35833651462384475, + "learning_rate": 0.00019464354494405357, + "loss": 0.6942, + "step": 740 + }, + { + "epoch": 0.13173333333333334, + "grad_norm": 0.36420006750116013, + "learning_rate": 0.0001946249369071837, + "loss": 0.6642, + "step": 741 + }, + { + "epoch": 0.13191111111111112, + "grad_norm": 0.3835386281989903, + "learning_rate": 0.0001946062974972731, + "loss": 0.6774, + "step": 742 + }, + { + "epoch": 0.1320888888888889, + "grad_norm": 0.418120550784053, + "learning_rate": 0.00019458762672050175, + "loss": 0.7305, + "step": 743 + }, + { + "epoch": 0.13226666666666667, + "grad_norm": 0.39042444598221515, + "learning_rate": 0.0001945689245830599, + "loss": 0.6663, + "step": 744 + }, + { + "epoch": 0.13244444444444445, + "grad_norm": 0.3948550958170412, + "learning_rate": 0.00019455019109114834, + "loss": 0.7107, + "step": 745 + }, + { + "epoch": 0.13262222222222222, + "grad_norm": 0.3770555486650651, + "learning_rate": 0.00019453142625097813, + "loss": 0.6831, + "step": 746 + }, + { + "epoch": 0.1328, + "grad_norm": 0.3562477937897721, + "learning_rate": 0.00019451263006877082, + "loss": 0.6525, + "step": 747 + }, + { + "epoch": 0.13297777777777778, + "grad_norm": 0.4170259799864657, + "learning_rate": 0.00019449380255075834, + "loss": 0.7611, + "step": 748 + }, + { + "epoch": 0.13315555555555555, + "grad_norm": 0.3541536601102192, + "learning_rate": 0.0001944749437031829, + "loss": 0.7023, + "step": 749 + }, + { + "epoch": 0.13333333333333333, + "grad_norm": 0.3705820712190629, + "learning_rate": 0.00019445605353229724, + "loss": 0.6944, + "step": 750 + }, + { + "epoch": 0.1335111111111111, + "grad_norm": 0.3729234298546076, + "learning_rate": 0.00019443713204436442, + "loss": 0.683, + "step": 751 + }, + { + "epoch": 0.13368888888888888, + "grad_norm": 0.3934317115739774, + "learning_rate": 0.00019441817924565786, + "loss": 0.7215, + "step": 752 + }, + { + "epoch": 0.13386666666666666, + "grad_norm": 0.3827409390397291, + "learning_rate": 0.00019439919514246143, + "loss": 0.683, + "step": 753 + }, + { + "epoch": 0.13404444444444444, + "grad_norm": 0.3655890153117205, + "learning_rate": 0.00019438017974106927, + "loss": 0.6904, + "step": 754 + }, + { + "epoch": 0.13422222222222221, + "grad_norm": 0.48897291399406645, + "learning_rate": 0.00019436113304778605, + "loss": 0.7228, + "step": 755 + }, + { + "epoch": 0.1344, + "grad_norm": 0.3964737782979409, + "learning_rate": 0.00019434205506892668, + "loss": 0.7232, + "step": 756 + }, + { + "epoch": 0.13457777777777777, + "grad_norm": 0.3747917375959724, + "learning_rate": 0.0001943229458108165, + "loss": 0.6475, + "step": 757 + }, + { + "epoch": 0.13475555555555555, + "grad_norm": 0.4084092421153487, + "learning_rate": 0.00019430380527979123, + "loss": 0.6853, + "step": 758 + }, + { + "epoch": 0.13493333333333332, + "grad_norm": 0.3954498153356176, + "learning_rate": 0.0001942846334821969, + "loss": 0.7067, + "step": 759 + }, + { + "epoch": 0.1351111111111111, + "grad_norm": 0.3923375746646342, + "learning_rate": 0.00019426543042438998, + "loss": 0.7349, + "step": 760 + }, + { + "epoch": 0.13528888888888888, + "grad_norm": 0.3846065996861539, + "learning_rate": 0.00019424619611273727, + "loss": 0.6407, + "step": 761 + }, + { + "epoch": 0.13546666666666668, + "grad_norm": 0.40324972054960234, + "learning_rate": 0.00019422693055361594, + "loss": 0.7002, + "step": 762 + }, + { + "epoch": 0.13564444444444446, + "grad_norm": 0.3809227825038652, + "learning_rate": 0.0001942076337534135, + "loss": 0.694, + "step": 763 + }, + { + "epoch": 0.13582222222222223, + "grad_norm": 0.39211048224653544, + "learning_rate": 0.00019418830571852786, + "loss": 0.6426, + "step": 764 + }, + { + "epoch": 0.136, + "grad_norm": 0.38121732199107194, + "learning_rate": 0.00019416894645536722, + "loss": 0.6781, + "step": 765 + }, + { + "epoch": 0.1361777777777778, + "grad_norm": 0.39960463693666126, + "learning_rate": 0.0001941495559703502, + "loss": 0.6541, + "step": 766 + }, + { + "epoch": 0.13635555555555556, + "grad_norm": 0.39853418482635733, + "learning_rate": 0.00019413013426990573, + "loss": 0.7311, + "step": 767 + }, + { + "epoch": 0.13653333333333334, + "grad_norm": 0.37570928744363474, + "learning_rate": 0.0001941106813604731, + "loss": 0.6809, + "step": 768 + }, + { + "epoch": 0.13671111111111112, + "grad_norm": 0.37654309295921845, + "learning_rate": 0.00019409119724850203, + "loss": 0.6965, + "step": 769 + }, + { + "epoch": 0.1368888888888889, + "grad_norm": 0.37267827694515493, + "learning_rate": 0.0001940716819404524, + "loss": 0.6855, + "step": 770 + }, + { + "epoch": 0.13706666666666667, + "grad_norm": 0.41712734231261517, + "learning_rate": 0.00019405213544279457, + "loss": 0.7411, + "step": 771 + }, + { + "epoch": 0.13724444444444445, + "grad_norm": 0.37196299242904857, + "learning_rate": 0.00019403255776200923, + "loss": 0.6814, + "step": 772 + }, + { + "epoch": 0.13742222222222222, + "grad_norm": 0.3736471019310651, + "learning_rate": 0.0001940129489045874, + "loss": 0.66, + "step": 773 + }, + { + "epoch": 0.1376, + "grad_norm": 0.4067752965062365, + "learning_rate": 0.00019399330887703037, + "loss": 0.6631, + "step": 774 + }, + { + "epoch": 0.13777777777777778, + "grad_norm": 0.41606622625490675, + "learning_rate": 0.00019397363768584985, + "loss": 0.6928, + "step": 775 + }, + { + "epoch": 0.13795555555555555, + "grad_norm": 0.37764784814524127, + "learning_rate": 0.00019395393533756791, + "loss": 0.6586, + "step": 776 + }, + { + "epoch": 0.13813333333333333, + "grad_norm": 0.37734096428376585, + "learning_rate": 0.00019393420183871682, + "loss": 0.6674, + "step": 777 + }, + { + "epoch": 0.1383111111111111, + "grad_norm": 0.488061776683202, + "learning_rate": 0.0001939144371958393, + "loss": 0.7338, + "step": 778 + }, + { + "epoch": 0.13848888888888888, + "grad_norm": 0.38761844001744455, + "learning_rate": 0.0001938946414154883, + "loss": 0.7043, + "step": 779 + }, + { + "epoch": 0.13866666666666666, + "grad_norm": 0.4212399759599747, + "learning_rate": 0.00019387481450422716, + "loss": 0.7298, + "step": 780 + }, + { + "epoch": 0.13884444444444444, + "grad_norm": 0.37564377307553876, + "learning_rate": 0.00019385495646862954, + "loss": 0.7146, + "step": 781 + }, + { + "epoch": 0.1390222222222222, + "grad_norm": 0.3613959363788971, + "learning_rate": 0.00019383506731527936, + "loss": 0.6855, + "step": 782 + }, + { + "epoch": 0.1392, + "grad_norm": 0.38638494445979815, + "learning_rate": 0.00019381514705077096, + "loss": 0.7322, + "step": 783 + }, + { + "epoch": 0.13937777777777777, + "grad_norm": 0.3848478283677541, + "learning_rate": 0.00019379519568170887, + "loss": 0.7238, + "step": 784 + }, + { + "epoch": 0.13955555555555554, + "grad_norm": 0.37843001759067885, + "learning_rate": 0.00019377521321470805, + "loss": 0.6246, + "step": 785 + }, + { + "epoch": 0.13973333333333332, + "grad_norm": 0.39755090105671587, + "learning_rate": 0.00019375519965639368, + "loss": 0.7072, + "step": 786 + }, + { + "epoch": 0.13991111111111112, + "grad_norm": 0.4177712000107054, + "learning_rate": 0.0001937351550134013, + "loss": 0.7199, + "step": 787 + }, + { + "epoch": 0.1400888888888889, + "grad_norm": 0.3866941433560375, + "learning_rate": 0.00019371507929237677, + "loss": 0.7334, + "step": 788 + }, + { + "epoch": 0.14026666666666668, + "grad_norm": 0.41247139332204347, + "learning_rate": 0.0001936949724999762, + "loss": 0.7428, + "step": 789 + }, + { + "epoch": 0.14044444444444446, + "grad_norm": 0.39944174506901275, + "learning_rate": 0.000193674834642866, + "loss": 0.6785, + "step": 790 + }, + { + "epoch": 0.14062222222222223, + "grad_norm": 0.4073098744839168, + "learning_rate": 0.0001936546657277229, + "loss": 0.6525, + "step": 791 + }, + { + "epoch": 0.1408, + "grad_norm": 0.35203231595843426, + "learning_rate": 0.00019363446576123403, + "loss": 0.6599, + "step": 792 + }, + { + "epoch": 0.14097777777777779, + "grad_norm": 0.39085949367200534, + "learning_rate": 0.0001936142347500966, + "loss": 0.7578, + "step": 793 + }, + { + "epoch": 0.14115555555555556, + "grad_norm": 0.3853118606301283, + "learning_rate": 0.00019359397270101832, + "loss": 0.7415, + "step": 794 + }, + { + "epoch": 0.14133333333333334, + "grad_norm": 0.36913894834976235, + "learning_rate": 0.0001935736796207171, + "loss": 0.6489, + "step": 795 + }, + { + "epoch": 0.14151111111111112, + "grad_norm": 0.3716800590217003, + "learning_rate": 0.00019355335551592105, + "loss": 0.6131, + "step": 796 + }, + { + "epoch": 0.1416888888888889, + "grad_norm": 0.37141903092957784, + "learning_rate": 0.00019353300039336873, + "loss": 0.688, + "step": 797 + }, + { + "epoch": 0.14186666666666667, + "grad_norm": 0.3720260022775563, + "learning_rate": 0.00019351261425980894, + "loss": 0.6668, + "step": 798 + }, + { + "epoch": 0.14204444444444445, + "grad_norm": 0.40464753411058213, + "learning_rate": 0.00019349219712200063, + "loss": 0.7121, + "step": 799 + }, + { + "epoch": 0.14222222222222222, + "grad_norm": 0.4031291162387869, + "learning_rate": 0.00019347174898671324, + "loss": 0.7358, + "step": 800 + }, + { + "epoch": 0.1424, + "grad_norm": 0.3747899842991318, + "learning_rate": 0.00019345126986072635, + "loss": 0.681, + "step": 801 + }, + { + "epoch": 0.14257777777777778, + "grad_norm": 0.3911511727555308, + "learning_rate": 0.00019343075975082982, + "loss": 0.6956, + "step": 802 + }, + { + "epoch": 0.14275555555555555, + "grad_norm": 0.3778862854504106, + "learning_rate": 0.00019341021866382382, + "loss": 0.6943, + "step": 803 + }, + { + "epoch": 0.14293333333333333, + "grad_norm": 0.3982973493657767, + "learning_rate": 0.00019338964660651876, + "loss": 0.7067, + "step": 804 + }, + { + "epoch": 0.1431111111111111, + "grad_norm": 0.3845951950334205, + "learning_rate": 0.00019336904358573537, + "loss": 0.6442, + "step": 805 + }, + { + "epoch": 0.14328888888888888, + "grad_norm": 0.3779646103778523, + "learning_rate": 0.0001933484096083046, + "loss": 0.6812, + "step": 806 + }, + { + "epoch": 0.14346666666666666, + "grad_norm": 0.37724722844955266, + "learning_rate": 0.00019332774468106768, + "loss": 0.6894, + "step": 807 + }, + { + "epoch": 0.14364444444444444, + "grad_norm": 0.3970349948910945, + "learning_rate": 0.0001933070488108761, + "loss": 0.7438, + "step": 808 + }, + { + "epoch": 0.1438222222222222, + "grad_norm": 0.38646658897235914, + "learning_rate": 0.00019328632200459156, + "loss": 0.6343, + "step": 809 + }, + { + "epoch": 0.144, + "grad_norm": 0.3725777771171682, + "learning_rate": 0.00019326556426908613, + "loss": 0.6869, + "step": 810 + }, + { + "epoch": 0.14417777777777777, + "grad_norm": 0.3967025788249028, + "learning_rate": 0.00019324477561124206, + "loss": 0.7214, + "step": 811 + }, + { + "epoch": 0.14435555555555554, + "grad_norm": 0.3609963595961742, + "learning_rate": 0.0001932239560379518, + "loss": 0.6393, + "step": 812 + }, + { + "epoch": 0.14453333333333335, + "grad_norm": 0.40156946299695406, + "learning_rate": 0.00019320310555611818, + "loss": 0.7506, + "step": 813 + }, + { + "epoch": 0.14471111111111112, + "grad_norm": 0.3830502972541065, + "learning_rate": 0.0001931822241726542, + "loss": 0.7175, + "step": 814 + }, + { + "epoch": 0.1448888888888889, + "grad_norm": 0.38994602022279967, + "learning_rate": 0.00019316131189448305, + "loss": 0.6981, + "step": 815 + }, + { + "epoch": 0.14506666666666668, + "grad_norm": 0.361486592054618, + "learning_rate": 0.0001931403687285383, + "loss": 0.6605, + "step": 816 + }, + { + "epoch": 0.14524444444444445, + "grad_norm": 0.3721825291843033, + "learning_rate": 0.00019311939468176368, + "loss": 0.6725, + "step": 817 + }, + { + "epoch": 0.14542222222222223, + "grad_norm": 0.3702339318523388, + "learning_rate": 0.00019309838976111311, + "loss": 0.6771, + "step": 818 + }, + { + "epoch": 0.1456, + "grad_norm": 0.3655943655108182, + "learning_rate": 0.00019307735397355088, + "loss": 0.6696, + "step": 819 + }, + { + "epoch": 0.14577777777777778, + "grad_norm": 0.3832003903881648, + "learning_rate": 0.00019305628732605137, + "loss": 0.7332, + "step": 820 + }, + { + "epoch": 0.14595555555555556, + "grad_norm": 0.37917627423343236, + "learning_rate": 0.00019303518982559932, + "loss": 0.717, + "step": 821 + }, + { + "epoch": 0.14613333333333334, + "grad_norm": 0.4024645477481322, + "learning_rate": 0.00019301406147918956, + "loss": 0.648, + "step": 822 + }, + { + "epoch": 0.14631111111111111, + "grad_norm": 0.38746880739552597, + "learning_rate": 0.0001929929022938273, + "loss": 0.6984, + "step": 823 + }, + { + "epoch": 0.1464888888888889, + "grad_norm": 0.3827970535115837, + "learning_rate": 0.00019297171227652786, + "loss": 0.6789, + "step": 824 + }, + { + "epoch": 0.14666666666666667, + "grad_norm": 0.40500596869525096, + "learning_rate": 0.00019295049143431685, + "loss": 0.7277, + "step": 825 + }, + { + "epoch": 0.14684444444444444, + "grad_norm": 0.37882096104908486, + "learning_rate": 0.00019292923977423006, + "loss": 0.6711, + "step": 826 + }, + { + "epoch": 0.14702222222222222, + "grad_norm": 0.4146687016604997, + "learning_rate": 0.0001929079573033135, + "loss": 0.7462, + "step": 827 + }, + { + "epoch": 0.1472, + "grad_norm": 0.38815996077635145, + "learning_rate": 0.0001928866440286234, + "loss": 0.662, + "step": 828 + }, + { + "epoch": 0.14737777777777777, + "grad_norm": 0.40178896846691553, + "learning_rate": 0.00019286529995722623, + "loss": 0.7094, + "step": 829 + }, + { + "epoch": 0.14755555555555555, + "grad_norm": 0.3623127536712287, + "learning_rate": 0.00019284392509619864, + "loss": 0.6425, + "step": 830 + }, + { + "epoch": 0.14773333333333333, + "grad_norm": 0.4051273563935649, + "learning_rate": 0.00019282251945262747, + "loss": 0.6766, + "step": 831 + }, + { + "epoch": 0.1479111111111111, + "grad_norm": 0.4040610098428571, + "learning_rate": 0.00019280108303360987, + "loss": 0.6786, + "step": 832 + }, + { + "epoch": 0.14808888888888888, + "grad_norm": 0.3902912390290996, + "learning_rate": 0.00019277961584625303, + "loss": 0.6942, + "step": 833 + }, + { + "epoch": 0.14826666666666666, + "grad_norm": 0.3738145449037745, + "learning_rate": 0.00019275811789767447, + "loss": 0.7028, + "step": 834 + }, + { + "epoch": 0.14844444444444443, + "grad_norm": 0.3713233104536498, + "learning_rate": 0.00019273658919500186, + "loss": 0.6874, + "step": 835 + }, + { + "epoch": 0.1486222222222222, + "grad_norm": 0.379256933973657, + "learning_rate": 0.0001927150297453731, + "loss": 0.6734, + "step": 836 + }, + { + "epoch": 0.1488, + "grad_norm": 0.36501729827377855, + "learning_rate": 0.00019269343955593618, + "loss": 0.6887, + "step": 837 + }, + { + "epoch": 0.14897777777777776, + "grad_norm": 0.3914571055470754, + "learning_rate": 0.00019267181863384946, + "loss": 0.6912, + "step": 838 + }, + { + "epoch": 0.14915555555555557, + "grad_norm": 0.35502848462922065, + "learning_rate": 0.00019265016698628132, + "loss": 0.6771, + "step": 839 + }, + { + "epoch": 0.14933333333333335, + "grad_norm": 0.3574149592775559, + "learning_rate": 0.00019262848462041045, + "loss": 0.6848, + "step": 840 + }, + { + "epoch": 0.14951111111111112, + "grad_norm": 0.3749511200730551, + "learning_rate": 0.00019260677154342564, + "loss": 0.7516, + "step": 841 + }, + { + "epoch": 0.1496888888888889, + "grad_norm": 0.39746920064342706, + "learning_rate": 0.00019258502776252587, + "loss": 0.6851, + "step": 842 + }, + { + "epoch": 0.14986666666666668, + "grad_norm": 0.3995760780095097, + "learning_rate": 0.0001925632532849204, + "loss": 0.7215, + "step": 843 + }, + { + "epoch": 0.15004444444444445, + "grad_norm": 0.39485424985504264, + "learning_rate": 0.00019254144811782845, + "loss": 0.7248, + "step": 844 + }, + { + "epoch": 0.15022222222222223, + "grad_norm": 0.3875916817580916, + "learning_rate": 0.0001925196122684797, + "loss": 0.6821, + "step": 845 + }, + { + "epoch": 0.1504, + "grad_norm": 0.3366175677715393, + "learning_rate": 0.0001924977457441138, + "loss": 0.6446, + "step": 846 + }, + { + "epoch": 0.15057777777777778, + "grad_norm": 0.40719045534080944, + "learning_rate": 0.00019247584855198064, + "loss": 0.6923, + "step": 847 + }, + { + "epoch": 0.15075555555555556, + "grad_norm": 0.3905962557102382, + "learning_rate": 0.00019245392069934024, + "loss": 0.767, + "step": 848 + }, + { + "epoch": 0.15093333333333334, + "grad_norm": 0.3717944052657077, + "learning_rate": 0.00019243196219346283, + "loss": 0.6952, + "step": 849 + }, + { + "epoch": 0.1511111111111111, + "grad_norm": 0.3867249130380458, + "learning_rate": 0.0001924099730416288, + "loss": 0.6647, + "step": 850 + }, + { + "epoch": 0.1512888888888889, + "grad_norm": 0.40528450547748174, + "learning_rate": 0.0001923879532511287, + "loss": 0.7014, + "step": 851 + }, + { + "epoch": 0.15146666666666667, + "grad_norm": 0.3782710266019561, + "learning_rate": 0.00019236590282926318, + "loss": 0.657, + "step": 852 + }, + { + "epoch": 0.15164444444444444, + "grad_norm": 0.40216110744235556, + "learning_rate": 0.0001923438217833431, + "loss": 0.7599, + "step": 853 + }, + { + "epoch": 0.15182222222222222, + "grad_norm": 0.3804514119338972, + "learning_rate": 0.00019232171012068948, + "loss": 0.7034, + "step": 854 + }, + { + "epoch": 0.152, + "grad_norm": 0.373251838160946, + "learning_rate": 0.00019229956784863345, + "loss": 0.6598, + "step": 855 + }, + { + "epoch": 0.15217777777777777, + "grad_norm": 0.3969445549142826, + "learning_rate": 0.00019227739497451637, + "loss": 0.7327, + "step": 856 + }, + { + "epoch": 0.15235555555555555, + "grad_norm": 0.39468893924795007, + "learning_rate": 0.00019225519150568965, + "loss": 0.7299, + "step": 857 + }, + { + "epoch": 0.15253333333333333, + "grad_norm": 0.3839251712329289, + "learning_rate": 0.00019223295744951485, + "loss": 0.7275, + "step": 858 + }, + { + "epoch": 0.1527111111111111, + "grad_norm": 0.38860140658945475, + "learning_rate": 0.00019221069281336378, + "loss": 0.7066, + "step": 859 + }, + { + "epoch": 0.15288888888888888, + "grad_norm": 0.3564378114831085, + "learning_rate": 0.00019218839760461827, + "loss": 0.6811, + "step": 860 + }, + { + "epoch": 0.15306666666666666, + "grad_norm": 0.3994409828105225, + "learning_rate": 0.00019216607183067033, + "loss": 0.7238, + "step": 861 + }, + { + "epoch": 0.15324444444444443, + "grad_norm": 0.38267358680000324, + "learning_rate": 0.0001921437154989221, + "loss": 0.6866, + "step": 862 + }, + { + "epoch": 0.1534222222222222, + "grad_norm": 0.3787665686182062, + "learning_rate": 0.00019212132861678587, + "loss": 0.6758, + "step": 863 + }, + { + "epoch": 0.1536, + "grad_norm": 0.3968012321798156, + "learning_rate": 0.00019209891119168404, + "loss": 0.7525, + "step": 864 + }, + { + "epoch": 0.1537777777777778, + "grad_norm": 0.3755118168875679, + "learning_rate": 0.00019207646323104915, + "loss": 0.6768, + "step": 865 + }, + { + "epoch": 0.15395555555555557, + "grad_norm": 0.37239361100877194, + "learning_rate": 0.00019205398474232384, + "loss": 0.693, + "step": 866 + }, + { + "epoch": 0.15413333333333334, + "grad_norm": 0.39339555350299876, + "learning_rate": 0.0001920314757329609, + "loss": 0.7043, + "step": 867 + }, + { + "epoch": 0.15431111111111112, + "grad_norm": 0.3842754898541647, + "learning_rate": 0.00019200893621042323, + "loss": 0.6884, + "step": 868 + }, + { + "epoch": 0.1544888888888889, + "grad_norm": 0.3661887441910969, + "learning_rate": 0.0001919863661821838, + "loss": 0.6418, + "step": 869 + }, + { + "epoch": 0.15466666666666667, + "grad_norm": 0.40312379655848163, + "learning_rate": 0.00019196376565572577, + "loss": 0.7169, + "step": 870 + }, + { + "epoch": 0.15484444444444445, + "grad_norm": 0.3691160143514685, + "learning_rate": 0.00019194113463854242, + "loss": 0.6753, + "step": 871 + }, + { + "epoch": 0.15502222222222223, + "grad_norm": 0.39931681001637737, + "learning_rate": 0.00019191847313813703, + "loss": 0.6856, + "step": 872 + }, + { + "epoch": 0.1552, + "grad_norm": 0.38596794025358083, + "learning_rate": 0.00019189578116202307, + "loss": 0.7086, + "step": 873 + }, + { + "epoch": 0.15537777777777778, + "grad_norm": 0.36835465349722907, + "learning_rate": 0.0001918730587177241, + "loss": 0.674, + "step": 874 + }, + { + "epoch": 0.15555555555555556, + "grad_norm": 0.3682904690381493, + "learning_rate": 0.00019185030581277384, + "loss": 0.6113, + "step": 875 + }, + { + "epoch": 0.15573333333333333, + "grad_norm": 0.37647941723590267, + "learning_rate": 0.00019182752245471596, + "loss": 0.756, + "step": 876 + }, + { + "epoch": 0.1559111111111111, + "grad_norm": 0.3824400362358979, + "learning_rate": 0.00019180470865110436, + "loss": 0.6926, + "step": 877 + }, + { + "epoch": 0.1560888888888889, + "grad_norm": 0.3706013649073851, + "learning_rate": 0.000191781864409503, + "loss": 0.6355, + "step": 878 + }, + { + "epoch": 0.15626666666666666, + "grad_norm": 0.36518079225092087, + "learning_rate": 0.00019175898973748589, + "loss": 0.7308, + "step": 879 + }, + { + "epoch": 0.15644444444444444, + "grad_norm": 0.3803257423170877, + "learning_rate": 0.00019173608464263721, + "loss": 0.7124, + "step": 880 + }, + { + "epoch": 0.15662222222222222, + "grad_norm": 0.37408064422087633, + "learning_rate": 0.00019171314913255113, + "loss": 0.67, + "step": 881 + }, + { + "epoch": 0.1568, + "grad_norm": 0.41669295893778413, + "learning_rate": 0.00019169018321483198, + "loss": 0.738, + "step": 882 + }, + { + "epoch": 0.15697777777777777, + "grad_norm": 0.3915679222832886, + "learning_rate": 0.00019166718689709415, + "loss": 0.6898, + "step": 883 + }, + { + "epoch": 0.15715555555555555, + "grad_norm": 0.3714762991721066, + "learning_rate": 0.00019164416018696207, + "loss": 0.6736, + "step": 884 + }, + { + "epoch": 0.15733333333333333, + "grad_norm": 0.38048514131328887, + "learning_rate": 0.00019162110309207034, + "loss": 0.6572, + "step": 885 + }, + { + "epoch": 0.1575111111111111, + "grad_norm": 0.3957511094551649, + "learning_rate": 0.0001915980156200635, + "loss": 0.7551, + "step": 886 + }, + { + "epoch": 0.15768888888888888, + "grad_norm": 0.3749959449674772, + "learning_rate": 0.0001915748977785963, + "loss": 0.6837, + "step": 887 + }, + { + "epoch": 0.15786666666666666, + "grad_norm": 0.3708924692046955, + "learning_rate": 0.00019155174957533343, + "loss": 0.6334, + "step": 888 + }, + { + "epoch": 0.15804444444444443, + "grad_norm": 0.3723815872834324, + "learning_rate": 0.00019152857101794978, + "loss": 0.6853, + "step": 889 + }, + { + "epoch": 0.1582222222222222, + "grad_norm": 0.3574084991834564, + "learning_rate": 0.00019150536211413023, + "loss": 0.6665, + "step": 890 + }, + { + "epoch": 0.1584, + "grad_norm": 0.37994981717056847, + "learning_rate": 0.00019148212287156967, + "loss": 0.6421, + "step": 891 + }, + { + "epoch": 0.1585777777777778, + "grad_norm": 0.3719690546774784, + "learning_rate": 0.00019145885329797317, + "loss": 0.6532, + "step": 892 + }, + { + "epoch": 0.15875555555555557, + "grad_norm": 0.3948652525477549, + "learning_rate": 0.00019143555340105572, + "loss": 0.7026, + "step": 893 + }, + { + "epoch": 0.15893333333333334, + "grad_norm": 0.39236242936492893, + "learning_rate": 0.0001914122231885425, + "loss": 0.7022, + "step": 894 + }, + { + "epoch": 0.15911111111111112, + "grad_norm": 0.38184154499779155, + "learning_rate": 0.00019138886266816866, + "loss": 0.7161, + "step": 895 + }, + { + "epoch": 0.1592888888888889, + "grad_norm": 0.3773011500516085, + "learning_rate": 0.00019136547184767943, + "loss": 0.6643, + "step": 896 + }, + { + "epoch": 0.15946666666666667, + "grad_norm": 0.367319783999322, + "learning_rate": 0.00019134205073483002, + "loss": 0.6633, + "step": 897 + }, + { + "epoch": 0.15964444444444445, + "grad_norm": 0.3676518190361671, + "learning_rate": 0.0001913185993373858, + "loss": 0.6798, + "step": 898 + }, + { + "epoch": 0.15982222222222223, + "grad_norm": 0.40317481122827503, + "learning_rate": 0.00019129511766312205, + "loss": 0.7495, + "step": 899 + }, + { + "epoch": 0.16, + "grad_norm": 0.3797450239775051, + "learning_rate": 0.0001912716057198242, + "loss": 0.6783, + "step": 900 + }, + { + "epoch": 0.16017777777777778, + "grad_norm": 0.3666360257069798, + "learning_rate": 0.00019124806351528766, + "loss": 0.6667, + "step": 901 + }, + { + "epoch": 0.16035555555555556, + "grad_norm": 0.37469060589716036, + "learning_rate": 0.0001912244910573179, + "loss": 0.6894, + "step": 902 + }, + { + "epoch": 0.16053333333333333, + "grad_norm": 0.3856385979008903, + "learning_rate": 0.00019120088835373038, + "loss": 0.6585, + "step": 903 + }, + { + "epoch": 0.1607111111111111, + "grad_norm": 0.37598435152839466, + "learning_rate": 0.00019117725541235061, + "loss": 0.67, + "step": 904 + }, + { + "epoch": 0.1608888888888889, + "grad_norm": 0.36572523451244027, + "learning_rate": 0.00019115359224101416, + "loss": 0.7003, + "step": 905 + }, + { + "epoch": 0.16106666666666666, + "grad_norm": 0.38165170390812214, + "learning_rate": 0.00019112989884756653, + "loss": 0.7348, + "step": 906 + }, + { + "epoch": 0.16124444444444444, + "grad_norm": 0.36227885276166877, + "learning_rate": 0.00019110617523986333, + "loss": 0.7095, + "step": 907 + }, + { + "epoch": 0.16142222222222222, + "grad_norm": 0.36840554757525734, + "learning_rate": 0.00019108242142577023, + "loss": 0.6283, + "step": 908 + }, + { + "epoch": 0.1616, + "grad_norm": 0.37681657881387676, + "learning_rate": 0.0001910586374131627, + "loss": 0.672, + "step": 909 + }, + { + "epoch": 0.16177777777777777, + "grad_norm": 0.3589724367086767, + "learning_rate": 0.00019103482320992647, + "loss": 0.6667, + "step": 910 + }, + { + "epoch": 0.16195555555555555, + "grad_norm": 0.3633758066410066, + "learning_rate": 0.00019101097882395717, + "loss": 0.7471, + "step": 911 + }, + { + "epoch": 0.16213333333333332, + "grad_norm": 0.3714861769343233, + "learning_rate": 0.0001909871042631604, + "loss": 0.7148, + "step": 912 + }, + { + "epoch": 0.1623111111111111, + "grad_norm": 0.38220576216861196, + "learning_rate": 0.00019096319953545185, + "loss": 0.6937, + "step": 913 + }, + { + "epoch": 0.16248888888888888, + "grad_norm": 0.34252174084917913, + "learning_rate": 0.00019093926464875714, + "loss": 0.627, + "step": 914 + }, + { + "epoch": 0.16266666666666665, + "grad_norm": 0.3910193525377364, + "learning_rate": 0.00019091529961101191, + "loss": 0.674, + "step": 915 + }, + { + "epoch": 0.16284444444444446, + "grad_norm": 0.37212335353670445, + "learning_rate": 0.00019089130443016182, + "loss": 0.6672, + "step": 916 + }, + { + "epoch": 0.16302222222222224, + "grad_norm": 0.3723265939088728, + "learning_rate": 0.0001908672791141625, + "loss": 0.6823, + "step": 917 + }, + { + "epoch": 0.1632, + "grad_norm": 0.3980200236736282, + "learning_rate": 0.0001908432236709796, + "loss": 0.6584, + "step": 918 + }, + { + "epoch": 0.1633777777777778, + "grad_norm": 0.36951075328814814, + "learning_rate": 0.00019081913810858872, + "loss": 0.6336, + "step": 919 + }, + { + "epoch": 0.16355555555555557, + "grad_norm": 0.3681292106163915, + "learning_rate": 0.00019079502243497546, + "loss": 0.6975, + "step": 920 + }, + { + "epoch": 0.16373333333333334, + "grad_norm": 0.36736987542538535, + "learning_rate": 0.00019077087665813545, + "loss": 0.6912, + "step": 921 + }, + { + "epoch": 0.16391111111111112, + "grad_norm": 0.3953614753595474, + "learning_rate": 0.00019074670078607418, + "loss": 0.7226, + "step": 922 + }, + { + "epoch": 0.1640888888888889, + "grad_norm": 0.3842973294649797, + "learning_rate": 0.00019072249482680726, + "loss": 0.7125, + "step": 923 + }, + { + "epoch": 0.16426666666666667, + "grad_norm": 0.3505606297193894, + "learning_rate": 0.0001906982587883602, + "loss": 0.6727, + "step": 924 + }, + { + "epoch": 0.16444444444444445, + "grad_norm": 0.3632278428872631, + "learning_rate": 0.00019067399267876849, + "loss": 0.6996, + "step": 925 + }, + { + "epoch": 0.16462222222222223, + "grad_norm": 0.3545503866907047, + "learning_rate": 0.0001906496965060776, + "loss": 0.6641, + "step": 926 + }, + { + "epoch": 0.1648, + "grad_norm": 0.3632330341334364, + "learning_rate": 0.00019062537027834297, + "loss": 0.6547, + "step": 927 + }, + { + "epoch": 0.16497777777777778, + "grad_norm": 0.38323046059134197, + "learning_rate": 0.00019060101400362998, + "loss": 0.6662, + "step": 928 + }, + { + "epoch": 0.16515555555555556, + "grad_norm": 0.39803430647266524, + "learning_rate": 0.00019057662769001395, + "loss": 0.6472, + "step": 929 + }, + { + "epoch": 0.16533333333333333, + "grad_norm": 0.3762222200575266, + "learning_rate": 0.0001905522113455803, + "loss": 0.6311, + "step": 930 + }, + { + "epoch": 0.1655111111111111, + "grad_norm": 0.3611497893779868, + "learning_rate": 0.00019052776497842423, + "loss": 0.6708, + "step": 931 + }, + { + "epoch": 0.16568888888888889, + "grad_norm": 0.3633431135172028, + "learning_rate": 0.000190503288596651, + "loss": 0.6384, + "step": 932 + }, + { + "epoch": 0.16586666666666666, + "grad_norm": 0.36716431339514577, + "learning_rate": 0.00019047878220837576, + "loss": 0.6679, + "step": 933 + }, + { + "epoch": 0.16604444444444444, + "grad_norm": 0.3761611957212563, + "learning_rate": 0.00019045424582172368, + "loss": 0.6928, + "step": 934 + }, + { + "epoch": 0.16622222222222222, + "grad_norm": 0.36756920537908794, + "learning_rate": 0.00019042967944482981, + "loss": 0.7036, + "step": 935 + }, + { + "epoch": 0.1664, + "grad_norm": 0.3705242573861348, + "learning_rate": 0.00019040508308583917, + "loss": 0.6677, + "step": 936 + }, + { + "epoch": 0.16657777777777777, + "grad_norm": 0.3476817772622256, + "learning_rate": 0.00019038045675290674, + "loss": 0.6404, + "step": 937 + }, + { + "epoch": 0.16675555555555555, + "grad_norm": 0.36164452836198513, + "learning_rate": 0.0001903558004541974, + "loss": 0.6751, + "step": 938 + }, + { + "epoch": 0.16693333333333332, + "grad_norm": 0.3613310282905976, + "learning_rate": 0.00019033111419788597, + "loss": 0.678, + "step": 939 + }, + { + "epoch": 0.1671111111111111, + "grad_norm": 0.4002764076703055, + "learning_rate": 0.00019030639799215727, + "loss": 0.6583, + "step": 940 + }, + { + "epoch": 0.16728888888888888, + "grad_norm": 0.383884582251674, + "learning_rate": 0.00019028165184520598, + "loss": 0.7168, + "step": 941 + }, + { + "epoch": 0.16746666666666668, + "grad_norm": 0.3513166988467683, + "learning_rate": 0.00019025687576523662, + "loss": 0.6887, + "step": 942 + }, + { + "epoch": 0.16764444444444446, + "grad_norm": 0.3712351944457513, + "learning_rate": 0.00019023206976046388, + "loss": 0.6896, + "step": 943 + }, + { + "epoch": 0.16782222222222223, + "grad_norm": 0.3548838696930136, + "learning_rate": 0.00019020723383911215, + "loss": 0.7209, + "step": 944 + }, + { + "epoch": 0.168, + "grad_norm": 0.3609831047187673, + "learning_rate": 0.00019018236800941586, + "loss": 0.6913, + "step": 945 + }, + { + "epoch": 0.1681777777777778, + "grad_norm": 0.38494901957599786, + "learning_rate": 0.00019015747227961924, + "loss": 0.6726, + "step": 946 + }, + { + "epoch": 0.16835555555555556, + "grad_norm": 0.376045438024239, + "learning_rate": 0.00019013254665797656, + "loss": 0.7049, + "step": 947 + }, + { + "epoch": 0.16853333333333334, + "grad_norm": 0.38693239070360796, + "learning_rate": 0.00019010759115275198, + "loss": 0.7093, + "step": 948 + }, + { + "epoch": 0.16871111111111112, + "grad_norm": 0.39279322099221703, + "learning_rate": 0.00019008260577221947, + "loss": 0.7007, + "step": 949 + }, + { + "epoch": 0.1688888888888889, + "grad_norm": 0.36528657687701077, + "learning_rate": 0.000190057590524663, + "loss": 0.6862, + "step": 950 + }, + { + "epoch": 0.16906666666666667, + "grad_norm": 0.4067825862277342, + "learning_rate": 0.0001900325454183764, + "loss": 0.7108, + "step": 951 + }, + { + "epoch": 0.16924444444444445, + "grad_norm": 0.3641457720727108, + "learning_rate": 0.00019000747046166345, + "loss": 0.6795, + "step": 952 + }, + { + "epoch": 0.16942222222222222, + "grad_norm": 0.38005048120742807, + "learning_rate": 0.00018998236566283774, + "loss": 0.7111, + "step": 953 + }, + { + "epoch": 0.1696, + "grad_norm": 0.40237289294467093, + "learning_rate": 0.00018995723103022285, + "loss": 0.6531, + "step": 954 + }, + { + "epoch": 0.16977777777777778, + "grad_norm": 0.3828704240170546, + "learning_rate": 0.00018993206657215214, + "loss": 0.664, + "step": 955 + }, + { + "epoch": 0.16995555555555555, + "grad_norm": 0.3707010162882291, + "learning_rate": 0.00018990687229696903, + "loss": 0.6801, + "step": 956 + }, + { + "epoch": 0.17013333333333333, + "grad_norm": 0.3766939634441239, + "learning_rate": 0.0001898816482130266, + "loss": 0.663, + "step": 957 + }, + { + "epoch": 0.1703111111111111, + "grad_norm": 0.3875508305819283, + "learning_rate": 0.000189856394328688, + "loss": 0.66, + "step": 958 + }, + { + "epoch": 0.17048888888888888, + "grad_norm": 0.39607092680599376, + "learning_rate": 0.0001898311106523262, + "loss": 0.6969, + "step": 959 + }, + { + "epoch": 0.17066666666666666, + "grad_norm": 0.3748046646768422, + "learning_rate": 0.00018980579719232404, + "loss": 0.7157, + "step": 960 + }, + { + "epoch": 0.17084444444444444, + "grad_norm": 0.3550300218043015, + "learning_rate": 0.00018978045395707418, + "loss": 0.6794, + "step": 961 + }, + { + "epoch": 0.17102222222222221, + "grad_norm": 0.39915666639760633, + "learning_rate": 0.00018975508095497924, + "loss": 0.6239, + "step": 962 + }, + { + "epoch": 0.1712, + "grad_norm": 0.39528568088530114, + "learning_rate": 0.00018972967819445174, + "loss": 0.722, + "step": 963 + }, + { + "epoch": 0.17137777777777777, + "grad_norm": 0.42818435107281805, + "learning_rate": 0.0001897042456839139, + "loss": 0.7361, + "step": 964 + }, + { + "epoch": 0.17155555555555554, + "grad_norm": 0.3770331825732747, + "learning_rate": 0.000189678783431798, + "loss": 0.6903, + "step": 965 + }, + { + "epoch": 0.17173333333333332, + "grad_norm": 0.3690606710884741, + "learning_rate": 0.000189653291446546, + "loss": 0.6654, + "step": 966 + }, + { + "epoch": 0.1719111111111111, + "grad_norm": 0.3531078932464131, + "learning_rate": 0.00018962776973660987, + "loss": 0.6985, + "step": 967 + }, + { + "epoch": 0.1720888888888889, + "grad_norm": 0.37621156797683303, + "learning_rate": 0.00018960221831045137, + "loss": 0.6532, + "step": 968 + }, + { + "epoch": 0.17226666666666668, + "grad_norm": 0.36891033017600644, + "learning_rate": 0.00018957663717654208, + "loss": 0.6621, + "step": 969 + }, + { + "epoch": 0.17244444444444446, + "grad_norm": 0.3706211159271918, + "learning_rate": 0.00018955102634336346, + "loss": 0.68, + "step": 970 + }, + { + "epoch": 0.17262222222222223, + "grad_norm": 0.3501615441517381, + "learning_rate": 0.00018952538581940687, + "loss": 0.6794, + "step": 971 + }, + { + "epoch": 0.1728, + "grad_norm": 0.38998123444222355, + "learning_rate": 0.0001894997156131734, + "loss": 0.6217, + "step": 972 + }, + { + "epoch": 0.17297777777777779, + "grad_norm": 0.395084525965259, + "learning_rate": 0.00018947401573317412, + "loss": 0.6329, + "step": 973 + }, + { + "epoch": 0.17315555555555556, + "grad_norm": 0.44863575803275124, + "learning_rate": 0.0001894482861879298, + "loss": 0.7058, + "step": 974 + }, + { + "epoch": 0.17333333333333334, + "grad_norm": 0.389612453394495, + "learning_rate": 0.00018942252698597113, + "loss": 0.6733, + "step": 975 + }, + { + "epoch": 0.17351111111111112, + "grad_norm": 0.4150308547143673, + "learning_rate": 0.00018939673813583863, + "loss": 0.718, + "step": 976 + }, + { + "epoch": 0.1736888888888889, + "grad_norm": 0.37633665114754683, + "learning_rate": 0.00018937091964608263, + "loss": 0.6395, + "step": 977 + }, + { + "epoch": 0.17386666666666667, + "grad_norm": 0.36311496791114056, + "learning_rate": 0.00018934507152526325, + "loss": 0.6534, + "step": 978 + }, + { + "epoch": 0.17404444444444445, + "grad_norm": 0.3742321074084477, + "learning_rate": 0.00018931919378195052, + "loss": 0.6848, + "step": 979 + }, + { + "epoch": 0.17422222222222222, + "grad_norm": 0.36516688034793926, + "learning_rate": 0.00018929328642472418, + "loss": 0.6838, + "step": 980 + }, + { + "epoch": 0.1744, + "grad_norm": 0.3752231104254026, + "learning_rate": 0.00018926734946217395, + "loss": 0.6907, + "step": 981 + }, + { + "epoch": 0.17457777777777778, + "grad_norm": 0.3654848572864537, + "learning_rate": 0.0001892413829028992, + "loss": 0.6774, + "step": 982 + }, + { + "epoch": 0.17475555555555555, + "grad_norm": 0.3718312413505301, + "learning_rate": 0.0001892153867555092, + "loss": 0.7198, + "step": 983 + }, + { + "epoch": 0.17493333333333333, + "grad_norm": 0.4503316142695314, + "learning_rate": 0.00018918936102862302, + "loss": 0.6622, + "step": 984 + }, + { + "epoch": 0.1751111111111111, + "grad_norm": 0.37168340520887666, + "learning_rate": 0.00018916330573086953, + "loss": 0.655, + "step": 985 + }, + { + "epoch": 0.17528888888888888, + "grad_norm": 0.3760416902427817, + "learning_rate": 0.00018913722087088736, + "loss": 0.6525, + "step": 986 + }, + { + "epoch": 0.17546666666666666, + "grad_norm": 0.3695281597407702, + "learning_rate": 0.00018911110645732505, + "loss": 0.6682, + "step": 987 + }, + { + "epoch": 0.17564444444444444, + "grad_norm": 0.3729231290106923, + "learning_rate": 0.00018908496249884084, + "loss": 0.6777, + "step": 988 + }, + { + "epoch": 0.1758222222222222, + "grad_norm": 0.3816775557778491, + "learning_rate": 0.00018905878900410275, + "loss": 0.6856, + "step": 989 + }, + { + "epoch": 0.176, + "grad_norm": 0.3594686002260559, + "learning_rate": 0.00018903258598178876, + "loss": 0.6836, + "step": 990 + }, + { + "epoch": 0.17617777777777777, + "grad_norm": 0.38593876136560634, + "learning_rate": 0.00018900635344058645, + "loss": 0.7419, + "step": 991 + }, + { + "epoch": 0.17635555555555554, + "grad_norm": 0.3638198093469802, + "learning_rate": 0.00018898009138919322, + "loss": 0.6186, + "step": 992 + }, + { + "epoch": 0.17653333333333332, + "grad_norm": 0.354852666313951, + "learning_rate": 0.00018895379983631635, + "loss": 0.6307, + "step": 993 + }, + { + "epoch": 0.17671111111111112, + "grad_norm": 0.3799000596961431, + "learning_rate": 0.00018892747879067286, + "loss": 0.7246, + "step": 994 + }, + { + "epoch": 0.1768888888888889, + "grad_norm": 0.4116639770867036, + "learning_rate": 0.00018890112826098948, + "loss": 0.6918, + "step": 995 + }, + { + "epoch": 0.17706666666666668, + "grad_norm": 0.36948365701052455, + "learning_rate": 0.0001888747482560028, + "loss": 0.6586, + "step": 996 + }, + { + "epoch": 0.17724444444444445, + "grad_norm": 0.3844647498152997, + "learning_rate": 0.00018884833878445912, + "loss": 0.6963, + "step": 997 + }, + { + "epoch": 0.17742222222222223, + "grad_norm": 0.396528136258244, + "learning_rate": 0.00018882189985511456, + "loss": 0.7181, + "step": 998 + }, + { + "epoch": 0.1776, + "grad_norm": 0.34685819831335285, + "learning_rate": 0.00018879543147673502, + "loss": 0.6369, + "step": 999 + }, + { + "epoch": 0.17777777777777778, + "grad_norm": 0.36814974577224374, + "learning_rate": 0.00018876893365809606, + "loss": 0.6956, + "step": 1000 + }, + { + "epoch": 0.17795555555555556, + "grad_norm": 0.35038795028472525, + "learning_rate": 0.00018874240640798316, + "loss": 0.6551, + "step": 1001 + }, + { + "epoch": 0.17813333333333334, + "grad_norm": 0.3776709427322688, + "learning_rate": 0.0001887158497351914, + "loss": 0.6986, + "step": 1002 + }, + { + "epoch": 0.17831111111111111, + "grad_norm": 0.38182007870136747, + "learning_rate": 0.00018868926364852567, + "loss": 0.6663, + "step": 1003 + }, + { + "epoch": 0.1784888888888889, + "grad_norm": 0.39021592777210223, + "learning_rate": 0.0001886626481568007, + "loss": 0.7273, + "step": 1004 + }, + { + "epoch": 0.17866666666666667, + "grad_norm": 0.37070819002176686, + "learning_rate": 0.00018863600326884082, + "loss": 0.6592, + "step": 1005 + }, + { + "epoch": 0.17884444444444444, + "grad_norm": 0.368270598240532, + "learning_rate": 0.00018860932899348028, + "loss": 0.6836, + "step": 1006 + }, + { + "epoch": 0.17902222222222222, + "grad_norm": 0.3905545408808732, + "learning_rate": 0.0001885826253395629, + "loss": 0.684, + "step": 1007 + }, + { + "epoch": 0.1792, + "grad_norm": 0.3659047557041727, + "learning_rate": 0.00018855589231594227, + "loss": 0.6591, + "step": 1008 + }, + { + "epoch": 0.17937777777777777, + "grad_norm": 0.37032199327762255, + "learning_rate": 0.0001885291299314819, + "loss": 0.6927, + "step": 1009 + }, + { + "epoch": 0.17955555555555555, + "grad_norm": 0.38979255847967137, + "learning_rate": 0.0001885023381950548, + "loss": 0.6703, + "step": 1010 + }, + { + "epoch": 0.17973333333333333, + "grad_norm": 0.35839924991171945, + "learning_rate": 0.00018847551711554384, + "loss": 0.673, + "step": 1011 + }, + { + "epoch": 0.1799111111111111, + "grad_norm": 0.3550182368855843, + "learning_rate": 0.0001884486667018416, + "loss": 0.6717, + "step": 1012 + }, + { + "epoch": 0.18008888888888888, + "grad_norm": 0.39327615547968137, + "learning_rate": 0.00018842178696285039, + "loss": 0.6808, + "step": 1013 + }, + { + "epoch": 0.18026666666666666, + "grad_norm": 0.39124528079775905, + "learning_rate": 0.00018839487790748216, + "loss": 0.6865, + "step": 1014 + }, + { + "epoch": 0.18044444444444444, + "grad_norm": 0.4126227435855467, + "learning_rate": 0.0001883679395446587, + "loss": 0.714, + "step": 1015 + }, + { + "epoch": 0.1806222222222222, + "grad_norm": 0.37866834865644117, + "learning_rate": 0.00018834097188331143, + "loss": 0.6533, + "step": 1016 + }, + { + "epoch": 0.1808, + "grad_norm": 0.3790988565105198, + "learning_rate": 0.00018831397493238158, + "loss": 0.6755, + "step": 1017 + }, + { + "epoch": 0.18097777777777777, + "grad_norm": 0.35713611305074694, + "learning_rate": 0.00018828694870082, + "loss": 0.6608, + "step": 1018 + }, + { + "epoch": 0.18115555555555554, + "grad_norm": 0.39112658937760464, + "learning_rate": 0.00018825989319758724, + "loss": 0.6462, + "step": 1019 + }, + { + "epoch": 0.18133333333333335, + "grad_norm": 0.36100079597207796, + "learning_rate": 0.00018823280843165363, + "loss": 0.653, + "step": 1020 + }, + { + "epoch": 0.18151111111111112, + "grad_norm": 0.3580258160869801, + "learning_rate": 0.00018820569441199917, + "loss": 0.6888, + "step": 1021 + }, + { + "epoch": 0.1816888888888889, + "grad_norm": 0.34887978321262814, + "learning_rate": 0.00018817855114761352, + "loss": 0.6378, + "step": 1022 + }, + { + "epoch": 0.18186666666666668, + "grad_norm": 0.35454007586306613, + "learning_rate": 0.00018815137864749612, + "loss": 0.6636, + "step": 1023 + }, + { + "epoch": 0.18204444444444445, + "grad_norm": 0.3384296633572218, + "learning_rate": 0.000188124176920656, + "loss": 0.6838, + "step": 1024 + }, + { + "epoch": 0.18222222222222223, + "grad_norm": 0.3640204934659153, + "learning_rate": 0.00018809694597611201, + "loss": 0.6872, + "step": 1025 + }, + { + "epoch": 0.1824, + "grad_norm": 0.35392282573488176, + "learning_rate": 0.00018806968582289253, + "loss": 0.7036, + "step": 1026 + }, + { + "epoch": 0.18257777777777778, + "grad_norm": 0.3781764662799718, + "learning_rate": 0.00018804239647003573, + "loss": 0.6995, + "step": 1027 + }, + { + "epoch": 0.18275555555555556, + "grad_norm": 0.3786562802147849, + "learning_rate": 0.00018801507792658942, + "loss": 0.6985, + "step": 1028 + }, + { + "epoch": 0.18293333333333334, + "grad_norm": 0.37520482173238917, + "learning_rate": 0.00018798773020161117, + "loss": 0.7104, + "step": 1029 + }, + { + "epoch": 0.1831111111111111, + "grad_norm": 0.3881926927530677, + "learning_rate": 0.0001879603533041681, + "loss": 0.6706, + "step": 1030 + }, + { + "epoch": 0.1832888888888889, + "grad_norm": 0.40272556819394756, + "learning_rate": 0.00018793294724333707, + "loss": 0.695, + "step": 1031 + }, + { + "epoch": 0.18346666666666667, + "grad_norm": 0.3660902327326268, + "learning_rate": 0.00018790551202820462, + "loss": 0.7255, + "step": 1032 + }, + { + "epoch": 0.18364444444444444, + "grad_norm": 0.36260949205740023, + "learning_rate": 0.00018787804766786693, + "loss": 0.6562, + "step": 1033 + }, + { + "epoch": 0.18382222222222222, + "grad_norm": 0.364765675122991, + "learning_rate": 0.0001878505541714298, + "loss": 0.6426, + "step": 1034 + }, + { + "epoch": 0.184, + "grad_norm": 0.36886135312743806, + "learning_rate": 0.00018782303154800886, + "loss": 0.6976, + "step": 1035 + }, + { + "epoch": 0.18417777777777777, + "grad_norm": 0.38799783711154834, + "learning_rate": 0.00018779547980672917, + "loss": 0.6739, + "step": 1036 + }, + { + "epoch": 0.18435555555555555, + "grad_norm": 0.37845233085869384, + "learning_rate": 0.00018776789895672558, + "loss": 0.6657, + "step": 1037 + }, + { + "epoch": 0.18453333333333333, + "grad_norm": 0.3840572114088358, + "learning_rate": 0.00018774028900714256, + "loss": 0.6153, + "step": 1038 + }, + { + "epoch": 0.1847111111111111, + "grad_norm": 0.3970548335396864, + "learning_rate": 0.00018771264996713424, + "loss": 0.6633, + "step": 1039 + }, + { + "epoch": 0.18488888888888888, + "grad_norm": 0.37743976282598407, + "learning_rate": 0.0001876849818458644, + "loss": 0.6736, + "step": 1040 + }, + { + "epoch": 0.18506666666666666, + "grad_norm": 0.376065022022482, + "learning_rate": 0.00018765728465250644, + "loss": 0.6345, + "step": 1041 + }, + { + "epoch": 0.18524444444444443, + "grad_norm": 0.37318612648855637, + "learning_rate": 0.00018762955839624334, + "loss": 0.6859, + "step": 1042 + }, + { + "epoch": 0.1854222222222222, + "grad_norm": 0.3891986085529512, + "learning_rate": 0.0001876018030862679, + "loss": 0.7351, + "step": 1043 + }, + { + "epoch": 0.1856, + "grad_norm": 0.39996236404332447, + "learning_rate": 0.00018757401873178235, + "loss": 0.7462, + "step": 1044 + }, + { + "epoch": 0.18577777777777776, + "grad_norm": 0.35164083079776937, + "learning_rate": 0.00018754620534199864, + "loss": 0.6848, + "step": 1045 + }, + { + "epoch": 0.18595555555555557, + "grad_norm": 0.3540584455289317, + "learning_rate": 0.00018751836292613838, + "loss": 0.6246, + "step": 1046 + }, + { + "epoch": 0.18613333333333335, + "grad_norm": 0.37539709411181454, + "learning_rate": 0.00018749049149343274, + "loss": 0.6679, + "step": 1047 + }, + { + "epoch": 0.18631111111111112, + "grad_norm": 0.3821921339030591, + "learning_rate": 0.00018746259105312257, + "loss": 0.6825, + "step": 1048 + }, + { + "epoch": 0.1864888888888889, + "grad_norm": 0.38234697402910833, + "learning_rate": 0.00018743466161445823, + "loss": 0.65, + "step": 1049 + }, + { + "epoch": 0.18666666666666668, + "grad_norm": 0.38949180460348376, + "learning_rate": 0.00018740670318669983, + "loss": 0.6988, + "step": 1050 + }, + { + "epoch": 0.18684444444444445, + "grad_norm": 0.37381928872938097, + "learning_rate": 0.000187378715779117, + "loss": 0.661, + "step": 1051 + }, + { + "epoch": 0.18702222222222223, + "grad_norm": 0.36656815412398536, + "learning_rate": 0.00018735069940098903, + "loss": 0.6857, + "step": 1052 + }, + { + "epoch": 0.1872, + "grad_norm": 0.38026359716448904, + "learning_rate": 0.00018732265406160476, + "loss": 0.6694, + "step": 1053 + }, + { + "epoch": 0.18737777777777778, + "grad_norm": 0.3784755671013034, + "learning_rate": 0.00018729457977026274, + "loss": 0.7031, + "step": 1054 + }, + { + "epoch": 0.18755555555555556, + "grad_norm": 0.3738130753243982, + "learning_rate": 0.00018726647653627093, + "loss": 0.6592, + "step": 1055 + }, + { + "epoch": 0.18773333333333334, + "grad_norm": 0.3678085945927551, + "learning_rate": 0.00018723834436894707, + "loss": 0.6805, + "step": 1056 + }, + { + "epoch": 0.1879111111111111, + "grad_norm": 0.3917628992193831, + "learning_rate": 0.00018721018327761842, + "loss": 0.7123, + "step": 1057 + }, + { + "epoch": 0.1880888888888889, + "grad_norm": 0.40096744433793907, + "learning_rate": 0.0001871819932716218, + "loss": 0.7008, + "step": 1058 + }, + { + "epoch": 0.18826666666666667, + "grad_norm": 0.3693331942483457, + "learning_rate": 0.0001871537743603037, + "loss": 0.7261, + "step": 1059 + }, + { + "epoch": 0.18844444444444444, + "grad_norm": 0.3690413756896375, + "learning_rate": 0.0001871255265530201, + "loss": 0.6289, + "step": 1060 + }, + { + "epoch": 0.18862222222222222, + "grad_norm": 0.36045732447648826, + "learning_rate": 0.0001870972498591366, + "loss": 0.6246, + "step": 1061 + }, + { + "epoch": 0.1888, + "grad_norm": 0.4519262981598446, + "learning_rate": 0.00018706894428802845, + "loss": 0.7178, + "step": 1062 + }, + { + "epoch": 0.18897777777777777, + "grad_norm": 0.385301417601109, + "learning_rate": 0.0001870406098490803, + "loss": 0.6625, + "step": 1063 + }, + { + "epoch": 0.18915555555555555, + "grad_norm": 0.37344797452249623, + "learning_rate": 0.00018701224655168658, + "loss": 0.7175, + "step": 1064 + }, + { + "epoch": 0.18933333333333333, + "grad_norm": 0.3830148666530466, + "learning_rate": 0.0001869838544052511, + "loss": 0.6695, + "step": 1065 + }, + { + "epoch": 0.1895111111111111, + "grad_norm": 0.4002807163476561, + "learning_rate": 0.00018695543341918736, + "loss": 0.7022, + "step": 1066 + }, + { + "epoch": 0.18968888888888888, + "grad_norm": 0.36048817733599303, + "learning_rate": 0.00018692698360291837, + "loss": 0.611, + "step": 1067 + }, + { + "epoch": 0.18986666666666666, + "grad_norm": 0.37550275521026466, + "learning_rate": 0.00018689850496587674, + "loss": 0.6527, + "step": 1068 + }, + { + "epoch": 0.19004444444444443, + "grad_norm": 0.38382058586566487, + "learning_rate": 0.0001868699975175045, + "loss": 0.7059, + "step": 1069 + }, + { + "epoch": 0.1902222222222222, + "grad_norm": 0.38953057931205226, + "learning_rate": 0.00018684146126725351, + "loss": 0.6981, + "step": 1070 + }, + { + "epoch": 0.1904, + "grad_norm": 0.38404872425187175, + "learning_rate": 0.00018681289622458485, + "loss": 0.6733, + "step": 1071 + }, + { + "epoch": 0.1905777777777778, + "grad_norm": 0.3899713802272258, + "learning_rate": 0.00018678430239896937, + "loss": 0.6879, + "step": 1072 + }, + { + "epoch": 0.19075555555555557, + "grad_norm": 0.4081394693136384, + "learning_rate": 0.00018675567979988743, + "loss": 0.7543, + "step": 1073 + }, + { + "epoch": 0.19093333333333334, + "grad_norm": 0.3680491743867154, + "learning_rate": 0.00018672702843682882, + "loss": 0.6949, + "step": 1074 + }, + { + "epoch": 0.19111111111111112, + "grad_norm": 0.4201453327037143, + "learning_rate": 0.000186698348319293, + "loss": 0.7089, + "step": 1075 + }, + { + "epoch": 0.1912888888888889, + "grad_norm": 0.38747821429503604, + "learning_rate": 0.00018666963945678888, + "loss": 0.7007, + "step": 1076 + }, + { + "epoch": 0.19146666666666667, + "grad_norm": 0.3713182780208938, + "learning_rate": 0.00018664090185883491, + "loss": 0.6327, + "step": 1077 + }, + { + "epoch": 0.19164444444444445, + "grad_norm": 0.4076024682018734, + "learning_rate": 0.00018661213553495913, + "loss": 0.7065, + "step": 1078 + }, + { + "epoch": 0.19182222222222223, + "grad_norm": 0.37472837532627157, + "learning_rate": 0.00018658334049469904, + "loss": 0.6532, + "step": 1079 + }, + { + "epoch": 0.192, + "grad_norm": 0.377628716421935, + "learning_rate": 0.00018655451674760168, + "loss": 0.7094, + "step": 1080 + }, + { + "epoch": 0.19217777777777778, + "grad_norm": 0.426814684041566, + "learning_rate": 0.00018652566430322356, + "loss": 0.6952, + "step": 1081 + }, + { + "epoch": 0.19235555555555556, + "grad_norm": 0.3935512954679948, + "learning_rate": 0.00018649678317113084, + "loss": 0.7195, + "step": 1082 + }, + { + "epoch": 0.19253333333333333, + "grad_norm": 0.3554223296686633, + "learning_rate": 0.000186467873360899, + "loss": 0.6075, + "step": 1083 + }, + { + "epoch": 0.1927111111111111, + "grad_norm": 0.4143116372175908, + "learning_rate": 0.00018643893488211327, + "loss": 0.754, + "step": 1084 + }, + { + "epoch": 0.1928888888888889, + "grad_norm": 0.39760016319044933, + "learning_rate": 0.00018640996774436808, + "loss": 0.7025, + "step": 1085 + }, + { + "epoch": 0.19306666666666666, + "grad_norm": 0.41038883584327823, + "learning_rate": 0.00018638097195726764, + "loss": 0.7028, + "step": 1086 + }, + { + "epoch": 0.19324444444444444, + "grad_norm": 0.3971912174566233, + "learning_rate": 0.00018635194753042553, + "loss": 0.6829, + "step": 1087 + }, + { + "epoch": 0.19342222222222222, + "grad_norm": 0.3866693652689321, + "learning_rate": 0.00018632289447346483, + "loss": 0.6848, + "step": 1088 + }, + { + "epoch": 0.1936, + "grad_norm": 0.3750370247759865, + "learning_rate": 0.00018629381279601813, + "loss": 0.6715, + "step": 1089 + }, + { + "epoch": 0.19377777777777777, + "grad_norm": 0.37228851729537327, + "learning_rate": 0.00018626470250772748, + "loss": 0.6566, + "step": 1090 + }, + { + "epoch": 0.19395555555555555, + "grad_norm": 0.36048117851376804, + "learning_rate": 0.00018623556361824445, + "loss": 0.6651, + "step": 1091 + }, + { + "epoch": 0.19413333333333332, + "grad_norm": 0.38891722694839975, + "learning_rate": 0.00018620639613723013, + "loss": 0.6675, + "step": 1092 + }, + { + "epoch": 0.1943111111111111, + "grad_norm": 0.3675322596141409, + "learning_rate": 0.00018617720007435497, + "loss": 0.713, + "step": 1093 + }, + { + "epoch": 0.19448888888888888, + "grad_norm": 0.3682768200879474, + "learning_rate": 0.00018614797543929903, + "loss": 0.6895, + "step": 1094 + }, + { + "epoch": 0.19466666666666665, + "grad_norm": 0.3645532602208841, + "learning_rate": 0.00018611872224175177, + "loss": 0.6805, + "step": 1095 + }, + { + "epoch": 0.19484444444444443, + "grad_norm": 0.37646682665981573, + "learning_rate": 0.00018608944049141205, + "loss": 0.6566, + "step": 1096 + }, + { + "epoch": 0.19502222222222224, + "grad_norm": 0.36288741972391836, + "learning_rate": 0.00018606013019798837, + "loss": 0.6724, + "step": 1097 + }, + { + "epoch": 0.1952, + "grad_norm": 0.39136349420864747, + "learning_rate": 0.00018603079137119864, + "loss": 0.667, + "step": 1098 + }, + { + "epoch": 0.1953777777777778, + "grad_norm": 0.3861845596742569, + "learning_rate": 0.00018600142402077006, + "loss": 0.6934, + "step": 1099 + }, + { + "epoch": 0.19555555555555557, + "grad_norm": 0.36018266872198745, + "learning_rate": 0.00018597202815643952, + "loss": 0.6939, + "step": 1100 + }, + { + "epoch": 0.19573333333333334, + "grad_norm": 0.3583860976993596, + "learning_rate": 0.00018594260378795323, + "loss": 0.6655, + "step": 1101 + }, + { + "epoch": 0.19591111111111112, + "grad_norm": 0.3635719108851077, + "learning_rate": 0.00018591315092506688, + "loss": 0.6408, + "step": 1102 + }, + { + "epoch": 0.1960888888888889, + "grad_norm": 1.5344025595617339, + "learning_rate": 0.0001858836695775456, + "loss": 0.6816, + "step": 1103 + }, + { + "epoch": 0.19626666666666667, + "grad_norm": 0.38066363645383966, + "learning_rate": 0.000185854159755164, + "loss": 0.6644, + "step": 1104 + }, + { + "epoch": 0.19644444444444445, + "grad_norm": 0.37412340137287314, + "learning_rate": 0.00018582462146770614, + "loss": 0.6683, + "step": 1105 + }, + { + "epoch": 0.19662222222222223, + "grad_norm": 0.3715445539287525, + "learning_rate": 0.00018579505472496544, + "loss": 0.6458, + "step": 1106 + }, + { + "epoch": 0.1968, + "grad_norm": 0.38897752233594024, + "learning_rate": 0.00018576545953674476, + "loss": 0.6806, + "step": 1107 + }, + { + "epoch": 0.19697777777777778, + "grad_norm": 0.3869561469046338, + "learning_rate": 0.00018573583591285648, + "loss": 0.6239, + "step": 1108 + }, + { + "epoch": 0.19715555555555556, + "grad_norm": 0.3909276500815709, + "learning_rate": 0.00018570618386312235, + "loss": 0.728, + "step": 1109 + }, + { + "epoch": 0.19733333333333333, + "grad_norm": 0.37638523453247363, + "learning_rate": 0.00018567650339737358, + "loss": 0.6342, + "step": 1110 + }, + { + "epoch": 0.1975111111111111, + "grad_norm": 0.36314456399322503, + "learning_rate": 0.0001856467945254507, + "loss": 0.6378, + "step": 1111 + }, + { + "epoch": 0.1976888888888889, + "grad_norm": 0.3690453086516602, + "learning_rate": 0.0001856170572572038, + "loss": 0.6807, + "step": 1112 + }, + { + "epoch": 0.19786666666666666, + "grad_norm": 0.3829964987340547, + "learning_rate": 0.00018558729160249229, + "loss": 0.6822, + "step": 1113 + }, + { + "epoch": 0.19804444444444444, + "grad_norm": 0.38615860279156744, + "learning_rate": 0.00018555749757118498, + "loss": 0.6673, + "step": 1114 + }, + { + "epoch": 0.19822222222222222, + "grad_norm": 0.3771590897552604, + "learning_rate": 0.00018552767517316022, + "loss": 0.6499, + "step": 1115 + }, + { + "epoch": 0.1984, + "grad_norm": 0.4042250038542116, + "learning_rate": 0.00018549782441830556, + "loss": 0.6549, + "step": 1116 + }, + { + "epoch": 0.19857777777777777, + "grad_norm": 0.3644069189474034, + "learning_rate": 0.00018546794531651816, + "loss": 0.6466, + "step": 1117 + }, + { + "epoch": 0.19875555555555555, + "grad_norm": 0.3879639771946161, + "learning_rate": 0.00018543803787770443, + "loss": 0.7134, + "step": 1118 + }, + { + "epoch": 0.19893333333333332, + "grad_norm": 0.38777353313294133, + "learning_rate": 0.00018540810211178024, + "loss": 0.6996, + "step": 1119 + }, + { + "epoch": 0.1991111111111111, + "grad_norm": 0.3712480770903855, + "learning_rate": 0.0001853781380286708, + "loss": 0.724, + "step": 1120 + }, + { + "epoch": 0.19928888888888888, + "grad_norm": 0.3586363074948968, + "learning_rate": 0.00018534814563831082, + "loss": 0.6881, + "step": 1121 + }, + { + "epoch": 0.19946666666666665, + "grad_norm": 0.3699735860253999, + "learning_rate": 0.00018531812495064428, + "loss": 0.6816, + "step": 1122 + }, + { + "epoch": 0.19964444444444446, + "grad_norm": 0.3763551299303036, + "learning_rate": 0.0001852880759756246, + "loss": 0.62, + "step": 1123 + }, + { + "epoch": 0.19982222222222223, + "grad_norm": 0.37159190733663094, + "learning_rate": 0.0001852579987232145, + "loss": 0.7075, + "step": 1124 + }, + { + "epoch": 0.2, + "grad_norm": 0.369739302552831, + "learning_rate": 0.00018522789320338622, + "loss": 0.6516, + "step": 1125 + }, + { + "epoch": 0.2001777777777778, + "grad_norm": 0.3880099189965292, + "learning_rate": 0.00018519775942612128, + "loss": 0.6751, + "step": 1126 + }, + { + "epoch": 0.20035555555555556, + "grad_norm": 0.39833414610430823, + "learning_rate": 0.0001851675974014105, + "loss": 0.6953, + "step": 1127 + }, + { + "epoch": 0.20053333333333334, + "grad_norm": 0.40907109587737156, + "learning_rate": 0.0001851374071392543, + "loss": 0.6608, + "step": 1128 + }, + { + "epoch": 0.20071111111111112, + "grad_norm": 0.3701645480867035, + "learning_rate": 0.0001851071886496621, + "loss": 0.6348, + "step": 1129 + }, + { + "epoch": 0.2008888888888889, + "grad_norm": 0.3721107847160279, + "learning_rate": 0.0001850769419426531, + "loss": 0.6942, + "step": 1130 + }, + { + "epoch": 0.20106666666666667, + "grad_norm": 0.42932904686343043, + "learning_rate": 0.00018504666702825548, + "loss": 0.7384, + "step": 1131 + }, + { + "epoch": 0.20124444444444445, + "grad_norm": 0.38971189939948153, + "learning_rate": 0.00018501636391650701, + "loss": 0.647, + "step": 1132 + }, + { + "epoch": 0.20142222222222222, + "grad_norm": 0.36170462957011823, + "learning_rate": 0.0001849860326174547, + "loss": 0.6633, + "step": 1133 + }, + { + "epoch": 0.2016, + "grad_norm": 0.4286187198519224, + "learning_rate": 0.00018495567314115495, + "loss": 0.6968, + "step": 1134 + }, + { + "epoch": 0.20177777777777778, + "grad_norm": 0.3949704438013471, + "learning_rate": 0.00018492528549767353, + "loss": 0.701, + "step": 1135 + }, + { + "epoch": 0.20195555555555555, + "grad_norm": 0.39499882521052326, + "learning_rate": 0.00018489486969708543, + "loss": 0.6867, + "step": 1136 + }, + { + "epoch": 0.20213333333333333, + "grad_norm": 0.3849631930084498, + "learning_rate": 0.00018486442574947511, + "loss": 0.6707, + "step": 1137 + }, + { + "epoch": 0.2023111111111111, + "grad_norm": 0.38731904488506846, + "learning_rate": 0.0001848339536649363, + "loss": 0.7322, + "step": 1138 + }, + { + "epoch": 0.20248888888888888, + "grad_norm": 0.384619797734893, + "learning_rate": 0.00018480345345357204, + "loss": 0.711, + "step": 1139 + }, + { + "epoch": 0.20266666666666666, + "grad_norm": 0.3669035594777077, + "learning_rate": 0.00018477292512549475, + "loss": 0.6201, + "step": 1140 + }, + { + "epoch": 0.20284444444444444, + "grad_norm": 0.36143735625158474, + "learning_rate": 0.00018474236869082616, + "loss": 0.6939, + "step": 1141 + }, + { + "epoch": 0.20302222222222222, + "grad_norm": 0.38597058955210695, + "learning_rate": 0.00018471178415969722, + "loss": 0.6483, + "step": 1142 + }, + { + "epoch": 0.2032, + "grad_norm": 0.3661880236347438, + "learning_rate": 0.00018468117154224839, + "loss": 0.6735, + "step": 1143 + }, + { + "epoch": 0.20337777777777777, + "grad_norm": 0.3666624021547499, + "learning_rate": 0.00018465053084862923, + "loss": 0.669, + "step": 1144 + }, + { + "epoch": 0.20355555555555555, + "grad_norm": 0.3685974860703644, + "learning_rate": 0.00018461986208899878, + "loss": 0.6852, + "step": 1145 + }, + { + "epoch": 0.20373333333333332, + "grad_norm": 0.3775971460694314, + "learning_rate": 0.00018458916527352526, + "loss": 0.6222, + "step": 1146 + }, + { + "epoch": 0.2039111111111111, + "grad_norm": 0.37062533603571957, + "learning_rate": 0.00018455844041238625, + "loss": 0.6558, + "step": 1147 + }, + { + "epoch": 0.20408888888888888, + "grad_norm": 0.36951976564161504, + "learning_rate": 0.0001845276875157687, + "loss": 0.6766, + "step": 1148 + }, + { + "epoch": 0.20426666666666668, + "grad_norm": 0.36297876171397325, + "learning_rate": 0.0001844969065938687, + "loss": 0.6684, + "step": 1149 + }, + { + "epoch": 0.20444444444444446, + "grad_norm": 0.3855925854954929, + "learning_rate": 0.0001844660976568917, + "loss": 0.6203, + "step": 1150 + }, + { + "epoch": 0.20462222222222223, + "grad_norm": 0.36069190151612895, + "learning_rate": 0.00018443526071505254, + "loss": 0.6697, + "step": 1151 + }, + { + "epoch": 0.2048, + "grad_norm": 0.3919835110382578, + "learning_rate": 0.0001844043957785752, + "loss": 0.66, + "step": 1152 + }, + { + "epoch": 0.2049777777777778, + "grad_norm": 0.3871294120088186, + "learning_rate": 0.00018437350285769295, + "loss": 0.654, + "step": 1153 + }, + { + "epoch": 0.20515555555555556, + "grad_norm": 0.3764036048957266, + "learning_rate": 0.00018434258196264845, + "loss": 0.6317, + "step": 1154 + }, + { + "epoch": 0.20533333333333334, + "grad_norm": 0.3569369602465215, + "learning_rate": 0.00018431163310369354, + "loss": 0.6095, + "step": 1155 + }, + { + "epoch": 0.20551111111111112, + "grad_norm": 0.3682996887968764, + "learning_rate": 0.00018428065629108934, + "loss": 0.6586, + "step": 1156 + }, + { + "epoch": 0.2056888888888889, + "grad_norm": 0.362514413631995, + "learning_rate": 0.00018424965153510635, + "loss": 0.6718, + "step": 1157 + }, + { + "epoch": 0.20586666666666667, + "grad_norm": 0.3569257258786691, + "learning_rate": 0.00018421861884602414, + "loss": 0.6714, + "step": 1158 + }, + { + "epoch": 0.20604444444444445, + "grad_norm": 0.36420710820280233, + "learning_rate": 0.0001841875582341317, + "loss": 0.6595, + "step": 1159 + }, + { + "epoch": 0.20622222222222222, + "grad_norm": 0.3667777885499808, + "learning_rate": 0.0001841564697097272, + "loss": 0.6966, + "step": 1160 + }, + { + "epoch": 0.2064, + "grad_norm": 0.3698696460020529, + "learning_rate": 0.00018412535328311814, + "loss": 0.6739, + "step": 1161 + }, + { + "epoch": 0.20657777777777778, + "grad_norm": 0.3603207099840638, + "learning_rate": 0.00018409420896462112, + "loss": 0.676, + "step": 1162 + }, + { + "epoch": 0.20675555555555555, + "grad_norm": 0.35764408962982924, + "learning_rate": 0.00018406303676456217, + "loss": 0.6324, + "step": 1163 + }, + { + "epoch": 0.20693333333333333, + "grad_norm": 0.3786941692537567, + "learning_rate": 0.00018403183669327646, + "loss": 0.6818, + "step": 1164 + }, + { + "epoch": 0.2071111111111111, + "grad_norm": 0.4034681434660428, + "learning_rate": 0.00018400060876110842, + "loss": 0.6521, + "step": 1165 + }, + { + "epoch": 0.20728888888888888, + "grad_norm": 0.37000246611017584, + "learning_rate": 0.00018396935297841166, + "loss": 0.6702, + "step": 1166 + }, + { + "epoch": 0.20746666666666666, + "grad_norm": 0.3936626024740484, + "learning_rate": 0.00018393806935554916, + "loss": 0.6588, + "step": 1167 + }, + { + "epoch": 0.20764444444444444, + "grad_norm": 0.40252913735608165, + "learning_rate": 0.00018390675790289302, + "loss": 0.7248, + "step": 1168 + }, + { + "epoch": 0.2078222222222222, + "grad_norm": 0.391218634766642, + "learning_rate": 0.0001838754186308246, + "loss": 0.6775, + "step": 1169 + }, + { + "epoch": 0.208, + "grad_norm": 0.39203492521533034, + "learning_rate": 0.0001838440515497345, + "loss": 0.67, + "step": 1170 + }, + { + "epoch": 0.20817777777777777, + "grad_norm": 0.3662629325527311, + "learning_rate": 0.0001838126566700225, + "loss": 0.6925, + "step": 1171 + }, + { + "epoch": 0.20835555555555554, + "grad_norm": 0.40363709245565926, + "learning_rate": 0.00018378123400209764, + "loss": 0.6992, + "step": 1172 + }, + { + "epoch": 0.20853333333333332, + "grad_norm": 0.3798693237666605, + "learning_rate": 0.00018374978355637813, + "loss": 0.7335, + "step": 1173 + }, + { + "epoch": 0.2087111111111111, + "grad_norm": 0.37116874568327346, + "learning_rate": 0.00018371830534329143, + "loss": 0.666, + "step": 1174 + }, + { + "epoch": 0.2088888888888889, + "grad_norm": 0.38959958943797834, + "learning_rate": 0.0001836867993732742, + "loss": 0.692, + "step": 1175 + }, + { + "epoch": 0.20906666666666668, + "grad_norm": 0.35332744570073377, + "learning_rate": 0.00018365526565677226, + "loss": 0.6204, + "step": 1176 + }, + { + "epoch": 0.20924444444444446, + "grad_norm": 0.3744133796064289, + "learning_rate": 0.00018362370420424068, + "loss": 0.6645, + "step": 1177 + }, + { + "epoch": 0.20942222222222223, + "grad_norm": 0.3834140013396747, + "learning_rate": 0.00018359211502614372, + "loss": 0.6731, + "step": 1178 + }, + { + "epoch": 0.2096, + "grad_norm": 0.3888578258105978, + "learning_rate": 0.00018356049813295476, + "loss": 0.65, + "step": 1179 + }, + { + "epoch": 0.20977777777777779, + "grad_norm": 0.38652518918507633, + "learning_rate": 0.00018352885353515653, + "loss": 0.6445, + "step": 1180 + }, + { + "epoch": 0.20995555555555556, + "grad_norm": 0.40431841752340153, + "learning_rate": 0.00018349718124324076, + "loss": 0.6865, + "step": 1181 + }, + { + "epoch": 0.21013333333333334, + "grad_norm": 0.3718903706123179, + "learning_rate": 0.00018346548126770847, + "loss": 0.6711, + "step": 1182 + }, + { + "epoch": 0.21031111111111112, + "grad_norm": 0.3612738256205751, + "learning_rate": 0.00018343375361906984, + "loss": 0.6569, + "step": 1183 + }, + { + "epoch": 0.2104888888888889, + "grad_norm": 0.37243505463992294, + "learning_rate": 0.00018340199830784422, + "loss": 0.6743, + "step": 1184 + }, + { + "epoch": 0.21066666666666667, + "grad_norm": 0.3927398200160076, + "learning_rate": 0.00018337021534456014, + "loss": 0.6656, + "step": 1185 + }, + { + "epoch": 0.21084444444444445, + "grad_norm": 0.36802493665053415, + "learning_rate": 0.00018333840473975526, + "loss": 0.6839, + "step": 1186 + }, + { + "epoch": 0.21102222222222222, + "grad_norm": 0.3834102880243585, + "learning_rate": 0.00018330656650397646, + "loss": 0.6849, + "step": 1187 + }, + { + "epoch": 0.2112, + "grad_norm": 0.38712689986378573, + "learning_rate": 0.00018327470064777974, + "loss": 0.6995, + "step": 1188 + }, + { + "epoch": 0.21137777777777778, + "grad_norm": 0.3621021624951718, + "learning_rate": 0.0001832428071817303, + "loss": 0.7123, + "step": 1189 + }, + { + "epoch": 0.21155555555555555, + "grad_norm": 0.40037133971448435, + "learning_rate": 0.00018321088611640245, + "loss": 0.6669, + "step": 1190 + }, + { + "epoch": 0.21173333333333333, + "grad_norm": 0.3958701078086994, + "learning_rate": 0.00018317893746237963, + "loss": 0.7173, + "step": 1191 + }, + { + "epoch": 0.2119111111111111, + "grad_norm": 0.3851579617459599, + "learning_rate": 0.00018314696123025454, + "loss": 0.6623, + "step": 1192 + }, + { + "epoch": 0.21208888888888888, + "grad_norm": 0.3890405939105116, + "learning_rate": 0.00018311495743062887, + "loss": 0.6764, + "step": 1193 + }, + { + "epoch": 0.21226666666666666, + "grad_norm": 0.37172193882892485, + "learning_rate": 0.0001830829260741136, + "loss": 0.6954, + "step": 1194 + }, + { + "epoch": 0.21244444444444444, + "grad_norm": 0.3491154516506322, + "learning_rate": 0.00018305086717132873, + "loss": 0.6467, + "step": 1195 + }, + { + "epoch": 0.2126222222222222, + "grad_norm": 0.3767031029786711, + "learning_rate": 0.00018301878073290345, + "loss": 0.7115, + "step": 1196 + }, + { + "epoch": 0.2128, + "grad_norm": 0.4938887172264978, + "learning_rate": 0.00018298666676947606, + "loss": 0.6541, + "step": 1197 + }, + { + "epoch": 0.21297777777777777, + "grad_norm": 0.36307327022000824, + "learning_rate": 0.000182954525291694, + "loss": 0.709, + "step": 1198 + }, + { + "epoch": 0.21315555555555554, + "grad_norm": 0.3669493220248955, + "learning_rate": 0.0001829223563102138, + "loss": 0.6255, + "step": 1199 + }, + { + "epoch": 0.21333333333333335, + "grad_norm": 0.38854350315322106, + "learning_rate": 0.0001828901598357012, + "loss": 0.7025, + "step": 1200 + }, + { + "epoch": 0.21351111111111112, + "grad_norm": 0.42280080060878183, + "learning_rate": 0.00018285793587883092, + "loss": 0.7177, + "step": 1201 + }, + { + "epoch": 0.2136888888888889, + "grad_norm": 0.3828164078513971, + "learning_rate": 0.0001828256844502869, + "loss": 0.707, + "step": 1202 + }, + { + "epoch": 0.21386666666666668, + "grad_norm": 0.3730267894679604, + "learning_rate": 0.00018279340556076216, + "loss": 0.6724, + "step": 1203 + }, + { + "epoch": 0.21404444444444445, + "grad_norm": 0.3543769112513274, + "learning_rate": 0.00018276109922095877, + "loss": 0.6726, + "step": 1204 + }, + { + "epoch": 0.21422222222222223, + "grad_norm": 0.35576667055637556, + "learning_rate": 0.00018272876544158794, + "loss": 0.6317, + "step": 1205 + }, + { + "epoch": 0.2144, + "grad_norm": 0.366934105456551, + "learning_rate": 0.00018269640423337007, + "loss": 0.6926, + "step": 1206 + }, + { + "epoch": 0.21457777777777778, + "grad_norm": 0.3760879476029945, + "learning_rate": 0.0001826640156070345, + "loss": 0.6379, + "step": 1207 + }, + { + "epoch": 0.21475555555555556, + "grad_norm": 0.3519381009965438, + "learning_rate": 0.0001826315995733197, + "loss": 0.6696, + "step": 1208 + }, + { + "epoch": 0.21493333333333334, + "grad_norm": 0.35780396962791067, + "learning_rate": 0.0001825991561429733, + "loss": 0.6456, + "step": 1209 + }, + { + "epoch": 0.21511111111111111, + "grad_norm": 0.4091743342317021, + "learning_rate": 0.00018256668532675197, + "loss": 0.6812, + "step": 1210 + }, + { + "epoch": 0.2152888888888889, + "grad_norm": 0.36528173330022046, + "learning_rate": 0.0001825341871354215, + "loss": 0.6424, + "step": 1211 + }, + { + "epoch": 0.21546666666666667, + "grad_norm": 0.38570555070167545, + "learning_rate": 0.00018250166157975661, + "loss": 0.7056, + "step": 1212 + }, + { + "epoch": 0.21564444444444444, + "grad_norm": 0.38738495633750414, + "learning_rate": 0.00018246910867054125, + "loss": 0.682, + "step": 1213 + }, + { + "epoch": 0.21582222222222222, + "grad_norm": 0.8338716476837322, + "learning_rate": 0.0001824365284185684, + "loss": 0.6588, + "step": 1214 + }, + { + "epoch": 0.216, + "grad_norm": 0.3464376813860515, + "learning_rate": 0.00018240392083464007, + "loss": 0.6623, + "step": 1215 + }, + { + "epoch": 0.21617777777777777, + "grad_norm": 0.3496356519858567, + "learning_rate": 0.00018237128592956737, + "loss": 0.6419, + "step": 1216 + }, + { + "epoch": 0.21635555555555555, + "grad_norm": 0.37612346679787867, + "learning_rate": 0.00018233862371417047, + "loss": 0.6733, + "step": 1217 + }, + { + "epoch": 0.21653333333333333, + "grad_norm": 0.36049843791702, + "learning_rate": 0.00018230593419927852, + "loss": 0.6382, + "step": 1218 + }, + { + "epoch": 0.2167111111111111, + "grad_norm": 0.387637216036029, + "learning_rate": 0.00018227321739572983, + "loss": 0.642, + "step": 1219 + }, + { + "epoch": 0.21688888888888888, + "grad_norm": 0.3861937665549605, + "learning_rate": 0.00018224047331437165, + "loss": 0.6769, + "step": 1220 + }, + { + "epoch": 0.21706666666666666, + "grad_norm": 0.36731108686428177, + "learning_rate": 0.0001822077019660604, + "loss": 0.7014, + "step": 1221 + }, + { + "epoch": 0.21724444444444443, + "grad_norm": 0.388699854849361, + "learning_rate": 0.00018217490336166144, + "loss": 0.7072, + "step": 1222 + }, + { + "epoch": 0.2174222222222222, + "grad_norm": 0.3819613871631891, + "learning_rate": 0.00018214207751204918, + "loss": 0.7297, + "step": 1223 + }, + { + "epoch": 0.2176, + "grad_norm": 0.3680423732001841, + "learning_rate": 0.00018210922442810708, + "loss": 0.6336, + "step": 1224 + }, + { + "epoch": 0.21777777777777776, + "grad_norm": 0.3376134165731433, + "learning_rate": 0.00018207634412072764, + "loss": 0.6477, + "step": 1225 + }, + { + "epoch": 0.21795555555555557, + "grad_norm": 0.37275274132289626, + "learning_rate": 0.0001820434366008124, + "loss": 0.6881, + "step": 1226 + }, + { + "epoch": 0.21813333333333335, + "grad_norm": 0.38555044919326564, + "learning_rate": 0.00018201050187927184, + "loss": 0.6754, + "step": 1227 + }, + { + "epoch": 0.21831111111111112, + "grad_norm": 0.3594542843830291, + "learning_rate": 0.00018197753996702557, + "loss": 0.6099, + "step": 1228 + }, + { + "epoch": 0.2184888888888889, + "grad_norm": 0.377758974478313, + "learning_rate": 0.00018194455087500218, + "loss": 0.6943, + "step": 1229 + }, + { + "epoch": 0.21866666666666668, + "grad_norm": 0.38270904518351395, + "learning_rate": 0.00018191153461413916, + "loss": 0.6981, + "step": 1230 + }, + { + "epoch": 0.21884444444444445, + "grad_norm": 0.36243414789220363, + "learning_rate": 0.00018187849119538318, + "loss": 0.6399, + "step": 1231 + }, + { + "epoch": 0.21902222222222223, + "grad_norm": 0.3555052323035816, + "learning_rate": 0.00018184542062968983, + "loss": 0.6332, + "step": 1232 + }, + { + "epoch": 0.2192, + "grad_norm": 0.3499507977114114, + "learning_rate": 0.00018181232292802365, + "loss": 0.6755, + "step": 1233 + }, + { + "epoch": 0.21937777777777778, + "grad_norm": 0.37627367466076955, + "learning_rate": 0.0001817791981013583, + "loss": 0.6751, + "step": 1234 + }, + { + "epoch": 0.21955555555555556, + "grad_norm": 0.3683451874640109, + "learning_rate": 0.00018174604616067632, + "loss": 0.6456, + "step": 1235 + }, + { + "epoch": 0.21973333333333334, + "grad_norm": 0.36823981386516685, + "learning_rate": 0.00018171286711696934, + "loss": 0.6662, + "step": 1236 + }, + { + "epoch": 0.2199111111111111, + "grad_norm": 0.3560313882981796, + "learning_rate": 0.00018167966098123786, + "loss": 0.6652, + "step": 1237 + }, + { + "epoch": 0.2200888888888889, + "grad_norm": 0.37564553359701064, + "learning_rate": 0.00018164642776449146, + "loss": 0.6878, + "step": 1238 + }, + { + "epoch": 0.22026666666666667, + "grad_norm": 0.38845477182844085, + "learning_rate": 0.00018161316747774864, + "loss": 0.7153, + "step": 1239 + }, + { + "epoch": 0.22044444444444444, + "grad_norm": 0.3743997347569959, + "learning_rate": 0.00018157988013203693, + "loss": 0.6192, + "step": 1240 + }, + { + "epoch": 0.22062222222222222, + "grad_norm": 0.3673109626908155, + "learning_rate": 0.00018154656573839275, + "loss": 0.6409, + "step": 1241 + }, + { + "epoch": 0.2208, + "grad_norm": 0.36651144212340137, + "learning_rate": 0.0001815132243078616, + "loss": 0.6669, + "step": 1242 + }, + { + "epoch": 0.22097777777777777, + "grad_norm": 0.3310886182609289, + "learning_rate": 0.00018147985585149784, + "loss": 0.6455, + "step": 1243 + }, + { + "epoch": 0.22115555555555555, + "grad_norm": 0.37941241054018465, + "learning_rate": 0.00018144646038036486, + "loss": 0.7017, + "step": 1244 + }, + { + "epoch": 0.22133333333333333, + "grad_norm": 0.38345594208676165, + "learning_rate": 0.00018141303790553495, + "loss": 0.6808, + "step": 1245 + }, + { + "epoch": 0.2215111111111111, + "grad_norm": 0.4088640247515805, + "learning_rate": 0.00018137958843808936, + "loss": 0.6985, + "step": 1246 + }, + { + "epoch": 0.22168888888888888, + "grad_norm": 0.36904643335418275, + "learning_rate": 0.0001813461119891184, + "loss": 0.7165, + "step": 1247 + }, + { + "epoch": 0.22186666666666666, + "grad_norm": 0.35382461144969835, + "learning_rate": 0.00018131260856972116, + "loss": 0.68, + "step": 1248 + }, + { + "epoch": 0.22204444444444443, + "grad_norm": 0.3531729011678622, + "learning_rate": 0.0001812790781910058, + "loss": 0.6658, + "step": 1249 + }, + { + "epoch": 0.2222222222222222, + "grad_norm": 0.35977107790245616, + "learning_rate": 0.0001812455208640893, + "loss": 0.6793, + "step": 1250 + }, + { + "epoch": 0.2224, + "grad_norm": 0.3518269563523556, + "learning_rate": 0.0001812119366000977, + "loss": 0.6331, + "step": 1251 + }, + { + "epoch": 0.2225777777777778, + "grad_norm": 0.36462610091755804, + "learning_rate": 0.00018117832541016587, + "loss": 0.7048, + "step": 1252 + }, + { + "epoch": 0.22275555555555557, + "grad_norm": 0.35318753293114574, + "learning_rate": 0.0001811446873054377, + "loss": 0.5774, + "step": 1253 + }, + { + "epoch": 0.22293333333333334, + "grad_norm": 0.35081977043426243, + "learning_rate": 0.0001811110222970659, + "loss": 0.65, + "step": 1254 + }, + { + "epoch": 0.22311111111111112, + "grad_norm": 0.39688865382910304, + "learning_rate": 0.00018107733039621223, + "loss": 0.6775, + "step": 1255 + }, + { + "epoch": 0.2232888888888889, + "grad_norm": 0.37686866681610026, + "learning_rate": 0.00018104361161404723, + "loss": 0.6703, + "step": 1256 + }, + { + "epoch": 0.22346666666666667, + "grad_norm": 0.36142850992294384, + "learning_rate": 0.00018100986596175046, + "loss": 0.6314, + "step": 1257 + }, + { + "epoch": 0.22364444444444445, + "grad_norm": 0.3798419234632831, + "learning_rate": 0.00018097609345051025, + "loss": 0.7103, + "step": 1258 + }, + { + "epoch": 0.22382222222222223, + "grad_norm": 0.3712502957307875, + "learning_rate": 0.00018094229409152402, + "loss": 0.6544, + "step": 1259 + }, + { + "epoch": 0.224, + "grad_norm": 0.36049302217826423, + "learning_rate": 0.00018090846789599798, + "loss": 0.7147, + "step": 1260 + }, + { + "epoch": 0.22417777777777778, + "grad_norm": 0.3523508248932008, + "learning_rate": 0.00018087461487514722, + "loss": 0.708, + "step": 1261 + }, + { + "epoch": 0.22435555555555556, + "grad_norm": 0.36155258534713897, + "learning_rate": 0.0001808407350401958, + "loss": 0.6691, + "step": 1262 + }, + { + "epoch": 0.22453333333333333, + "grad_norm": 0.3544347270915348, + "learning_rate": 0.0001808068284023766, + "loss": 0.6486, + "step": 1263 + }, + { + "epoch": 0.2247111111111111, + "grad_norm": 0.3555104595715303, + "learning_rate": 0.00018077289497293143, + "loss": 0.6257, + "step": 1264 + }, + { + "epoch": 0.2248888888888889, + "grad_norm": 0.356306142872019, + "learning_rate": 0.00018073893476311097, + "loss": 0.6621, + "step": 1265 + }, + { + "epoch": 0.22506666666666666, + "grad_norm": 0.3504699812872086, + "learning_rate": 0.00018070494778417477, + "loss": 0.6714, + "step": 1266 + }, + { + "epoch": 0.22524444444444444, + "grad_norm": 0.3889531451696137, + "learning_rate": 0.0001806709340473913, + "loss": 0.6766, + "step": 1267 + }, + { + "epoch": 0.22542222222222222, + "grad_norm": 0.36801762972423285, + "learning_rate": 0.0001806368935640378, + "loss": 0.6408, + "step": 1268 + }, + { + "epoch": 0.2256, + "grad_norm": 0.3521496509119404, + "learning_rate": 0.00018060282634540053, + "loss": 0.6401, + "step": 1269 + }, + { + "epoch": 0.22577777777777777, + "grad_norm": 0.35311540887409437, + "learning_rate": 0.00018056873240277445, + "loss": 0.6915, + "step": 1270 + }, + { + "epoch": 0.22595555555555555, + "grad_norm": 0.3559641166594349, + "learning_rate": 0.0001805346117474635, + "loss": 0.6021, + "step": 1271 + }, + { + "epoch": 0.22613333333333333, + "grad_norm": 0.3591596263716163, + "learning_rate": 0.0001805004643907804, + "loss": 0.6685, + "step": 1272 + }, + { + "epoch": 0.2263111111111111, + "grad_norm": 0.3707216191459186, + "learning_rate": 0.0001804662903440468, + "loss": 0.6296, + "step": 1273 + }, + { + "epoch": 0.22648888888888888, + "grad_norm": 0.3548733735155643, + "learning_rate": 0.00018043208961859316, + "loss": 0.6536, + "step": 1274 + }, + { + "epoch": 0.22666666666666666, + "grad_norm": 0.36397318383873745, + "learning_rate": 0.00018039786222575873, + "loss": 0.6476, + "step": 1275 + }, + { + "epoch": 0.22684444444444443, + "grad_norm": 0.3748274906465537, + "learning_rate": 0.0001803636081768917, + "loss": 0.6453, + "step": 1276 + }, + { + "epoch": 0.2270222222222222, + "grad_norm": 0.3541633584386624, + "learning_rate": 0.00018032932748334902, + "loss": 0.6636, + "step": 1277 + }, + { + "epoch": 0.2272, + "grad_norm": 0.365831485623866, + "learning_rate": 0.00018029502015649647, + "loss": 0.6786, + "step": 1278 + }, + { + "epoch": 0.2273777777777778, + "grad_norm": 0.3720993619824261, + "learning_rate": 0.00018026068620770883, + "loss": 0.6317, + "step": 1279 + }, + { + "epoch": 0.22755555555555557, + "grad_norm": 0.36783510120315577, + "learning_rate": 0.00018022632564836948, + "loss": 0.6508, + "step": 1280 + }, + { + "epoch": 0.22773333333333334, + "grad_norm": 0.3834184736054038, + "learning_rate": 0.0001801919384898707, + "loss": 0.6199, + "step": 1281 + }, + { + "epoch": 0.22791111111111112, + "grad_norm": 0.341374217529724, + "learning_rate": 0.00018015752474361362, + "loss": 0.6355, + "step": 1282 + }, + { + "epoch": 0.2280888888888889, + "grad_norm": 0.3871502641429102, + "learning_rate": 0.00018012308442100824, + "loss": 0.6575, + "step": 1283 + }, + { + "epoch": 0.22826666666666667, + "grad_norm": 0.35645642383502985, + "learning_rate": 0.00018008861753347316, + "loss": 0.635, + "step": 1284 + }, + { + "epoch": 0.22844444444444445, + "grad_norm": 0.4270501104834336, + "learning_rate": 0.00018005412409243606, + "loss": 0.6521, + "step": 1285 + }, + { + "epoch": 0.22862222222222223, + "grad_norm": 0.39586136842836317, + "learning_rate": 0.0001800196041093332, + "loss": 0.6907, + "step": 1286 + }, + { + "epoch": 0.2288, + "grad_norm": 0.359522010241049, + "learning_rate": 0.0001799850575956098, + "loss": 0.6285, + "step": 1287 + }, + { + "epoch": 0.22897777777777778, + "grad_norm": 0.3874640161748343, + "learning_rate": 0.0001799504845627198, + "loss": 0.6892, + "step": 1288 + }, + { + "epoch": 0.22915555555555556, + "grad_norm": 0.36882034365720556, + "learning_rate": 0.0001799158850221259, + "loss": 0.6547, + "step": 1289 + }, + { + "epoch": 0.22933333333333333, + "grad_norm": 0.3820864627123511, + "learning_rate": 0.00017988125898529966, + "loss": 0.679, + "step": 1290 + }, + { + "epoch": 0.2295111111111111, + "grad_norm": 0.35281146556952625, + "learning_rate": 0.0001798466064637214, + "loss": 0.6781, + "step": 1291 + }, + { + "epoch": 0.2296888888888889, + "grad_norm": 0.3713894627884917, + "learning_rate": 0.00017981192746888017, + "loss": 0.6234, + "step": 1292 + }, + { + "epoch": 0.22986666666666666, + "grad_norm": 0.3644697301527836, + "learning_rate": 0.0001797772220122739, + "loss": 0.6649, + "step": 1293 + }, + { + "epoch": 0.23004444444444444, + "grad_norm": 0.3692195806384566, + "learning_rate": 0.0001797424901054092, + "loss": 0.6924, + "step": 1294 + }, + { + "epoch": 0.23022222222222222, + "grad_norm": 0.3868252632234773, + "learning_rate": 0.0001797077317598015, + "loss": 0.6935, + "step": 1295 + }, + { + "epoch": 0.2304, + "grad_norm": 0.39208838602648277, + "learning_rate": 0.000179672946986975, + "loss": 0.6657, + "step": 1296 + }, + { + "epoch": 0.23057777777777777, + "grad_norm": 0.4082130619109606, + "learning_rate": 0.0001796381357984626, + "loss": 0.7243, + "step": 1297 + }, + { + "epoch": 0.23075555555555555, + "grad_norm": 0.3662372728050405, + "learning_rate": 0.00017960329820580607, + "loss": 0.6405, + "step": 1298 + }, + { + "epoch": 0.23093333333333332, + "grad_norm": 0.36352467148714435, + "learning_rate": 0.0001795684342205558, + "loss": 0.642, + "step": 1299 + }, + { + "epoch": 0.2311111111111111, + "grad_norm": 0.40917616866185247, + "learning_rate": 0.000179533543854271, + "loss": 0.6205, + "step": 1300 + }, + { + "epoch": 0.23128888888888888, + "grad_norm": 0.3842333705992674, + "learning_rate": 0.00017949862711851965, + "loss": 0.627, + "step": 1301 + }, + { + "epoch": 0.23146666666666665, + "grad_norm": 0.3955545266735241, + "learning_rate": 0.00017946368402487845, + "loss": 0.6832, + "step": 1302 + }, + { + "epoch": 0.23164444444444443, + "grad_norm": 0.37563124393497516, + "learning_rate": 0.00017942871458493284, + "loss": 0.6338, + "step": 1303 + }, + { + "epoch": 0.23182222222222224, + "grad_norm": 0.3967110033775136, + "learning_rate": 0.00017939371881027697, + "loss": 0.6939, + "step": 1304 + }, + { + "epoch": 0.232, + "grad_norm": 0.3516919270429312, + "learning_rate": 0.00017935869671251378, + "loss": 0.6215, + "step": 1305 + }, + { + "epoch": 0.2321777777777778, + "grad_norm": 0.37401797171452594, + "learning_rate": 0.0001793236483032548, + "loss": 0.6512, + "step": 1306 + }, + { + "epoch": 0.23235555555555557, + "grad_norm": 0.3867622724788743, + "learning_rate": 0.0001792885735941205, + "loss": 0.6782, + "step": 1307 + }, + { + "epoch": 0.23253333333333334, + "grad_norm": 0.35924062182946515, + "learning_rate": 0.0001792534725967399, + "loss": 0.6773, + "step": 1308 + }, + { + "epoch": 0.23271111111111112, + "grad_norm": 0.3711545394429619, + "learning_rate": 0.00017921834532275076, + "loss": 0.6326, + "step": 1309 + }, + { + "epoch": 0.2328888888888889, + "grad_norm": 0.3562773430235455, + "learning_rate": 0.00017918319178379967, + "loss": 0.6511, + "step": 1310 + }, + { + "epoch": 0.23306666666666667, + "grad_norm": 0.36525151006923434, + "learning_rate": 0.00017914801199154175, + "loss": 0.6568, + "step": 1311 + }, + { + "epoch": 0.23324444444444445, + "grad_norm": 0.3701701674212604, + "learning_rate": 0.00017911280595764092, + "loss": 0.6281, + "step": 1312 + }, + { + "epoch": 0.23342222222222223, + "grad_norm": 0.39619610073025263, + "learning_rate": 0.00017907757369376985, + "loss": 0.7054, + "step": 1313 + }, + { + "epoch": 0.2336, + "grad_norm": 0.3634351419349522, + "learning_rate": 0.00017904231521160982, + "loss": 0.6867, + "step": 1314 + }, + { + "epoch": 0.23377777777777778, + "grad_norm": 0.38291421783768703, + "learning_rate": 0.00017900703052285084, + "loss": 0.6927, + "step": 1315 + }, + { + "epoch": 0.23395555555555556, + "grad_norm": 0.37420360371681227, + "learning_rate": 0.0001789717196391916, + "loss": 0.6726, + "step": 1316 + }, + { + "epoch": 0.23413333333333333, + "grad_norm": 0.3485924585311357, + "learning_rate": 0.00017893638257233943, + "loss": 0.605, + "step": 1317 + }, + { + "epoch": 0.2343111111111111, + "grad_norm": 0.3890514924650713, + "learning_rate": 0.00017890101933401047, + "loss": 0.684, + "step": 1318 + }, + { + "epoch": 0.23448888888888889, + "grad_norm": 0.37679602376150895, + "learning_rate": 0.0001788656299359294, + "loss": 0.6403, + "step": 1319 + }, + { + "epoch": 0.23466666666666666, + "grad_norm": 0.36588941825170834, + "learning_rate": 0.00017883021438982964, + "loss": 0.654, + "step": 1320 + }, + { + "epoch": 0.23484444444444444, + "grad_norm": 0.3762464975943449, + "learning_rate": 0.00017879477270745328, + "loss": 0.7041, + "step": 1321 + }, + { + "epoch": 0.23502222222222222, + "grad_norm": 0.36130975481183664, + "learning_rate": 0.00017875930490055106, + "loss": 0.6549, + "step": 1322 + }, + { + "epoch": 0.2352, + "grad_norm": 0.37374764256953785, + "learning_rate": 0.00017872381098088237, + "loss": 0.7119, + "step": 1323 + }, + { + "epoch": 0.23537777777777777, + "grad_norm": 0.41445183140938907, + "learning_rate": 0.00017868829096021527, + "loss": 0.7511, + "step": 1324 + }, + { + "epoch": 0.23555555555555555, + "grad_norm": 0.3426309998598964, + "learning_rate": 0.0001786527448503265, + "loss": 0.6733, + "step": 1325 + }, + { + "epoch": 0.23573333333333332, + "grad_norm": 0.3745962267587148, + "learning_rate": 0.0001786171726630014, + "loss": 0.6558, + "step": 1326 + }, + { + "epoch": 0.2359111111111111, + "grad_norm": 0.37760340600558523, + "learning_rate": 0.000178581574410034, + "loss": 0.7098, + "step": 1327 + }, + { + "epoch": 0.23608888888888888, + "grad_norm": 0.3631130973598564, + "learning_rate": 0.000178545950103227, + "loss": 0.687, + "step": 1328 + }, + { + "epoch": 0.23626666666666668, + "grad_norm": 0.37442239653839543, + "learning_rate": 0.00017851029975439158, + "loss": 0.6873, + "step": 1329 + }, + { + "epoch": 0.23644444444444446, + "grad_norm": 0.3544908461425953, + "learning_rate": 0.00017847462337534776, + "loss": 0.6434, + "step": 1330 + }, + { + "epoch": 0.23662222222222223, + "grad_norm": 0.37019313698788225, + "learning_rate": 0.00017843892097792408, + "loss": 0.6672, + "step": 1331 + }, + { + "epoch": 0.2368, + "grad_norm": 0.3760298925859323, + "learning_rate": 0.00017840319257395767, + "loss": 0.6366, + "step": 1332 + }, + { + "epoch": 0.2369777777777778, + "grad_norm": 0.40976279691863876, + "learning_rate": 0.0001783674381752944, + "loss": 0.6889, + "step": 1333 + }, + { + "epoch": 0.23715555555555556, + "grad_norm": 0.3582943729244857, + "learning_rate": 0.00017833165779378867, + "loss": 0.6395, + "step": 1334 + }, + { + "epoch": 0.23733333333333334, + "grad_norm": 0.36225126761600807, + "learning_rate": 0.00017829585144130356, + "loss": 0.6043, + "step": 1335 + }, + { + "epoch": 0.23751111111111112, + "grad_norm": 0.3537156017879337, + "learning_rate": 0.00017826001912971066, + "loss": 0.6637, + "step": 1336 + }, + { + "epoch": 0.2376888888888889, + "grad_norm": 0.36183506689018097, + "learning_rate": 0.00017822416087089025, + "loss": 0.6818, + "step": 1337 + }, + { + "epoch": 0.23786666666666667, + "grad_norm": 0.35941325243183436, + "learning_rate": 0.00017818827667673116, + "loss": 0.6767, + "step": 1338 + }, + { + "epoch": 0.23804444444444445, + "grad_norm": 0.36445531796959796, + "learning_rate": 0.00017815236655913092, + "loss": 0.6328, + "step": 1339 + }, + { + "epoch": 0.23822222222222222, + "grad_norm": 0.3687817642032925, + "learning_rate": 0.00017811643052999552, + "loss": 0.6465, + "step": 1340 + }, + { + "epoch": 0.2384, + "grad_norm": 0.3741595498147515, + "learning_rate": 0.0001780804686012396, + "loss": 0.6868, + "step": 1341 + }, + { + "epoch": 0.23857777777777778, + "grad_norm": 0.3891494595627898, + "learning_rate": 0.00017804448078478647, + "loss": 0.7196, + "step": 1342 + }, + { + "epoch": 0.23875555555555555, + "grad_norm": 0.36015019915748786, + "learning_rate": 0.0001780084670925679, + "loss": 0.603, + "step": 1343 + }, + { + "epoch": 0.23893333333333333, + "grad_norm": 0.3790785524684745, + "learning_rate": 0.00017797242753652423, + "loss": 0.6807, + "step": 1344 + }, + { + "epoch": 0.2391111111111111, + "grad_norm": 0.3609255760488036, + "learning_rate": 0.00017793636212860449, + "loss": 0.6612, + "step": 1345 + }, + { + "epoch": 0.23928888888888888, + "grad_norm": 0.3460993330945266, + "learning_rate": 0.0001779002708807662, + "loss": 0.6644, + "step": 1346 + }, + { + "epoch": 0.23946666666666666, + "grad_norm": 0.39822585606614946, + "learning_rate": 0.00017786415380497553, + "loss": 0.6763, + "step": 1347 + }, + { + "epoch": 0.23964444444444444, + "grad_norm": 0.37981576095414493, + "learning_rate": 0.00017782801091320707, + "loss": 0.6523, + "step": 1348 + }, + { + "epoch": 0.23982222222222221, + "grad_norm": 0.37352954163328894, + "learning_rate": 0.00017779184221744404, + "loss": 0.6211, + "step": 1349 + }, + { + "epoch": 0.24, + "grad_norm": 0.3817468997975673, + "learning_rate": 0.0001777556477296783, + "loss": 0.6404, + "step": 1350 + }, + { + "epoch": 0.24017777777777777, + "grad_norm": 0.39381365196624535, + "learning_rate": 0.00017771942746191014, + "loss": 0.6974, + "step": 1351 + }, + { + "epoch": 0.24035555555555554, + "grad_norm": 0.4230517518256083, + "learning_rate": 0.00017768318142614845, + "loss": 0.714, + "step": 1352 + }, + { + "epoch": 0.24053333333333332, + "grad_norm": 0.390896453245426, + "learning_rate": 0.00017764690963441066, + "loss": 0.6898, + "step": 1353 + }, + { + "epoch": 0.2407111111111111, + "grad_norm": 0.3942472823419236, + "learning_rate": 0.00017761061209872273, + "loss": 0.7128, + "step": 1354 + }, + { + "epoch": 0.2408888888888889, + "grad_norm": 0.3706793167830517, + "learning_rate": 0.00017757428883111918, + "loss": 0.6757, + "step": 1355 + }, + { + "epoch": 0.24106666666666668, + "grad_norm": 0.35878355668735806, + "learning_rate": 0.00017753793984364306, + "loss": 0.652, + "step": 1356 + }, + { + "epoch": 0.24124444444444446, + "grad_norm": 0.5370187234823591, + "learning_rate": 0.0001775015651483459, + "loss": 0.6293, + "step": 1357 + }, + { + "epoch": 0.24142222222222223, + "grad_norm": 0.35793651468723764, + "learning_rate": 0.00017746516475728775, + "loss": 0.6644, + "step": 1358 + }, + { + "epoch": 0.2416, + "grad_norm": 0.3548400450923138, + "learning_rate": 0.0001774287386825373, + "loss": 0.6407, + "step": 1359 + }, + { + "epoch": 0.24177777777777779, + "grad_norm": 0.40336711959002175, + "learning_rate": 0.0001773922869361716, + "loss": 0.7308, + "step": 1360 + }, + { + "epoch": 0.24195555555555556, + "grad_norm": 0.34835124751478924, + "learning_rate": 0.00017735580953027636, + "loss": 0.6656, + "step": 1361 + }, + { + "epoch": 0.24213333333333334, + "grad_norm": 0.3603559317123436, + "learning_rate": 0.0001773193064769456, + "loss": 0.6582, + "step": 1362 + }, + { + "epoch": 0.24231111111111112, + "grad_norm": 0.349635256140617, + "learning_rate": 0.0001772827777882821, + "loss": 0.619, + "step": 1363 + }, + { + "epoch": 0.2424888888888889, + "grad_norm": 0.3533609881973546, + "learning_rate": 0.00017724622347639688, + "loss": 0.6738, + "step": 1364 + }, + { + "epoch": 0.24266666666666667, + "grad_norm": 0.35940222120923704, + "learning_rate": 0.00017720964355340962, + "loss": 0.6313, + "step": 1365 + }, + { + "epoch": 0.24284444444444445, + "grad_norm": 0.35559852737944414, + "learning_rate": 0.00017717303803144852, + "loss": 0.641, + "step": 1366 + }, + { + "epoch": 0.24302222222222222, + "grad_norm": 0.35218243384859543, + "learning_rate": 0.00017713640692265008, + "loss": 0.6587, + "step": 1367 + }, + { + "epoch": 0.2432, + "grad_norm": 0.3639913447558467, + "learning_rate": 0.00017709975023915949, + "loss": 0.6391, + "step": 1368 + }, + { + "epoch": 0.24337777777777778, + "grad_norm": 0.382021541566799, + "learning_rate": 0.00017706306799313026, + "loss": 0.6723, + "step": 1369 + }, + { + "epoch": 0.24355555555555555, + "grad_norm": 0.3728795948978041, + "learning_rate": 0.0001770263601967245, + "loss": 0.6647, + "step": 1370 + }, + { + "epoch": 0.24373333333333333, + "grad_norm": 0.37386094141962695, + "learning_rate": 0.00017698962686211268, + "loss": 0.6659, + "step": 1371 + }, + { + "epoch": 0.2439111111111111, + "grad_norm": 0.5491656199225733, + "learning_rate": 0.0001769528680014739, + "loss": 0.6503, + "step": 1372 + }, + { + "epoch": 0.24408888888888888, + "grad_norm": 0.3747091180279064, + "learning_rate": 0.00017691608362699546, + "loss": 0.6691, + "step": 1373 + }, + { + "epoch": 0.24426666666666666, + "grad_norm": 0.36053556512307666, + "learning_rate": 0.00017687927375087338, + "loss": 0.6693, + "step": 1374 + }, + { + "epoch": 0.24444444444444444, + "grad_norm": 0.35392200268040286, + "learning_rate": 0.000176842438385312, + "loss": 0.669, + "step": 1375 + }, + { + "epoch": 0.2446222222222222, + "grad_norm": 0.3515524141662454, + "learning_rate": 0.00017680557754252418, + "loss": 0.6504, + "step": 1376 + }, + { + "epoch": 0.2448, + "grad_norm": 0.3622308577856946, + "learning_rate": 0.00017676869123473113, + "loss": 0.64, + "step": 1377 + }, + { + "epoch": 0.24497777777777777, + "grad_norm": 0.3752560157308086, + "learning_rate": 0.00017673177947416258, + "loss": 0.6691, + "step": 1378 + }, + { + "epoch": 0.24515555555555554, + "grad_norm": 0.5522802115221203, + "learning_rate": 0.0001766948422730567, + "loss": 0.71, + "step": 1379 + }, + { + "epoch": 0.24533333333333332, + "grad_norm": 0.3590664599368048, + "learning_rate": 0.00017665787964366006, + "loss": 0.6683, + "step": 1380 + }, + { + "epoch": 0.24551111111111112, + "grad_norm": 0.3619576043511919, + "learning_rate": 0.00017662089159822765, + "loss": 0.6669, + "step": 1381 + }, + { + "epoch": 0.2456888888888889, + "grad_norm": 0.3410550269326608, + "learning_rate": 0.00017658387814902294, + "loss": 0.6366, + "step": 1382 + }, + { + "epoch": 0.24586666666666668, + "grad_norm": 0.3817131022287267, + "learning_rate": 0.00017654683930831783, + "loss": 0.6605, + "step": 1383 + }, + { + "epoch": 0.24604444444444445, + "grad_norm": 0.38586700251304207, + "learning_rate": 0.00017650977508839254, + "loss": 0.642, + "step": 1384 + }, + { + "epoch": 0.24622222222222223, + "grad_norm": 0.36425469664368254, + "learning_rate": 0.00017647268550153583, + "loss": 0.6774, + "step": 1385 + }, + { + "epoch": 0.2464, + "grad_norm": 0.38394910279537725, + "learning_rate": 0.00017643557056004473, + "loss": 0.7108, + "step": 1386 + }, + { + "epoch": 0.24657777777777778, + "grad_norm": 0.3777166576941319, + "learning_rate": 0.0001763984302762248, + "loss": 0.7045, + "step": 1387 + }, + { + "epoch": 0.24675555555555556, + "grad_norm": 0.3521410536631835, + "learning_rate": 0.00017636126466238995, + "loss": 0.6311, + "step": 1388 + }, + { + "epoch": 0.24693333333333334, + "grad_norm": 0.36195372691412486, + "learning_rate": 0.00017632407373086256, + "loss": 0.6433, + "step": 1389 + }, + { + "epoch": 0.24711111111111111, + "grad_norm": 0.3781227718722322, + "learning_rate": 0.0001762868574939732, + "loss": 0.683, + "step": 1390 + }, + { + "epoch": 0.2472888888888889, + "grad_norm": 0.3657836825496804, + "learning_rate": 0.0001762496159640611, + "loss": 0.6403, + "step": 1391 + }, + { + "epoch": 0.24746666666666667, + "grad_norm": 0.3980086528061227, + "learning_rate": 0.00017621234915347368, + "loss": 0.7235, + "step": 1392 + }, + { + "epoch": 0.24764444444444444, + "grad_norm": 0.39661278406014594, + "learning_rate": 0.00017617505707456682, + "loss": 0.6645, + "step": 1393 + }, + { + "epoch": 0.24782222222222222, + "grad_norm": 0.4017342434831154, + "learning_rate": 0.00017613773973970478, + "loss": 0.6488, + "step": 1394 + }, + { + "epoch": 0.248, + "grad_norm": 0.3844575421862563, + "learning_rate": 0.00017610039716126018, + "loss": 0.6454, + "step": 1395 + }, + { + "epoch": 0.24817777777777777, + "grad_norm": 0.36205821497124974, + "learning_rate": 0.00017606302935161395, + "loss": 0.6677, + "step": 1396 + }, + { + "epoch": 0.24835555555555555, + "grad_norm": 0.3778362662591267, + "learning_rate": 0.00017602563632315553, + "loss": 0.656, + "step": 1397 + }, + { + "epoch": 0.24853333333333333, + "grad_norm": 0.3503277617173709, + "learning_rate": 0.0001759882180882826, + "loss": 0.6334, + "step": 1398 + }, + { + "epoch": 0.2487111111111111, + "grad_norm": 0.38157367612851106, + "learning_rate": 0.00017595077465940118, + "loss": 0.6781, + "step": 1399 + }, + { + "epoch": 0.24888888888888888, + "grad_norm": 0.37426016440352167, + "learning_rate": 0.00017591330604892574, + "loss": 0.6494, + "step": 1400 + }, + { + "epoch": 0.24906666666666666, + "grad_norm": 0.3907342559241473, + "learning_rate": 0.0001758758122692791, + "loss": 0.6914, + "step": 1401 + }, + { + "epoch": 0.24924444444444444, + "grad_norm": 0.38123067361092744, + "learning_rate": 0.0001758382933328923, + "loss": 0.6141, + "step": 1402 + }, + { + "epoch": 0.2494222222222222, + "grad_norm": 0.4038856882325022, + "learning_rate": 0.00017580074925220487, + "loss": 0.7019, + "step": 1403 + }, + { + "epoch": 0.2496, + "grad_norm": 0.36669749830297804, + "learning_rate": 0.00017576318003966455, + "loss": 0.6717, + "step": 1404 + }, + { + "epoch": 0.24977777777777777, + "grad_norm": 0.39651044883944064, + "learning_rate": 0.0001757255857077275, + "loss": 0.6905, + "step": 1405 + }, + { + "epoch": 0.24995555555555554, + "grad_norm": 0.3686611708426602, + "learning_rate": 0.00017568796626885814, + "loss": 0.6423, + "step": 1406 + }, + { + "epoch": 0.2501333333333333, + "grad_norm": 0.37802649141783823, + "learning_rate": 0.0001756503217355293, + "loss": 0.6451, + "step": 1407 + }, + { + "epoch": 0.2503111111111111, + "grad_norm": 0.3887471225498165, + "learning_rate": 0.00017561265212022206, + "loss": 0.6882, + "step": 1408 + }, + { + "epoch": 0.25048888888888887, + "grad_norm": 0.3909425460101287, + "learning_rate": 0.00017557495743542585, + "loss": 0.6854, + "step": 1409 + }, + { + "epoch": 0.25066666666666665, + "grad_norm": 0.36337171937273804, + "learning_rate": 0.00017553723769363837, + "loss": 0.6894, + "step": 1410 + }, + { + "epoch": 0.2508444444444444, + "grad_norm": 0.3650226302164386, + "learning_rate": 0.00017549949290736566, + "loss": 0.6324, + "step": 1411 + }, + { + "epoch": 0.2510222222222222, + "grad_norm": 0.3758796481662887, + "learning_rate": 0.00017546172308912213, + "loss": 0.6883, + "step": 1412 + }, + { + "epoch": 0.2512, + "grad_norm": 0.3729612994454608, + "learning_rate": 0.00017542392825143033, + "loss": 0.6977, + "step": 1413 + }, + { + "epoch": 0.25137777777777776, + "grad_norm": 0.3581757788334183, + "learning_rate": 0.00017538610840682126, + "loss": 0.6138, + "step": 1414 + }, + { + "epoch": 0.25155555555555553, + "grad_norm": 0.37614567145559047, + "learning_rate": 0.0001753482635678341, + "loss": 0.6497, + "step": 1415 + }, + { + "epoch": 0.2517333333333333, + "grad_norm": 0.39467932742866707, + "learning_rate": 0.00017531039374701636, + "loss": 0.699, + "step": 1416 + }, + { + "epoch": 0.2519111111111111, + "grad_norm": 0.38233021829393277, + "learning_rate": 0.0001752724989569239, + "loss": 0.673, + "step": 1417 + }, + { + "epoch": 0.25208888888888886, + "grad_norm": 0.36017435952321125, + "learning_rate": 0.00017523457921012075, + "loss": 0.6234, + "step": 1418 + }, + { + "epoch": 0.25226666666666664, + "grad_norm": 0.3694854023500707, + "learning_rate": 0.00017519663451917925, + "loss": 0.6197, + "step": 1419 + }, + { + "epoch": 0.25244444444444447, + "grad_norm": 0.3778772339124559, + "learning_rate": 0.00017515866489668005, + "loss": 0.6179, + "step": 1420 + }, + { + "epoch": 0.25262222222222225, + "grad_norm": 0.36663263378993366, + "learning_rate": 0.000175120670355212, + "loss": 0.6412, + "step": 1421 + }, + { + "epoch": 0.2528, + "grad_norm": 0.4006414162543807, + "learning_rate": 0.00017508265090737226, + "loss": 0.6744, + "step": 1422 + }, + { + "epoch": 0.2529777777777778, + "grad_norm": 0.3807019198494924, + "learning_rate": 0.00017504460656576627, + "loss": 0.6377, + "step": 1423 + }, + { + "epoch": 0.2531555555555556, + "grad_norm": 0.38091276177551353, + "learning_rate": 0.00017500653734300764, + "loss": 0.6234, + "step": 1424 + }, + { + "epoch": 0.25333333333333335, + "grad_norm": 0.35036734350459603, + "learning_rate": 0.00017496844325171827, + "loss": 0.6522, + "step": 1425 + }, + { + "epoch": 0.25351111111111113, + "grad_norm": 0.3573699751241585, + "learning_rate": 0.00017493032430452842, + "loss": 0.6446, + "step": 1426 + }, + { + "epoch": 0.2536888888888889, + "grad_norm": 0.4047691076617589, + "learning_rate": 0.00017489218051407638, + "loss": 0.636, + "step": 1427 + }, + { + "epoch": 0.2538666666666667, + "grad_norm": 0.3844335227443174, + "learning_rate": 0.00017485401189300877, + "loss": 0.6827, + "step": 1428 + }, + { + "epoch": 0.25404444444444446, + "grad_norm": 0.37740262108710376, + "learning_rate": 0.0001748158184539805, + "loss": 0.6599, + "step": 1429 + }, + { + "epoch": 0.25422222222222224, + "grad_norm": 0.3784014533076855, + "learning_rate": 0.00017477760020965467, + "loss": 0.6817, + "step": 1430 + }, + { + "epoch": 0.2544, + "grad_norm": 0.3830865036609814, + "learning_rate": 0.00017473935717270258, + "loss": 0.6998, + "step": 1431 + }, + { + "epoch": 0.2545777777777778, + "grad_norm": 0.4155168127586694, + "learning_rate": 0.00017470108935580377, + "loss": 0.6993, + "step": 1432 + }, + { + "epoch": 0.25475555555555557, + "grad_norm": 0.37379696886003255, + "learning_rate": 0.000174662796771646, + "loss": 0.6519, + "step": 1433 + }, + { + "epoch": 0.25493333333333335, + "grad_norm": 0.3631844001484389, + "learning_rate": 0.0001746244794329252, + "loss": 0.6597, + "step": 1434 + }, + { + "epoch": 0.2551111111111111, + "grad_norm": 0.3649259411940409, + "learning_rate": 0.0001745861373523456, + "loss": 0.6249, + "step": 1435 + }, + { + "epoch": 0.2552888888888889, + "grad_norm": 0.36520081445665964, + "learning_rate": 0.0001745477705426195, + "loss": 0.6369, + "step": 1436 + }, + { + "epoch": 0.2554666666666667, + "grad_norm": 0.3768838955651367, + "learning_rate": 0.00017450937901646754, + "loss": 0.6751, + "step": 1437 + }, + { + "epoch": 0.25564444444444445, + "grad_norm": 0.3715771614697907, + "learning_rate": 0.00017447096278661844, + "loss": 0.6509, + "step": 1438 + }, + { + "epoch": 0.25582222222222223, + "grad_norm": 0.3693045630748999, + "learning_rate": 0.00017443252186580922, + "loss": 0.6625, + "step": 1439 + }, + { + "epoch": 0.256, + "grad_norm": 0.35314560598286976, + "learning_rate": 0.00017439405626678496, + "loss": 0.6471, + "step": 1440 + }, + { + "epoch": 0.2561777777777778, + "grad_norm": 0.3615879788144135, + "learning_rate": 0.00017435556600229902, + "loss": 0.6513, + "step": 1441 + }, + { + "epoch": 0.25635555555555556, + "grad_norm": 0.3635929590305757, + "learning_rate": 0.0001743170510851129, + "loss": 0.6301, + "step": 1442 + }, + { + "epoch": 0.25653333333333334, + "grad_norm": 0.37253753032719467, + "learning_rate": 0.00017427851152799627, + "loss": 0.6642, + "step": 1443 + }, + { + "epoch": 0.2567111111111111, + "grad_norm": 0.3637263808569345, + "learning_rate": 0.000174239947343727, + "loss": 0.656, + "step": 1444 + }, + { + "epoch": 0.2568888888888889, + "grad_norm": 0.3472046979233085, + "learning_rate": 0.0001742013585450911, + "loss": 0.6484, + "step": 1445 + }, + { + "epoch": 0.25706666666666667, + "grad_norm": 0.3866906466435054, + "learning_rate": 0.0001741627451448827, + "loss": 0.6714, + "step": 1446 + }, + { + "epoch": 0.25724444444444444, + "grad_norm": 0.35997096209171037, + "learning_rate": 0.0001741241071559042, + "loss": 0.6864, + "step": 1447 + }, + { + "epoch": 0.2574222222222222, + "grad_norm": 0.3817306868428725, + "learning_rate": 0.00017408544459096605, + "loss": 0.6751, + "step": 1448 + }, + { + "epoch": 0.2576, + "grad_norm": 0.3606513883553394, + "learning_rate": 0.00017404675746288687, + "loss": 0.6356, + "step": 1449 + }, + { + "epoch": 0.2577777777777778, + "grad_norm": 0.36828163234806877, + "learning_rate": 0.00017400804578449343, + "loss": 0.6605, + "step": 1450 + }, + { + "epoch": 0.25795555555555555, + "grad_norm": 0.36476502065513966, + "learning_rate": 0.00017396930956862068, + "loss": 0.6872, + "step": 1451 + }, + { + "epoch": 0.2581333333333333, + "grad_norm": 0.38090128506740734, + "learning_rate": 0.00017393054882811168, + "loss": 0.724, + "step": 1452 + }, + { + "epoch": 0.2583111111111111, + "grad_norm": 0.3684218693982843, + "learning_rate": 0.00017389176357581753, + "loss": 0.6441, + "step": 1453 + }, + { + "epoch": 0.2584888888888889, + "grad_norm": 0.373349333221224, + "learning_rate": 0.00017385295382459765, + "loss": 0.6777, + "step": 1454 + }, + { + "epoch": 0.25866666666666666, + "grad_norm": 0.3972388204495291, + "learning_rate": 0.0001738141195873194, + "loss": 0.7323, + "step": 1455 + }, + { + "epoch": 0.25884444444444443, + "grad_norm": 0.37455826557407396, + "learning_rate": 0.00017377526087685832, + "loss": 0.6939, + "step": 1456 + }, + { + "epoch": 0.2590222222222222, + "grad_norm": 0.36104000718727974, + "learning_rate": 0.0001737363777060981, + "loss": 0.6532, + "step": 1457 + }, + { + "epoch": 0.2592, + "grad_norm": 0.3782882968739907, + "learning_rate": 0.00017369747008793055, + "loss": 0.6211, + "step": 1458 + }, + { + "epoch": 0.25937777777777776, + "grad_norm": 0.3692613502820997, + "learning_rate": 0.00017365853803525552, + "loss": 0.7224, + "step": 1459 + }, + { + "epoch": 0.25955555555555554, + "grad_norm": 0.3765006906050503, + "learning_rate": 0.00017361958156098095, + "loss": 0.7001, + "step": 1460 + }, + { + "epoch": 0.2597333333333333, + "grad_norm": 0.37263894045485085, + "learning_rate": 0.00017358060067802295, + "loss": 0.6557, + "step": 1461 + }, + { + "epoch": 0.2599111111111111, + "grad_norm": 0.36340261864875034, + "learning_rate": 0.00017354159539930572, + "loss": 0.6379, + "step": 1462 + }, + { + "epoch": 0.26008888888888887, + "grad_norm": 0.36738390711537466, + "learning_rate": 0.00017350256573776148, + "loss": 0.6296, + "step": 1463 + }, + { + "epoch": 0.26026666666666665, + "grad_norm": 0.37171224467902014, + "learning_rate": 0.0001734635117063306, + "loss": 0.6472, + "step": 1464 + }, + { + "epoch": 0.2604444444444444, + "grad_norm": 0.3523571609245952, + "learning_rate": 0.00017342443331796147, + "loss": 0.6, + "step": 1465 + }, + { + "epoch": 0.2606222222222222, + "grad_norm": 0.3503674568803646, + "learning_rate": 0.0001733853305856106, + "loss": 0.6437, + "step": 1466 + }, + { + "epoch": 0.2608, + "grad_norm": 0.4598068073300456, + "learning_rate": 0.0001733462035222426, + "loss": 0.6657, + "step": 1467 + }, + { + "epoch": 0.26097777777777775, + "grad_norm": 0.3944663606343608, + "learning_rate": 0.00017330705214083005, + "loss": 0.7078, + "step": 1468 + }, + { + "epoch": 0.26115555555555553, + "grad_norm": 0.38551066923302413, + "learning_rate": 0.0001732678764543537, + "loss": 0.7162, + "step": 1469 + }, + { + "epoch": 0.2613333333333333, + "grad_norm": 0.416820964935935, + "learning_rate": 0.00017322867647580226, + "loss": 0.6674, + "step": 1470 + }, + { + "epoch": 0.26151111111111114, + "grad_norm": 0.3558581693437286, + "learning_rate": 0.00017318945221817255, + "loss": 0.6441, + "step": 1471 + }, + { + "epoch": 0.2616888888888889, + "grad_norm": 0.39317157498451755, + "learning_rate": 0.00017315020369446945, + "loss": 0.712, + "step": 1472 + }, + { + "epoch": 0.2618666666666667, + "grad_norm": 0.3591427705192805, + "learning_rate": 0.00017311093091770588, + "loss": 0.6934, + "step": 1473 + }, + { + "epoch": 0.26204444444444447, + "grad_norm": 0.36503262368115585, + "learning_rate": 0.00017307163390090278, + "loss": 0.6618, + "step": 1474 + }, + { + "epoch": 0.26222222222222225, + "grad_norm": 0.37082483529626886, + "learning_rate": 0.0001730323126570891, + "loss": 0.6392, + "step": 1475 + }, + { + "epoch": 0.2624, + "grad_norm": 0.41203787591515023, + "learning_rate": 0.0001729929671993019, + "loss": 0.6305, + "step": 1476 + }, + { + "epoch": 0.2625777777777778, + "grad_norm": 0.38123838480997874, + "learning_rate": 0.0001729535975405862, + "loss": 0.6706, + "step": 1477 + }, + { + "epoch": 0.2627555555555556, + "grad_norm": 0.36102807309951895, + "learning_rate": 0.0001729142036939951, + "loss": 0.6064, + "step": 1478 + }, + { + "epoch": 0.26293333333333335, + "grad_norm": 0.4107985758210903, + "learning_rate": 0.00017287478567258965, + "loss": 0.6272, + "step": 1479 + }, + { + "epoch": 0.26311111111111113, + "grad_norm": 0.39509203339984084, + "learning_rate": 0.00017283534348943897, + "loss": 0.6929, + "step": 1480 + }, + { + "epoch": 0.2632888888888889, + "grad_norm": 0.37924882565345963, + "learning_rate": 0.00017279587715762022, + "loss": 0.6803, + "step": 1481 + }, + { + "epoch": 0.2634666666666667, + "grad_norm": 0.34672286380281186, + "learning_rate": 0.00017275638669021846, + "loss": 0.6011, + "step": 1482 + }, + { + "epoch": 0.26364444444444446, + "grad_norm": 0.3497174725947891, + "learning_rate": 0.0001727168721003268, + "loss": 0.6181, + "step": 1483 + }, + { + "epoch": 0.26382222222222224, + "grad_norm": 0.3959601690508927, + "learning_rate": 0.00017267733340104645, + "loss": 0.6913, + "step": 1484 + }, + { + "epoch": 0.264, + "grad_norm": 0.3621608504877059, + "learning_rate": 0.00017263777060548644, + "loss": 0.6481, + "step": 1485 + }, + { + "epoch": 0.2641777777777778, + "grad_norm": 0.3838358113543457, + "learning_rate": 0.00017259818372676394, + "loss": 0.6555, + "step": 1486 + }, + { + "epoch": 0.26435555555555557, + "grad_norm": 0.36512524928546364, + "learning_rate": 0.00017255857277800396, + "loss": 0.6271, + "step": 1487 + }, + { + "epoch": 0.26453333333333334, + "grad_norm": 0.4078323042994446, + "learning_rate": 0.00017251893777233966, + "loss": 0.7055, + "step": 1488 + }, + { + "epoch": 0.2647111111111111, + "grad_norm": 0.37516990586718085, + "learning_rate": 0.000172479278722912, + "loss": 0.6598, + "step": 1489 + }, + { + "epoch": 0.2648888888888889, + "grad_norm": 0.40777636726708033, + "learning_rate": 0.00017243959564287008, + "loss": 0.6943, + "step": 1490 + }, + { + "epoch": 0.2650666666666667, + "grad_norm": 0.5733994481955946, + "learning_rate": 0.00017239988854537083, + "loss": 0.6834, + "step": 1491 + }, + { + "epoch": 0.26524444444444445, + "grad_norm": 0.38818957263294324, + "learning_rate": 0.00017236015744357918, + "loss": 0.6485, + "step": 1492 + }, + { + "epoch": 0.2654222222222222, + "grad_norm": 0.3658027457383031, + "learning_rate": 0.0001723204023506681, + "loss": 0.642, + "step": 1493 + }, + { + "epoch": 0.2656, + "grad_norm": 0.38183893139070835, + "learning_rate": 0.00017228062327981846, + "loss": 0.6471, + "step": 1494 + }, + { + "epoch": 0.2657777777777778, + "grad_norm": 0.37868478985993326, + "learning_rate": 0.000172240820244219, + "loss": 0.6574, + "step": 1495 + }, + { + "epoch": 0.26595555555555556, + "grad_norm": 0.35590937191998534, + "learning_rate": 0.0001722009932570665, + "loss": 0.6243, + "step": 1496 + }, + { + "epoch": 0.26613333333333333, + "grad_norm": 0.35817749953144884, + "learning_rate": 0.00017216114233156566, + "loss": 0.6694, + "step": 1497 + }, + { + "epoch": 0.2663111111111111, + "grad_norm": 0.3682189663181524, + "learning_rate": 0.00017212126748092916, + "loss": 0.6788, + "step": 1498 + }, + { + "epoch": 0.2664888888888889, + "grad_norm": 0.3702648109099891, + "learning_rate": 0.0001720813687183775, + "loss": 0.6544, + "step": 1499 + }, + { + "epoch": 0.26666666666666666, + "grad_norm": 0.3516858030245012, + "learning_rate": 0.0001720414460571392, + "loss": 0.6514, + "step": 1500 + }, + { + "epoch": 0.26684444444444444, + "grad_norm": 0.3970600285967603, + "learning_rate": 0.00017200149951045068, + "loss": 0.6893, + "step": 1501 + }, + { + "epoch": 0.2670222222222222, + "grad_norm": 0.36459457092143216, + "learning_rate": 0.00017196152909155628, + "loss": 0.6925, + "step": 1502 + }, + { + "epoch": 0.2672, + "grad_norm": 0.375708229588632, + "learning_rate": 0.0001719215348137083, + "loss": 0.6605, + "step": 1503 + }, + { + "epoch": 0.26737777777777777, + "grad_norm": 0.35337184864383386, + "learning_rate": 0.00017188151669016678, + "loss": 0.6554, + "step": 1504 + }, + { + "epoch": 0.26755555555555555, + "grad_norm": 0.3666502990516845, + "learning_rate": 0.00017184147473419992, + "loss": 0.6228, + "step": 1505 + }, + { + "epoch": 0.2677333333333333, + "grad_norm": 0.3684981108765006, + "learning_rate": 0.00017180140895908363, + "loss": 0.6186, + "step": 1506 + }, + { + "epoch": 0.2679111111111111, + "grad_norm": 0.36503252371701667, + "learning_rate": 0.00017176131937810175, + "loss": 0.6921, + "step": 1507 + }, + { + "epoch": 0.2680888888888889, + "grad_norm": 0.3782745121888272, + "learning_rate": 0.0001717212060045461, + "loss": 0.6557, + "step": 1508 + }, + { + "epoch": 0.26826666666666665, + "grad_norm": 0.3443728763321807, + "learning_rate": 0.00017168106885171632, + "loss": 0.6353, + "step": 1509 + }, + { + "epoch": 0.26844444444444443, + "grad_norm": 0.34783461804761934, + "learning_rate": 0.0001716409079329199, + "loss": 0.6489, + "step": 1510 + }, + { + "epoch": 0.2686222222222222, + "grad_norm": 0.380783713498233, + "learning_rate": 0.0001716007232614723, + "loss": 0.6725, + "step": 1511 + }, + { + "epoch": 0.2688, + "grad_norm": 0.35949483830814144, + "learning_rate": 0.0001715605148506968, + "loss": 0.6503, + "step": 1512 + }, + { + "epoch": 0.26897777777777776, + "grad_norm": 0.3787937251536407, + "learning_rate": 0.00017152028271392452, + "loss": 0.6362, + "step": 1513 + }, + { + "epoch": 0.26915555555555554, + "grad_norm": 0.35752388274199165, + "learning_rate": 0.00017148002686449455, + "loss": 0.6802, + "step": 1514 + }, + { + "epoch": 0.2693333333333333, + "grad_norm": 0.3631834986468229, + "learning_rate": 0.00017143974731575372, + "loss": 0.6426, + "step": 1515 + }, + { + "epoch": 0.2695111111111111, + "grad_norm": 0.3716725179700024, + "learning_rate": 0.00017139944408105676, + "loss": 0.6634, + "step": 1516 + }, + { + "epoch": 0.26968888888888887, + "grad_norm": 0.37151151693173484, + "learning_rate": 0.00017135911717376637, + "loss": 0.6523, + "step": 1517 + }, + { + "epoch": 0.26986666666666664, + "grad_norm": 0.3537830104493746, + "learning_rate": 0.0001713187666072529, + "loss": 0.6381, + "step": 1518 + }, + { + "epoch": 0.2700444444444444, + "grad_norm": 0.37948907695017964, + "learning_rate": 0.0001712783923948947, + "loss": 0.6873, + "step": 1519 + }, + { + "epoch": 0.2702222222222222, + "grad_norm": 0.3589776462448231, + "learning_rate": 0.00017123799455007785, + "loss": 0.6103, + "step": 1520 + }, + { + "epoch": 0.2704, + "grad_norm": 0.38306600167977467, + "learning_rate": 0.00017119757308619639, + "loss": 0.6686, + "step": 1521 + }, + { + "epoch": 0.27057777777777775, + "grad_norm": 0.38983566563127414, + "learning_rate": 0.000171157128016652, + "loss": 0.6767, + "step": 1522 + }, + { + "epoch": 0.2707555555555556, + "grad_norm": 0.40123979606818033, + "learning_rate": 0.00017111665935485443, + "loss": 0.6556, + "step": 1523 + }, + { + "epoch": 0.27093333333333336, + "grad_norm": 0.36046598015206405, + "learning_rate": 0.00017107616711422102, + "loss": 0.6521, + "step": 1524 + }, + { + "epoch": 0.27111111111111114, + "grad_norm": 0.3746026331035882, + "learning_rate": 0.00017103565130817714, + "loss": 0.6776, + "step": 1525 + }, + { + "epoch": 0.2712888888888889, + "grad_norm": 0.36511639433237236, + "learning_rate": 0.00017099511195015575, + "loss": 0.6519, + "step": 1526 + }, + { + "epoch": 0.2714666666666667, + "grad_norm": 0.3633731597608729, + "learning_rate": 0.00017095454905359785, + "loss": 0.6659, + "step": 1527 + }, + { + "epoch": 0.27164444444444447, + "grad_norm": 0.38796582885425795, + "learning_rate": 0.00017091396263195204, + "loss": 0.6976, + "step": 1528 + }, + { + "epoch": 0.27182222222222224, + "grad_norm": 0.38319622356798844, + "learning_rate": 0.00017087335269867483, + "loss": 0.6914, + "step": 1529 + }, + { + "epoch": 0.272, + "grad_norm": 0.386540050254295, + "learning_rate": 0.00017083271926723054, + "loss": 0.6741, + "step": 1530 + }, + { + "epoch": 0.2721777777777778, + "grad_norm": 0.35020812187299805, + "learning_rate": 0.00017079206235109124, + "loss": 0.7154, + "step": 1531 + }, + { + "epoch": 0.2723555555555556, + "grad_norm": 0.36149340273852243, + "learning_rate": 0.00017075138196373675, + "loss": 0.6468, + "step": 1532 + }, + { + "epoch": 0.27253333333333335, + "grad_norm": 0.3697273524960641, + "learning_rate": 0.00017071067811865476, + "loss": 0.6594, + "step": 1533 + }, + { + "epoch": 0.2727111111111111, + "grad_norm": 0.3639779555457213, + "learning_rate": 0.00017066995082934067, + "loss": 0.7021, + "step": 1534 + }, + { + "epoch": 0.2728888888888889, + "grad_norm": 0.3501220217072064, + "learning_rate": 0.00017062920010929767, + "loss": 0.6239, + "step": 1535 + }, + { + "epoch": 0.2730666666666667, + "grad_norm": 0.37338940495234596, + "learning_rate": 0.00017058842597203672, + "loss": 0.6722, + "step": 1536 + }, + { + "epoch": 0.27324444444444446, + "grad_norm": 0.3901137584685992, + "learning_rate": 0.00017054762843107658, + "loss": 0.6898, + "step": 1537 + }, + { + "epoch": 0.27342222222222223, + "grad_norm": 0.3940088695648882, + "learning_rate": 0.00017050680749994369, + "loss": 0.6576, + "step": 1538 + }, + { + "epoch": 0.2736, + "grad_norm": 0.3777716988422191, + "learning_rate": 0.0001704659631921723, + "loss": 0.6055, + "step": 1539 + }, + { + "epoch": 0.2737777777777778, + "grad_norm": 0.4085346731201099, + "learning_rate": 0.00017042509552130444, + "loss": 0.6651, + "step": 1540 + }, + { + "epoch": 0.27395555555555556, + "grad_norm": 0.3634068203641392, + "learning_rate": 0.00017038420450088981, + "loss": 0.6514, + "step": 1541 + }, + { + "epoch": 0.27413333333333334, + "grad_norm": 0.3529669987589849, + "learning_rate": 0.0001703432901444859, + "loss": 0.6088, + "step": 1542 + }, + { + "epoch": 0.2743111111111111, + "grad_norm": 0.38791716697186734, + "learning_rate": 0.00017030235246565795, + "loss": 0.6367, + "step": 1543 + }, + { + "epoch": 0.2744888888888889, + "grad_norm": 0.36447585530490223, + "learning_rate": 0.0001702613914779789, + "loss": 0.6303, + "step": 1544 + }, + { + "epoch": 0.27466666666666667, + "grad_norm": 0.3686450777894795, + "learning_rate": 0.00017022040719502933, + "loss": 0.598, + "step": 1545 + }, + { + "epoch": 0.27484444444444445, + "grad_norm": 0.36849929161762734, + "learning_rate": 0.0001701793996303978, + "loss": 0.6524, + "step": 1546 + }, + { + "epoch": 0.2750222222222222, + "grad_norm": 0.3645542029984298, + "learning_rate": 0.00017013836879768035, + "loss": 0.6406, + "step": 1547 + }, + { + "epoch": 0.2752, + "grad_norm": 0.35776886253842693, + "learning_rate": 0.00017009731471048081, + "loss": 0.6372, + "step": 1548 + }, + { + "epoch": 0.2753777777777778, + "grad_norm": 0.39784186436800856, + "learning_rate": 0.00017005623738241074, + "loss": 0.6988, + "step": 1549 + }, + { + "epoch": 0.27555555555555555, + "grad_norm": 0.3613603388204999, + "learning_rate": 0.00017001513682708938, + "loss": 0.6602, + "step": 1550 + }, + { + "epoch": 0.27573333333333333, + "grad_norm": 0.3643154210364165, + "learning_rate": 0.00016997401305814371, + "loss": 0.6586, + "step": 1551 + }, + { + "epoch": 0.2759111111111111, + "grad_norm": 0.3620172362211282, + "learning_rate": 0.00016993286608920833, + "loss": 0.6389, + "step": 1552 + }, + { + "epoch": 0.2760888888888889, + "grad_norm": 0.3478875358901487, + "learning_rate": 0.0001698916959339256, + "loss": 0.6715, + "step": 1553 + }, + { + "epoch": 0.27626666666666666, + "grad_norm": 0.4116150103202986, + "learning_rate": 0.00016985050260594556, + "loss": 0.6675, + "step": 1554 + }, + { + "epoch": 0.27644444444444444, + "grad_norm": 0.37958738781837387, + "learning_rate": 0.0001698092861189259, + "loss": 0.6438, + "step": 1555 + }, + { + "epoch": 0.2766222222222222, + "grad_norm": 0.3594664827463607, + "learning_rate": 0.00016976804648653204, + "loss": 0.6508, + "step": 1556 + }, + { + "epoch": 0.2768, + "grad_norm": 0.3867221588300807, + "learning_rate": 0.00016972678372243703, + "loss": 0.7147, + "step": 1557 + }, + { + "epoch": 0.27697777777777777, + "grad_norm": 0.37346307836797943, + "learning_rate": 0.00016968549784032155, + "loss": 0.6362, + "step": 1558 + }, + { + "epoch": 0.27715555555555554, + "grad_norm": 0.3929228850167982, + "learning_rate": 0.0001696441888538741, + "loss": 0.7053, + "step": 1559 + }, + { + "epoch": 0.2773333333333333, + "grad_norm": 0.3569171020247684, + "learning_rate": 0.0001696028567767906, + "loss": 0.647, + "step": 1560 + }, + { + "epoch": 0.2775111111111111, + "grad_norm": 0.3449446587472281, + "learning_rate": 0.0001695615016227749, + "loss": 0.6413, + "step": 1561 + }, + { + "epoch": 0.2776888888888889, + "grad_norm": 0.3529974545581637, + "learning_rate": 0.0001695201234055383, + "loss": 0.6408, + "step": 1562 + }, + { + "epoch": 0.27786666666666665, + "grad_norm": 0.3553907525760485, + "learning_rate": 0.0001694787221387998, + "loss": 0.644, + "step": 1563 + }, + { + "epoch": 0.2780444444444444, + "grad_norm": 0.35211306907171414, + "learning_rate": 0.00016943729783628608, + "loss": 0.6302, + "step": 1564 + }, + { + "epoch": 0.2782222222222222, + "grad_norm": 0.3540440729162601, + "learning_rate": 0.0001693958505117314, + "loss": 0.648, + "step": 1565 + }, + { + "epoch": 0.2784, + "grad_norm": 0.33210994154598633, + "learning_rate": 0.00016935438017887772, + "loss": 0.604, + "step": 1566 + }, + { + "epoch": 0.27857777777777776, + "grad_norm": 0.4525180092565884, + "learning_rate": 0.00016931288685147455, + "loss": 0.6771, + "step": 1567 + }, + { + "epoch": 0.27875555555555553, + "grad_norm": 0.3596065489704551, + "learning_rate": 0.00016927137054327908, + "loss": 0.6327, + "step": 1568 + }, + { + "epoch": 0.2789333333333333, + "grad_norm": 0.37794117734583466, + "learning_rate": 0.00016922983126805614, + "loss": 0.6861, + "step": 1569 + }, + { + "epoch": 0.2791111111111111, + "grad_norm": 0.38868813944137415, + "learning_rate": 0.0001691882690395781, + "loss": 0.6819, + "step": 1570 + }, + { + "epoch": 0.27928888888888886, + "grad_norm": 0.38780003968249405, + "learning_rate": 0.00016914668387162497, + "loss": 0.6881, + "step": 1571 + }, + { + "epoch": 0.27946666666666664, + "grad_norm": 0.37979083416409737, + "learning_rate": 0.00016910507577798443, + "loss": 0.6783, + "step": 1572 + }, + { + "epoch": 0.2796444444444444, + "grad_norm": 0.34529841350312207, + "learning_rate": 0.00016906344477245165, + "loss": 0.6259, + "step": 1573 + }, + { + "epoch": 0.27982222222222225, + "grad_norm": 0.3592364311448176, + "learning_rate": 0.00016902179086882948, + "loss": 0.6632, + "step": 1574 + }, + { + "epoch": 0.28, + "grad_norm": 0.36716844209260896, + "learning_rate": 0.00016898011408092832, + "loss": 0.649, + "step": 1575 + }, + { + "epoch": 0.2801777777777778, + "grad_norm": 0.36409500573185516, + "learning_rate": 0.00016893841442256618, + "loss": 0.6785, + "step": 1576 + }, + { + "epoch": 0.2803555555555556, + "grad_norm": 0.3582229703727218, + "learning_rate": 0.00016889669190756868, + "loss": 0.6174, + "step": 1577 + }, + { + "epoch": 0.28053333333333336, + "grad_norm": 0.3573340906124708, + "learning_rate": 0.0001688549465497689, + "loss": 0.6133, + "step": 1578 + }, + { + "epoch": 0.28071111111111113, + "grad_norm": 0.36536457632216834, + "learning_rate": 0.00016881317836300766, + "loss": 0.6649, + "step": 1579 + }, + { + "epoch": 0.2808888888888889, + "grad_norm": 0.3655700899178259, + "learning_rate": 0.00016877138736113323, + "loss": 0.6306, + "step": 1580 + }, + { + "epoch": 0.2810666666666667, + "grad_norm": 0.3511758005548785, + "learning_rate": 0.00016872957355800144, + "loss": 0.6881, + "step": 1581 + }, + { + "epoch": 0.28124444444444446, + "grad_norm": 0.36451707401360406, + "learning_rate": 0.0001686877369674758, + "loss": 0.6748, + "step": 1582 + }, + { + "epoch": 0.28142222222222224, + "grad_norm": 0.36792045896596826, + "learning_rate": 0.00016864587760342725, + "loss": 0.6641, + "step": 1583 + }, + { + "epoch": 0.2816, + "grad_norm": 0.40129163977490706, + "learning_rate": 0.00016860399547973431, + "loss": 0.6976, + "step": 1584 + }, + { + "epoch": 0.2817777777777778, + "grad_norm": 0.36823163820797866, + "learning_rate": 0.0001685620906102831, + "loss": 0.6791, + "step": 1585 + }, + { + "epoch": 0.28195555555555557, + "grad_norm": 0.37108838575009506, + "learning_rate": 0.0001685201630089672, + "loss": 0.6129, + "step": 1586 + }, + { + "epoch": 0.28213333333333335, + "grad_norm": 0.3738833425629442, + "learning_rate": 0.00016847821268968784, + "loss": 0.6854, + "step": 1587 + }, + { + "epoch": 0.2823111111111111, + "grad_norm": 0.3630864417219976, + "learning_rate": 0.00016843623966635366, + "loss": 0.6151, + "step": 1588 + }, + { + "epoch": 0.2824888888888889, + "grad_norm": 0.37306805915697583, + "learning_rate": 0.00016839424395288083, + "loss": 0.6859, + "step": 1589 + }, + { + "epoch": 0.2826666666666667, + "grad_norm": 0.36990542494596307, + "learning_rate": 0.00016835222556319315, + "loss": 0.6525, + "step": 1590 + }, + { + "epoch": 0.28284444444444445, + "grad_norm": 0.39720331269463327, + "learning_rate": 0.00016831018451122194, + "loss": 0.614, + "step": 1591 + }, + { + "epoch": 0.28302222222222223, + "grad_norm": 0.36559407445369607, + "learning_rate": 0.00016826812081090586, + "loss": 0.6612, + "step": 1592 + }, + { + "epoch": 0.2832, + "grad_norm": 0.4101962332697649, + "learning_rate": 0.00016822603447619127, + "loss": 0.7124, + "step": 1593 + }, + { + "epoch": 0.2833777777777778, + "grad_norm": 0.36202439944229253, + "learning_rate": 0.00016818392552103194, + "loss": 0.6556, + "step": 1594 + }, + { + "epoch": 0.28355555555555556, + "grad_norm": 0.36569188513336376, + "learning_rate": 0.00016814179395938913, + "loss": 0.6506, + "step": 1595 + }, + { + "epoch": 0.28373333333333334, + "grad_norm": 0.3543236669712553, + "learning_rate": 0.00016809963980523164, + "loss": 0.593, + "step": 1596 + }, + { + "epoch": 0.2839111111111111, + "grad_norm": 0.35247266506067887, + "learning_rate": 0.00016805746307253574, + "loss": 0.6726, + "step": 1597 + }, + { + "epoch": 0.2840888888888889, + "grad_norm": 0.3740907670356731, + "learning_rate": 0.00016801526377528523, + "loss": 0.7119, + "step": 1598 + }, + { + "epoch": 0.28426666666666667, + "grad_norm": 0.3987839060348559, + "learning_rate": 0.0001679730419274713, + "loss": 0.6501, + "step": 1599 + }, + { + "epoch": 0.28444444444444444, + "grad_norm": 0.40372348789282286, + "learning_rate": 0.00016793079754309268, + "loss": 0.713, + "step": 1600 + }, + { + "epoch": 0.2846222222222222, + "grad_norm": 0.36237509073160673, + "learning_rate": 0.00016788853063615556, + "loss": 0.6107, + "step": 1601 + }, + { + "epoch": 0.2848, + "grad_norm": 0.38266541140634475, + "learning_rate": 0.0001678462412206736, + "loss": 0.6672, + "step": 1602 + }, + { + "epoch": 0.2849777777777778, + "grad_norm": 0.35249444514464395, + "learning_rate": 0.00016780392931066792, + "loss": 0.6315, + "step": 1603 + }, + { + "epoch": 0.28515555555555555, + "grad_norm": 0.36639923282166315, + "learning_rate": 0.0001677615949201671, + "loss": 0.668, + "step": 1604 + }, + { + "epoch": 0.2853333333333333, + "grad_norm": 0.3454093182608668, + "learning_rate": 0.0001677192380632072, + "loss": 0.6568, + "step": 1605 + }, + { + "epoch": 0.2855111111111111, + "grad_norm": 0.3593673595788356, + "learning_rate": 0.00016767685875383162, + "loss": 0.6466, + "step": 1606 + }, + { + "epoch": 0.2856888888888889, + "grad_norm": 0.35355527121251307, + "learning_rate": 0.00016763445700609134, + "loss": 0.6494, + "step": 1607 + }, + { + "epoch": 0.28586666666666666, + "grad_norm": 0.367848583122886, + "learning_rate": 0.00016759203283404475, + "loss": 0.6844, + "step": 1608 + }, + { + "epoch": 0.28604444444444443, + "grad_norm": 0.36825423348572783, + "learning_rate": 0.00016754958625175758, + "loss": 0.612, + "step": 1609 + }, + { + "epoch": 0.2862222222222222, + "grad_norm": 0.37562533596910713, + "learning_rate": 0.0001675071172733031, + "loss": 0.7046, + "step": 1610 + }, + { + "epoch": 0.2864, + "grad_norm": 0.352316300311903, + "learning_rate": 0.000167464625912762, + "loss": 0.6341, + "step": 1611 + }, + { + "epoch": 0.28657777777777776, + "grad_norm": 0.36726691871719863, + "learning_rate": 0.00016742211218422225, + "loss": 0.6598, + "step": 1612 + }, + { + "epoch": 0.28675555555555554, + "grad_norm": 0.37202547875973163, + "learning_rate": 0.00016737957610177942, + "loss": 0.6544, + "step": 1613 + }, + { + "epoch": 0.2869333333333333, + "grad_norm": 0.39590029044820857, + "learning_rate": 0.0001673370176795364, + "loss": 0.6456, + "step": 1614 + }, + { + "epoch": 0.2871111111111111, + "grad_norm": 0.33959767179359196, + "learning_rate": 0.0001672944369316035, + "loss": 0.569, + "step": 1615 + }, + { + "epoch": 0.28728888888888887, + "grad_norm": 0.3844339484101618, + "learning_rate": 0.00016725183387209845, + "loss": 0.643, + "step": 1616 + }, + { + "epoch": 0.28746666666666665, + "grad_norm": 0.40312131507951543, + "learning_rate": 0.0001672092085151463, + "loss": 0.6893, + "step": 1617 + }, + { + "epoch": 0.2876444444444444, + "grad_norm": 0.38037957999982946, + "learning_rate": 0.00016716656087487959, + "loss": 0.6253, + "step": 1618 + }, + { + "epoch": 0.2878222222222222, + "grad_norm": 0.38918651766949613, + "learning_rate": 0.00016712389096543818, + "loss": 0.7049, + "step": 1619 + }, + { + "epoch": 0.288, + "grad_norm": 0.3903890800343581, + "learning_rate": 0.00016708119880096942, + "loss": 0.6865, + "step": 1620 + }, + { + "epoch": 0.28817777777777775, + "grad_norm": 0.40719651196996154, + "learning_rate": 0.00016703848439562785, + "loss": 0.6355, + "step": 1621 + }, + { + "epoch": 0.28835555555555553, + "grad_norm": 0.38801940260098977, + "learning_rate": 0.0001669957477635756, + "loss": 0.6919, + "step": 1622 + }, + { + "epoch": 0.2885333333333333, + "grad_norm": 0.39761241812893516, + "learning_rate": 0.00016695298891898202, + "loss": 0.6334, + "step": 1623 + }, + { + "epoch": 0.2887111111111111, + "grad_norm": 0.36975082129572434, + "learning_rate": 0.00016691020787602386, + "loss": 0.6486, + "step": 1624 + }, + { + "epoch": 0.28888888888888886, + "grad_norm": 0.36544262517956666, + "learning_rate": 0.00016686740464888521, + "loss": 0.6418, + "step": 1625 + }, + { + "epoch": 0.2890666666666667, + "grad_norm": 0.3741479104412605, + "learning_rate": 0.00016682457925175763, + "loss": 0.6447, + "step": 1626 + }, + { + "epoch": 0.28924444444444447, + "grad_norm": 0.36114846927211625, + "learning_rate": 0.0001667817316988399, + "loss": 0.659, + "step": 1627 + }, + { + "epoch": 0.28942222222222225, + "grad_norm": 0.37757858853180404, + "learning_rate": 0.00016673886200433818, + "loss": 0.6637, + "step": 1628 + }, + { + "epoch": 0.2896, + "grad_norm": 0.3790427429504027, + "learning_rate": 0.00016669597018246598, + "loss": 0.6589, + "step": 1629 + }, + { + "epoch": 0.2897777777777778, + "grad_norm": 0.3698428846175222, + "learning_rate": 0.00016665305624744415, + "loss": 0.664, + "step": 1630 + }, + { + "epoch": 0.2899555555555556, + "grad_norm": 0.35903265930230455, + "learning_rate": 0.00016661012021350092, + "loss": 0.6271, + "step": 1631 + }, + { + "epoch": 0.29013333333333335, + "grad_norm": 0.3991257430531067, + "learning_rate": 0.00016656716209487174, + "loss": 0.6621, + "step": 1632 + }, + { + "epoch": 0.29031111111111113, + "grad_norm": 0.35614584184602, + "learning_rate": 0.00016652418190579943, + "loss": 0.6283, + "step": 1633 + }, + { + "epoch": 0.2904888888888889, + "grad_norm": 0.35609420898718935, + "learning_rate": 0.00016648117966053418, + "loss": 0.6271, + "step": 1634 + }, + { + "epoch": 0.2906666666666667, + "grad_norm": 0.40418988531579475, + "learning_rate": 0.00016643815537333346, + "loss": 0.7326, + "step": 1635 + }, + { + "epoch": 0.29084444444444446, + "grad_norm": 0.3769344314020853, + "learning_rate": 0.00016639510905846195, + "loss": 0.6557, + "step": 1636 + }, + { + "epoch": 0.29102222222222224, + "grad_norm": 0.398178854541475, + "learning_rate": 0.00016635204073019183, + "loss": 0.7394, + "step": 1637 + }, + { + "epoch": 0.2912, + "grad_norm": 0.362286540124684, + "learning_rate": 0.00016630895040280238, + "loss": 0.678, + "step": 1638 + }, + { + "epoch": 0.2913777777777778, + "grad_norm": 0.35741926867254975, + "learning_rate": 0.00016626583809058033, + "loss": 0.6789, + "step": 1639 + }, + { + "epoch": 0.29155555555555557, + "grad_norm": 0.38382435380886026, + "learning_rate": 0.00016622270380781958, + "loss": 0.6815, + "step": 1640 + }, + { + "epoch": 0.29173333333333334, + "grad_norm": 0.3591472386834611, + "learning_rate": 0.00016617954756882144, + "loss": 0.6197, + "step": 1641 + }, + { + "epoch": 0.2919111111111111, + "grad_norm": 0.3830388642691941, + "learning_rate": 0.00016613636938789435, + "loss": 0.677, + "step": 1642 + }, + { + "epoch": 0.2920888888888889, + "grad_norm": 0.3582069695503815, + "learning_rate": 0.0001660931692793541, + "loss": 0.634, + "step": 1643 + }, + { + "epoch": 0.2922666666666667, + "grad_norm": 0.36018299074696664, + "learning_rate": 0.00016604994725752379, + "loss": 0.6863, + "step": 1644 + }, + { + "epoch": 0.29244444444444445, + "grad_norm": 0.3615518963205158, + "learning_rate": 0.00016600670333673375, + "loss": 0.6196, + "step": 1645 + }, + { + "epoch": 0.29262222222222223, + "grad_norm": 0.3503360141879489, + "learning_rate": 0.00016596343753132154, + "loss": 0.6427, + "step": 1646 + }, + { + "epoch": 0.2928, + "grad_norm": 0.4182386117411707, + "learning_rate": 0.000165920149855632, + "loss": 0.6454, + "step": 1647 + }, + { + "epoch": 0.2929777777777778, + "grad_norm": 0.3717216798529551, + "learning_rate": 0.00016587684032401732, + "loss": 0.6742, + "step": 1648 + }, + { + "epoch": 0.29315555555555556, + "grad_norm": 0.3719433656275183, + "learning_rate": 0.00016583350895083666, + "loss": 0.6499, + "step": 1649 + }, + { + "epoch": 0.29333333333333333, + "grad_norm": 0.38615751025353456, + "learning_rate": 0.00016579015575045677, + "loss": 0.6471, + "step": 1650 + }, + { + "epoch": 0.2935111111111111, + "grad_norm": 0.3715801680741638, + "learning_rate": 0.0001657467807372514, + "loss": 0.6622, + "step": 1651 + }, + { + "epoch": 0.2936888888888889, + "grad_norm": 0.34774148937984606, + "learning_rate": 0.00016570338392560155, + "loss": 0.6689, + "step": 1652 + }, + { + "epoch": 0.29386666666666666, + "grad_norm": 0.36754532374650367, + "learning_rate": 0.0001656599653298956, + "loss": 0.6772, + "step": 1653 + }, + { + "epoch": 0.29404444444444444, + "grad_norm": 0.3650260431663795, + "learning_rate": 0.000165616524964529, + "loss": 0.6497, + "step": 1654 + }, + { + "epoch": 0.2942222222222222, + "grad_norm": 0.35679064413053624, + "learning_rate": 0.00016557306284390445, + "loss": 0.6605, + "step": 1655 + }, + { + "epoch": 0.2944, + "grad_norm": 0.3707613764546627, + "learning_rate": 0.0001655295789824319, + "loss": 0.7071, + "step": 1656 + }, + { + "epoch": 0.29457777777777777, + "grad_norm": 0.3639229761322604, + "learning_rate": 0.00016548607339452853, + "loss": 0.6396, + "step": 1657 + }, + { + "epoch": 0.29475555555555555, + "grad_norm": 0.38098051320159065, + "learning_rate": 0.0001654425460946186, + "loss": 0.6301, + "step": 1658 + }, + { + "epoch": 0.2949333333333333, + "grad_norm": 0.37483174017176313, + "learning_rate": 0.00016539899709713373, + "loss": 0.6693, + "step": 1659 + }, + { + "epoch": 0.2951111111111111, + "grad_norm": 0.3614679507704923, + "learning_rate": 0.00016535542641651262, + "loss": 0.6871, + "step": 1660 + }, + { + "epoch": 0.2952888888888889, + "grad_norm": 0.3853837325412522, + "learning_rate": 0.0001653118340672012, + "loss": 0.6617, + "step": 1661 + }, + { + "epoch": 0.29546666666666666, + "grad_norm": 0.36349561612140896, + "learning_rate": 0.00016526822006365257, + "loss": 0.6277, + "step": 1662 + }, + { + "epoch": 0.29564444444444443, + "grad_norm": 0.3546970965262031, + "learning_rate": 0.00016522458442032702, + "loss": 0.6257, + "step": 1663 + }, + { + "epoch": 0.2958222222222222, + "grad_norm": 0.3932794704259257, + "learning_rate": 0.00016518092715169202, + "loss": 0.6413, + "step": 1664 + }, + { + "epoch": 0.296, + "grad_norm": 0.3804945926184189, + "learning_rate": 0.00016513724827222227, + "loss": 0.6533, + "step": 1665 + }, + { + "epoch": 0.29617777777777776, + "grad_norm": 0.39375658232461064, + "learning_rate": 0.00016509354779639944, + "loss": 0.6478, + "step": 1666 + }, + { + "epoch": 0.29635555555555554, + "grad_norm": 0.35285292293367065, + "learning_rate": 0.00016504982573871253, + "loss": 0.6374, + "step": 1667 + }, + { + "epoch": 0.2965333333333333, + "grad_norm": 0.3405577410350097, + "learning_rate": 0.0001650060821136577, + "loss": 0.6367, + "step": 1668 + }, + { + "epoch": 0.2967111111111111, + "grad_norm": 0.34877181340632607, + "learning_rate": 0.0001649623169357382, + "loss": 0.6531, + "step": 1669 + }, + { + "epoch": 0.29688888888888887, + "grad_norm": 0.34941513955424286, + "learning_rate": 0.00016491853021946443, + "loss": 0.6284, + "step": 1670 + }, + { + "epoch": 0.29706666666666665, + "grad_norm": 0.38174250521150227, + "learning_rate": 0.00016487472197935393, + "loss": 0.7331, + "step": 1671 + }, + { + "epoch": 0.2972444444444444, + "grad_norm": 0.359302494372468, + "learning_rate": 0.0001648308922299314, + "loss": 0.6425, + "step": 1672 + }, + { + "epoch": 0.2974222222222222, + "grad_norm": 0.34153908692702345, + "learning_rate": 0.0001647870409857287, + "loss": 0.6198, + "step": 1673 + }, + { + "epoch": 0.2976, + "grad_norm": 0.3575729020900639, + "learning_rate": 0.0001647431682612847, + "loss": 0.7009, + "step": 1674 + }, + { + "epoch": 0.29777777777777775, + "grad_norm": 0.3556820727438643, + "learning_rate": 0.0001646992740711455, + "loss": 0.5996, + "step": 1675 + }, + { + "epoch": 0.29795555555555553, + "grad_norm": 0.3842106915170282, + "learning_rate": 0.00016465535842986434, + "loss": 0.6785, + "step": 1676 + }, + { + "epoch": 0.2981333333333333, + "grad_norm": 0.37541548993651636, + "learning_rate": 0.0001646114213520014, + "loss": 0.6125, + "step": 1677 + }, + { + "epoch": 0.29831111111111114, + "grad_norm": 0.4604844744115064, + "learning_rate": 0.00016456746285212418, + "loss": 0.7025, + "step": 1678 + }, + { + "epoch": 0.2984888888888889, + "grad_norm": 0.3798955846806804, + "learning_rate": 0.00016452348294480716, + "loss": 0.6338, + "step": 1679 + }, + { + "epoch": 0.2986666666666667, + "grad_norm": 0.3685450257934339, + "learning_rate": 0.00016447948164463196, + "loss": 0.6488, + "step": 1680 + }, + { + "epoch": 0.29884444444444447, + "grad_norm": 0.3775056589359029, + "learning_rate": 0.00016443545896618723, + "loss": 0.6383, + "step": 1681 + }, + { + "epoch": 0.29902222222222224, + "grad_norm": 0.3771388170336481, + "learning_rate": 0.0001643914149240688, + "loss": 0.6656, + "step": 1682 + }, + { + "epoch": 0.2992, + "grad_norm": 0.36586734107527763, + "learning_rate": 0.00016434734953287955, + "loss": 0.6576, + "step": 1683 + }, + { + "epoch": 0.2993777777777778, + "grad_norm": 0.3776476015047614, + "learning_rate": 0.00016430326280722935, + "loss": 0.626, + "step": 1684 + }, + { + "epoch": 0.2995555555555556, + "grad_norm": 0.356129705155627, + "learning_rate": 0.00016425915476173532, + "loss": 0.6196, + "step": 1685 + }, + { + "epoch": 0.29973333333333335, + "grad_norm": 0.36271314440779684, + "learning_rate": 0.00016421502541102148, + "loss": 0.6887, + "step": 1686 + }, + { + "epoch": 0.29991111111111113, + "grad_norm": 0.36242908957370806, + "learning_rate": 0.000164170874769719, + "loss": 0.6398, + "step": 1687 + }, + { + "epoch": 0.3000888888888889, + "grad_norm": 0.36457516674878454, + "learning_rate": 0.0001641267028524661, + "loss": 0.6086, + "step": 1688 + }, + { + "epoch": 0.3002666666666667, + "grad_norm": 0.4072474505512138, + "learning_rate": 0.00016408250967390805, + "loss": 0.7279, + "step": 1689 + }, + { + "epoch": 0.30044444444444446, + "grad_norm": 0.3647595393407218, + "learning_rate": 0.00016403829524869719, + "loss": 0.6395, + "step": 1690 + }, + { + "epoch": 0.30062222222222224, + "grad_norm": 0.3468439266721444, + "learning_rate": 0.00016399405959149278, + "loss": 0.6153, + "step": 1691 + }, + { + "epoch": 0.3008, + "grad_norm": 0.3505758726562104, + "learning_rate": 0.00016394980271696133, + "loss": 0.6533, + "step": 1692 + }, + { + "epoch": 0.3009777777777778, + "grad_norm": 0.36954713137927453, + "learning_rate": 0.00016390552463977623, + "loss": 0.6853, + "step": 1693 + }, + { + "epoch": 0.30115555555555557, + "grad_norm": 0.4086642593848733, + "learning_rate": 0.0001638612253746179, + "loss": 0.5944, + "step": 1694 + }, + { + "epoch": 0.30133333333333334, + "grad_norm": 0.3728852684537193, + "learning_rate": 0.00016381690493617393, + "loss": 0.5958, + "step": 1695 + }, + { + "epoch": 0.3015111111111111, + "grad_norm": 0.37227237329823, + "learning_rate": 0.0001637725633391387, + "loss": 0.6364, + "step": 1696 + }, + { + "epoch": 0.3016888888888889, + "grad_norm": 0.3515896502389464, + "learning_rate": 0.00016372820059821388, + "loss": 0.6083, + "step": 1697 + }, + { + "epoch": 0.30186666666666667, + "grad_norm": 0.35558746860390805, + "learning_rate": 0.00016368381672810786, + "loss": 0.5973, + "step": 1698 + }, + { + "epoch": 0.30204444444444445, + "grad_norm": 0.39096283723134945, + "learning_rate": 0.00016363941174353628, + "loss": 0.697, + "step": 1699 + }, + { + "epoch": 0.3022222222222222, + "grad_norm": 0.3761212982411811, + "learning_rate": 0.00016359498565922165, + "loss": 0.6841, + "step": 1700 + }, + { + "epoch": 0.3024, + "grad_norm": 0.41384762557019056, + "learning_rate": 0.00016355053848989348, + "loss": 0.6274, + "step": 1701 + }, + { + "epoch": 0.3025777777777778, + "grad_norm": 0.3875701235326229, + "learning_rate": 0.00016350607025028834, + "loss": 0.6507, + "step": 1702 + }, + { + "epoch": 0.30275555555555556, + "grad_norm": 0.3523967540916701, + "learning_rate": 0.00016346158095514968, + "loss": 0.6093, + "step": 1703 + }, + { + "epoch": 0.30293333333333333, + "grad_norm": 0.37237022080565885, + "learning_rate": 0.00016341707061922803, + "loss": 0.6641, + "step": 1704 + }, + { + "epoch": 0.3031111111111111, + "grad_norm": 0.38752436104983895, + "learning_rate": 0.0001633725392572809, + "loss": 0.696, + "step": 1705 + }, + { + "epoch": 0.3032888888888889, + "grad_norm": 0.3644395139666274, + "learning_rate": 0.0001633279868840727, + "loss": 0.6243, + "step": 1706 + }, + { + "epoch": 0.30346666666666666, + "grad_norm": 0.3587656545832138, + "learning_rate": 0.00016328341351437478, + "loss": 0.6723, + "step": 1707 + }, + { + "epoch": 0.30364444444444444, + "grad_norm": 0.3805289941564587, + "learning_rate": 0.0001632388191629656, + "loss": 0.6629, + "step": 1708 + }, + { + "epoch": 0.3038222222222222, + "grad_norm": 0.3619187332038642, + "learning_rate": 0.0001631942038446304, + "loss": 0.6295, + "step": 1709 + }, + { + "epoch": 0.304, + "grad_norm": 0.39152956006743905, + "learning_rate": 0.00016314956757416154, + "loss": 0.6679, + "step": 1710 + }, + { + "epoch": 0.30417777777777777, + "grad_norm": 0.38136815429344956, + "learning_rate": 0.00016310491036635816, + "loss": 0.6561, + "step": 1711 + }, + { + "epoch": 0.30435555555555555, + "grad_norm": 0.3555085182286857, + "learning_rate": 0.0001630602322360265, + "loss": 0.6088, + "step": 1712 + }, + { + "epoch": 0.3045333333333333, + "grad_norm": 0.38749766289318294, + "learning_rate": 0.0001630155331979796, + "loss": 0.6496, + "step": 1713 + }, + { + "epoch": 0.3047111111111111, + "grad_norm": 0.37253326298486855, + "learning_rate": 0.0001629708132670375, + "loss": 0.6401, + "step": 1714 + }, + { + "epoch": 0.3048888888888889, + "grad_norm": 0.3604371225636213, + "learning_rate": 0.0001629260724580272, + "loss": 0.6479, + "step": 1715 + }, + { + "epoch": 0.30506666666666665, + "grad_norm": 0.36211094683281686, + "learning_rate": 0.00016288131078578258, + "loss": 0.671, + "step": 1716 + }, + { + "epoch": 0.30524444444444443, + "grad_norm": 0.3535167380697168, + "learning_rate": 0.0001628365282651444, + "loss": 0.6389, + "step": 1717 + }, + { + "epoch": 0.3054222222222222, + "grad_norm": 0.3674531278583611, + "learning_rate": 0.0001627917249109604, + "loss": 0.6864, + "step": 1718 + }, + { + "epoch": 0.3056, + "grad_norm": 0.36792805833937736, + "learning_rate": 0.0001627469007380852, + "loss": 0.6756, + "step": 1719 + }, + { + "epoch": 0.30577777777777776, + "grad_norm": 0.33062236265772743, + "learning_rate": 0.00016270205576138032, + "loss": 0.6255, + "step": 1720 + }, + { + "epoch": 0.30595555555555554, + "grad_norm": 0.36465488896191917, + "learning_rate": 0.00016265718999571415, + "loss": 0.6481, + "step": 1721 + }, + { + "epoch": 0.3061333333333333, + "grad_norm": 0.37826787882977797, + "learning_rate": 0.00016261230345596207, + "loss": 0.633, + "step": 1722 + }, + { + "epoch": 0.3063111111111111, + "grad_norm": 0.3764301507046969, + "learning_rate": 0.00016256739615700622, + "loss": 0.6795, + "step": 1723 + }, + { + "epoch": 0.30648888888888887, + "grad_norm": 0.36503316126263086, + "learning_rate": 0.0001625224681137357, + "loss": 0.65, + "step": 1724 + }, + { + "epoch": 0.30666666666666664, + "grad_norm": 0.37363993230389003, + "learning_rate": 0.00016247751934104647, + "loss": 0.6044, + "step": 1725 + }, + { + "epoch": 0.3068444444444444, + "grad_norm": 0.38272736827927206, + "learning_rate": 0.00016243254985384137, + "loss": 0.702, + "step": 1726 + }, + { + "epoch": 0.3070222222222222, + "grad_norm": 0.36838419648838244, + "learning_rate": 0.0001623875596670301, + "loss": 0.6142, + "step": 1727 + }, + { + "epoch": 0.3072, + "grad_norm": 0.3946411710598122, + "learning_rate": 0.0001623425487955292, + "loss": 0.6539, + "step": 1728 + }, + { + "epoch": 0.3073777777777778, + "grad_norm": 0.37169451135801346, + "learning_rate": 0.00016229751725426212, + "loss": 0.6668, + "step": 1729 + }, + { + "epoch": 0.3075555555555556, + "grad_norm": 0.37064746387431535, + "learning_rate": 0.00016225246505815916, + "loss": 0.6577, + "step": 1730 + }, + { + "epoch": 0.30773333333333336, + "grad_norm": 0.38655703582976847, + "learning_rate": 0.00016220739222215738, + "loss": 0.6644, + "step": 1731 + }, + { + "epoch": 0.30791111111111114, + "grad_norm": 0.36351047913657664, + "learning_rate": 0.0001621622987612008, + "loss": 0.601, + "step": 1732 + }, + { + "epoch": 0.3080888888888889, + "grad_norm": 0.36850199094876956, + "learning_rate": 0.00016211718469024019, + "loss": 0.6966, + "step": 1733 + }, + { + "epoch": 0.3082666666666667, + "grad_norm": 0.390154892595685, + "learning_rate": 0.0001620720500242332, + "loss": 0.6786, + "step": 1734 + }, + { + "epoch": 0.30844444444444447, + "grad_norm": 0.35824182930425935, + "learning_rate": 0.0001620268947781443, + "loss": 0.6472, + "step": 1735 + }, + { + "epoch": 0.30862222222222224, + "grad_norm": 0.35450289701538007, + "learning_rate": 0.0001619817189669448, + "loss": 0.6484, + "step": 1736 + }, + { + "epoch": 0.3088, + "grad_norm": 0.36600383412112, + "learning_rate": 0.00016193652260561279, + "loss": 0.6959, + "step": 1737 + }, + { + "epoch": 0.3089777777777778, + "grad_norm": 0.388772885514875, + "learning_rate": 0.0001618913057091332, + "loss": 0.6624, + "step": 1738 + }, + { + "epoch": 0.3091555555555556, + "grad_norm": 0.39064081120691746, + "learning_rate": 0.00016184606829249768, + "loss": 0.61, + "step": 1739 + }, + { + "epoch": 0.30933333333333335, + "grad_norm": 0.3542687929481353, + "learning_rate": 0.0001618008103707049, + "loss": 0.6312, + "step": 1740 + }, + { + "epoch": 0.3095111111111111, + "grad_norm": 0.3727749325508445, + "learning_rate": 0.0001617555319587601, + "loss": 0.6593, + "step": 1741 + }, + { + "epoch": 0.3096888888888889, + "grad_norm": 0.36140017898631316, + "learning_rate": 0.00016171023307167545, + "loss": 0.657, + "step": 1742 + }, + { + "epoch": 0.3098666666666667, + "grad_norm": 0.358738075807226, + "learning_rate": 0.00016166491372446984, + "loss": 0.6756, + "step": 1743 + }, + { + "epoch": 0.31004444444444446, + "grad_norm": 0.38091348380994894, + "learning_rate": 0.00016161957393216896, + "loss": 0.6673, + "step": 1744 + }, + { + "epoch": 0.31022222222222223, + "grad_norm": 0.38779010299402067, + "learning_rate": 0.0001615742137098053, + "loss": 0.6643, + "step": 1745 + }, + { + "epoch": 0.3104, + "grad_norm": 0.35724222431246083, + "learning_rate": 0.00016152883307241815, + "loss": 0.6564, + "step": 1746 + }, + { + "epoch": 0.3105777777777778, + "grad_norm": 0.36116431074929833, + "learning_rate": 0.00016148343203505346, + "loss": 0.7278, + "step": 1747 + }, + { + "epoch": 0.31075555555555556, + "grad_norm": 0.3629916030165536, + "learning_rate": 0.00016143801061276403, + "loss": 0.6386, + "step": 1748 + }, + { + "epoch": 0.31093333333333334, + "grad_norm": 0.395746393048325, + "learning_rate": 0.00016139256882060946, + "loss": 0.7123, + "step": 1749 + }, + { + "epoch": 0.3111111111111111, + "grad_norm": 0.3575212517637486, + "learning_rate": 0.00016134710667365596, + "loss": 0.6527, + "step": 1750 + }, + { + "epoch": 0.3112888888888889, + "grad_norm": 0.36500223084327904, + "learning_rate": 0.0001613016241869766, + "loss": 0.6502, + "step": 1751 + }, + { + "epoch": 0.31146666666666667, + "grad_norm": 0.37427191136196, + "learning_rate": 0.00016125612137565123, + "loss": 0.7022, + "step": 1752 + }, + { + "epoch": 0.31164444444444445, + "grad_norm": 0.3493185167633601, + "learning_rate": 0.0001612105982547663, + "loss": 0.5804, + "step": 1753 + }, + { + "epoch": 0.3118222222222222, + "grad_norm": 0.37903019466021276, + "learning_rate": 0.00016116505483941505, + "loss": 0.6641, + "step": 1754 + }, + { + "epoch": 0.312, + "grad_norm": 0.351886067019922, + "learning_rate": 0.0001611194911446976, + "loss": 0.6186, + "step": 1755 + }, + { + "epoch": 0.3121777777777778, + "grad_norm": 0.3608822364958209, + "learning_rate": 0.00016107390718572053, + "loss": 0.6128, + "step": 1756 + }, + { + "epoch": 0.31235555555555555, + "grad_norm": 0.36834705618611674, + "learning_rate": 0.0001610283029775973, + "loss": 0.6335, + "step": 1757 + }, + { + "epoch": 0.31253333333333333, + "grad_norm": 0.363195299854787, + "learning_rate": 0.0001609826785354481, + "loss": 0.6371, + "step": 1758 + }, + { + "epoch": 0.3127111111111111, + "grad_norm": 0.3639046661794866, + "learning_rate": 0.0001609370338743997, + "loss": 0.6385, + "step": 1759 + }, + { + "epoch": 0.3128888888888889, + "grad_norm": 0.37292250685136824, + "learning_rate": 0.00016089136900958577, + "loss": 0.6648, + "step": 1760 + }, + { + "epoch": 0.31306666666666666, + "grad_norm": 0.3728888359050662, + "learning_rate": 0.00016084568395614648, + "loss": 0.6558, + "step": 1761 + }, + { + "epoch": 0.31324444444444444, + "grad_norm": 0.36093429000488164, + "learning_rate": 0.00016079997872922878, + "loss": 0.6406, + "step": 1762 + }, + { + "epoch": 0.3134222222222222, + "grad_norm": 0.36252748591563455, + "learning_rate": 0.00016075425334398635, + "loss": 0.6511, + "step": 1763 + }, + { + "epoch": 0.3136, + "grad_norm": 0.37795241750301034, + "learning_rate": 0.00016070850781557948, + "loss": 0.6204, + "step": 1764 + }, + { + "epoch": 0.31377777777777777, + "grad_norm": 0.4232720202696655, + "learning_rate": 0.00016066274215917518, + "loss": 0.7007, + "step": 1765 + }, + { + "epoch": 0.31395555555555554, + "grad_norm": 0.3890812863566723, + "learning_rate": 0.00016061695638994715, + "loss": 0.6882, + "step": 1766 + }, + { + "epoch": 0.3141333333333333, + "grad_norm": 0.3816459860778058, + "learning_rate": 0.00016057115052307567, + "loss": 0.7129, + "step": 1767 + }, + { + "epoch": 0.3143111111111111, + "grad_norm": 0.5666069535202038, + "learning_rate": 0.00016052532457374777, + "loss": 0.6174, + "step": 1768 + }, + { + "epoch": 0.3144888888888889, + "grad_norm": 0.36688081306723974, + "learning_rate": 0.00016047947855715714, + "loss": 0.6964, + "step": 1769 + }, + { + "epoch": 0.31466666666666665, + "grad_norm": 0.34358146967871445, + "learning_rate": 0.00016043361248850406, + "loss": 0.5922, + "step": 1770 + }, + { + "epoch": 0.3148444444444444, + "grad_norm": 0.43275736691447925, + "learning_rate": 0.0001603877263829955, + "loss": 0.6407, + "step": 1771 + }, + { + "epoch": 0.3150222222222222, + "grad_norm": 0.3525551262689683, + "learning_rate": 0.0001603418202558451, + "loss": 0.6261, + "step": 1772 + }, + { + "epoch": 0.3152, + "grad_norm": 0.3651264953192697, + "learning_rate": 0.00016029589412227307, + "loss": 0.6257, + "step": 1773 + }, + { + "epoch": 0.31537777777777776, + "grad_norm": 0.3553831704757985, + "learning_rate": 0.00016024994799750632, + "loss": 0.6228, + "step": 1774 + }, + { + "epoch": 0.31555555555555553, + "grad_norm": 0.3745489016338945, + "learning_rate": 0.0001602039818967783, + "loss": 0.6568, + "step": 1775 + }, + { + "epoch": 0.3157333333333333, + "grad_norm": 0.4166500432344376, + "learning_rate": 0.0001601579958353292, + "loss": 0.6399, + "step": 1776 + }, + { + "epoch": 0.3159111111111111, + "grad_norm": 0.38709085436032326, + "learning_rate": 0.00016011198982840576, + "loss": 0.6704, + "step": 1777 + }, + { + "epoch": 0.31608888888888886, + "grad_norm": 0.3806685510090918, + "learning_rate": 0.0001600659638912613, + "loss": 0.6962, + "step": 1778 + }, + { + "epoch": 0.31626666666666664, + "grad_norm": 0.3520549535927781, + "learning_rate": 0.00016001991803915583, + "loss": 0.6566, + "step": 1779 + }, + { + "epoch": 0.3164444444444444, + "grad_norm": 0.37084815554146033, + "learning_rate": 0.00015997385228735592, + "loss": 0.6801, + "step": 1780 + }, + { + "epoch": 0.31662222222222225, + "grad_norm": 0.36193607226819585, + "learning_rate": 0.0001599277666511347, + "loss": 0.6717, + "step": 1781 + }, + { + "epoch": 0.3168, + "grad_norm": 0.36978156836532877, + "learning_rate": 0.00015988166114577198, + "loss": 0.6537, + "step": 1782 + }, + { + "epoch": 0.3169777777777778, + "grad_norm": 0.36933082120015776, + "learning_rate": 0.00015983553578655408, + "loss": 0.6583, + "step": 1783 + }, + { + "epoch": 0.3171555555555556, + "grad_norm": 0.3610825036347129, + "learning_rate": 0.00015978939058877394, + "loss": 0.6443, + "step": 1784 + }, + { + "epoch": 0.31733333333333336, + "grad_norm": 0.3716920478229916, + "learning_rate": 0.00015974322556773108, + "loss": 0.6619, + "step": 1785 + }, + { + "epoch": 0.31751111111111113, + "grad_norm": 0.39740997112701565, + "learning_rate": 0.00015969704073873157, + "loss": 0.6356, + "step": 1786 + }, + { + "epoch": 0.3176888888888889, + "grad_norm": 0.3853483598971617, + "learning_rate": 0.00015965083611708809, + "loss": 0.658, + "step": 1787 + }, + { + "epoch": 0.3178666666666667, + "grad_norm": 0.35427312769562114, + "learning_rate": 0.00015960461171811977, + "loss": 0.6257, + "step": 1788 + }, + { + "epoch": 0.31804444444444446, + "grad_norm": 0.3721771209526191, + "learning_rate": 0.00015955836755715249, + "loss": 0.6568, + "step": 1789 + }, + { + "epoch": 0.31822222222222224, + "grad_norm": 0.3476821094871773, + "learning_rate": 0.0001595121036495185, + "loss": 0.6294, + "step": 1790 + }, + { + "epoch": 0.3184, + "grad_norm": 0.3795396867231499, + "learning_rate": 0.00015946582001055668, + "loss": 0.6847, + "step": 1791 + }, + { + "epoch": 0.3185777777777778, + "grad_norm": 0.36512089149574023, + "learning_rate": 0.00015941951665561244, + "loss": 0.6763, + "step": 1792 + }, + { + "epoch": 0.31875555555555557, + "grad_norm": 0.4040724874737144, + "learning_rate": 0.00015937319360003773, + "loss": 0.6906, + "step": 1793 + }, + { + "epoch": 0.31893333333333335, + "grad_norm": 0.3762924143149502, + "learning_rate": 0.00015932685085919105, + "loss": 0.7092, + "step": 1794 + }, + { + "epoch": 0.3191111111111111, + "grad_norm": 0.3603653277849108, + "learning_rate": 0.00015928048844843738, + "loss": 0.6756, + "step": 1795 + }, + { + "epoch": 0.3192888888888889, + "grad_norm": 0.37668503947766757, + "learning_rate": 0.00015923410638314826, + "loss": 0.7037, + "step": 1796 + }, + { + "epoch": 0.3194666666666667, + "grad_norm": 0.4088187612134446, + "learning_rate": 0.0001591877046787017, + "loss": 0.7297, + "step": 1797 + }, + { + "epoch": 0.31964444444444445, + "grad_norm": 0.34933068244720505, + "learning_rate": 0.00015914128335048236, + "loss": 0.6241, + "step": 1798 + }, + { + "epoch": 0.31982222222222223, + "grad_norm": 0.4183559140427564, + "learning_rate": 0.00015909484241388117, + "loss": 0.6778, + "step": 1799 + }, + { + "epoch": 0.32, + "grad_norm": 0.6017798847895347, + "learning_rate": 0.00015904838188429574, + "loss": 0.6494, + "step": 1800 + }, + { + "epoch": 0.3201777777777778, + "grad_norm": 0.37336083054040586, + "learning_rate": 0.00015900190177713016, + "loss": 0.5924, + "step": 1801 + }, + { + "epoch": 0.32035555555555556, + "grad_norm": 0.4413415636910764, + "learning_rate": 0.00015895540210779494, + "loss": 0.6839, + "step": 1802 + }, + { + "epoch": 0.32053333333333334, + "grad_norm": 0.3733855261534981, + "learning_rate": 0.00015890888289170712, + "loss": 0.658, + "step": 1803 + }, + { + "epoch": 0.3207111111111111, + "grad_norm": 0.36384878107670715, + "learning_rate": 0.00015886234414429028, + "loss": 0.6882, + "step": 1804 + }, + { + "epoch": 0.3208888888888889, + "grad_norm": 0.37360750715798086, + "learning_rate": 0.00015881578588097431, + "loss": 0.595, + "step": 1805 + }, + { + "epoch": 0.32106666666666667, + "grad_norm": 0.3974141428210875, + "learning_rate": 0.00015876920811719577, + "loss": 0.6385, + "step": 1806 + }, + { + "epoch": 0.32124444444444444, + "grad_norm": 0.40345037870858486, + "learning_rate": 0.0001587226108683975, + "loss": 0.7248, + "step": 1807 + }, + { + "epoch": 0.3214222222222222, + "grad_norm": 0.3505421880151765, + "learning_rate": 0.00015867599415002895, + "loss": 0.6414, + "step": 1808 + }, + { + "epoch": 0.3216, + "grad_norm": 0.4125416949844472, + "learning_rate": 0.00015862935797754594, + "loss": 0.6434, + "step": 1809 + }, + { + "epoch": 0.3217777777777778, + "grad_norm": 0.37937903549654906, + "learning_rate": 0.00015858270236641077, + "loss": 0.6631, + "step": 1810 + }, + { + "epoch": 0.32195555555555555, + "grad_norm": 0.37299629316427885, + "learning_rate": 0.00015853602733209216, + "loss": 0.6556, + "step": 1811 + }, + { + "epoch": 0.3221333333333333, + "grad_norm": 0.3718028224852353, + "learning_rate": 0.0001584893328900653, + "loss": 0.6679, + "step": 1812 + }, + { + "epoch": 0.3223111111111111, + "grad_norm": 0.4010053897187422, + "learning_rate": 0.00015844261905581183, + "loss": 0.6169, + "step": 1813 + }, + { + "epoch": 0.3224888888888889, + "grad_norm": 0.3735438093247827, + "learning_rate": 0.00015839588584481976, + "loss": 0.6296, + "step": 1814 + }, + { + "epoch": 0.32266666666666666, + "grad_norm": 0.3677969231783713, + "learning_rate": 0.00015834913327258355, + "loss": 0.6525, + "step": 1815 + }, + { + "epoch": 0.32284444444444443, + "grad_norm": 0.3779629852696358, + "learning_rate": 0.0001583023613546041, + "loss": 0.6608, + "step": 1816 + }, + { + "epoch": 0.3230222222222222, + "grad_norm": 0.3733931946361051, + "learning_rate": 0.00015825557010638871, + "loss": 0.6304, + "step": 1817 + }, + { + "epoch": 0.3232, + "grad_norm": 0.35646577004806546, + "learning_rate": 0.00015820875954345112, + "loss": 0.6737, + "step": 1818 + }, + { + "epoch": 0.32337777777777776, + "grad_norm": 0.39482963686923433, + "learning_rate": 0.00015816192968131138, + "loss": 0.6465, + "step": 1819 + }, + { + "epoch": 0.32355555555555554, + "grad_norm": 0.37743364160341825, + "learning_rate": 0.00015811508053549606, + "loss": 0.6772, + "step": 1820 + }, + { + "epoch": 0.3237333333333333, + "grad_norm": 0.36428252499879177, + "learning_rate": 0.000158068212121538, + "loss": 0.6472, + "step": 1821 + }, + { + "epoch": 0.3239111111111111, + "grad_norm": 0.3528531852682479, + "learning_rate": 0.00015802132445497658, + "loss": 0.6596, + "step": 1822 + }, + { + "epoch": 0.32408888888888887, + "grad_norm": 0.37254030182342834, + "learning_rate": 0.00015797441755135738, + "loss": 0.6748, + "step": 1823 + }, + { + "epoch": 0.32426666666666665, + "grad_norm": 0.36942472368324847, + "learning_rate": 0.00015792749142623253, + "loss": 0.6551, + "step": 1824 + }, + { + "epoch": 0.3244444444444444, + "grad_norm": 0.3392271728880086, + "learning_rate": 0.00015788054609516044, + "loss": 0.6353, + "step": 1825 + }, + { + "epoch": 0.3246222222222222, + "grad_norm": 0.39079625838679477, + "learning_rate": 0.00015783358157370588, + "loss": 0.7011, + "step": 1826 + }, + { + "epoch": 0.3248, + "grad_norm": 0.36209126504254585, + "learning_rate": 0.00015778659787744, + "loss": 0.6629, + "step": 1827 + }, + { + "epoch": 0.32497777777777775, + "grad_norm": 0.35524844820269663, + "learning_rate": 0.00015773959502194039, + "loss": 0.6367, + "step": 1828 + }, + { + "epoch": 0.32515555555555553, + "grad_norm": 0.34709356704915956, + "learning_rate": 0.00015769257302279086, + "loss": 0.6643, + "step": 1829 + }, + { + "epoch": 0.3253333333333333, + "grad_norm": 0.36694795171343575, + "learning_rate": 0.0001576455318955816, + "loss": 0.663, + "step": 1830 + }, + { + "epoch": 0.3255111111111111, + "grad_norm": 0.3498280223812707, + "learning_rate": 0.0001575984716559092, + "loss": 0.6298, + "step": 1831 + }, + { + "epoch": 0.3256888888888889, + "grad_norm": 0.3774613788220334, + "learning_rate": 0.00015755139231937658, + "loss": 0.662, + "step": 1832 + }, + { + "epoch": 0.3258666666666667, + "grad_norm": 0.368998514643403, + "learning_rate": 0.00015750429390159294, + "loss": 0.6468, + "step": 1833 + }, + { + "epoch": 0.32604444444444447, + "grad_norm": 0.38546270014141504, + "learning_rate": 0.0001574571764181738, + "loss": 0.6975, + "step": 1834 + }, + { + "epoch": 0.32622222222222225, + "grad_norm": 0.3766520943968019, + "learning_rate": 0.00015741003988474107, + "loss": 0.6303, + "step": 1835 + }, + { + "epoch": 0.3264, + "grad_norm": 0.3762880289124078, + "learning_rate": 0.00015736288431692294, + "loss": 0.6848, + "step": 1836 + }, + { + "epoch": 0.3265777777777778, + "grad_norm": 0.35878869094377924, + "learning_rate": 0.00015731570973035394, + "loss": 0.6632, + "step": 1837 + }, + { + "epoch": 0.3267555555555556, + "grad_norm": 0.3776189270689113, + "learning_rate": 0.00015726851614067475, + "loss": 0.6709, + "step": 1838 + }, + { + "epoch": 0.32693333333333335, + "grad_norm": 0.3696737850469688, + "learning_rate": 0.0001572213035635326, + "loss": 0.6274, + "step": 1839 + }, + { + "epoch": 0.32711111111111113, + "grad_norm": 0.36342965643412434, + "learning_rate": 0.00015717407201458087, + "loss": 0.6443, + "step": 1840 + }, + { + "epoch": 0.3272888888888889, + "grad_norm": 0.3569793187005551, + "learning_rate": 0.00015712682150947923, + "loss": 0.6664, + "step": 1841 + }, + { + "epoch": 0.3274666666666667, + "grad_norm": 0.3697345076612493, + "learning_rate": 0.00015707955206389367, + "loss": 0.6619, + "step": 1842 + }, + { + "epoch": 0.32764444444444446, + "grad_norm": 0.37888208154541403, + "learning_rate": 0.0001570322636934964, + "loss": 0.6812, + "step": 1843 + }, + { + "epoch": 0.32782222222222224, + "grad_norm": 0.3827436328962468, + "learning_rate": 0.00015698495641396602, + "loss": 0.6701, + "step": 1844 + }, + { + "epoch": 0.328, + "grad_norm": 0.3502815762468765, + "learning_rate": 0.00015693763024098728, + "loss": 0.6466, + "step": 1845 + }, + { + "epoch": 0.3281777777777778, + "grad_norm": 0.3788627369656457, + "learning_rate": 0.00015689028519025127, + "loss": 0.6322, + "step": 1846 + }, + { + "epoch": 0.32835555555555557, + "grad_norm": 0.3689993966092909, + "learning_rate": 0.0001568429212774553, + "loss": 0.6136, + "step": 1847 + }, + { + "epoch": 0.32853333333333334, + "grad_norm": 0.366200099038874, + "learning_rate": 0.00015679553851830297, + "loss": 0.6321, + "step": 1848 + }, + { + "epoch": 0.3287111111111111, + "grad_norm": 0.3749624111907132, + "learning_rate": 0.00015674813692850408, + "loss": 0.6315, + "step": 1849 + }, + { + "epoch": 0.3288888888888889, + "grad_norm": 0.3485067743402259, + "learning_rate": 0.00015670071652377468, + "loss": 0.6119, + "step": 1850 + }, + { + "epoch": 0.3290666666666667, + "grad_norm": 0.37490919937573847, + "learning_rate": 0.00015665327731983713, + "loss": 0.6245, + "step": 1851 + }, + { + "epoch": 0.32924444444444445, + "grad_norm": 0.3767796980955667, + "learning_rate": 0.00015660581933241993, + "loss": 0.6792, + "step": 1852 + }, + { + "epoch": 0.3294222222222222, + "grad_norm": 0.39562638401911415, + "learning_rate": 0.00015655834257725788, + "loss": 0.6522, + "step": 1853 + }, + { + "epoch": 0.3296, + "grad_norm": 0.3820660708180656, + "learning_rate": 0.00015651084707009192, + "loss": 0.6552, + "step": 1854 + }, + { + "epoch": 0.3297777777777778, + "grad_norm": 0.3967938699332167, + "learning_rate": 0.00015646333282666927, + "loss": 0.7019, + "step": 1855 + }, + { + "epoch": 0.32995555555555556, + "grad_norm": 0.3914165684715345, + "learning_rate": 0.0001564157998627434, + "loss": 0.6433, + "step": 1856 + }, + { + "epoch": 0.33013333333333333, + "grad_norm": 0.4681718778835079, + "learning_rate": 0.0001563682481940739, + "loss": 0.6338, + "step": 1857 + }, + { + "epoch": 0.3303111111111111, + "grad_norm": 0.3863744047782405, + "learning_rate": 0.00015632067783642658, + "loss": 0.679, + "step": 1858 + }, + { + "epoch": 0.3304888888888889, + "grad_norm": 0.36453496553135, + "learning_rate": 0.00015627308880557353, + "loss": 0.6552, + "step": 1859 + }, + { + "epoch": 0.33066666666666666, + "grad_norm": 0.36205310362923865, + "learning_rate": 0.00015622548111729286, + "loss": 0.6799, + "step": 1860 + }, + { + "epoch": 0.33084444444444444, + "grad_norm": 0.3371946679104084, + "learning_rate": 0.00015617785478736905, + "loss": 0.6309, + "step": 1861 + }, + { + "epoch": 0.3310222222222222, + "grad_norm": 0.36652767445020196, + "learning_rate": 0.00015613020983159265, + "loss": 0.6581, + "step": 1862 + }, + { + "epoch": 0.3312, + "grad_norm": 0.36055757358416896, + "learning_rate": 0.00015608254626576048, + "loss": 0.6039, + "step": 1863 + }, + { + "epoch": 0.33137777777777777, + "grad_norm": 0.3535281599186636, + "learning_rate": 0.00015603486410567535, + "loss": 0.6036, + "step": 1864 + }, + { + "epoch": 0.33155555555555555, + "grad_norm": 0.3808987736515503, + "learning_rate": 0.00015598716336714645, + "loss": 0.6435, + "step": 1865 + }, + { + "epoch": 0.3317333333333333, + "grad_norm": 0.35463981578553044, + "learning_rate": 0.00015593944406598896, + "loss": 0.6478, + "step": 1866 + }, + { + "epoch": 0.3319111111111111, + "grad_norm": 0.39314295836159147, + "learning_rate": 0.00015589170621802437, + "loss": 0.7147, + "step": 1867 + }, + { + "epoch": 0.3320888888888889, + "grad_norm": 0.3854939365192963, + "learning_rate": 0.00015584394983908018, + "loss": 0.6629, + "step": 1868 + }, + { + "epoch": 0.33226666666666665, + "grad_norm": 0.3672541368282204, + "learning_rate": 0.0001557961749449901, + "loss": 0.6558, + "step": 1869 + }, + { + "epoch": 0.33244444444444443, + "grad_norm": 0.35870691674772215, + "learning_rate": 0.00015574838155159396, + "loss": 0.6518, + "step": 1870 + }, + { + "epoch": 0.3326222222222222, + "grad_norm": 0.3623003718581855, + "learning_rate": 0.00015570056967473774, + "loss": 0.5724, + "step": 1871 + }, + { + "epoch": 0.3328, + "grad_norm": 0.395314412669602, + "learning_rate": 0.00015565273933027356, + "loss": 0.6518, + "step": 1872 + }, + { + "epoch": 0.33297777777777776, + "grad_norm": 0.36923396657196245, + "learning_rate": 0.0001556048905340596, + "loss": 0.6758, + "step": 1873 + }, + { + "epoch": 0.33315555555555554, + "grad_norm": 0.4146976645229107, + "learning_rate": 0.00015555702330196023, + "loss": 0.692, + "step": 1874 + }, + { + "epoch": 0.3333333333333333, + "grad_norm": 0.3637142555453114, + "learning_rate": 0.0001555091376498459, + "loss": 0.6324, + "step": 1875 + }, + { + "epoch": 0.3335111111111111, + "grad_norm": 0.36690442470997, + "learning_rate": 0.00015546123359359317, + "loss": 0.6225, + "step": 1876 + }, + { + "epoch": 0.33368888888888887, + "grad_norm": 0.3715761701911381, + "learning_rate": 0.00015541331114908469, + "loss": 0.5832, + "step": 1877 + }, + { + "epoch": 0.33386666666666664, + "grad_norm": 0.38519927693713374, + "learning_rate": 0.00015536537033220924, + "loss": 0.6874, + "step": 1878 + }, + { + "epoch": 0.3340444444444444, + "grad_norm": 0.3855424167318192, + "learning_rate": 0.00015531741115886165, + "loss": 0.6905, + "step": 1879 + }, + { + "epoch": 0.3342222222222222, + "grad_norm": 0.37280656368730114, + "learning_rate": 0.00015526943364494285, + "loss": 0.6543, + "step": 1880 + }, + { + "epoch": 0.3344, + "grad_norm": 0.3556684338620708, + "learning_rate": 0.0001552214378063599, + "loss": 0.6316, + "step": 1881 + }, + { + "epoch": 0.33457777777777775, + "grad_norm": 0.3518349307137234, + "learning_rate": 0.00015517342365902584, + "loss": 0.6429, + "step": 1882 + }, + { + "epoch": 0.33475555555555553, + "grad_norm": 0.3663619991362937, + "learning_rate": 0.0001551253912188599, + "loss": 0.64, + "step": 1883 + }, + { + "epoch": 0.33493333333333336, + "grad_norm": 0.3785449964233379, + "learning_rate": 0.0001550773405017872, + "loss": 0.6481, + "step": 1884 + }, + { + "epoch": 0.33511111111111114, + "grad_norm": 0.38437528154960754, + "learning_rate": 0.00015502927152373914, + "loss": 0.6993, + "step": 1885 + }, + { + "epoch": 0.3352888888888889, + "grad_norm": 0.36715399454178416, + "learning_rate": 0.000154981184300653, + "loss": 0.6627, + "step": 1886 + }, + { + "epoch": 0.3354666666666667, + "grad_norm": 0.35146008161049724, + "learning_rate": 0.0001549330788484722, + "loss": 0.6402, + "step": 1887 + }, + { + "epoch": 0.33564444444444447, + "grad_norm": 0.3773256190589999, + "learning_rate": 0.00015488495518314616, + "loss": 0.6962, + "step": 1888 + }, + { + "epoch": 0.33582222222222224, + "grad_norm": 0.3513135901792078, + "learning_rate": 0.00015483681332063035, + "loss": 0.6289, + "step": 1889 + }, + { + "epoch": 0.336, + "grad_norm": 0.3604762705030136, + "learning_rate": 0.0001547886532768863, + "loss": 0.6291, + "step": 1890 + }, + { + "epoch": 0.3361777777777778, + "grad_norm": 0.37906680708969126, + "learning_rate": 0.0001547404750678815, + "loss": 0.6263, + "step": 1891 + }, + { + "epoch": 0.3363555555555556, + "grad_norm": 0.3391435377844623, + "learning_rate": 0.00015469227870958956, + "loss": 0.6127, + "step": 1892 + }, + { + "epoch": 0.33653333333333335, + "grad_norm": 0.4009699122530538, + "learning_rate": 0.00015464406421799, + "loss": 0.6774, + "step": 1893 + }, + { + "epoch": 0.3367111111111111, + "grad_norm": 0.37537286923319574, + "learning_rate": 0.00015459583160906847, + "loss": 0.6431, + "step": 1894 + }, + { + "epoch": 0.3368888888888889, + "grad_norm": 0.3656307483971322, + "learning_rate": 0.0001545475808988165, + "loss": 0.6615, + "step": 1895 + }, + { + "epoch": 0.3370666666666667, + "grad_norm": 0.3744616486364853, + "learning_rate": 0.0001544993121032318, + "loss": 0.6204, + "step": 1896 + }, + { + "epoch": 0.33724444444444446, + "grad_norm": 0.4252186356379338, + "learning_rate": 0.0001544510252383178, + "loss": 0.6453, + "step": 1897 + }, + { + "epoch": 0.33742222222222223, + "grad_norm": 0.3627415682909013, + "learning_rate": 0.0001544027203200842, + "loss": 0.6251, + "step": 1898 + }, + { + "epoch": 0.3376, + "grad_norm": 0.37295138064373845, + "learning_rate": 0.00015435439736454653, + "loss": 0.6258, + "step": 1899 + }, + { + "epoch": 0.3377777777777778, + "grad_norm": 0.354490949120553, + "learning_rate": 0.00015430605638772633, + "loss": 0.6094, + "step": 1900 + }, + { + "epoch": 0.33795555555555556, + "grad_norm": 0.3854865153643987, + "learning_rate": 0.00015425769740565114, + "loss": 0.6086, + "step": 1901 + }, + { + "epoch": 0.33813333333333334, + "grad_norm": 0.3549083537040871, + "learning_rate": 0.00015420932043435447, + "loss": 0.63, + "step": 1902 + }, + { + "epoch": 0.3383111111111111, + "grad_norm": 0.38869600865278875, + "learning_rate": 0.00015416092548987576, + "loss": 0.6444, + "step": 1903 + }, + { + "epoch": 0.3384888888888889, + "grad_norm": 0.39882571898590363, + "learning_rate": 0.0001541125125882604, + "loss": 0.7058, + "step": 1904 + }, + { + "epoch": 0.33866666666666667, + "grad_norm": 0.38074544193279036, + "learning_rate": 0.00015406408174555976, + "loss": 0.6836, + "step": 1905 + }, + { + "epoch": 0.33884444444444445, + "grad_norm": 0.3782464583309121, + "learning_rate": 0.00015401563297783122, + "loss": 0.6966, + "step": 1906 + }, + { + "epoch": 0.3390222222222222, + "grad_norm": 0.37908419739217725, + "learning_rate": 0.000153967166301138, + "loss": 0.6565, + "step": 1907 + }, + { + "epoch": 0.3392, + "grad_norm": 0.3562745958188283, + "learning_rate": 0.00015391868173154932, + "loss": 0.6834, + "step": 1908 + }, + { + "epoch": 0.3393777777777778, + "grad_norm": 0.3709950236861562, + "learning_rate": 0.0001538701792851403, + "loss": 0.6368, + "step": 1909 + }, + { + "epoch": 0.33955555555555555, + "grad_norm": 0.3593647531273344, + "learning_rate": 0.00015382165897799197, + "loss": 0.6403, + "step": 1910 + }, + { + "epoch": 0.33973333333333333, + "grad_norm": 0.36660696672625154, + "learning_rate": 0.00015377312082619134, + "loss": 0.6572, + "step": 1911 + }, + { + "epoch": 0.3399111111111111, + "grad_norm": 0.3712075544687035, + "learning_rate": 0.00015372456484583134, + "loss": 0.5955, + "step": 1912 + }, + { + "epoch": 0.3400888888888889, + "grad_norm": 0.37010885752726813, + "learning_rate": 0.0001536759910530107, + "loss": 0.6087, + "step": 1913 + }, + { + "epoch": 0.34026666666666666, + "grad_norm": 0.3684350141830777, + "learning_rate": 0.0001536273994638342, + "loss": 0.6141, + "step": 1914 + }, + { + "epoch": 0.34044444444444444, + "grad_norm": 0.35812770243586833, + "learning_rate": 0.00015357879009441242, + "loss": 0.6304, + "step": 1915 + }, + { + "epoch": 0.3406222222222222, + "grad_norm": 0.3857271645225178, + "learning_rate": 0.0001535301629608619, + "loss": 0.6721, + "step": 1916 + }, + { + "epoch": 0.3408, + "grad_norm": 0.3948624529959523, + "learning_rate": 0.00015348151807930506, + "loss": 0.6069, + "step": 1917 + }, + { + "epoch": 0.34097777777777777, + "grad_norm": 0.37789159457866367, + "learning_rate": 0.00015343285546587013, + "loss": 0.682, + "step": 1918 + }, + { + "epoch": 0.34115555555555555, + "grad_norm": 0.383158960550221, + "learning_rate": 0.00015338417513669126, + "loss": 0.682, + "step": 1919 + }, + { + "epoch": 0.3413333333333333, + "grad_norm": 0.3809544849048687, + "learning_rate": 0.00015333547710790851, + "loss": 0.6805, + "step": 1920 + }, + { + "epoch": 0.3415111111111111, + "grad_norm": 0.35530043473199124, + "learning_rate": 0.0001532867613956678, + "loss": 0.6734, + "step": 1921 + }, + { + "epoch": 0.3416888888888889, + "grad_norm": 0.3692528600382463, + "learning_rate": 0.00015323802801612093, + "loss": 0.6705, + "step": 1922 + }, + { + "epoch": 0.34186666666666665, + "grad_norm": 0.3623339603031708, + "learning_rate": 0.00015318927698542543, + "loss": 0.6855, + "step": 1923 + }, + { + "epoch": 0.34204444444444443, + "grad_norm": 0.39204956678696956, + "learning_rate": 0.00015314050831974484, + "loss": 0.671, + "step": 1924 + }, + { + "epoch": 0.3422222222222222, + "grad_norm": 0.3417444395877495, + "learning_rate": 0.00015309172203524854, + "loss": 0.6216, + "step": 1925 + }, + { + "epoch": 0.3424, + "grad_norm": 0.3883036894526186, + "learning_rate": 0.0001530429181481116, + "loss": 0.6737, + "step": 1926 + }, + { + "epoch": 0.34257777777777776, + "grad_norm": 0.3590628431997195, + "learning_rate": 0.000152994096674515, + "loss": 0.6563, + "step": 1927 + }, + { + "epoch": 0.34275555555555554, + "grad_norm": 0.38054864056881726, + "learning_rate": 0.0001529452576306457, + "loss": 0.6054, + "step": 1928 + }, + { + "epoch": 0.3429333333333333, + "grad_norm": 0.3797573084893008, + "learning_rate": 0.00015289640103269625, + "loss": 0.7011, + "step": 1929 + }, + { + "epoch": 0.3431111111111111, + "grad_norm": 0.35130540715217634, + "learning_rate": 0.0001528475268968652, + "loss": 0.5918, + "step": 1930 + }, + { + "epoch": 0.34328888888888887, + "grad_norm": 0.3549938374042644, + "learning_rate": 0.0001527986352393568, + "loss": 0.6543, + "step": 1931 + }, + { + "epoch": 0.34346666666666664, + "grad_norm": 0.3743413391677129, + "learning_rate": 0.00015274972607638113, + "loss": 0.6457, + "step": 1932 + }, + { + "epoch": 0.3436444444444444, + "grad_norm": 0.3768501226984263, + "learning_rate": 0.00015270079942415418, + "loss": 0.5915, + "step": 1933 + }, + { + "epoch": 0.3438222222222222, + "grad_norm": 0.3761269969345535, + "learning_rate": 0.00015265185529889758, + "loss": 0.6547, + "step": 1934 + }, + { + "epoch": 0.344, + "grad_norm": 0.38614267511717193, + "learning_rate": 0.00015260289371683884, + "loss": 0.6642, + "step": 1935 + }, + { + "epoch": 0.3441777777777778, + "grad_norm": 0.3771827789566294, + "learning_rate": 0.00015255391469421128, + "loss": 0.6686, + "step": 1936 + }, + { + "epoch": 0.3443555555555556, + "grad_norm": 0.3905826974516789, + "learning_rate": 0.00015250491824725398, + "loss": 0.6985, + "step": 1937 + }, + { + "epoch": 0.34453333333333336, + "grad_norm": 0.36065501029392516, + "learning_rate": 0.00015245590439221172, + "loss": 0.6118, + "step": 1938 + }, + { + "epoch": 0.34471111111111113, + "grad_norm": 0.34729715327884514, + "learning_rate": 0.00015240687314533515, + "loss": 0.6258, + "step": 1939 + }, + { + "epoch": 0.3448888888888889, + "grad_norm": 0.37259047859861416, + "learning_rate": 0.00015235782452288068, + "loss": 0.6783, + "step": 1940 + }, + { + "epoch": 0.3450666666666667, + "grad_norm": 0.3814504904666184, + "learning_rate": 0.0001523087585411104, + "loss": 0.6575, + "step": 1941 + }, + { + "epoch": 0.34524444444444446, + "grad_norm": 0.3781408065572556, + "learning_rate": 0.0001522596752162923, + "loss": 0.6921, + "step": 1942 + }, + { + "epoch": 0.34542222222222224, + "grad_norm": 0.35917518608478877, + "learning_rate": 0.00015221057456469994, + "loss": 0.6535, + "step": 1943 + }, + { + "epoch": 0.3456, + "grad_norm": 0.3716373107186432, + "learning_rate": 0.00015216145660261273, + "loss": 0.6189, + "step": 1944 + }, + { + "epoch": 0.3457777777777778, + "grad_norm": 0.3587359557996415, + "learning_rate": 0.00015211232134631586, + "loss": 0.6339, + "step": 1945 + }, + { + "epoch": 0.34595555555555557, + "grad_norm": 0.36367310244560014, + "learning_rate": 0.00015206316881210015, + "loss": 0.6455, + "step": 1946 + }, + { + "epoch": 0.34613333333333335, + "grad_norm": 0.3495511971982955, + "learning_rate": 0.00015201399901626218, + "loss": 0.6328, + "step": 1947 + }, + { + "epoch": 0.3463111111111111, + "grad_norm": 0.37986954406834667, + "learning_rate": 0.0001519648119751043, + "loss": 0.6349, + "step": 1948 + }, + { + "epoch": 0.3464888888888889, + "grad_norm": 0.3864259166335274, + "learning_rate": 0.00015191560770493458, + "loss": 0.6292, + "step": 1949 + }, + { + "epoch": 0.3466666666666667, + "grad_norm": 0.3646330412340634, + "learning_rate": 0.00015186638622206674, + "loss": 0.7031, + "step": 1950 + }, + { + "epoch": 0.34684444444444446, + "grad_norm": 0.3480623094221179, + "learning_rate": 0.0001518171475428202, + "loss": 0.6254, + "step": 1951 + }, + { + "epoch": 0.34702222222222223, + "grad_norm": 0.36775586963200424, + "learning_rate": 0.00015176789168352018, + "loss": 0.6697, + "step": 1952 + }, + { + "epoch": 0.3472, + "grad_norm": 0.3599707140978756, + "learning_rate": 0.0001517186186604975, + "loss": 0.6287, + "step": 1953 + }, + { + "epoch": 0.3473777777777778, + "grad_norm": 0.36247623051311173, + "learning_rate": 0.0001516693284900887, + "loss": 0.6461, + "step": 1954 + }, + { + "epoch": 0.34755555555555556, + "grad_norm": 0.35170666984812476, + "learning_rate": 0.000151620021188636, + "loss": 0.6576, + "step": 1955 + }, + { + "epoch": 0.34773333333333334, + "grad_norm": 0.3524731712393581, + "learning_rate": 0.00015157069677248738, + "loss": 0.6521, + "step": 1956 + }, + { + "epoch": 0.3479111111111111, + "grad_norm": 0.35716231359740586, + "learning_rate": 0.00015152135525799633, + "loss": 0.6612, + "step": 1957 + }, + { + "epoch": 0.3480888888888889, + "grad_norm": 0.3515983779832405, + "learning_rate": 0.00015147199666152222, + "loss": 0.6659, + "step": 1958 + }, + { + "epoch": 0.34826666666666667, + "grad_norm": 0.39883077487011037, + "learning_rate": 0.00015142262099942982, + "loss": 0.6292, + "step": 1959 + }, + { + "epoch": 0.34844444444444445, + "grad_norm": 0.349348810324865, + "learning_rate": 0.00015137322828808982, + "loss": 0.624, + "step": 1960 + }, + { + "epoch": 0.3486222222222222, + "grad_norm": 0.37855580962184154, + "learning_rate": 0.0001513238185438784, + "loss": 0.6336, + "step": 1961 + }, + { + "epoch": 0.3488, + "grad_norm": 0.3830086106065981, + "learning_rate": 0.00015127439178317745, + "loss": 0.6606, + "step": 1962 + }, + { + "epoch": 0.3489777777777778, + "grad_norm": 0.37220350745279995, + "learning_rate": 0.0001512249480223745, + "loss": 0.6613, + "step": 1963 + }, + { + "epoch": 0.34915555555555555, + "grad_norm": 0.3686382842837492, + "learning_rate": 0.00015117548727786265, + "loss": 0.6881, + "step": 1964 + }, + { + "epoch": 0.34933333333333333, + "grad_norm": 0.37594924882120045, + "learning_rate": 0.00015112600956604074, + "loss": 0.683, + "step": 1965 + }, + { + "epoch": 0.3495111111111111, + "grad_norm": 0.38365558467300287, + "learning_rate": 0.00015107651490331317, + "loss": 0.6648, + "step": 1966 + }, + { + "epoch": 0.3496888888888889, + "grad_norm": 0.3490925018748636, + "learning_rate": 0.00015102700330609, + "loss": 0.6277, + "step": 1967 + }, + { + "epoch": 0.34986666666666666, + "grad_norm": 0.33109148394762006, + "learning_rate": 0.0001509774747907868, + "loss": 0.5762, + "step": 1968 + }, + { + "epoch": 0.35004444444444444, + "grad_norm": 0.35354886274804637, + "learning_rate": 0.00015092792937382483, + "loss": 0.5943, + "step": 1969 + }, + { + "epoch": 0.3502222222222222, + "grad_norm": 0.35857196538644953, + "learning_rate": 0.000150878367071631, + "loss": 0.609, + "step": 1970 + }, + { + "epoch": 0.3504, + "grad_norm": 0.36722178664133814, + "learning_rate": 0.00015082878790063776, + "loss": 0.605, + "step": 1971 + }, + { + "epoch": 0.35057777777777777, + "grad_norm": 0.3689634845632975, + "learning_rate": 0.00015077919187728313, + "loss": 0.6549, + "step": 1972 + }, + { + "epoch": 0.35075555555555554, + "grad_norm": 0.42575878422861924, + "learning_rate": 0.00015072957901801076, + "loss": 0.6257, + "step": 1973 + }, + { + "epoch": 0.3509333333333333, + "grad_norm": 0.36567702736362295, + "learning_rate": 0.00015067994933926985, + "loss": 0.6575, + "step": 1974 + }, + { + "epoch": 0.3511111111111111, + "grad_norm": 0.39874234622292376, + "learning_rate": 0.00015063030285751526, + "loss": 0.7159, + "step": 1975 + }, + { + "epoch": 0.3512888888888889, + "grad_norm": 0.3935848186973221, + "learning_rate": 0.00015058063958920726, + "loss": 0.6284, + "step": 1976 + }, + { + "epoch": 0.35146666666666665, + "grad_norm": 0.3317955078156986, + "learning_rate": 0.00015053095955081184, + "loss": 0.6261, + "step": 1977 + }, + { + "epoch": 0.3516444444444444, + "grad_norm": 0.3683979992895569, + "learning_rate": 0.0001504812627588005, + "loss": 0.5753, + "step": 1978 + }, + { + "epoch": 0.3518222222222222, + "grad_norm": 0.3633136981036898, + "learning_rate": 0.00015043154922965028, + "loss": 0.6221, + "step": 1979 + }, + { + "epoch": 0.352, + "grad_norm": 0.3724394123561364, + "learning_rate": 0.00015038181897984374, + "loss": 0.6411, + "step": 1980 + }, + { + "epoch": 0.35217777777777776, + "grad_norm": 0.3591757601118717, + "learning_rate": 0.00015033207202586906, + "loss": 0.6459, + "step": 1981 + }, + { + "epoch": 0.35235555555555553, + "grad_norm": 0.3712084361937139, + "learning_rate": 0.0001502823083842199, + "loss": 0.6668, + "step": 1982 + }, + { + "epoch": 0.3525333333333333, + "grad_norm": 0.3606712812359245, + "learning_rate": 0.00015023252807139548, + "loss": 0.6445, + "step": 1983 + }, + { + "epoch": 0.3527111111111111, + "grad_norm": 0.3731001652780124, + "learning_rate": 0.0001501827311039005, + "loss": 0.6109, + "step": 1984 + }, + { + "epoch": 0.35288888888888886, + "grad_norm": 0.3531230524656085, + "learning_rate": 0.00015013291749824527, + "loss": 0.6454, + "step": 1985 + }, + { + "epoch": 0.35306666666666664, + "grad_norm": 0.3610863997758842, + "learning_rate": 0.00015008308727094554, + "loss": 0.6521, + "step": 1986 + }, + { + "epoch": 0.35324444444444447, + "grad_norm": 0.38521408902713933, + "learning_rate": 0.0001500332404385226, + "loss": 0.6026, + "step": 1987 + }, + { + "epoch": 0.35342222222222225, + "grad_norm": 0.35711335979526543, + "learning_rate": 0.00014998337701750325, + "loss": 0.6239, + "step": 1988 + }, + { + "epoch": 0.3536, + "grad_norm": 0.3874402203662732, + "learning_rate": 0.00014993349702441977, + "loss": 0.7266, + "step": 1989 + }, + { + "epoch": 0.3537777777777778, + "grad_norm": 0.3695911002313585, + "learning_rate": 0.00014988360047580996, + "loss": 0.6324, + "step": 1990 + }, + { + "epoch": 0.3539555555555556, + "grad_norm": 0.3635293848274628, + "learning_rate": 0.00014983368738821713, + "loss": 0.6484, + "step": 1991 + }, + { + "epoch": 0.35413333333333336, + "grad_norm": 0.3613017541698347, + "learning_rate": 0.00014978375777818995, + "loss": 0.6448, + "step": 1992 + }, + { + "epoch": 0.35431111111111113, + "grad_norm": 0.33440682069469924, + "learning_rate": 0.00014973381166228272, + "loss": 0.6027, + "step": 1993 + }, + { + "epoch": 0.3544888888888889, + "grad_norm": 0.36943387967287367, + "learning_rate": 0.00014968384905705517, + "loss": 0.6233, + "step": 1994 + }, + { + "epoch": 0.3546666666666667, + "grad_norm": 0.35193679253771154, + "learning_rate": 0.0001496338699790724, + "loss": 0.6621, + "step": 1995 + }, + { + "epoch": 0.35484444444444446, + "grad_norm": 0.34982242455112916, + "learning_rate": 0.0001495838744449051, + "loss": 0.6162, + "step": 1996 + }, + { + "epoch": 0.35502222222222224, + "grad_norm": 0.3510048176570386, + "learning_rate": 0.0001495338624711294, + "loss": 0.6294, + "step": 1997 + }, + { + "epoch": 0.3552, + "grad_norm": 0.37252284842904776, + "learning_rate": 0.00014948383407432678, + "loss": 0.6425, + "step": 1998 + }, + { + "epoch": 0.3553777777777778, + "grad_norm": 0.33584495292898014, + "learning_rate": 0.00014943378927108426, + "loss": 0.6205, + "step": 1999 + }, + { + "epoch": 0.35555555555555557, + "grad_norm": 0.3833097467211567, + "learning_rate": 0.00014938372807799425, + "loss": 0.5696, + "step": 2000 + }, + { + "epoch": 0.35573333333333335, + "grad_norm": 0.3967682155529881, + "learning_rate": 0.0001493336505116546, + "loss": 0.6501, + "step": 2001 + }, + { + "epoch": 0.3559111111111111, + "grad_norm": 0.34637316067199425, + "learning_rate": 0.0001492835565886687, + "loss": 0.625, + "step": 2002 + }, + { + "epoch": 0.3560888888888889, + "grad_norm": 0.348732896387513, + "learning_rate": 0.0001492334463256452, + "loss": 0.6295, + "step": 2003 + }, + { + "epoch": 0.3562666666666667, + "grad_norm": 0.33839273097474754, + "learning_rate": 0.0001491833197391982, + "loss": 0.6521, + "step": 2004 + }, + { + "epoch": 0.35644444444444445, + "grad_norm": 0.34478921210181634, + "learning_rate": 0.00014913317684594728, + "loss": 0.6135, + "step": 2005 + }, + { + "epoch": 0.35662222222222223, + "grad_norm": 0.3350781131097185, + "learning_rate": 0.00014908301766251739, + "loss": 0.6132, + "step": 2006 + }, + { + "epoch": 0.3568, + "grad_norm": 0.3457428656406698, + "learning_rate": 0.00014903284220553885, + "loss": 0.6265, + "step": 2007 + }, + { + "epoch": 0.3569777777777778, + "grad_norm": 0.3441985475780204, + "learning_rate": 0.00014898265049164748, + "loss": 0.6435, + "step": 2008 + }, + { + "epoch": 0.35715555555555556, + "grad_norm": 0.36776963685188147, + "learning_rate": 0.00014893244253748436, + "loss": 0.6647, + "step": 2009 + }, + { + "epoch": 0.35733333333333334, + "grad_norm": 0.35952266322726845, + "learning_rate": 0.00014888221835969605, + "loss": 0.6631, + "step": 2010 + }, + { + "epoch": 0.3575111111111111, + "grad_norm": 0.3659956712033054, + "learning_rate": 0.00014883197797493442, + "loss": 0.6265, + "step": 2011 + }, + { + "epoch": 0.3576888888888889, + "grad_norm": 0.3467382121018198, + "learning_rate": 0.00014878172139985675, + "loss": 0.6349, + "step": 2012 + }, + { + "epoch": 0.35786666666666667, + "grad_norm": 0.3957784753009316, + "learning_rate": 0.00014873144865112573, + "loss": 0.7023, + "step": 2013 + }, + { + "epoch": 0.35804444444444444, + "grad_norm": 0.35072125070146143, + "learning_rate": 0.0001486811597454093, + "loss": 0.6596, + "step": 2014 + }, + { + "epoch": 0.3582222222222222, + "grad_norm": 0.3632568771859667, + "learning_rate": 0.00014863085469938084, + "loss": 0.6317, + "step": 2015 + }, + { + "epoch": 0.3584, + "grad_norm": 0.3810085439368567, + "learning_rate": 0.00014858053352971912, + "loss": 0.694, + "step": 2016 + }, + { + "epoch": 0.3585777777777778, + "grad_norm": 0.3938534272004421, + "learning_rate": 0.00014853019625310813, + "loss": 0.6896, + "step": 2017 + }, + { + "epoch": 0.35875555555555555, + "grad_norm": 0.3722351547253413, + "learning_rate": 0.0001484798428862373, + "loss": 0.617, + "step": 2018 + }, + { + "epoch": 0.3589333333333333, + "grad_norm": 0.36278454951864497, + "learning_rate": 0.00014842947344580133, + "loss": 0.6168, + "step": 2019 + }, + { + "epoch": 0.3591111111111111, + "grad_norm": 0.35893904729176807, + "learning_rate": 0.00014837908794850034, + "loss": 0.6224, + "step": 2020 + }, + { + "epoch": 0.3592888888888889, + "grad_norm": 0.382214848569764, + "learning_rate": 0.00014832868641103967, + "loss": 0.6681, + "step": 2021 + }, + { + "epoch": 0.35946666666666666, + "grad_norm": 0.3582634921068547, + "learning_rate": 0.00014827826885013007, + "loss": 0.6311, + "step": 2022 + }, + { + "epoch": 0.35964444444444443, + "grad_norm": 0.38137826123261026, + "learning_rate": 0.00014822783528248753, + "loss": 0.6363, + "step": 2023 + }, + { + "epoch": 0.3598222222222222, + "grad_norm": 0.36474734770259776, + "learning_rate": 0.00014817738572483338, + "loss": 0.6498, + "step": 2024 + }, + { + "epoch": 0.36, + "grad_norm": 0.36072861571456216, + "learning_rate": 0.00014812692019389425, + "loss": 0.6148, + "step": 2025 + }, + { + "epoch": 0.36017777777777776, + "grad_norm": 0.3716504575880546, + "learning_rate": 0.00014807643870640207, + "loss": 0.6213, + "step": 2026 + }, + { + "epoch": 0.36035555555555554, + "grad_norm": 0.349790460516821, + "learning_rate": 0.00014802594127909404, + "loss": 0.6617, + "step": 2027 + }, + { + "epoch": 0.3605333333333333, + "grad_norm": 0.36636570634623195, + "learning_rate": 0.00014797542792871265, + "loss": 0.6596, + "step": 2028 + }, + { + "epoch": 0.3607111111111111, + "grad_norm": 0.3225352366135295, + "learning_rate": 0.0001479248986720057, + "loss": 0.5834, + "step": 2029 + }, + { + "epoch": 0.36088888888888887, + "grad_norm": 0.3779073813598705, + "learning_rate": 0.00014787435352572623, + "loss": 0.6307, + "step": 2030 + }, + { + "epoch": 0.36106666666666665, + "grad_norm": 0.3642173384123533, + "learning_rate": 0.00014782379250663255, + "loss": 0.6531, + "step": 2031 + }, + { + "epoch": 0.3612444444444444, + "grad_norm": 0.3640970703178707, + "learning_rate": 0.0001477732156314883, + "loss": 0.6791, + "step": 2032 + }, + { + "epoch": 0.3614222222222222, + "grad_norm": 0.36313461672239566, + "learning_rate": 0.00014772262291706223, + "loss": 0.6809, + "step": 2033 + }, + { + "epoch": 0.3616, + "grad_norm": 0.37296127294131926, + "learning_rate": 0.00014767201438012847, + "loss": 0.6279, + "step": 2034 + }, + { + "epoch": 0.36177777777777775, + "grad_norm": 0.3835029026042995, + "learning_rate": 0.00014762139003746637, + "loss": 0.6554, + "step": 2035 + }, + { + "epoch": 0.36195555555555553, + "grad_norm": 0.37491201266683005, + "learning_rate": 0.0001475707499058605, + "loss": 0.663, + "step": 2036 + }, + { + "epoch": 0.3621333333333333, + "grad_norm": 0.3924362706200794, + "learning_rate": 0.00014752009400210067, + "loss": 0.6584, + "step": 2037 + }, + { + "epoch": 0.3623111111111111, + "grad_norm": 0.3662107952588624, + "learning_rate": 0.0001474694223429819, + "loss": 0.6446, + "step": 2038 + }, + { + "epoch": 0.3624888888888889, + "grad_norm": 0.3734678879172702, + "learning_rate": 0.0001474187349453045, + "loss": 0.6816, + "step": 2039 + }, + { + "epoch": 0.3626666666666667, + "grad_norm": 0.3551390433542849, + "learning_rate": 0.00014736803182587398, + "loss": 0.6803, + "step": 2040 + }, + { + "epoch": 0.36284444444444447, + "grad_norm": 0.3539108628447298, + "learning_rate": 0.0001473173130015009, + "loss": 0.6232, + "step": 2041 + }, + { + "epoch": 0.36302222222222225, + "grad_norm": 0.3497645277230926, + "learning_rate": 0.00014726657848900129, + "loss": 0.6324, + "step": 2042 + }, + { + "epoch": 0.3632, + "grad_norm": 0.3497048825422562, + "learning_rate": 0.00014721582830519623, + "loss": 0.6427, + "step": 2043 + }, + { + "epoch": 0.3633777777777778, + "grad_norm": 0.38204486391191084, + "learning_rate": 0.00014716506246691195, + "loss": 0.6234, + "step": 2044 + }, + { + "epoch": 0.3635555555555556, + "grad_norm": 0.353534799555291, + "learning_rate": 0.00014711428099098002, + "loss": 0.5836, + "step": 2045 + }, + { + "epoch": 0.36373333333333335, + "grad_norm": 0.41391749114138887, + "learning_rate": 0.00014706348389423708, + "loss": 0.6264, + "step": 2046 + }, + { + "epoch": 0.36391111111111113, + "grad_norm": 0.3756400198155271, + "learning_rate": 0.000147012671193525, + "loss": 0.6756, + "step": 2047 + }, + { + "epoch": 0.3640888888888889, + "grad_norm": 0.3847128408739994, + "learning_rate": 0.0001469618429056908, + "loss": 0.6295, + "step": 2048 + }, + { + "epoch": 0.3642666666666667, + "grad_norm": 0.3591691226806434, + "learning_rate": 0.00014691099904758667, + "loss": 0.633, + "step": 2049 + }, + { + "epoch": 0.36444444444444446, + "grad_norm": 0.36768580315706306, + "learning_rate": 0.00014686013963607, + "loss": 0.6466, + "step": 2050 + }, + { + "epoch": 0.36462222222222224, + "grad_norm": 0.3619506776953442, + "learning_rate": 0.00014680926468800326, + "loss": 0.6543, + "step": 2051 + }, + { + "epoch": 0.3648, + "grad_norm": 0.36832799311030334, + "learning_rate": 0.00014675837422025413, + "loss": 0.6624, + "step": 2052 + }, + { + "epoch": 0.3649777777777778, + "grad_norm": 0.38626330524947955, + "learning_rate": 0.00014670746824969544, + "loss": 0.6506, + "step": 2053 + }, + { + "epoch": 0.36515555555555557, + "grad_norm": 0.3846826512524197, + "learning_rate": 0.00014665654679320511, + "loss": 0.7167, + "step": 2054 + }, + { + "epoch": 0.36533333333333334, + "grad_norm": 0.36947120198843547, + "learning_rate": 0.00014660560986766623, + "loss": 0.6366, + "step": 2055 + }, + { + "epoch": 0.3655111111111111, + "grad_norm": 0.3705785771649254, + "learning_rate": 0.00014655465748996703, + "loss": 0.6513, + "step": 2056 + }, + { + "epoch": 0.3656888888888889, + "grad_norm": 0.3737980056067438, + "learning_rate": 0.00014650368967700084, + "loss": 0.6602, + "step": 2057 + }, + { + "epoch": 0.3658666666666667, + "grad_norm": 0.3585112543555933, + "learning_rate": 0.00014645270644566617, + "loss": 0.6848, + "step": 2058 + }, + { + "epoch": 0.36604444444444445, + "grad_norm": 0.3636451331076331, + "learning_rate": 0.0001464017078128665, + "loss": 0.6626, + "step": 2059 + }, + { + "epoch": 0.3662222222222222, + "grad_norm": 0.3538149131040327, + "learning_rate": 0.00014635069379551055, + "loss": 0.6373, + "step": 2060 + }, + { + "epoch": 0.3664, + "grad_norm": 0.34539514311107683, + "learning_rate": 0.00014629966441051208, + "loss": 0.6186, + "step": 2061 + }, + { + "epoch": 0.3665777777777778, + "grad_norm": 0.36299689300073207, + "learning_rate": 0.00014624861967478997, + "loss": 0.6342, + "step": 2062 + }, + { + "epoch": 0.36675555555555556, + "grad_norm": 0.37152935857686487, + "learning_rate": 0.00014619755960526817, + "loss": 0.6555, + "step": 2063 + }, + { + "epoch": 0.36693333333333333, + "grad_norm": 0.3640051531385131, + "learning_rate": 0.00014614648421887574, + "loss": 0.6064, + "step": 2064 + }, + { + "epoch": 0.3671111111111111, + "grad_norm": 0.39013013963600696, + "learning_rate": 0.00014609539353254678, + "loss": 0.6103, + "step": 2065 + }, + { + "epoch": 0.3672888888888889, + "grad_norm": 0.3558377578251637, + "learning_rate": 0.00014604428756322048, + "loss": 0.6376, + "step": 2066 + }, + { + "epoch": 0.36746666666666666, + "grad_norm": 0.36082133986793363, + "learning_rate": 0.00014599316632784112, + "loss": 0.664, + "step": 2067 + }, + { + "epoch": 0.36764444444444444, + "grad_norm": 0.3744793863193506, + "learning_rate": 0.00014594202984335804, + "loss": 0.6245, + "step": 2068 + }, + { + "epoch": 0.3678222222222222, + "grad_norm": 0.34143396881227045, + "learning_rate": 0.00014589087812672558, + "loss": 0.6147, + "step": 2069 + }, + { + "epoch": 0.368, + "grad_norm": 0.34854549704357407, + "learning_rate": 0.00014583971119490316, + "loss": 0.5693, + "step": 2070 + }, + { + "epoch": 0.36817777777777777, + "grad_norm": 0.3487247726779616, + "learning_rate": 0.00014578852906485531, + "loss": 0.6019, + "step": 2071 + }, + { + "epoch": 0.36835555555555555, + "grad_norm": 0.3627248460169325, + "learning_rate": 0.0001457373317535515, + "loss": 0.6591, + "step": 2072 + }, + { + "epoch": 0.3685333333333333, + "grad_norm": 0.35489116437132423, + "learning_rate": 0.0001456861192779663, + "loss": 0.6399, + "step": 2073 + }, + { + "epoch": 0.3687111111111111, + "grad_norm": 0.3898380138409434, + "learning_rate": 0.0001456348916550793, + "loss": 0.6699, + "step": 2074 + }, + { + "epoch": 0.3688888888888889, + "grad_norm": 0.34963253444716613, + "learning_rate": 0.00014558364890187501, + "loss": 0.6137, + "step": 2075 + }, + { + "epoch": 0.36906666666666665, + "grad_norm": 0.35621972227477494, + "learning_rate": 0.0001455323910353431, + "loss": 0.6873, + "step": 2076 + }, + { + "epoch": 0.36924444444444443, + "grad_norm": 0.3609739517191583, + "learning_rate": 0.0001454811180724782, + "loss": 0.639, + "step": 2077 + }, + { + "epoch": 0.3694222222222222, + "grad_norm": 0.35324307876701094, + "learning_rate": 0.00014542983003027995, + "loss": 0.6049, + "step": 2078 + }, + { + "epoch": 0.3696, + "grad_norm": 0.3684218240588309, + "learning_rate": 0.00014537852692575294, + "loss": 0.6435, + "step": 2079 + }, + { + "epoch": 0.36977777777777776, + "grad_norm": 0.38440495160733995, + "learning_rate": 0.00014532720877590683, + "loss": 0.642, + "step": 2080 + }, + { + "epoch": 0.36995555555555554, + "grad_norm": 0.38862218740196974, + "learning_rate": 0.00014527587559775616, + "loss": 0.6687, + "step": 2081 + }, + { + "epoch": 0.3701333333333333, + "grad_norm": 0.3512888006933407, + "learning_rate": 0.00014522452740832063, + "loss": 0.6408, + "step": 2082 + }, + { + "epoch": 0.3703111111111111, + "grad_norm": 0.40489115076877097, + "learning_rate": 0.0001451731642246247, + "loss": 0.636, + "step": 2083 + }, + { + "epoch": 0.37048888888888887, + "grad_norm": 0.3642348665286737, + "learning_rate": 0.000145121786063698, + "loss": 0.6552, + "step": 2084 + }, + { + "epoch": 0.37066666666666664, + "grad_norm": 0.36209778743433185, + "learning_rate": 0.00014507039294257498, + "loss": 0.593, + "step": 2085 + }, + { + "epoch": 0.3708444444444444, + "grad_norm": 0.3548057008262031, + "learning_rate": 0.00014501898487829514, + "loss": 0.5857, + "step": 2086 + }, + { + "epoch": 0.3710222222222222, + "grad_norm": 0.35657636965959416, + "learning_rate": 0.0001449675618879029, + "loss": 0.6355, + "step": 2087 + }, + { + "epoch": 0.3712, + "grad_norm": 0.3584592484074798, + "learning_rate": 0.0001449161239884476, + "loss": 0.6573, + "step": 2088 + }, + { + "epoch": 0.37137777777777775, + "grad_norm": 0.3585384982314153, + "learning_rate": 0.00014486467119698357, + "loss": 0.6394, + "step": 2089 + }, + { + "epoch": 0.37155555555555553, + "grad_norm": 0.39821068958150124, + "learning_rate": 0.00014481320353057007, + "loss": 0.6313, + "step": 2090 + }, + { + "epoch": 0.37173333333333336, + "grad_norm": 0.3659565415690751, + "learning_rate": 0.00014476172100627127, + "loss": 0.6097, + "step": 2091 + }, + { + "epoch": 0.37191111111111114, + "grad_norm": 0.4015216229336997, + "learning_rate": 0.00014471022364115628, + "loss": 0.6373, + "step": 2092 + }, + { + "epoch": 0.3720888888888889, + "grad_norm": 0.3999333247518061, + "learning_rate": 0.00014465871145229913, + "loss": 0.6955, + "step": 2093 + }, + { + "epoch": 0.3722666666666667, + "grad_norm": 0.3992134508434656, + "learning_rate": 0.00014460718445677876, + "loss": 0.6792, + "step": 2094 + }, + { + "epoch": 0.37244444444444447, + "grad_norm": 0.38087493710745773, + "learning_rate": 0.00014455564267167905, + "loss": 0.6094, + "step": 2095 + }, + { + "epoch": 0.37262222222222224, + "grad_norm": 0.35604088117839133, + "learning_rate": 0.00014450408611408873, + "loss": 0.5776, + "step": 2096 + }, + { + "epoch": 0.3728, + "grad_norm": 0.3672131924064539, + "learning_rate": 0.00014445251480110145, + "loss": 0.6622, + "step": 2097 + }, + { + "epoch": 0.3729777777777778, + "grad_norm": 0.35990351493537415, + "learning_rate": 0.00014440092874981576, + "loss": 0.6673, + "step": 2098 + }, + { + "epoch": 0.3731555555555556, + "grad_norm": 0.364851988251713, + "learning_rate": 0.00014434932797733515, + "loss": 0.6446, + "step": 2099 + }, + { + "epoch": 0.37333333333333335, + "grad_norm": 0.3695891243315968, + "learning_rate": 0.00014429771250076785, + "loss": 0.6858, + "step": 2100 + }, + { + "epoch": 0.3735111111111111, + "grad_norm": 0.36310420904474155, + "learning_rate": 0.00014424608233722707, + "loss": 0.6583, + "step": 2101 + }, + { + "epoch": 0.3736888888888889, + "grad_norm": 0.3532427224281201, + "learning_rate": 0.0001441944375038309, + "loss": 0.6244, + "step": 2102 + }, + { + "epoch": 0.3738666666666667, + "grad_norm": 0.3641283155611553, + "learning_rate": 0.00014414277801770223, + "loss": 0.6413, + "step": 2103 + }, + { + "epoch": 0.37404444444444446, + "grad_norm": 0.37282122080784824, + "learning_rate": 0.00014409110389596887, + "loss": 0.6587, + "step": 2104 + }, + { + "epoch": 0.37422222222222223, + "grad_norm": 0.3618878863527839, + "learning_rate": 0.00014403941515576344, + "loss": 0.6073, + "step": 2105 + }, + { + "epoch": 0.3744, + "grad_norm": 0.3629842550856872, + "learning_rate": 0.0001439877118142234, + "loss": 0.6559, + "step": 2106 + }, + { + "epoch": 0.3745777777777778, + "grad_norm": 0.37984758502006777, + "learning_rate": 0.0001439359938884911, + "loss": 0.6562, + "step": 2107 + }, + { + "epoch": 0.37475555555555556, + "grad_norm": 0.3602592501165416, + "learning_rate": 0.0001438842613957137, + "loss": 0.5927, + "step": 2108 + }, + { + "epoch": 0.37493333333333334, + "grad_norm": 0.3649379205527932, + "learning_rate": 0.00014383251435304314, + "loss": 0.5901, + "step": 2109 + }, + { + "epoch": 0.3751111111111111, + "grad_norm": 0.3736945232348994, + "learning_rate": 0.0001437807527776363, + "loss": 0.6318, + "step": 2110 + }, + { + "epoch": 0.3752888888888889, + "grad_norm": 0.3726405137609307, + "learning_rate": 0.00014372897668665476, + "loss": 0.6436, + "step": 2111 + }, + { + "epoch": 0.37546666666666667, + "grad_norm": 0.35773199192047844, + "learning_rate": 0.00014367718609726497, + "loss": 0.6119, + "step": 2112 + }, + { + "epoch": 0.37564444444444445, + "grad_norm": 0.3958807454635296, + "learning_rate": 0.00014362538102663817, + "loss": 0.6479, + "step": 2113 + }, + { + "epoch": 0.3758222222222222, + "grad_norm": 0.3717928204583553, + "learning_rate": 0.00014357356149195043, + "loss": 0.6561, + "step": 2114 + }, + { + "epoch": 0.376, + "grad_norm": 0.37259846302759836, + "learning_rate": 0.00014352172751038258, + "loss": 0.6284, + "step": 2115 + }, + { + "epoch": 0.3761777777777778, + "grad_norm": 0.3803169871031899, + "learning_rate": 0.00014346987909912023, + "loss": 0.6457, + "step": 2116 + }, + { + "epoch": 0.37635555555555555, + "grad_norm": 0.341157191367804, + "learning_rate": 0.00014341801627535387, + "loss": 0.5963, + "step": 2117 + }, + { + "epoch": 0.37653333333333333, + "grad_norm": 0.37635781406654667, + "learning_rate": 0.00014336613905627864, + "loss": 0.6881, + "step": 2118 + }, + { + "epoch": 0.3767111111111111, + "grad_norm": 0.3494390544002213, + "learning_rate": 0.00014331424745909455, + "loss": 0.6236, + "step": 2119 + }, + { + "epoch": 0.3768888888888889, + "grad_norm": 0.3711944336613304, + "learning_rate": 0.00014326234150100628, + "loss": 0.6847, + "step": 2120 + }, + { + "epoch": 0.37706666666666666, + "grad_norm": 0.3639506200310751, + "learning_rate": 0.00014321042119922337, + "loss": 0.5967, + "step": 2121 + }, + { + "epoch": 0.37724444444444444, + "grad_norm": 0.36612709961017464, + "learning_rate": 0.00014315848657096004, + "loss": 0.6628, + "step": 2122 + }, + { + "epoch": 0.3774222222222222, + "grad_norm": 0.37448169012243826, + "learning_rate": 0.00014310653763343538, + "loss": 0.6286, + "step": 2123 + }, + { + "epoch": 0.3776, + "grad_norm": 0.3529198758310332, + "learning_rate": 0.00014305457440387306, + "loss": 0.5969, + "step": 2124 + }, + { + "epoch": 0.37777777777777777, + "grad_norm": 0.3897453826275033, + "learning_rate": 0.00014300259689950157, + "loss": 0.6221, + "step": 2125 + }, + { + "epoch": 0.37795555555555554, + "grad_norm": 0.3600597342924437, + "learning_rate": 0.00014295060513755417, + "loss": 0.6403, + "step": 2126 + }, + { + "epoch": 0.3781333333333333, + "grad_norm": 0.347381650346313, + "learning_rate": 0.00014289859913526874, + "loss": 0.6284, + "step": 2127 + }, + { + "epoch": 0.3783111111111111, + "grad_norm": 0.35067161675773445, + "learning_rate": 0.000142846578909888, + "loss": 0.6026, + "step": 2128 + }, + { + "epoch": 0.3784888888888889, + "grad_norm": 0.352117349469237, + "learning_rate": 0.00014279454447865936, + "loss": 0.6625, + "step": 2129 + }, + { + "epoch": 0.37866666666666665, + "grad_norm": 0.38615121863091334, + "learning_rate": 0.00014274249585883488, + "loss": 0.6915, + "step": 2130 + }, + { + "epoch": 0.37884444444444443, + "grad_norm": 0.34811074245483276, + "learning_rate": 0.00014269043306767135, + "loss": 0.5989, + "step": 2131 + }, + { + "epoch": 0.3790222222222222, + "grad_norm": 0.3761481727509228, + "learning_rate": 0.00014263835612243026, + "loss": 0.652, + "step": 2132 + }, + { + "epoch": 0.3792, + "grad_norm": 0.33384252861099756, + "learning_rate": 0.00014258626504037785, + "loss": 0.5909, + "step": 2133 + }, + { + "epoch": 0.37937777777777776, + "grad_norm": 0.35198272084894605, + "learning_rate": 0.00014253415983878494, + "loss": 0.6221, + "step": 2134 + }, + { + "epoch": 0.37955555555555553, + "grad_norm": 0.36860438851002236, + "learning_rate": 0.0001424820405349271, + "loss": 0.6593, + "step": 2135 + }, + { + "epoch": 0.3797333333333333, + "grad_norm": 0.3647099858235695, + "learning_rate": 0.0001424299071460846, + "loss": 0.6084, + "step": 2136 + }, + { + "epoch": 0.3799111111111111, + "grad_norm": 0.36971936067843375, + "learning_rate": 0.00014237775968954232, + "loss": 0.5996, + "step": 2137 + }, + { + "epoch": 0.38008888888888887, + "grad_norm": 0.40135103166774755, + "learning_rate": 0.00014232559818258984, + "loss": 0.6435, + "step": 2138 + }, + { + "epoch": 0.38026666666666664, + "grad_norm": 0.36258104916378414, + "learning_rate": 0.00014227342264252135, + "loss": 0.5921, + "step": 2139 + }, + { + "epoch": 0.3804444444444444, + "grad_norm": 0.3462943529021952, + "learning_rate": 0.00014222123308663576, + "loss": 0.6274, + "step": 2140 + }, + { + "epoch": 0.3806222222222222, + "grad_norm": 0.3705258541393284, + "learning_rate": 0.00014216902953223656, + "loss": 0.6067, + "step": 2141 + }, + { + "epoch": 0.3808, + "grad_norm": 0.34363773144042226, + "learning_rate": 0.00014211681199663198, + "loss": 0.6345, + "step": 2142 + }, + { + "epoch": 0.3809777777777778, + "grad_norm": 0.36093738100974226, + "learning_rate": 0.00014206458049713478, + "loss": 0.5837, + "step": 2143 + }, + { + "epoch": 0.3811555555555556, + "grad_norm": 0.3410809396233212, + "learning_rate": 0.0001420123350510624, + "loss": 0.6162, + "step": 2144 + }, + { + "epoch": 0.38133333333333336, + "grad_norm": 0.37523960210836604, + "learning_rate": 0.0001419600756757369, + "loss": 0.6835, + "step": 2145 + }, + { + "epoch": 0.38151111111111113, + "grad_norm": 0.34753360577189735, + "learning_rate": 0.00014190780238848493, + "loss": 0.6001, + "step": 2146 + }, + { + "epoch": 0.3816888888888889, + "grad_norm": 0.3417390143121445, + "learning_rate": 0.00014185551520663783, + "loss": 0.6318, + "step": 2147 + }, + { + "epoch": 0.3818666666666667, + "grad_norm": 0.34767287256511925, + "learning_rate": 0.0001418032141475315, + "loss": 0.6672, + "step": 2148 + }, + { + "epoch": 0.38204444444444446, + "grad_norm": 0.34986866939572314, + "learning_rate": 0.00014175089922850633, + "loss": 0.6049, + "step": 2149 + }, + { + "epoch": 0.38222222222222224, + "grad_norm": 0.38346100398983146, + "learning_rate": 0.00014169857046690752, + "loss": 0.5949, + "step": 2150 + }, + { + "epoch": 0.3824, + "grad_norm": 0.3604941478020668, + "learning_rate": 0.0001416462278800847, + "loss": 0.6329, + "step": 2151 + }, + { + "epoch": 0.3825777777777778, + "grad_norm": 0.37944882343357084, + "learning_rate": 0.00014159387148539212, + "loss": 0.659, + "step": 2152 + }, + { + "epoch": 0.38275555555555557, + "grad_norm": 0.3550831565518237, + "learning_rate": 0.00014154150130018866, + "loss": 0.6648, + "step": 2153 + }, + { + "epoch": 0.38293333333333335, + "grad_norm": 0.3730216991981197, + "learning_rate": 0.00014148911734183773, + "loss": 0.7023, + "step": 2154 + }, + { + "epoch": 0.3831111111111111, + "grad_norm": 0.3782374506276671, + "learning_rate": 0.00014143671962770727, + "loss": 0.65, + "step": 2155 + }, + { + "epoch": 0.3832888888888889, + "grad_norm": 0.42267071871104744, + "learning_rate": 0.00014138430817516989, + "loss": 0.6634, + "step": 2156 + }, + { + "epoch": 0.3834666666666667, + "grad_norm": 0.34093793843641973, + "learning_rate": 0.0001413318830016026, + "loss": 0.6557, + "step": 2157 + }, + { + "epoch": 0.38364444444444445, + "grad_norm": 0.3507739779657896, + "learning_rate": 0.00014127944412438713, + "loss": 0.6137, + "step": 2158 + }, + { + "epoch": 0.38382222222222223, + "grad_norm": 0.3466109679160134, + "learning_rate": 0.00014122699156090963, + "loss": 0.6066, + "step": 2159 + }, + { + "epoch": 0.384, + "grad_norm": 0.41556009016731515, + "learning_rate": 0.00014117452532856083, + "loss": 0.5822, + "step": 2160 + }, + { + "epoch": 0.3841777777777778, + "grad_norm": 0.4202343085679337, + "learning_rate": 0.00014112204544473598, + "loss": 0.6453, + "step": 2161 + }, + { + "epoch": 0.38435555555555556, + "grad_norm": 0.3577389123139154, + "learning_rate": 0.00014106955192683487, + "loss": 0.6047, + "step": 2162 + }, + { + "epoch": 0.38453333333333334, + "grad_norm": 0.374706724519363, + "learning_rate": 0.00014101704479226181, + "loss": 0.6094, + "step": 2163 + }, + { + "epoch": 0.3847111111111111, + "grad_norm": 0.36322802240061863, + "learning_rate": 0.0001409645240584256, + "loss": 0.5792, + "step": 2164 + }, + { + "epoch": 0.3848888888888889, + "grad_norm": 0.3462895298219409, + "learning_rate": 0.0001409119897427396, + "loss": 0.6522, + "step": 2165 + }, + { + "epoch": 0.38506666666666667, + "grad_norm": 0.38805561136927863, + "learning_rate": 0.00014085944186262162, + "loss": 0.6543, + "step": 2166 + }, + { + "epoch": 0.38524444444444444, + "grad_norm": 0.3649822424055061, + "learning_rate": 0.00014080688043549398, + "loss": 0.6085, + "step": 2167 + }, + { + "epoch": 0.3854222222222222, + "grad_norm": 0.35747983341457484, + "learning_rate": 0.00014075430547878353, + "loss": 0.6194, + "step": 2168 + }, + { + "epoch": 0.3856, + "grad_norm": 0.3754182539842769, + "learning_rate": 0.0001407017170099216, + "loss": 0.6439, + "step": 2169 + }, + { + "epoch": 0.3857777777777778, + "grad_norm": 0.3489245454930453, + "learning_rate": 0.00014064911504634389, + "loss": 0.6373, + "step": 2170 + }, + { + "epoch": 0.38595555555555555, + "grad_norm": 0.356052339556721, + "learning_rate": 0.0001405964996054907, + "loss": 0.6595, + "step": 2171 + }, + { + "epoch": 0.38613333333333333, + "grad_norm": 0.382998213017678, + "learning_rate": 0.00014054387070480678, + "loss": 0.7346, + "step": 2172 + }, + { + "epoch": 0.3863111111111111, + "grad_norm": 0.3580304639049787, + "learning_rate": 0.00014049122836174135, + "loss": 0.6117, + "step": 2173 + }, + { + "epoch": 0.3864888888888889, + "grad_norm": 0.36935968443308337, + "learning_rate": 0.000140438572593748, + "loss": 0.6377, + "step": 2174 + }, + { + "epoch": 0.38666666666666666, + "grad_norm": 0.36965018729194526, + "learning_rate": 0.00014038590341828485, + "loss": 0.6696, + "step": 2175 + }, + { + "epoch": 0.38684444444444444, + "grad_norm": 0.36224405451966574, + "learning_rate": 0.0001403332208528144, + "loss": 0.5822, + "step": 2176 + }, + { + "epoch": 0.3870222222222222, + "grad_norm": 0.43704488788052576, + "learning_rate": 0.0001402805249148037, + "loss": 0.6424, + "step": 2177 + }, + { + "epoch": 0.3872, + "grad_norm": 0.36465651144729866, + "learning_rate": 0.00014022781562172417, + "loss": 0.6054, + "step": 2178 + }, + { + "epoch": 0.38737777777777777, + "grad_norm": 0.39460126513394794, + "learning_rate": 0.00014017509299105158, + "loss": 0.6311, + "step": 2179 + }, + { + "epoch": 0.38755555555555554, + "grad_norm": 0.34746145220952596, + "learning_rate": 0.0001401223570402663, + "loss": 0.6356, + "step": 2180 + }, + { + "epoch": 0.3877333333333333, + "grad_norm": 0.3502672643898457, + "learning_rate": 0.0001400696077868529, + "loss": 0.5963, + "step": 2181 + }, + { + "epoch": 0.3879111111111111, + "grad_norm": 0.34879019387944665, + "learning_rate": 0.00014001684524830057, + "loss": 0.6281, + "step": 2182 + }, + { + "epoch": 0.38808888888888887, + "grad_norm": 0.3694671698202079, + "learning_rate": 0.00013996406944210277, + "loss": 0.6254, + "step": 2183 + }, + { + "epoch": 0.38826666666666665, + "grad_norm": 0.36143632327219255, + "learning_rate": 0.00013991128038575741, + "loss": 0.6224, + "step": 2184 + }, + { + "epoch": 0.3884444444444444, + "grad_norm": 0.35475970790390776, + "learning_rate": 0.0001398584780967668, + "loss": 0.6218, + "step": 2185 + }, + { + "epoch": 0.3886222222222222, + "grad_norm": 0.35113464373309267, + "learning_rate": 0.00013980566259263756, + "loss": 0.6364, + "step": 2186 + }, + { + "epoch": 0.3888, + "grad_norm": 0.38080474460712316, + "learning_rate": 0.00013975283389088079, + "loss": 0.6519, + "step": 2187 + }, + { + "epoch": 0.38897777777777776, + "grad_norm": 0.3387932822864206, + "learning_rate": 0.00013969999200901193, + "loss": 0.5573, + "step": 2188 + }, + { + "epoch": 0.38915555555555553, + "grad_norm": 0.36755095589103015, + "learning_rate": 0.00013964713696455074, + "loss": 0.6025, + "step": 2189 + }, + { + "epoch": 0.3893333333333333, + "grad_norm": 0.37931402832404987, + "learning_rate": 0.00013959426877502144, + "loss": 0.6475, + "step": 2190 + }, + { + "epoch": 0.3895111111111111, + "grad_norm": 0.3662383744919383, + "learning_rate": 0.00013954138745795257, + "loss": 0.6702, + "step": 2191 + }, + { + "epoch": 0.38968888888888886, + "grad_norm": 0.35190299380242607, + "learning_rate": 0.00013948849303087698, + "loss": 0.6205, + "step": 2192 + }, + { + "epoch": 0.38986666666666664, + "grad_norm": 0.36172112011819413, + "learning_rate": 0.00013943558551133186, + "loss": 0.6431, + "step": 2193 + }, + { + "epoch": 0.39004444444444447, + "grad_norm": 0.35138678831696485, + "learning_rate": 0.00013938266491685886, + "loss": 0.6374, + "step": 2194 + }, + { + "epoch": 0.39022222222222225, + "grad_norm": 0.36519395240501246, + "learning_rate": 0.0001393297312650038, + "loss": 0.6495, + "step": 2195 + }, + { + "epoch": 0.3904, + "grad_norm": 0.521492896553643, + "learning_rate": 0.00013927678457331699, + "loss": 0.6226, + "step": 2196 + }, + { + "epoch": 0.3905777777777778, + "grad_norm": 0.3548497023119579, + "learning_rate": 0.00013922382485935297, + "loss": 0.6051, + "step": 2197 + }, + { + "epoch": 0.3907555555555556, + "grad_norm": 0.38448313075046897, + "learning_rate": 0.00013917085214067054, + "loss": 0.7214, + "step": 2198 + }, + { + "epoch": 0.39093333333333335, + "grad_norm": 0.3687069401932563, + "learning_rate": 0.00013911786643483297, + "loss": 0.6517, + "step": 2199 + }, + { + "epoch": 0.39111111111111113, + "grad_norm": 0.34684049663403577, + "learning_rate": 0.0001390648677594077, + "loss": 0.6053, + "step": 2200 + }, + { + "epoch": 0.3912888888888889, + "grad_norm": 0.35180260381139583, + "learning_rate": 0.00013901185613196654, + "loss": 0.6239, + "step": 2201 + }, + { + "epoch": 0.3914666666666667, + "grad_norm": 0.3698391599021474, + "learning_rate": 0.00013895883157008558, + "loss": 0.6064, + "step": 2202 + }, + { + "epoch": 0.39164444444444446, + "grad_norm": 0.36472959917824455, + "learning_rate": 0.00013890579409134518, + "loss": 0.6287, + "step": 2203 + }, + { + "epoch": 0.39182222222222224, + "grad_norm": 0.38474238140006123, + "learning_rate": 0.00013885274371333, + "loss": 0.6826, + "step": 2204 + }, + { + "epoch": 0.392, + "grad_norm": 0.3476018592702132, + "learning_rate": 0.00013879968045362901, + "loss": 0.6537, + "step": 2205 + }, + { + "epoch": 0.3921777777777778, + "grad_norm": 0.38442642410466105, + "learning_rate": 0.00013874660432983536, + "loss": 0.7407, + "step": 2206 + }, + { + "epoch": 0.39235555555555557, + "grad_norm": 0.3620908563106699, + "learning_rate": 0.00013869351535954652, + "loss": 0.61, + "step": 2207 + }, + { + "epoch": 0.39253333333333335, + "grad_norm": 0.36323918214435275, + "learning_rate": 0.00013864041356036427, + "loss": 0.6035, + "step": 2208 + }, + { + "epoch": 0.3927111111111111, + "grad_norm": 0.3465735449833905, + "learning_rate": 0.00013858729894989456, + "loss": 0.6079, + "step": 2209 + }, + { + "epoch": 0.3928888888888889, + "grad_norm": 0.3508195670082517, + "learning_rate": 0.0001385341715457476, + "loss": 0.6149, + "step": 2210 + }, + { + "epoch": 0.3930666666666667, + "grad_norm": 0.3649447656788205, + "learning_rate": 0.00013848103136553788, + "loss": 0.6607, + "step": 2211 + }, + { + "epoch": 0.39324444444444445, + "grad_norm": 0.32540805810766693, + "learning_rate": 0.00013842787842688412, + "loss": 0.5757, + "step": 2212 + }, + { + "epoch": 0.39342222222222223, + "grad_norm": 0.37045526111195654, + "learning_rate": 0.00013837471274740924, + "loss": 0.6543, + "step": 2213 + }, + { + "epoch": 0.3936, + "grad_norm": 0.3878579193509301, + "learning_rate": 0.0001383215343447404, + "loss": 0.6754, + "step": 2214 + }, + { + "epoch": 0.3937777777777778, + "grad_norm": 0.3667130246029731, + "learning_rate": 0.000138268343236509, + "loss": 0.6314, + "step": 2215 + }, + { + "epoch": 0.39395555555555556, + "grad_norm": 0.35799672474178074, + "learning_rate": 0.0001382151394403506, + "loss": 0.6496, + "step": 2216 + }, + { + "epoch": 0.39413333333333334, + "grad_norm": 0.41999942376969673, + "learning_rate": 0.00013816192297390502, + "loss": 0.6045, + "step": 2217 + }, + { + "epoch": 0.3943111111111111, + "grad_norm": 0.3994431684518544, + "learning_rate": 0.00013810869385481623, + "loss": 0.7035, + "step": 2218 + }, + { + "epoch": 0.3944888888888889, + "grad_norm": 0.3711501205407042, + "learning_rate": 0.0001380554521007325, + "loss": 0.6001, + "step": 2219 + }, + { + "epoch": 0.39466666666666667, + "grad_norm": 0.3453335682249666, + "learning_rate": 0.00013800219772930612, + "loss": 0.5978, + "step": 2220 + }, + { + "epoch": 0.39484444444444444, + "grad_norm": 0.3660366715776582, + "learning_rate": 0.00013794893075819373, + "loss": 0.6391, + "step": 2221 + }, + { + "epoch": 0.3950222222222222, + "grad_norm": 0.5015331879628174, + "learning_rate": 0.00013789565120505607, + "loss": 0.7131, + "step": 2222 + }, + { + "epoch": 0.3952, + "grad_norm": 0.35333002194112134, + "learning_rate": 0.000137842359087558, + "loss": 0.641, + "step": 2223 + }, + { + "epoch": 0.3953777777777778, + "grad_norm": 0.3564476048752636, + "learning_rate": 0.00013778905442336865, + "loss": 0.6278, + "step": 2224 + }, + { + "epoch": 0.39555555555555555, + "grad_norm": 0.35276632598806756, + "learning_rate": 0.00013773573723016122, + "loss": 0.6133, + "step": 2225 + }, + { + "epoch": 0.3957333333333333, + "grad_norm": 0.37797619461352133, + "learning_rate": 0.00013768240752561314, + "loss": 0.6732, + "step": 2226 + }, + { + "epoch": 0.3959111111111111, + "grad_norm": 0.3783009340377197, + "learning_rate": 0.00013762906532740595, + "loss": 0.6743, + "step": 2227 + }, + { + "epoch": 0.3960888888888889, + "grad_norm": 0.3765372253957615, + "learning_rate": 0.00013757571065322534, + "loss": 0.6255, + "step": 2228 + }, + { + "epoch": 0.39626666666666666, + "grad_norm": 0.35524234346994, + "learning_rate": 0.00013752234352076116, + "loss": 0.6157, + "step": 2229 + }, + { + "epoch": 0.39644444444444443, + "grad_norm": 0.345037872312445, + "learning_rate": 0.00013746896394770727, + "loss": 0.5931, + "step": 2230 + }, + { + "epoch": 0.3966222222222222, + "grad_norm": 0.36783560622610667, + "learning_rate": 0.00013741557195176183, + "loss": 0.6063, + "step": 2231 + }, + { + "epoch": 0.3968, + "grad_norm": 0.3729745411551487, + "learning_rate": 0.000137362167550627, + "loss": 0.6541, + "step": 2232 + }, + { + "epoch": 0.39697777777777776, + "grad_norm": 0.3654039684684545, + "learning_rate": 0.00013730875076200914, + "loss": 0.6036, + "step": 2233 + }, + { + "epoch": 0.39715555555555554, + "grad_norm": 0.35109694080636267, + "learning_rate": 0.00013725532160361863, + "loss": 0.6297, + "step": 2234 + }, + { + "epoch": 0.3973333333333333, + "grad_norm": 0.36470389923261654, + "learning_rate": 0.00013720188009316996, + "loss": 0.6295, + "step": 2235 + }, + { + "epoch": 0.3975111111111111, + "grad_norm": 0.37171084612308297, + "learning_rate": 0.00013714842624838177, + "loss": 0.6561, + "step": 2236 + }, + { + "epoch": 0.39768888888888887, + "grad_norm": 0.41671806828699953, + "learning_rate": 0.0001370949600869768, + "loss": 0.6518, + "step": 2237 + }, + { + "epoch": 0.39786666666666665, + "grad_norm": 0.3744424006821874, + "learning_rate": 0.00013704148162668178, + "loss": 0.5946, + "step": 2238 + }, + { + "epoch": 0.3980444444444444, + "grad_norm": 0.3999851178907819, + "learning_rate": 0.00013698799088522758, + "loss": 0.6594, + "step": 2239 + }, + { + "epoch": 0.3982222222222222, + "grad_norm": 0.35331865769428233, + "learning_rate": 0.00013693448788034917, + "loss": 0.5764, + "step": 2240 + }, + { + "epoch": 0.3984, + "grad_norm": 0.38066216369092526, + "learning_rate": 0.00013688097262978555, + "loss": 0.6521, + "step": 2241 + }, + { + "epoch": 0.39857777777777775, + "grad_norm": 0.3572870438883799, + "learning_rate": 0.00013682744515127975, + "loss": 0.6054, + "step": 2242 + }, + { + "epoch": 0.39875555555555553, + "grad_norm": 0.35907742570038703, + "learning_rate": 0.0001367739054625789, + "loss": 0.6397, + "step": 2243 + }, + { + "epoch": 0.3989333333333333, + "grad_norm": 0.41714292789546137, + "learning_rate": 0.00013672035358143418, + "loss": 0.6565, + "step": 2244 + }, + { + "epoch": 0.39911111111111114, + "grad_norm": 0.405625896759291, + "learning_rate": 0.00013666678952560076, + "loss": 0.6559, + "step": 2245 + }, + { + "epoch": 0.3992888888888889, + "grad_norm": 0.39412618111010667, + "learning_rate": 0.00013661321331283796, + "loss": 0.653, + "step": 2246 + }, + { + "epoch": 0.3994666666666667, + "grad_norm": 0.37103779656489266, + "learning_rate": 0.00013655962496090894, + "loss": 0.6572, + "step": 2247 + }, + { + "epoch": 0.39964444444444447, + "grad_norm": 0.37300796764286714, + "learning_rate": 0.00013650602448758112, + "loss": 0.6237, + "step": 2248 + }, + { + "epoch": 0.39982222222222225, + "grad_norm": 0.37027085419069994, + "learning_rate": 0.0001364524119106257, + "loss": 0.6453, + "step": 2249 + }, + { + "epoch": 0.4, + "grad_norm": 0.3763403156025713, + "learning_rate": 0.00013639878724781813, + "loss": 0.5964, + "step": 2250 + }, + { + "epoch": 0.4001777777777778, + "grad_norm": 0.3637796464262999, + "learning_rate": 0.00013634515051693766, + "loss": 0.6352, + "step": 2251 + }, + { + "epoch": 0.4003555555555556, + "grad_norm": 0.3520726300177121, + "learning_rate": 0.00013629150173576762, + "loss": 0.5829, + "step": 2252 + }, + { + "epoch": 0.40053333333333335, + "grad_norm": 0.36119202828985025, + "learning_rate": 0.00013623784092209543, + "loss": 0.6425, + "step": 2253 + }, + { + "epoch": 0.40071111111111113, + "grad_norm": 0.37049771885488864, + "learning_rate": 0.00013618416809371237, + "loss": 0.6921, + "step": 2254 + }, + { + "epoch": 0.4008888888888889, + "grad_norm": 0.37393697448475316, + "learning_rate": 0.00013613048326841372, + "loss": 0.6195, + "step": 2255 + }, + { + "epoch": 0.4010666666666667, + "grad_norm": 0.34601021032042534, + "learning_rate": 0.0001360767864639988, + "loss": 0.6378, + "step": 2256 + }, + { + "epoch": 0.40124444444444446, + "grad_norm": 0.3408026567985287, + "learning_rate": 0.00013602307769827084, + "loss": 0.6105, + "step": 2257 + }, + { + "epoch": 0.40142222222222224, + "grad_norm": 0.36604065559847915, + "learning_rate": 0.0001359693569890371, + "loss": 0.6153, + "step": 2258 + }, + { + "epoch": 0.4016, + "grad_norm": 0.39299322092358724, + "learning_rate": 0.0001359156243541087, + "loss": 0.6807, + "step": 2259 + }, + { + "epoch": 0.4017777777777778, + "grad_norm": 0.3560902742072705, + "learning_rate": 0.00013586187981130086, + "loss": 0.6425, + "step": 2260 + }, + { + "epoch": 0.40195555555555557, + "grad_norm": 0.39317614378135457, + "learning_rate": 0.00013580812337843262, + "loss": 0.6101, + "step": 2261 + }, + { + "epoch": 0.40213333333333334, + "grad_norm": 0.3843783502794162, + "learning_rate": 0.00013575435507332697, + "loss": 0.6108, + "step": 2262 + }, + { + "epoch": 0.4023111111111111, + "grad_norm": 0.3688348767071207, + "learning_rate": 0.0001357005749138109, + "loss": 0.6007, + "step": 2263 + }, + { + "epoch": 0.4024888888888889, + "grad_norm": 0.3680543503224331, + "learning_rate": 0.00013564678291771534, + "loss": 0.6601, + "step": 2264 + }, + { + "epoch": 0.4026666666666667, + "grad_norm": 0.37164803022363313, + "learning_rate": 0.00013559297910287508, + "loss": 0.6081, + "step": 2265 + }, + { + "epoch": 0.40284444444444445, + "grad_norm": 0.39621972978569014, + "learning_rate": 0.00013553916348712884, + "loss": 0.6391, + "step": 2266 + }, + { + "epoch": 0.4030222222222222, + "grad_norm": 0.3689725631616896, + "learning_rate": 0.0001354853360883193, + "loss": 0.6147, + "step": 2267 + }, + { + "epoch": 0.4032, + "grad_norm": 0.36287847111403354, + "learning_rate": 0.000135431496924293, + "loss": 0.5916, + "step": 2268 + }, + { + "epoch": 0.4033777777777778, + "grad_norm": 0.3658611910294385, + "learning_rate": 0.00013537764601290037, + "loss": 0.636, + "step": 2269 + }, + { + "epoch": 0.40355555555555556, + "grad_norm": 0.35927960370283263, + "learning_rate": 0.00013532378337199582, + "loss": 0.6168, + "step": 2270 + }, + { + "epoch": 0.40373333333333333, + "grad_norm": 0.3471200168272243, + "learning_rate": 0.00013526990901943756, + "loss": 0.6227, + "step": 2271 + }, + { + "epoch": 0.4039111111111111, + "grad_norm": 0.36451245300985974, + "learning_rate": 0.0001352160229730877, + "loss": 0.6878, + "step": 2272 + }, + { + "epoch": 0.4040888888888889, + "grad_norm": 0.36247825343298284, + "learning_rate": 0.00013516212525081222, + "loss": 0.6112, + "step": 2273 + }, + { + "epoch": 0.40426666666666666, + "grad_norm": 0.33912327267062775, + "learning_rate": 0.00013510821587048107, + "loss": 0.5718, + "step": 2274 + }, + { + "epoch": 0.40444444444444444, + "grad_norm": 0.3534102851106006, + "learning_rate": 0.00013505429484996788, + "loss": 0.6049, + "step": 2275 + }, + { + "epoch": 0.4046222222222222, + "grad_norm": 0.36210789811782823, + "learning_rate": 0.00013500036220715034, + "loss": 0.6086, + "step": 2276 + }, + { + "epoch": 0.4048, + "grad_norm": 0.3610783069549878, + "learning_rate": 0.00013494641795990986, + "loss": 0.6525, + "step": 2277 + }, + { + "epoch": 0.40497777777777777, + "grad_norm": 0.3456435218318268, + "learning_rate": 0.00013489246212613172, + "loss": 0.6235, + "step": 2278 + }, + { + "epoch": 0.40515555555555555, + "grad_norm": 0.3491123928818089, + "learning_rate": 0.0001348384947237051, + "loss": 0.6133, + "step": 2279 + }, + { + "epoch": 0.4053333333333333, + "grad_norm": 0.4347292146868507, + "learning_rate": 0.00013478451577052293, + "loss": 0.6013, + "step": 2280 + }, + { + "epoch": 0.4055111111111111, + "grad_norm": 0.3521540151615664, + "learning_rate": 0.00013473052528448201, + "loss": 0.6306, + "step": 2281 + }, + { + "epoch": 0.4056888888888889, + "grad_norm": 0.3657452480559006, + "learning_rate": 0.00013467652328348306, + "loss": 0.6575, + "step": 2282 + }, + { + "epoch": 0.40586666666666665, + "grad_norm": 0.377056831347542, + "learning_rate": 0.00013462250978543044, + "loss": 0.6526, + "step": 2283 + }, + { + "epoch": 0.40604444444444443, + "grad_norm": 0.35899665506459566, + "learning_rate": 0.00013456848480823238, + "loss": 0.6077, + "step": 2284 + }, + { + "epoch": 0.4062222222222222, + "grad_norm": 0.3489345735334524, + "learning_rate": 0.000134514448369801, + "loss": 0.6295, + "step": 2285 + }, + { + "epoch": 0.4064, + "grad_norm": 0.386870182272734, + "learning_rate": 0.00013446040048805218, + "loss": 0.6722, + "step": 2286 + }, + { + "epoch": 0.40657777777777776, + "grad_norm": 0.3520920670584741, + "learning_rate": 0.00013440634118090552, + "loss": 0.594, + "step": 2287 + }, + { + "epoch": 0.40675555555555554, + "grad_norm": 0.3456438821828521, + "learning_rate": 0.0001343522704662845, + "loss": 0.5906, + "step": 2288 + }, + { + "epoch": 0.4069333333333333, + "grad_norm": 0.37167197979109773, + "learning_rate": 0.0001342981883621163, + "loss": 0.6793, + "step": 2289 + }, + { + "epoch": 0.4071111111111111, + "grad_norm": 0.3704497625024607, + "learning_rate": 0.000134244094886332, + "loss": 0.69, + "step": 2290 + }, + { + "epoch": 0.40728888888888887, + "grad_norm": 0.35927984762132265, + "learning_rate": 0.00013418999005686635, + "loss": 0.6429, + "step": 2291 + }, + { + "epoch": 0.40746666666666664, + "grad_norm": 0.4003445829647246, + "learning_rate": 0.00013413587389165784, + "loss": 0.6859, + "step": 2292 + }, + { + "epoch": 0.4076444444444444, + "grad_norm": 0.36165890050622507, + "learning_rate": 0.0001340817464086488, + "loss": 0.6344, + "step": 2293 + }, + { + "epoch": 0.4078222222222222, + "grad_norm": 0.38020801064425364, + "learning_rate": 0.00013402760762578527, + "loss": 0.6616, + "step": 2294 + }, + { + "epoch": 0.408, + "grad_norm": 0.345563933181974, + "learning_rate": 0.00013397345756101708, + "loss": 0.6557, + "step": 2295 + }, + { + "epoch": 0.40817777777777775, + "grad_norm": 0.34364179737224065, + "learning_rate": 0.00013391929623229773, + "loss": 0.6098, + "step": 2296 + }, + { + "epoch": 0.4083555555555556, + "grad_norm": 0.36496273070797147, + "learning_rate": 0.0001338651236575845, + "loss": 0.6298, + "step": 2297 + }, + { + "epoch": 0.40853333333333336, + "grad_norm": 0.3367366001547099, + "learning_rate": 0.00013381093985483837, + "loss": 0.6016, + "step": 2298 + }, + { + "epoch": 0.40871111111111114, + "grad_norm": 0.33798538043977244, + "learning_rate": 0.0001337567448420241, + "loss": 0.5834, + "step": 2299 + }, + { + "epoch": 0.4088888888888889, + "grad_norm": 0.3486084067281326, + "learning_rate": 0.00013370253863711007, + "loss": 0.6306, + "step": 2300 + }, + { + "epoch": 0.4090666666666667, + "grad_norm": 0.3327238937777495, + "learning_rate": 0.0001336483212580685, + "loss": 0.5926, + "step": 2301 + }, + { + "epoch": 0.40924444444444447, + "grad_norm": 0.3723619122906232, + "learning_rate": 0.0001335940927228752, + "loss": 0.6631, + "step": 2302 + }, + { + "epoch": 0.40942222222222224, + "grad_norm": 0.3569229877435541, + "learning_rate": 0.00013353985304950973, + "loss": 0.6086, + "step": 2303 + }, + { + "epoch": 0.4096, + "grad_norm": 0.3825508026363746, + "learning_rate": 0.00013348560225595534, + "loss": 0.6668, + "step": 2304 + }, + { + "epoch": 0.4097777777777778, + "grad_norm": 0.3259140115763881, + "learning_rate": 0.00013343134036019895, + "loss": 0.5879, + "step": 2305 + }, + { + "epoch": 0.4099555555555556, + "grad_norm": 0.3696174236507997, + "learning_rate": 0.0001333770673802312, + "loss": 0.6326, + "step": 2306 + }, + { + "epoch": 0.41013333333333335, + "grad_norm": 0.3726843674946877, + "learning_rate": 0.00013332278333404637, + "loss": 0.67, + "step": 2307 + }, + { + "epoch": 0.4103111111111111, + "grad_norm": 0.360137440913349, + "learning_rate": 0.00013326848823964243, + "loss": 0.6237, + "step": 2308 + }, + { + "epoch": 0.4104888888888889, + "grad_norm": 0.3538565241839011, + "learning_rate": 0.00013321418211502091, + "loss": 0.6366, + "step": 2309 + }, + { + "epoch": 0.4106666666666667, + "grad_norm": 0.36794012133754433, + "learning_rate": 0.0001331598649781872, + "loss": 0.6771, + "step": 2310 + }, + { + "epoch": 0.41084444444444446, + "grad_norm": 0.3660473619849432, + "learning_rate": 0.0001331055368471502, + "loss": 0.6541, + "step": 2311 + }, + { + "epoch": 0.41102222222222223, + "grad_norm": 0.34333918895498594, + "learning_rate": 0.00013305119773992247, + "loss": 0.618, + "step": 2312 + }, + { + "epoch": 0.4112, + "grad_norm": 0.34432545065198905, + "learning_rate": 0.0001329968476745202, + "loss": 0.6535, + "step": 2313 + }, + { + "epoch": 0.4113777777777778, + "grad_norm": 0.3499355029370304, + "learning_rate": 0.00013294248666896328, + "loss": 0.6242, + "step": 2314 + }, + { + "epoch": 0.41155555555555556, + "grad_norm": 0.3516993466408435, + "learning_rate": 0.00013288811474127516, + "loss": 0.61, + "step": 2315 + }, + { + "epoch": 0.41173333333333334, + "grad_norm": 0.33801311156422703, + "learning_rate": 0.00013283373190948295, + "loss": 0.6399, + "step": 2316 + }, + { + "epoch": 0.4119111111111111, + "grad_norm": 0.3453902813352448, + "learning_rate": 0.0001327793381916173, + "loss": 0.6186, + "step": 2317 + }, + { + "epoch": 0.4120888888888889, + "grad_norm": 0.38208799243876135, + "learning_rate": 0.00013272493360571262, + "loss": 0.6297, + "step": 2318 + }, + { + "epoch": 0.41226666666666667, + "grad_norm": 0.357668395257919, + "learning_rate": 0.0001326705181698068, + "loss": 0.6312, + "step": 2319 + }, + { + "epoch": 0.41244444444444445, + "grad_norm": 0.3665682879377501, + "learning_rate": 0.00013261609190194136, + "loss": 0.6348, + "step": 2320 + }, + { + "epoch": 0.4126222222222222, + "grad_norm": 0.3551999719409832, + "learning_rate": 0.00013256165482016137, + "loss": 0.6634, + "step": 2321 + }, + { + "epoch": 0.4128, + "grad_norm": 0.3686600288194294, + "learning_rate": 0.00013250720694251556, + "loss": 0.6128, + "step": 2322 + }, + { + "epoch": 0.4129777777777778, + "grad_norm": 0.35572854169481694, + "learning_rate": 0.0001324527482870562, + "loss": 0.6183, + "step": 2323 + }, + { + "epoch": 0.41315555555555555, + "grad_norm": 0.34935095978295044, + "learning_rate": 0.00013239827887183916, + "loss": 0.6093, + "step": 2324 + }, + { + "epoch": 0.41333333333333333, + "grad_norm": 0.36261703008313345, + "learning_rate": 0.0001323437987149238, + "loss": 0.6204, + "step": 2325 + }, + { + "epoch": 0.4135111111111111, + "grad_norm": 0.3459300513653912, + "learning_rate": 0.0001322893078343732, + "loss": 0.6186, + "step": 2326 + }, + { + "epoch": 0.4136888888888889, + "grad_norm": 0.34976199172081734, + "learning_rate": 0.0001322348062482538, + "loss": 0.6479, + "step": 2327 + }, + { + "epoch": 0.41386666666666666, + "grad_norm": 0.348935342106647, + "learning_rate": 0.0001321802939746357, + "loss": 0.6053, + "step": 2328 + }, + { + "epoch": 0.41404444444444444, + "grad_norm": 0.37180153525586357, + "learning_rate": 0.00013212577103159258, + "loss": 0.6899, + "step": 2329 + }, + { + "epoch": 0.4142222222222222, + "grad_norm": 0.3293784470438587, + "learning_rate": 0.00013207123743720156, + "loss": 0.5666, + "step": 2330 + }, + { + "epoch": 0.4144, + "grad_norm": 0.3268395656725775, + "learning_rate": 0.00013201669320954333, + "loss": 0.621, + "step": 2331 + }, + { + "epoch": 0.41457777777777777, + "grad_norm": 0.34569902660741436, + "learning_rate": 0.00013196213836670214, + "loss": 0.6676, + "step": 2332 + }, + { + "epoch": 0.41475555555555554, + "grad_norm": 0.3356318153386259, + "learning_rate": 0.0001319075729267657, + "loss": 0.6, + "step": 2333 + }, + { + "epoch": 0.4149333333333333, + "grad_norm": 0.34386109306069956, + "learning_rate": 0.00013185299690782526, + "loss": 0.6226, + "step": 2334 + }, + { + "epoch": 0.4151111111111111, + "grad_norm": 0.3623760791532519, + "learning_rate": 0.00013179841032797565, + "loss": 0.6634, + "step": 2335 + }, + { + "epoch": 0.4152888888888889, + "grad_norm": 0.3757665488245912, + "learning_rate": 0.00013174381320531505, + "loss": 0.6094, + "step": 2336 + }, + { + "epoch": 0.41546666666666665, + "grad_norm": 0.3481347352138883, + "learning_rate": 0.00013168920555794525, + "loss": 0.654, + "step": 2337 + }, + { + "epoch": 0.4156444444444444, + "grad_norm": 0.3452942094953099, + "learning_rate": 0.00013163458740397149, + "loss": 0.623, + "step": 2338 + }, + { + "epoch": 0.4158222222222222, + "grad_norm": 0.3891500238233552, + "learning_rate": 0.0001315799587615025, + "loss": 0.6763, + "step": 2339 + }, + { + "epoch": 0.416, + "grad_norm": 0.37927359653513915, + "learning_rate": 0.00013152531964865052, + "loss": 0.6334, + "step": 2340 + }, + { + "epoch": 0.41617777777777776, + "grad_norm": 0.3659354644333141, + "learning_rate": 0.0001314706700835312, + "loss": 0.6467, + "step": 2341 + }, + { + "epoch": 0.41635555555555553, + "grad_norm": 0.3602640145481248, + "learning_rate": 0.00013141601008426372, + "loss": 0.6565, + "step": 2342 + }, + { + "epoch": 0.4165333333333333, + "grad_norm": 0.3573937935188667, + "learning_rate": 0.00013136133966897064, + "loss": 0.6413, + "step": 2343 + }, + { + "epoch": 0.4167111111111111, + "grad_norm": 0.3399284147387536, + "learning_rate": 0.00013130665885577805, + "loss": 0.6374, + "step": 2344 + }, + { + "epoch": 0.41688888888888886, + "grad_norm": 0.3646181995748407, + "learning_rate": 0.00013125196766281544, + "loss": 0.6584, + "step": 2345 + }, + { + "epoch": 0.41706666666666664, + "grad_norm": 0.35329182669007025, + "learning_rate": 0.00013119726610821576, + "loss": 0.645, + "step": 2346 + }, + { + "epoch": 0.4172444444444444, + "grad_norm": 0.3704745300532282, + "learning_rate": 0.0001311425542101154, + "loss": 0.638, + "step": 2347 + }, + { + "epoch": 0.4174222222222222, + "grad_norm": 0.36525748658024687, + "learning_rate": 0.00013108783198665416, + "loss": 0.6985, + "step": 2348 + }, + { + "epoch": 0.4176, + "grad_norm": 0.35462538691231243, + "learning_rate": 0.0001310330994559753, + "loss": 0.6251, + "step": 2349 + }, + { + "epoch": 0.4177777777777778, + "grad_norm": 0.3269515104611661, + "learning_rate": 0.00013097835663622545, + "loss": 0.5992, + "step": 2350 + }, + { + "epoch": 0.4179555555555556, + "grad_norm": 0.3640316349009023, + "learning_rate": 0.00013092360354555467, + "loss": 0.5984, + "step": 2351 + }, + { + "epoch": 0.41813333333333336, + "grad_norm": 0.37482757256302207, + "learning_rate": 0.00013086884020211645, + "loss": 0.6042, + "step": 2352 + }, + { + "epoch": 0.41831111111111113, + "grad_norm": 0.3805531636261812, + "learning_rate": 0.00013081406662406763, + "loss": 0.653, + "step": 2353 + }, + { + "epoch": 0.4184888888888889, + "grad_norm": 0.3692067429214998, + "learning_rate": 0.00013075928282956853, + "loss": 0.6345, + "step": 2354 + }, + { + "epoch": 0.4186666666666667, + "grad_norm": 0.3677934881832081, + "learning_rate": 0.00013070448883678275, + "loss": 0.6303, + "step": 2355 + }, + { + "epoch": 0.41884444444444446, + "grad_norm": 0.3861103220341804, + "learning_rate": 0.0001306496846638773, + "loss": 0.661, + "step": 2356 + }, + { + "epoch": 0.41902222222222224, + "grad_norm": 0.37324704760877364, + "learning_rate": 0.00013059487032902268, + "loss": 0.6391, + "step": 2357 + }, + { + "epoch": 0.4192, + "grad_norm": 0.3506326891064393, + "learning_rate": 0.00013054004585039258, + "loss": 0.6097, + "step": 2358 + }, + { + "epoch": 0.4193777777777778, + "grad_norm": 0.37755571556437467, + "learning_rate": 0.00013048521124616418, + "loss": 0.682, + "step": 2359 + }, + { + "epoch": 0.41955555555555557, + "grad_norm": 0.35562443904162094, + "learning_rate": 0.00013043036653451794, + "loss": 0.6072, + "step": 2360 + }, + { + "epoch": 0.41973333333333335, + "grad_norm": 0.34694886253349744, + "learning_rate": 0.00013037551173363774, + "loss": 0.6163, + "step": 2361 + }, + { + "epoch": 0.4199111111111111, + "grad_norm": 0.3578936885901449, + "learning_rate": 0.00013032064686171075, + "loss": 0.614, + "step": 2362 + }, + { + "epoch": 0.4200888888888889, + "grad_norm": 0.356159775126546, + "learning_rate": 0.0001302657719369275, + "loss": 0.6326, + "step": 2363 + }, + { + "epoch": 0.4202666666666667, + "grad_norm": 0.36086237377679226, + "learning_rate": 0.0001302108869774819, + "loss": 0.692, + "step": 2364 + }, + { + "epoch": 0.42044444444444445, + "grad_norm": 0.374333320230105, + "learning_rate": 0.00013015599200157107, + "loss": 0.6438, + "step": 2365 + }, + { + "epoch": 0.42062222222222223, + "grad_norm": 0.35999442735301823, + "learning_rate": 0.00013010108702739558, + "loss": 0.6396, + "step": 2366 + }, + { + "epoch": 0.4208, + "grad_norm": 0.37599794992759844, + "learning_rate": 0.00013004617207315922, + "loss": 0.6418, + "step": 2367 + }, + { + "epoch": 0.4209777777777778, + "grad_norm": 0.3519403982606927, + "learning_rate": 0.00012999124715706915, + "loss": 0.6289, + "step": 2368 + }, + { + "epoch": 0.42115555555555556, + "grad_norm": 0.3369286143577679, + "learning_rate": 0.00012993631229733582, + "loss": 0.5936, + "step": 2369 + }, + { + "epoch": 0.42133333333333334, + "grad_norm": 0.34394477663961104, + "learning_rate": 0.00012988136751217291, + "loss": 0.5947, + "step": 2370 + }, + { + "epoch": 0.4215111111111111, + "grad_norm": 0.3733845858367674, + "learning_rate": 0.0001298264128197975, + "loss": 0.5753, + "step": 2371 + }, + { + "epoch": 0.4216888888888889, + "grad_norm": 0.3591871915226606, + "learning_rate": 0.0001297714482384299, + "loss": 0.6199, + "step": 2372 + }, + { + "epoch": 0.42186666666666667, + "grad_norm": 0.35250390532830783, + "learning_rate": 0.00012971647378629366, + "loss": 0.6679, + "step": 2373 + }, + { + "epoch": 0.42204444444444444, + "grad_norm": 0.3553128954348677, + "learning_rate": 0.00012966148948161569, + "loss": 0.656, + "step": 2374 + }, + { + "epoch": 0.4222222222222222, + "grad_norm": 0.36651801744600043, + "learning_rate": 0.00012960649534262607, + "loss": 0.5725, + "step": 2375 + }, + { + "epoch": 0.4224, + "grad_norm": 0.36551793616827166, + "learning_rate": 0.00012955149138755821, + "loss": 0.6557, + "step": 2376 + }, + { + "epoch": 0.4225777777777778, + "grad_norm": 0.34508972530139137, + "learning_rate": 0.0001294964776346488, + "loss": 0.6181, + "step": 2377 + }, + { + "epoch": 0.42275555555555555, + "grad_norm": 0.3654008193872436, + "learning_rate": 0.00012944145410213764, + "loss": 0.6579, + "step": 2378 + }, + { + "epoch": 0.42293333333333333, + "grad_norm": 0.35230954792235686, + "learning_rate": 0.00012938642080826795, + "loss": 0.6206, + "step": 2379 + }, + { + "epoch": 0.4231111111111111, + "grad_norm": 0.36609973543745744, + "learning_rate": 0.00012933137777128607, + "loss": 0.5961, + "step": 2380 + }, + { + "epoch": 0.4232888888888889, + "grad_norm": 0.35617888987419094, + "learning_rate": 0.00012927632500944161, + "loss": 0.6365, + "step": 2381 + }, + { + "epoch": 0.42346666666666666, + "grad_norm": 0.36093591553360777, + "learning_rate": 0.00012922126254098735, + "loss": 0.6055, + "step": 2382 + }, + { + "epoch": 0.42364444444444443, + "grad_norm": 0.3344674802417674, + "learning_rate": 0.0001291661903841794, + "loss": 0.6264, + "step": 2383 + }, + { + "epoch": 0.4238222222222222, + "grad_norm": 0.34276341314754455, + "learning_rate": 0.000129111108557277, + "loss": 0.6052, + "step": 2384 + }, + { + "epoch": 0.424, + "grad_norm": 0.39669484073720285, + "learning_rate": 0.00012905601707854255, + "loss": 0.6882, + "step": 2385 + }, + { + "epoch": 0.42417777777777776, + "grad_norm": 0.3754941260882933, + "learning_rate": 0.0001290009159662418, + "loss": 0.6323, + "step": 2386 + }, + { + "epoch": 0.42435555555555554, + "grad_norm": 0.35504806432726915, + "learning_rate": 0.00012894580523864358, + "loss": 0.6488, + "step": 2387 + }, + { + "epoch": 0.4245333333333333, + "grad_norm": 0.35310960734760505, + "learning_rate": 0.0001288906849140199, + "loss": 0.6, + "step": 2388 + }, + { + "epoch": 0.4247111111111111, + "grad_norm": 0.36621650743851425, + "learning_rate": 0.00012883555501064603, + "loss": 0.6616, + "step": 2389 + }, + { + "epoch": 0.42488888888888887, + "grad_norm": 0.3656296247330148, + "learning_rate": 0.0001287804155468004, + "loss": 0.6211, + "step": 2390 + }, + { + "epoch": 0.42506666666666665, + "grad_norm": 0.3627491764165094, + "learning_rate": 0.0001287252665407645, + "loss": 0.5853, + "step": 2391 + }, + { + "epoch": 0.4252444444444444, + "grad_norm": 0.36252998478817616, + "learning_rate": 0.00012867010801082308, + "loss": 0.5652, + "step": 2392 + }, + { + "epoch": 0.4254222222222222, + "grad_norm": 0.3779783758073857, + "learning_rate": 0.0001286149399752641, + "loss": 0.6263, + "step": 2393 + }, + { + "epoch": 0.4256, + "grad_norm": 0.600048124131751, + "learning_rate": 0.00012855976245237854, + "loss": 0.6255, + "step": 2394 + }, + { + "epoch": 0.42577777777777776, + "grad_norm": 0.3981944200306016, + "learning_rate": 0.00012850457546046063, + "loss": 0.7123, + "step": 2395 + }, + { + "epoch": 0.42595555555555553, + "grad_norm": 0.34499464572562755, + "learning_rate": 0.00012844937901780766, + "loss": 0.6232, + "step": 2396 + }, + { + "epoch": 0.4261333333333333, + "grad_norm": 0.35532483125508896, + "learning_rate": 0.00012839417314272015, + "loss": 0.5934, + "step": 2397 + }, + { + "epoch": 0.4263111111111111, + "grad_norm": 0.34412101816347485, + "learning_rate": 0.00012833895785350165, + "loss": 0.5892, + "step": 2398 + }, + { + "epoch": 0.42648888888888886, + "grad_norm": 0.3538756389735311, + "learning_rate": 0.00012828373316845886, + "loss": 0.6319, + "step": 2399 + }, + { + "epoch": 0.4266666666666667, + "grad_norm": 0.34845042454957637, + "learning_rate": 0.00012822849910590166, + "loss": 0.5963, + "step": 2400 + }, + { + "epoch": 0.42684444444444447, + "grad_norm": 0.3620431462516756, + "learning_rate": 0.00012817325568414297, + "loss": 0.6437, + "step": 2401 + }, + { + "epoch": 0.42702222222222225, + "grad_norm": 0.3659724988605635, + "learning_rate": 0.0001281180029214988, + "loss": 0.6484, + "step": 2402 + }, + { + "epoch": 0.4272, + "grad_norm": 0.36758670904619567, + "learning_rate": 0.00012806274083628833, + "loss": 0.5708, + "step": 2403 + }, + { + "epoch": 0.4273777777777778, + "grad_norm": 0.3473949133208065, + "learning_rate": 0.00012800746944683372, + "loss": 0.6593, + "step": 2404 + }, + { + "epoch": 0.4275555555555556, + "grad_norm": 0.3676037859972472, + "learning_rate": 0.00012795218877146035, + "loss": 0.6896, + "step": 2405 + }, + { + "epoch": 0.42773333333333335, + "grad_norm": 0.34847896656952704, + "learning_rate": 0.00012789689882849659, + "loss": 0.6343, + "step": 2406 + }, + { + "epoch": 0.42791111111111113, + "grad_norm": 0.37702515284495397, + "learning_rate": 0.0001278415996362739, + "loss": 0.6818, + "step": 2407 + }, + { + "epoch": 0.4280888888888889, + "grad_norm": 0.36710909666073355, + "learning_rate": 0.0001277862912131268, + "loss": 0.6272, + "step": 2408 + }, + { + "epoch": 0.4282666666666667, + "grad_norm": 0.3663485642828436, + "learning_rate": 0.00012773097357739288, + "loss": 0.6465, + "step": 2409 + }, + { + "epoch": 0.42844444444444446, + "grad_norm": 0.34390668098915267, + "learning_rate": 0.0001276756467474128, + "loss": 0.6155, + "step": 2410 + }, + { + "epoch": 0.42862222222222224, + "grad_norm": 0.3989493163609929, + "learning_rate": 0.0001276203107415303, + "loss": 0.6654, + "step": 2411 + }, + { + "epoch": 0.4288, + "grad_norm": 0.3571062608487482, + "learning_rate": 0.00012756496557809202, + "loss": 0.6202, + "step": 2412 + }, + { + "epoch": 0.4289777777777778, + "grad_norm": 0.37953012267640956, + "learning_rate": 0.0001275096112754478, + "loss": 0.6471, + "step": 2413 + }, + { + "epoch": 0.42915555555555557, + "grad_norm": 0.5038903073517091, + "learning_rate": 0.00012745424785195043, + "loss": 0.6649, + "step": 2414 + }, + { + "epoch": 0.42933333333333334, + "grad_norm": 0.35960086466595215, + "learning_rate": 0.00012739887532595574, + "loss": 0.6343, + "step": 2415 + }, + { + "epoch": 0.4295111111111111, + "grad_norm": 0.3701250126537466, + "learning_rate": 0.00012734349371582254, + "loss": 0.6698, + "step": 2416 + }, + { + "epoch": 0.4296888888888889, + "grad_norm": 0.39305014199113314, + "learning_rate": 0.0001272881030399127, + "loss": 0.6831, + "step": 2417 + }, + { + "epoch": 0.4298666666666667, + "grad_norm": 0.37114882218702244, + "learning_rate": 0.00012723270331659113, + "loss": 0.6398, + "step": 2418 + }, + { + "epoch": 0.43004444444444445, + "grad_norm": 0.36658739490811704, + "learning_rate": 0.00012717729456422565, + "loss": 0.656, + "step": 2419 + }, + { + "epoch": 0.43022222222222223, + "grad_norm": 0.3519966513320916, + "learning_rate": 0.00012712187680118713, + "loss": 0.6201, + "step": 2420 + }, + { + "epoch": 0.4304, + "grad_norm": 0.3551711251204549, + "learning_rate": 0.00012706645004584936, + "loss": 0.6228, + "step": 2421 + }, + { + "epoch": 0.4305777777777778, + "grad_norm": 0.35281476469675455, + "learning_rate": 0.00012701101431658924, + "loss": 0.6372, + "step": 2422 + }, + { + "epoch": 0.43075555555555556, + "grad_norm": 0.37113668378033865, + "learning_rate": 0.00012695556963178653, + "loss": 0.6471, + "step": 2423 + }, + { + "epoch": 0.43093333333333333, + "grad_norm": 0.3541117125805001, + "learning_rate": 0.000126900116009824, + "loss": 0.6214, + "step": 2424 + }, + { + "epoch": 0.4311111111111111, + "grad_norm": 0.41086519789286663, + "learning_rate": 0.00012684465346908742, + "loss": 0.6837, + "step": 2425 + }, + { + "epoch": 0.4312888888888889, + "grad_norm": 0.3569640597504126, + "learning_rate": 0.0001267891820279654, + "loss": 0.634, + "step": 2426 + }, + { + "epoch": 0.43146666666666667, + "grad_norm": 0.40060018196959896, + "learning_rate": 0.00012673370170484968, + "loss": 0.6263, + "step": 2427 + }, + { + "epoch": 0.43164444444444444, + "grad_norm": 0.34122440907489915, + "learning_rate": 0.00012667821251813479, + "loss": 0.5851, + "step": 2428 + }, + { + "epoch": 0.4318222222222222, + "grad_norm": 0.35860201472900305, + "learning_rate": 0.00012662271448621822, + "loss": 0.6237, + "step": 2429 + }, + { + "epoch": 0.432, + "grad_norm": 0.38923528850450995, + "learning_rate": 0.0001265672076275005, + "loss": 0.634, + "step": 2430 + }, + { + "epoch": 0.43217777777777777, + "grad_norm": 0.37793130135844377, + "learning_rate": 0.00012651169196038496, + "loss": 0.6842, + "step": 2431 + }, + { + "epoch": 0.43235555555555555, + "grad_norm": 0.3483716650887419, + "learning_rate": 0.0001264561675032779, + "loss": 0.6139, + "step": 2432 + }, + { + "epoch": 0.4325333333333333, + "grad_norm": 0.3525398083572766, + "learning_rate": 0.00012640063427458856, + "loss": 0.6484, + "step": 2433 + }, + { + "epoch": 0.4327111111111111, + "grad_norm": 0.3385149688034773, + "learning_rate": 0.00012634509229272908, + "loss": 0.5935, + "step": 2434 + }, + { + "epoch": 0.4328888888888889, + "grad_norm": 0.34811250059530324, + "learning_rate": 0.0001262895415761145, + "loss": 0.6183, + "step": 2435 + }, + { + "epoch": 0.43306666666666666, + "grad_norm": 0.3316129669981448, + "learning_rate": 0.00012623398214316268, + "loss": 0.6097, + "step": 2436 + }, + { + "epoch": 0.43324444444444443, + "grad_norm": 0.3624292741374519, + "learning_rate": 0.00012617841401229446, + "loss": 0.622, + "step": 2437 + }, + { + "epoch": 0.4334222222222222, + "grad_norm": 0.3673924536245363, + "learning_rate": 0.00012612283720193356, + "loss": 0.6355, + "step": 2438 + }, + { + "epoch": 0.4336, + "grad_norm": 0.36232833272197146, + "learning_rate": 0.00012606725173050653, + "loss": 0.6143, + "step": 2439 + }, + { + "epoch": 0.43377777777777776, + "grad_norm": 0.38633378002603624, + "learning_rate": 0.00012601165761644286, + "loss": 0.6009, + "step": 2440 + }, + { + "epoch": 0.43395555555555554, + "grad_norm": 0.4596118286035029, + "learning_rate": 0.00012595605487817482, + "loss": 0.6184, + "step": 2441 + }, + { + "epoch": 0.4341333333333333, + "grad_norm": 0.36985006980277124, + "learning_rate": 0.00012590044353413758, + "loss": 0.6598, + "step": 2442 + }, + { + "epoch": 0.4343111111111111, + "grad_norm": 0.36243396049438226, + "learning_rate": 0.0001258448236027692, + "loss": 0.6999, + "step": 2443 + }, + { + "epoch": 0.43448888888888887, + "grad_norm": 0.3789757929823581, + "learning_rate": 0.0001257891951025105, + "loss": 0.6688, + "step": 2444 + }, + { + "epoch": 0.43466666666666665, + "grad_norm": 0.4359100608013618, + "learning_rate": 0.00012573355805180523, + "loss": 0.5645, + "step": 2445 + }, + { + "epoch": 0.4348444444444444, + "grad_norm": 0.3495815058136773, + "learning_rate": 0.00012567791246909994, + "loss": 0.6598, + "step": 2446 + }, + { + "epoch": 0.4350222222222222, + "grad_norm": 0.3666436268806853, + "learning_rate": 0.000125622258372844, + "loss": 0.6545, + "step": 2447 + }, + { + "epoch": 0.4352, + "grad_norm": 0.3647740773109535, + "learning_rate": 0.0001255665957814896, + "loss": 0.6018, + "step": 2448 + }, + { + "epoch": 0.43537777777777775, + "grad_norm": 0.3419448831609855, + "learning_rate": 0.00012551092471349177, + "loss": 0.6042, + "step": 2449 + }, + { + "epoch": 0.43555555555555553, + "grad_norm": 0.40815142500533824, + "learning_rate": 0.00012545524518730835, + "loss": 0.6712, + "step": 2450 + }, + { + "epoch": 0.4357333333333333, + "grad_norm": 0.3696712101640166, + "learning_rate": 0.0001253995572213999, + "loss": 0.6243, + "step": 2451 + }, + { + "epoch": 0.43591111111111114, + "grad_norm": 0.3623882862598176, + "learning_rate": 0.00012534386083422997, + "loss": 0.5978, + "step": 2452 + }, + { + "epoch": 0.4360888888888889, + "grad_norm": 0.3864446537210954, + "learning_rate": 0.0001252881560442647, + "loss": 0.6683, + "step": 2453 + }, + { + "epoch": 0.4362666666666667, + "grad_norm": 0.36650734588962697, + "learning_rate": 0.00012523244286997309, + "loss": 0.64, + "step": 2454 + }, + { + "epoch": 0.43644444444444447, + "grad_norm": 0.36321631434588475, + "learning_rate": 0.00012517672132982693, + "loss": 0.6238, + "step": 2455 + }, + { + "epoch": 0.43662222222222224, + "grad_norm": 0.35887830476599214, + "learning_rate": 0.00012512099144230084, + "loss": 0.6964, + "step": 2456 + }, + { + "epoch": 0.4368, + "grad_norm": 0.3523114257936633, + "learning_rate": 0.00012506525322587207, + "loss": 0.6043, + "step": 2457 + }, + { + "epoch": 0.4369777777777778, + "grad_norm": 0.3396440867494397, + "learning_rate": 0.00012500950669902075, + "loss": 0.6065, + "step": 2458 + }, + { + "epoch": 0.4371555555555556, + "grad_norm": 0.34500582618109993, + "learning_rate": 0.00012495375188022973, + "loss": 0.6218, + "step": 2459 + }, + { + "epoch": 0.43733333333333335, + "grad_norm": 0.35652736286516173, + "learning_rate": 0.0001248979887879846, + "loss": 0.6105, + "step": 2460 + }, + { + "epoch": 0.43751111111111113, + "grad_norm": 0.3532454915551023, + "learning_rate": 0.00012484221744077367, + "loss": 0.6501, + "step": 2461 + }, + { + "epoch": 0.4376888888888889, + "grad_norm": 0.3751697591005347, + "learning_rate": 0.00012478643785708806, + "loss": 0.6109, + "step": 2462 + }, + { + "epoch": 0.4378666666666667, + "grad_norm": 0.3576413314460664, + "learning_rate": 0.00012473065005542155, + "loss": 0.6042, + "step": 2463 + }, + { + "epoch": 0.43804444444444446, + "grad_norm": 0.36242225247095117, + "learning_rate": 0.00012467485405427068, + "loss": 0.6535, + "step": 2464 + }, + { + "epoch": 0.43822222222222224, + "grad_norm": 0.36517761020329187, + "learning_rate": 0.00012461904987213468, + "loss": 0.655, + "step": 2465 + }, + { + "epoch": 0.4384, + "grad_norm": 0.36253256730110056, + "learning_rate": 0.00012456323752751554, + "loss": 0.6216, + "step": 2466 + }, + { + "epoch": 0.4385777777777778, + "grad_norm": 0.35740430775994647, + "learning_rate": 0.00012450741703891788, + "loss": 0.6633, + "step": 2467 + }, + { + "epoch": 0.43875555555555557, + "grad_norm": 0.34728820177508646, + "learning_rate": 0.0001244515884248491, + "loss": 0.5837, + "step": 2468 + }, + { + "epoch": 0.43893333333333334, + "grad_norm": 0.37899085198417637, + "learning_rate": 0.00012439575170381927, + "loss": 0.6687, + "step": 2469 + }, + { + "epoch": 0.4391111111111111, + "grad_norm": 0.380438314635491, + "learning_rate": 0.00012433990689434112, + "loss": 0.6858, + "step": 2470 + }, + { + "epoch": 0.4392888888888889, + "grad_norm": 0.3597587224277307, + "learning_rate": 0.0001242840540149301, + "loss": 0.601, + "step": 2471 + }, + { + "epoch": 0.43946666666666667, + "grad_norm": 0.42792811192738855, + "learning_rate": 0.0001242281930841043, + "loss": 0.5907, + "step": 2472 + }, + { + "epoch": 0.43964444444444445, + "grad_norm": 0.3511489103832953, + "learning_rate": 0.00012417232412038448, + "loss": 0.6167, + "step": 2473 + }, + { + "epoch": 0.4398222222222222, + "grad_norm": 0.348201520590711, + "learning_rate": 0.0001241164471422941, + "loss": 0.5894, + "step": 2474 + }, + { + "epoch": 0.44, + "grad_norm": 0.34978829020802815, + "learning_rate": 0.00012406056216835928, + "loss": 0.6216, + "step": 2475 + }, + { + "epoch": 0.4401777777777778, + "grad_norm": 0.3683729504483215, + "learning_rate": 0.00012400466921710874, + "loss": 0.5906, + "step": 2476 + }, + { + "epoch": 0.44035555555555556, + "grad_norm": 0.35557258402876357, + "learning_rate": 0.00012394876830707386, + "loss": 0.5703, + "step": 2477 + }, + { + "epoch": 0.44053333333333333, + "grad_norm": 0.355893979286471, + "learning_rate": 0.0001238928594567887, + "loss": 0.6125, + "step": 2478 + }, + { + "epoch": 0.4407111111111111, + "grad_norm": 0.3575680617367579, + "learning_rate": 0.00012383694268478993, + "loss": 0.6494, + "step": 2479 + }, + { + "epoch": 0.4408888888888889, + "grad_norm": 0.36079630985236694, + "learning_rate": 0.0001237810180096168, + "loss": 0.6612, + "step": 2480 + }, + { + "epoch": 0.44106666666666666, + "grad_norm": 0.36708244051107575, + "learning_rate": 0.0001237250854498112, + "loss": 0.6396, + "step": 2481 + }, + { + "epoch": 0.44124444444444444, + "grad_norm": 0.3583807231088781, + "learning_rate": 0.00012366914502391776, + "loss": 0.6217, + "step": 2482 + }, + { + "epoch": 0.4414222222222222, + "grad_norm": 0.34840330403646036, + "learning_rate": 0.0001236131967504835, + "loss": 0.6183, + "step": 2483 + }, + { + "epoch": 0.4416, + "grad_norm": 0.35602707774502895, + "learning_rate": 0.00012355724064805823, + "loss": 0.6347, + "step": 2484 + }, + { + "epoch": 0.44177777777777777, + "grad_norm": 0.3468948554115569, + "learning_rate": 0.00012350127673519426, + "loss": 0.6358, + "step": 2485 + }, + { + "epoch": 0.44195555555555555, + "grad_norm": 0.3723036438014148, + "learning_rate": 0.00012344530503044648, + "loss": 0.6288, + "step": 2486 + }, + { + "epoch": 0.4421333333333333, + "grad_norm": 0.39322452675440267, + "learning_rate": 0.00012338932555237242, + "loss": 0.6361, + "step": 2487 + }, + { + "epoch": 0.4423111111111111, + "grad_norm": 0.35577272957813433, + "learning_rate": 0.00012333333831953216, + "loss": 0.5971, + "step": 2488 + }, + { + "epoch": 0.4424888888888889, + "grad_norm": 0.3583997104842971, + "learning_rate": 0.00012327734335048837, + "loss": 0.6262, + "step": 2489 + }, + { + "epoch": 0.44266666666666665, + "grad_norm": 0.34672771044651235, + "learning_rate": 0.0001232213406638062, + "loss": 0.6734, + "step": 2490 + }, + { + "epoch": 0.44284444444444443, + "grad_norm": 0.344755464929935, + "learning_rate": 0.00012316533027805353, + "loss": 0.6225, + "step": 2491 + }, + { + "epoch": 0.4430222222222222, + "grad_norm": 0.3703909236340882, + "learning_rate": 0.0001231093122118006, + "loss": 0.6603, + "step": 2492 + }, + { + "epoch": 0.4432, + "grad_norm": 0.3637871650426539, + "learning_rate": 0.00012305328648362028, + "loss": 0.6046, + "step": 2493 + }, + { + "epoch": 0.44337777777777776, + "grad_norm": 0.3634852893036694, + "learning_rate": 0.00012299725311208808, + "loss": 0.638, + "step": 2494 + }, + { + "epoch": 0.44355555555555554, + "grad_norm": 0.3554437317670213, + "learning_rate": 0.00012294121211578184, + "loss": 0.606, + "step": 2495 + }, + { + "epoch": 0.4437333333333333, + "grad_norm": 0.36010220346759225, + "learning_rate": 0.00012288516351328208, + "loss": 0.6202, + "step": 2496 + }, + { + "epoch": 0.4439111111111111, + "grad_norm": 0.3533957637130856, + "learning_rate": 0.0001228291073231718, + "loss": 0.6206, + "step": 2497 + }, + { + "epoch": 0.44408888888888887, + "grad_norm": 0.36553838722476206, + "learning_rate": 0.00012277304356403656, + "loss": 0.6145, + "step": 2498 + }, + { + "epoch": 0.44426666666666664, + "grad_norm": 0.3600575736028397, + "learning_rate": 0.0001227169722544643, + "loss": 0.5992, + "step": 2499 + }, + { + "epoch": 0.4444444444444444, + "grad_norm": 0.38262337271538, + "learning_rate": 0.0001226608934130456, + "loss": 0.6422, + "step": 2500 + }, + { + "epoch": 0.4446222222222222, + "grad_norm": 0.35309795321313664, + "learning_rate": 0.0001226048070583735, + "loss": 0.6093, + "step": 2501 + }, + { + "epoch": 0.4448, + "grad_norm": 0.3537591815446702, + "learning_rate": 0.00012254871320904347, + "loss": 0.5859, + "step": 2502 + }, + { + "epoch": 0.4449777777777778, + "grad_norm": 0.38955417158894107, + "learning_rate": 0.0001224926118836535, + "loss": 0.6054, + "step": 2503 + }, + { + "epoch": 0.4451555555555556, + "grad_norm": 0.3880502150274596, + "learning_rate": 0.00012243650310080412, + "loss": 0.6261, + "step": 2504 + }, + { + "epoch": 0.44533333333333336, + "grad_norm": 0.3507029394431925, + "learning_rate": 0.0001223803868790983, + "loss": 0.6313, + "step": 2505 + }, + { + "epoch": 0.44551111111111114, + "grad_norm": 0.334419017104526, + "learning_rate": 0.00012232426323714136, + "loss": 0.6239, + "step": 2506 + }, + { + "epoch": 0.4456888888888889, + "grad_norm": 0.3640862568304381, + "learning_rate": 0.00012226813219354122, + "loss": 0.6512, + "step": 2507 + }, + { + "epoch": 0.4458666666666667, + "grad_norm": 0.33968625510731293, + "learning_rate": 0.00012221199376690825, + "loss": 0.6077, + "step": 2508 + }, + { + "epoch": 0.44604444444444447, + "grad_norm": 0.3606677468042819, + "learning_rate": 0.00012215584797585524, + "loss": 0.6605, + "step": 2509 + }, + { + "epoch": 0.44622222222222224, + "grad_norm": 0.3370036884890409, + "learning_rate": 0.00012209969483899735, + "loss": 0.6109, + "step": 2510 + }, + { + "epoch": 0.4464, + "grad_norm": 0.3727276030707424, + "learning_rate": 0.00012204353437495228, + "loss": 0.5754, + "step": 2511 + }, + { + "epoch": 0.4465777777777778, + "grad_norm": 0.3814855270365171, + "learning_rate": 0.00012198736660234009, + "loss": 0.6124, + "step": 2512 + }, + { + "epoch": 0.4467555555555556, + "grad_norm": 0.34305297471379737, + "learning_rate": 0.00012193119153978332, + "loss": 0.5901, + "step": 2513 + }, + { + "epoch": 0.44693333333333335, + "grad_norm": 0.3215511453171052, + "learning_rate": 0.00012187500920590689, + "loss": 0.59, + "step": 2514 + }, + { + "epoch": 0.4471111111111111, + "grad_norm": 0.35959590890962223, + "learning_rate": 0.0001218188196193381, + "loss": 0.6841, + "step": 2515 + }, + { + "epoch": 0.4472888888888889, + "grad_norm": 0.3649763905835896, + "learning_rate": 0.00012176262279870673, + "loss": 0.6506, + "step": 2516 + }, + { + "epoch": 0.4474666666666667, + "grad_norm": 0.3601431735851272, + "learning_rate": 0.0001217064187626449, + "loss": 0.6435, + "step": 2517 + }, + { + "epoch": 0.44764444444444446, + "grad_norm": 0.35964138205055174, + "learning_rate": 0.00012165020752978718, + "loss": 0.6535, + "step": 2518 + }, + { + "epoch": 0.44782222222222223, + "grad_norm": 0.34726631250201273, + "learning_rate": 0.00012159398911877045, + "loss": 0.6134, + "step": 2519 + }, + { + "epoch": 0.448, + "grad_norm": 0.35449870451567134, + "learning_rate": 0.00012153776354823401, + "loss": 0.6081, + "step": 2520 + }, + { + "epoch": 0.4481777777777778, + "grad_norm": 0.34573664350869815, + "learning_rate": 0.00012148153083681954, + "loss": 0.6142, + "step": 2521 + }, + { + "epoch": 0.44835555555555556, + "grad_norm": 0.36060712389294625, + "learning_rate": 0.0001214252910031711, + "loss": 0.6502, + "step": 2522 + }, + { + "epoch": 0.44853333333333334, + "grad_norm": 0.3826132897859077, + "learning_rate": 0.00012136904406593507, + "loss": 0.6432, + "step": 2523 + }, + { + "epoch": 0.4487111111111111, + "grad_norm": 0.34896381190658043, + "learning_rate": 0.00012131279004376024, + "loss": 0.6414, + "step": 2524 + }, + { + "epoch": 0.4488888888888889, + "grad_norm": 0.3600605051763946, + "learning_rate": 0.00012125652895529766, + "loss": 0.6405, + "step": 2525 + }, + { + "epoch": 0.44906666666666667, + "grad_norm": 0.36541280183275093, + "learning_rate": 0.00012120026081920084, + "loss": 0.5968, + "step": 2526 + }, + { + "epoch": 0.44924444444444445, + "grad_norm": 0.3562322280328803, + "learning_rate": 0.00012114398565412553, + "loss": 0.6113, + "step": 2527 + }, + { + "epoch": 0.4494222222222222, + "grad_norm": 0.36541262544716824, + "learning_rate": 0.00012108770347872982, + "loss": 0.6251, + "step": 2528 + }, + { + "epoch": 0.4496, + "grad_norm": 0.35789351173943645, + "learning_rate": 0.0001210314143116742, + "loss": 0.6362, + "step": 2529 + }, + { + "epoch": 0.4497777777777778, + "grad_norm": 0.3812303750695318, + "learning_rate": 0.00012097511817162139, + "loss": 0.6218, + "step": 2530 + }, + { + "epoch": 0.44995555555555555, + "grad_norm": 0.3548281963049733, + "learning_rate": 0.00012091881507723651, + "loss": 0.6274, + "step": 2531 + }, + { + "epoch": 0.45013333333333333, + "grad_norm": 0.3490925903863767, + "learning_rate": 0.00012086250504718687, + "loss": 0.6224, + "step": 2532 + }, + { + "epoch": 0.4503111111111111, + "grad_norm": 0.372575977805126, + "learning_rate": 0.00012080618810014221, + "loss": 0.6413, + "step": 2533 + }, + { + "epoch": 0.4504888888888889, + "grad_norm": 0.3664897624480647, + "learning_rate": 0.00012074986425477445, + "loss": 0.6512, + "step": 2534 + }, + { + "epoch": 0.45066666666666666, + "grad_norm": 0.35200150464177166, + "learning_rate": 0.00012069353352975787, + "loss": 0.6204, + "step": 2535 + }, + { + "epoch": 0.45084444444444444, + "grad_norm": 0.35660079439327175, + "learning_rate": 0.00012063719594376901, + "loss": 0.6337, + "step": 2536 + }, + { + "epoch": 0.4510222222222222, + "grad_norm": 0.3527386080500564, + "learning_rate": 0.00012058085151548668, + "loss": 0.6444, + "step": 2537 + }, + { + "epoch": 0.4512, + "grad_norm": 0.3495730898005332, + "learning_rate": 0.00012052450026359197, + "loss": 0.6088, + "step": 2538 + }, + { + "epoch": 0.45137777777777777, + "grad_norm": 0.4030111494261051, + "learning_rate": 0.00012046814220676817, + "loss": 0.5907, + "step": 2539 + }, + { + "epoch": 0.45155555555555554, + "grad_norm": 0.3431991453405851, + "learning_rate": 0.00012041177736370093, + "loss": 0.6022, + "step": 2540 + }, + { + "epoch": 0.4517333333333333, + "grad_norm": 0.36766567743241135, + "learning_rate": 0.00012035540575307809, + "loss": 0.6592, + "step": 2541 + }, + { + "epoch": 0.4519111111111111, + "grad_norm": 0.38535132266059796, + "learning_rate": 0.00012029902739358971, + "loss": 0.6736, + "step": 2542 + }, + { + "epoch": 0.4520888888888889, + "grad_norm": 0.3836992370242985, + "learning_rate": 0.00012024264230392819, + "loss": 0.6622, + "step": 2543 + }, + { + "epoch": 0.45226666666666665, + "grad_norm": 0.3354627586731223, + "learning_rate": 0.00012018625050278802, + "loss": 0.5926, + "step": 2544 + }, + { + "epoch": 0.4524444444444444, + "grad_norm": 0.3571685686510973, + "learning_rate": 0.00012012985200886602, + "loss": 0.6209, + "step": 2545 + }, + { + "epoch": 0.4526222222222222, + "grad_norm": 0.3541162745120483, + "learning_rate": 0.00012007344684086119, + "loss": 0.6508, + "step": 2546 + }, + { + "epoch": 0.4528, + "grad_norm": 0.3831361097772331, + "learning_rate": 0.00012001703501747475, + "loss": 0.6179, + "step": 2547 + }, + { + "epoch": 0.45297777777777776, + "grad_norm": 0.36551304036628957, + "learning_rate": 0.00011996061655741013, + "loss": 0.639, + "step": 2548 + }, + { + "epoch": 0.45315555555555553, + "grad_norm": 0.37321440724637994, + "learning_rate": 0.00011990419147937295, + "loss": 0.6216, + "step": 2549 + }, + { + "epoch": 0.4533333333333333, + "grad_norm": 0.36736687257775785, + "learning_rate": 0.00011984775980207105, + "loss": 0.6485, + "step": 2550 + }, + { + "epoch": 0.4535111111111111, + "grad_norm": 0.34232912567235985, + "learning_rate": 0.0001197913215442144, + "loss": 0.5696, + "step": 2551 + }, + { + "epoch": 0.45368888888888886, + "grad_norm": 0.39308572174384604, + "learning_rate": 0.00011973487672451523, + "loss": 0.6747, + "step": 2552 + }, + { + "epoch": 0.45386666666666664, + "grad_norm": 0.3438333290686015, + "learning_rate": 0.00011967842536168785, + "loss": 0.6, + "step": 2553 + }, + { + "epoch": 0.4540444444444444, + "grad_norm": 0.3506880602989641, + "learning_rate": 0.00011962196747444882, + "loss": 0.6287, + "step": 2554 + }, + { + "epoch": 0.45422222222222225, + "grad_norm": 0.3627018536646449, + "learning_rate": 0.00011956550308151689, + "loss": 0.607, + "step": 2555 + }, + { + "epoch": 0.4544, + "grad_norm": 0.35503347776951466, + "learning_rate": 0.00011950903220161285, + "loss": 0.5889, + "step": 2556 + }, + { + "epoch": 0.4545777777777778, + "grad_norm": 0.33814420598465106, + "learning_rate": 0.00011945255485345972, + "loss": 0.6156, + "step": 2557 + }, + { + "epoch": 0.4547555555555556, + "grad_norm": 0.3662821736322486, + "learning_rate": 0.00011939607105578266, + "loss": 0.6193, + "step": 2558 + }, + { + "epoch": 0.45493333333333336, + "grad_norm": 0.35731199879165426, + "learning_rate": 0.00011933958082730894, + "loss": 0.6694, + "step": 2559 + }, + { + "epoch": 0.45511111111111113, + "grad_norm": 0.3325573301812507, + "learning_rate": 0.000119283084186768, + "loss": 0.5469, + "step": 2560 + }, + { + "epoch": 0.4552888888888889, + "grad_norm": 0.5457887907196308, + "learning_rate": 0.00011922658115289141, + "loss": 0.6855, + "step": 2561 + }, + { + "epoch": 0.4554666666666667, + "grad_norm": 0.36198247507661996, + "learning_rate": 0.00011917007174441279, + "loss": 0.6257, + "step": 2562 + }, + { + "epoch": 0.45564444444444446, + "grad_norm": 0.35081646164599617, + "learning_rate": 0.00011911355598006794, + "loss": 0.5782, + "step": 2563 + }, + { + "epoch": 0.45582222222222224, + "grad_norm": 0.35967694469167877, + "learning_rate": 0.00011905703387859475, + "loss": 0.6249, + "step": 2564 + }, + { + "epoch": 0.456, + "grad_norm": 0.3486612584742585, + "learning_rate": 0.0001190005054587332, + "loss": 0.6371, + "step": 2565 + }, + { + "epoch": 0.4561777777777778, + "grad_norm": 0.3713135940760266, + "learning_rate": 0.00011894397073922536, + "loss": 0.6545, + "step": 2566 + }, + { + "epoch": 0.45635555555555557, + "grad_norm": 0.36423054497226026, + "learning_rate": 0.00011888742973881543, + "loss": 0.6611, + "step": 2567 + }, + { + "epoch": 0.45653333333333335, + "grad_norm": 0.3625864988060894, + "learning_rate": 0.00011883088247624965, + "loss": 0.6082, + "step": 2568 + }, + { + "epoch": 0.4567111111111111, + "grad_norm": 0.33217904307380747, + "learning_rate": 0.00011877432897027637, + "loss": 0.6047, + "step": 2569 + }, + { + "epoch": 0.4568888888888889, + "grad_norm": 0.338344936754837, + "learning_rate": 0.00011871776923964592, + "loss": 0.5985, + "step": 2570 + }, + { + "epoch": 0.4570666666666667, + "grad_norm": 0.3609757180115861, + "learning_rate": 0.00011866120330311086, + "loss": 0.6193, + "step": 2571 + }, + { + "epoch": 0.45724444444444445, + "grad_norm": 0.3512902947713863, + "learning_rate": 0.00011860463117942567, + "loss": 0.6116, + "step": 2572 + }, + { + "epoch": 0.45742222222222223, + "grad_norm": 0.3823292504112605, + "learning_rate": 0.00011854805288734689, + "loss": 0.6817, + "step": 2573 + }, + { + "epoch": 0.4576, + "grad_norm": 0.34817556542078, + "learning_rate": 0.0001184914684456332, + "loss": 0.5696, + "step": 2574 + }, + { + "epoch": 0.4577777777777778, + "grad_norm": 0.35394866568776095, + "learning_rate": 0.00011843487787304521, + "loss": 0.5984, + "step": 2575 + }, + { + "epoch": 0.45795555555555556, + "grad_norm": 0.3451786970155022, + "learning_rate": 0.00011837828118834564, + "loss": 0.6008, + "step": 2576 + }, + { + "epoch": 0.45813333333333334, + "grad_norm": 0.3639374317634931, + "learning_rate": 0.00011832167841029918, + "loss": 0.612, + "step": 2577 + }, + { + "epoch": 0.4583111111111111, + "grad_norm": 0.34210717123781936, + "learning_rate": 0.00011826506955767258, + "loss": 0.5995, + "step": 2578 + }, + { + "epoch": 0.4584888888888889, + "grad_norm": 0.35947946747542225, + "learning_rate": 0.00011820845464923458, + "loss": 0.6229, + "step": 2579 + }, + { + "epoch": 0.45866666666666667, + "grad_norm": 0.378625822610655, + "learning_rate": 0.00011815183370375595, + "loss": 0.6524, + "step": 2580 + }, + { + "epoch": 0.45884444444444444, + "grad_norm": 0.34824514391996814, + "learning_rate": 0.00011809520674000944, + "loss": 0.6016, + "step": 2581 + }, + { + "epoch": 0.4590222222222222, + "grad_norm": 0.3578488579196418, + "learning_rate": 0.00011803857377676983, + "loss": 0.6026, + "step": 2582 + }, + { + "epoch": 0.4592, + "grad_norm": 0.36466370898040745, + "learning_rate": 0.00011798193483281386, + "loss": 0.5943, + "step": 2583 + }, + { + "epoch": 0.4593777777777778, + "grad_norm": 0.3764979473198388, + "learning_rate": 0.00011792528992692022, + "loss": 0.6732, + "step": 2584 + }, + { + "epoch": 0.45955555555555555, + "grad_norm": 0.3759736729902727, + "learning_rate": 0.00011786863907786965, + "loss": 0.5957, + "step": 2585 + }, + { + "epoch": 0.4597333333333333, + "grad_norm": 0.36624487114376464, + "learning_rate": 0.00011781198230444479, + "loss": 0.6184, + "step": 2586 + }, + { + "epoch": 0.4599111111111111, + "grad_norm": 0.33735658839872296, + "learning_rate": 0.00011775531962543036, + "loss": 0.6043, + "step": 2587 + }, + { + "epoch": 0.4600888888888889, + "grad_norm": 0.3511801928186866, + "learning_rate": 0.00011769865105961283, + "loss": 0.6099, + "step": 2588 + }, + { + "epoch": 0.46026666666666666, + "grad_norm": 0.34014649416870824, + "learning_rate": 0.00011764197662578086, + "loss": 0.5811, + "step": 2589 + }, + { + "epoch": 0.46044444444444443, + "grad_norm": 0.3591190467678517, + "learning_rate": 0.0001175852963427249, + "loss": 0.6081, + "step": 2590 + }, + { + "epoch": 0.4606222222222222, + "grad_norm": 0.35996571845822695, + "learning_rate": 0.00011752861022923736, + "loss": 0.6306, + "step": 2591 + }, + { + "epoch": 0.4608, + "grad_norm": 0.3613898953089732, + "learning_rate": 0.00011747191830411264, + "loss": 0.6178, + "step": 2592 + }, + { + "epoch": 0.46097777777777776, + "grad_norm": 0.3415070577647176, + "learning_rate": 0.00011741522058614705, + "loss": 0.639, + "step": 2593 + }, + { + "epoch": 0.46115555555555554, + "grad_norm": 0.3656135584756059, + "learning_rate": 0.00011735851709413874, + "loss": 0.625, + "step": 2594 + }, + { + "epoch": 0.4613333333333333, + "grad_norm": 0.3499049734452857, + "learning_rate": 0.00011730180784688789, + "loss": 0.6405, + "step": 2595 + }, + { + "epoch": 0.4615111111111111, + "grad_norm": 0.36387902605449524, + "learning_rate": 0.00011724509286319654, + "loss": 0.6413, + "step": 2596 + }, + { + "epoch": 0.46168888888888887, + "grad_norm": 0.3432788560828658, + "learning_rate": 0.0001171883721618686, + "loss": 0.5904, + "step": 2597 + }, + { + "epoch": 0.46186666666666665, + "grad_norm": 0.3846126542469513, + "learning_rate": 0.00011713164576170992, + "loss": 0.6395, + "step": 2598 + }, + { + "epoch": 0.4620444444444444, + "grad_norm": 0.37512029931777036, + "learning_rate": 0.00011707491368152823, + "loss": 0.6526, + "step": 2599 + }, + { + "epoch": 0.4622222222222222, + "grad_norm": 0.3652760863439255, + "learning_rate": 0.00011701817594013312, + "loss": 0.6427, + "step": 2600 + }, + { + "epoch": 0.4624, + "grad_norm": 0.34689977607904615, + "learning_rate": 0.00011696143255633607, + "loss": 0.6347, + "step": 2601 + }, + { + "epoch": 0.46257777777777775, + "grad_norm": 0.36668242103221116, + "learning_rate": 0.00011690468354895045, + "loss": 0.6533, + "step": 2602 + }, + { + "epoch": 0.46275555555555553, + "grad_norm": 0.3718117994604249, + "learning_rate": 0.00011684792893679149, + "loss": 0.6283, + "step": 2603 + }, + { + "epoch": 0.4629333333333333, + "grad_norm": 0.3291103069896437, + "learning_rate": 0.00011679116873867624, + "loss": 0.5934, + "step": 2604 + }, + { + "epoch": 0.4631111111111111, + "grad_norm": 0.3347516924602731, + "learning_rate": 0.00011673440297342364, + "loss": 0.5844, + "step": 2605 + }, + { + "epoch": 0.46328888888888886, + "grad_norm": 0.3507778909885527, + "learning_rate": 0.00011667763165985446, + "loss": 0.5787, + "step": 2606 + }, + { + "epoch": 0.4634666666666667, + "grad_norm": 0.340848584314414, + "learning_rate": 0.00011662085481679133, + "loss": 0.5553, + "step": 2607 + }, + { + "epoch": 0.46364444444444447, + "grad_norm": 0.34700831839306545, + "learning_rate": 0.00011656407246305867, + "loss": 0.6479, + "step": 2608 + }, + { + "epoch": 0.46382222222222225, + "grad_norm": 0.35928217108251026, + "learning_rate": 0.0001165072846174828, + "loss": 0.6003, + "step": 2609 + }, + { + "epoch": 0.464, + "grad_norm": 0.3512026978141891, + "learning_rate": 0.00011645049129889179, + "loss": 0.6227, + "step": 2610 + }, + { + "epoch": 0.4641777777777778, + "grad_norm": 0.34592416581651336, + "learning_rate": 0.00011639369252611552, + "loss": 0.6416, + "step": 2611 + }, + { + "epoch": 0.4643555555555556, + "grad_norm": 0.3532149210589753, + "learning_rate": 0.0001163368883179858, + "loss": 0.6372, + "step": 2612 + }, + { + "epoch": 0.46453333333333335, + "grad_norm": 0.3977834229037991, + "learning_rate": 0.00011628007869333603, + "loss": 0.6362, + "step": 2613 + }, + { + "epoch": 0.46471111111111113, + "grad_norm": 0.3361528441180401, + "learning_rate": 0.0001162232636710016, + "loss": 0.5933, + "step": 2614 + }, + { + "epoch": 0.4648888888888889, + "grad_norm": 0.346657457987765, + "learning_rate": 0.00011616644326981963, + "loss": 0.6319, + "step": 2615 + }, + { + "epoch": 0.4650666666666667, + "grad_norm": 0.3704249807859668, + "learning_rate": 0.00011610961750862897, + "loss": 0.6496, + "step": 2616 + }, + { + "epoch": 0.46524444444444446, + "grad_norm": 0.36322592742827775, + "learning_rate": 0.00011605278640627028, + "loss": 0.6162, + "step": 2617 + }, + { + "epoch": 0.46542222222222224, + "grad_norm": 0.3412190447748015, + "learning_rate": 0.00011599594998158602, + "loss": 0.5791, + "step": 2618 + }, + { + "epoch": 0.4656, + "grad_norm": 0.36615520677021174, + "learning_rate": 0.00011593910825342043, + "loss": 0.6701, + "step": 2619 + }, + { + "epoch": 0.4657777777777778, + "grad_norm": 0.3824893850067352, + "learning_rate": 0.0001158822612406194, + "loss": 0.6309, + "step": 2620 + }, + { + "epoch": 0.46595555555555557, + "grad_norm": 0.3669126233490488, + "learning_rate": 0.00011582540896203067, + "loss": 0.6184, + "step": 2621 + }, + { + "epoch": 0.46613333333333334, + "grad_norm": 0.3686145934825217, + "learning_rate": 0.00011576855143650371, + "loss": 0.6139, + "step": 2622 + }, + { + "epoch": 0.4663111111111111, + "grad_norm": 0.3768932262212343, + "learning_rate": 0.00011571168868288973, + "loss": 0.626, + "step": 2623 + }, + { + "epoch": 0.4664888888888889, + "grad_norm": 0.3827046669952357, + "learning_rate": 0.00011565482072004164, + "loss": 0.6296, + "step": 2624 + }, + { + "epoch": 0.4666666666666667, + "grad_norm": 0.3460460694096838, + "learning_rate": 0.0001155979475668141, + "loss": 0.6442, + "step": 2625 + }, + { + "epoch": 0.46684444444444445, + "grad_norm": 0.38766965000704273, + "learning_rate": 0.00011554106924206347, + "loss": 0.6475, + "step": 2626 + }, + { + "epoch": 0.4670222222222222, + "grad_norm": 0.3358428689481838, + "learning_rate": 0.00011548418576464791, + "loss": 0.5847, + "step": 2627 + }, + { + "epoch": 0.4672, + "grad_norm": 0.34018866811431725, + "learning_rate": 0.00011542729715342713, + "loss": 0.6008, + "step": 2628 + }, + { + "epoch": 0.4673777777777778, + "grad_norm": 0.3629115012343357, + "learning_rate": 0.00011537040342726271, + "loss": 0.6295, + "step": 2629 + }, + { + "epoch": 0.46755555555555556, + "grad_norm": 0.3583035374195773, + "learning_rate": 0.00011531350460501782, + "loss": 0.6551, + "step": 2630 + }, + { + "epoch": 0.46773333333333333, + "grad_norm": 0.3605839733646907, + "learning_rate": 0.00011525660070555735, + "loss": 0.6496, + "step": 2631 + }, + { + "epoch": 0.4679111111111111, + "grad_norm": 0.34466580861324947, + "learning_rate": 0.0001151996917477479, + "loss": 0.5804, + "step": 2632 + }, + { + "epoch": 0.4680888888888889, + "grad_norm": 0.36279627612429444, + "learning_rate": 0.00011514277775045768, + "loss": 0.6807, + "step": 2633 + }, + { + "epoch": 0.46826666666666666, + "grad_norm": 0.38075181604467584, + "learning_rate": 0.00011508585873255663, + "loss": 0.6719, + "step": 2634 + }, + { + "epoch": 0.46844444444444444, + "grad_norm": 0.34987587696317807, + "learning_rate": 0.00011502893471291636, + "loss": 0.645, + "step": 2635 + }, + { + "epoch": 0.4686222222222222, + "grad_norm": 0.3448820041592509, + "learning_rate": 0.00011497200571041009, + "loss": 0.6211, + "step": 2636 + }, + { + "epoch": 0.4688, + "grad_norm": 0.3727510610949181, + "learning_rate": 0.00011491507174391271, + "loss": 0.6162, + "step": 2637 + }, + { + "epoch": 0.46897777777777777, + "grad_norm": 0.3583867282546688, + "learning_rate": 0.00011485813283230079, + "loss": 0.5574, + "step": 2638 + }, + { + "epoch": 0.46915555555555555, + "grad_norm": 0.3594947186395565, + "learning_rate": 0.00011480118899445247, + "loss": 0.6343, + "step": 2639 + }, + { + "epoch": 0.4693333333333333, + "grad_norm": 0.34247734029417154, + "learning_rate": 0.00011474424024924759, + "loss": 0.5675, + "step": 2640 + }, + { + "epoch": 0.4695111111111111, + "grad_norm": 0.3764511634940411, + "learning_rate": 0.0001146872866155676, + "loss": 0.6159, + "step": 2641 + }, + { + "epoch": 0.4696888888888889, + "grad_norm": 0.3531976768111412, + "learning_rate": 0.00011463032811229557, + "loss": 0.5696, + "step": 2642 + }, + { + "epoch": 0.46986666666666665, + "grad_norm": 0.3586694966033011, + "learning_rate": 0.00011457336475831612, + "loss": 0.6358, + "step": 2643 + }, + { + "epoch": 0.47004444444444443, + "grad_norm": 0.37034145685760583, + "learning_rate": 0.00011451639657251563, + "loss": 0.6487, + "step": 2644 + }, + { + "epoch": 0.4702222222222222, + "grad_norm": 0.3830748606355568, + "learning_rate": 0.00011445942357378192, + "loss": 0.6481, + "step": 2645 + }, + { + "epoch": 0.4704, + "grad_norm": 0.35652442490379327, + "learning_rate": 0.00011440244578100447, + "loss": 0.6074, + "step": 2646 + }, + { + "epoch": 0.47057777777777776, + "grad_norm": 0.3568499151748979, + "learning_rate": 0.0001143454632130744, + "loss": 0.5981, + "step": 2647 + }, + { + "epoch": 0.47075555555555554, + "grad_norm": 0.3768724152737397, + "learning_rate": 0.00011428847588888434, + "loss": 0.6353, + "step": 2648 + }, + { + "epoch": 0.4709333333333333, + "grad_norm": 0.3765848864355913, + "learning_rate": 0.00011423148382732853, + "loss": 0.6358, + "step": 2649 + }, + { + "epoch": 0.4711111111111111, + "grad_norm": 0.36915676679875964, + "learning_rate": 0.00011417448704730275, + "loss": 0.6045, + "step": 2650 + }, + { + "epoch": 0.47128888888888887, + "grad_norm": 0.36233416081947073, + "learning_rate": 0.0001141174855677044, + "loss": 0.6112, + "step": 2651 + }, + { + "epoch": 0.47146666666666665, + "grad_norm": 0.3642104553696075, + "learning_rate": 0.00011406047940743239, + "loss": 0.6625, + "step": 2652 + }, + { + "epoch": 0.4716444444444444, + "grad_norm": 0.3434966818111389, + "learning_rate": 0.0001140034685853872, + "loss": 0.6053, + "step": 2653 + }, + { + "epoch": 0.4718222222222222, + "grad_norm": 0.37966589821930796, + "learning_rate": 0.00011394645312047086, + "loss": 0.6537, + "step": 2654 + }, + { + "epoch": 0.472, + "grad_norm": 0.3442634387796601, + "learning_rate": 0.00011388943303158693, + "loss": 0.6309, + "step": 2655 + }, + { + "epoch": 0.47217777777777775, + "grad_norm": 0.35493277038420756, + "learning_rate": 0.0001138324083376405, + "loss": 0.6601, + "step": 2656 + }, + { + "epoch": 0.47235555555555553, + "grad_norm": 0.3541468266169037, + "learning_rate": 0.0001137753790575382, + "loss": 0.6371, + "step": 2657 + }, + { + "epoch": 0.47253333333333336, + "grad_norm": 0.3340969839230348, + "learning_rate": 0.00011371834521018818, + "loss": 0.5654, + "step": 2658 + }, + { + "epoch": 0.47271111111111114, + "grad_norm": 0.35130500748332294, + "learning_rate": 0.00011366130681450008, + "loss": 0.6247, + "step": 2659 + }, + { + "epoch": 0.4728888888888889, + "grad_norm": 0.3291968480511084, + "learning_rate": 0.00011360426388938508, + "loss": 0.5779, + "step": 2660 + }, + { + "epoch": 0.4730666666666667, + "grad_norm": 0.3271214931402535, + "learning_rate": 0.00011354721645375588, + "loss": 0.5881, + "step": 2661 + }, + { + "epoch": 0.47324444444444447, + "grad_norm": 0.3740586765449767, + "learning_rate": 0.00011349016452652657, + "loss": 0.6168, + "step": 2662 + }, + { + "epoch": 0.47342222222222224, + "grad_norm": 0.3828461818661629, + "learning_rate": 0.00011343310812661286, + "loss": 0.7076, + "step": 2663 + }, + { + "epoch": 0.4736, + "grad_norm": 0.35349321585283905, + "learning_rate": 0.00011337604727293185, + "loss": 0.5952, + "step": 2664 + }, + { + "epoch": 0.4737777777777778, + "grad_norm": 0.33341587424963764, + "learning_rate": 0.00011331898198440219, + "loss": 0.5731, + "step": 2665 + }, + { + "epoch": 0.4739555555555556, + "grad_norm": 0.38392100296688336, + "learning_rate": 0.00011326191227994391, + "loss": 0.6736, + "step": 2666 + }, + { + "epoch": 0.47413333333333335, + "grad_norm": 0.3917244196738235, + "learning_rate": 0.00011320483817847862, + "loss": 0.6303, + "step": 2667 + }, + { + "epoch": 0.47431111111111113, + "grad_norm": 0.3532778146515433, + "learning_rate": 0.0001131477596989293, + "loss": 0.5841, + "step": 2668 + }, + { + "epoch": 0.4744888888888889, + "grad_norm": 0.3466873851107952, + "learning_rate": 0.00011309067686022037, + "loss": 0.6226, + "step": 2669 + }, + { + "epoch": 0.4746666666666667, + "grad_norm": 0.3464991699454127, + "learning_rate": 0.00011303358968127778, + "loss": 0.5929, + "step": 2670 + }, + { + "epoch": 0.47484444444444446, + "grad_norm": 0.35501088118365143, + "learning_rate": 0.00011297649818102884, + "loss": 0.6398, + "step": 2671 + }, + { + "epoch": 0.47502222222222223, + "grad_norm": 0.349899933739195, + "learning_rate": 0.00011291940237840235, + "loss": 0.6163, + "step": 2672 + }, + { + "epoch": 0.4752, + "grad_norm": 0.38855265897666125, + "learning_rate": 0.0001128623022923285, + "loss": 0.645, + "step": 2673 + }, + { + "epoch": 0.4753777777777778, + "grad_norm": 0.36794088568281935, + "learning_rate": 0.00011280519794173889, + "loss": 0.6368, + "step": 2674 + }, + { + "epoch": 0.47555555555555556, + "grad_norm": 0.34980805194622705, + "learning_rate": 0.00011274808934556655, + "loss": 0.5698, + "step": 2675 + }, + { + "epoch": 0.47573333333333334, + "grad_norm": 0.37832759436319335, + "learning_rate": 0.00011269097652274596, + "loss": 0.6153, + "step": 2676 + }, + { + "epoch": 0.4759111111111111, + "grad_norm": 0.3596230483450509, + "learning_rate": 0.00011263385949221295, + "loss": 0.5895, + "step": 2677 + }, + { + "epoch": 0.4760888888888889, + "grad_norm": 0.36089900480039894, + "learning_rate": 0.00011257673827290471, + "loss": 0.6572, + "step": 2678 + }, + { + "epoch": 0.47626666666666667, + "grad_norm": 0.36648935021651785, + "learning_rate": 0.00011251961288375994, + "loss": 0.6137, + "step": 2679 + }, + { + "epoch": 0.47644444444444445, + "grad_norm": 0.32777743291177003, + "learning_rate": 0.0001124624833437186, + "loss": 0.5711, + "step": 2680 + }, + { + "epoch": 0.4766222222222222, + "grad_norm": 0.354466514045681, + "learning_rate": 0.0001124053496717221, + "loss": 0.6311, + "step": 2681 + }, + { + "epoch": 0.4768, + "grad_norm": 0.36897056553479496, + "learning_rate": 0.00011234821188671319, + "loss": 0.6035, + "step": 2682 + }, + { + "epoch": 0.4769777777777778, + "grad_norm": 0.3851188681869118, + "learning_rate": 0.00011229107000763597, + "loss": 0.652, + "step": 2683 + }, + { + "epoch": 0.47715555555555556, + "grad_norm": 0.36807460938564895, + "learning_rate": 0.00011223392405343594, + "loss": 0.6049, + "step": 2684 + }, + { + "epoch": 0.47733333333333333, + "grad_norm": 0.3778302718415281, + "learning_rate": 0.00011217677404305993, + "loss": 0.6808, + "step": 2685 + }, + { + "epoch": 0.4775111111111111, + "grad_norm": 0.36782968532147403, + "learning_rate": 0.00011211961999545609, + "loss": 0.5952, + "step": 2686 + }, + { + "epoch": 0.4776888888888889, + "grad_norm": 0.3354434904344047, + "learning_rate": 0.00011206246192957391, + "loss": 0.603, + "step": 2687 + }, + { + "epoch": 0.47786666666666666, + "grad_norm": 0.41553175655275, + "learning_rate": 0.0001120052998643643, + "loss": 0.6028, + "step": 2688 + }, + { + "epoch": 0.47804444444444444, + "grad_norm": 0.35055490126767114, + "learning_rate": 0.00011194813381877937, + "loss": 0.647, + "step": 2689 + }, + { + "epoch": 0.4782222222222222, + "grad_norm": 0.41273037053461203, + "learning_rate": 0.00011189096381177265, + "loss": 0.604, + "step": 2690 + }, + { + "epoch": 0.4784, + "grad_norm": 0.3400119695433725, + "learning_rate": 0.00011183378986229891, + "loss": 0.6049, + "step": 2691 + }, + { + "epoch": 0.47857777777777777, + "grad_norm": 0.3406063104757223, + "learning_rate": 0.00011177661198931426, + "loss": 0.6007, + "step": 2692 + }, + { + "epoch": 0.47875555555555555, + "grad_norm": 0.33537977158680193, + "learning_rate": 0.00011171943021177615, + "loss": 0.6297, + "step": 2693 + }, + { + "epoch": 0.4789333333333333, + "grad_norm": 0.3720330247363786, + "learning_rate": 0.00011166224454864325, + "loss": 0.6691, + "step": 2694 + }, + { + "epoch": 0.4791111111111111, + "grad_norm": 0.34645248508811, + "learning_rate": 0.00011160505501887555, + "loss": 0.6285, + "step": 2695 + }, + { + "epoch": 0.4792888888888889, + "grad_norm": 0.3525730364851391, + "learning_rate": 0.00011154786164143433, + "loss": 0.6425, + "step": 2696 + }, + { + "epoch": 0.47946666666666665, + "grad_norm": 0.332652464608944, + "learning_rate": 0.00011149066443528218, + "loss": 0.5205, + "step": 2697 + }, + { + "epoch": 0.47964444444444443, + "grad_norm": 0.35771403998277956, + "learning_rate": 0.00011143346341938288, + "loss": 0.6244, + "step": 2698 + }, + { + "epoch": 0.4798222222222222, + "grad_norm": 0.38398350996700514, + "learning_rate": 0.00011137625861270151, + "loss": 0.5586, + "step": 2699 + }, + { + "epoch": 0.48, + "grad_norm": 0.38893302054915424, + "learning_rate": 0.00011131905003420442, + "loss": 0.6298, + "step": 2700 + }, + { + "epoch": 0.48017777777777776, + "grad_norm": 0.3391839272542596, + "learning_rate": 0.00011126183770285918, + "loss": 0.5568, + "step": 2701 + }, + { + "epoch": 0.48035555555555554, + "grad_norm": 0.35387103703885525, + "learning_rate": 0.00011120462163763468, + "loss": 0.6333, + "step": 2702 + }, + { + "epoch": 0.4805333333333333, + "grad_norm": 0.34988748314485196, + "learning_rate": 0.00011114740185750093, + "loss": 0.6163, + "step": 2703 + }, + { + "epoch": 0.4807111111111111, + "grad_norm": 0.36559732127875183, + "learning_rate": 0.00011109017838142928, + "loss": 0.6038, + "step": 2704 + }, + { + "epoch": 0.48088888888888887, + "grad_norm": 0.3657990552249299, + "learning_rate": 0.00011103295122839221, + "loss": 0.6599, + "step": 2705 + }, + { + "epoch": 0.48106666666666664, + "grad_norm": 0.3402190427312039, + "learning_rate": 0.00011097572041736353, + "loss": 0.6221, + "step": 2706 + }, + { + "epoch": 0.4812444444444444, + "grad_norm": 0.35260149677985864, + "learning_rate": 0.00011091848596731817, + "loss": 0.6225, + "step": 2707 + }, + { + "epoch": 0.4814222222222222, + "grad_norm": 0.3638809115304859, + "learning_rate": 0.00011086124789723232, + "loss": 0.6187, + "step": 2708 + }, + { + "epoch": 0.4816, + "grad_norm": 0.3489193289546153, + "learning_rate": 0.0001108040062260833, + "loss": 0.6052, + "step": 2709 + }, + { + "epoch": 0.4817777777777778, + "grad_norm": 0.3517204828552794, + "learning_rate": 0.00011074676097284973, + "loss": 0.596, + "step": 2710 + }, + { + "epoch": 0.4819555555555556, + "grad_norm": 0.34309708875835576, + "learning_rate": 0.00011068951215651132, + "loss": 0.5934, + "step": 2711 + }, + { + "epoch": 0.48213333333333336, + "grad_norm": 0.36781363888822954, + "learning_rate": 0.00011063225979604899, + "loss": 0.6136, + "step": 2712 + }, + { + "epoch": 0.48231111111111113, + "grad_norm": 0.34618456072920345, + "learning_rate": 0.00011057500391044489, + "loss": 0.6213, + "step": 2713 + }, + { + "epoch": 0.4824888888888889, + "grad_norm": 0.37735825252498334, + "learning_rate": 0.00011051774451868226, + "loss": 0.6437, + "step": 2714 + }, + { + "epoch": 0.4826666666666667, + "grad_norm": 0.35199866517338113, + "learning_rate": 0.00011046048163974558, + "loss": 0.6088, + "step": 2715 + }, + { + "epoch": 0.48284444444444446, + "grad_norm": 0.34466995360901276, + "learning_rate": 0.00011040321529262041, + "loss": 0.6049, + "step": 2716 + }, + { + "epoch": 0.48302222222222224, + "grad_norm": 0.3924179006343552, + "learning_rate": 0.0001103459454962935, + "loss": 0.6667, + "step": 2717 + }, + { + "epoch": 0.4832, + "grad_norm": 0.3674466116361259, + "learning_rate": 0.00011028867226975272, + "loss": 0.6302, + "step": 2718 + }, + { + "epoch": 0.4833777777777778, + "grad_norm": 0.3540570626167075, + "learning_rate": 0.00011023139563198714, + "loss": 0.6062, + "step": 2719 + }, + { + "epoch": 0.48355555555555557, + "grad_norm": 0.3641240841142744, + "learning_rate": 0.00011017411560198686, + "loss": 0.5971, + "step": 2720 + }, + { + "epoch": 0.48373333333333335, + "grad_norm": 0.4154207130462865, + "learning_rate": 0.00011011683219874323, + "loss": 0.6533, + "step": 2721 + }, + { + "epoch": 0.4839111111111111, + "grad_norm": 0.34079782344676135, + "learning_rate": 0.00011005954544124862, + "loss": 0.5708, + "step": 2722 + }, + { + "epoch": 0.4840888888888889, + "grad_norm": 0.3677213002409934, + "learning_rate": 0.00011000225534849649, + "loss": 0.6276, + "step": 2723 + }, + { + "epoch": 0.4842666666666667, + "grad_norm": 0.3577200586354648, + "learning_rate": 0.0001099449619394815, + "loss": 0.5544, + "step": 2724 + }, + { + "epoch": 0.48444444444444446, + "grad_norm": 0.36319426448018627, + "learning_rate": 0.00010988766523319935, + "loss": 0.6471, + "step": 2725 + }, + { + "epoch": 0.48462222222222223, + "grad_norm": 0.35324684939353757, + "learning_rate": 0.00010983036524864689, + "loss": 0.6384, + "step": 2726 + }, + { + "epoch": 0.4848, + "grad_norm": 0.3609013571048923, + "learning_rate": 0.00010977306200482195, + "loss": 0.6265, + "step": 2727 + }, + { + "epoch": 0.4849777777777778, + "grad_norm": 0.3518419956554968, + "learning_rate": 0.00010971575552072357, + "loss": 0.5932, + "step": 2728 + }, + { + "epoch": 0.48515555555555556, + "grad_norm": 0.3658745675549795, + "learning_rate": 0.00010965844581535178, + "loss": 0.6221, + "step": 2729 + }, + { + "epoch": 0.48533333333333334, + "grad_norm": 0.36422447564502797, + "learning_rate": 0.0001096011329077077, + "loss": 0.6436, + "step": 2730 + }, + { + "epoch": 0.4855111111111111, + "grad_norm": 0.3774567614812268, + "learning_rate": 0.00010954381681679352, + "loss": 0.6276, + "step": 2731 + }, + { + "epoch": 0.4856888888888889, + "grad_norm": 0.37889147254460265, + "learning_rate": 0.00010948649756161246, + "loss": 0.6099, + "step": 2732 + }, + { + "epoch": 0.48586666666666667, + "grad_norm": 0.33818971949149906, + "learning_rate": 0.0001094291751611688, + "loss": 0.635, + "step": 2733 + }, + { + "epoch": 0.48604444444444445, + "grad_norm": 0.3741478913319296, + "learning_rate": 0.00010937184963446788, + "loss": 0.6706, + "step": 2734 + }, + { + "epoch": 0.4862222222222222, + "grad_norm": 0.36351168281219204, + "learning_rate": 0.00010931452100051605, + "loss": 0.6102, + "step": 2735 + }, + { + "epoch": 0.4864, + "grad_norm": 0.3356327170524334, + "learning_rate": 0.00010925718927832073, + "loss": 0.6024, + "step": 2736 + }, + { + "epoch": 0.4865777777777778, + "grad_norm": 0.3572987085131686, + "learning_rate": 0.00010919985448689031, + "loss": 0.603, + "step": 2737 + }, + { + "epoch": 0.48675555555555555, + "grad_norm": 0.3638996318644696, + "learning_rate": 0.00010914251664523428, + "loss": 0.6206, + "step": 2738 + }, + { + "epoch": 0.48693333333333333, + "grad_norm": 0.36630154557599215, + "learning_rate": 0.00010908517577236302, + "loss": 0.6386, + "step": 2739 + }, + { + "epoch": 0.4871111111111111, + "grad_norm": 0.3445619840587946, + "learning_rate": 0.00010902783188728802, + "loss": 0.6043, + "step": 2740 + }, + { + "epoch": 0.4872888888888889, + "grad_norm": 0.38833044321790144, + "learning_rate": 0.00010897048500902172, + "loss": 0.6421, + "step": 2741 + }, + { + "epoch": 0.48746666666666666, + "grad_norm": 0.3553842396887497, + "learning_rate": 0.0001089131351565776, + "loss": 0.6306, + "step": 2742 + }, + { + "epoch": 0.48764444444444444, + "grad_norm": 0.3484606526479836, + "learning_rate": 0.00010885578234897003, + "loss": 0.6018, + "step": 2743 + }, + { + "epoch": 0.4878222222222222, + "grad_norm": 0.4045170295439436, + "learning_rate": 0.00010879842660521449, + "loss": 0.6049, + "step": 2744 + }, + { + "epoch": 0.488, + "grad_norm": 0.361274390330694, + "learning_rate": 0.00010874106794432728, + "loss": 0.6552, + "step": 2745 + }, + { + "epoch": 0.48817777777777777, + "grad_norm": 0.37408216159252544, + "learning_rate": 0.00010868370638532582, + "loss": 0.599, + "step": 2746 + }, + { + "epoch": 0.48835555555555554, + "grad_norm": 0.37213300758748064, + "learning_rate": 0.00010862634194722839, + "loss": 0.5792, + "step": 2747 + }, + { + "epoch": 0.4885333333333333, + "grad_norm": 0.3560509110229911, + "learning_rate": 0.00010856897464905425, + "loss": 0.585, + "step": 2748 + }, + { + "epoch": 0.4887111111111111, + "grad_norm": 0.35152372742860466, + "learning_rate": 0.00010851160450982363, + "loss": 0.6065, + "step": 2749 + }, + { + "epoch": 0.4888888888888889, + "grad_norm": 0.3529930348619273, + "learning_rate": 0.0001084542315485577, + "loss": 0.6007, + "step": 2750 + }, + { + "epoch": 0.48906666666666665, + "grad_norm": 0.3579137296933873, + "learning_rate": 0.00010839685578427852, + "loss": 0.6185, + "step": 2751 + }, + { + "epoch": 0.4892444444444444, + "grad_norm": 0.3677083102081914, + "learning_rate": 0.00010833947723600913, + "loss": 0.6388, + "step": 2752 + }, + { + "epoch": 0.4894222222222222, + "grad_norm": 0.3497669374143786, + "learning_rate": 0.00010828209592277346, + "loss": 0.6451, + "step": 2753 + }, + { + "epoch": 0.4896, + "grad_norm": 0.36761661601408646, + "learning_rate": 0.00010822471186359639, + "loss": 0.6101, + "step": 2754 + }, + { + "epoch": 0.48977777777777776, + "grad_norm": 0.34399349729851153, + "learning_rate": 0.00010816732507750369, + "loss": 0.5736, + "step": 2755 + }, + { + "epoch": 0.48995555555555553, + "grad_norm": 0.39444123166385175, + "learning_rate": 0.00010810993558352202, + "loss": 0.6284, + "step": 2756 + }, + { + "epoch": 0.4901333333333333, + "grad_norm": 0.35128568212938327, + "learning_rate": 0.00010805254340067899, + "loss": 0.6484, + "step": 2757 + }, + { + "epoch": 0.4903111111111111, + "grad_norm": 0.36373459421377374, + "learning_rate": 0.00010799514854800298, + "loss": 0.6074, + "step": 2758 + }, + { + "epoch": 0.49048888888888886, + "grad_norm": 0.34565919628276104, + "learning_rate": 0.00010793775104452344, + "loss": 0.6426, + "step": 2759 + }, + { + "epoch": 0.49066666666666664, + "grad_norm": 0.36787727648529744, + "learning_rate": 0.00010788035090927053, + "loss": 0.6324, + "step": 2760 + }, + { + "epoch": 0.49084444444444447, + "grad_norm": 0.36484557219373004, + "learning_rate": 0.0001078229481612754, + "loss": 0.6387, + "step": 2761 + }, + { + "epoch": 0.49102222222222225, + "grad_norm": 0.35297798272843073, + "learning_rate": 0.00010776554281956998, + "loss": 0.5804, + "step": 2762 + }, + { + "epoch": 0.4912, + "grad_norm": 0.34499691049091197, + "learning_rate": 0.00010770813490318712, + "loss": 0.5713, + "step": 2763 + }, + { + "epoch": 0.4913777777777778, + "grad_norm": 0.34891630894582804, + "learning_rate": 0.00010765072443116049, + "loss": 0.6115, + "step": 2764 + }, + { + "epoch": 0.4915555555555556, + "grad_norm": 0.35138702833764945, + "learning_rate": 0.00010759331142252462, + "loss": 0.6197, + "step": 2765 + }, + { + "epoch": 0.49173333333333336, + "grad_norm": 0.38567865348514535, + "learning_rate": 0.0001075358958963149, + "loss": 0.6589, + "step": 2766 + }, + { + "epoch": 0.49191111111111113, + "grad_norm": 0.3407754970002096, + "learning_rate": 0.0001074784778715675, + "loss": 0.5985, + "step": 2767 + }, + { + "epoch": 0.4920888888888889, + "grad_norm": 0.3517842959008859, + "learning_rate": 0.00010742105736731947, + "loss": 0.6522, + "step": 2768 + }, + { + "epoch": 0.4922666666666667, + "grad_norm": 0.36403048140710736, + "learning_rate": 0.00010736363440260869, + "loss": 0.5974, + "step": 2769 + }, + { + "epoch": 0.49244444444444446, + "grad_norm": 0.36015957701572476, + "learning_rate": 0.00010730620899647379, + "loss": 0.6059, + "step": 2770 + }, + { + "epoch": 0.49262222222222224, + "grad_norm": 0.34803655189632937, + "learning_rate": 0.00010724878116795424, + "loss": 0.6267, + "step": 2771 + }, + { + "epoch": 0.4928, + "grad_norm": 0.3463003479928638, + "learning_rate": 0.00010719135093609038, + "loss": 0.5481, + "step": 2772 + }, + { + "epoch": 0.4929777777777778, + "grad_norm": 0.35025933562783984, + "learning_rate": 0.00010713391831992323, + "loss": 0.5708, + "step": 2773 + }, + { + "epoch": 0.49315555555555557, + "grad_norm": 0.3693630903126469, + "learning_rate": 0.00010707648333849472, + "loss": 0.6437, + "step": 2774 + }, + { + "epoch": 0.49333333333333335, + "grad_norm": 0.35118765870843055, + "learning_rate": 0.00010701904601084745, + "loss": 0.6016, + "step": 2775 + }, + { + "epoch": 0.4935111111111111, + "grad_norm": 0.3372626141760626, + "learning_rate": 0.00010696160635602487, + "loss": 0.6101, + "step": 2776 + }, + { + "epoch": 0.4936888888888889, + "grad_norm": 0.35267664479676036, + "learning_rate": 0.00010690416439307122, + "loss": 0.5723, + "step": 2777 + }, + { + "epoch": 0.4938666666666667, + "grad_norm": 0.3697368831270203, + "learning_rate": 0.00010684672014103143, + "loss": 0.5874, + "step": 2778 + }, + { + "epoch": 0.49404444444444445, + "grad_norm": 0.3644349469458444, + "learning_rate": 0.00010678927361895124, + "loss": 0.6141, + "step": 2779 + }, + { + "epoch": 0.49422222222222223, + "grad_norm": 0.34684473843740704, + "learning_rate": 0.00010673182484587711, + "loss": 0.6372, + "step": 2780 + }, + { + "epoch": 0.4944, + "grad_norm": 0.3350506295496847, + "learning_rate": 0.00010667437384085634, + "loss": 0.5779, + "step": 2781 + }, + { + "epoch": 0.4945777777777778, + "grad_norm": 0.35946078346449256, + "learning_rate": 0.00010661692062293682, + "loss": 0.6179, + "step": 2782 + }, + { + "epoch": 0.49475555555555556, + "grad_norm": 0.3689683408579485, + "learning_rate": 0.00010655946521116726, + "loss": 0.6505, + "step": 2783 + }, + { + "epoch": 0.49493333333333334, + "grad_norm": 0.370988792598542, + "learning_rate": 0.0001065020076245971, + "loss": 0.6249, + "step": 2784 + }, + { + "epoch": 0.4951111111111111, + "grad_norm": 0.36063430235058636, + "learning_rate": 0.0001064445478822765, + "loss": 0.6068, + "step": 2785 + }, + { + "epoch": 0.4952888888888889, + "grad_norm": 0.35720263045801315, + "learning_rate": 0.00010638708600325632, + "loss": 0.6222, + "step": 2786 + }, + { + "epoch": 0.49546666666666667, + "grad_norm": 0.38600134491333904, + "learning_rate": 0.00010632962200658815, + "loss": 0.6193, + "step": 2787 + }, + { + "epoch": 0.49564444444444444, + "grad_norm": 0.3538939390168749, + "learning_rate": 0.00010627215591132422, + "loss": 0.6089, + "step": 2788 + }, + { + "epoch": 0.4958222222222222, + "grad_norm": 0.35610930727412976, + "learning_rate": 0.00010621468773651755, + "loss": 0.6407, + "step": 2789 + }, + { + "epoch": 0.496, + "grad_norm": 0.3460646906483097, + "learning_rate": 0.00010615721750122177, + "loss": 0.5823, + "step": 2790 + }, + { + "epoch": 0.4961777777777778, + "grad_norm": 0.33362362823218344, + "learning_rate": 0.00010609974522449122, + "loss": 0.6216, + "step": 2791 + }, + { + "epoch": 0.49635555555555555, + "grad_norm": 0.36261281518354926, + "learning_rate": 0.00010604227092538095, + "loss": 0.6467, + "step": 2792 + }, + { + "epoch": 0.4965333333333333, + "grad_norm": 0.34603549890246943, + "learning_rate": 0.00010598479462294663, + "loss": 0.6229, + "step": 2793 + }, + { + "epoch": 0.4967111111111111, + "grad_norm": 0.34599693327445646, + "learning_rate": 0.0001059273163362446, + "loss": 0.6143, + "step": 2794 + }, + { + "epoch": 0.4968888888888889, + "grad_norm": 0.35513485757639185, + "learning_rate": 0.0001058698360843319, + "loss": 0.6642, + "step": 2795 + }, + { + "epoch": 0.49706666666666666, + "grad_norm": 0.3509477718058822, + "learning_rate": 0.00010581235388626618, + "loss": 0.6357, + "step": 2796 + }, + { + "epoch": 0.49724444444444443, + "grad_norm": 0.364396029448311, + "learning_rate": 0.00010575486976110575, + "loss": 0.598, + "step": 2797 + }, + { + "epoch": 0.4974222222222222, + "grad_norm": 0.3768475083816325, + "learning_rate": 0.00010569738372790956, + "loss": 0.5976, + "step": 2798 + }, + { + "epoch": 0.4976, + "grad_norm": 0.3566503351383309, + "learning_rate": 0.00010563989580573719, + "loss": 0.6372, + "step": 2799 + }, + { + "epoch": 0.49777777777777776, + "grad_norm": 0.3452388377561649, + "learning_rate": 0.00010558240601364886, + "loss": 0.5963, + "step": 2800 + }, + { + "epoch": 0.49795555555555554, + "grad_norm": 0.34920378279486075, + "learning_rate": 0.00010552491437070537, + "loss": 0.6575, + "step": 2801 + }, + { + "epoch": 0.4981333333333333, + "grad_norm": 0.35188471236857155, + "learning_rate": 0.0001054674208959682, + "loss": 0.566, + "step": 2802 + }, + { + "epoch": 0.4983111111111111, + "grad_norm": 0.3492653990364668, + "learning_rate": 0.00010540992560849936, + "loss": 0.6316, + "step": 2803 + }, + { + "epoch": 0.49848888888888887, + "grad_norm": 0.36604260248841, + "learning_rate": 0.00010535242852736151, + "loss": 0.5899, + "step": 2804 + }, + { + "epoch": 0.49866666666666665, + "grad_norm": 0.36538487096263866, + "learning_rate": 0.00010529492967161794, + "loss": 0.6097, + "step": 2805 + }, + { + "epoch": 0.4988444444444444, + "grad_norm": 0.34839531813772273, + "learning_rate": 0.00010523742906033241, + "loss": 0.6215, + "step": 2806 + }, + { + "epoch": 0.4990222222222222, + "grad_norm": 0.33840736496823837, + "learning_rate": 0.00010517992671256937, + "loss": 0.6107, + "step": 2807 + }, + { + "epoch": 0.4992, + "grad_norm": 0.3918809735177293, + "learning_rate": 0.00010512242264739381, + "loss": 0.6124, + "step": 2808 + }, + { + "epoch": 0.49937777777777775, + "grad_norm": 0.35773165944495616, + "learning_rate": 0.00010506491688387127, + "loss": 0.6215, + "step": 2809 + }, + { + "epoch": 0.49955555555555553, + "grad_norm": 0.33996554260270845, + "learning_rate": 0.0001050074094410679, + "loss": 0.6122, + "step": 2810 + }, + { + "epoch": 0.4997333333333333, + "grad_norm": 0.3517499100221005, + "learning_rate": 0.00010494990033805038, + "loss": 0.5956, + "step": 2811 + }, + { + "epoch": 0.4999111111111111, + "grad_norm": 0.33218122062584815, + "learning_rate": 0.00010489238959388592, + "loss": 0.5604, + "step": 2812 + }, + { + "epoch": 0.5000888888888889, + "grad_norm": 0.3498560816930892, + "learning_rate": 0.00010483487722764231, + "loss": 0.5927, + "step": 2813 + }, + { + "epoch": 0.5002666666666666, + "grad_norm": 0.3684901328729348, + "learning_rate": 0.00010477736325838785, + "loss": 0.6467, + "step": 2814 + }, + { + "epoch": 0.5004444444444445, + "grad_norm": 0.3667843096158526, + "learning_rate": 0.00010471984770519139, + "loss": 0.6199, + "step": 2815 + }, + { + "epoch": 0.5006222222222222, + "grad_norm": 0.359214710059682, + "learning_rate": 0.00010466233058712229, + "loss": 0.608, + "step": 2816 + }, + { + "epoch": 0.5008, + "grad_norm": 0.3380932320111709, + "learning_rate": 0.00010460481192325045, + "loss": 0.5796, + "step": 2817 + }, + { + "epoch": 0.5009777777777777, + "grad_norm": 0.3707288954879967, + "learning_rate": 0.00010454729173264627, + "loss": 0.637, + "step": 2818 + }, + { + "epoch": 0.5011555555555556, + "grad_norm": 0.3300407135065756, + "learning_rate": 0.00010448977003438066, + "loss": 0.593, + "step": 2819 + }, + { + "epoch": 0.5013333333333333, + "grad_norm": 0.34950465708506095, + "learning_rate": 0.000104432246847525, + "loss": 0.5904, + "step": 2820 + }, + { + "epoch": 0.5015111111111111, + "grad_norm": 0.34126426032560353, + "learning_rate": 0.00010437472219115119, + "loss": 0.5793, + "step": 2821 + }, + { + "epoch": 0.5016888888888889, + "grad_norm": 0.3356185281893816, + "learning_rate": 0.00010431719608433163, + "loss": 0.5864, + "step": 2822 + }, + { + "epoch": 0.5018666666666667, + "grad_norm": 0.4000250462032941, + "learning_rate": 0.00010425966854613922, + "loss": 0.564, + "step": 2823 + }, + { + "epoch": 0.5020444444444444, + "grad_norm": 0.33102759237268814, + "learning_rate": 0.00010420213959564726, + "loss": 0.612, + "step": 2824 + }, + { + "epoch": 0.5022222222222222, + "grad_norm": 0.35705141978540295, + "learning_rate": 0.00010414460925192957, + "loss": 0.6473, + "step": 2825 + }, + { + "epoch": 0.5024, + "grad_norm": 0.3856557515811476, + "learning_rate": 0.00010408707753406041, + "loss": 0.6133, + "step": 2826 + }, + { + "epoch": 0.5025777777777778, + "grad_norm": 0.36849516824903794, + "learning_rate": 0.00010402954446111454, + "loss": 0.6982, + "step": 2827 + }, + { + "epoch": 0.5027555555555555, + "grad_norm": 0.34277234138188417, + "learning_rate": 0.00010397201005216712, + "loss": 0.5997, + "step": 2828 + }, + { + "epoch": 0.5029333333333333, + "grad_norm": 0.3715528396359596, + "learning_rate": 0.00010391447432629376, + "loss": 0.6771, + "step": 2829 + }, + { + "epoch": 0.5031111111111111, + "grad_norm": 0.3484179123443325, + "learning_rate": 0.00010385693730257055, + "loss": 0.632, + "step": 2830 + }, + { + "epoch": 0.5032888888888889, + "grad_norm": 0.33654209290635734, + "learning_rate": 0.00010379939900007393, + "loss": 0.5752, + "step": 2831 + }, + { + "epoch": 0.5034666666666666, + "grad_norm": 0.3660390639476581, + "learning_rate": 0.00010374185943788084, + "loss": 0.6554, + "step": 2832 + }, + { + "epoch": 0.5036444444444445, + "grad_norm": 0.346019943877722, + "learning_rate": 0.0001036843186350686, + "loss": 0.5795, + "step": 2833 + }, + { + "epoch": 0.5038222222222222, + "grad_norm": 0.3566103197804138, + "learning_rate": 0.00010362677661071496, + "loss": 0.5937, + "step": 2834 + }, + { + "epoch": 0.504, + "grad_norm": 0.35575117467178, + "learning_rate": 0.00010356923338389806, + "loss": 0.6202, + "step": 2835 + }, + { + "epoch": 0.5041777777777777, + "grad_norm": 0.3441497215196966, + "learning_rate": 0.00010351168897369643, + "loss": 0.622, + "step": 2836 + }, + { + "epoch": 0.5043555555555556, + "grad_norm": 0.38114956937969147, + "learning_rate": 0.00010345414339918902, + "loss": 0.65, + "step": 2837 + }, + { + "epoch": 0.5045333333333333, + "grad_norm": 0.34563702273352565, + "learning_rate": 0.00010339659667945516, + "loss": 0.6272, + "step": 2838 + }, + { + "epoch": 0.5047111111111111, + "grad_norm": 0.3599459817591909, + "learning_rate": 0.00010333904883357455, + "loss": 0.6447, + "step": 2839 + }, + { + "epoch": 0.5048888888888889, + "grad_norm": 0.359921736087741, + "learning_rate": 0.00010328149988062724, + "loss": 0.6133, + "step": 2840 + }, + { + "epoch": 0.5050666666666667, + "grad_norm": 0.3586654850163382, + "learning_rate": 0.00010322394983969368, + "loss": 0.6287, + "step": 2841 + }, + { + "epoch": 0.5052444444444445, + "grad_norm": 0.34841974771113626, + "learning_rate": 0.00010316639872985472, + "loss": 0.6259, + "step": 2842 + }, + { + "epoch": 0.5054222222222222, + "grad_norm": 0.36272827256722956, + "learning_rate": 0.00010310884657019146, + "loss": 0.5888, + "step": 2843 + }, + { + "epoch": 0.5056, + "grad_norm": 0.35419453750164864, + "learning_rate": 0.00010305129337978543, + "loss": 0.6025, + "step": 2844 + }, + { + "epoch": 0.5057777777777778, + "grad_norm": 0.3610755239552308, + "learning_rate": 0.00010299373917771846, + "loss": 0.6225, + "step": 2845 + }, + { + "epoch": 0.5059555555555556, + "grad_norm": 0.35206729828008626, + "learning_rate": 0.00010293618398307276, + "loss": 0.5734, + "step": 2846 + }, + { + "epoch": 0.5061333333333333, + "grad_norm": 0.45376085976884545, + "learning_rate": 0.00010287862781493081, + "loss": 0.6708, + "step": 2847 + }, + { + "epoch": 0.5063111111111112, + "grad_norm": 0.36643314471592, + "learning_rate": 0.00010282107069237548, + "loss": 0.5756, + "step": 2848 + }, + { + "epoch": 0.5064888888888889, + "grad_norm": 0.3357220441033354, + "learning_rate": 0.00010276351263448989, + "loss": 0.6114, + "step": 2849 + }, + { + "epoch": 0.5066666666666667, + "grad_norm": 0.3521334800412782, + "learning_rate": 0.00010270595366035751, + "loss": 0.619, + "step": 2850 + }, + { + "epoch": 0.5068444444444444, + "grad_norm": 0.3434862798781897, + "learning_rate": 0.0001026483937890621, + "loss": 0.6065, + "step": 2851 + }, + { + "epoch": 0.5070222222222223, + "grad_norm": 0.3704597964796915, + "learning_rate": 0.00010259083303968775, + "loss": 0.6555, + "step": 2852 + }, + { + "epoch": 0.5072, + "grad_norm": 0.3631273288179781, + "learning_rate": 0.00010253327143131879, + "loss": 0.6377, + "step": 2853 + }, + { + "epoch": 0.5073777777777778, + "grad_norm": 0.3436159999082076, + "learning_rate": 0.00010247570898303986, + "loss": 0.5805, + "step": 2854 + }, + { + "epoch": 0.5075555555555555, + "grad_norm": 0.36449824920003177, + "learning_rate": 0.0001024181457139359, + "loss": 0.6289, + "step": 2855 + }, + { + "epoch": 0.5077333333333334, + "grad_norm": 0.3564461569146345, + "learning_rate": 0.00010236058164309205, + "loss": 0.5974, + "step": 2856 + }, + { + "epoch": 0.5079111111111111, + "grad_norm": 0.3624876331423114, + "learning_rate": 0.0001023030167895938, + "loss": 0.6455, + "step": 2857 + }, + { + "epoch": 0.5080888888888889, + "grad_norm": 0.4065282276465651, + "learning_rate": 0.00010224545117252686, + "loss": 0.6389, + "step": 2858 + }, + { + "epoch": 0.5082666666666666, + "grad_norm": 0.355295615845016, + "learning_rate": 0.00010218788481097719, + "loss": 0.6086, + "step": 2859 + }, + { + "epoch": 0.5084444444444445, + "grad_norm": 0.36008478013804707, + "learning_rate": 0.00010213031772403099, + "loss": 0.5975, + "step": 2860 + }, + { + "epoch": 0.5086222222222222, + "grad_norm": 0.361608065323148, + "learning_rate": 0.00010207274993077475, + "loss": 0.5939, + "step": 2861 + }, + { + "epoch": 0.5088, + "grad_norm": 0.37737679440277544, + "learning_rate": 0.00010201518145029514, + "loss": 0.6516, + "step": 2862 + }, + { + "epoch": 0.5089777777777778, + "grad_norm": 0.41062655534356596, + "learning_rate": 0.00010195761230167906, + "loss": 0.6797, + "step": 2863 + }, + { + "epoch": 0.5091555555555556, + "grad_norm": 0.3812547547124469, + "learning_rate": 0.00010190004250401368, + "loss": 0.63, + "step": 2864 + }, + { + "epoch": 0.5093333333333333, + "grad_norm": 0.3411389261481799, + "learning_rate": 0.00010184247207638636, + "loss": 0.6364, + "step": 2865 + }, + { + "epoch": 0.5095111111111111, + "grad_norm": 0.3372784137913383, + "learning_rate": 0.0001017849010378846, + "loss": 0.5785, + "step": 2866 + }, + { + "epoch": 0.5096888888888889, + "grad_norm": 0.3535829710930523, + "learning_rate": 0.00010172732940759626, + "loss": 0.6458, + "step": 2867 + }, + { + "epoch": 0.5098666666666667, + "grad_norm": 0.3626486580503156, + "learning_rate": 0.0001016697572046092, + "loss": 0.6038, + "step": 2868 + }, + { + "epoch": 0.5100444444444444, + "grad_norm": 0.3627595777778218, + "learning_rate": 0.00010161218444801164, + "loss": 0.6395, + "step": 2869 + }, + { + "epoch": 0.5102222222222222, + "grad_norm": 0.3539135083911791, + "learning_rate": 0.00010155461115689187, + "loss": 0.6391, + "step": 2870 + }, + { + "epoch": 0.5104, + "grad_norm": 0.34235740406145715, + "learning_rate": 0.00010149703735033845, + "loss": 0.6297, + "step": 2871 + }, + { + "epoch": 0.5105777777777778, + "grad_norm": 0.357642403952184, + "learning_rate": 0.00010143946304744001, + "loss": 0.6471, + "step": 2872 + }, + { + "epoch": 0.5107555555555555, + "grad_norm": 0.35778765198668405, + "learning_rate": 0.00010138188826728543, + "loss": 0.6478, + "step": 2873 + }, + { + "epoch": 0.5109333333333334, + "grad_norm": 0.3695782303343061, + "learning_rate": 0.00010132431302896372, + "loss": 0.5901, + "step": 2874 + }, + { + "epoch": 0.5111111111111111, + "grad_norm": 0.38786672316122284, + "learning_rate": 0.00010126673735156402, + "loss": 0.5878, + "step": 2875 + }, + { + "epoch": 0.5112888888888889, + "grad_norm": 0.34095507834884853, + "learning_rate": 0.00010120916125417563, + "loss": 0.5731, + "step": 2876 + }, + { + "epoch": 0.5114666666666666, + "grad_norm": 0.3271501522369416, + "learning_rate": 0.00010115158475588799, + "loss": 0.5888, + "step": 2877 + }, + { + "epoch": 0.5116444444444445, + "grad_norm": 0.365155829963744, + "learning_rate": 0.00010109400787579071, + "loss": 0.58, + "step": 2878 + }, + { + "epoch": 0.5118222222222222, + "grad_norm": 0.35205320550146113, + "learning_rate": 0.00010103643063297348, + "loss": 0.6206, + "step": 2879 + }, + { + "epoch": 0.512, + "grad_norm": 0.34403296207257006, + "learning_rate": 0.0001009788530465261, + "loss": 0.6104, + "step": 2880 + }, + { + "epoch": 0.5121777777777777, + "grad_norm": 0.3485221769171977, + "learning_rate": 0.0001009212751355385, + "loss": 0.6093, + "step": 2881 + }, + { + "epoch": 0.5123555555555556, + "grad_norm": 0.33122769821530423, + "learning_rate": 0.00010086369691910073, + "loss": 0.5772, + "step": 2882 + }, + { + "epoch": 0.5125333333333333, + "grad_norm": 0.3555973359000505, + "learning_rate": 0.00010080611841630296, + "loss": 0.5852, + "step": 2883 + }, + { + "epoch": 0.5127111111111111, + "grad_norm": 0.365659490986573, + "learning_rate": 0.0001007485396462354, + "loss": 0.5631, + "step": 2884 + }, + { + "epoch": 0.5128888888888888, + "grad_norm": 0.35833070480218726, + "learning_rate": 0.0001006909606279884, + "loss": 0.6433, + "step": 2885 + }, + { + "epoch": 0.5130666666666667, + "grad_norm": 0.35166222102821754, + "learning_rate": 0.00010063338138065234, + "loss": 0.6239, + "step": 2886 + }, + { + "epoch": 0.5132444444444444, + "grad_norm": 0.3718176392604159, + "learning_rate": 0.00010057580192331775, + "loss": 0.6576, + "step": 2887 + }, + { + "epoch": 0.5134222222222222, + "grad_norm": 0.961036915007125, + "learning_rate": 0.00010051822227507515, + "loss": 0.643, + "step": 2888 + }, + { + "epoch": 0.5136, + "grad_norm": 0.3862831022927344, + "learning_rate": 0.00010046064245501518, + "loss": 0.6133, + "step": 2889 + }, + { + "epoch": 0.5137777777777778, + "grad_norm": 0.34797681350138393, + "learning_rate": 0.0001004030624822285, + "loss": 0.6079, + "step": 2890 + }, + { + "epoch": 0.5139555555555556, + "grad_norm": 0.38759151148497084, + "learning_rate": 0.00010034548237580587, + "loss": 0.6264, + "step": 2891 + }, + { + "epoch": 0.5141333333333333, + "grad_norm": 0.3610477038546272, + "learning_rate": 0.00010028790215483803, + "loss": 0.6229, + "step": 2892 + }, + { + "epoch": 0.5143111111111112, + "grad_norm": 0.3814255721295432, + "learning_rate": 0.00010023032183841579, + "loss": 0.6003, + "step": 2893 + }, + { + "epoch": 0.5144888888888889, + "grad_norm": 0.37499587100800164, + "learning_rate": 0.00010017274144562998, + "loss": 0.6282, + "step": 2894 + }, + { + "epoch": 0.5146666666666667, + "grad_norm": 0.3701775558107347, + "learning_rate": 0.0001001151609955715, + "loss": 0.5972, + "step": 2895 + }, + { + "epoch": 0.5148444444444444, + "grad_norm": 0.3852048164158099, + "learning_rate": 0.0001000575805073312, + "loss": 0.6401, + "step": 2896 + }, + { + "epoch": 0.5150222222222223, + "grad_norm": 0.336630404537959, + "learning_rate": 0.0001, + "loss": 0.5556, + "step": 2897 + }, + { + "epoch": 0.5152, + "grad_norm": 0.3779473174849211, + "learning_rate": 9.994241949266879e-05, + "loss": 0.6195, + "step": 2898 + }, + { + "epoch": 0.5153777777777778, + "grad_norm": 0.35868374198402836, + "learning_rate": 9.988483900442854e-05, + "loss": 0.6452, + "step": 2899 + }, + { + "epoch": 0.5155555555555555, + "grad_norm": 0.4004543199994003, + "learning_rate": 9.982725855437002e-05, + "loss": 0.6886, + "step": 2900 + }, + { + "epoch": 0.5157333333333334, + "grad_norm": 0.35747016513202784, + "learning_rate": 9.976967816158423e-05, + "loss": 0.6571, + "step": 2901 + }, + { + "epoch": 0.5159111111111111, + "grad_norm": 0.36712045139323474, + "learning_rate": 9.9712097845162e-05, + "loss": 0.6257, + "step": 2902 + }, + { + "epoch": 0.5160888888888889, + "grad_norm": 0.3446530348167064, + "learning_rate": 9.965451762419415e-05, + "loss": 0.6235, + "step": 2903 + }, + { + "epoch": 0.5162666666666667, + "grad_norm": 0.37947442831733524, + "learning_rate": 9.959693751777149e-05, + "loss": 0.6448, + "step": 2904 + }, + { + "epoch": 0.5164444444444445, + "grad_norm": 0.35469199732020873, + "learning_rate": 9.953935754498484e-05, + "loss": 0.6033, + "step": 2905 + }, + { + "epoch": 0.5166222222222222, + "grad_norm": 0.3952447820031589, + "learning_rate": 9.948177772492484e-05, + "loss": 0.6094, + "step": 2906 + }, + { + "epoch": 0.5168, + "grad_norm": 0.3609029313848285, + "learning_rate": 9.942419807668227e-05, + "loss": 0.6478, + "step": 2907 + }, + { + "epoch": 0.5169777777777778, + "grad_norm": 0.391285828099458, + "learning_rate": 9.936661861934765e-05, + "loss": 0.5886, + "step": 2908 + }, + { + "epoch": 0.5171555555555556, + "grad_norm": 0.3460556556219249, + "learning_rate": 9.930903937201163e-05, + "loss": 0.5819, + "step": 2909 + }, + { + "epoch": 0.5173333333333333, + "grad_norm": 0.3636055320598646, + "learning_rate": 9.925146035376459e-05, + "loss": 0.582, + "step": 2910 + }, + { + "epoch": 0.5175111111111111, + "grad_norm": 0.360552208979026, + "learning_rate": 9.919388158369707e-05, + "loss": 0.6045, + "step": 2911 + }, + { + "epoch": 0.5176888888888889, + "grad_norm": 0.3737646187553946, + "learning_rate": 9.913630308089927e-05, + "loss": 0.6196, + "step": 2912 + }, + { + "epoch": 0.5178666666666667, + "grad_norm": 0.35756137908178914, + "learning_rate": 9.907872486446152e-05, + "loss": 0.6161, + "step": 2913 + }, + { + "epoch": 0.5180444444444444, + "grad_norm": 0.365391892951095, + "learning_rate": 9.902114695347393e-05, + "loss": 0.6237, + "step": 2914 + }, + { + "epoch": 0.5182222222222223, + "grad_norm": 0.35831353292325624, + "learning_rate": 9.896356936702653e-05, + "loss": 0.6069, + "step": 2915 + }, + { + "epoch": 0.5184, + "grad_norm": 0.35526902761971624, + "learning_rate": 9.890599212420927e-05, + "loss": 0.6295, + "step": 2916 + }, + { + "epoch": 0.5185777777777778, + "grad_norm": 0.3623638101671441, + "learning_rate": 9.884841524411202e-05, + "loss": 0.6414, + "step": 2917 + }, + { + "epoch": 0.5187555555555555, + "grad_norm": 0.35417122316719685, + "learning_rate": 9.879083874582438e-05, + "loss": 0.5746, + "step": 2918 + }, + { + "epoch": 0.5189333333333334, + "grad_norm": 0.38381388572606173, + "learning_rate": 9.8733262648436e-05, + "loss": 0.6285, + "step": 2919 + }, + { + "epoch": 0.5191111111111111, + "grad_norm": 0.3522278048435224, + "learning_rate": 9.867568697103629e-05, + "loss": 0.6175, + "step": 2920 + }, + { + "epoch": 0.5192888888888889, + "grad_norm": 0.4180580565329305, + "learning_rate": 9.861811173271459e-05, + "loss": 0.6264, + "step": 2921 + }, + { + "epoch": 0.5194666666666666, + "grad_norm": 0.37354074873377613, + "learning_rate": 9.856053695255999e-05, + "loss": 0.6275, + "step": 2922 + }, + { + "epoch": 0.5196444444444445, + "grad_norm": 0.3758961247515681, + "learning_rate": 9.850296264966159e-05, + "loss": 0.6361, + "step": 2923 + }, + { + "epoch": 0.5198222222222222, + "grad_norm": 0.34017103126500436, + "learning_rate": 9.844538884310813e-05, + "loss": 0.6092, + "step": 2924 + }, + { + "epoch": 0.52, + "grad_norm": 0.35844883460816734, + "learning_rate": 9.838781555198839e-05, + "loss": 0.6058, + "step": 2925 + }, + { + "epoch": 0.5201777777777777, + "grad_norm": 0.47351394790219103, + "learning_rate": 9.833024279539081e-05, + "loss": 0.6503, + "step": 2926 + }, + { + "epoch": 0.5203555555555556, + "grad_norm": 0.35037283404698394, + "learning_rate": 9.827267059240377e-05, + "loss": 0.6606, + "step": 2927 + }, + { + "epoch": 0.5205333333333333, + "grad_norm": 0.3526269390644446, + "learning_rate": 9.821509896211539e-05, + "loss": 0.6236, + "step": 2928 + }, + { + "epoch": 0.5207111111111111, + "grad_norm": 0.35322818435064757, + "learning_rate": 9.815752792361368e-05, + "loss": 0.5848, + "step": 2929 + }, + { + "epoch": 0.5208888888888888, + "grad_norm": 0.3539780920742349, + "learning_rate": 9.809995749598632e-05, + "loss": 0.5857, + "step": 2930 + }, + { + "epoch": 0.5210666666666667, + "grad_norm": 0.35744938734845616, + "learning_rate": 9.804238769832095e-05, + "loss": 0.6431, + "step": 2931 + }, + { + "epoch": 0.5212444444444444, + "grad_norm": 0.3607043617509548, + "learning_rate": 9.798481854970485e-05, + "loss": 0.6193, + "step": 2932 + }, + { + "epoch": 0.5214222222222222, + "grad_norm": 0.36731672479324823, + "learning_rate": 9.792725006922527e-05, + "loss": 0.6384, + "step": 2933 + }, + { + "epoch": 0.5216, + "grad_norm": 0.35196149132551974, + "learning_rate": 9.7869682275969e-05, + "loss": 0.5745, + "step": 2934 + }, + { + "epoch": 0.5217777777777778, + "grad_norm": 0.4007814820710728, + "learning_rate": 9.781211518902285e-05, + "loss": 0.6539, + "step": 2935 + }, + { + "epoch": 0.5219555555555555, + "grad_norm": 0.39414427988381906, + "learning_rate": 9.775454882747315e-05, + "loss": 0.6302, + "step": 2936 + }, + { + "epoch": 0.5221333333333333, + "grad_norm": 0.36894733246837, + "learning_rate": 9.769698321040622e-05, + "loss": 0.6474, + "step": 2937 + }, + { + "epoch": 0.5223111111111111, + "grad_norm": 0.3882822725995771, + "learning_rate": 9.763941835690796e-05, + "loss": 0.6397, + "step": 2938 + }, + { + "epoch": 0.5224888888888889, + "grad_norm": 0.38523492453561436, + "learning_rate": 9.758185428606412e-05, + "loss": 0.6075, + "step": 2939 + }, + { + "epoch": 0.5226666666666666, + "grad_norm": 0.4057877666168052, + "learning_rate": 9.752429101696013e-05, + "loss": 0.5862, + "step": 2940 + }, + { + "epoch": 0.5228444444444444, + "grad_norm": 0.34496439990568606, + "learning_rate": 9.746672856868123e-05, + "loss": 0.5848, + "step": 2941 + }, + { + "epoch": 0.5230222222222223, + "grad_norm": 0.34486492247772954, + "learning_rate": 9.740916696031225e-05, + "loss": 0.614, + "step": 2942 + }, + { + "epoch": 0.5232, + "grad_norm": 0.3235033304298466, + "learning_rate": 9.73516062109379e-05, + "loss": 0.5876, + "step": 2943 + }, + { + "epoch": 0.5233777777777778, + "grad_norm": 0.3575551506137171, + "learning_rate": 9.729404633964248e-05, + "loss": 0.5979, + "step": 2944 + }, + { + "epoch": 0.5235555555555556, + "grad_norm": 0.3599655231583651, + "learning_rate": 9.723648736551015e-05, + "loss": 0.6476, + "step": 2945 + }, + { + "epoch": 0.5237333333333334, + "grad_norm": 0.3656483567812726, + "learning_rate": 9.717892930762453e-05, + "loss": 0.6267, + "step": 2946 + }, + { + "epoch": 0.5239111111111111, + "grad_norm": 0.379213754725316, + "learning_rate": 9.71213721850692e-05, + "loss": 0.5718, + "step": 2947 + }, + { + "epoch": 0.5240888888888889, + "grad_norm": 0.3730532621707509, + "learning_rate": 9.706381601692725e-05, + "loss": 0.6248, + "step": 2948 + }, + { + "epoch": 0.5242666666666667, + "grad_norm": 0.3432656977391376, + "learning_rate": 9.700626082228156e-05, + "loss": 0.5671, + "step": 2949 + }, + { + "epoch": 0.5244444444444445, + "grad_norm": 0.38912488739395096, + "learning_rate": 9.694870662021459e-05, + "loss": 0.6183, + "step": 2950 + }, + { + "epoch": 0.5246222222222222, + "grad_norm": 0.35469745042207806, + "learning_rate": 9.689115342980856e-05, + "loss": 0.5806, + "step": 2951 + }, + { + "epoch": 0.5248, + "grad_norm": 0.35818278335217285, + "learning_rate": 9.683360127014529e-05, + "loss": 0.638, + "step": 2952 + }, + { + "epoch": 0.5249777777777778, + "grad_norm": 0.3501042752256769, + "learning_rate": 9.677605016030632e-05, + "loss": 0.5815, + "step": 2953 + }, + { + "epoch": 0.5251555555555556, + "grad_norm": 0.3931154270394189, + "learning_rate": 9.671850011937277e-05, + "loss": 0.6054, + "step": 2954 + }, + { + "epoch": 0.5253333333333333, + "grad_norm": 0.35099574769864955, + "learning_rate": 9.666095116642549e-05, + "loss": 0.6183, + "step": 2955 + }, + { + "epoch": 0.5255111111111112, + "grad_norm": 0.3771109287968403, + "learning_rate": 9.660340332054483e-05, + "loss": 0.6231, + "step": 2956 + }, + { + "epoch": 0.5256888888888889, + "grad_norm": 0.33478556952210486, + "learning_rate": 9.654585660081099e-05, + "loss": 0.6358, + "step": 2957 + }, + { + "epoch": 0.5258666666666667, + "grad_norm": 0.3610180396758555, + "learning_rate": 9.648831102630356e-05, + "loss": 0.6042, + "step": 2958 + }, + { + "epoch": 0.5260444444444444, + "grad_norm": 0.3818628723386374, + "learning_rate": 9.643076661610196e-05, + "loss": 0.5794, + "step": 2959 + }, + { + "epoch": 0.5262222222222223, + "grad_norm": 0.37561556756123304, + "learning_rate": 9.637322338928504e-05, + "loss": 0.639, + "step": 2960 + }, + { + "epoch": 0.5264, + "grad_norm": 0.3495583032963642, + "learning_rate": 9.631568136493142e-05, + "loss": 0.6032, + "step": 2961 + }, + { + "epoch": 0.5265777777777778, + "grad_norm": 0.35731148939516844, + "learning_rate": 9.625814056211918e-05, + "loss": 0.596, + "step": 2962 + }, + { + "epoch": 0.5267555555555555, + "grad_norm": 0.35354696768364813, + "learning_rate": 9.620060099992609e-05, + "loss": 0.5984, + "step": 2963 + }, + { + "epoch": 0.5269333333333334, + "grad_norm": 0.3601969735002162, + "learning_rate": 9.614306269742947e-05, + "loss": 0.6014, + "step": 2964 + }, + { + "epoch": 0.5271111111111111, + "grad_norm": 0.34677588890476424, + "learning_rate": 9.608552567370626e-05, + "loss": 0.6196, + "step": 2965 + }, + { + "epoch": 0.5272888888888889, + "grad_norm": 0.34555670822742307, + "learning_rate": 9.602798994783289e-05, + "loss": 0.6309, + "step": 2966 + }, + { + "epoch": 0.5274666666666666, + "grad_norm": 0.3430181957042666, + "learning_rate": 9.597045553888548e-05, + "loss": 0.6128, + "step": 2967 + }, + { + "epoch": 0.5276444444444445, + "grad_norm": 0.347476669007943, + "learning_rate": 9.591292246593958e-05, + "loss": 0.5784, + "step": 2968 + }, + { + "epoch": 0.5278222222222222, + "grad_norm": 0.353854717521372, + "learning_rate": 9.585539074807047e-05, + "loss": 0.5561, + "step": 2969 + }, + { + "epoch": 0.528, + "grad_norm": 0.35783471157217955, + "learning_rate": 9.579786040435275e-05, + "loss": 0.6015, + "step": 2970 + }, + { + "epoch": 0.5281777777777777, + "grad_norm": 0.3686395304174823, + "learning_rate": 9.574033145386079e-05, + "loss": 0.6185, + "step": 2971 + }, + { + "epoch": 0.5283555555555556, + "grad_norm": 0.3947530478590641, + "learning_rate": 9.568280391566835e-05, + "loss": 0.6312, + "step": 2972 + }, + { + "epoch": 0.5285333333333333, + "grad_norm": 0.3675404481285992, + "learning_rate": 9.562527780884884e-05, + "loss": 0.6026, + "step": 2973 + }, + { + "epoch": 0.5287111111111111, + "grad_norm": 0.3547325153181744, + "learning_rate": 9.556775315247501e-05, + "loss": 0.6473, + "step": 2974 + }, + { + "epoch": 0.5288888888888889, + "grad_norm": 0.3581767017880835, + "learning_rate": 9.551022996561937e-05, + "loss": 0.5857, + "step": 2975 + }, + { + "epoch": 0.5290666666666667, + "grad_norm": 0.3288979979584883, + "learning_rate": 9.545270826735374e-05, + "loss": 0.5593, + "step": 2976 + }, + { + "epoch": 0.5292444444444444, + "grad_norm": 0.34967501780154436, + "learning_rate": 9.539518807674957e-05, + "loss": 0.6057, + "step": 2977 + }, + { + "epoch": 0.5294222222222222, + "grad_norm": 0.37073253484684826, + "learning_rate": 9.533766941287771e-05, + "loss": 0.6048, + "step": 2978 + }, + { + "epoch": 0.5296, + "grad_norm": 0.37955417288277576, + "learning_rate": 9.528015229480864e-05, + "loss": 0.652, + "step": 2979 + }, + { + "epoch": 0.5297777777777778, + "grad_norm": 0.3873359186323274, + "learning_rate": 9.522263674161215e-05, + "loss": 0.6439, + "step": 2980 + }, + { + "epoch": 0.5299555555555555, + "grad_norm": 0.3820254078051218, + "learning_rate": 9.516512277235771e-05, + "loss": 0.6427, + "step": 2981 + }, + { + "epoch": 0.5301333333333333, + "grad_norm": 0.35265162052968785, + "learning_rate": 9.510761040611406e-05, + "loss": 0.5435, + "step": 2982 + }, + { + "epoch": 0.5303111111111111, + "grad_norm": 0.37053788926145836, + "learning_rate": 9.505009966194964e-05, + "loss": 0.5894, + "step": 2983 + }, + { + "epoch": 0.5304888888888889, + "grad_norm": 0.33521735108513867, + "learning_rate": 9.499259055893208e-05, + "loss": 0.5472, + "step": 2984 + }, + { + "epoch": 0.5306666666666666, + "grad_norm": 0.3646828974786125, + "learning_rate": 9.493508311612874e-05, + "loss": 0.633, + "step": 2985 + }, + { + "epoch": 0.5308444444444445, + "grad_norm": 0.37056438526004687, + "learning_rate": 9.48775773526062e-05, + "loss": 0.603, + "step": 2986 + }, + { + "epoch": 0.5310222222222222, + "grad_norm": 0.37787245926577895, + "learning_rate": 9.482007328743065e-05, + "loss": 0.6098, + "step": 2987 + }, + { + "epoch": 0.5312, + "grad_norm": 0.36488837379231537, + "learning_rate": 9.47625709396676e-05, + "loss": 0.5894, + "step": 2988 + }, + { + "epoch": 0.5313777777777777, + "grad_norm": 0.35361590777252105, + "learning_rate": 9.470507032838208e-05, + "loss": 0.6264, + "step": 2989 + }, + { + "epoch": 0.5315555555555556, + "grad_norm": 0.35393507601515406, + "learning_rate": 9.464757147263849e-05, + "loss": 0.5819, + "step": 2990 + }, + { + "epoch": 0.5317333333333333, + "grad_norm": 0.3648103392845752, + "learning_rate": 9.459007439150066e-05, + "loss": 0.6362, + "step": 2991 + }, + { + "epoch": 0.5319111111111111, + "grad_norm": 0.43035577017104076, + "learning_rate": 9.45325791040318e-05, + "loss": 0.5852, + "step": 2992 + }, + { + "epoch": 0.5320888888888888, + "grad_norm": 0.35528654638984675, + "learning_rate": 9.447508562929465e-05, + "loss": 0.5984, + "step": 2993 + }, + { + "epoch": 0.5322666666666667, + "grad_norm": 0.363727616042198, + "learning_rate": 9.441759398635115e-05, + "loss": 0.6244, + "step": 2994 + }, + { + "epoch": 0.5324444444444445, + "grad_norm": 0.34812432126185633, + "learning_rate": 9.436010419426283e-05, + "loss": 0.6041, + "step": 2995 + }, + { + "epoch": 0.5326222222222222, + "grad_norm": 0.3844321186212034, + "learning_rate": 9.430261627209044e-05, + "loss": 0.6444, + "step": 2996 + }, + { + "epoch": 0.5328, + "grad_norm": 0.3362309205097066, + "learning_rate": 9.424513023889427e-05, + "loss": 0.596, + "step": 2997 + }, + { + "epoch": 0.5329777777777778, + "grad_norm": 0.36276629116989384, + "learning_rate": 9.418764611373382e-05, + "loss": 0.6008, + "step": 2998 + }, + { + "epoch": 0.5331555555555556, + "grad_norm": 0.3356011124921123, + "learning_rate": 9.413016391566813e-05, + "loss": 0.6017, + "step": 2999 + }, + { + "epoch": 0.5333333333333333, + "grad_norm": 0.3428435844038877, + "learning_rate": 9.407268366375541e-05, + "loss": 0.5736, + "step": 3000 + }, + { + "epoch": 0.5335111111111112, + "grad_norm": 0.3745312609198618, + "learning_rate": 9.401520537705339e-05, + "loss": 0.6146, + "step": 3001 + }, + { + "epoch": 0.5336888888888889, + "grad_norm": 0.34099990359193805, + "learning_rate": 9.395772907461906e-05, + "loss": 0.576, + "step": 3002 + }, + { + "epoch": 0.5338666666666667, + "grad_norm": 0.3516542656742074, + "learning_rate": 9.39002547755088e-05, + "loss": 0.6039, + "step": 3003 + }, + { + "epoch": 0.5340444444444444, + "grad_norm": 0.35598025192935656, + "learning_rate": 9.384278249877823e-05, + "loss": 0.6539, + "step": 3004 + }, + { + "epoch": 0.5342222222222223, + "grad_norm": 0.3539822890750942, + "learning_rate": 9.378531226348247e-05, + "loss": 0.6625, + "step": 3005 + }, + { + "epoch": 0.5344, + "grad_norm": 0.3944789672267754, + "learning_rate": 9.372784408867577e-05, + "loss": 0.6594, + "step": 3006 + }, + { + "epoch": 0.5345777777777778, + "grad_norm": 0.3627464888858326, + "learning_rate": 9.367037799341187e-05, + "loss": 0.6153, + "step": 3007 + }, + { + "epoch": 0.5347555555555555, + "grad_norm": 0.3605039588772481, + "learning_rate": 9.361291399674367e-05, + "loss": 0.6531, + "step": 3008 + }, + { + "epoch": 0.5349333333333334, + "grad_norm": 0.3765641551584363, + "learning_rate": 9.35554521177235e-05, + "loss": 0.6389, + "step": 3009 + }, + { + "epoch": 0.5351111111111111, + "grad_norm": 0.3485758876463875, + "learning_rate": 9.349799237540288e-05, + "loss": 0.6145, + "step": 3010 + }, + { + "epoch": 0.5352888888888889, + "grad_norm": 0.6935716907372586, + "learning_rate": 9.344053478883277e-05, + "loss": 0.6376, + "step": 3011 + }, + { + "epoch": 0.5354666666666666, + "grad_norm": 0.3492611561333636, + "learning_rate": 9.338307937706321e-05, + "loss": 0.6033, + "step": 3012 + }, + { + "epoch": 0.5356444444444445, + "grad_norm": 0.34455975255882687, + "learning_rate": 9.332562615914368e-05, + "loss": 0.6322, + "step": 3013 + }, + { + "epoch": 0.5358222222222222, + "grad_norm": 0.3441906225206993, + "learning_rate": 9.326817515412287e-05, + "loss": 0.6155, + "step": 3014 + }, + { + "epoch": 0.536, + "grad_norm": 0.4294715417975386, + "learning_rate": 9.321072638104879e-05, + "loss": 0.5586, + "step": 3015 + }, + { + "epoch": 0.5361777777777778, + "grad_norm": 0.37452621132649355, + "learning_rate": 9.315327985896857e-05, + "loss": 0.6203, + "step": 3016 + }, + { + "epoch": 0.5363555555555556, + "grad_norm": 0.3631902743294269, + "learning_rate": 9.30958356069288e-05, + "loss": 0.5975, + "step": 3017 + }, + { + "epoch": 0.5365333333333333, + "grad_norm": 0.37337044984881307, + "learning_rate": 9.303839364397511e-05, + "loss": 0.6122, + "step": 3018 + }, + { + "epoch": 0.5367111111111111, + "grad_norm": 0.3399066713893503, + "learning_rate": 9.298095398915256e-05, + "loss": 0.6118, + "step": 3019 + }, + { + "epoch": 0.5368888888888889, + "grad_norm": 0.34556326993052, + "learning_rate": 9.292351666150528e-05, + "loss": 0.6076, + "step": 3020 + }, + { + "epoch": 0.5370666666666667, + "grad_norm": 0.3545040718126656, + "learning_rate": 9.286608168007678e-05, + "loss": 0.6236, + "step": 3021 + }, + { + "epoch": 0.5372444444444444, + "grad_norm": 0.33769307856844244, + "learning_rate": 9.280864906390963e-05, + "loss": 0.5957, + "step": 3022 + }, + { + "epoch": 0.5374222222222222, + "grad_norm": 0.3559084304526048, + "learning_rate": 9.275121883204577e-05, + "loss": 0.6206, + "step": 3023 + }, + { + "epoch": 0.5376, + "grad_norm": 0.34373779963744044, + "learning_rate": 9.269379100352624e-05, + "loss": 0.6094, + "step": 3024 + }, + { + "epoch": 0.5377777777777778, + "grad_norm": 0.3518854392861585, + "learning_rate": 9.263636559739132e-05, + "loss": 0.5657, + "step": 3025 + }, + { + "epoch": 0.5379555555555555, + "grad_norm": 0.343681953536367, + "learning_rate": 9.257894263268054e-05, + "loss": 0.5821, + "step": 3026 + }, + { + "epoch": 0.5381333333333334, + "grad_norm": 0.357766217489286, + "learning_rate": 9.252152212843252e-05, + "loss": 0.6149, + "step": 3027 + }, + { + "epoch": 0.5383111111111111, + "grad_norm": 0.35798739109935196, + "learning_rate": 9.24641041036851e-05, + "loss": 0.6395, + "step": 3028 + }, + { + "epoch": 0.5384888888888889, + "grad_norm": 0.3535057450281119, + "learning_rate": 9.24066885774754e-05, + "loss": 0.5945, + "step": 3029 + }, + { + "epoch": 0.5386666666666666, + "grad_norm": 0.36059981864387025, + "learning_rate": 9.23492755688395e-05, + "loss": 0.5919, + "step": 3030 + }, + { + "epoch": 0.5388444444444445, + "grad_norm": 0.35602087109642055, + "learning_rate": 9.22918650968129e-05, + "loss": 0.6057, + "step": 3031 + }, + { + "epoch": 0.5390222222222222, + "grad_norm": 0.355986237405648, + "learning_rate": 9.223445718043001e-05, + "loss": 0.6122, + "step": 3032 + }, + { + "epoch": 0.5392, + "grad_norm": 0.33096217158480706, + "learning_rate": 9.217705183872462e-05, + "loss": 0.5677, + "step": 3033 + }, + { + "epoch": 0.5393777777777777, + "grad_norm": 0.3382042520643305, + "learning_rate": 9.211964909072945e-05, + "loss": 0.6099, + "step": 3034 + }, + { + "epoch": 0.5395555555555556, + "grad_norm": 0.3787502846974754, + "learning_rate": 9.206224895547658e-05, + "loss": 0.6299, + "step": 3035 + }, + { + "epoch": 0.5397333333333333, + "grad_norm": 0.3586806414897177, + "learning_rate": 9.200485145199704e-05, + "loss": 0.6523, + "step": 3036 + }, + { + "epoch": 0.5399111111111111, + "grad_norm": 0.3781378245497815, + "learning_rate": 9.194745659932105e-05, + "loss": 0.6195, + "step": 3037 + }, + { + "epoch": 0.5400888888888888, + "grad_norm": 0.4171924019844307, + "learning_rate": 9.189006441647799e-05, + "loss": 0.6305, + "step": 3038 + }, + { + "epoch": 0.5402666666666667, + "grad_norm": 0.3824494785777442, + "learning_rate": 9.183267492249635e-05, + "loss": 0.598, + "step": 3039 + }, + { + "epoch": 0.5404444444444444, + "grad_norm": 0.3513291151204152, + "learning_rate": 9.177528813640362e-05, + "loss": 0.5873, + "step": 3040 + }, + { + "epoch": 0.5406222222222222, + "grad_norm": 0.35935904618984305, + "learning_rate": 9.171790407722656e-05, + "loss": 0.5844, + "step": 3041 + }, + { + "epoch": 0.5408, + "grad_norm": 0.3441305387135324, + "learning_rate": 9.166052276399088e-05, + "loss": 0.6182, + "step": 3042 + }, + { + "epoch": 0.5409777777777778, + "grad_norm": 0.38530641545228766, + "learning_rate": 9.160314421572152e-05, + "loss": 0.5529, + "step": 3043 + }, + { + "epoch": 0.5411555555555555, + "grad_norm": 0.36894512800058743, + "learning_rate": 9.154576845144231e-05, + "loss": 0.6071, + "step": 3044 + }, + { + "epoch": 0.5413333333333333, + "grad_norm": 0.3581027666785767, + "learning_rate": 9.148839549017639e-05, + "loss": 0.6237, + "step": 3045 + }, + { + "epoch": 0.5415111111111112, + "grad_norm": 0.3450882268316929, + "learning_rate": 9.143102535094575e-05, + "loss": 0.6142, + "step": 3046 + }, + { + "epoch": 0.5416888888888889, + "grad_norm": 0.4960994345055827, + "learning_rate": 9.137365805277164e-05, + "loss": 0.568, + "step": 3047 + }, + { + "epoch": 0.5418666666666667, + "grad_norm": 0.35924534009725706, + "learning_rate": 9.13162936146742e-05, + "loss": 0.6158, + "step": 3048 + }, + { + "epoch": 0.5420444444444444, + "grad_norm": 0.3667035458947974, + "learning_rate": 9.125893205567273e-05, + "loss": 0.6148, + "step": 3049 + }, + { + "epoch": 0.5422222222222223, + "grad_norm": 0.36158423398438444, + "learning_rate": 9.120157339478555e-05, + "loss": 0.6246, + "step": 3050 + }, + { + "epoch": 0.5424, + "grad_norm": 0.34326281940349385, + "learning_rate": 9.114421765102999e-05, + "loss": 0.5021, + "step": 3051 + }, + { + "epoch": 0.5425777777777778, + "grad_norm": 0.35569674283341973, + "learning_rate": 9.108686484342241e-05, + "loss": 0.6413, + "step": 3052 + }, + { + "epoch": 0.5427555555555555, + "grad_norm": 0.34389443820476223, + "learning_rate": 9.102951499097829e-05, + "loss": 0.5538, + "step": 3053 + }, + { + "epoch": 0.5429333333333334, + "grad_norm": 0.35396625946768323, + "learning_rate": 9.097216811271199e-05, + "loss": 0.6344, + "step": 3054 + }, + { + "epoch": 0.5431111111111111, + "grad_norm": 0.341088389696483, + "learning_rate": 9.0914824227637e-05, + "loss": 0.6164, + "step": 3055 + }, + { + "epoch": 0.5432888888888889, + "grad_norm": 0.36311799866731137, + "learning_rate": 9.085748335476573e-05, + "loss": 0.5969, + "step": 3056 + }, + { + "epoch": 0.5434666666666667, + "grad_norm": 0.3531098612566486, + "learning_rate": 9.08001455131097e-05, + "loss": 0.5955, + "step": 3057 + }, + { + "epoch": 0.5436444444444445, + "grad_norm": 0.3562884086810283, + "learning_rate": 9.074281072167928e-05, + "loss": 0.609, + "step": 3058 + }, + { + "epoch": 0.5438222222222222, + "grad_norm": 0.346151980360336, + "learning_rate": 9.068547899948396e-05, + "loss": 0.6094, + "step": 3059 + }, + { + "epoch": 0.544, + "grad_norm": 0.3731839449531166, + "learning_rate": 9.062815036553213e-05, + "loss": 0.6181, + "step": 3060 + }, + { + "epoch": 0.5441777777777778, + "grad_norm": 0.34747862061774065, + "learning_rate": 9.057082483883122e-05, + "loss": 0.6033, + "step": 3061 + }, + { + "epoch": 0.5443555555555556, + "grad_norm": 0.3392560270491025, + "learning_rate": 9.051350243838756e-05, + "loss": 0.5926, + "step": 3062 + }, + { + "epoch": 0.5445333333333333, + "grad_norm": 0.34041497572993407, + "learning_rate": 9.045618318320651e-05, + "loss": 0.5691, + "step": 3063 + }, + { + "epoch": 0.5447111111111111, + "grad_norm": 0.34623108779299117, + "learning_rate": 9.039886709229229e-05, + "loss": 0.6024, + "step": 3064 + }, + { + "epoch": 0.5448888888888889, + "grad_norm": 0.37725364633926356, + "learning_rate": 9.034155418464823e-05, + "loss": 0.6451, + "step": 3065 + }, + { + "epoch": 0.5450666666666667, + "grad_norm": 0.3545440476571932, + "learning_rate": 9.028424447927641e-05, + "loss": 0.5789, + "step": 3066 + }, + { + "epoch": 0.5452444444444444, + "grad_norm": 0.33553046371123374, + "learning_rate": 9.022693799517806e-05, + "loss": 0.6083, + "step": 3067 + }, + { + "epoch": 0.5454222222222223, + "grad_norm": 0.3649647731076927, + "learning_rate": 9.016963475135313e-05, + "loss": 0.5908, + "step": 3068 + }, + { + "epoch": 0.5456, + "grad_norm": 0.3646861002799353, + "learning_rate": 9.011233476680067e-05, + "loss": 0.6235, + "step": 3069 + }, + { + "epoch": 0.5457777777777778, + "grad_norm": 0.3588797191871229, + "learning_rate": 9.005503806051853e-05, + "loss": 0.6129, + "step": 3070 + }, + { + "epoch": 0.5459555555555555, + "grad_norm": 0.3656544796879238, + "learning_rate": 8.999774465150356e-05, + "loss": 0.6313, + "step": 3071 + }, + { + "epoch": 0.5461333333333334, + "grad_norm": 0.3511686649342916, + "learning_rate": 8.994045455875142e-05, + "loss": 0.6027, + "step": 3072 + }, + { + "epoch": 0.5463111111111111, + "grad_norm": 0.3528330258003095, + "learning_rate": 8.98831678012568e-05, + "loss": 0.5755, + "step": 3073 + }, + { + "epoch": 0.5464888888888889, + "grad_norm": 0.37001058784158514, + "learning_rate": 8.982588439801314e-05, + "loss": 0.6382, + "step": 3074 + }, + { + "epoch": 0.5466666666666666, + "grad_norm": 0.3509138043944784, + "learning_rate": 8.976860436801291e-05, + "loss": 0.5873, + "step": 3075 + }, + { + "epoch": 0.5468444444444445, + "grad_norm": 0.35842961480793795, + "learning_rate": 8.97113277302473e-05, + "loss": 0.6716, + "step": 3076 + }, + { + "epoch": 0.5470222222222222, + "grad_norm": 0.3702768845996892, + "learning_rate": 8.965405450370655e-05, + "loss": 0.6196, + "step": 3077 + }, + { + "epoch": 0.5472, + "grad_norm": 0.3693486007256979, + "learning_rate": 8.959678470737962e-05, + "loss": 0.6085, + "step": 3078 + }, + { + "epoch": 0.5473777777777777, + "grad_norm": 0.3669139475822687, + "learning_rate": 8.953951836025446e-05, + "loss": 0.6296, + "step": 3079 + }, + { + "epoch": 0.5475555555555556, + "grad_norm": 0.36325698313278154, + "learning_rate": 8.948225548131775e-05, + "loss": 0.6286, + "step": 3080 + }, + { + "epoch": 0.5477333333333333, + "grad_norm": 0.35813603385671217, + "learning_rate": 8.942499608955516e-05, + "loss": 0.5871, + "step": 3081 + }, + { + "epoch": 0.5479111111111111, + "grad_norm": 0.3641911072277507, + "learning_rate": 8.936774020395103e-05, + "loss": 0.6287, + "step": 3082 + }, + { + "epoch": 0.5480888888888888, + "grad_norm": 0.3555746540746553, + "learning_rate": 8.931048784348875e-05, + "loss": 0.584, + "step": 3083 + }, + { + "epoch": 0.5482666666666667, + "grad_norm": 0.5759905763971599, + "learning_rate": 8.925323902715031e-05, + "loss": 0.6031, + "step": 3084 + }, + { + "epoch": 0.5484444444444444, + "grad_norm": 0.37142069058373095, + "learning_rate": 8.919599377391673e-05, + "loss": 0.5777, + "step": 3085 + }, + { + "epoch": 0.5486222222222222, + "grad_norm": 0.35549108718528766, + "learning_rate": 8.913875210276772e-05, + "loss": 0.6154, + "step": 3086 + }, + { + "epoch": 0.5488, + "grad_norm": 0.44243168121754856, + "learning_rate": 8.908151403268184e-05, + "loss": 0.6181, + "step": 3087 + }, + { + "epoch": 0.5489777777777778, + "grad_norm": 0.343253774873771, + "learning_rate": 8.902427958263648e-05, + "loss": 0.5662, + "step": 3088 + }, + { + "epoch": 0.5491555555555555, + "grad_norm": 0.39428861280338556, + "learning_rate": 8.896704877160782e-05, + "loss": 0.6282, + "step": 3089 + }, + { + "epoch": 0.5493333333333333, + "grad_norm": 0.3653713214750227, + "learning_rate": 8.890982161857076e-05, + "loss": 0.5895, + "step": 3090 + }, + { + "epoch": 0.5495111111111111, + "grad_norm": 0.3471673275406715, + "learning_rate": 8.885259814249911e-05, + "loss": 0.6172, + "step": 3091 + }, + { + "epoch": 0.5496888888888889, + "grad_norm": 0.3665147045347773, + "learning_rate": 8.879537836236536e-05, + "loss": 0.6507, + "step": 3092 + }, + { + "epoch": 0.5498666666666666, + "grad_norm": 0.3592818666062855, + "learning_rate": 8.873816229714084e-05, + "loss": 0.624, + "step": 3093 + }, + { + "epoch": 0.5500444444444444, + "grad_norm": 0.34200932991818483, + "learning_rate": 8.868094996579561e-05, + "loss": 0.5988, + "step": 3094 + }, + { + "epoch": 0.5502222222222222, + "grad_norm": 0.34951040669998445, + "learning_rate": 8.862374138729853e-05, + "loss": 0.5816, + "step": 3095 + }, + { + "epoch": 0.5504, + "grad_norm": 0.3392300153933995, + "learning_rate": 8.856653658061713e-05, + "loss": 0.6101, + "step": 3096 + }, + { + "epoch": 0.5505777777777778, + "grad_norm": 0.3660001741569265, + "learning_rate": 8.850933556471785e-05, + "loss": 0.5924, + "step": 3097 + }, + { + "epoch": 0.5507555555555556, + "grad_norm": 0.3530203304048615, + "learning_rate": 8.845213835856568e-05, + "loss": 0.6034, + "step": 3098 + }, + { + "epoch": 0.5509333333333334, + "grad_norm": 0.349978795488595, + "learning_rate": 8.839494498112447e-05, + "loss": 0.5489, + "step": 3099 + }, + { + "epoch": 0.5511111111111111, + "grad_norm": 0.3571641427643841, + "learning_rate": 8.833775545135678e-05, + "loss": 0.6479, + "step": 3100 + }, + { + "epoch": 0.5512888888888889, + "grad_norm": 0.3642650747991144, + "learning_rate": 8.82805697882239e-05, + "loss": 0.6121, + "step": 3101 + }, + { + "epoch": 0.5514666666666667, + "grad_norm": 0.377636787329736, + "learning_rate": 8.822338801068575e-05, + "loss": 0.6204, + "step": 3102 + }, + { + "epoch": 0.5516444444444445, + "grad_norm": 0.34967875380412294, + "learning_rate": 8.816621013770114e-05, + "loss": 0.6137, + "step": 3103 + }, + { + "epoch": 0.5518222222222222, + "grad_norm": 0.40166395689072415, + "learning_rate": 8.810903618822739e-05, + "loss": 0.6482, + "step": 3104 + }, + { + "epoch": 0.552, + "grad_norm": 0.3770578571856448, + "learning_rate": 8.805186618122068e-05, + "loss": 0.6411, + "step": 3105 + }, + { + "epoch": 0.5521777777777778, + "grad_norm": 0.3911401868679578, + "learning_rate": 8.799470013563573e-05, + "loss": 0.6278, + "step": 3106 + }, + { + "epoch": 0.5523555555555556, + "grad_norm": 0.3364077173465212, + "learning_rate": 8.793753807042613e-05, + "loss": 0.5746, + "step": 3107 + }, + { + "epoch": 0.5525333333333333, + "grad_norm": 0.3699284574000947, + "learning_rate": 8.788038000454395e-05, + "loss": 0.6174, + "step": 3108 + }, + { + "epoch": 0.5527111111111112, + "grad_norm": 0.41597305088654535, + "learning_rate": 8.782322595694012e-05, + "loss": 0.6114, + "step": 3109 + }, + { + "epoch": 0.5528888888888889, + "grad_norm": 0.37236736217163985, + "learning_rate": 8.77660759465641e-05, + "loss": 0.6186, + "step": 3110 + }, + { + "epoch": 0.5530666666666667, + "grad_norm": 0.33558960816760525, + "learning_rate": 8.770892999236405e-05, + "loss": 0.5467, + "step": 3111 + }, + { + "epoch": 0.5532444444444444, + "grad_norm": 0.360362982104993, + "learning_rate": 8.765178811328684e-05, + "loss": 0.6446, + "step": 3112 + }, + { + "epoch": 0.5534222222222223, + "grad_norm": 0.364594648735862, + "learning_rate": 8.759465032827794e-05, + "loss": 0.5886, + "step": 3113 + }, + { + "epoch": 0.5536, + "grad_norm": 0.36715831679915656, + "learning_rate": 8.753751665628141e-05, + "loss": 0.5832, + "step": 3114 + }, + { + "epoch": 0.5537777777777778, + "grad_norm": 0.35640763908920553, + "learning_rate": 8.74803871162401e-05, + "loss": 0.6061, + "step": 3115 + }, + { + "epoch": 0.5539555555555555, + "grad_norm": 0.35505394075035235, + "learning_rate": 8.74232617270953e-05, + "loss": 0.6006, + "step": 3116 + }, + { + "epoch": 0.5541333333333334, + "grad_norm": 0.35864570486856606, + "learning_rate": 8.73661405077871e-05, + "loss": 0.6114, + "step": 3117 + }, + { + "epoch": 0.5543111111111111, + "grad_norm": 0.3477896098016776, + "learning_rate": 8.730902347725406e-05, + "loss": 0.6191, + "step": 3118 + }, + { + "epoch": 0.5544888888888889, + "grad_norm": 0.34130412328102105, + "learning_rate": 8.725191065443348e-05, + "loss": 0.5663, + "step": 3119 + }, + { + "epoch": 0.5546666666666666, + "grad_norm": 0.3485717646431946, + "learning_rate": 8.719480205826113e-05, + "loss": 0.5826, + "step": 3120 + }, + { + "epoch": 0.5548444444444445, + "grad_norm": 0.3902562888116477, + "learning_rate": 8.713769770767155e-05, + "loss": 0.6397, + "step": 3121 + }, + { + "epoch": 0.5550222222222222, + "grad_norm": 0.36643025161550385, + "learning_rate": 8.708059762159768e-05, + "loss": 0.5958, + "step": 3122 + }, + { + "epoch": 0.5552, + "grad_norm": 0.3640031617331684, + "learning_rate": 8.702350181897118e-05, + "loss": 0.633, + "step": 3123 + }, + { + "epoch": 0.5553777777777777, + "grad_norm": 0.3465618679460059, + "learning_rate": 8.696641031872224e-05, + "loss": 0.5949, + "step": 3124 + }, + { + "epoch": 0.5555555555555556, + "grad_norm": 0.3488210519703397, + "learning_rate": 8.690932313977967e-05, + "loss": 0.5929, + "step": 3125 + }, + { + "epoch": 0.5557333333333333, + "grad_norm": 0.3515530668214604, + "learning_rate": 8.685224030107074e-05, + "loss": 0.6046, + "step": 3126 + }, + { + "epoch": 0.5559111111111111, + "grad_norm": 0.3666684164577866, + "learning_rate": 8.679516182152142e-05, + "loss": 0.6072, + "step": 3127 + }, + { + "epoch": 0.5560888888888889, + "grad_norm": 0.34904269030972257, + "learning_rate": 8.67380877200561e-05, + "loss": 0.5815, + "step": 3128 + }, + { + "epoch": 0.5562666666666667, + "grad_norm": 0.36260934065996636, + "learning_rate": 8.668101801559786e-05, + "loss": 0.5884, + "step": 3129 + }, + { + "epoch": 0.5564444444444444, + "grad_norm": 0.3628142272416488, + "learning_rate": 8.662395272706816e-05, + "loss": 0.5717, + "step": 3130 + }, + { + "epoch": 0.5566222222222222, + "grad_norm": 0.3459240506705345, + "learning_rate": 8.656689187338719e-05, + "loss": 0.6121, + "step": 3131 + }, + { + "epoch": 0.5568, + "grad_norm": 0.344497018084573, + "learning_rate": 8.650983547347344e-05, + "loss": 0.6168, + "step": 3132 + }, + { + "epoch": 0.5569777777777778, + "grad_norm": 0.35586702287311356, + "learning_rate": 8.645278354624417e-05, + "loss": 0.5718, + "step": 3133 + }, + { + "epoch": 0.5571555555555555, + "grad_norm": 0.3322156957893119, + "learning_rate": 8.639573611061493e-05, + "loss": 0.5861, + "step": 3134 + }, + { + "epoch": 0.5573333333333333, + "grad_norm": 0.3476549058512365, + "learning_rate": 8.633869318549994e-05, + "loss": 0.607, + "step": 3135 + }, + { + "epoch": 0.5575111111111111, + "grad_norm": 0.37275228473645716, + "learning_rate": 8.628165478981184e-05, + "loss": 0.5912, + "step": 3136 + }, + { + "epoch": 0.5576888888888889, + "grad_norm": 0.36493057153297015, + "learning_rate": 8.622462094246184e-05, + "loss": 0.6098, + "step": 3137 + }, + { + "epoch": 0.5578666666666666, + "grad_norm": 0.37000568478540485, + "learning_rate": 8.616759166235953e-05, + "loss": 0.6161, + "step": 3138 + }, + { + "epoch": 0.5580444444444445, + "grad_norm": 0.35943889795557477, + "learning_rate": 8.611056696841312e-05, + "loss": 0.565, + "step": 3139 + }, + { + "epoch": 0.5582222222222222, + "grad_norm": 0.3481778216699788, + "learning_rate": 8.605354687952915e-05, + "loss": 0.5788, + "step": 3140 + }, + { + "epoch": 0.5584, + "grad_norm": 0.351397478124162, + "learning_rate": 8.599653141461283e-05, + "loss": 0.6196, + "step": 3141 + }, + { + "epoch": 0.5585777777777777, + "grad_norm": 0.4284752981764255, + "learning_rate": 8.593952059256762e-05, + "loss": 0.6028, + "step": 3142 + }, + { + "epoch": 0.5587555555555556, + "grad_norm": 0.34091581138652954, + "learning_rate": 8.588251443229563e-05, + "loss": 0.6181, + "step": 3143 + }, + { + "epoch": 0.5589333333333333, + "grad_norm": 0.33577299657976956, + "learning_rate": 8.582551295269726e-05, + "loss": 0.5781, + "step": 3144 + }, + { + "epoch": 0.5591111111111111, + "grad_norm": 0.35497584121487163, + "learning_rate": 8.57685161726715e-05, + "loss": 0.6502, + "step": 3145 + }, + { + "epoch": 0.5592888888888888, + "grad_norm": 0.33747314274775486, + "learning_rate": 8.571152411111568e-05, + "loss": 0.6141, + "step": 3146 + }, + { + "epoch": 0.5594666666666667, + "grad_norm": 0.344317610914493, + "learning_rate": 8.565453678692561e-05, + "loss": 0.6049, + "step": 3147 + }, + { + "epoch": 0.5596444444444445, + "grad_norm": 0.3653650100436146, + "learning_rate": 8.559755421899554e-05, + "loss": 0.5983, + "step": 3148 + }, + { + "epoch": 0.5598222222222222, + "grad_norm": 0.35753188265371255, + "learning_rate": 8.554057642621813e-05, + "loss": 0.5594, + "step": 3149 + }, + { + "epoch": 0.56, + "grad_norm": 0.34542644785606025, + "learning_rate": 8.54836034274844e-05, + "loss": 0.6067, + "step": 3150 + }, + { + "epoch": 0.5601777777777778, + "grad_norm": 0.3546525697413544, + "learning_rate": 8.54266352416839e-05, + "loss": 0.6302, + "step": 3151 + }, + { + "epoch": 0.5603555555555556, + "grad_norm": 0.3801307029221448, + "learning_rate": 8.536967188770447e-05, + "loss": 0.5578, + "step": 3152 + }, + { + "epoch": 0.5605333333333333, + "grad_norm": 0.4333945897184056, + "learning_rate": 8.531271338443245e-05, + "loss": 0.5995, + "step": 3153 + }, + { + "epoch": 0.5607111111111112, + "grad_norm": 0.3794676206788478, + "learning_rate": 8.525575975075243e-05, + "loss": 0.6231, + "step": 3154 + }, + { + "epoch": 0.5608888888888889, + "grad_norm": 0.3445862961150107, + "learning_rate": 8.519881100554758e-05, + "loss": 0.5897, + "step": 3155 + }, + { + "epoch": 0.5610666666666667, + "grad_norm": 0.34775161594446613, + "learning_rate": 8.514186716769924e-05, + "loss": 0.5942, + "step": 3156 + }, + { + "epoch": 0.5612444444444444, + "grad_norm": 0.3543524040787787, + "learning_rate": 8.508492825608733e-05, + "loss": 0.5821, + "step": 3157 + }, + { + "epoch": 0.5614222222222223, + "grad_norm": 0.36300900376343004, + "learning_rate": 8.502799428958994e-05, + "loss": 0.5952, + "step": 3158 + }, + { + "epoch": 0.5616, + "grad_norm": 0.3597064410580433, + "learning_rate": 8.497106528708368e-05, + "loss": 0.6607, + "step": 3159 + }, + { + "epoch": 0.5617777777777778, + "grad_norm": 0.3506203636181199, + "learning_rate": 8.491414126744339e-05, + "loss": 0.5876, + "step": 3160 + }, + { + "epoch": 0.5619555555555555, + "grad_norm": 0.3617457160387125, + "learning_rate": 8.485722224954237e-05, + "loss": 0.6384, + "step": 3161 + }, + { + "epoch": 0.5621333333333334, + "grad_norm": 0.38679068330514044, + "learning_rate": 8.480030825225213e-05, + "loss": 0.6219, + "step": 3162 + }, + { + "epoch": 0.5623111111111111, + "grad_norm": 0.34490811120009385, + "learning_rate": 8.47433992944427e-05, + "loss": 0.5689, + "step": 3163 + }, + { + "epoch": 0.5624888888888889, + "grad_norm": 0.36722133181190103, + "learning_rate": 8.46864953949822e-05, + "loss": 0.6173, + "step": 3164 + }, + { + "epoch": 0.5626666666666666, + "grad_norm": 0.38256267314838216, + "learning_rate": 8.462959657273733e-05, + "loss": 0.5816, + "step": 3165 + }, + { + "epoch": 0.5628444444444445, + "grad_norm": 0.3775268343160406, + "learning_rate": 8.45727028465729e-05, + "loss": 0.5871, + "step": 3166 + }, + { + "epoch": 0.5630222222222222, + "grad_norm": 0.36256538279905787, + "learning_rate": 8.451581423535216e-05, + "loss": 0.5537, + "step": 3167 + }, + { + "epoch": 0.5632, + "grad_norm": 0.35620464035234517, + "learning_rate": 8.445893075793654e-05, + "loss": 0.5896, + "step": 3168 + }, + { + "epoch": 0.5633777777777778, + "grad_norm": 0.3646897876478277, + "learning_rate": 8.440205243318595e-05, + "loss": 0.5723, + "step": 3169 + }, + { + "epoch": 0.5635555555555556, + "grad_norm": 0.3747770265453073, + "learning_rate": 8.434517927995837e-05, + "loss": 0.578, + "step": 3170 + }, + { + "epoch": 0.5637333333333333, + "grad_norm": 0.37331941498167825, + "learning_rate": 8.42883113171103e-05, + "loss": 0.6196, + "step": 3171 + }, + { + "epoch": 0.5639111111111111, + "grad_norm": 0.37541078837816255, + "learning_rate": 8.423144856349631e-05, + "loss": 0.6456, + "step": 3172 + }, + { + "epoch": 0.5640888888888889, + "grad_norm": 0.3559919088487258, + "learning_rate": 8.417459103796934e-05, + "loss": 0.6194, + "step": 3173 + }, + { + "epoch": 0.5642666666666667, + "grad_norm": 0.34176307523197985, + "learning_rate": 8.411773875938062e-05, + "loss": 0.6091, + "step": 3174 + }, + { + "epoch": 0.5644444444444444, + "grad_norm": 0.36413620864705437, + "learning_rate": 8.406089174657963e-05, + "loss": 0.6342, + "step": 3175 + }, + { + "epoch": 0.5646222222222222, + "grad_norm": 0.3507592133296721, + "learning_rate": 8.400405001841399e-05, + "loss": 0.6182, + "step": 3176 + }, + { + "epoch": 0.5648, + "grad_norm": 0.36304011525177293, + "learning_rate": 8.394721359372977e-05, + "loss": 0.6337, + "step": 3177 + }, + { + "epoch": 0.5649777777777778, + "grad_norm": 0.3826975668612223, + "learning_rate": 8.389038249137107e-05, + "loss": 0.6457, + "step": 3178 + }, + { + "epoch": 0.5651555555555555, + "grad_norm": 0.3642988280175765, + "learning_rate": 8.383355673018042e-05, + "loss": 0.5975, + "step": 3179 + }, + { + "epoch": 0.5653333333333334, + "grad_norm": 0.35835731920280894, + "learning_rate": 8.37767363289984e-05, + "loss": 0.5828, + "step": 3180 + }, + { + "epoch": 0.5655111111111111, + "grad_norm": 0.35282122749763534, + "learning_rate": 8.371992130666402e-05, + "loss": 0.6017, + "step": 3181 + }, + { + "epoch": 0.5656888888888889, + "grad_norm": 0.37653502126753874, + "learning_rate": 8.366311168201424e-05, + "loss": 0.5912, + "step": 3182 + }, + { + "epoch": 0.5658666666666666, + "grad_norm": 0.355864785060337, + "learning_rate": 8.36063074738845e-05, + "loss": 0.5955, + "step": 3183 + }, + { + "epoch": 0.5660444444444445, + "grad_norm": 0.44217982011143075, + "learning_rate": 8.354950870110825e-05, + "loss": 0.5671, + "step": 3184 + }, + { + "epoch": 0.5662222222222222, + "grad_norm": 0.3819307115013125, + "learning_rate": 8.349271538251723e-05, + "loss": 0.6272, + "step": 3185 + }, + { + "epoch": 0.5664, + "grad_norm": 0.37564114361939993, + "learning_rate": 8.343592753694135e-05, + "loss": 0.5794, + "step": 3186 + }, + { + "epoch": 0.5665777777777777, + "grad_norm": 0.3547291479327033, + "learning_rate": 8.337914518320873e-05, + "loss": 0.6149, + "step": 3187 + }, + { + "epoch": 0.5667555555555556, + "grad_norm": 0.483125549496956, + "learning_rate": 8.332236834014557e-05, + "loss": 0.6427, + "step": 3188 + }, + { + "epoch": 0.5669333333333333, + "grad_norm": 0.34078864154225064, + "learning_rate": 8.326559702657642e-05, + "loss": 0.5659, + "step": 3189 + }, + { + "epoch": 0.5671111111111111, + "grad_norm": 0.3483396814003731, + "learning_rate": 8.320883126132379e-05, + "loss": 0.5937, + "step": 3190 + }, + { + "epoch": 0.5672888888888888, + "grad_norm": 0.3558735546193817, + "learning_rate": 8.315207106320856e-05, + "loss": 0.5945, + "step": 3191 + }, + { + "epoch": 0.5674666666666667, + "grad_norm": 0.3388676937189465, + "learning_rate": 8.309531645104957e-05, + "loss": 0.5916, + "step": 3192 + }, + { + "epoch": 0.5676444444444444, + "grad_norm": 0.3794106802209468, + "learning_rate": 8.303856744366396e-05, + "loss": 0.6703, + "step": 3193 + }, + { + "epoch": 0.5678222222222222, + "grad_norm": 0.35055071449610203, + "learning_rate": 8.298182405986689e-05, + "loss": 0.5758, + "step": 3194 + }, + { + "epoch": 0.568, + "grad_norm": 0.3570849064302238, + "learning_rate": 8.29250863184718e-05, + "loss": 0.6086, + "step": 3195 + }, + { + "epoch": 0.5681777777777778, + "grad_norm": 0.3612083756158218, + "learning_rate": 8.28683542382901e-05, + "loss": 0.5962, + "step": 3196 + }, + { + "epoch": 0.5683555555555555, + "grad_norm": 0.3413380974468956, + "learning_rate": 8.281162783813142e-05, + "loss": 0.6194, + "step": 3197 + }, + { + "epoch": 0.5685333333333333, + "grad_norm": 0.3351282573369659, + "learning_rate": 8.275490713680349e-05, + "loss": 0.5922, + "step": 3198 + }, + { + "epoch": 0.5687111111111111, + "grad_norm": 0.32870786739907515, + "learning_rate": 8.269819215311215e-05, + "loss": 0.5346, + "step": 3199 + }, + { + "epoch": 0.5688888888888889, + "grad_norm": 0.42148297449214084, + "learning_rate": 8.264148290586128e-05, + "loss": 0.5908, + "step": 3200 + }, + { + "epoch": 0.5690666666666667, + "grad_norm": 0.3597229690057992, + "learning_rate": 8.2584779413853e-05, + "loss": 0.5803, + "step": 3201 + }, + { + "epoch": 0.5692444444444444, + "grad_norm": 0.3529049587456271, + "learning_rate": 8.252808169588737e-05, + "loss": 0.586, + "step": 3202 + }, + { + "epoch": 0.5694222222222223, + "grad_norm": 0.3390960192350595, + "learning_rate": 8.247138977076268e-05, + "loss": 0.5748, + "step": 3203 + }, + { + "epoch": 0.5696, + "grad_norm": 0.35366625829077525, + "learning_rate": 8.241470365727512e-05, + "loss": 0.5928, + "step": 3204 + }, + { + "epoch": 0.5697777777777778, + "grad_norm": 0.34860352320708177, + "learning_rate": 8.235802337421919e-05, + "loss": 0.5949, + "step": 3205 + }, + { + "epoch": 0.5699555555555555, + "grad_norm": 0.3760047648701897, + "learning_rate": 8.230134894038717e-05, + "loss": 0.6489, + "step": 3206 + }, + { + "epoch": 0.5701333333333334, + "grad_norm": 0.3497018581908783, + "learning_rate": 8.224468037456969e-05, + "loss": 0.5848, + "step": 3207 + }, + { + "epoch": 0.5703111111111111, + "grad_norm": 0.3383106023210154, + "learning_rate": 8.218801769555522e-05, + "loss": 0.5821, + "step": 3208 + }, + { + "epoch": 0.5704888888888889, + "grad_norm": 0.34557003005831516, + "learning_rate": 8.213136092213039e-05, + "loss": 0.5815, + "step": 3209 + }, + { + "epoch": 0.5706666666666667, + "grad_norm": 0.345902953229418, + "learning_rate": 8.20747100730798e-05, + "loss": 0.6137, + "step": 3210 + }, + { + "epoch": 0.5708444444444445, + "grad_norm": 0.36050005168293403, + "learning_rate": 8.20180651671862e-05, + "loss": 0.6114, + "step": 3211 + }, + { + "epoch": 0.5710222222222222, + "grad_norm": 0.3539116096470461, + "learning_rate": 8.196142622323018e-05, + "loss": 0.595, + "step": 3212 + }, + { + "epoch": 0.5712, + "grad_norm": 0.34764928464885, + "learning_rate": 8.190479325999059e-05, + "loss": 0.6394, + "step": 3213 + }, + { + "epoch": 0.5713777777777778, + "grad_norm": 0.3335957421014551, + "learning_rate": 8.184816629624406e-05, + "loss": 0.5828, + "step": 3214 + }, + { + "epoch": 0.5715555555555556, + "grad_norm": 0.33821529916234605, + "learning_rate": 8.179154535076546e-05, + "loss": 0.6084, + "step": 3215 + }, + { + "epoch": 0.5717333333333333, + "grad_norm": 0.3466593247785601, + "learning_rate": 8.173493044232745e-05, + "loss": 0.58, + "step": 3216 + }, + { + "epoch": 0.5719111111111111, + "grad_norm": 0.34376603334207395, + "learning_rate": 8.167832158970087e-05, + "loss": 0.5852, + "step": 3217 + }, + { + "epoch": 0.5720888888888889, + "grad_norm": 0.3410878199409463, + "learning_rate": 8.162171881165439e-05, + "loss": 0.5353, + "step": 3218 + }, + { + "epoch": 0.5722666666666667, + "grad_norm": 0.33581722023083194, + "learning_rate": 8.156512212695481e-05, + "loss": 0.5958, + "step": 3219 + }, + { + "epoch": 0.5724444444444444, + "grad_norm": 0.3876139758830211, + "learning_rate": 8.150853155436684e-05, + "loss": 0.6065, + "step": 3220 + }, + { + "epoch": 0.5726222222222223, + "grad_norm": 0.366702189552425, + "learning_rate": 8.145194711265313e-05, + "loss": 0.5829, + "step": 3221 + }, + { + "epoch": 0.5728, + "grad_norm": 0.3747886781175315, + "learning_rate": 8.139536882057437e-05, + "loss": 0.6315, + "step": 3222 + }, + { + "epoch": 0.5729777777777778, + "grad_norm": 0.35944879676503755, + "learning_rate": 8.133879669688919e-05, + "loss": 0.5808, + "step": 3223 + }, + { + "epoch": 0.5731555555555555, + "grad_norm": 0.3731052388063133, + "learning_rate": 8.128223076035409e-05, + "loss": 0.6269, + "step": 3224 + }, + { + "epoch": 0.5733333333333334, + "grad_norm": 0.37189423198335936, + "learning_rate": 8.12256710297237e-05, + "loss": 0.5975, + "step": 3225 + }, + { + "epoch": 0.5735111111111111, + "grad_norm": 0.3605741705374192, + "learning_rate": 8.116911752375037e-05, + "loss": 0.6006, + "step": 3226 + }, + { + "epoch": 0.5736888888888889, + "grad_norm": 0.3304544761838582, + "learning_rate": 8.11125702611846e-05, + "loss": 0.551, + "step": 3227 + }, + { + "epoch": 0.5738666666666666, + "grad_norm": 0.3558160543577562, + "learning_rate": 8.105602926077466e-05, + "loss": 0.5521, + "step": 3228 + }, + { + "epoch": 0.5740444444444445, + "grad_norm": 0.3253695049622035, + "learning_rate": 8.099949454126685e-05, + "loss": 0.5248, + "step": 3229 + }, + { + "epoch": 0.5742222222222222, + "grad_norm": 0.36688875291056017, + "learning_rate": 8.094296612140527e-05, + "loss": 0.5809, + "step": 3230 + }, + { + "epoch": 0.5744, + "grad_norm": 0.3625188073755832, + "learning_rate": 8.08864440199321e-05, + "loss": 0.6023, + "step": 3231 + }, + { + "epoch": 0.5745777777777777, + "grad_norm": 0.3865779612828931, + "learning_rate": 8.082992825558725e-05, + "loss": 0.6231, + "step": 3232 + }, + { + "epoch": 0.5747555555555556, + "grad_norm": 0.3466544763316007, + "learning_rate": 8.077341884710862e-05, + "loss": 0.6212, + "step": 3233 + }, + { + "epoch": 0.5749333333333333, + "grad_norm": 0.31976588144951157, + "learning_rate": 8.0716915813232e-05, + "loss": 0.556, + "step": 3234 + }, + { + "epoch": 0.5751111111111111, + "grad_norm": 0.42158126591798334, + "learning_rate": 8.06604191726911e-05, + "loss": 0.6117, + "step": 3235 + }, + { + "epoch": 0.5752888888888888, + "grad_norm": 0.34953454045791155, + "learning_rate": 8.060392894421738e-05, + "loss": 0.5842, + "step": 3236 + }, + { + "epoch": 0.5754666666666667, + "grad_norm": 0.34427122944642785, + "learning_rate": 8.054744514654033e-05, + "loss": 0.566, + "step": 3237 + }, + { + "epoch": 0.5756444444444444, + "grad_norm": 0.3572357181683326, + "learning_rate": 8.049096779838719e-05, + "loss": 0.5925, + "step": 3238 + }, + { + "epoch": 0.5758222222222222, + "grad_norm": 0.3567602357887107, + "learning_rate": 8.043449691848311e-05, + "loss": 0.5553, + "step": 3239 + }, + { + "epoch": 0.576, + "grad_norm": 0.3542586374741914, + "learning_rate": 8.037803252555119e-05, + "loss": 0.5725, + "step": 3240 + }, + { + "epoch": 0.5761777777777778, + "grad_norm": 0.3763144038913365, + "learning_rate": 8.032157463831216e-05, + "loss": 0.6047, + "step": 3241 + }, + { + "epoch": 0.5763555555555555, + "grad_norm": 0.34775200908543114, + "learning_rate": 8.026512327548481e-05, + "loss": 0.5933, + "step": 3242 + }, + { + "epoch": 0.5765333333333333, + "grad_norm": 0.3539013838006837, + "learning_rate": 8.020867845578561e-05, + "loss": 0.5878, + "step": 3243 + }, + { + "epoch": 0.5767111111111111, + "grad_norm": 0.3657847387277009, + "learning_rate": 8.015224019792897e-05, + "loss": 0.5533, + "step": 3244 + }, + { + "epoch": 0.5768888888888889, + "grad_norm": 0.3728235269669566, + "learning_rate": 8.009580852062705e-05, + "loss": 0.5967, + "step": 3245 + }, + { + "epoch": 0.5770666666666666, + "grad_norm": 0.3336254272730483, + "learning_rate": 8.003938344258989e-05, + "loss": 0.6004, + "step": 3246 + }, + { + "epoch": 0.5772444444444444, + "grad_norm": 0.3391704117672068, + "learning_rate": 7.998296498252525e-05, + "loss": 0.5806, + "step": 3247 + }, + { + "epoch": 0.5774222222222222, + "grad_norm": 0.35932536353209654, + "learning_rate": 7.992655315913884e-05, + "loss": 0.6297, + "step": 3248 + }, + { + "epoch": 0.5776, + "grad_norm": 0.37869935999021365, + "learning_rate": 7.987014799113397e-05, + "loss": 0.6298, + "step": 3249 + }, + { + "epoch": 0.5777777777777777, + "grad_norm": 0.3528996449807185, + "learning_rate": 7.9813749497212e-05, + "loss": 0.6214, + "step": 3250 + }, + { + "epoch": 0.5779555555555556, + "grad_norm": 0.4077919215628851, + "learning_rate": 7.975735769607182e-05, + "loss": 0.642, + "step": 3251 + }, + { + "epoch": 0.5781333333333334, + "grad_norm": 0.34700963153372466, + "learning_rate": 7.97009726064103e-05, + "loss": 0.644, + "step": 3252 + }, + { + "epoch": 0.5783111111111111, + "grad_norm": 0.3603098568269623, + "learning_rate": 7.964459424692192e-05, + "loss": 0.6134, + "step": 3253 + }, + { + "epoch": 0.5784888888888889, + "grad_norm": 0.3549269215650164, + "learning_rate": 7.95882226362991e-05, + "loss": 0.6504, + "step": 3254 + }, + { + "epoch": 0.5786666666666667, + "grad_norm": 0.3652849680227186, + "learning_rate": 7.953185779323184e-05, + "loss": 0.6137, + "step": 3255 + }, + { + "epoch": 0.5788444444444445, + "grad_norm": 0.3647760847861043, + "learning_rate": 7.947549973640805e-05, + "loss": 0.5687, + "step": 3256 + }, + { + "epoch": 0.5790222222222222, + "grad_norm": 0.42257931506560475, + "learning_rate": 7.941914848451332e-05, + "loss": 0.5655, + "step": 3257 + }, + { + "epoch": 0.5792, + "grad_norm": 0.36105815023804116, + "learning_rate": 7.9362804056231e-05, + "loss": 0.5665, + "step": 3258 + }, + { + "epoch": 0.5793777777777778, + "grad_norm": 0.393268462341893, + "learning_rate": 7.930646647024212e-05, + "loss": 0.5744, + "step": 3259 + }, + { + "epoch": 0.5795555555555556, + "grad_norm": 0.39559813499428703, + "learning_rate": 7.925013574522557e-05, + "loss": 0.5856, + "step": 3260 + }, + { + "epoch": 0.5797333333333333, + "grad_norm": 0.3836796112683418, + "learning_rate": 7.919381189985778e-05, + "loss": 0.6124, + "step": 3261 + }, + { + "epoch": 0.5799111111111112, + "grad_norm": 0.36247571443958754, + "learning_rate": 7.913749495281313e-05, + "loss": 0.5924, + "step": 3262 + }, + { + "epoch": 0.5800888888888889, + "grad_norm": 0.34618340316981183, + "learning_rate": 7.90811849227635e-05, + "loss": 0.6073, + "step": 3263 + }, + { + "epoch": 0.5802666666666667, + "grad_norm": 0.36381507288621134, + "learning_rate": 7.902488182837862e-05, + "loss": 0.6154, + "step": 3264 + }, + { + "epoch": 0.5804444444444444, + "grad_norm": 0.36460098681435715, + "learning_rate": 7.896858568832581e-05, + "loss": 0.6167, + "step": 3265 + }, + { + "epoch": 0.5806222222222223, + "grad_norm": 0.3610372779962909, + "learning_rate": 7.891229652127019e-05, + "loss": 0.5785, + "step": 3266 + }, + { + "epoch": 0.5808, + "grad_norm": 0.34722416568103387, + "learning_rate": 7.885601434587451e-05, + "loss": 0.5892, + "step": 3267 + }, + { + "epoch": 0.5809777777777778, + "grad_norm": 0.34556477946590836, + "learning_rate": 7.879973918079917e-05, + "loss": 0.5652, + "step": 3268 + }, + { + "epoch": 0.5811555555555555, + "grad_norm": 0.38701087414100327, + "learning_rate": 7.874347104470234e-05, + "loss": 0.6337, + "step": 3269 + }, + { + "epoch": 0.5813333333333334, + "grad_norm": 0.34713506420533025, + "learning_rate": 7.868720995623979e-05, + "loss": 0.6082, + "step": 3270 + }, + { + "epoch": 0.5815111111111111, + "grad_norm": 0.3446063806541194, + "learning_rate": 7.863095593406491e-05, + "loss": 0.5552, + "step": 3271 + }, + { + "epoch": 0.5816888888888889, + "grad_norm": 0.35607543569341626, + "learning_rate": 7.857470899682891e-05, + "loss": 0.625, + "step": 3272 + }, + { + "epoch": 0.5818666666666666, + "grad_norm": 0.3494569640832731, + "learning_rate": 7.851846916318046e-05, + "loss": 0.5939, + "step": 3273 + }, + { + "epoch": 0.5820444444444445, + "grad_norm": 0.3666921985855683, + "learning_rate": 7.846223645176601e-05, + "loss": 0.5892, + "step": 3274 + }, + { + "epoch": 0.5822222222222222, + "grad_norm": 0.3815520200790617, + "learning_rate": 7.840601088122956e-05, + "loss": 0.6154, + "step": 3275 + }, + { + "epoch": 0.5824, + "grad_norm": 0.3736537771602796, + "learning_rate": 7.834979247021284e-05, + "loss": 0.6268, + "step": 3276 + }, + { + "epoch": 0.5825777777777777, + "grad_norm": 0.35001740396600817, + "learning_rate": 7.829358123735508e-05, + "loss": 0.5796, + "step": 3277 + }, + { + "epoch": 0.5827555555555556, + "grad_norm": 0.356787922303948, + "learning_rate": 7.823737720129329e-05, + "loss": 0.6633, + "step": 3278 + }, + { + "epoch": 0.5829333333333333, + "grad_norm": 0.35954809184534875, + "learning_rate": 7.818118038066192e-05, + "loss": 0.616, + "step": 3279 + }, + { + "epoch": 0.5831111111111111, + "grad_norm": 0.37199503467315925, + "learning_rate": 7.812499079409315e-05, + "loss": 0.6127, + "step": 3280 + }, + { + "epoch": 0.5832888888888889, + "grad_norm": 0.3594395678178431, + "learning_rate": 7.806880846021669e-05, + "loss": 0.5995, + "step": 3281 + }, + { + "epoch": 0.5834666666666667, + "grad_norm": 0.3308563876376353, + "learning_rate": 7.801263339765994e-05, + "loss": 0.5668, + "step": 3282 + }, + { + "epoch": 0.5836444444444444, + "grad_norm": 0.33622357801190716, + "learning_rate": 7.795646562504773e-05, + "loss": 0.6409, + "step": 3283 + }, + { + "epoch": 0.5838222222222222, + "grad_norm": 0.3682693074229111, + "learning_rate": 7.790030516100266e-05, + "loss": 0.6359, + "step": 3284 + }, + { + "epoch": 0.584, + "grad_norm": 0.3531784526940973, + "learning_rate": 7.784415202414477e-05, + "loss": 0.5617, + "step": 3285 + }, + { + "epoch": 0.5841777777777778, + "grad_norm": 0.37353671756996487, + "learning_rate": 7.778800623309174e-05, + "loss": 0.5809, + "step": 3286 + }, + { + "epoch": 0.5843555555555555, + "grad_norm": 0.35310441129337927, + "learning_rate": 7.773186780645876e-05, + "loss": 0.5906, + "step": 3287 + }, + { + "epoch": 0.5845333333333333, + "grad_norm": 0.3537066659654453, + "learning_rate": 7.767573676285868e-05, + "loss": 0.6029, + "step": 3288 + }, + { + "epoch": 0.5847111111111111, + "grad_norm": 0.3438077129509418, + "learning_rate": 7.761961312090174e-05, + "loss": 0.5566, + "step": 3289 + }, + { + "epoch": 0.5848888888888889, + "grad_norm": 0.33716452222769244, + "learning_rate": 7.756349689919589e-05, + "loss": 0.5744, + "step": 3290 + }, + { + "epoch": 0.5850666666666666, + "grad_norm": 0.3621282096715922, + "learning_rate": 7.75073881163465e-05, + "loss": 0.6225, + "step": 3291 + }, + { + "epoch": 0.5852444444444445, + "grad_norm": 0.36523940835678276, + "learning_rate": 7.745128679095656e-05, + "loss": 0.5389, + "step": 3292 + }, + { + "epoch": 0.5854222222222222, + "grad_norm": 0.33802368799807025, + "learning_rate": 7.739519294162652e-05, + "loss": 0.566, + "step": 3293 + }, + { + "epoch": 0.5856, + "grad_norm": 0.3607540558689247, + "learning_rate": 7.733910658695442e-05, + "loss": 0.6113, + "step": 3294 + }, + { + "epoch": 0.5857777777777777, + "grad_norm": 0.35774280066206815, + "learning_rate": 7.72830277455357e-05, + "loss": 0.5872, + "step": 3295 + }, + { + "epoch": 0.5859555555555556, + "grad_norm": 0.4344251113159798, + "learning_rate": 7.722695643596348e-05, + "loss": 0.6024, + "step": 3296 + }, + { + "epoch": 0.5861333333333333, + "grad_norm": 0.3536585333168312, + "learning_rate": 7.717089267682818e-05, + "loss": 0.6021, + "step": 3297 + }, + { + "epoch": 0.5863111111111111, + "grad_norm": 0.36773621638124965, + "learning_rate": 7.711483648671794e-05, + "loss": 0.6076, + "step": 3298 + }, + { + "epoch": 0.5864888888888888, + "grad_norm": 0.34752823390039894, + "learning_rate": 7.705878788421816e-05, + "loss": 0.6069, + "step": 3299 + }, + { + "epoch": 0.5866666666666667, + "grad_norm": 0.3513660436422426, + "learning_rate": 7.700274688791196e-05, + "loss": 0.6126, + "step": 3300 + }, + { + "epoch": 0.5868444444444444, + "grad_norm": 0.3530919828672039, + "learning_rate": 7.694671351637969e-05, + "loss": 0.5858, + "step": 3301 + }, + { + "epoch": 0.5870222222222222, + "grad_norm": 0.3452743729346822, + "learning_rate": 7.689068778819943e-05, + "loss": 0.6013, + "step": 3302 + }, + { + "epoch": 0.5872, + "grad_norm": 0.37701578988872997, + "learning_rate": 7.68346697219465e-05, + "loss": 0.6294, + "step": 3303 + }, + { + "epoch": 0.5873777777777778, + "grad_norm": 0.3496541077104658, + "learning_rate": 7.677865933619379e-05, + "loss": 0.5992, + "step": 3304 + }, + { + "epoch": 0.5875555555555556, + "grad_norm": 0.3866205948894247, + "learning_rate": 7.672265664951165e-05, + "loss": 0.6269, + "step": 3305 + }, + { + "epoch": 0.5877333333333333, + "grad_norm": 0.3436657532357334, + "learning_rate": 7.666666168046785e-05, + "loss": 0.5741, + "step": 3306 + }, + { + "epoch": 0.5879111111111112, + "grad_norm": 0.3449342997300359, + "learning_rate": 7.661067444762759e-05, + "loss": 0.5561, + "step": 3307 + }, + { + "epoch": 0.5880888888888889, + "grad_norm": 0.38107807862075255, + "learning_rate": 7.655469496955354e-05, + "loss": 0.578, + "step": 3308 + }, + { + "epoch": 0.5882666666666667, + "grad_norm": 0.33779229148129897, + "learning_rate": 7.649872326480577e-05, + "loss": 0.5831, + "step": 3309 + }, + { + "epoch": 0.5884444444444444, + "grad_norm": 0.36559427761031954, + "learning_rate": 7.64427593519418e-05, + "loss": 0.6128, + "step": 3310 + }, + { + "epoch": 0.5886222222222223, + "grad_norm": 0.3670247497724943, + "learning_rate": 7.638680324951649e-05, + "loss": 0.6287, + "step": 3311 + }, + { + "epoch": 0.5888, + "grad_norm": 0.4552853524246501, + "learning_rate": 7.633085497608228e-05, + "loss": 0.5928, + "step": 3312 + }, + { + "epoch": 0.5889777777777778, + "grad_norm": 0.3513254046770352, + "learning_rate": 7.627491455018878e-05, + "loss": 0.5708, + "step": 3313 + }, + { + "epoch": 0.5891555555555555, + "grad_norm": 0.3678438236773794, + "learning_rate": 7.621898199038324e-05, + "loss": 0.5605, + "step": 3314 + }, + { + "epoch": 0.5893333333333334, + "grad_norm": 0.3619970818284411, + "learning_rate": 7.616305731521008e-05, + "loss": 0.6163, + "step": 3315 + }, + { + "epoch": 0.5895111111111111, + "grad_norm": 0.3565846036072688, + "learning_rate": 7.610714054321131e-05, + "loss": 0.5915, + "step": 3316 + }, + { + "epoch": 0.5896888888888889, + "grad_norm": 0.34663205177532047, + "learning_rate": 7.605123169292614e-05, + "loss": 0.6138, + "step": 3317 + }, + { + "epoch": 0.5898666666666667, + "grad_norm": 0.33375501029289745, + "learning_rate": 7.599533078289129e-05, + "loss": 0.552, + "step": 3318 + }, + { + "epoch": 0.5900444444444445, + "grad_norm": 0.375172816981703, + "learning_rate": 7.593943783164073e-05, + "loss": 0.6553, + "step": 3319 + }, + { + "epoch": 0.5902222222222222, + "grad_norm": 0.3511562235624918, + "learning_rate": 7.588355285770591e-05, + "loss": 0.565, + "step": 3320 + }, + { + "epoch": 0.5904, + "grad_norm": 0.3432792388815817, + "learning_rate": 7.582767587961552e-05, + "loss": 0.5383, + "step": 3321 + }, + { + "epoch": 0.5905777777777778, + "grad_norm": 0.3398380046959315, + "learning_rate": 7.577180691589573e-05, + "loss": 0.5864, + "step": 3322 + }, + { + "epoch": 0.5907555555555556, + "grad_norm": 0.376956460317968, + "learning_rate": 7.57159459850699e-05, + "loss": 0.6085, + "step": 3323 + }, + { + "epoch": 0.5909333333333333, + "grad_norm": 0.397766913031596, + "learning_rate": 7.566009310565889e-05, + "loss": 0.6315, + "step": 3324 + }, + { + "epoch": 0.5911111111111111, + "grad_norm": 0.34255839182088776, + "learning_rate": 7.560424829618072e-05, + "loss": 0.5868, + "step": 3325 + }, + { + "epoch": 0.5912888888888889, + "grad_norm": 0.36139566313021204, + "learning_rate": 7.554841157515092e-05, + "loss": 0.6043, + "step": 3326 + }, + { + "epoch": 0.5914666666666667, + "grad_norm": 0.3440667395113186, + "learning_rate": 7.549258296108212e-05, + "loss": 0.5527, + "step": 3327 + }, + { + "epoch": 0.5916444444444444, + "grad_norm": 0.3519612045216992, + "learning_rate": 7.543676247248448e-05, + "loss": 0.6418, + "step": 3328 + }, + { + "epoch": 0.5918222222222222, + "grad_norm": 0.3529309503919504, + "learning_rate": 7.538095012786534e-05, + "loss": 0.6018, + "step": 3329 + }, + { + "epoch": 0.592, + "grad_norm": 0.360342477817817, + "learning_rate": 7.532514594572934e-05, + "loss": 0.5708, + "step": 3330 + }, + { + "epoch": 0.5921777777777778, + "grad_norm": 0.33843209912459327, + "learning_rate": 7.526934994457844e-05, + "loss": 0.5735, + "step": 3331 + }, + { + "epoch": 0.5923555555555555, + "grad_norm": 0.35346176512951677, + "learning_rate": 7.521356214291196e-05, + "loss": 0.5558, + "step": 3332 + }, + { + "epoch": 0.5925333333333334, + "grad_norm": 0.3509996723425611, + "learning_rate": 7.515778255922632e-05, + "loss": 0.5593, + "step": 3333 + }, + { + "epoch": 0.5927111111111111, + "grad_norm": 0.35564686432479586, + "learning_rate": 7.510201121201543e-05, + "loss": 0.6214, + "step": 3334 + }, + { + "epoch": 0.5928888888888889, + "grad_norm": 0.37156628011523685, + "learning_rate": 7.504624811977028e-05, + "loss": 0.6155, + "step": 3335 + }, + { + "epoch": 0.5930666666666666, + "grad_norm": 0.363834113087486, + "learning_rate": 7.499049330097927e-05, + "loss": 0.5861, + "step": 3336 + }, + { + "epoch": 0.5932444444444445, + "grad_norm": 0.3514957916817266, + "learning_rate": 7.493474677412794e-05, + "loss": 0.634, + "step": 3337 + }, + { + "epoch": 0.5934222222222222, + "grad_norm": 0.3558700634123441, + "learning_rate": 7.48790085576992e-05, + "loss": 0.5639, + "step": 3338 + }, + { + "epoch": 0.5936, + "grad_norm": 0.37334624206829015, + "learning_rate": 7.482327867017306e-05, + "loss": 0.5887, + "step": 3339 + }, + { + "epoch": 0.5937777777777777, + "grad_norm": 0.35003336764712384, + "learning_rate": 7.476755713002694e-05, + "loss": 0.6204, + "step": 3340 + }, + { + "epoch": 0.5939555555555556, + "grad_norm": 0.378493575947033, + "learning_rate": 7.471184395573534e-05, + "loss": 0.6048, + "step": 3341 + }, + { + "epoch": 0.5941333333333333, + "grad_norm": 0.3567038497714876, + "learning_rate": 7.465613916577004e-05, + "loss": 0.6278, + "step": 3342 + }, + { + "epoch": 0.5943111111111111, + "grad_norm": 0.3650714277703017, + "learning_rate": 7.460044277860008e-05, + "loss": 0.5862, + "step": 3343 + }, + { + "epoch": 0.5944888888888888, + "grad_norm": 0.34832197723751246, + "learning_rate": 7.454475481269168e-05, + "loss": 0.5738, + "step": 3344 + }, + { + "epoch": 0.5946666666666667, + "grad_norm": 0.360535790624661, + "learning_rate": 7.448907528650823e-05, + "loss": 0.6069, + "step": 3345 + }, + { + "epoch": 0.5948444444444444, + "grad_norm": 0.36761078093416527, + "learning_rate": 7.443340421851041e-05, + "loss": 0.6151, + "step": 3346 + }, + { + "epoch": 0.5950222222222222, + "grad_norm": 0.3637153394302031, + "learning_rate": 7.4377741627156e-05, + "loss": 0.6238, + "step": 3347 + }, + { + "epoch": 0.5952, + "grad_norm": 0.35466599643827534, + "learning_rate": 7.432208753090009e-05, + "loss": 0.5968, + "step": 3348 + }, + { + "epoch": 0.5953777777777778, + "grad_norm": 0.38957721706421305, + "learning_rate": 7.426644194819477e-05, + "loss": 0.6335, + "step": 3349 + }, + { + "epoch": 0.5955555555555555, + "grad_norm": 0.3585581377397012, + "learning_rate": 7.421080489748953e-05, + "loss": 0.62, + "step": 3350 + }, + { + "epoch": 0.5957333333333333, + "grad_norm": 0.36279571099801644, + "learning_rate": 7.415517639723082e-05, + "loss": 0.5589, + "step": 3351 + }, + { + "epoch": 0.5959111111111111, + "grad_norm": 0.32450819513265844, + "learning_rate": 7.409955646586244e-05, + "loss": 0.5549, + "step": 3352 + }, + { + "epoch": 0.5960888888888889, + "grad_norm": 0.3281875091609394, + "learning_rate": 7.40439451218252e-05, + "loss": 0.5703, + "step": 3353 + }, + { + "epoch": 0.5962666666666666, + "grad_norm": 0.36882856133349584, + "learning_rate": 7.398834238355716e-05, + "loss": 0.5806, + "step": 3354 + }, + { + "epoch": 0.5964444444444444, + "grad_norm": 0.377243790484592, + "learning_rate": 7.393274826949346e-05, + "loss": 0.5735, + "step": 3355 + }, + { + "epoch": 0.5966222222222223, + "grad_norm": 0.38619203339052627, + "learning_rate": 7.387716279806647e-05, + "loss": 0.6197, + "step": 3356 + }, + { + "epoch": 0.5968, + "grad_norm": 0.37337661605182787, + "learning_rate": 7.382158598770554e-05, + "loss": 0.5954, + "step": 3357 + }, + { + "epoch": 0.5969777777777778, + "grad_norm": 0.3492734538090118, + "learning_rate": 7.376601785683736e-05, + "loss": 0.5658, + "step": 3358 + }, + { + "epoch": 0.5971555555555556, + "grad_norm": 0.34740816457405305, + "learning_rate": 7.371045842388552e-05, + "loss": 0.5997, + "step": 3359 + }, + { + "epoch": 0.5973333333333334, + "grad_norm": 0.3574778027933568, + "learning_rate": 7.365490770727093e-05, + "loss": 0.589, + "step": 3360 + }, + { + "epoch": 0.5975111111111111, + "grad_norm": 0.35794504027300283, + "learning_rate": 7.359936572541142e-05, + "loss": 0.6222, + "step": 3361 + }, + { + "epoch": 0.5976888888888889, + "grad_norm": 0.3410778406720154, + "learning_rate": 7.354383249672212e-05, + "loss": 0.579, + "step": 3362 + }, + { + "epoch": 0.5978666666666667, + "grad_norm": 0.3475162117944813, + "learning_rate": 7.348830803961507e-05, + "loss": 0.5828, + "step": 3363 + }, + { + "epoch": 0.5980444444444445, + "grad_norm": 0.38229664498826715, + "learning_rate": 7.343279237249953e-05, + "loss": 0.5836, + "step": 3364 + }, + { + "epoch": 0.5982222222222222, + "grad_norm": 0.36707632210616165, + "learning_rate": 7.337728551378179e-05, + "loss": 0.5948, + "step": 3365 + }, + { + "epoch": 0.5984, + "grad_norm": 0.3474817079331617, + "learning_rate": 7.332178748186525e-05, + "loss": 0.5889, + "step": 3366 + }, + { + "epoch": 0.5985777777777778, + "grad_norm": 0.3713238049763101, + "learning_rate": 7.326629829515033e-05, + "loss": 0.6178, + "step": 3367 + }, + { + "epoch": 0.5987555555555556, + "grad_norm": 0.36966344233847986, + "learning_rate": 7.32108179720346e-05, + "loss": 0.6202, + "step": 3368 + }, + { + "epoch": 0.5989333333333333, + "grad_norm": 0.37475955569802427, + "learning_rate": 7.31553465309126e-05, + "loss": 0.6087, + "step": 3369 + }, + { + "epoch": 0.5991111111111111, + "grad_norm": 0.3592736417732489, + "learning_rate": 7.309988399017602e-05, + "loss": 0.6101, + "step": 3370 + }, + { + "epoch": 0.5992888888888889, + "grad_norm": 0.34115307886889823, + "learning_rate": 7.304443036821347e-05, + "loss": 0.5948, + "step": 3371 + }, + { + "epoch": 0.5994666666666667, + "grad_norm": 0.3586940032510138, + "learning_rate": 7.298898568341079e-05, + "loss": 0.5915, + "step": 3372 + }, + { + "epoch": 0.5996444444444444, + "grad_norm": 0.36482075692038446, + "learning_rate": 7.293354995415063e-05, + "loss": 0.5824, + "step": 3373 + }, + { + "epoch": 0.5998222222222223, + "grad_norm": 0.3511386745325496, + "learning_rate": 7.28781231988129e-05, + "loss": 0.5581, + "step": 3374 + }, + { + "epoch": 0.6, + "grad_norm": 0.35392495004983016, + "learning_rate": 7.282270543577436e-05, + "loss": 0.586, + "step": 3375 + }, + { + "epoch": 0.6001777777777778, + "grad_norm": 0.3375085603248464, + "learning_rate": 7.276729668340888e-05, + "loss": 0.5754, + "step": 3376 + }, + { + "epoch": 0.6003555555555555, + "grad_norm": 0.3231489059358228, + "learning_rate": 7.271189696008729e-05, + "loss": 0.5839, + "step": 3377 + }, + { + "epoch": 0.6005333333333334, + "grad_norm": 0.3395463107124685, + "learning_rate": 7.265650628417747e-05, + "loss": 0.6095, + "step": 3378 + }, + { + "epoch": 0.6007111111111111, + "grad_norm": 0.3487272701191673, + "learning_rate": 7.260112467404427e-05, + "loss": 0.6056, + "step": 3379 + }, + { + "epoch": 0.6008888888888889, + "grad_norm": 0.3563527272856261, + "learning_rate": 7.254575214804959e-05, + "loss": 0.5952, + "step": 3380 + }, + { + "epoch": 0.6010666666666666, + "grad_norm": 0.3354288791679355, + "learning_rate": 7.24903887245522e-05, + "loss": 0.586, + "step": 3381 + }, + { + "epoch": 0.6012444444444445, + "grad_norm": 0.3591236916224874, + "learning_rate": 7.2435034421908e-05, + "loss": 0.6042, + "step": 3382 + }, + { + "epoch": 0.6014222222222222, + "grad_norm": 0.3480128202901435, + "learning_rate": 7.237968925846971e-05, + "loss": 0.6178, + "step": 3383 + }, + { + "epoch": 0.6016, + "grad_norm": 0.3582522058815753, + "learning_rate": 7.23243532525872e-05, + "loss": 0.6054, + "step": 3384 + }, + { + "epoch": 0.6017777777777777, + "grad_norm": 0.34547437305602535, + "learning_rate": 7.226902642260711e-05, + "loss": 0.583, + "step": 3385 + }, + { + "epoch": 0.6019555555555556, + "grad_norm": 0.3728007066025142, + "learning_rate": 7.221370878687324e-05, + "loss": 0.6072, + "step": 3386 + }, + { + "epoch": 0.6021333333333333, + "grad_norm": 0.36040920892315237, + "learning_rate": 7.215840036372611e-05, + "loss": 0.5431, + "step": 3387 + }, + { + "epoch": 0.6023111111111111, + "grad_norm": 0.3702255694427694, + "learning_rate": 7.210310117150342e-05, + "loss": 0.6339, + "step": 3388 + }, + { + "epoch": 0.6024888888888889, + "grad_norm": 0.3365385578466156, + "learning_rate": 7.204781122853966e-05, + "loss": 0.6097, + "step": 3389 + }, + { + "epoch": 0.6026666666666667, + "grad_norm": 0.35864118900041453, + "learning_rate": 7.199253055316629e-05, + "loss": 0.6183, + "step": 3390 + }, + { + "epoch": 0.6028444444444444, + "grad_norm": 0.33154923908210954, + "learning_rate": 7.19372591637117e-05, + "loss": 0.549, + "step": 3391 + }, + { + "epoch": 0.6030222222222222, + "grad_norm": 0.3650137900719657, + "learning_rate": 7.188199707850122e-05, + "loss": 0.6081, + "step": 3392 + }, + { + "epoch": 0.6032, + "grad_norm": 0.34384244014334475, + "learning_rate": 7.182674431585704e-05, + "loss": 0.5718, + "step": 3393 + }, + { + "epoch": 0.6033777777777778, + "grad_norm": 0.36559871282173134, + "learning_rate": 7.177150089409835e-05, + "loss": 0.5855, + "step": 3394 + }, + { + "epoch": 0.6035555555555555, + "grad_norm": 0.3619185449952785, + "learning_rate": 7.171626683154112e-05, + "loss": 0.6064, + "step": 3395 + }, + { + "epoch": 0.6037333333333333, + "grad_norm": 0.3562468123241234, + "learning_rate": 7.166104214649839e-05, + "loss": 0.6167, + "step": 3396 + }, + { + "epoch": 0.6039111111111111, + "grad_norm": 0.34835039127977774, + "learning_rate": 7.160582685727986e-05, + "loss": 0.5333, + "step": 3397 + }, + { + "epoch": 0.6040888888888889, + "grad_norm": 0.35517134904340303, + "learning_rate": 7.155062098219235e-05, + "loss": 0.5819, + "step": 3398 + }, + { + "epoch": 0.6042666666666666, + "grad_norm": 0.36803369471251707, + "learning_rate": 7.149542453953938e-05, + "loss": 0.6475, + "step": 3399 + }, + { + "epoch": 0.6044444444444445, + "grad_norm": 0.34423511465744333, + "learning_rate": 7.144023754762149e-05, + "loss": 0.6139, + "step": 3400 + }, + { + "epoch": 0.6046222222222222, + "grad_norm": 0.3639381293621342, + "learning_rate": 7.138506002473591e-05, + "loss": 0.5888, + "step": 3401 + }, + { + "epoch": 0.6048, + "grad_norm": 0.3594084399758162, + "learning_rate": 7.132989198917692e-05, + "loss": 0.5633, + "step": 3402 + }, + { + "epoch": 0.6049777777777777, + "grad_norm": 0.35620243783022976, + "learning_rate": 7.127473345923554e-05, + "loss": 0.6213, + "step": 3403 + }, + { + "epoch": 0.6051555555555556, + "grad_norm": 0.37797755027524865, + "learning_rate": 7.121958445319965e-05, + "loss": 0.6171, + "step": 3404 + }, + { + "epoch": 0.6053333333333333, + "grad_norm": 0.3469824290242751, + "learning_rate": 7.116444498935396e-05, + "loss": 0.5921, + "step": 3405 + }, + { + "epoch": 0.6055111111111111, + "grad_norm": 0.37160240787966137, + "learning_rate": 7.110931508598011e-05, + "loss": 0.6094, + "step": 3406 + }, + { + "epoch": 0.6056888888888889, + "grad_norm": 0.3587097698412013, + "learning_rate": 7.105419476135643e-05, + "loss": 0.6159, + "step": 3407 + }, + { + "epoch": 0.6058666666666667, + "grad_norm": 0.33870367471615753, + "learning_rate": 7.099908403375823e-05, + "loss": 0.5809, + "step": 3408 + }, + { + "epoch": 0.6060444444444445, + "grad_norm": 0.3509255640429414, + "learning_rate": 7.094398292145746e-05, + "loss": 0.5664, + "step": 3409 + }, + { + "epoch": 0.6062222222222222, + "grad_norm": 0.35089660964993785, + "learning_rate": 7.088889144272305e-05, + "loss": 0.605, + "step": 3410 + }, + { + "epoch": 0.6064, + "grad_norm": 0.3606835979508538, + "learning_rate": 7.083380961582062e-05, + "loss": 0.5673, + "step": 3411 + }, + { + "epoch": 0.6065777777777778, + "grad_norm": 0.35417333026626147, + "learning_rate": 7.077873745901269e-05, + "loss": 0.5927, + "step": 3412 + }, + { + "epoch": 0.6067555555555556, + "grad_norm": 0.3273834645633267, + "learning_rate": 7.072367499055844e-05, + "loss": 0.5911, + "step": 3413 + }, + { + "epoch": 0.6069333333333333, + "grad_norm": 0.35203990821899905, + "learning_rate": 7.066862222871397e-05, + "loss": 0.59, + "step": 3414 + }, + { + "epoch": 0.6071111111111112, + "grad_norm": 0.3301638394687133, + "learning_rate": 7.061357919173209e-05, + "loss": 0.5655, + "step": 3415 + }, + { + "epoch": 0.6072888888888889, + "grad_norm": 0.3650828916902255, + "learning_rate": 7.055854589786241e-05, + "loss": 0.5676, + "step": 3416 + }, + { + "epoch": 0.6074666666666667, + "grad_norm": 0.33755707923075684, + "learning_rate": 7.050352236535125e-05, + "loss": 0.5728, + "step": 3417 + }, + { + "epoch": 0.6076444444444444, + "grad_norm": 0.3496975276716916, + "learning_rate": 7.044850861244184e-05, + "loss": 0.5374, + "step": 3418 + }, + { + "epoch": 0.6078222222222223, + "grad_norm": 0.38579749447468065, + "learning_rate": 7.039350465737396e-05, + "loss": 0.5871, + "step": 3419 + }, + { + "epoch": 0.608, + "grad_norm": 0.3706742400380307, + "learning_rate": 7.033851051838437e-05, + "loss": 0.5855, + "step": 3420 + }, + { + "epoch": 0.6081777777777778, + "grad_norm": 0.3762731585109549, + "learning_rate": 7.028352621370635e-05, + "loss": 0.6535, + "step": 3421 + }, + { + "epoch": 0.6083555555555555, + "grad_norm": 0.40364872923069495, + "learning_rate": 7.022855176157016e-05, + "loss": 0.5838, + "step": 3422 + }, + { + "epoch": 0.6085333333333334, + "grad_norm": 0.3446107212341191, + "learning_rate": 7.01735871802025e-05, + "loss": 0.5465, + "step": 3423 + }, + { + "epoch": 0.6087111111111111, + "grad_norm": 0.3630382413718293, + "learning_rate": 7.011863248782711e-05, + "loss": 0.5693, + "step": 3424 + }, + { + "epoch": 0.6088888888888889, + "grad_norm": 0.3392069360937096, + "learning_rate": 7.006368770266421e-05, + "loss": 0.5707, + "step": 3425 + }, + { + "epoch": 0.6090666666666666, + "grad_norm": 0.34262698514308126, + "learning_rate": 7.000875284293087e-05, + "loss": 0.5448, + "step": 3426 + }, + { + "epoch": 0.6092444444444445, + "grad_norm": 0.3753001426240263, + "learning_rate": 6.99538279268408e-05, + "loss": 0.5916, + "step": 3427 + }, + { + "epoch": 0.6094222222222222, + "grad_norm": 0.3549000141752506, + "learning_rate": 6.989891297260445e-05, + "loss": 0.6293, + "step": 3428 + }, + { + "epoch": 0.6096, + "grad_norm": 0.36388223808887243, + "learning_rate": 6.984400799842894e-05, + "loss": 0.6224, + "step": 3429 + }, + { + "epoch": 0.6097777777777778, + "grad_norm": 0.35668869119572716, + "learning_rate": 6.978911302251816e-05, + "loss": 0.561, + "step": 3430 + }, + { + "epoch": 0.6099555555555556, + "grad_norm": 0.45875194448495316, + "learning_rate": 6.97342280630725e-05, + "loss": 0.6046, + "step": 3431 + }, + { + "epoch": 0.6101333333333333, + "grad_norm": 0.3817313111470102, + "learning_rate": 6.967935313828929e-05, + "loss": 0.5896, + "step": 3432 + }, + { + "epoch": 0.6103111111111111, + "grad_norm": 0.36625653110145323, + "learning_rate": 6.962448826636227e-05, + "loss": 0.6292, + "step": 3433 + }, + { + "epoch": 0.6104888888888889, + "grad_norm": 0.33666849321718983, + "learning_rate": 6.95696334654821e-05, + "loss": 0.5515, + "step": 3434 + }, + { + "epoch": 0.6106666666666667, + "grad_norm": 0.3716990902826086, + "learning_rate": 6.951478875383583e-05, + "loss": 0.6169, + "step": 3435 + }, + { + "epoch": 0.6108444444444444, + "grad_norm": 0.3612786294761165, + "learning_rate": 6.945995414960744e-05, + "loss": 0.5923, + "step": 3436 + }, + { + "epoch": 0.6110222222222222, + "grad_norm": 0.37804702404560686, + "learning_rate": 6.940512967097732e-05, + "loss": 0.6244, + "step": 3437 + }, + { + "epoch": 0.6112, + "grad_norm": 0.3409226264109023, + "learning_rate": 6.93503153361227e-05, + "loss": 0.5481, + "step": 3438 + }, + { + "epoch": 0.6113777777777778, + "grad_norm": 0.37905874923852434, + "learning_rate": 6.929551116321728e-05, + "loss": 0.6298, + "step": 3439 + }, + { + "epoch": 0.6115555555555555, + "grad_norm": 0.3519272670285671, + "learning_rate": 6.92407171704315e-05, + "loss": 0.6168, + "step": 3440 + }, + { + "epoch": 0.6117333333333334, + "grad_norm": 0.3477323318060181, + "learning_rate": 6.918593337593238e-05, + "loss": 0.5963, + "step": 3441 + }, + { + "epoch": 0.6119111111111111, + "grad_norm": 0.36818449007290877, + "learning_rate": 6.913115979788361e-05, + "loss": 0.6239, + "step": 3442 + }, + { + "epoch": 0.6120888888888889, + "grad_norm": 0.38883583130154914, + "learning_rate": 6.907639645444536e-05, + "loss": 0.5935, + "step": 3443 + }, + { + "epoch": 0.6122666666666666, + "grad_norm": 0.3422123624192518, + "learning_rate": 6.902164336377461e-05, + "loss": 0.563, + "step": 3444 + }, + { + "epoch": 0.6124444444444445, + "grad_norm": 0.3601061351890906, + "learning_rate": 6.896690054402473e-05, + "loss": 0.5782, + "step": 3445 + }, + { + "epoch": 0.6126222222222222, + "grad_norm": 0.3441416166286006, + "learning_rate": 6.891216801334588e-05, + "loss": 0.5841, + "step": 3446 + }, + { + "epoch": 0.6128, + "grad_norm": 0.3975417759059079, + "learning_rate": 6.885744578988463e-05, + "loss": 0.6506, + "step": 3447 + }, + { + "epoch": 0.6129777777777777, + "grad_norm": 0.341448084704136, + "learning_rate": 6.88027338917843e-05, + "loss": 0.5624, + "step": 3448 + }, + { + "epoch": 0.6131555555555556, + "grad_norm": 0.4099521071865417, + "learning_rate": 6.874803233718459e-05, + "loss": 0.585, + "step": 3449 + }, + { + "epoch": 0.6133333333333333, + "grad_norm": 0.37124650499283024, + "learning_rate": 6.869334114422199e-05, + "loss": 0.6219, + "step": 3450 + }, + { + "epoch": 0.6135111111111111, + "grad_norm": 0.36201958031038717, + "learning_rate": 6.863866033102939e-05, + "loss": 0.5911, + "step": 3451 + }, + { + "epoch": 0.6136888888888888, + "grad_norm": 0.3449753701435678, + "learning_rate": 6.858398991573631e-05, + "loss": 0.5899, + "step": 3452 + }, + { + "epoch": 0.6138666666666667, + "grad_norm": 0.34640689141990855, + "learning_rate": 6.852932991646881e-05, + "loss": 0.5842, + "step": 3453 + }, + { + "epoch": 0.6140444444444444, + "grad_norm": 0.34622564944822953, + "learning_rate": 6.847468035134951e-05, + "loss": 0.5956, + "step": 3454 + }, + { + "epoch": 0.6142222222222222, + "grad_norm": 0.3634386258832327, + "learning_rate": 6.842004123849752e-05, + "loss": 0.621, + "step": 3455 + }, + { + "epoch": 0.6144, + "grad_norm": 0.34729536366957653, + "learning_rate": 6.836541259602856e-05, + "loss": 0.5592, + "step": 3456 + }, + { + "epoch": 0.6145777777777778, + "grad_norm": 0.3569812522628856, + "learning_rate": 6.83107944420548e-05, + "loss": 0.614, + "step": 3457 + }, + { + "epoch": 0.6147555555555556, + "grad_norm": 0.33679433365634, + "learning_rate": 6.825618679468502e-05, + "loss": 0.5676, + "step": 3458 + }, + { + "epoch": 0.6149333333333333, + "grad_norm": 0.33882201084539326, + "learning_rate": 6.820158967202439e-05, + "loss": 0.5212, + "step": 3459 + }, + { + "epoch": 0.6151111111111112, + "grad_norm": 0.35537078550186557, + "learning_rate": 6.814700309217476e-05, + "loss": 0.5638, + "step": 3460 + }, + { + "epoch": 0.6152888888888889, + "grad_norm": 0.352661724425596, + "learning_rate": 6.809242707323432e-05, + "loss": 0.5938, + "step": 3461 + }, + { + "epoch": 0.6154666666666667, + "grad_norm": 0.36952325119547186, + "learning_rate": 6.80378616332979e-05, + "loss": 0.5615, + "step": 3462 + }, + { + "epoch": 0.6156444444444444, + "grad_norm": 0.38565201108993036, + "learning_rate": 6.79833067904567e-05, + "loss": 0.5787, + "step": 3463 + }, + { + "epoch": 0.6158222222222223, + "grad_norm": 0.3541276041447201, + "learning_rate": 6.792876256279846e-05, + "loss": 0.5896, + "step": 3464 + }, + { + "epoch": 0.616, + "grad_norm": 0.39347866147464766, + "learning_rate": 6.787422896840743e-05, + "loss": 0.5866, + "step": 3465 + }, + { + "epoch": 0.6161777777777778, + "grad_norm": 0.3376416182067583, + "learning_rate": 6.781970602536432e-05, + "loss": 0.5613, + "step": 3466 + }, + { + "epoch": 0.6163555555555555, + "grad_norm": 0.35782104298073997, + "learning_rate": 6.776519375174621e-05, + "loss": 0.59, + "step": 3467 + }, + { + "epoch": 0.6165333333333334, + "grad_norm": 0.34311783171678184, + "learning_rate": 6.771069216562684e-05, + "loss": 0.5509, + "step": 3468 + }, + { + "epoch": 0.6167111111111111, + "grad_norm": 0.35297933042292007, + "learning_rate": 6.765620128507619e-05, + "loss": 0.5925, + "step": 3469 + }, + { + "epoch": 0.6168888888888889, + "grad_norm": 0.36537623858666873, + "learning_rate": 6.76017211281609e-05, + "loss": 0.6049, + "step": 3470 + }, + { + "epoch": 0.6170666666666667, + "grad_norm": 0.3722809872677141, + "learning_rate": 6.754725171294382e-05, + "loss": 0.5862, + "step": 3471 + }, + { + "epoch": 0.6172444444444445, + "grad_norm": 0.35907129238545, + "learning_rate": 6.749279305748448e-05, + "loss": 0.5986, + "step": 3472 + }, + { + "epoch": 0.6174222222222222, + "grad_norm": 0.3620021998753497, + "learning_rate": 6.743834517983865e-05, + "loss": 0.5954, + "step": 3473 + }, + { + "epoch": 0.6176, + "grad_norm": 0.3555134466831354, + "learning_rate": 6.73839080980587e-05, + "loss": 0.5907, + "step": 3474 + }, + { + "epoch": 0.6177777777777778, + "grad_norm": 0.3405314946741367, + "learning_rate": 6.732948183019324e-05, + "loss": 0.5286, + "step": 3475 + }, + { + "epoch": 0.6179555555555556, + "grad_norm": 0.3419335786782383, + "learning_rate": 6.727506639428739e-05, + "loss": 0.5783, + "step": 3476 + }, + { + "epoch": 0.6181333333333333, + "grad_norm": 0.3608614890871183, + "learning_rate": 6.72206618083827e-05, + "loss": 0.6059, + "step": 3477 + }, + { + "epoch": 0.6183111111111111, + "grad_norm": 0.350330236392432, + "learning_rate": 6.71662680905171e-05, + "loss": 0.6079, + "step": 3478 + }, + { + "epoch": 0.6184888888888889, + "grad_norm": 0.3701563113973166, + "learning_rate": 6.711188525872486e-05, + "loss": 0.5895, + "step": 3479 + }, + { + "epoch": 0.6186666666666667, + "grad_norm": 0.3459818472425109, + "learning_rate": 6.705751333103675e-05, + "loss": 0.5937, + "step": 3480 + }, + { + "epoch": 0.6188444444444444, + "grad_norm": 0.34377589387540275, + "learning_rate": 6.700315232547981e-05, + "loss": 0.6201, + "step": 3481 + }, + { + "epoch": 0.6190222222222223, + "grad_norm": 0.3481638400587967, + "learning_rate": 6.694880226007757e-05, + "loss": 0.5828, + "step": 3482 + }, + { + "epoch": 0.6192, + "grad_norm": 0.35083915324440135, + "learning_rate": 6.689446315284981e-05, + "loss": 0.5639, + "step": 3483 + }, + { + "epoch": 0.6193777777777778, + "grad_norm": 0.36772858049589496, + "learning_rate": 6.684013502181281e-05, + "loss": 0.5576, + "step": 3484 + }, + { + "epoch": 0.6195555555555555, + "grad_norm": 0.35895734465117346, + "learning_rate": 6.678581788497908e-05, + "loss": 0.5583, + "step": 3485 + }, + { + "epoch": 0.6197333333333334, + "grad_norm": 0.36488448897219433, + "learning_rate": 6.673151176035762e-05, + "loss": 0.586, + "step": 3486 + }, + { + "epoch": 0.6199111111111111, + "grad_norm": 0.3556208053970919, + "learning_rate": 6.667721666595365e-05, + "loss": 0.6065, + "step": 3487 + }, + { + "epoch": 0.6200888888888889, + "grad_norm": 0.3484665544452661, + "learning_rate": 6.662293261976882e-05, + "loss": 0.6213, + "step": 3488 + }, + { + "epoch": 0.6202666666666666, + "grad_norm": 0.358535263140265, + "learning_rate": 6.656865963980105e-05, + "loss": 0.5572, + "step": 3489 + }, + { + "epoch": 0.6204444444444445, + "grad_norm": 0.3524276919910799, + "learning_rate": 6.651439774404471e-05, + "loss": 0.5964, + "step": 3490 + }, + { + "epoch": 0.6206222222222222, + "grad_norm": 0.36854678642307076, + "learning_rate": 6.64601469504903e-05, + "loss": 0.6039, + "step": 3491 + }, + { + "epoch": 0.6208, + "grad_norm": 0.3528405841598993, + "learning_rate": 6.640590727712485e-05, + "loss": 0.6008, + "step": 3492 + }, + { + "epoch": 0.6209777777777777, + "grad_norm": 0.37787793788798146, + "learning_rate": 6.635167874193153e-05, + "loss": 0.6022, + "step": 3493 + }, + { + "epoch": 0.6211555555555556, + "grad_norm": 0.3628039732841485, + "learning_rate": 6.629746136288997e-05, + "loss": 0.6084, + "step": 3494 + }, + { + "epoch": 0.6213333333333333, + "grad_norm": 0.36982780314785374, + "learning_rate": 6.624325515797593e-05, + "loss": 0.6215, + "step": 3495 + }, + { + "epoch": 0.6215111111111111, + "grad_norm": 0.339044758060382, + "learning_rate": 6.618906014516168e-05, + "loss": 0.6094, + "step": 3496 + }, + { + "epoch": 0.6216888888888888, + "grad_norm": 0.3498626307348065, + "learning_rate": 6.613487634241553e-05, + "loss": 0.6208, + "step": 3497 + }, + { + "epoch": 0.6218666666666667, + "grad_norm": 0.3380632226383569, + "learning_rate": 6.608070376770231e-05, + "loss": 0.5599, + "step": 3498 + }, + { + "epoch": 0.6220444444444444, + "grad_norm": 0.3309575532400716, + "learning_rate": 6.602654243898294e-05, + "loss": 0.5719, + "step": 3499 + }, + { + "epoch": 0.6222222222222222, + "grad_norm": 0.36177064999756525, + "learning_rate": 6.597239237421476e-05, + "loss": 0.6276, + "step": 3500 + }, + { + "epoch": 0.6224, + "grad_norm": 0.3604754376620941, + "learning_rate": 6.591825359135123e-05, + "loss": 0.5731, + "step": 3501 + }, + { + "epoch": 0.6225777777777778, + "grad_norm": 0.3775551538387199, + "learning_rate": 6.586412610834221e-05, + "loss": 0.6182, + "step": 3502 + }, + { + "epoch": 0.6227555555555555, + "grad_norm": 0.39689345077170296, + "learning_rate": 6.581000994313369e-05, + "loss": 0.5862, + "step": 3503 + }, + { + "epoch": 0.6229333333333333, + "grad_norm": 0.4202047682946692, + "learning_rate": 6.575590511366804e-05, + "loss": 0.6513, + "step": 3504 + }, + { + "epoch": 0.6231111111111111, + "grad_norm": 0.37963083087491084, + "learning_rate": 6.57018116378837e-05, + "loss": 0.608, + "step": 3505 + }, + { + "epoch": 0.6232888888888889, + "grad_norm": 0.3546949873482942, + "learning_rate": 6.564772953371555e-05, + "loss": 0.6026, + "step": 3506 + }, + { + "epoch": 0.6234666666666666, + "grad_norm": 0.34509642068842444, + "learning_rate": 6.55936588190945e-05, + "loss": 0.5632, + "step": 3507 + }, + { + "epoch": 0.6236444444444444, + "grad_norm": 0.3668139779251502, + "learning_rate": 6.553959951194787e-05, + "loss": 0.5937, + "step": 3508 + }, + { + "epoch": 0.6238222222222222, + "grad_norm": 0.34196327178857566, + "learning_rate": 6.5485551630199e-05, + "loss": 0.5792, + "step": 3509 + }, + { + "epoch": 0.624, + "grad_norm": 0.340771113425655, + "learning_rate": 6.543151519176764e-05, + "loss": 0.6075, + "step": 3510 + }, + { + "epoch": 0.6241777777777778, + "grad_norm": 0.37371294048224785, + "learning_rate": 6.537749021456959e-05, + "loss": 0.6254, + "step": 3511 + }, + { + "epoch": 0.6243555555555556, + "grad_norm": 0.3472025361223291, + "learning_rate": 6.532347671651697e-05, + "loss": 0.5427, + "step": 3512 + }, + { + "epoch": 0.6245333333333334, + "grad_norm": 0.3597057240268955, + "learning_rate": 6.526947471551798e-05, + "loss": 0.6071, + "step": 3513 + }, + { + "epoch": 0.6247111111111111, + "grad_norm": 0.3589264481249089, + "learning_rate": 6.521548422947709e-05, + "loss": 0.5886, + "step": 3514 + }, + { + "epoch": 0.6248888888888889, + "grad_norm": 0.5912673206431269, + "learning_rate": 6.516150527629495e-05, + "loss": 0.5546, + "step": 3515 + }, + { + "epoch": 0.6250666666666667, + "grad_norm": 0.3424699171545662, + "learning_rate": 6.510753787386831e-05, + "loss": 0.6057, + "step": 3516 + }, + { + "epoch": 0.6252444444444445, + "grad_norm": 0.3553341428033302, + "learning_rate": 6.505358204009017e-05, + "loss": 0.6527, + "step": 3517 + }, + { + "epoch": 0.6254222222222222, + "grad_norm": 0.34647811154723, + "learning_rate": 6.499963779284971e-05, + "loss": 0.5721, + "step": 3518 + }, + { + "epoch": 0.6256, + "grad_norm": 0.3827205530879716, + "learning_rate": 6.494570515003214e-05, + "loss": 0.5961, + "step": 3519 + }, + { + "epoch": 0.6257777777777778, + "grad_norm": 0.3572996619245539, + "learning_rate": 6.489178412951899e-05, + "loss": 0.6128, + "step": 3520 + }, + { + "epoch": 0.6259555555555556, + "grad_norm": 0.3358120218729844, + "learning_rate": 6.483787474918779e-05, + "loss": 0.5271, + "step": 3521 + }, + { + "epoch": 0.6261333333333333, + "grad_norm": 0.37989551295032553, + "learning_rate": 6.478397702691236e-05, + "loss": 0.6234, + "step": 3522 + }, + { + "epoch": 0.6263111111111112, + "grad_norm": 0.36223898724996817, + "learning_rate": 6.473009098056246e-05, + "loss": 0.5742, + "step": 3523 + }, + { + "epoch": 0.6264888888888889, + "grad_norm": 0.37495641852335404, + "learning_rate": 6.46762166280042e-05, + "loss": 0.5455, + "step": 3524 + }, + { + "epoch": 0.6266666666666667, + "grad_norm": 0.36508614808648054, + "learning_rate": 6.462235398709963e-05, + "loss": 0.5958, + "step": 3525 + }, + { + "epoch": 0.6268444444444444, + "grad_norm": 0.35860944822508845, + "learning_rate": 6.456850307570704e-05, + "loss": 0.6057, + "step": 3526 + }, + { + "epoch": 0.6270222222222223, + "grad_norm": 0.34764076097932783, + "learning_rate": 6.451466391168072e-05, + "loss": 0.5716, + "step": 3527 + }, + { + "epoch": 0.6272, + "grad_norm": 0.37072155796838124, + "learning_rate": 6.44608365128712e-05, + "loss": 0.5797, + "step": 3528 + }, + { + "epoch": 0.6273777777777778, + "grad_norm": 0.3657179908443539, + "learning_rate": 6.440702089712494e-05, + "loss": 0.5646, + "step": 3529 + }, + { + "epoch": 0.6275555555555555, + "grad_norm": 0.3665192780348969, + "learning_rate": 6.43532170822847e-05, + "loss": 0.5686, + "step": 3530 + }, + { + "epoch": 0.6277333333333334, + "grad_norm": 0.38878454090009323, + "learning_rate": 6.429942508618911e-05, + "loss": 0.6099, + "step": 3531 + }, + { + "epoch": 0.6279111111111111, + "grad_norm": 0.3571545416145081, + "learning_rate": 6.424564492667309e-05, + "loss": 0.618, + "step": 3532 + }, + { + "epoch": 0.6280888888888889, + "grad_norm": 0.4969865001081727, + "learning_rate": 6.419187662156743e-05, + "loss": 0.6076, + "step": 3533 + }, + { + "epoch": 0.6282666666666666, + "grad_norm": 0.3473146792785242, + "learning_rate": 6.413812018869918e-05, + "loss": 0.6025, + "step": 3534 + }, + { + "epoch": 0.6284444444444445, + "grad_norm": 0.34805733210335354, + "learning_rate": 6.40843756458913e-05, + "loss": 0.5854, + "step": 3535 + }, + { + "epoch": 0.6286222222222222, + "grad_norm": 0.3239626108847237, + "learning_rate": 6.403064301096294e-05, + "loss": 0.5545, + "step": 3536 + }, + { + "epoch": 0.6288, + "grad_norm": 0.3802899492632204, + "learning_rate": 6.397692230172918e-05, + "loss": 0.6472, + "step": 3537 + }, + { + "epoch": 0.6289777777777777, + "grad_norm": 0.3679740450769246, + "learning_rate": 6.392321353600124e-05, + "loss": 0.5777, + "step": 3538 + }, + { + "epoch": 0.6291555555555556, + "grad_norm": 0.380688082322288, + "learning_rate": 6.386951673158629e-05, + "loss": 0.6141, + "step": 3539 + }, + { + "epoch": 0.6293333333333333, + "grad_norm": 0.3916850997059995, + "learning_rate": 6.381583190628768e-05, + "loss": 0.5801, + "step": 3540 + }, + { + "epoch": 0.6295111111111111, + "grad_norm": 0.3609732841254345, + "learning_rate": 6.376215907790458e-05, + "loss": 0.5754, + "step": 3541 + }, + { + "epoch": 0.6296888888888889, + "grad_norm": 0.3728953022456657, + "learning_rate": 6.37084982642324e-05, + "loss": 0.5875, + "step": 3542 + }, + { + "epoch": 0.6298666666666667, + "grad_norm": 0.3442642696488891, + "learning_rate": 6.365484948306237e-05, + "loss": 0.6076, + "step": 3543 + }, + { + "epoch": 0.6300444444444444, + "grad_norm": 0.35495526276676037, + "learning_rate": 6.360121275218191e-05, + "loss": 0.6145, + "step": 3544 + }, + { + "epoch": 0.6302222222222222, + "grad_norm": 0.3622861146244761, + "learning_rate": 6.35475880893743e-05, + "loss": 0.6028, + "step": 3545 + }, + { + "epoch": 0.6304, + "grad_norm": 0.34979537722006593, + "learning_rate": 6.349397551241894e-05, + "loss": 0.5677, + "step": 3546 + }, + { + "epoch": 0.6305777777777778, + "grad_norm": 0.3563191619482736, + "learning_rate": 6.344037503909107e-05, + "loss": 0.593, + "step": 3547 + }, + { + "epoch": 0.6307555555555555, + "grad_norm": 0.35477512949167805, + "learning_rate": 6.338678668716209e-05, + "loss": 0.6049, + "step": 3548 + }, + { + "epoch": 0.6309333333333333, + "grad_norm": 0.35439656579729806, + "learning_rate": 6.333321047439925e-05, + "loss": 0.5905, + "step": 3549 + }, + { + "epoch": 0.6311111111111111, + "grad_norm": 0.35037068289693934, + "learning_rate": 6.327964641856585e-05, + "loss": 0.5684, + "step": 3550 + }, + { + "epoch": 0.6312888888888889, + "grad_norm": 0.36293152726456257, + "learning_rate": 6.322609453742113e-05, + "loss": 0.6106, + "step": 3551 + }, + { + "epoch": 0.6314666666666666, + "grad_norm": 0.3523435466776201, + "learning_rate": 6.317255484872028e-05, + "loss": 0.5698, + "step": 3552 + }, + { + "epoch": 0.6316444444444445, + "grad_norm": 0.34265393737932, + "learning_rate": 6.311902737021447e-05, + "loss": 0.5689, + "step": 3553 + }, + { + "epoch": 0.6318222222222222, + "grad_norm": 0.3501369098553959, + "learning_rate": 6.306551211965087e-05, + "loss": 0.6044, + "step": 3554 + }, + { + "epoch": 0.632, + "grad_norm": 0.35140657805071, + "learning_rate": 6.301200911477243e-05, + "loss": 0.6326, + "step": 3555 + }, + { + "epoch": 0.6321777777777777, + "grad_norm": 0.39085407842069364, + "learning_rate": 6.295851837331826e-05, + "loss": 0.6193, + "step": 3556 + }, + { + "epoch": 0.6323555555555556, + "grad_norm": 0.35576980451479745, + "learning_rate": 6.290503991302324e-05, + "loss": 0.6019, + "step": 3557 + }, + { + "epoch": 0.6325333333333333, + "grad_norm": 0.7752817910640124, + "learning_rate": 6.285157375161825e-05, + "loss": 0.6049, + "step": 3558 + }, + { + "epoch": 0.6327111111111111, + "grad_norm": 0.3646257448803145, + "learning_rate": 6.279811990683006e-05, + "loss": 0.5776, + "step": 3559 + }, + { + "epoch": 0.6328888888888888, + "grad_norm": 0.3551345148143536, + "learning_rate": 6.274467839638142e-05, + "loss": 0.572, + "step": 3560 + }, + { + "epoch": 0.6330666666666667, + "grad_norm": 0.36103613728553446, + "learning_rate": 6.26912492379909e-05, + "loss": 0.5909, + "step": 3561 + }, + { + "epoch": 0.6332444444444445, + "grad_norm": 0.33590653824655003, + "learning_rate": 6.2637832449373e-05, + "loss": 0.5457, + "step": 3562 + }, + { + "epoch": 0.6334222222222222, + "grad_norm": 0.3584554705871418, + "learning_rate": 6.258442804823818e-05, + "loss": 0.5899, + "step": 3563 + }, + { + "epoch": 0.6336, + "grad_norm": 0.33794756157224276, + "learning_rate": 6.253103605229279e-05, + "loss": 0.5733, + "step": 3564 + }, + { + "epoch": 0.6337777777777778, + "grad_norm": 0.34775818070893616, + "learning_rate": 6.24776564792389e-05, + "loss": 0.573, + "step": 3565 + }, + { + "epoch": 0.6339555555555556, + "grad_norm": 0.3587005246754528, + "learning_rate": 6.242428934677469e-05, + "loss": 0.6147, + "step": 3566 + }, + { + "epoch": 0.6341333333333333, + "grad_norm": 0.34490202538297937, + "learning_rate": 6.237093467259406e-05, + "loss": 0.5555, + "step": 3567 + }, + { + "epoch": 0.6343111111111112, + "grad_norm": 0.3442695358512481, + "learning_rate": 6.231759247438689e-05, + "loss": 0.6082, + "step": 3568 + }, + { + "epoch": 0.6344888888888889, + "grad_norm": 0.35557839116978457, + "learning_rate": 6.22642627698388e-05, + "loss": 0.5502, + "step": 3569 + }, + { + "epoch": 0.6346666666666667, + "grad_norm": 0.3512858908955408, + "learning_rate": 6.22109455766314e-05, + "loss": 0.6629, + "step": 3570 + }, + { + "epoch": 0.6348444444444444, + "grad_norm": 0.3492902926188577, + "learning_rate": 6.215764091244202e-05, + "loss": 0.5978, + "step": 3571 + }, + { + "epoch": 0.6350222222222223, + "grad_norm": 0.3346870877084782, + "learning_rate": 6.210434879494398e-05, + "loss": 0.607, + "step": 3572 + }, + { + "epoch": 0.6352, + "grad_norm": 0.33149197422111853, + "learning_rate": 6.205106924180628e-05, + "loss": 0.5434, + "step": 3573 + }, + { + "epoch": 0.6353777777777778, + "grad_norm": 0.3360393627377263, + "learning_rate": 6.19978022706939e-05, + "loss": 0.5503, + "step": 3574 + }, + { + "epoch": 0.6355555555555555, + "grad_norm": 0.3511337103197416, + "learning_rate": 6.194454789926753e-05, + "loss": 0.6064, + "step": 3575 + }, + { + "epoch": 0.6357333333333334, + "grad_norm": 0.343008834351783, + "learning_rate": 6.18913061451838e-05, + "loss": 0.5664, + "step": 3576 + }, + { + "epoch": 0.6359111111111111, + "grad_norm": 0.3586285495123366, + "learning_rate": 6.183807702609502e-05, + "loss": 0.5865, + "step": 3577 + }, + { + "epoch": 0.6360888888888889, + "grad_norm": 0.542675413299601, + "learning_rate": 6.178486055964945e-05, + "loss": 0.587, + "step": 3578 + }, + { + "epoch": 0.6362666666666666, + "grad_norm": 0.3560700431742999, + "learning_rate": 6.173165676349103e-05, + "loss": 0.5908, + "step": 3579 + }, + { + "epoch": 0.6364444444444445, + "grad_norm": 0.35079808482782393, + "learning_rate": 6.167846565525959e-05, + "loss": 0.5714, + "step": 3580 + }, + { + "epoch": 0.6366222222222222, + "grad_norm": 0.3426357295417441, + "learning_rate": 6.162528725259078e-05, + "loss": 0.5869, + "step": 3581 + }, + { + "epoch": 0.6368, + "grad_norm": 0.35842479969212593, + "learning_rate": 6.157212157311587e-05, + "loss": 0.5887, + "step": 3582 + }, + { + "epoch": 0.6369777777777778, + "grad_norm": 0.3355367005280455, + "learning_rate": 6.151896863446213e-05, + "loss": 0.5042, + "step": 3583 + }, + { + "epoch": 0.6371555555555556, + "grad_norm": 0.3840250612265355, + "learning_rate": 6.146582845425242e-05, + "loss": 0.6047, + "step": 3584 + }, + { + "epoch": 0.6373333333333333, + "grad_norm": 0.3433237126454255, + "learning_rate": 6.141270105010546e-05, + "loss": 0.6181, + "step": 3585 + }, + { + "epoch": 0.6375111111111111, + "grad_norm": 0.3499383746671317, + "learning_rate": 6.135958643963572e-05, + "loss": 0.5421, + "step": 3586 + }, + { + "epoch": 0.6376888888888889, + "grad_norm": 0.35382809154052436, + "learning_rate": 6.130648464045347e-05, + "loss": 0.575, + "step": 3587 + }, + { + "epoch": 0.6378666666666667, + "grad_norm": 0.3518188382626932, + "learning_rate": 6.125339567016463e-05, + "loss": 0.6014, + "step": 3588 + }, + { + "epoch": 0.6380444444444444, + "grad_norm": 0.37384338260400213, + "learning_rate": 6.120031954637101e-05, + "loss": 0.586, + "step": 3589 + }, + { + "epoch": 0.6382222222222222, + "grad_norm": 0.3653998650388993, + "learning_rate": 6.114725628666998e-05, + "loss": 0.5737, + "step": 3590 + }, + { + "epoch": 0.6384, + "grad_norm": 0.37182084882117417, + "learning_rate": 6.109420590865483e-05, + "loss": 0.6172, + "step": 3591 + }, + { + "epoch": 0.6385777777777778, + "grad_norm": 0.3400798485017609, + "learning_rate": 6.104116842991441e-05, + "loss": 0.529, + "step": 3592 + }, + { + "epoch": 0.6387555555555555, + "grad_norm": 0.3426619492386837, + "learning_rate": 6.098814386803347e-05, + "loss": 0.6251, + "step": 3593 + }, + { + "epoch": 0.6389333333333334, + "grad_norm": 0.3501116711393118, + "learning_rate": 6.0935132240592295e-05, + "loss": 0.5691, + "step": 3594 + }, + { + "epoch": 0.6391111111111111, + "grad_norm": 0.37362110136957377, + "learning_rate": 6.0882133565167055e-05, + "loss": 0.6166, + "step": 3595 + }, + { + "epoch": 0.6392888888888889, + "grad_norm": 0.34956877901583, + "learning_rate": 6.082914785932947e-05, + "loss": 0.5668, + "step": 3596 + }, + { + "epoch": 0.6394666666666666, + "grad_norm": 0.3425135692081391, + "learning_rate": 6.0776175140647064e-05, + "loss": 0.5861, + "step": 3597 + }, + { + "epoch": 0.6396444444444445, + "grad_norm": 0.3633986640495545, + "learning_rate": 6.072321542668301e-05, + "loss": 0.6178, + "step": 3598 + }, + { + "epoch": 0.6398222222222222, + "grad_norm": 0.35718099890882815, + "learning_rate": 6.067026873499622e-05, + "loss": 0.5801, + "step": 3599 + }, + { + "epoch": 0.64, + "grad_norm": 0.3428943765469151, + "learning_rate": 6.061733508314116e-05, + "loss": 0.5551, + "step": 3600 + }, + { + "epoch": 0.6401777777777777, + "grad_norm": 0.3620896830376724, + "learning_rate": 6.0564414488668165e-05, + "loss": 0.5534, + "step": 3601 + }, + { + "epoch": 0.6403555555555556, + "grad_norm": 0.35713715218538716, + "learning_rate": 6.0511506969123044e-05, + "loss": 0.6019, + "step": 3602 + }, + { + "epoch": 0.6405333333333333, + "grad_norm": 0.3399310349375786, + "learning_rate": 6.0458612542047456e-05, + "loss": 0.5524, + "step": 3603 + }, + { + "epoch": 0.6407111111111111, + "grad_norm": 0.35280661807942976, + "learning_rate": 6.0405731224978546e-05, + "loss": 0.5915, + "step": 3604 + }, + { + "epoch": 0.6408888888888888, + "grad_norm": 0.3624197279910715, + "learning_rate": 6.035286303544927e-05, + "loss": 0.5965, + "step": 3605 + }, + { + "epoch": 0.6410666666666667, + "grad_norm": 0.37170746095734314, + "learning_rate": 6.030000799098808e-05, + "loss": 0.596, + "step": 3606 + }, + { + "epoch": 0.6412444444444444, + "grad_norm": 0.34632664950674946, + "learning_rate": 6.024716610911924e-05, + "loss": 0.5892, + "step": 3607 + }, + { + "epoch": 0.6414222222222222, + "grad_norm": 0.3471931167338859, + "learning_rate": 6.0194337407362466e-05, + "loss": 0.5871, + "step": 3608 + }, + { + "epoch": 0.6416, + "grad_norm": 0.3570258689320991, + "learning_rate": 6.0141521903233235e-05, + "loss": 0.6014, + "step": 3609 + }, + { + "epoch": 0.6417777777777778, + "grad_norm": 0.38872224192835486, + "learning_rate": 6.008871961424258e-05, + "loss": 0.5682, + "step": 3610 + }, + { + "epoch": 0.6419555555555555, + "grad_norm": 0.3795415590392823, + "learning_rate": 6.003593055789725e-05, + "loss": 0.604, + "step": 3611 + }, + { + "epoch": 0.6421333333333333, + "grad_norm": 0.35418504513328236, + "learning_rate": 5.998315475169942e-05, + "loss": 0.5997, + "step": 3612 + }, + { + "epoch": 0.6423111111111112, + "grad_norm": 0.35941259139829723, + "learning_rate": 5.9930392213147116e-05, + "loss": 0.592, + "step": 3613 + }, + { + "epoch": 0.6424888888888889, + "grad_norm": 0.35565618143860017, + "learning_rate": 5.987764295973373e-05, + "loss": 0.6069, + "step": 3614 + }, + { + "epoch": 0.6426666666666667, + "grad_norm": 0.3672921698113864, + "learning_rate": 5.982490700894844e-05, + "loss": 0.6058, + "step": 3615 + }, + { + "epoch": 0.6428444444444444, + "grad_norm": 0.36360234288234605, + "learning_rate": 5.9772184378275854e-05, + "loss": 0.5763, + "step": 3616 + }, + { + "epoch": 0.6430222222222223, + "grad_norm": 0.34416994375996074, + "learning_rate": 5.971947508519631e-05, + "loss": 0.5716, + "step": 3617 + }, + { + "epoch": 0.6432, + "grad_norm": 0.3785566191898189, + "learning_rate": 5.9666779147185593e-05, + "loss": 0.5764, + "step": 3618 + }, + { + "epoch": 0.6433777777777778, + "grad_norm": 0.3449185400170919, + "learning_rate": 5.9614096581715196e-05, + "loss": 0.5536, + "step": 3619 + }, + { + "epoch": 0.6435555555555555, + "grad_norm": 0.36290689660054826, + "learning_rate": 5.956142740625203e-05, + "loss": 0.5884, + "step": 3620 + }, + { + "epoch": 0.6437333333333334, + "grad_norm": 0.35742552147412654, + "learning_rate": 5.9508771638258654e-05, + "loss": 0.6239, + "step": 3621 + }, + { + "epoch": 0.6439111111111111, + "grad_norm": 0.33570459402326264, + "learning_rate": 5.94561292951932e-05, + "loss": 0.5522, + "step": 3622 + }, + { + "epoch": 0.6440888888888889, + "grad_norm": 0.3439704717730226, + "learning_rate": 5.94035003945093e-05, + "loss": 0.6095, + "step": 3623 + }, + { + "epoch": 0.6442666666666667, + "grad_norm": 0.33866081392070085, + "learning_rate": 5.935088495365613e-05, + "loss": 0.5909, + "step": 3624 + }, + { + "epoch": 0.6444444444444445, + "grad_norm": 0.3648555868908178, + "learning_rate": 5.929828299007845e-05, + "loss": 0.6537, + "step": 3625 + }, + { + "epoch": 0.6446222222222222, + "grad_norm": 0.3621514256478139, + "learning_rate": 5.9245694521216464e-05, + "loss": 0.5785, + "step": 3626 + }, + { + "epoch": 0.6448, + "grad_norm": 0.330012553903769, + "learning_rate": 5.9193119564506035e-05, + "loss": 0.5645, + "step": 3627 + }, + { + "epoch": 0.6449777777777778, + "grad_norm": 0.3543822424593375, + "learning_rate": 5.914055813737839e-05, + "loss": 0.6368, + "step": 3628 + }, + { + "epoch": 0.6451555555555556, + "grad_norm": 0.3448404119091526, + "learning_rate": 5.908801025726043e-05, + "loss": 0.6099, + "step": 3629 + }, + { + "epoch": 0.6453333333333333, + "grad_norm": 0.38372058412664345, + "learning_rate": 5.90354759415744e-05, + "loss": 0.6211, + "step": 3630 + }, + { + "epoch": 0.6455111111111111, + "grad_norm": 0.34748770951860847, + "learning_rate": 5.898295520773822e-05, + "loss": 0.5741, + "step": 3631 + }, + { + "epoch": 0.6456888888888889, + "grad_norm": 0.3332256196844322, + "learning_rate": 5.893044807316516e-05, + "loss": 0.5112, + "step": 3632 + }, + { + "epoch": 0.6458666666666667, + "grad_norm": 0.3524257255133568, + "learning_rate": 5.8877954555264034e-05, + "loss": 0.5948, + "step": 3633 + }, + { + "epoch": 0.6460444444444444, + "grad_norm": 0.37536332792058563, + "learning_rate": 5.88254746714392e-05, + "loss": 0.6007, + "step": 3634 + }, + { + "epoch": 0.6462222222222223, + "grad_norm": 0.3401923225619182, + "learning_rate": 5.877300843909039e-05, + "loss": 0.5933, + "step": 3635 + }, + { + "epoch": 0.6464, + "grad_norm": 0.3827200268635426, + "learning_rate": 5.872055587561287e-05, + "loss": 0.6173, + "step": 3636 + }, + { + "epoch": 0.6465777777777778, + "grad_norm": 0.33696110007659563, + "learning_rate": 5.86681169983974e-05, + "loss": 0.6143, + "step": 3637 + }, + { + "epoch": 0.6467555555555555, + "grad_norm": 0.3616523708236569, + "learning_rate": 5.861569182483013e-05, + "loss": 0.5623, + "step": 3638 + }, + { + "epoch": 0.6469333333333334, + "grad_norm": 0.330474537152214, + "learning_rate": 5.856328037229275e-05, + "loss": 0.5439, + "step": 3639 + }, + { + "epoch": 0.6471111111111111, + "grad_norm": 0.3570580044650607, + "learning_rate": 5.851088265816229e-05, + "loss": 0.601, + "step": 3640 + }, + { + "epoch": 0.6472888888888889, + "grad_norm": 0.3844020305251516, + "learning_rate": 5.845849869981137e-05, + "loss": 0.6124, + "step": 3641 + }, + { + "epoch": 0.6474666666666666, + "grad_norm": 0.3526083964359886, + "learning_rate": 5.8406128514607894e-05, + "loss": 0.5936, + "step": 3642 + }, + { + "epoch": 0.6476444444444445, + "grad_norm": 0.37011468592008107, + "learning_rate": 5.8353772119915376e-05, + "loss": 0.5882, + "step": 3643 + }, + { + "epoch": 0.6478222222222222, + "grad_norm": 0.3553719660528463, + "learning_rate": 5.83014295330925e-05, + "loss": 0.5586, + "step": 3644 + }, + { + "epoch": 0.648, + "grad_norm": 0.36940514675211517, + "learning_rate": 5.824910077149371e-05, + "loss": 0.6187, + "step": 3645 + }, + { + "epoch": 0.6481777777777777, + "grad_norm": 0.37706720400180127, + "learning_rate": 5.8196785852468524e-05, + "loss": 0.6348, + "step": 3646 + }, + { + "epoch": 0.6483555555555556, + "grad_norm": 0.3574009887838271, + "learning_rate": 5.8144484793362183e-05, + "loss": 0.5933, + "step": 3647 + }, + { + "epoch": 0.6485333333333333, + "grad_norm": 0.3494989304118295, + "learning_rate": 5.809219761151504e-05, + "loss": 0.567, + "step": 3648 + }, + { + "epoch": 0.6487111111111111, + "grad_norm": 0.33222398716018736, + "learning_rate": 5.803992432426313e-05, + "loss": 0.5553, + "step": 3649 + }, + { + "epoch": 0.6488888888888888, + "grad_norm": 0.35770588020278876, + "learning_rate": 5.798766494893759e-05, + "loss": 0.5887, + "step": 3650 + }, + { + "epoch": 0.6490666666666667, + "grad_norm": 0.34363918052300946, + "learning_rate": 5.793541950286525e-05, + "loss": 0.5529, + "step": 3651 + }, + { + "epoch": 0.6492444444444444, + "grad_norm": 0.40714223195235766, + "learning_rate": 5.788318800336805e-05, + "loss": 0.5717, + "step": 3652 + }, + { + "epoch": 0.6494222222222222, + "grad_norm": 0.35105818741588674, + "learning_rate": 5.7830970467763456e-05, + "loss": 0.5854, + "step": 3653 + }, + { + "epoch": 0.6496, + "grad_norm": 0.36190994315693215, + "learning_rate": 5.777876691336428e-05, + "loss": 0.6103, + "step": 3654 + }, + { + "epoch": 0.6497777777777778, + "grad_norm": 0.35405315857611974, + "learning_rate": 5.772657735747868e-05, + "loss": 0.594, + "step": 3655 + }, + { + "epoch": 0.6499555555555555, + "grad_norm": 0.34851815309047113, + "learning_rate": 5.767440181741019e-05, + "loss": 0.5901, + "step": 3656 + }, + { + "epoch": 0.6501333333333333, + "grad_norm": 0.33081768470922984, + "learning_rate": 5.762224031045769e-05, + "loss": 0.569, + "step": 3657 + }, + { + "epoch": 0.6503111111111111, + "grad_norm": 0.34795808382731863, + "learning_rate": 5.757009285391539e-05, + "loss": 0.5517, + "step": 3658 + }, + { + "epoch": 0.6504888888888889, + "grad_norm": 0.35188535283666056, + "learning_rate": 5.751795946507289e-05, + "loss": 0.5943, + "step": 3659 + }, + { + "epoch": 0.6506666666666666, + "grad_norm": 0.3416260959050426, + "learning_rate": 5.746584016121506e-05, + "loss": 0.576, + "step": 3660 + }, + { + "epoch": 0.6508444444444444, + "grad_norm": 0.3782479717293079, + "learning_rate": 5.7413734959622154e-05, + "loss": 0.5989, + "step": 3661 + }, + { + "epoch": 0.6510222222222222, + "grad_norm": 0.3640211225542614, + "learning_rate": 5.7361643877569726e-05, + "loss": 0.5537, + "step": 3662 + }, + { + "epoch": 0.6512, + "grad_norm": 0.3611950292266739, + "learning_rate": 5.730956693232865e-05, + "loss": 0.6005, + "step": 3663 + }, + { + "epoch": 0.6513777777777778, + "grad_norm": 0.38188780470835093, + "learning_rate": 5.725750414116512e-05, + "loss": 0.5875, + "step": 3664 + }, + { + "epoch": 0.6515555555555556, + "grad_norm": 0.3658645736037116, + "learning_rate": 5.7205455521340664e-05, + "loss": 0.56, + "step": 3665 + }, + { + "epoch": 0.6517333333333334, + "grad_norm": 0.3363545348744477, + "learning_rate": 5.715342109011197e-05, + "loss": 0.5807, + "step": 3666 + }, + { + "epoch": 0.6519111111111111, + "grad_norm": 0.38168650893393086, + "learning_rate": 5.710140086473129e-05, + "loss": 0.6215, + "step": 3667 + }, + { + "epoch": 0.6520888888888889, + "grad_norm": 0.32285625948519464, + "learning_rate": 5.704939486244585e-05, + "loss": 0.5718, + "step": 3668 + }, + { + "epoch": 0.6522666666666667, + "grad_norm": 0.4351810686514931, + "learning_rate": 5.699740310049847e-05, + "loss": 0.5813, + "step": 3669 + }, + { + "epoch": 0.6524444444444445, + "grad_norm": 0.38417927546628566, + "learning_rate": 5.694542559612694e-05, + "loss": 0.6186, + "step": 3670 + }, + { + "epoch": 0.6526222222222222, + "grad_norm": 0.3399410619714422, + "learning_rate": 5.689346236656465e-05, + "loss": 0.5959, + "step": 3671 + }, + { + "epoch": 0.6528, + "grad_norm": 0.3571073390895776, + "learning_rate": 5.684151342903992e-05, + "loss": 0.5782, + "step": 3672 + }, + { + "epoch": 0.6529777777777778, + "grad_norm": 0.3782638001435793, + "learning_rate": 5.6789578800776657e-05, + "loss": 0.5614, + "step": 3673 + }, + { + "epoch": 0.6531555555555556, + "grad_norm": 0.33367554548758843, + "learning_rate": 5.6737658498993705e-05, + "loss": 0.5749, + "step": 3674 + }, + { + "epoch": 0.6533333333333333, + "grad_norm": 0.3605185855518717, + "learning_rate": 5.668575254090549e-05, + "loss": 0.5976, + "step": 3675 + }, + { + "epoch": 0.6535111111111112, + "grad_norm": 0.32927454750553914, + "learning_rate": 5.6633860943721376e-05, + "loss": 0.5539, + "step": 3676 + }, + { + "epoch": 0.6536888888888889, + "grad_norm": 0.3330113145026618, + "learning_rate": 5.6581983724646134e-05, + "loss": 0.5243, + "step": 3677 + }, + { + "epoch": 0.6538666666666667, + "grad_norm": 0.35874514621568315, + "learning_rate": 5.653012090087977e-05, + "loss": 0.5664, + "step": 3678 + }, + { + "epoch": 0.6540444444444444, + "grad_norm": 0.352777141995489, + "learning_rate": 5.6478272489617435e-05, + "loss": 0.563, + "step": 3679 + }, + { + "epoch": 0.6542222222222223, + "grad_norm": 0.360922996740039, + "learning_rate": 5.6426438508049586e-05, + "loss": 0.602, + "step": 3680 + }, + { + "epoch": 0.6544, + "grad_norm": 0.341169596886129, + "learning_rate": 5.637461897336185e-05, + "loss": 0.5374, + "step": 3681 + }, + { + "epoch": 0.6545777777777778, + "grad_norm": 0.33385876548773685, + "learning_rate": 5.632281390273504e-05, + "loss": 0.6026, + "step": 3682 + }, + { + "epoch": 0.6547555555555555, + "grad_norm": 0.3642168212086464, + "learning_rate": 5.627102331334525e-05, + "loss": 0.5521, + "step": 3683 + }, + { + "epoch": 0.6549333333333334, + "grad_norm": 0.3407992070632586, + "learning_rate": 5.62192472223637e-05, + "loss": 0.5686, + "step": 3684 + }, + { + "epoch": 0.6551111111111111, + "grad_norm": 0.3511116529870407, + "learning_rate": 5.616748564695684e-05, + "loss": 0.5886, + "step": 3685 + }, + { + "epoch": 0.6552888888888889, + "grad_norm": 0.33872765751719236, + "learning_rate": 5.611573860428631e-05, + "loss": 0.5378, + "step": 3686 + }, + { + "epoch": 0.6554666666666666, + "grad_norm": 0.35819948424700543, + "learning_rate": 5.606400611150889e-05, + "loss": 0.571, + "step": 3687 + }, + { + "epoch": 0.6556444444444445, + "grad_norm": 0.35103158782362637, + "learning_rate": 5.60122881857766e-05, + "loss": 0.5859, + "step": 3688 + }, + { + "epoch": 0.6558222222222222, + "grad_norm": 0.3611830397005914, + "learning_rate": 5.596058484423656e-05, + "loss": 0.5793, + "step": 3689 + }, + { + "epoch": 0.656, + "grad_norm": 0.34265305509760013, + "learning_rate": 5.590889610403113e-05, + "loss": 0.6121, + "step": 3690 + }, + { + "epoch": 0.6561777777777777, + "grad_norm": 0.3347845244091521, + "learning_rate": 5.58572219822978e-05, + "loss": 0.5493, + "step": 3691 + }, + { + "epoch": 0.6563555555555556, + "grad_norm": 0.3514548575455999, + "learning_rate": 5.580556249616911e-05, + "loss": 0.5787, + "step": 3692 + }, + { + "epoch": 0.6565333333333333, + "grad_norm": 0.37089259067215413, + "learning_rate": 5.575391766277297e-05, + "loss": 0.5918, + "step": 3693 + }, + { + "epoch": 0.6567111111111111, + "grad_norm": 0.34641717036089437, + "learning_rate": 5.570228749923217e-05, + "loss": 0.5519, + "step": 3694 + }, + { + "epoch": 0.6568888888888889, + "grad_norm": 0.3349639923342679, + "learning_rate": 5.5650672022664896e-05, + "loss": 0.5448, + "step": 3695 + }, + { + "epoch": 0.6570666666666667, + "grad_norm": 0.33892264641173697, + "learning_rate": 5.559907125018421e-05, + "loss": 0.5888, + "step": 3696 + }, + { + "epoch": 0.6572444444444444, + "grad_norm": 0.3706524664214029, + "learning_rate": 5.554748519889858e-05, + "loss": 0.658, + "step": 3697 + }, + { + "epoch": 0.6574222222222222, + "grad_norm": 0.34529823305343504, + "learning_rate": 5.5495913885911265e-05, + "loss": 0.5921, + "step": 3698 + }, + { + "epoch": 0.6576, + "grad_norm": 0.3563740637613725, + "learning_rate": 5.5444357328320985e-05, + "loss": 0.566, + "step": 3699 + }, + { + "epoch": 0.6577777777777778, + "grad_norm": 0.3667431109579284, + "learning_rate": 5.5392815543221254e-05, + "loss": 0.6147, + "step": 3700 + }, + { + "epoch": 0.6579555555555555, + "grad_norm": 0.36518416485491273, + "learning_rate": 5.534128854770089e-05, + "loss": 0.6201, + "step": 3701 + }, + { + "epoch": 0.6581333333333333, + "grad_norm": 0.34773657590350704, + "learning_rate": 5.528977635884375e-05, + "loss": 0.5689, + "step": 3702 + }, + { + "epoch": 0.6583111111111111, + "grad_norm": 0.3530176806566833, + "learning_rate": 5.5238278993728756e-05, + "loss": 0.5857, + "step": 3703 + }, + { + "epoch": 0.6584888888888889, + "grad_norm": 0.33495610749260346, + "learning_rate": 5.5186796469429956e-05, + "loss": 0.5804, + "step": 3704 + }, + { + "epoch": 0.6586666666666666, + "grad_norm": 0.35968340807869076, + "learning_rate": 5.513532880301645e-05, + "loss": 0.5413, + "step": 3705 + }, + { + "epoch": 0.6588444444444445, + "grad_norm": 0.37352011675406216, + "learning_rate": 5.508387601155243e-05, + "loss": 0.625, + "step": 3706 + }, + { + "epoch": 0.6590222222222222, + "grad_norm": 0.3568111178401562, + "learning_rate": 5.503243811209713e-05, + "loss": 0.5753, + "step": 3707 + }, + { + "epoch": 0.6592, + "grad_norm": 0.3582129020937405, + "learning_rate": 5.498101512170486e-05, + "loss": 0.5989, + "step": 3708 + }, + { + "epoch": 0.6593777777777777, + "grad_norm": 0.33605525824591304, + "learning_rate": 5.4929607057425015e-05, + "loss": 0.5224, + "step": 3709 + }, + { + "epoch": 0.6595555555555556, + "grad_norm": 0.37212750134613615, + "learning_rate": 5.4878213936302e-05, + "loss": 0.5551, + "step": 3710 + }, + { + "epoch": 0.6597333333333333, + "grad_norm": 0.3252409361183586, + "learning_rate": 5.4826835775375285e-05, + "loss": 0.6087, + "step": 3711 + }, + { + "epoch": 0.6599111111111111, + "grad_norm": 0.35620452460367513, + "learning_rate": 5.477547259167939e-05, + "loss": 0.592, + "step": 3712 + }, + { + "epoch": 0.6600888888888888, + "grad_norm": 0.37465822067242516, + "learning_rate": 5.4724124402243837e-05, + "loss": 0.6187, + "step": 3713 + }, + { + "epoch": 0.6602666666666667, + "grad_norm": 0.38319880412547924, + "learning_rate": 5.467279122409319e-05, + "loss": 0.581, + "step": 3714 + }, + { + "epoch": 0.6604444444444444, + "grad_norm": 0.3437437267363973, + "learning_rate": 5.46214730742471e-05, + "loss": 0.525, + "step": 3715 + }, + { + "epoch": 0.6606222222222222, + "grad_norm": 0.35851619945769814, + "learning_rate": 5.4570169969720055e-05, + "loss": 0.596, + "step": 3716 + }, + { + "epoch": 0.6608, + "grad_norm": 0.4593652326437602, + "learning_rate": 5.451888192752184e-05, + "loss": 0.6269, + "step": 3717 + }, + { + "epoch": 0.6609777777777778, + "grad_norm": 0.36749277886033155, + "learning_rate": 5.4467608964656905e-05, + "loss": 0.5828, + "step": 3718 + }, + { + "epoch": 0.6611555555555556, + "grad_norm": 0.3591357295312584, + "learning_rate": 5.441635109812504e-05, + "loss": 0.6097, + "step": 3719 + }, + { + "epoch": 0.6613333333333333, + "grad_norm": 0.34741984473289583, + "learning_rate": 5.436510834492072e-05, + "loss": 0.5938, + "step": 3720 + }, + { + "epoch": 0.6615111111111112, + "grad_norm": 0.34992349716413, + "learning_rate": 5.431388072203373e-05, + "loss": 0.5635, + "step": 3721 + }, + { + "epoch": 0.6616888888888889, + "grad_norm": 0.3418413646637175, + "learning_rate": 5.4262668246448475e-05, + "loss": 0.5847, + "step": 3722 + }, + { + "epoch": 0.6618666666666667, + "grad_norm": 0.36975539394784773, + "learning_rate": 5.4211470935144715e-05, + "loss": 0.5424, + "step": 3723 + }, + { + "epoch": 0.6620444444444444, + "grad_norm": 0.33751654601563225, + "learning_rate": 5.4160288805096845e-05, + "loss": 0.599, + "step": 3724 + }, + { + "epoch": 0.6622222222222223, + "grad_norm": 0.3431978875411624, + "learning_rate": 5.410912187327446e-05, + "loss": 0.5164, + "step": 3725 + }, + { + "epoch": 0.6624, + "grad_norm": 0.37189954351334, + "learning_rate": 5.4057970156641994e-05, + "loss": 0.6025, + "step": 3726 + }, + { + "epoch": 0.6625777777777778, + "grad_norm": 0.32721099794335723, + "learning_rate": 5.4006833672158885e-05, + "loss": 0.5299, + "step": 3727 + }, + { + "epoch": 0.6627555555555555, + "grad_norm": 0.368190475024461, + "learning_rate": 5.3955712436779534e-05, + "loss": 0.5913, + "step": 3728 + }, + { + "epoch": 0.6629333333333334, + "grad_norm": 0.3407316840208625, + "learning_rate": 5.3904606467453254e-05, + "loss": 0.5816, + "step": 3729 + }, + { + "epoch": 0.6631111111111111, + "grad_norm": 0.3549760611940972, + "learning_rate": 5.385351578112429e-05, + "loss": 0.605, + "step": 3730 + }, + { + "epoch": 0.6632888888888889, + "grad_norm": 0.3396666114160034, + "learning_rate": 5.380244039473184e-05, + "loss": 0.5629, + "step": 3731 + }, + { + "epoch": 0.6634666666666666, + "grad_norm": 0.34588927208694353, + "learning_rate": 5.375138032521004e-05, + "loss": 0.586, + "step": 3732 + }, + { + "epoch": 0.6636444444444445, + "grad_norm": 0.364545977171092, + "learning_rate": 5.3700335589487925e-05, + "loss": 0.5552, + "step": 3733 + }, + { + "epoch": 0.6638222222222222, + "grad_norm": 0.3376695797701531, + "learning_rate": 5.364930620448946e-05, + "loss": 0.5871, + "step": 3734 + }, + { + "epoch": 0.664, + "grad_norm": 0.3670168776506038, + "learning_rate": 5.35982921871335e-05, + "loss": 0.6, + "step": 3735 + }, + { + "epoch": 0.6641777777777778, + "grad_norm": 0.36599522643191545, + "learning_rate": 5.354729355433383e-05, + "loss": 0.6191, + "step": 3736 + }, + { + "epoch": 0.6643555555555556, + "grad_norm": 0.35783214765567045, + "learning_rate": 5.3496310322999134e-05, + "loss": 0.6039, + "step": 3737 + }, + { + "epoch": 0.6645333333333333, + "grad_norm": 0.40812885957732903, + "learning_rate": 5.344534251003296e-05, + "loss": 0.5612, + "step": 3738 + }, + { + "epoch": 0.6647111111111111, + "grad_norm": 0.35493173709646303, + "learning_rate": 5.3394390132333805e-05, + "loss": 0.5777, + "step": 3739 + }, + { + "epoch": 0.6648888888888889, + "grad_norm": 0.346816382573614, + "learning_rate": 5.33434532067949e-05, + "loss": 0.5815, + "step": 3740 + }, + { + "epoch": 0.6650666666666667, + "grad_norm": 0.3428371997888651, + "learning_rate": 5.329253175030462e-05, + "loss": 0.5586, + "step": 3741 + }, + { + "epoch": 0.6652444444444444, + "grad_norm": 0.3203381640603382, + "learning_rate": 5.3241625779745873e-05, + "loss": 0.5272, + "step": 3742 + }, + { + "epoch": 0.6654222222222222, + "grad_norm": 0.35403288824240003, + "learning_rate": 5.319073531199679e-05, + "loss": 0.5714, + "step": 3743 + }, + { + "epoch": 0.6656, + "grad_norm": 0.39248273989205185, + "learning_rate": 5.3139860363929996e-05, + "loss": 0.6162, + "step": 3744 + }, + { + "epoch": 0.6657777777777778, + "grad_norm": 0.4082558760683123, + "learning_rate": 5.3089000952413346e-05, + "loss": 0.6, + "step": 3745 + }, + { + "epoch": 0.6659555555555555, + "grad_norm": 0.33500367072715787, + "learning_rate": 5.303815709430918e-05, + "loss": 0.5613, + "step": 3746 + }, + { + "epoch": 0.6661333333333334, + "grad_norm": 0.3412212071133008, + "learning_rate": 5.298732880647502e-05, + "loss": 0.5969, + "step": 3747 + }, + { + "epoch": 0.6663111111111111, + "grad_norm": 0.3502787660500914, + "learning_rate": 5.29365161057629e-05, + "loss": 0.5725, + "step": 3748 + }, + { + "epoch": 0.6664888888888889, + "grad_norm": 0.3417881819264822, + "learning_rate": 5.2885719009020006e-05, + "loss": 0.572, + "step": 3749 + }, + { + "epoch": 0.6666666666666666, + "grad_norm": 0.4092137058679559, + "learning_rate": 5.283493753308808e-05, + "loss": 0.5675, + "step": 3750 + }, + { + "epoch": 0.6668444444444445, + "grad_norm": 0.35247479931877196, + "learning_rate": 5.278417169480383e-05, + "loss": 0.5846, + "step": 3751 + }, + { + "epoch": 0.6670222222222222, + "grad_norm": 0.3792257183861252, + "learning_rate": 5.273342151099874e-05, + "loss": 0.5847, + "step": 3752 + }, + { + "epoch": 0.6672, + "grad_norm": 0.3239661765215962, + "learning_rate": 5.268268699849912e-05, + "loss": 0.5244, + "step": 3753 + }, + { + "epoch": 0.6673777777777777, + "grad_norm": 0.3553481753323244, + "learning_rate": 5.263196817412608e-05, + "loss": 0.6102, + "step": 3754 + }, + { + "epoch": 0.6675555555555556, + "grad_norm": 0.3450954912095572, + "learning_rate": 5.2581265054695494e-05, + "loss": 0.5829, + "step": 3755 + }, + { + "epoch": 0.6677333333333333, + "grad_norm": 0.3296186380513435, + "learning_rate": 5.25305776570181e-05, + "loss": 0.583, + "step": 3756 + }, + { + "epoch": 0.6679111111111111, + "grad_norm": 0.37008182550415547, + "learning_rate": 5.247990599789935e-05, + "loss": 0.6049, + "step": 3757 + }, + { + "epoch": 0.6680888888888888, + "grad_norm": 0.40823817708212623, + "learning_rate": 5.2429250094139526e-05, + "loss": 0.5689, + "step": 3758 + }, + { + "epoch": 0.6682666666666667, + "grad_norm": 0.3812022744836966, + "learning_rate": 5.237860996253365e-05, + "loss": 0.6231, + "step": 3759 + }, + { + "epoch": 0.6684444444444444, + "grad_norm": 0.3704249487965545, + "learning_rate": 5.2327985619871555e-05, + "loss": 0.5828, + "step": 3760 + }, + { + "epoch": 0.6686222222222222, + "grad_norm": 0.3422141445527099, + "learning_rate": 5.2277377082937806e-05, + "loss": 0.5916, + "step": 3761 + }, + { + "epoch": 0.6688, + "grad_norm": 0.3685350400713173, + "learning_rate": 5.2226784368511735e-05, + "loss": 0.6045, + "step": 3762 + }, + { + "epoch": 0.6689777777777778, + "grad_norm": 0.367866697588512, + "learning_rate": 5.217620749336745e-05, + "loss": 0.5773, + "step": 3763 + }, + { + "epoch": 0.6691555555555555, + "grad_norm": 0.3587826585206924, + "learning_rate": 5.2125646474273785e-05, + "loss": 0.6054, + "step": 3764 + }, + { + "epoch": 0.6693333333333333, + "grad_norm": 0.37339954378788254, + "learning_rate": 5.207510132799436e-05, + "loss": 0.6345, + "step": 3765 + }, + { + "epoch": 0.6695111111111111, + "grad_norm": 0.3451301432072929, + "learning_rate": 5.202457207128736e-05, + "loss": 0.5562, + "step": 3766 + }, + { + "epoch": 0.6696888888888889, + "grad_norm": 0.42835158085976643, + "learning_rate": 5.1974058720906014e-05, + "loss": 0.5966, + "step": 3767 + }, + { + "epoch": 0.6698666666666667, + "grad_norm": 0.3423620134896019, + "learning_rate": 5.192356129359794e-05, + "loss": 0.5154, + "step": 3768 + }, + { + "epoch": 0.6700444444444444, + "grad_norm": 0.35299452449965724, + "learning_rate": 5.1873079806105785e-05, + "loss": 0.5207, + "step": 3769 + }, + { + "epoch": 0.6702222222222223, + "grad_norm": 0.3613850679716077, + "learning_rate": 5.1822614275166614e-05, + "loss": 0.6096, + "step": 3770 + }, + { + "epoch": 0.6704, + "grad_norm": 0.3642423032154287, + "learning_rate": 5.17721647175125e-05, + "loss": 0.6047, + "step": 3771 + }, + { + "epoch": 0.6705777777777778, + "grad_norm": 0.36754967202922495, + "learning_rate": 5.1721731149869925e-05, + "loss": 0.5757, + "step": 3772 + }, + { + "epoch": 0.6707555555555555, + "grad_norm": 0.3556164799296794, + "learning_rate": 5.1671313588960355e-05, + "loss": 0.5818, + "step": 3773 + }, + { + "epoch": 0.6709333333333334, + "grad_norm": 0.44022696041050036, + "learning_rate": 5.16209120514997e-05, + "loss": 0.5449, + "step": 3774 + }, + { + "epoch": 0.6711111111111111, + "grad_norm": 0.34366825152326763, + "learning_rate": 5.1570526554198704e-05, + "loss": 0.5346, + "step": 3775 + }, + { + "epoch": 0.6712888888888889, + "grad_norm": 0.3429518023848944, + "learning_rate": 5.152015711376274e-05, + "loss": 0.6139, + "step": 3776 + }, + { + "epoch": 0.6714666666666667, + "grad_norm": 0.36547213032738785, + "learning_rate": 5.146980374689192e-05, + "loss": 0.6601, + "step": 3777 + }, + { + "epoch": 0.6716444444444445, + "grad_norm": 0.3460266225059201, + "learning_rate": 5.141946647028092e-05, + "loss": 0.5377, + "step": 3778 + }, + { + "epoch": 0.6718222222222222, + "grad_norm": 0.35100353735664414, + "learning_rate": 5.136914530061917e-05, + "loss": 0.5794, + "step": 3779 + }, + { + "epoch": 0.672, + "grad_norm": 0.36933101402424406, + "learning_rate": 5.1318840254590725e-05, + "loss": 0.5673, + "step": 3780 + }, + { + "epoch": 0.6721777777777778, + "grad_norm": 0.35799785565842296, + "learning_rate": 5.1268551348874296e-05, + "loss": 0.5994, + "step": 3781 + }, + { + "epoch": 0.6723555555555556, + "grad_norm": 0.34980612555669427, + "learning_rate": 5.121827860014326e-05, + "loss": 0.596, + "step": 3782 + }, + { + "epoch": 0.6725333333333333, + "grad_norm": 0.3869567562483847, + "learning_rate": 5.11680220250656e-05, + "loss": 0.5737, + "step": 3783 + }, + { + "epoch": 0.6727111111111111, + "grad_norm": 0.40077405619198875, + "learning_rate": 5.111778164030396e-05, + "loss": 0.6231, + "step": 3784 + }, + { + "epoch": 0.6728888888888889, + "grad_norm": 0.33628240154867695, + "learning_rate": 5.106755746251565e-05, + "loss": 0.5866, + "step": 3785 + }, + { + "epoch": 0.6730666666666667, + "grad_norm": 0.38035745082408773, + "learning_rate": 5.101734950835253e-05, + "loss": 0.5619, + "step": 3786 + }, + { + "epoch": 0.6732444444444444, + "grad_norm": 0.36311862093378333, + "learning_rate": 5.0967157794461154e-05, + "loss": 0.6134, + "step": 3787 + }, + { + "epoch": 0.6734222222222223, + "grad_norm": 0.3421953430124754, + "learning_rate": 5.0916982337482644e-05, + "loss": 0.535, + "step": 3788 + }, + { + "epoch": 0.6736, + "grad_norm": 0.35218075396362114, + "learning_rate": 5.086682315405279e-05, + "loss": 0.6128, + "step": 3789 + }, + { + "epoch": 0.6737777777777778, + "grad_norm": 0.39903409978381105, + "learning_rate": 5.081668026080183e-05, + "loss": 0.6067, + "step": 3790 + }, + { + "epoch": 0.6739555555555555, + "grad_norm": 0.3435573961476874, + "learning_rate": 5.076655367435487e-05, + "loss": 0.523, + "step": 3791 + }, + { + "epoch": 0.6741333333333334, + "grad_norm": 0.3734677873865915, + "learning_rate": 5.071644341133131e-05, + "loss": 0.5682, + "step": 3792 + }, + { + "epoch": 0.6743111111111111, + "grad_norm": 0.32798848351480425, + "learning_rate": 5.066634948834541e-05, + "loss": 0.5562, + "step": 3793 + }, + { + "epoch": 0.6744888888888889, + "grad_norm": 0.34973305116801057, + "learning_rate": 5.061627192200575e-05, + "loss": 0.6369, + "step": 3794 + }, + { + "epoch": 0.6746666666666666, + "grad_norm": 0.39554832346531404, + "learning_rate": 5.0566210728915786e-05, + "loss": 0.5606, + "step": 3795 + }, + { + "epoch": 0.6748444444444445, + "grad_norm": 0.3503082660038528, + "learning_rate": 5.051616592567323e-05, + "loss": 0.58, + "step": 3796 + }, + { + "epoch": 0.6750222222222222, + "grad_norm": 0.5505857665869399, + "learning_rate": 5.046613752887064e-05, + "loss": 0.5632, + "step": 3797 + }, + { + "epoch": 0.6752, + "grad_norm": 0.32473678902894215, + "learning_rate": 5.041612555509492e-05, + "loss": 0.5579, + "step": 3798 + }, + { + "epoch": 0.6753777777777777, + "grad_norm": 0.35967215565695027, + "learning_rate": 5.0366130020927624e-05, + "loss": 0.6195, + "step": 3799 + }, + { + "epoch": 0.6755555555555556, + "grad_norm": 0.3631887712577227, + "learning_rate": 5.031615094294488e-05, + "loss": 0.6026, + "step": 3800 + }, + { + "epoch": 0.6757333333333333, + "grad_norm": 0.3459963302028815, + "learning_rate": 5.02661883377173e-05, + "loss": 0.5307, + "step": 3801 + }, + { + "epoch": 0.6759111111111111, + "grad_norm": 0.3828382104822761, + "learning_rate": 5.0216242221810075e-05, + "loss": 0.6169, + "step": 3802 + }, + { + "epoch": 0.6760888888888889, + "grad_norm": 0.379709160745068, + "learning_rate": 5.0166312611782916e-05, + "loss": 0.5668, + "step": 3803 + }, + { + "epoch": 0.6762666666666667, + "grad_norm": 0.34704142397707294, + "learning_rate": 5.011639952419005e-05, + "loss": 0.5558, + "step": 3804 + }, + { + "epoch": 0.6764444444444444, + "grad_norm": 0.36262162029044515, + "learning_rate": 5.0066502975580244e-05, + "loss": 0.5498, + "step": 3805 + }, + { + "epoch": 0.6766222222222222, + "grad_norm": 0.38016595918740326, + "learning_rate": 5.001662298249678e-05, + "loss": 0.618, + "step": 3806 + }, + { + "epoch": 0.6768, + "grad_norm": 0.35110390690577636, + "learning_rate": 4.9966759561477424e-05, + "loss": 0.5545, + "step": 3807 + }, + { + "epoch": 0.6769777777777778, + "grad_norm": 0.3617674763207987, + "learning_rate": 4.991691272905449e-05, + "loss": 0.6053, + "step": 3808 + }, + { + "epoch": 0.6771555555555555, + "grad_norm": 0.34578344893770513, + "learning_rate": 4.986708250175476e-05, + "loss": 0.574, + "step": 3809 + }, + { + "epoch": 0.6773333333333333, + "grad_norm": 0.34567593734131025, + "learning_rate": 4.981726889609952e-05, + "loss": 0.5964, + "step": 3810 + }, + { + "epoch": 0.6775111111111111, + "grad_norm": 0.36819427764691953, + "learning_rate": 4.976747192860456e-05, + "loss": 0.5875, + "step": 3811 + }, + { + "epoch": 0.6776888888888889, + "grad_norm": 0.34110912747908734, + "learning_rate": 4.971769161578013e-05, + "loss": 0.5572, + "step": 3812 + }, + { + "epoch": 0.6778666666666666, + "grad_norm": 0.35040782541782817, + "learning_rate": 4.9667927974131e-05, + "loss": 0.5744, + "step": 3813 + }, + { + "epoch": 0.6780444444444444, + "grad_norm": 0.3265403757922863, + "learning_rate": 4.9618181020156274e-05, + "loss": 0.5501, + "step": 3814 + }, + { + "epoch": 0.6782222222222222, + "grad_norm": 0.35891825557525664, + "learning_rate": 4.9568450770349775e-05, + "loss": 0.6172, + "step": 3815 + }, + { + "epoch": 0.6784, + "grad_norm": 0.34750698466013846, + "learning_rate": 4.9518737241199495e-05, + "loss": 0.5723, + "step": 3816 + }, + { + "epoch": 0.6785777777777777, + "grad_norm": 0.369180027545402, + "learning_rate": 4.9469040449188185e-05, + "loss": 0.5641, + "step": 3817 + }, + { + "epoch": 0.6787555555555556, + "grad_norm": 0.37708735468285687, + "learning_rate": 4.9419360410792745e-05, + "loss": 0.5615, + "step": 3818 + }, + { + "epoch": 0.6789333333333334, + "grad_norm": 0.4431329040018839, + "learning_rate": 4.936969714248481e-05, + "loss": 0.6027, + "step": 3819 + }, + { + "epoch": 0.6791111111111111, + "grad_norm": 0.3575795416823746, + "learning_rate": 4.932005066073014e-05, + "loss": 0.6085, + "step": 3820 + }, + { + "epoch": 0.6792888888888889, + "grad_norm": 0.3424091399815592, + "learning_rate": 4.9270420981989294e-05, + "loss": 0.5758, + "step": 3821 + }, + { + "epoch": 0.6794666666666667, + "grad_norm": 0.3810216072083059, + "learning_rate": 4.9220808122716924e-05, + "loss": 0.5942, + "step": 3822 + }, + { + "epoch": 0.6796444444444445, + "grad_norm": 0.37531728510793283, + "learning_rate": 4.91712120993623e-05, + "loss": 0.5483, + "step": 3823 + }, + { + "epoch": 0.6798222222222222, + "grad_norm": 0.3517852213212341, + "learning_rate": 4.912163292836903e-05, + "loss": 0.582, + "step": 3824 + }, + { + "epoch": 0.68, + "grad_norm": 0.36090366339817603, + "learning_rate": 4.9072070626175203e-05, + "loss": 0.5374, + "step": 3825 + }, + { + "epoch": 0.6801777777777778, + "grad_norm": 0.3478217412734873, + "learning_rate": 4.9022525209213264e-05, + "loss": 0.5559, + "step": 3826 + }, + { + "epoch": 0.6803555555555556, + "grad_norm": 0.34398676563975056, + "learning_rate": 4.8972996693910054e-05, + "loss": 0.5355, + "step": 3827 + }, + { + "epoch": 0.6805333333333333, + "grad_norm": 0.37694958802860384, + "learning_rate": 4.892348509668684e-05, + "loss": 0.5887, + "step": 3828 + }, + { + "epoch": 0.6807111111111112, + "grad_norm": 0.37462673336384417, + "learning_rate": 4.887399043395927e-05, + "loss": 0.6418, + "step": 3829 + }, + { + "epoch": 0.6808888888888889, + "grad_norm": 0.3384279319810809, + "learning_rate": 4.882451272213736e-05, + "loss": 0.5602, + "step": 3830 + }, + { + "epoch": 0.6810666666666667, + "grad_norm": 0.3532464607557794, + "learning_rate": 4.877505197762553e-05, + "loss": 0.5492, + "step": 3831 + }, + { + "epoch": 0.6812444444444444, + "grad_norm": 0.35755721729900775, + "learning_rate": 4.872560821682256e-05, + "loss": 0.5708, + "step": 3832 + }, + { + "epoch": 0.6814222222222223, + "grad_norm": 0.3786332812544826, + "learning_rate": 4.867618145612162e-05, + "loss": 0.565, + "step": 3833 + }, + { + "epoch": 0.6816, + "grad_norm": 0.3385648540747402, + "learning_rate": 4.86267717119102e-05, + "loss": 0.5839, + "step": 3834 + }, + { + "epoch": 0.6817777777777778, + "grad_norm": 0.37030014327537, + "learning_rate": 4.85773790005702e-05, + "loss": 0.6042, + "step": 3835 + }, + { + "epoch": 0.6819555555555555, + "grad_norm": 0.34660154424499623, + "learning_rate": 4.8528003338477823e-05, + "loss": 0.5794, + "step": 3836 + }, + { + "epoch": 0.6821333333333334, + "grad_norm": 0.3633376712867026, + "learning_rate": 4.847864474200371e-05, + "loss": 0.5519, + "step": 3837 + }, + { + "epoch": 0.6823111111111111, + "grad_norm": 0.35660589921265107, + "learning_rate": 4.8429303227512645e-05, + "loss": 0.5858, + "step": 3838 + }, + { + "epoch": 0.6824888888888889, + "grad_norm": 0.3393857504319453, + "learning_rate": 4.837997881136404e-05, + "loss": 0.57, + "step": 3839 + }, + { + "epoch": 0.6826666666666666, + "grad_norm": 0.3301556350550329, + "learning_rate": 4.833067150991133e-05, + "loss": 0.5301, + "step": 3840 + }, + { + "epoch": 0.6828444444444445, + "grad_norm": 0.36474614368015157, + "learning_rate": 4.8281381339502565e-05, + "loss": 0.5639, + "step": 3841 + }, + { + "epoch": 0.6830222222222222, + "grad_norm": 0.3401723902768643, + "learning_rate": 4.823210831647984e-05, + "loss": 0.5693, + "step": 3842 + }, + { + "epoch": 0.6832, + "grad_norm": 0.36054270662523535, + "learning_rate": 4.818285245717984e-05, + "loss": 0.5973, + "step": 3843 + }, + { + "epoch": 0.6833777777777778, + "grad_norm": 0.3694129586838312, + "learning_rate": 4.813361377793327e-05, + "loss": 0.5933, + "step": 3844 + }, + { + "epoch": 0.6835555555555556, + "grad_norm": 0.34642722604450854, + "learning_rate": 4.808439229506546e-05, + "loss": 0.6042, + "step": 3845 + }, + { + "epoch": 0.6837333333333333, + "grad_norm": 0.36260310020770314, + "learning_rate": 4.8035188024895685e-05, + "loss": 0.5615, + "step": 3846 + }, + { + "epoch": 0.6839111111111111, + "grad_norm": 0.35158156525650475, + "learning_rate": 4.7986000983737856e-05, + "loss": 0.6045, + "step": 3847 + }, + { + "epoch": 0.6840888888888889, + "grad_norm": 0.3440241194751509, + "learning_rate": 4.793683118789991e-05, + "loss": 0.4972, + "step": 3848 + }, + { + "epoch": 0.6842666666666667, + "grad_norm": 0.3323429868860486, + "learning_rate": 4.7887678653684184e-05, + "loss": 0.5597, + "step": 3849 + }, + { + "epoch": 0.6844444444444444, + "grad_norm": 0.3521174206342374, + "learning_rate": 4.783854339738729e-05, + "loss": 0.5585, + "step": 3850 + }, + { + "epoch": 0.6846222222222222, + "grad_norm": 0.3647623603677191, + "learning_rate": 4.7789425435300107e-05, + "loss": 0.599, + "step": 3851 + }, + { + "epoch": 0.6848, + "grad_norm": 0.34949764487443796, + "learning_rate": 4.7740324783707734e-05, + "loss": 0.5994, + "step": 3852 + }, + { + "epoch": 0.6849777777777778, + "grad_norm": 0.3540874873670466, + "learning_rate": 4.76912414588896e-05, + "loss": 0.5629, + "step": 3853 + }, + { + "epoch": 0.6851555555555555, + "grad_norm": 0.36175317590547346, + "learning_rate": 4.764217547711934e-05, + "loss": 0.6116, + "step": 3854 + }, + { + "epoch": 0.6853333333333333, + "grad_norm": 0.35043526832572747, + "learning_rate": 4.759312685466486e-05, + "loss": 0.5409, + "step": 3855 + }, + { + "epoch": 0.6855111111111111, + "grad_norm": 0.34570341800457427, + "learning_rate": 4.75440956077883e-05, + "loss": 0.6085, + "step": 3856 + }, + { + "epoch": 0.6856888888888889, + "grad_norm": 0.3660548286797606, + "learning_rate": 4.749508175274605e-05, + "loss": 0.5927, + "step": 3857 + }, + { + "epoch": 0.6858666666666666, + "grad_norm": 0.3649511403309351, + "learning_rate": 4.7446085305788725e-05, + "loss": 0.5539, + "step": 3858 + }, + { + "epoch": 0.6860444444444445, + "grad_norm": 0.3534702171556326, + "learning_rate": 4.7397106283161166e-05, + "loss": 0.5525, + "step": 3859 + }, + { + "epoch": 0.6862222222222222, + "grad_norm": 0.34427844562878507, + "learning_rate": 4.734814470110244e-05, + "loss": 0.5718, + "step": 3860 + }, + { + "epoch": 0.6864, + "grad_norm": 0.37661405409639365, + "learning_rate": 4.729920057584584e-05, + "loss": 0.5797, + "step": 3861 + }, + { + "epoch": 0.6865777777777777, + "grad_norm": 0.3606437950815262, + "learning_rate": 4.725027392361887e-05, + "loss": 0.573, + "step": 3862 + }, + { + "epoch": 0.6867555555555556, + "grad_norm": 0.35634022099202656, + "learning_rate": 4.7201364760643264e-05, + "loss": 0.5706, + "step": 3863 + }, + { + "epoch": 0.6869333333333333, + "grad_norm": 0.3534861640952797, + "learning_rate": 4.715247310313482e-05, + "loss": 0.5837, + "step": 3864 + }, + { + "epoch": 0.6871111111111111, + "grad_norm": 0.3553531452104539, + "learning_rate": 4.710359896730379e-05, + "loss": 0.5756, + "step": 3865 + }, + { + "epoch": 0.6872888888888888, + "grad_norm": 0.3823974292421038, + "learning_rate": 4.7054742369354324e-05, + "loss": 0.5861, + "step": 3866 + }, + { + "epoch": 0.6874666666666667, + "grad_norm": 0.3760494487706127, + "learning_rate": 4.700590332548503e-05, + "loss": 0.5547, + "step": 3867 + }, + { + "epoch": 0.6876444444444444, + "grad_norm": 0.371549022417296, + "learning_rate": 4.695708185188844e-05, + "loss": 0.5946, + "step": 3868 + }, + { + "epoch": 0.6878222222222222, + "grad_norm": 0.3615090859357574, + "learning_rate": 4.690827796475152e-05, + "loss": 0.5449, + "step": 3869 + }, + { + "epoch": 0.688, + "grad_norm": 0.35782069921441423, + "learning_rate": 4.685949168025514e-05, + "loss": 0.6002, + "step": 3870 + }, + { + "epoch": 0.6881777777777778, + "grad_norm": 0.36235867684088197, + "learning_rate": 4.681072301457461e-05, + "loss": 0.5379, + "step": 3871 + }, + { + "epoch": 0.6883555555555556, + "grad_norm": 0.35585365070286906, + "learning_rate": 4.676197198387913e-05, + "loss": 0.6212, + "step": 3872 + }, + { + "epoch": 0.6885333333333333, + "grad_norm": 0.37002129824452296, + "learning_rate": 4.671323860433222e-05, + "loss": 0.6297, + "step": 3873 + }, + { + "epoch": 0.6887111111111112, + "grad_norm": 0.3633301309645365, + "learning_rate": 4.666452289209152e-05, + "loss": 0.5547, + "step": 3874 + }, + { + "epoch": 0.6888888888888889, + "grad_norm": 0.3534253713893809, + "learning_rate": 4.661582486330879e-05, + "loss": 0.5618, + "step": 3875 + }, + { + "epoch": 0.6890666666666667, + "grad_norm": 0.3593716800161438, + "learning_rate": 4.656714453412993e-05, + "loss": 0.5702, + "step": 3876 + }, + { + "epoch": 0.6892444444444444, + "grad_norm": 0.356579718553429, + "learning_rate": 4.651848192069498e-05, + "loss": 0.5589, + "step": 3877 + }, + { + "epoch": 0.6894222222222223, + "grad_norm": 0.3491130956442169, + "learning_rate": 4.64698370391381e-05, + "loss": 0.6025, + "step": 3878 + }, + { + "epoch": 0.6896, + "grad_norm": 0.35685189397008116, + "learning_rate": 4.642120990558758e-05, + "loss": 0.5465, + "step": 3879 + }, + { + "epoch": 0.6897777777777778, + "grad_norm": 0.34426661670923675, + "learning_rate": 4.637260053616581e-05, + "loss": 0.5827, + "step": 3880 + }, + { + "epoch": 0.6899555555555555, + "grad_norm": 0.3515706163716907, + "learning_rate": 4.6324008946989314e-05, + "loss": 0.5887, + "step": 3881 + }, + { + "epoch": 0.6901333333333334, + "grad_norm": 0.35541335364086346, + "learning_rate": 4.62754351541687e-05, + "loss": 0.5735, + "step": 3882 + }, + { + "epoch": 0.6903111111111111, + "grad_norm": 0.37546258532550886, + "learning_rate": 4.622687917380868e-05, + "loss": 0.5746, + "step": 3883 + }, + { + "epoch": 0.6904888888888889, + "grad_norm": 0.34909350657030414, + "learning_rate": 4.6178341022008054e-05, + "loss": 0.5477, + "step": 3884 + }, + { + "epoch": 0.6906666666666667, + "grad_norm": 0.35697970341341345, + "learning_rate": 4.612982071485974e-05, + "loss": 0.5839, + "step": 3885 + }, + { + "epoch": 0.6908444444444445, + "grad_norm": 0.34245040410489874, + "learning_rate": 4.60813182684507e-05, + "loss": 0.576, + "step": 3886 + }, + { + "epoch": 0.6910222222222222, + "grad_norm": 0.35896187582512795, + "learning_rate": 4.6032833698862044e-05, + "loss": 0.5917, + "step": 3887 + }, + { + "epoch": 0.6912, + "grad_norm": 0.3479823427169532, + "learning_rate": 4.5984367022168786e-05, + "loss": 0.6126, + "step": 3888 + }, + { + "epoch": 0.6913777777777778, + "grad_norm": 0.36654176836804997, + "learning_rate": 4.593591825444028e-05, + "loss": 0.6102, + "step": 3889 + }, + { + "epoch": 0.6915555555555556, + "grad_norm": 0.3496069876103817, + "learning_rate": 4.588748741173963e-05, + "loss": 0.5949, + "step": 3890 + }, + { + "epoch": 0.6917333333333333, + "grad_norm": 0.3504383001973791, + "learning_rate": 4.5839074510124314e-05, + "loss": 0.5858, + "step": 3891 + }, + { + "epoch": 0.6919111111111111, + "grad_norm": 0.33586675396245075, + "learning_rate": 4.5790679565645544e-05, + "loss": 0.5928, + "step": 3892 + }, + { + "epoch": 0.6920888888888889, + "grad_norm": 0.36542453642785044, + "learning_rate": 4.5742302594348894e-05, + "loss": 0.6056, + "step": 3893 + }, + { + "epoch": 0.6922666666666667, + "grad_norm": 0.36214877658285244, + "learning_rate": 4.569394361227367e-05, + "loss": 0.5848, + "step": 3894 + }, + { + "epoch": 0.6924444444444444, + "grad_norm": 0.36339216006050884, + "learning_rate": 4.564560263545351e-05, + "loss": 0.6122, + "step": 3895 + }, + { + "epoch": 0.6926222222222223, + "grad_norm": 0.3591363101456611, + "learning_rate": 4.559727967991584e-05, + "loss": 0.6134, + "step": 3896 + }, + { + "epoch": 0.6928, + "grad_norm": 0.3425321461841997, + "learning_rate": 4.554897476168223e-05, + "loss": 0.6014, + "step": 3897 + }, + { + "epoch": 0.6929777777777778, + "grad_norm": 0.34664678541799615, + "learning_rate": 4.5500687896768256e-05, + "loss": 0.5988, + "step": 3898 + }, + { + "epoch": 0.6931555555555555, + "grad_norm": 0.34113760838225743, + "learning_rate": 4.54524191011835e-05, + "loss": 0.5651, + "step": 3899 + }, + { + "epoch": 0.6933333333333334, + "grad_norm": 0.35157119209602583, + "learning_rate": 4.540416839093156e-05, + "loss": 0.5755, + "step": 3900 + }, + { + "epoch": 0.6935111111111111, + "grad_norm": 0.44879573349370927, + "learning_rate": 4.5355935782010015e-05, + "loss": 0.5823, + "step": 3901 + }, + { + "epoch": 0.6936888888888889, + "grad_norm": 0.3615969911867251, + "learning_rate": 4.5307721290410475e-05, + "loss": 0.5743, + "step": 3902 + }, + { + "epoch": 0.6938666666666666, + "grad_norm": 0.3869661107969985, + "learning_rate": 4.5259524932118526e-05, + "loss": 0.579, + "step": 3903 + }, + { + "epoch": 0.6940444444444445, + "grad_norm": 0.3722069911520766, + "learning_rate": 4.521134672311373e-05, + "loss": 0.5587, + "step": 3904 + }, + { + "epoch": 0.6942222222222222, + "grad_norm": 0.34651584775074395, + "learning_rate": 4.516318667936967e-05, + "loss": 0.5768, + "step": 3905 + }, + { + "epoch": 0.6944, + "grad_norm": 0.3624580281710354, + "learning_rate": 4.511504481685386e-05, + "loss": 0.5995, + "step": 3906 + }, + { + "epoch": 0.6945777777777777, + "grad_norm": 0.354355033707949, + "learning_rate": 4.5066921151527816e-05, + "loss": 0.588, + "step": 3907 + }, + { + "epoch": 0.6947555555555556, + "grad_norm": 0.3511116105009167, + "learning_rate": 4.5018815699347004e-05, + "loss": 0.577, + "step": 3908 + }, + { + "epoch": 0.6949333333333333, + "grad_norm": 0.47298146572786093, + "learning_rate": 4.497072847626087e-05, + "loss": 0.6052, + "step": 3909 + }, + { + "epoch": 0.6951111111111111, + "grad_norm": 0.35753521927971, + "learning_rate": 4.4922659498212796e-05, + "loss": 0.5675, + "step": 3910 + }, + { + "epoch": 0.6952888888888888, + "grad_norm": 0.36180816973757196, + "learning_rate": 4.487460878114017e-05, + "loss": 0.5889, + "step": 3911 + }, + { + "epoch": 0.6954666666666667, + "grad_norm": 0.36529679553091143, + "learning_rate": 4.482657634097416e-05, + "loss": 0.5563, + "step": 3912 + }, + { + "epoch": 0.6956444444444444, + "grad_norm": 0.3435383267586004, + "learning_rate": 4.477856219364015e-05, + "loss": 0.584, + "step": 3913 + }, + { + "epoch": 0.6958222222222222, + "grad_norm": 0.36156980011748857, + "learning_rate": 4.4730566355057145e-05, + "loss": 0.6063, + "step": 3914 + }, + { + "epoch": 0.696, + "grad_norm": 0.3649794175465803, + "learning_rate": 4.4682588841138396e-05, + "loss": 0.5851, + "step": 3915 + }, + { + "epoch": 0.6961777777777778, + "grad_norm": 0.3438709227117961, + "learning_rate": 4.4634629667790774e-05, + "loss": 0.5718, + "step": 3916 + }, + { + "epoch": 0.6963555555555555, + "grad_norm": 0.34980101432518595, + "learning_rate": 4.4586688850915345e-05, + "loss": 0.5796, + "step": 3917 + }, + { + "epoch": 0.6965333333333333, + "grad_norm": 0.42776970744186266, + "learning_rate": 4.453876640640684e-05, + "loss": 0.6401, + "step": 3918 + }, + { + "epoch": 0.6967111111111111, + "grad_norm": 0.3463354402996292, + "learning_rate": 4.449086235015414e-05, + "loss": 0.5768, + "step": 3919 + }, + { + "epoch": 0.6968888888888889, + "grad_norm": 0.34704208506945794, + "learning_rate": 4.444297669803981e-05, + "loss": 0.6196, + "step": 3920 + }, + { + "epoch": 0.6970666666666666, + "grad_norm": 0.3653213905469017, + "learning_rate": 4.43951094659404e-05, + "loss": 0.5632, + "step": 3921 + }, + { + "epoch": 0.6972444444444444, + "grad_norm": 0.33422460568468454, + "learning_rate": 4.434726066972649e-05, + "loss": 0.5462, + "step": 3922 + }, + { + "epoch": 0.6974222222222223, + "grad_norm": 0.37515661882135604, + "learning_rate": 4.429943032526225e-05, + "loss": 0.5945, + "step": 3923 + }, + { + "epoch": 0.6976, + "grad_norm": 0.33298265892550527, + "learning_rate": 4.4251618448406073e-05, + "loss": 0.5896, + "step": 3924 + }, + { + "epoch": 0.6977777777777778, + "grad_norm": 0.34416117702092097, + "learning_rate": 4.42038250550099e-05, + "loss": 0.5701, + "step": 3925 + }, + { + "epoch": 0.6979555555555556, + "grad_norm": 0.34864441108067806, + "learning_rate": 4.415605016091985e-05, + "loss": 0.5772, + "step": 3926 + }, + { + "epoch": 0.6981333333333334, + "grad_norm": 0.32104961212294114, + "learning_rate": 4.410829378197562e-05, + "loss": 0.5817, + "step": 3927 + }, + { + "epoch": 0.6983111111111111, + "grad_norm": 0.34796075997418663, + "learning_rate": 4.406055593401104e-05, + "loss": 0.5426, + "step": 3928 + }, + { + "epoch": 0.6984888888888889, + "grad_norm": 0.3360833218633838, + "learning_rate": 4.401283663285355e-05, + "loss": 0.5557, + "step": 3929 + }, + { + "epoch": 0.6986666666666667, + "grad_norm": 0.35748969097689026, + "learning_rate": 4.396513589432467e-05, + "loss": 0.583, + "step": 3930 + }, + { + "epoch": 0.6988444444444445, + "grad_norm": 0.38150841739853314, + "learning_rate": 4.3917453734239566e-05, + "loss": 0.5926, + "step": 3931 + }, + { + "epoch": 0.6990222222222222, + "grad_norm": 0.38293614438182194, + "learning_rate": 4.386979016840735e-05, + "loss": 0.5796, + "step": 3932 + }, + { + "epoch": 0.6992, + "grad_norm": 0.3560881302825979, + "learning_rate": 4.3822145212630964e-05, + "loss": 0.5847, + "step": 3933 + }, + { + "epoch": 0.6993777777777778, + "grad_norm": 0.34797452041665655, + "learning_rate": 4.377451888270715e-05, + "loss": 0.5607, + "step": 3934 + }, + { + "epoch": 0.6995555555555556, + "grad_norm": 0.37016955208761415, + "learning_rate": 4.37269111944265e-05, + "loss": 0.5989, + "step": 3935 + }, + { + "epoch": 0.6997333333333333, + "grad_norm": 0.35275258352121736, + "learning_rate": 4.367932216357342e-05, + "loss": 0.5698, + "step": 3936 + }, + { + "epoch": 0.6999111111111112, + "grad_norm": 0.39192577891409336, + "learning_rate": 4.363175180592611e-05, + "loss": 0.6282, + "step": 3937 + }, + { + "epoch": 0.7000888888888889, + "grad_norm": 0.3978897026500953, + "learning_rate": 4.35842001372566e-05, + "loss": 0.557, + "step": 3938 + }, + { + "epoch": 0.7002666666666667, + "grad_norm": 0.40034566690856965, + "learning_rate": 4.3536667173330726e-05, + "loss": 0.5879, + "step": 3939 + }, + { + "epoch": 0.7004444444444444, + "grad_norm": 0.3303189964398963, + "learning_rate": 4.348915292990809e-05, + "loss": 0.5353, + "step": 3940 + }, + { + "epoch": 0.7006222222222223, + "grad_norm": 0.3399606245816993, + "learning_rate": 4.344165742274215e-05, + "loss": 0.561, + "step": 3941 + }, + { + "epoch": 0.7008, + "grad_norm": 0.3678808617997972, + "learning_rate": 4.339418066758008e-05, + "loss": 0.6164, + "step": 3942 + }, + { + "epoch": 0.7009777777777778, + "grad_norm": 0.3553190925477395, + "learning_rate": 4.334672268016288e-05, + "loss": 0.5794, + "step": 3943 + }, + { + "epoch": 0.7011555555555555, + "grad_norm": 0.338682014565456, + "learning_rate": 4.3299283476225315e-05, + "loss": 0.5305, + "step": 3944 + }, + { + "epoch": 0.7013333333333334, + "grad_norm": 0.35077909335599466, + "learning_rate": 4.325186307149593e-05, + "loss": 0.6154, + "step": 3945 + }, + { + "epoch": 0.7015111111111111, + "grad_norm": 0.341559459947685, + "learning_rate": 4.320446148169707e-05, + "loss": 0.5657, + "step": 3946 + }, + { + "epoch": 0.7016888888888889, + "grad_norm": 0.34594671246955, + "learning_rate": 4.3157078722544685e-05, + "loss": 0.5702, + "step": 3947 + }, + { + "epoch": 0.7018666666666666, + "grad_norm": 0.34414227911143863, + "learning_rate": 4.310971480974875e-05, + "loss": 0.5798, + "step": 3948 + }, + { + "epoch": 0.7020444444444445, + "grad_norm": 0.3796641230876599, + "learning_rate": 4.30623697590127e-05, + "loss": 0.5896, + "step": 3949 + }, + { + "epoch": 0.7022222222222222, + "grad_norm": 0.3622368752336497, + "learning_rate": 4.301504358603401e-05, + "loss": 0.573, + "step": 3950 + }, + { + "epoch": 0.7024, + "grad_norm": 0.337223851700592, + "learning_rate": 4.296773630650358e-05, + "loss": 0.5236, + "step": 3951 + }, + { + "epoch": 0.7025777777777777, + "grad_norm": 0.3458866641583537, + "learning_rate": 4.292044793610637e-05, + "loss": 0.5643, + "step": 3952 + }, + { + "epoch": 0.7027555555555556, + "grad_norm": 0.34992414132112304, + "learning_rate": 4.287317849052075e-05, + "loss": 0.576, + "step": 3953 + }, + { + "epoch": 0.7029333333333333, + "grad_norm": 0.36262374160369426, + "learning_rate": 4.2825927985419144e-05, + "loss": 0.5958, + "step": 3954 + }, + { + "epoch": 0.7031111111111111, + "grad_norm": 0.35643758035536016, + "learning_rate": 4.2778696436467404e-05, + "loss": 0.5541, + "step": 3955 + }, + { + "epoch": 0.7032888888888889, + "grad_norm": 0.38528338069749735, + "learning_rate": 4.273148385932526e-05, + "loss": 0.5869, + "step": 3956 + }, + { + "epoch": 0.7034666666666667, + "grad_norm": 0.3627013274430368, + "learning_rate": 4.268429026964611e-05, + "loss": 0.6232, + "step": 3957 + }, + { + "epoch": 0.7036444444444444, + "grad_norm": 0.3564755402380381, + "learning_rate": 4.263711568307707e-05, + "loss": 0.595, + "step": 3958 + }, + { + "epoch": 0.7038222222222222, + "grad_norm": 0.414100561482083, + "learning_rate": 4.258996011525893e-05, + "loss": 0.5607, + "step": 3959 + }, + { + "epoch": 0.704, + "grad_norm": 0.34566607212314415, + "learning_rate": 4.2542823581826205e-05, + "loss": 0.593, + "step": 3960 + }, + { + "epoch": 0.7041777777777778, + "grad_norm": 0.35451448279944625, + "learning_rate": 4.2495706098407085e-05, + "loss": 0.588, + "step": 3961 + }, + { + "epoch": 0.7043555555555555, + "grad_norm": 0.36541964699174234, + "learning_rate": 4.244860768062343e-05, + "loss": 0.6064, + "step": 3962 + }, + { + "epoch": 0.7045333333333333, + "grad_norm": 0.32826248336139785, + "learning_rate": 4.2401528344090804e-05, + "loss": 0.5537, + "step": 3963 + }, + { + "epoch": 0.7047111111111111, + "grad_norm": 0.3385579201298541, + "learning_rate": 4.235446810441841e-05, + "loss": 0.5438, + "step": 3964 + }, + { + "epoch": 0.7048888888888889, + "grad_norm": 0.3766815200629706, + "learning_rate": 4.2307426977209164e-05, + "loss": 0.5692, + "step": 3965 + }, + { + "epoch": 0.7050666666666666, + "grad_norm": 0.35430660682312387, + "learning_rate": 4.226040497805962e-05, + "loss": 0.5467, + "step": 3966 + }, + { + "epoch": 0.7052444444444445, + "grad_norm": 0.3854060841328241, + "learning_rate": 4.2213402122559986e-05, + "loss": 0.5578, + "step": 3967 + }, + { + "epoch": 0.7054222222222222, + "grad_norm": 0.36729374311813695, + "learning_rate": 4.216641842629413e-05, + "loss": 0.5559, + "step": 3968 + }, + { + "epoch": 0.7056, + "grad_norm": 0.3453875029217099, + "learning_rate": 4.2119453904839565e-05, + "loss": 0.553, + "step": 3969 + }, + { + "epoch": 0.7057777777777777, + "grad_norm": 0.3455780184007163, + "learning_rate": 4.20725085737675e-05, + "loss": 0.5616, + "step": 3970 + }, + { + "epoch": 0.7059555555555556, + "grad_norm": 0.40194484512667955, + "learning_rate": 4.202558244864261e-05, + "loss": 0.565, + "step": 3971 + }, + { + "epoch": 0.7061333333333333, + "grad_norm": 0.38172898189925675, + "learning_rate": 4.197867554502347e-05, + "loss": 0.5999, + "step": 3972 + }, + { + "epoch": 0.7063111111111111, + "grad_norm": 0.4468107895110227, + "learning_rate": 4.193178787846198e-05, + "loss": 0.6152, + "step": 3973 + }, + { + "epoch": 0.7064888888888889, + "grad_norm": 0.3499827521892334, + "learning_rate": 4.188491946450398e-05, + "loss": 0.5905, + "step": 3974 + }, + { + "epoch": 0.7066666666666667, + "grad_norm": 0.349635214945576, + "learning_rate": 4.1838070318688604e-05, + "loss": 0.5308, + "step": 3975 + }, + { + "epoch": 0.7068444444444445, + "grad_norm": 0.38503473700201507, + "learning_rate": 4.1791240456548905e-05, + "loss": 0.5635, + "step": 3976 + }, + { + "epoch": 0.7070222222222222, + "grad_norm": 0.33552138655392005, + "learning_rate": 4.174442989361126e-05, + "loss": 0.5559, + "step": 3977 + }, + { + "epoch": 0.7072, + "grad_norm": 0.36321069238589176, + "learning_rate": 4.169763864539591e-05, + "loss": 0.6015, + "step": 3978 + }, + { + "epoch": 0.7073777777777778, + "grad_norm": 0.36501739700508484, + "learning_rate": 4.165086672741647e-05, + "loss": 0.5872, + "step": 3979 + }, + { + "epoch": 0.7075555555555556, + "grad_norm": 0.35122587976977737, + "learning_rate": 4.160411415518026e-05, + "loss": 0.5476, + "step": 3980 + }, + { + "epoch": 0.7077333333333333, + "grad_norm": 0.35213675073648626, + "learning_rate": 4.1557380944188184e-05, + "loss": 0.549, + "step": 3981 + }, + { + "epoch": 0.7079111111111112, + "grad_norm": 0.3504046998050794, + "learning_rate": 4.15106671099347e-05, + "loss": 0.5597, + "step": 3982 + }, + { + "epoch": 0.7080888888888889, + "grad_norm": 0.3663624151216087, + "learning_rate": 4.1463972667907845e-05, + "loss": 0.5719, + "step": 3983 + }, + { + "epoch": 0.7082666666666667, + "grad_norm": 0.35311120741861096, + "learning_rate": 4.141729763358925e-05, + "loss": 0.5466, + "step": 3984 + }, + { + "epoch": 0.7084444444444444, + "grad_norm": 0.335107702821768, + "learning_rate": 4.137064202245407e-05, + "loss": 0.5842, + "step": 3985 + }, + { + "epoch": 0.7086222222222223, + "grad_norm": 0.3604053414337539, + "learning_rate": 4.132400584997106e-05, + "loss": 0.5994, + "step": 3986 + }, + { + "epoch": 0.7088, + "grad_norm": 0.3720568033523875, + "learning_rate": 4.1277389131602495e-05, + "loss": 0.5714, + "step": 3987 + }, + { + "epoch": 0.7089777777777778, + "grad_norm": 0.3900155621358108, + "learning_rate": 4.123079188280424e-05, + "loss": 0.5968, + "step": 3988 + }, + { + "epoch": 0.7091555555555555, + "grad_norm": 0.34689047078560853, + "learning_rate": 4.1184214119025676e-05, + "loss": 0.5679, + "step": 3989 + }, + { + "epoch": 0.7093333333333334, + "grad_norm": 0.3683538834004813, + "learning_rate": 4.1137655855709723e-05, + "loss": 0.5762, + "step": 3990 + }, + { + "epoch": 0.7095111111111111, + "grad_norm": 0.35717293467111283, + "learning_rate": 4.1091117108292854e-05, + "loss": 0.5982, + "step": 3991 + }, + { + "epoch": 0.7096888888888889, + "grad_norm": 0.3564725434720055, + "learning_rate": 4.104459789220506e-05, + "loss": 0.5981, + "step": 3992 + }, + { + "epoch": 0.7098666666666666, + "grad_norm": 0.35998229520275443, + "learning_rate": 4.099809822286984e-05, + "loss": 0.596, + "step": 3993 + }, + { + "epoch": 0.7100444444444445, + "grad_norm": 0.35842618817594113, + "learning_rate": 4.095161811570429e-05, + "loss": 0.618, + "step": 3994 + }, + { + "epoch": 0.7102222222222222, + "grad_norm": 0.36539750101547136, + "learning_rate": 4.090515758611884e-05, + "loss": 0.5847, + "step": 3995 + }, + { + "epoch": 0.7104, + "grad_norm": 0.37946245521047917, + "learning_rate": 4.085871664951769e-05, + "loss": 0.5677, + "step": 3996 + }, + { + "epoch": 0.7105777777777778, + "grad_norm": 0.35822125396244453, + "learning_rate": 4.081229532129827e-05, + "loss": 0.5778, + "step": 3997 + }, + { + "epoch": 0.7107555555555556, + "grad_norm": 0.3328832917499689, + "learning_rate": 4.076589361685177e-05, + "loss": 0.539, + "step": 3998 + }, + { + "epoch": 0.7109333333333333, + "grad_norm": 0.39820749728525684, + "learning_rate": 4.0719511551562606e-05, + "loss": 0.5774, + "step": 3999 + }, + { + "epoch": 0.7111111111111111, + "grad_norm": 0.3422717473390323, + "learning_rate": 4.067314914080898e-05, + "loss": 0.5835, + "step": 4000 + }, + { + "epoch": 0.7112888888888889, + "grad_norm": 0.352775396067291, + "learning_rate": 4.062680639996225e-05, + "loss": 0.5528, + "step": 4001 + }, + { + "epoch": 0.7114666666666667, + "grad_norm": 0.3411915180281605, + "learning_rate": 4.0580483344387586e-05, + "loss": 0.5491, + "step": 4002 + }, + { + "epoch": 0.7116444444444444, + "grad_norm": 0.37745777867250085, + "learning_rate": 4.053417998944331e-05, + "loss": 0.5865, + "step": 4003 + }, + { + "epoch": 0.7118222222222222, + "grad_norm": 0.3457036361204248, + "learning_rate": 4.048789635048154e-05, + "loss": 0.6043, + "step": 4004 + }, + { + "epoch": 0.712, + "grad_norm": 0.34575881214027726, + "learning_rate": 4.044163244284753e-05, + "loss": 0.5889, + "step": 4005 + }, + { + "epoch": 0.7121777777777778, + "grad_norm": 0.3816693732527338, + "learning_rate": 4.039538828188023e-05, + "loss": 0.5362, + "step": 4006 + }, + { + "epoch": 0.7123555555555555, + "grad_norm": 0.3830993084749753, + "learning_rate": 4.0349163882911944e-05, + "loss": 0.6239, + "step": 4007 + }, + { + "epoch": 0.7125333333333334, + "grad_norm": 0.3838765996228099, + "learning_rate": 4.030295926126845e-05, + "loss": 0.5719, + "step": 4008 + }, + { + "epoch": 0.7127111111111111, + "grad_norm": 0.3442245500074592, + "learning_rate": 4.025677443226894e-05, + "loss": 0.5474, + "step": 4009 + }, + { + "epoch": 0.7128888888888889, + "grad_norm": 0.36384519083507694, + "learning_rate": 4.0210609411226075e-05, + "loss": 0.5823, + "step": 4010 + }, + { + "epoch": 0.7130666666666666, + "grad_norm": 0.3468376114904197, + "learning_rate": 4.016446421344594e-05, + "loss": 0.5713, + "step": 4011 + }, + { + "epoch": 0.7132444444444445, + "grad_norm": 0.35870762072720874, + "learning_rate": 4.0118338854228034e-05, + "loss": 0.5849, + "step": 4012 + }, + { + "epoch": 0.7134222222222222, + "grad_norm": 0.3540438295911249, + "learning_rate": 4.007223334886531e-05, + "loss": 0.6114, + "step": 4013 + }, + { + "epoch": 0.7136, + "grad_norm": 0.3497239823866297, + "learning_rate": 4.0026147712644104e-05, + "loss": 0.5804, + "step": 4014 + }, + { + "epoch": 0.7137777777777777, + "grad_norm": 0.3378418022451801, + "learning_rate": 3.998008196084417e-05, + "loss": 0.5747, + "step": 4015 + }, + { + "epoch": 0.7139555555555556, + "grad_norm": 0.3488442436582133, + "learning_rate": 3.99340361087387e-05, + "loss": 0.5768, + "step": 4016 + }, + { + "epoch": 0.7141333333333333, + "grad_norm": 0.3555741020905532, + "learning_rate": 3.988801017159425e-05, + "loss": 0.5682, + "step": 4017 + }, + { + "epoch": 0.7143111111111111, + "grad_norm": 0.36376143575835795, + "learning_rate": 3.98420041646708e-05, + "loss": 0.6142, + "step": 4018 + }, + { + "epoch": 0.7144888888888888, + "grad_norm": 0.34565095752321096, + "learning_rate": 3.979601810322169e-05, + "loss": 0.5899, + "step": 4019 + }, + { + "epoch": 0.7146666666666667, + "grad_norm": 0.3515841199041589, + "learning_rate": 3.975005200249372e-05, + "loss": 0.5976, + "step": 4020 + }, + { + "epoch": 0.7148444444444444, + "grad_norm": 0.35633212917359547, + "learning_rate": 3.970410587772691e-05, + "loss": 0.5795, + "step": 4021 + }, + { + "epoch": 0.7150222222222222, + "grad_norm": 0.35423930445191737, + "learning_rate": 3.965817974415492e-05, + "loss": 0.5708, + "step": 4022 + }, + { + "epoch": 0.7152, + "grad_norm": 0.34987913178714297, + "learning_rate": 3.961227361700448e-05, + "loss": 0.6183, + "step": 4023 + }, + { + "epoch": 0.7153777777777778, + "grad_norm": 0.35004593012021906, + "learning_rate": 3.956638751149596e-05, + "loss": 0.6109, + "step": 4024 + }, + { + "epoch": 0.7155555555555555, + "grad_norm": 0.36854247956585573, + "learning_rate": 3.952052144284285e-05, + "loss": 0.551, + "step": 4025 + }, + { + "epoch": 0.7157333333333333, + "grad_norm": 0.35958798106968465, + "learning_rate": 3.947467542625225e-05, + "loss": 0.6042, + "step": 4026 + }, + { + "epoch": 0.7159111111111112, + "grad_norm": 0.34431588711996347, + "learning_rate": 3.9428849476924325e-05, + "loss": 0.5873, + "step": 4027 + }, + { + "epoch": 0.7160888888888889, + "grad_norm": 0.33196810414443173, + "learning_rate": 3.9383043610052885e-05, + "loss": 0.4939, + "step": 4028 + }, + { + "epoch": 0.7162666666666667, + "grad_norm": 0.3430989019721864, + "learning_rate": 3.933725784082483e-05, + "loss": 0.5867, + "step": 4029 + }, + { + "epoch": 0.7164444444444444, + "grad_norm": 0.3524933130613355, + "learning_rate": 3.929149218442052e-05, + "loss": 0.5634, + "step": 4030 + }, + { + "epoch": 0.7166222222222223, + "grad_norm": 0.37469501040717856, + "learning_rate": 3.924574665601366e-05, + "loss": 0.5826, + "step": 4031 + }, + { + "epoch": 0.7168, + "grad_norm": 0.35083104199648535, + "learning_rate": 3.920002127077123e-05, + "loss": 0.5904, + "step": 4032 + }, + { + "epoch": 0.7169777777777778, + "grad_norm": 0.3724475501147572, + "learning_rate": 3.915431604385355e-05, + "loss": 0.5904, + "step": 4033 + }, + { + "epoch": 0.7171555555555555, + "grad_norm": 0.34618822255733345, + "learning_rate": 3.910863099041424e-05, + "loss": 0.5483, + "step": 4034 + }, + { + "epoch": 0.7173333333333334, + "grad_norm": 0.3601642914062829, + "learning_rate": 3.9062966125600284e-05, + "loss": 0.5803, + "step": 4035 + }, + { + "epoch": 0.7175111111111111, + "grad_norm": 0.36856301709854145, + "learning_rate": 3.901732146455193e-05, + "loss": 0.5816, + "step": 4036 + }, + { + "epoch": 0.7176888888888889, + "grad_norm": 0.36459665313297895, + "learning_rate": 3.897169702240271e-05, + "loss": 0.5928, + "step": 4037 + }, + { + "epoch": 0.7178666666666667, + "grad_norm": 0.35997596044762503, + "learning_rate": 3.892609281427949e-05, + "loss": 0.5516, + "step": 4038 + }, + { + "epoch": 0.7180444444444445, + "grad_norm": 0.33535915386543563, + "learning_rate": 3.8880508855302425e-05, + "loss": 0.5719, + "step": 4039 + }, + { + "epoch": 0.7182222222222222, + "grad_norm": 0.35264025990849457, + "learning_rate": 3.8834945160584924e-05, + "loss": 0.5647, + "step": 4040 + }, + { + "epoch": 0.7184, + "grad_norm": 0.35588030402193865, + "learning_rate": 3.878940174523371e-05, + "loss": 0.5785, + "step": 4041 + }, + { + "epoch": 0.7185777777777778, + "grad_norm": 0.36948990063759324, + "learning_rate": 3.8743878624348785e-05, + "loss": 0.5777, + "step": 4042 + }, + { + "epoch": 0.7187555555555556, + "grad_norm": 0.36018982939033206, + "learning_rate": 3.869837581302338e-05, + "loss": 0.5814, + "step": 4043 + }, + { + "epoch": 0.7189333333333333, + "grad_norm": 0.3551419485843042, + "learning_rate": 3.865289332634407e-05, + "loss": 0.527, + "step": 4044 + }, + { + "epoch": 0.7191111111111111, + "grad_norm": 0.3348241237514712, + "learning_rate": 3.860743117939055e-05, + "loss": 0.5448, + "step": 4045 + }, + { + "epoch": 0.7192888888888889, + "grad_norm": 0.3690214567390772, + "learning_rate": 3.856198938723598e-05, + "loss": 0.5956, + "step": 4046 + }, + { + "epoch": 0.7194666666666667, + "grad_norm": 0.363777524524831, + "learning_rate": 3.851656796494654e-05, + "loss": 0.5784, + "step": 4047 + }, + { + "epoch": 0.7196444444444444, + "grad_norm": 0.3642390604505968, + "learning_rate": 3.847116692758189e-05, + "loss": 0.5884, + "step": 4048 + }, + { + "epoch": 0.7198222222222223, + "grad_norm": 0.34675180166115704, + "learning_rate": 3.8425786290194676e-05, + "loss": 0.5721, + "step": 4049 + }, + { + "epoch": 0.72, + "grad_norm": 0.33592207755262066, + "learning_rate": 3.838042606783106e-05, + "loss": 0.539, + "step": 4050 + }, + { + "epoch": 0.7201777777777778, + "grad_norm": 0.34386299621856947, + "learning_rate": 3.833508627553016e-05, + "loss": 0.5481, + "step": 4051 + }, + { + "epoch": 0.7203555555555555, + "grad_norm": 0.38573054045041, + "learning_rate": 3.828976692832458e-05, + "loss": 0.532, + "step": 4052 + }, + { + "epoch": 0.7205333333333334, + "grad_norm": 0.41869844148188573, + "learning_rate": 3.824446804123992e-05, + "loss": 0.5444, + "step": 4053 + }, + { + "epoch": 0.7207111111111111, + "grad_norm": 0.34660825949047447, + "learning_rate": 3.819918962929513e-05, + "loss": 0.5816, + "step": 4054 + }, + { + "epoch": 0.7208888888888889, + "grad_norm": 0.32896659407585827, + "learning_rate": 3.815393170750232e-05, + "loss": 0.5408, + "step": 4055 + }, + { + "epoch": 0.7210666666666666, + "grad_norm": 0.37168262839612937, + "learning_rate": 3.810869429086685e-05, + "loss": 0.6041, + "step": 4056 + }, + { + "epoch": 0.7212444444444445, + "grad_norm": 0.3892816750713322, + "learning_rate": 3.806347739438724e-05, + "loss": 0.6131, + "step": 4057 + }, + { + "epoch": 0.7214222222222222, + "grad_norm": 0.37509559352770205, + "learning_rate": 3.801828103305521e-05, + "loss": 0.5295, + "step": 4058 + }, + { + "epoch": 0.7216, + "grad_norm": 0.34406718198370034, + "learning_rate": 3.79731052218557e-05, + "loss": 0.5656, + "step": 4059 + }, + { + "epoch": 0.7217777777777777, + "grad_norm": 0.3633341354228841, + "learning_rate": 3.792794997576681e-05, + "loss": 0.6319, + "step": 4060 + }, + { + "epoch": 0.7219555555555556, + "grad_norm": 0.3558423098248106, + "learning_rate": 3.7882815309759824e-05, + "loss": 0.5751, + "step": 4061 + }, + { + "epoch": 0.7221333333333333, + "grad_norm": 0.38394086631127355, + "learning_rate": 3.7837701238799216e-05, + "loss": 0.5474, + "step": 4062 + }, + { + "epoch": 0.7223111111111111, + "grad_norm": 0.34309866648557463, + "learning_rate": 3.779260777784263e-05, + "loss": 0.5832, + "step": 4063 + }, + { + "epoch": 0.7224888888888888, + "grad_norm": 0.34719677290897155, + "learning_rate": 3.7747534941840854e-05, + "loss": 0.5959, + "step": 4064 + }, + { + "epoch": 0.7226666666666667, + "grad_norm": 0.3712393325948241, + "learning_rate": 3.7702482745737874e-05, + "loss": 0.6139, + "step": 4065 + }, + { + "epoch": 0.7228444444444444, + "grad_norm": 0.3443376122367561, + "learning_rate": 3.765745120447081e-05, + "loss": 0.5326, + "step": 4066 + }, + { + "epoch": 0.7230222222222222, + "grad_norm": 0.36927227522192413, + "learning_rate": 3.761244033296992e-05, + "loss": 0.5395, + "step": 4067 + }, + { + "epoch": 0.7232, + "grad_norm": 0.3518835753039752, + "learning_rate": 3.756745014615868e-05, + "loss": 0.588, + "step": 4068 + }, + { + "epoch": 0.7233777777777778, + "grad_norm": 0.3534793121640155, + "learning_rate": 3.752248065895354e-05, + "loss": 0.635, + "step": 4069 + }, + { + "epoch": 0.7235555555555555, + "grad_norm": 0.39624142216125907, + "learning_rate": 3.747753188626434e-05, + "loss": 0.5726, + "step": 4070 + }, + { + "epoch": 0.7237333333333333, + "grad_norm": 0.3855795554773093, + "learning_rate": 3.7432603842993786e-05, + "loss": 0.5975, + "step": 4071 + }, + { + "epoch": 0.7239111111111111, + "grad_norm": 0.3499491109496454, + "learning_rate": 3.738769654403796e-05, + "loss": 0.5854, + "step": 4072 + }, + { + "epoch": 0.7240888888888889, + "grad_norm": 0.350011506488243, + "learning_rate": 3.7342810004285836e-05, + "loss": 0.5427, + "step": 4073 + }, + { + "epoch": 0.7242666666666666, + "grad_norm": 0.33381178021937996, + "learning_rate": 3.7297944238619706e-05, + "loss": 0.5577, + "step": 4074 + }, + { + "epoch": 0.7244444444444444, + "grad_norm": 0.3571728387181711, + "learning_rate": 3.725309926191479e-05, + "loss": 0.5613, + "step": 4075 + }, + { + "epoch": 0.7246222222222222, + "grad_norm": 0.35226413538193374, + "learning_rate": 3.720827508903962e-05, + "loss": 0.5745, + "step": 4076 + }, + { + "epoch": 0.7248, + "grad_norm": 0.34273982433556716, + "learning_rate": 3.716347173485563e-05, + "loss": 0.551, + "step": 4077 + }, + { + "epoch": 0.7249777777777778, + "grad_norm": 0.3508742438452673, + "learning_rate": 3.711868921421745e-05, + "loss": 0.5621, + "step": 4078 + }, + { + "epoch": 0.7251555555555556, + "grad_norm": 0.35235341107733753, + "learning_rate": 3.707392754197281e-05, + "loss": 0.5797, + "step": 4079 + }, + { + "epoch": 0.7253333333333334, + "grad_norm": 0.36041693800123004, + "learning_rate": 3.7029186732962515e-05, + "loss": 0.571, + "step": 4080 + }, + { + "epoch": 0.7255111111111111, + "grad_norm": 0.3341291652682757, + "learning_rate": 3.6984466802020436e-05, + "loss": 0.5835, + "step": 4081 + }, + { + "epoch": 0.7256888888888889, + "grad_norm": 0.3484984545996794, + "learning_rate": 3.6939767763973545e-05, + "loss": 0.5918, + "step": 4082 + }, + { + "epoch": 0.7258666666666667, + "grad_norm": 0.33309187853905786, + "learning_rate": 3.6895089633641856e-05, + "loss": 0.5734, + "step": 4083 + }, + { + "epoch": 0.7260444444444445, + "grad_norm": 0.35480576818029674, + "learning_rate": 3.6850432425838485e-05, + "loss": 0.5579, + "step": 4084 + }, + { + "epoch": 0.7262222222222222, + "grad_norm": 0.34738002862697276, + "learning_rate": 3.680579615536961e-05, + "loss": 0.5504, + "step": 4085 + }, + { + "epoch": 0.7264, + "grad_norm": 0.3443099710650306, + "learning_rate": 3.676118083703442e-05, + "loss": 0.593, + "step": 4086 + }, + { + "epoch": 0.7265777777777778, + "grad_norm": 0.3610581388304893, + "learning_rate": 3.671658648562523e-05, + "loss": 0.5769, + "step": 4087 + }, + { + "epoch": 0.7267555555555556, + "grad_norm": 0.34144703749747113, + "learning_rate": 3.667201311592733e-05, + "loss": 0.5748, + "step": 4088 + }, + { + "epoch": 0.7269333333333333, + "grad_norm": 0.35182172456475097, + "learning_rate": 3.66274607427191e-05, + "loss": 0.5783, + "step": 4089 + }, + { + "epoch": 0.7271111111111112, + "grad_norm": 0.44634107103541587, + "learning_rate": 3.6582929380771956e-05, + "loss": 0.5867, + "step": 4090 + }, + { + "epoch": 0.7272888888888889, + "grad_norm": 0.3976661646796536, + "learning_rate": 3.6538419044850335e-05, + "loss": 0.6094, + "step": 4091 + }, + { + "epoch": 0.7274666666666667, + "grad_norm": 0.5028916670005593, + "learning_rate": 3.6493929749711734e-05, + "loss": 0.5832, + "step": 4092 + }, + { + "epoch": 0.7276444444444444, + "grad_norm": 0.35196196607357777, + "learning_rate": 3.644946151010654e-05, + "loss": 0.5778, + "step": 4093 + }, + { + "epoch": 0.7278222222222223, + "grad_norm": 0.35633831307443015, + "learning_rate": 3.640501434077841e-05, + "loss": 0.5873, + "step": 4094 + }, + { + "epoch": 0.728, + "grad_norm": 0.3451010902823216, + "learning_rate": 3.6360588256463734e-05, + "loss": 0.54, + "step": 4095 + }, + { + "epoch": 0.7281777777777778, + "grad_norm": 0.3513175148107401, + "learning_rate": 3.631618327189218e-05, + "loss": 0.5679, + "step": 4096 + }, + { + "epoch": 0.7283555555555555, + "grad_norm": 0.3798334042299023, + "learning_rate": 3.627179940178615e-05, + "loss": 0.6058, + "step": 4097 + }, + { + "epoch": 0.7285333333333334, + "grad_norm": 0.42111831238176, + "learning_rate": 3.622743666086132e-05, + "loss": 0.5997, + "step": 4098 + }, + { + "epoch": 0.7287111111111111, + "grad_norm": 0.4005910154856992, + "learning_rate": 3.61830950638261e-05, + "loss": 0.536, + "step": 4099 + }, + { + "epoch": 0.7288888888888889, + "grad_norm": 0.3601110817172015, + "learning_rate": 3.6138774625382134e-05, + "loss": 0.5917, + "step": 4100 + }, + { + "epoch": 0.7290666666666666, + "grad_norm": 0.3582316720363536, + "learning_rate": 3.609447536022379e-05, + "loss": 0.5859, + "step": 4101 + }, + { + "epoch": 0.7292444444444445, + "grad_norm": 0.3600353042347371, + "learning_rate": 3.605019728303871e-05, + "loss": 0.5647, + "step": 4102 + }, + { + "epoch": 0.7294222222222222, + "grad_norm": 0.33023332277899525, + "learning_rate": 3.600594040850724e-05, + "loss": 0.558, + "step": 4103 + }, + { + "epoch": 0.7296, + "grad_norm": 0.3414980810119145, + "learning_rate": 3.596170475130287e-05, + "loss": 0.5708, + "step": 4104 + }, + { + "epoch": 0.7297777777777777, + "grad_norm": 0.36301467661581527, + "learning_rate": 3.591749032609197e-05, + "loss": 0.5392, + "step": 4105 + }, + { + "epoch": 0.7299555555555556, + "grad_norm": 0.3489219650050646, + "learning_rate": 3.5873297147533915e-05, + "loss": 0.5762, + "step": 4106 + }, + { + "epoch": 0.7301333333333333, + "grad_norm": 0.33037371048255665, + "learning_rate": 3.582912523028101e-05, + "loss": 0.5223, + "step": 4107 + }, + { + "epoch": 0.7303111111111111, + "grad_norm": 0.39428863615782694, + "learning_rate": 3.5784974588978545e-05, + "loss": 0.5553, + "step": 4108 + }, + { + "epoch": 0.7304888888888889, + "grad_norm": 0.35566476944171455, + "learning_rate": 3.574084523826471e-05, + "loss": 0.5544, + "step": 4109 + }, + { + "epoch": 0.7306666666666667, + "grad_norm": 0.37357581249243205, + "learning_rate": 3.569673719277066e-05, + "loss": 0.5891, + "step": 4110 + }, + { + "epoch": 0.7308444444444444, + "grad_norm": 0.42399272556018314, + "learning_rate": 3.5652650467120485e-05, + "loss": 0.5836, + "step": 4111 + }, + { + "epoch": 0.7310222222222222, + "grad_norm": 0.3583603897360114, + "learning_rate": 3.5608585075931214e-05, + "loss": 0.5504, + "step": 4112 + }, + { + "epoch": 0.7312, + "grad_norm": 0.34530773141229776, + "learning_rate": 3.556454103381278e-05, + "loss": 0.5627, + "step": 4113 + }, + { + "epoch": 0.7313777777777778, + "grad_norm": 0.42322166626875707, + "learning_rate": 3.552051835536807e-05, + "loss": 0.6473, + "step": 4114 + }, + { + "epoch": 0.7315555555555555, + "grad_norm": 0.35419830900647725, + "learning_rate": 3.547651705519285e-05, + "loss": 0.564, + "step": 4115 + }, + { + "epoch": 0.7317333333333333, + "grad_norm": 0.35548346764348576, + "learning_rate": 3.543253714787583e-05, + "loss": 0.5713, + "step": 4116 + }, + { + "epoch": 0.7319111111111111, + "grad_norm": 0.3353066853989554, + "learning_rate": 3.538857864799862e-05, + "loss": 0.564, + "step": 4117 + }, + { + "epoch": 0.7320888888888889, + "grad_norm": 0.33495408069979127, + "learning_rate": 3.534464157013574e-05, + "loss": 0.5566, + "step": 4118 + }, + { + "epoch": 0.7322666666666666, + "grad_norm": 0.4003992692488059, + "learning_rate": 3.530072592885451e-05, + "loss": 0.6307, + "step": 4119 + }, + { + "epoch": 0.7324444444444445, + "grad_norm": 0.4072528101181034, + "learning_rate": 3.5256831738715345e-05, + "loss": 0.5914, + "step": 4120 + }, + { + "epoch": 0.7326222222222222, + "grad_norm": 0.3290746062081954, + "learning_rate": 3.521295901427132e-05, + "loss": 0.5625, + "step": 4121 + }, + { + "epoch": 0.7328, + "grad_norm": 0.36993259684977065, + "learning_rate": 3.516910777006862e-05, + "loss": 0.6043, + "step": 4122 + }, + { + "epoch": 0.7329777777777777, + "grad_norm": 0.3654148060441524, + "learning_rate": 3.512527802064607e-05, + "loss": 0.6229, + "step": 4123 + }, + { + "epoch": 0.7331555555555556, + "grad_norm": 0.36813705642004046, + "learning_rate": 3.508146978053562e-05, + "loss": 0.5839, + "step": 4124 + }, + { + "epoch": 0.7333333333333333, + "grad_norm": 0.3625315228535952, + "learning_rate": 3.5037683064261806e-05, + "loss": 0.5463, + "step": 4125 + }, + { + "epoch": 0.7335111111111111, + "grad_norm": 0.34439952872635465, + "learning_rate": 3.4993917886342334e-05, + "loss": 0.5528, + "step": 4126 + }, + { + "epoch": 0.7336888888888888, + "grad_norm": 0.3377119460572574, + "learning_rate": 3.4950174261287504e-05, + "loss": 0.5434, + "step": 4127 + }, + { + "epoch": 0.7338666666666667, + "grad_norm": 0.3660037048768086, + "learning_rate": 3.4906452203600616e-05, + "loss": 0.5889, + "step": 4128 + }, + { + "epoch": 0.7340444444444445, + "grad_norm": 0.4348114832931297, + "learning_rate": 3.4862751727777797e-05, + "loss": 0.5815, + "step": 4129 + }, + { + "epoch": 0.7342222222222222, + "grad_norm": 0.36818651603667996, + "learning_rate": 3.4819072848307986e-05, + "loss": 0.5486, + "step": 4130 + }, + { + "epoch": 0.7344, + "grad_norm": 0.35238484136797127, + "learning_rate": 3.4775415579673e-05, + "loss": 0.5792, + "step": 4131 + }, + { + "epoch": 0.7345777777777778, + "grad_norm": 0.35053655269699074, + "learning_rate": 3.473177993634745e-05, + "loss": 0.5106, + "step": 4132 + }, + { + "epoch": 0.7347555555555556, + "grad_norm": 0.33947758882610224, + "learning_rate": 3.468816593279883e-05, + "loss": 0.5689, + "step": 4133 + }, + { + "epoch": 0.7349333333333333, + "grad_norm": 0.3472516389991502, + "learning_rate": 3.4644573583487404e-05, + "loss": 0.5507, + "step": 4134 + }, + { + "epoch": 0.7351111111111112, + "grad_norm": 0.3383343297937684, + "learning_rate": 3.4601002902866284e-05, + "loss": 0.5468, + "step": 4135 + }, + { + "epoch": 0.7352888888888889, + "grad_norm": 0.3486170421568541, + "learning_rate": 3.455745390538141e-05, + "loss": 0.5787, + "step": 4136 + }, + { + "epoch": 0.7354666666666667, + "grad_norm": 0.3539950870947957, + "learning_rate": 3.45139266054715e-05, + "loss": 0.5639, + "step": 4137 + }, + { + "epoch": 0.7356444444444444, + "grad_norm": 0.3531963654280004, + "learning_rate": 3.44704210175681e-05, + "loss": 0.594, + "step": 4138 + }, + { + "epoch": 0.7358222222222223, + "grad_norm": 0.3345096244368699, + "learning_rate": 3.4426937156095563e-05, + "loss": 0.557, + "step": 4139 + }, + { + "epoch": 0.736, + "grad_norm": 0.3463203137827246, + "learning_rate": 3.438347503547102e-05, + "loss": 0.5437, + "step": 4140 + }, + { + "epoch": 0.7361777777777778, + "grad_norm": 0.35629817370214173, + "learning_rate": 3.434003467010441e-05, + "loss": 0.5686, + "step": 4141 + }, + { + "epoch": 0.7363555555555555, + "grad_norm": 0.3622511641958309, + "learning_rate": 3.429661607439848e-05, + "loss": 0.5954, + "step": 4142 + }, + { + "epoch": 0.7365333333333334, + "grad_norm": 0.3409325476288989, + "learning_rate": 3.425321926274863e-05, + "loss": 0.5354, + "step": 4143 + }, + { + "epoch": 0.7367111111111111, + "grad_norm": 0.33337697753647727, + "learning_rate": 3.420984424954328e-05, + "loss": 0.5295, + "step": 4144 + }, + { + "epoch": 0.7368888888888889, + "grad_norm": 0.3623020617043478, + "learning_rate": 3.416649104916333e-05, + "loss": 0.5942, + "step": 4145 + }, + { + "epoch": 0.7370666666666666, + "grad_norm": 0.36944463521944043, + "learning_rate": 3.412315967598274e-05, + "loss": 0.5389, + "step": 4146 + }, + { + "epoch": 0.7372444444444445, + "grad_norm": 0.34130609525144595, + "learning_rate": 3.407985014436797e-05, + "loss": 0.5818, + "step": 4147 + }, + { + "epoch": 0.7374222222222222, + "grad_norm": 0.35987167085537464, + "learning_rate": 3.403656246867849e-05, + "loss": 0.5959, + "step": 4148 + }, + { + "epoch": 0.7376, + "grad_norm": 0.34565129078536555, + "learning_rate": 3.399329666326625e-05, + "loss": 0.6006, + "step": 4149 + }, + { + "epoch": 0.7377777777777778, + "grad_norm": 0.36100202026702954, + "learning_rate": 3.3950052742476245e-05, + "loss": 0.5615, + "step": 4150 + }, + { + "epoch": 0.7379555555555556, + "grad_norm": 0.3300455130205511, + "learning_rate": 3.390683072064594e-05, + "loss": 0.5434, + "step": 4151 + }, + { + "epoch": 0.7381333333333333, + "grad_norm": 0.3676732014123496, + "learning_rate": 3.386363061210571e-05, + "loss": 0.6191, + "step": 4152 + }, + { + "epoch": 0.7383111111111111, + "grad_norm": 0.35786896139702584, + "learning_rate": 3.3820452431178606e-05, + "loss": 0.5757, + "step": 4153 + }, + { + "epoch": 0.7384888888888889, + "grad_norm": 0.3650214589300631, + "learning_rate": 3.377729619218043e-05, + "loss": 0.5624, + "step": 4154 + }, + { + "epoch": 0.7386666666666667, + "grad_norm": 0.3465952080971314, + "learning_rate": 3.3734161909419695e-05, + "loss": 0.5983, + "step": 4155 + }, + { + "epoch": 0.7388444444444444, + "grad_norm": 0.36796677731123845, + "learning_rate": 3.369104959719763e-05, + "loss": 0.578, + "step": 4156 + }, + { + "epoch": 0.7390222222222222, + "grad_norm": 0.39879091582494125, + "learning_rate": 3.3647959269808205e-05, + "loss": 0.5241, + "step": 4157 + }, + { + "epoch": 0.7392, + "grad_norm": 0.3436222373377845, + "learning_rate": 3.360489094153806e-05, + "loss": 0.5967, + "step": 4158 + }, + { + "epoch": 0.7393777777777778, + "grad_norm": 0.34481521565108186, + "learning_rate": 3.356184462666658e-05, + "loss": 0.6108, + "step": 4159 + }, + { + "epoch": 0.7395555555555555, + "grad_norm": 0.38876036512875123, + "learning_rate": 3.351882033946583e-05, + "loss": 0.6192, + "step": 4160 + }, + { + "epoch": 0.7397333333333334, + "grad_norm": 0.4868480567014935, + "learning_rate": 3.3475818094200585e-05, + "loss": 0.6012, + "step": 4161 + }, + { + "epoch": 0.7399111111111111, + "grad_norm": 0.3418390358719068, + "learning_rate": 3.343283790512829e-05, + "loss": 0.5624, + "step": 4162 + }, + { + "epoch": 0.7400888888888889, + "grad_norm": 0.36078330953090015, + "learning_rate": 3.33898797864991e-05, + "loss": 0.5807, + "step": 4163 + }, + { + "epoch": 0.7402666666666666, + "grad_norm": 0.3564857528496916, + "learning_rate": 3.334694375255585e-05, + "loss": 0.5422, + "step": 4164 + }, + { + "epoch": 0.7404444444444445, + "grad_norm": 0.3683482387828699, + "learning_rate": 3.330402981753403e-05, + "loss": 0.5646, + "step": 4165 + }, + { + "epoch": 0.7406222222222222, + "grad_norm": 0.3449637883678767, + "learning_rate": 3.326113799566187e-05, + "loss": 0.5506, + "step": 4166 + }, + { + "epoch": 0.7408, + "grad_norm": 0.3658986217297458, + "learning_rate": 3.321826830116012e-05, + "loss": 0.5539, + "step": 4167 + }, + { + "epoch": 0.7409777777777777, + "grad_norm": 0.35742008880722825, + "learning_rate": 3.3175420748242406e-05, + "loss": 0.568, + "step": 4168 + }, + { + "epoch": 0.7411555555555556, + "grad_norm": 0.3422450316940728, + "learning_rate": 3.313259535111478e-05, + "loss": 0.5586, + "step": 4169 + }, + { + "epoch": 0.7413333333333333, + "grad_norm": 0.39071089627359307, + "learning_rate": 3.3089792123976195e-05, + "loss": 0.5969, + "step": 4170 + }, + { + "epoch": 0.7415111111111111, + "grad_norm": 0.3349522184099262, + "learning_rate": 3.3047011081018e-05, + "loss": 0.5461, + "step": 4171 + }, + { + "epoch": 0.7416888888888888, + "grad_norm": 0.33970565814244474, + "learning_rate": 3.300425223642444e-05, + "loss": 0.5523, + "step": 4172 + }, + { + "epoch": 0.7418666666666667, + "grad_norm": 0.3460660317974209, + "learning_rate": 3.296151560437214e-05, + "loss": 0.5691, + "step": 4173 + }, + { + "epoch": 0.7420444444444444, + "grad_norm": 0.35764984100815206, + "learning_rate": 3.2918801199030635e-05, + "loss": 0.5428, + "step": 4174 + }, + { + "epoch": 0.7422222222222222, + "grad_norm": 0.3317173069594749, + "learning_rate": 3.287610903456181e-05, + "loss": 0.5737, + "step": 4175 + }, + { + "epoch": 0.7424, + "grad_norm": 0.337324590260528, + "learning_rate": 3.283343912512046e-05, + "loss": 0.562, + "step": 4176 + }, + { + "epoch": 0.7425777777777778, + "grad_norm": 0.35698393268935386, + "learning_rate": 3.279079148485375e-05, + "loss": 0.573, + "step": 4177 + }, + { + "epoch": 0.7427555555555555, + "grad_norm": 0.4109720519173487, + "learning_rate": 3.27481661279016e-05, + "loss": 0.6212, + "step": 4178 + }, + { + "epoch": 0.7429333333333333, + "grad_norm": 0.3387820550022395, + "learning_rate": 3.2705563068396514e-05, + "loss": 0.5678, + "step": 4179 + }, + { + "epoch": 0.7431111111111111, + "grad_norm": 0.35281991014179076, + "learning_rate": 3.266298232046362e-05, + "loss": 0.5521, + "step": 4180 + }, + { + "epoch": 0.7432888888888889, + "grad_norm": 0.34370307132414746, + "learning_rate": 3.26204238982206e-05, + "loss": 0.5439, + "step": 4181 + }, + { + "epoch": 0.7434666666666667, + "grad_norm": 0.48350048148904473, + "learning_rate": 3.257788781577777e-05, + "loss": 0.5635, + "step": 4182 + }, + { + "epoch": 0.7436444444444444, + "grad_norm": 0.35462018137979245, + "learning_rate": 3.253537408723805e-05, + "loss": 0.5497, + "step": 4183 + }, + { + "epoch": 0.7438222222222223, + "grad_norm": 0.33374514836867014, + "learning_rate": 3.249288272669691e-05, + "loss": 0.5454, + "step": 4184 + }, + { + "epoch": 0.744, + "grad_norm": 0.36179135386948374, + "learning_rate": 3.2450413748242437e-05, + "loss": 0.5886, + "step": 4185 + }, + { + "epoch": 0.7441777777777778, + "grad_norm": 0.35322559599523196, + "learning_rate": 3.240796716595528e-05, + "loss": 0.5571, + "step": 4186 + }, + { + "epoch": 0.7443555555555555, + "grad_norm": 0.3521601650138431, + "learning_rate": 3.236554299390866e-05, + "loss": 0.5457, + "step": 4187 + }, + { + "epoch": 0.7445333333333334, + "grad_norm": 0.3528364935475116, + "learning_rate": 3.2323141246168396e-05, + "loss": 0.5872, + "step": 4188 + }, + { + "epoch": 0.7447111111111111, + "grad_norm": 0.34858074322798915, + "learning_rate": 3.2280761936792837e-05, + "loss": 0.582, + "step": 4189 + }, + { + "epoch": 0.7448888888888889, + "grad_norm": 0.3862473146426205, + "learning_rate": 3.2238405079832936e-05, + "loss": 0.5487, + "step": 4190 + }, + { + "epoch": 0.7450666666666667, + "grad_norm": 0.3512183257338081, + "learning_rate": 3.219607068933208e-05, + "loss": 0.5913, + "step": 4191 + }, + { + "epoch": 0.7452444444444445, + "grad_norm": 0.35140397046037664, + "learning_rate": 3.2153758779326435e-05, + "loss": 0.5792, + "step": 4192 + }, + { + "epoch": 0.7454222222222222, + "grad_norm": 0.34880024133834064, + "learning_rate": 3.211146936384445e-05, + "loss": 0.5893, + "step": 4193 + }, + { + "epoch": 0.7456, + "grad_norm": 0.3924495259050675, + "learning_rate": 3.2069202456907366e-05, + "loss": 0.5931, + "step": 4194 + }, + { + "epoch": 0.7457777777777778, + "grad_norm": 0.34911644870176195, + "learning_rate": 3.202695807252871e-05, + "loss": 0.554, + "step": 4195 + }, + { + "epoch": 0.7459555555555556, + "grad_norm": 0.3348938294170673, + "learning_rate": 3.1984736224714816e-05, + "loss": 0.5532, + "step": 4196 + }, + { + "epoch": 0.7461333333333333, + "grad_norm": 0.35132961633684906, + "learning_rate": 3.194253692746425e-05, + "loss": 0.5817, + "step": 4197 + }, + { + "epoch": 0.7463111111111111, + "grad_norm": 0.3206817272795673, + "learning_rate": 3.19003601947684e-05, + "loss": 0.5528, + "step": 4198 + }, + { + "epoch": 0.7464888888888889, + "grad_norm": 0.32968430977894264, + "learning_rate": 3.185820604061088e-05, + "loss": 0.5327, + "step": 4199 + }, + { + "epoch": 0.7466666666666667, + "grad_norm": 0.3387615682396391, + "learning_rate": 3.1816074478968106e-05, + "loss": 0.5469, + "step": 4200 + }, + { + "epoch": 0.7468444444444444, + "grad_norm": 0.3718264749895764, + "learning_rate": 3.1773965523808754e-05, + "loss": 0.6096, + "step": 4201 + }, + { + "epoch": 0.7470222222222223, + "grad_norm": 0.3650002592452039, + "learning_rate": 3.173187918909416e-05, + "loss": 0.6021, + "step": 4202 + }, + { + "epoch": 0.7472, + "grad_norm": 0.3415189432822939, + "learning_rate": 3.1689815488778096e-05, + "loss": 0.5631, + "step": 4203 + }, + { + "epoch": 0.7473777777777778, + "grad_norm": 0.3428618174151365, + "learning_rate": 3.164777443680684e-05, + "loss": 0.5772, + "step": 4204 + }, + { + "epoch": 0.7475555555555555, + "grad_norm": 0.33093855710863396, + "learning_rate": 3.16057560471192e-05, + "loss": 0.5829, + "step": 4205 + }, + { + "epoch": 0.7477333333333334, + "grad_norm": 0.3498256505756481, + "learning_rate": 3.1563760333646395e-05, + "loss": 0.5645, + "step": 4206 + }, + { + "epoch": 0.7479111111111111, + "grad_norm": 0.3577293233560542, + "learning_rate": 3.152178731031219e-05, + "loss": 0.6156, + "step": 4207 + }, + { + "epoch": 0.7480888888888889, + "grad_norm": 0.34255333849627845, + "learning_rate": 3.14798369910328e-05, + "loss": 0.5747, + "step": 4208 + }, + { + "epoch": 0.7482666666666666, + "grad_norm": 0.33516520282152806, + "learning_rate": 3.1437909389716915e-05, + "loss": 0.5747, + "step": 4209 + }, + { + "epoch": 0.7484444444444445, + "grad_norm": 0.3442120636670075, + "learning_rate": 3.13960045202657e-05, + "loss": 0.6036, + "step": 4210 + }, + { + "epoch": 0.7486222222222222, + "grad_norm": 0.3731906782420752, + "learning_rate": 3.1354122396572774e-05, + "loss": 0.6107, + "step": 4211 + }, + { + "epoch": 0.7488, + "grad_norm": 0.33151817581592286, + "learning_rate": 3.1312263032524216e-05, + "loss": 0.5398, + "step": 4212 + }, + { + "epoch": 0.7489777777777777, + "grad_norm": 0.3468051855372648, + "learning_rate": 3.127042644199856e-05, + "loss": 0.5577, + "step": 4213 + }, + { + "epoch": 0.7491555555555556, + "grad_norm": 0.3631674818492094, + "learning_rate": 3.1228612638866795e-05, + "loss": 0.5899, + "step": 4214 + }, + { + "epoch": 0.7493333333333333, + "grad_norm": 0.38503654967646217, + "learning_rate": 3.118682163699236e-05, + "loss": 0.593, + "step": 4215 + }, + { + "epoch": 0.7495111111111111, + "grad_norm": 0.3266389435776212, + "learning_rate": 3.114505345023113e-05, + "loss": 0.5386, + "step": 4216 + }, + { + "epoch": 0.7496888888888888, + "grad_norm": 0.34972426476361096, + "learning_rate": 3.110330809243134e-05, + "loss": 0.5787, + "step": 4217 + }, + { + "epoch": 0.7498666666666667, + "grad_norm": 0.3417197018503155, + "learning_rate": 3.106158557743385e-05, + "loss": 0.5669, + "step": 4218 + }, + { + "epoch": 0.7500444444444444, + "grad_norm": 0.35981740931242173, + "learning_rate": 3.101988591907168e-05, + "loss": 0.5593, + "step": 4219 + }, + { + "epoch": 0.7502222222222222, + "grad_norm": 0.3528973730962451, + "learning_rate": 3.0978209131170566e-05, + "loss": 0.5666, + "step": 4220 + }, + { + "epoch": 0.7504, + "grad_norm": 0.33954281288581745, + "learning_rate": 3.0936555227548355e-05, + "loss": 0.5521, + "step": 4221 + }, + { + "epoch": 0.7505777777777778, + "grad_norm": 0.3488425071859353, + "learning_rate": 3.089492422201561e-05, + "loss": 0.5729, + "step": 4222 + }, + { + "epoch": 0.7507555555555555, + "grad_norm": 0.3272921881612388, + "learning_rate": 3.085331612837502e-05, + "loss": 0.5368, + "step": 4223 + }, + { + "epoch": 0.7509333333333333, + "grad_norm": 0.36300224569083905, + "learning_rate": 3.081173096042194e-05, + "loss": 0.6139, + "step": 4224 + }, + { + "epoch": 0.7511111111111111, + "grad_norm": 0.3574674623555468, + "learning_rate": 3.0770168731943895e-05, + "loss": 0.593, + "step": 4225 + }, + { + "epoch": 0.7512888888888889, + "grad_norm": 0.3511692874694419, + "learning_rate": 3.072862945672094e-05, + "loss": 0.5444, + "step": 4226 + }, + { + "epoch": 0.7514666666666666, + "grad_norm": 0.3582921225868174, + "learning_rate": 3.068711314852548e-05, + "loss": 0.5633, + "step": 4227 + }, + { + "epoch": 0.7516444444444444, + "grad_norm": 0.3688101911355421, + "learning_rate": 3.064561982112232e-05, + "loss": 0.5409, + "step": 4228 + }, + { + "epoch": 0.7518222222222222, + "grad_norm": 0.3686330421240707, + "learning_rate": 3.060414948826862e-05, + "loss": 0.6031, + "step": 4229 + }, + { + "epoch": 0.752, + "grad_norm": 0.38082723294809234, + "learning_rate": 3.056270216371395e-05, + "loss": 0.616, + "step": 4230 + }, + { + "epoch": 0.7521777777777777, + "grad_norm": 0.38249101558826504, + "learning_rate": 3.0521277861200216e-05, + "loss": 0.5973, + "step": 4231 + }, + { + "epoch": 0.7523555555555556, + "grad_norm": 0.36227280753592483, + "learning_rate": 3.047987659446172e-05, + "loss": 0.5794, + "step": 4232 + }, + { + "epoch": 0.7525333333333334, + "grad_norm": 0.38258056128230283, + "learning_rate": 3.043849837722511e-05, + "loss": 0.6031, + "step": 4233 + }, + { + "epoch": 0.7527111111111111, + "grad_norm": 0.34606211062994974, + "learning_rate": 3.039714322320939e-05, + "loss": 0.6047, + "step": 4234 + }, + { + "epoch": 0.7528888888888889, + "grad_norm": 0.32825314961616503, + "learning_rate": 3.0355811146125935e-05, + "loss": 0.5289, + "step": 4235 + }, + { + "epoch": 0.7530666666666667, + "grad_norm": 0.34763060449947897, + "learning_rate": 3.0314502159678458e-05, + "loss": 0.5481, + "step": 4236 + }, + { + "epoch": 0.7532444444444445, + "grad_norm": 0.35145590639389757, + "learning_rate": 3.0273216277563e-05, + "loss": 0.5829, + "step": 4237 + }, + { + "epoch": 0.7534222222222222, + "grad_norm": 0.3732241198757729, + "learning_rate": 3.023195351346797e-05, + "loss": 0.6407, + "step": 4238 + }, + { + "epoch": 0.7536, + "grad_norm": 0.34161327580409323, + "learning_rate": 3.0190713881074105e-05, + "loss": 0.5641, + "step": 4239 + }, + { + "epoch": 0.7537777777777778, + "grad_norm": 0.3498273968194717, + "learning_rate": 3.014949739405448e-05, + "loss": 0.5837, + "step": 4240 + }, + { + "epoch": 0.7539555555555556, + "grad_norm": 0.38238022147716455, + "learning_rate": 3.010830406607441e-05, + "loss": 0.588, + "step": 4241 + }, + { + "epoch": 0.7541333333333333, + "grad_norm": 0.3899432688526802, + "learning_rate": 3.0067133910791722e-05, + "loss": 0.6273, + "step": 4242 + }, + { + "epoch": 0.7543111111111112, + "grad_norm": 0.38350916865529533, + "learning_rate": 3.002598694185631e-05, + "loss": 0.6152, + "step": 4243 + }, + { + "epoch": 0.7544888888888889, + "grad_norm": 0.3749368999724824, + "learning_rate": 2.998486317291066e-05, + "loss": 0.5729, + "step": 4244 + }, + { + "epoch": 0.7546666666666667, + "grad_norm": 0.36091053563070535, + "learning_rate": 2.9943762617589264e-05, + "loss": 0.5561, + "step": 4245 + }, + { + "epoch": 0.7548444444444444, + "grad_norm": 0.36189171484542015, + "learning_rate": 2.990268528951923e-05, + "loss": 0.5456, + "step": 4246 + }, + { + "epoch": 0.7550222222222223, + "grad_norm": 0.34447403945244204, + "learning_rate": 2.986163120231965e-05, + "loss": 0.5423, + "step": 4247 + }, + { + "epoch": 0.7552, + "grad_norm": 0.3429685993377397, + "learning_rate": 2.9820600369602224e-05, + "loss": 0.5674, + "step": 4248 + }, + { + "epoch": 0.7553777777777778, + "grad_norm": 0.35842641976135975, + "learning_rate": 2.977959280497068e-05, + "loss": 0.5845, + "step": 4249 + }, + { + "epoch": 0.7555555555555555, + "grad_norm": 0.36809913546801193, + "learning_rate": 2.9738608522021173e-05, + "loss": 0.6279, + "step": 4250 + }, + { + "epoch": 0.7557333333333334, + "grad_norm": 0.3434563800908367, + "learning_rate": 2.969764753434209e-05, + "loss": 0.5724, + "step": 4251 + }, + { + "epoch": 0.7559111111111111, + "grad_norm": 0.3784185150737185, + "learning_rate": 2.965670985551412e-05, + "loss": 0.5857, + "step": 4252 + }, + { + "epoch": 0.7560888888888889, + "grad_norm": 0.35480404145148375, + "learning_rate": 2.9615795499110222e-05, + "loss": 0.5303, + "step": 4253 + }, + { + "epoch": 0.7562666666666666, + "grad_norm": 0.39442699998781444, + "learning_rate": 2.9574904478695586e-05, + "loss": 0.5298, + "step": 4254 + }, + { + "epoch": 0.7564444444444445, + "grad_norm": 0.34309851387691853, + "learning_rate": 2.9534036807827726e-05, + "loss": 0.5789, + "step": 4255 + }, + { + "epoch": 0.7566222222222222, + "grad_norm": 0.3335078622608469, + "learning_rate": 2.9493192500056345e-05, + "loss": 0.5448, + "step": 4256 + }, + { + "epoch": 0.7568, + "grad_norm": 0.35155594852221766, + "learning_rate": 2.9452371568923455e-05, + "loss": 0.5482, + "step": 4257 + }, + { + "epoch": 0.7569777777777777, + "grad_norm": 0.344885376604275, + "learning_rate": 2.9411574027963296e-05, + "loss": 0.5591, + "step": 4258 + }, + { + "epoch": 0.7571555555555556, + "grad_norm": 0.34105851054595154, + "learning_rate": 2.9370799890702362e-05, + "loss": 0.5607, + "step": 4259 + }, + { + "epoch": 0.7573333333333333, + "grad_norm": 0.3641676724156971, + "learning_rate": 2.9330049170659357e-05, + "loss": 0.5547, + "step": 4260 + }, + { + "epoch": 0.7575111111111111, + "grad_norm": 0.36797442473954056, + "learning_rate": 2.9289321881345254e-05, + "loss": 0.573, + "step": 4261 + }, + { + "epoch": 0.7576888888888889, + "grad_norm": 0.373526058225425, + "learning_rate": 2.9248618036263255e-05, + "loss": 0.624, + "step": 4262 + }, + { + "epoch": 0.7578666666666667, + "grad_norm": 0.35284043969981105, + "learning_rate": 2.920793764890878e-05, + "loss": 0.5553, + "step": 4263 + }, + { + "epoch": 0.7580444444444444, + "grad_norm": 0.3508506808649338, + "learning_rate": 2.9167280732769463e-05, + "loss": 0.544, + "step": 4264 + }, + { + "epoch": 0.7582222222222222, + "grad_norm": 0.3964831825740827, + "learning_rate": 2.9126647301325173e-05, + "loss": 0.5755, + "step": 4265 + }, + { + "epoch": 0.7584, + "grad_norm": 0.3366898115073589, + "learning_rate": 2.908603736804798e-05, + "loss": 0.6127, + "step": 4266 + }, + { + "epoch": 0.7585777777777778, + "grad_norm": 0.3588474336080617, + "learning_rate": 2.9045450946402175e-05, + "loss": 0.5849, + "step": 4267 + }, + { + "epoch": 0.7587555555555555, + "grad_norm": 0.36381022116264883, + "learning_rate": 2.9004888049844248e-05, + "loss": 0.5851, + "step": 4268 + }, + { + "epoch": 0.7589333333333333, + "grad_norm": 0.3612514999903744, + "learning_rate": 2.8964348691822895e-05, + "loss": 0.556, + "step": 4269 + }, + { + "epoch": 0.7591111111111111, + "grad_norm": 0.3583575434999154, + "learning_rate": 2.892383288577898e-05, + "loss": 0.5757, + "step": 4270 + }, + { + "epoch": 0.7592888888888889, + "grad_norm": 0.3523042240458458, + "learning_rate": 2.8883340645145597e-05, + "loss": 0.5763, + "step": 4271 + }, + { + "epoch": 0.7594666666666666, + "grad_norm": 0.35235017432930843, + "learning_rate": 2.8842871983347998e-05, + "loss": 0.5433, + "step": 4272 + }, + { + "epoch": 0.7596444444444445, + "grad_norm": 0.3333974637428611, + "learning_rate": 2.8802426913803638e-05, + "loss": 0.5627, + "step": 4273 + }, + { + "epoch": 0.7598222222222222, + "grad_norm": 0.3813781671578012, + "learning_rate": 2.8762005449922147e-05, + "loss": 0.5742, + "step": 4274 + }, + { + "epoch": 0.76, + "grad_norm": 0.6391010007936021, + "learning_rate": 2.8721607605105337e-05, + "loss": 0.5756, + "step": 4275 + }, + { + "epoch": 0.7601777777777777, + "grad_norm": 0.384595088211196, + "learning_rate": 2.8681233392747086e-05, + "loss": 0.5757, + "step": 4276 + }, + { + "epoch": 0.7603555555555556, + "grad_norm": 0.3556352714948297, + "learning_rate": 2.864088282623366e-05, + "loss": 0.5824, + "step": 4277 + }, + { + "epoch": 0.7605333333333333, + "grad_norm": 0.3295268869241266, + "learning_rate": 2.8600555918943218e-05, + "loss": 0.5339, + "step": 4278 + }, + { + "epoch": 0.7607111111111111, + "grad_norm": 0.37932401859673354, + "learning_rate": 2.8560252684246324e-05, + "loss": 0.5995, + "step": 4279 + }, + { + "epoch": 0.7608888888888888, + "grad_norm": 0.3480377899987346, + "learning_rate": 2.8519973135505462e-05, + "loss": 0.5371, + "step": 4280 + }, + { + "epoch": 0.7610666666666667, + "grad_norm": 0.3306271257294146, + "learning_rate": 2.8479717286075502e-05, + "loss": 0.5305, + "step": 4281 + }, + { + "epoch": 0.7612444444444444, + "grad_norm": 0.3618439726182244, + "learning_rate": 2.84394851493032e-05, + "loss": 0.5685, + "step": 4282 + }, + { + "epoch": 0.7614222222222222, + "grad_norm": 0.3677953744762058, + "learning_rate": 2.8399276738527714e-05, + "loss": 0.5841, + "step": 4283 + }, + { + "epoch": 0.7616, + "grad_norm": 0.37789648238799733, + "learning_rate": 2.8359092067080106e-05, + "loss": 0.6087, + "step": 4284 + }, + { + "epoch": 0.7617777777777778, + "grad_norm": 0.3463045651522611, + "learning_rate": 2.83189311482837e-05, + "loss": 0.4997, + "step": 4285 + }, + { + "epoch": 0.7619555555555556, + "grad_norm": 0.37363212688999853, + "learning_rate": 2.82787939954539e-05, + "loss": 0.5804, + "step": 4286 + }, + { + "epoch": 0.7621333333333333, + "grad_norm": 0.4170357868104384, + "learning_rate": 2.823868062189825e-05, + "loss": 0.5486, + "step": 4287 + }, + { + "epoch": 0.7623111111111112, + "grad_norm": 0.33958013225061306, + "learning_rate": 2.8198591040916387e-05, + "loss": 0.5661, + "step": 4288 + }, + { + "epoch": 0.7624888888888889, + "grad_norm": 0.3405927889172959, + "learning_rate": 2.8158525265800094e-05, + "loss": 0.5762, + "step": 4289 + }, + { + "epoch": 0.7626666666666667, + "grad_norm": 0.34444028874973215, + "learning_rate": 2.811848330983321e-05, + "loss": 0.5701, + "step": 4290 + }, + { + "epoch": 0.7628444444444444, + "grad_norm": 0.34313612935464455, + "learning_rate": 2.8078465186291724e-05, + "loss": 0.5984, + "step": 4291 + }, + { + "epoch": 0.7630222222222223, + "grad_norm": 0.3353216679805234, + "learning_rate": 2.8038470908443714e-05, + "loss": 0.5581, + "step": 4292 + }, + { + "epoch": 0.7632, + "grad_norm": 0.39899946565810035, + "learning_rate": 2.799850048954932e-05, + "loss": 0.576, + "step": 4293 + }, + { + "epoch": 0.7633777777777778, + "grad_norm": 0.36971140324940005, + "learning_rate": 2.795855394286081e-05, + "loss": 0.5808, + "step": 4294 + }, + { + "epoch": 0.7635555555555555, + "grad_norm": 0.3399741677234007, + "learning_rate": 2.791863128162251e-05, + "loss": 0.532, + "step": 4295 + }, + { + "epoch": 0.7637333333333334, + "grad_norm": 0.3296102634535836, + "learning_rate": 2.787873251907086e-05, + "loss": 0.5761, + "step": 4296 + }, + { + "epoch": 0.7639111111111111, + "grad_norm": 0.3553281966073167, + "learning_rate": 2.7838857668434327e-05, + "loss": 0.6294, + "step": 4297 + }, + { + "epoch": 0.7640888888888889, + "grad_norm": 0.3345832561474351, + "learning_rate": 2.779900674293351e-05, + "loss": 0.5623, + "step": 4298 + }, + { + "epoch": 0.7642666666666666, + "grad_norm": 0.3416907437433639, + "learning_rate": 2.775917975578104e-05, + "loss": 0.5966, + "step": 4299 + }, + { + "epoch": 0.7644444444444445, + "grad_norm": 0.3534629423549445, + "learning_rate": 2.7719376720181546e-05, + "loss": 0.6135, + "step": 4300 + }, + { + "epoch": 0.7646222222222222, + "grad_norm": 0.3780017127778086, + "learning_rate": 2.76795976493319e-05, + "loss": 0.5498, + "step": 4301 + }, + { + "epoch": 0.7648, + "grad_norm": 0.3794798968228671, + "learning_rate": 2.7639842556420792e-05, + "loss": 0.5827, + "step": 4302 + }, + { + "epoch": 0.7649777777777778, + "grad_norm": 0.357809527543104, + "learning_rate": 2.7600111454629207e-05, + "loss": 0.5473, + "step": 4303 + }, + { + "epoch": 0.7651555555555556, + "grad_norm": 0.3351256496248654, + "learning_rate": 2.756040435712992e-05, + "loss": 0.5639, + "step": 4304 + }, + { + "epoch": 0.7653333333333333, + "grad_norm": 0.3566561021838032, + "learning_rate": 2.7520721277088024e-05, + "loss": 0.5611, + "step": 4305 + }, + { + "epoch": 0.7655111111111111, + "grad_norm": 0.3585275296411708, + "learning_rate": 2.7481062227660348e-05, + "loss": 0.5955, + "step": 4306 + }, + { + "epoch": 0.7656888888888889, + "grad_norm": 0.35075049383760293, + "learning_rate": 2.7441427221996065e-05, + "loss": 0.5567, + "step": 4307 + }, + { + "epoch": 0.7658666666666667, + "grad_norm": 0.3693363754292584, + "learning_rate": 2.74018162732361e-05, + "loss": 0.5625, + "step": 4308 + }, + { + "epoch": 0.7660444444444444, + "grad_norm": 0.3336393395709656, + "learning_rate": 2.7362229394513584e-05, + "loss": 0.5304, + "step": 4309 + }, + { + "epoch": 0.7662222222222222, + "grad_norm": 0.3337213509668497, + "learning_rate": 2.7322666598953574e-05, + "loss": 0.5339, + "step": 4310 + }, + { + "epoch": 0.7664, + "grad_norm": 0.3580448734708473, + "learning_rate": 2.72831278996732e-05, + "loss": 0.5871, + "step": 4311 + }, + { + "epoch": 0.7665777777777778, + "grad_norm": 0.3458516402497339, + "learning_rate": 2.724361330978157e-05, + "loss": 0.5535, + "step": 4312 + }, + { + "epoch": 0.7667555555555555, + "grad_norm": 0.3493775043646141, + "learning_rate": 2.72041228423798e-05, + "loss": 0.538, + "step": 4313 + }, + { + "epoch": 0.7669333333333334, + "grad_norm": 0.38591941572637034, + "learning_rate": 2.7164656510561026e-05, + "loss": 0.571, + "step": 4314 + }, + { + "epoch": 0.7671111111111111, + "grad_norm": 0.34755620415215, + "learning_rate": 2.7125214327410354e-05, + "loss": 0.5964, + "step": 4315 + }, + { + "epoch": 0.7672888888888889, + "grad_norm": 0.37337456092855464, + "learning_rate": 2.7085796306004906e-05, + "loss": 0.577, + "step": 4316 + }, + { + "epoch": 0.7674666666666666, + "grad_norm": 0.3393712759076397, + "learning_rate": 2.70464024594138e-05, + "loss": 0.5622, + "step": 4317 + }, + { + "epoch": 0.7676444444444445, + "grad_norm": 0.3472721818167264, + "learning_rate": 2.7007032800698105e-05, + "loss": 0.5587, + "step": 4318 + }, + { + "epoch": 0.7678222222222222, + "grad_norm": 0.33382249429187827, + "learning_rate": 2.6967687342910898e-05, + "loss": 0.5763, + "step": 4319 + }, + { + "epoch": 0.768, + "grad_norm": 0.3345773200547266, + "learning_rate": 2.6928366099097235e-05, + "loss": 0.5203, + "step": 4320 + }, + { + "epoch": 0.7681777777777777, + "grad_norm": 0.36781694157511413, + "learning_rate": 2.6889069082294114e-05, + "loss": 0.586, + "step": 4321 + }, + { + "epoch": 0.7683555555555556, + "grad_norm": 0.37297300973136055, + "learning_rate": 2.6849796305530538e-05, + "loss": 0.5745, + "step": 4322 + }, + { + "epoch": 0.7685333333333333, + "grad_norm": 0.3556462465455715, + "learning_rate": 2.681054778182748e-05, + "loss": 0.5866, + "step": 4323 + }, + { + "epoch": 0.7687111111111111, + "grad_norm": 0.3484887184658991, + "learning_rate": 2.6771323524197756e-05, + "loss": 0.5885, + "step": 4324 + }, + { + "epoch": 0.7688888888888888, + "grad_norm": 0.35259251401627323, + "learning_rate": 2.6732123545646347e-05, + "loss": 0.5827, + "step": 4325 + }, + { + "epoch": 0.7690666666666667, + "grad_norm": 0.35727181634412014, + "learning_rate": 2.669294785916995e-05, + "loss": 0.5419, + "step": 4326 + }, + { + "epoch": 0.7692444444444444, + "grad_norm": 0.4038358436186105, + "learning_rate": 2.6653796477757432e-05, + "loss": 0.5643, + "step": 4327 + }, + { + "epoch": 0.7694222222222222, + "grad_norm": 0.3348318890939429, + "learning_rate": 2.661466941438938e-05, + "loss": 0.5379, + "step": 4328 + }, + { + "epoch": 0.7696, + "grad_norm": 0.3314200102478855, + "learning_rate": 2.6575566682038556e-05, + "loss": 0.5795, + "step": 4329 + }, + { + "epoch": 0.7697777777777778, + "grad_norm": 0.34230045633343376, + "learning_rate": 2.6536488293669392e-05, + "loss": 0.5316, + "step": 4330 + }, + { + "epoch": 0.7699555555555555, + "grad_norm": 0.34712692150174207, + "learning_rate": 2.649743426223853e-05, + "loss": 0.5689, + "step": 4331 + }, + { + "epoch": 0.7701333333333333, + "grad_norm": 0.3637005118566602, + "learning_rate": 2.6458404600694263e-05, + "loss": 0.5732, + "step": 4332 + }, + { + "epoch": 0.7703111111111111, + "grad_norm": 0.3409605257991041, + "learning_rate": 2.6419399321977058e-05, + "loss": 0.5673, + "step": 4333 + }, + { + "epoch": 0.7704888888888889, + "grad_norm": 0.34102074326844156, + "learning_rate": 2.6380418439019062e-05, + "loss": 0.5411, + "step": 4334 + }, + { + "epoch": 0.7706666666666667, + "grad_norm": 0.41449092255940173, + "learning_rate": 2.6341461964744508e-05, + "loss": 0.558, + "step": 4335 + }, + { + "epoch": 0.7708444444444444, + "grad_norm": 0.35129929624361794, + "learning_rate": 2.6302529912069452e-05, + "loss": 0.5214, + "step": 4336 + }, + { + "epoch": 0.7710222222222223, + "grad_norm": 0.3440983635006046, + "learning_rate": 2.626362229390189e-05, + "loss": 0.558, + "step": 4337 + }, + { + "epoch": 0.7712, + "grad_norm": 0.367434874537577, + "learning_rate": 2.6224739123141684e-05, + "loss": 0.5552, + "step": 4338 + }, + { + "epoch": 0.7713777777777778, + "grad_norm": 0.3479638399219766, + "learning_rate": 2.618588041268063e-05, + "loss": 0.5925, + "step": 4339 + }, + { + "epoch": 0.7715555555555556, + "grad_norm": 0.36234312790855217, + "learning_rate": 2.6147046175402368e-05, + "loss": 0.5792, + "step": 4340 + }, + { + "epoch": 0.7717333333333334, + "grad_norm": 0.3348044773849246, + "learning_rate": 2.6108236424182465e-05, + "loss": 0.5454, + "step": 4341 + }, + { + "epoch": 0.7719111111111111, + "grad_norm": 0.36110873069662, + "learning_rate": 2.6069451171888336e-05, + "loss": 0.597, + "step": 4342 + }, + { + "epoch": 0.7720888888888889, + "grad_norm": 0.3653360457710555, + "learning_rate": 2.6030690431379312e-05, + "loss": 0.5769, + "step": 4343 + }, + { + "epoch": 0.7722666666666667, + "grad_norm": 0.36745737378190635, + "learning_rate": 2.5991954215506563e-05, + "loss": 0.5943, + "step": 4344 + }, + { + "epoch": 0.7724444444444445, + "grad_norm": 0.35139256054606194, + "learning_rate": 2.5953242537113142e-05, + "loss": 0.5625, + "step": 4345 + }, + { + "epoch": 0.7726222222222222, + "grad_norm": 0.3727655064714463, + "learning_rate": 2.591455540903397e-05, + "loss": 0.5547, + "step": 4346 + }, + { + "epoch": 0.7728, + "grad_norm": 0.3649092121675021, + "learning_rate": 2.587589284409583e-05, + "loss": 0.6164, + "step": 4347 + }, + { + "epoch": 0.7729777777777778, + "grad_norm": 0.36540280578138495, + "learning_rate": 2.583725485511729e-05, + "loss": 0.5565, + "step": 4348 + }, + { + "epoch": 0.7731555555555556, + "grad_norm": 0.340857849254035, + "learning_rate": 2.5798641454908944e-05, + "loss": 0.5985, + "step": 4349 + }, + { + "epoch": 0.7733333333333333, + "grad_norm": 0.4160572957314163, + "learning_rate": 2.5760052656273002e-05, + "loss": 0.558, + "step": 4350 + }, + { + "epoch": 0.7735111111111111, + "grad_norm": 0.361466601989381, + "learning_rate": 2.572148847200375e-05, + "loss": 0.6073, + "step": 4351 + }, + { + "epoch": 0.7736888888888889, + "grad_norm": 0.33777502252947933, + "learning_rate": 2.5682948914887106e-05, + "loss": 0.5166, + "step": 4352 + }, + { + "epoch": 0.7738666666666667, + "grad_norm": 0.7659817761463893, + "learning_rate": 2.564443399770101e-05, + "loss": 0.5562, + "step": 4353 + }, + { + "epoch": 0.7740444444444444, + "grad_norm": 0.3576573310205456, + "learning_rate": 2.5605943733215042e-05, + "loss": 0.5464, + "step": 4354 + }, + { + "epoch": 0.7742222222222223, + "grad_norm": 0.3488566127820092, + "learning_rate": 2.5567478134190824e-05, + "loss": 0.5681, + "step": 4355 + }, + { + "epoch": 0.7744, + "grad_norm": 0.33697213028749157, + "learning_rate": 2.5529037213381545e-05, + "loss": 0.5372, + "step": 4356 + }, + { + "epoch": 0.7745777777777778, + "grad_norm": 0.3347178334244202, + "learning_rate": 2.5490620983532497e-05, + "loss": 0.5966, + "step": 4357 + }, + { + "epoch": 0.7747555555555555, + "grad_norm": 0.3862298275640225, + "learning_rate": 2.545222945738053e-05, + "loss": 0.5907, + "step": 4358 + }, + { + "epoch": 0.7749333333333334, + "grad_norm": 0.379651023808002, + "learning_rate": 2.541386264765444e-05, + "loss": 0.6125, + "step": 4359 + }, + { + "epoch": 0.7751111111111111, + "grad_norm": 0.3518332473303472, + "learning_rate": 2.537552056707483e-05, + "loss": 0.5513, + "step": 4360 + }, + { + "epoch": 0.7752888888888889, + "grad_norm": 0.358073703256899, + "learning_rate": 2.5337203228354035e-05, + "loss": 0.5361, + "step": 4361 + }, + { + "epoch": 0.7754666666666666, + "grad_norm": 0.36338966574155696, + "learning_rate": 2.529891064419625e-05, + "loss": 0.5621, + "step": 4362 + }, + { + "epoch": 0.7756444444444445, + "grad_norm": 0.35607301106378386, + "learning_rate": 2.5260642827297444e-05, + "loss": 0.5564, + "step": 4363 + }, + { + "epoch": 0.7758222222222222, + "grad_norm": 0.33652985093432897, + "learning_rate": 2.5222399790345354e-05, + "loss": 0.5385, + "step": 4364 + }, + { + "epoch": 0.776, + "grad_norm": 0.3457816274153398, + "learning_rate": 2.5184181546019515e-05, + "loss": 0.553, + "step": 4365 + }, + { + "epoch": 0.7761777777777777, + "grad_norm": 0.34582483496806654, + "learning_rate": 2.514598810699126e-05, + "loss": 0.5392, + "step": 4366 + }, + { + "epoch": 0.7763555555555556, + "grad_norm": 0.3527195791056369, + "learning_rate": 2.5107819485923668e-05, + "loss": 0.5961, + "step": 4367 + }, + { + "epoch": 0.7765333333333333, + "grad_norm": 0.37363506435994237, + "learning_rate": 2.5069675695471617e-05, + "loss": 0.5517, + "step": 4368 + }, + { + "epoch": 0.7767111111111111, + "grad_norm": 0.36456307277282124, + "learning_rate": 2.5031556748281715e-05, + "loss": 0.5622, + "step": 4369 + }, + { + "epoch": 0.7768888888888889, + "grad_norm": 0.3335076885305479, + "learning_rate": 2.4993462656992384e-05, + "loss": 0.5848, + "step": 4370 + }, + { + "epoch": 0.7770666666666667, + "grad_norm": 0.3520408549118235, + "learning_rate": 2.4955393434233754e-05, + "loss": 0.5585, + "step": 4371 + }, + { + "epoch": 0.7772444444444444, + "grad_norm": 0.35848023015856245, + "learning_rate": 2.4917349092627752e-05, + "loss": 0.6004, + "step": 4372 + }, + { + "epoch": 0.7774222222222222, + "grad_norm": 0.34380938237682085, + "learning_rate": 2.4879329644788053e-05, + "loss": 0.5771, + "step": 4373 + }, + { + "epoch": 0.7776, + "grad_norm": 0.3743374585688741, + "learning_rate": 2.4841335103319972e-05, + "loss": 0.5436, + "step": 4374 + }, + { + "epoch": 0.7777777777777778, + "grad_norm": 0.3390044209994759, + "learning_rate": 2.4803365480820785e-05, + "loss": 0.5625, + "step": 4375 + }, + { + "epoch": 0.7779555555555555, + "grad_norm": 0.3652287624288249, + "learning_rate": 2.4765420789879257e-05, + "loss": 0.6076, + "step": 4376 + }, + { + "epoch": 0.7781333333333333, + "grad_norm": 0.3631401587485521, + "learning_rate": 2.4727501043076128e-05, + "loss": 0.6241, + "step": 4377 + }, + { + "epoch": 0.7783111111111111, + "grad_norm": 0.39266179299732584, + "learning_rate": 2.4689606252983623e-05, + "loss": 0.5936, + "step": 4378 + }, + { + "epoch": 0.7784888888888889, + "grad_norm": 0.3390134965453679, + "learning_rate": 2.465173643216594e-05, + "loss": 0.5304, + "step": 4379 + }, + { + "epoch": 0.7786666666666666, + "grad_norm": 0.36430903628025196, + "learning_rate": 2.4613891593178752e-05, + "loss": 0.5727, + "step": 4380 + }, + { + "epoch": 0.7788444444444445, + "grad_norm": 0.36216824153035626, + "learning_rate": 2.4576071748569695e-05, + "loss": 0.6094, + "step": 4381 + }, + { + "epoch": 0.7790222222222222, + "grad_norm": 0.3594210930056404, + "learning_rate": 2.45382769108779e-05, + "loss": 0.5745, + "step": 4382 + }, + { + "epoch": 0.7792, + "grad_norm": 0.35199554147819706, + "learning_rate": 2.4500507092634338e-05, + "loss": 0.5746, + "step": 4383 + }, + { + "epoch": 0.7793777777777777, + "grad_norm": 0.3675465305250382, + "learning_rate": 2.4462762306361654e-05, + "loss": 0.5677, + "step": 4384 + }, + { + "epoch": 0.7795555555555556, + "grad_norm": 0.39778150996015027, + "learning_rate": 2.4425042564574184e-05, + "loss": 0.6007, + "step": 4385 + }, + { + "epoch": 0.7797333333333333, + "grad_norm": 0.341386610907, + "learning_rate": 2.4387347879777955e-05, + "loss": 0.5412, + "step": 4386 + }, + { + "epoch": 0.7799111111111111, + "grad_norm": 0.36750685066754046, + "learning_rate": 2.434967826447072e-05, + "loss": 0.5777, + "step": 4387 + }, + { + "epoch": 0.7800888888888889, + "grad_norm": 0.3492894710233438, + "learning_rate": 2.431203373114187e-05, + "loss": 0.5351, + "step": 4388 + }, + { + "epoch": 0.7802666666666667, + "grad_norm": 0.37279206195034853, + "learning_rate": 2.427441429227253e-05, + "loss": 0.5897, + "step": 4389 + }, + { + "epoch": 0.7804444444444445, + "grad_norm": 0.37704310937474866, + "learning_rate": 2.4236819960335476e-05, + "loss": 0.6039, + "step": 4390 + }, + { + "epoch": 0.7806222222222222, + "grad_norm": 0.3341105513119436, + "learning_rate": 2.4199250747795154e-05, + "loss": 0.5557, + "step": 4391 + }, + { + "epoch": 0.7808, + "grad_norm": 0.36680834655227373, + "learning_rate": 2.41617066671077e-05, + "loss": 0.6013, + "step": 4392 + }, + { + "epoch": 0.7809777777777778, + "grad_norm": 0.34670883620130083, + "learning_rate": 2.4124187730720917e-05, + "loss": 0.5573, + "step": 4393 + }, + { + "epoch": 0.7811555555555556, + "grad_norm": 0.3569936970649313, + "learning_rate": 2.4086693951074247e-05, + "loss": 0.5742, + "step": 4394 + }, + { + "epoch": 0.7813333333333333, + "grad_norm": 0.3478642969035027, + "learning_rate": 2.4049225340598835e-05, + "loss": 0.5634, + "step": 4395 + }, + { + "epoch": 0.7815111111111112, + "grad_norm": 0.3828596888038592, + "learning_rate": 2.4011781911717436e-05, + "loss": 0.568, + "step": 4396 + }, + { + "epoch": 0.7816888888888889, + "grad_norm": 0.4140886851492242, + "learning_rate": 2.3974363676844503e-05, + "loss": 0.5715, + "step": 4397 + }, + { + "epoch": 0.7818666666666667, + "grad_norm": 0.3707233917101248, + "learning_rate": 2.3936970648386038e-05, + "loss": 0.6198, + "step": 4398 + }, + { + "epoch": 0.7820444444444444, + "grad_norm": 0.34824057998777835, + "learning_rate": 2.3899602838739864e-05, + "loss": 0.553, + "step": 4399 + }, + { + "epoch": 0.7822222222222223, + "grad_norm": 0.3529386332257371, + "learning_rate": 2.386226026029521e-05, + "loss": 0.5924, + "step": 4400 + }, + { + "epoch": 0.7824, + "grad_norm": 0.34867121025093356, + "learning_rate": 2.382494292543319e-05, + "loss": 0.5934, + "step": 4401 + }, + { + "epoch": 0.7825777777777778, + "grad_norm": 0.35736787093786476, + "learning_rate": 2.378765084652631e-05, + "loss": 0.6343, + "step": 4402 + }, + { + "epoch": 0.7827555555555555, + "grad_norm": 0.5891866621211168, + "learning_rate": 2.3750384035938922e-05, + "loss": 0.5611, + "step": 4403 + }, + { + "epoch": 0.7829333333333334, + "grad_norm": 0.3528181470032474, + "learning_rate": 2.3713142506026786e-05, + "loss": 0.5825, + "step": 4404 + }, + { + "epoch": 0.7831111111111111, + "grad_norm": 0.358480746142133, + "learning_rate": 2.3675926269137495e-05, + "loss": 0.5968, + "step": 4405 + }, + { + "epoch": 0.7832888888888889, + "grad_norm": 0.355470594033632, + "learning_rate": 2.363873533761005e-05, + "loss": 0.5891, + "step": 4406 + }, + { + "epoch": 0.7834666666666666, + "grad_norm": 0.33530432665158394, + "learning_rate": 2.360156972377522e-05, + "loss": 0.5421, + "step": 4407 + }, + { + "epoch": 0.7836444444444445, + "grad_norm": 0.335572489127656, + "learning_rate": 2.3564429439955303e-05, + "loss": 0.5477, + "step": 4408 + }, + { + "epoch": 0.7838222222222222, + "grad_norm": 0.4286231148471792, + "learning_rate": 2.3527314498464215e-05, + "loss": 0.5755, + "step": 4409 + }, + { + "epoch": 0.784, + "grad_norm": 0.36012731462924885, + "learning_rate": 2.3490224911607473e-05, + "loss": 0.5955, + "step": 4410 + }, + { + "epoch": 0.7841777777777778, + "grad_norm": 0.3485587472638476, + "learning_rate": 2.3453160691682197e-05, + "loss": 0.5263, + "step": 4411 + }, + { + "epoch": 0.7843555555555556, + "grad_norm": 0.3578893285300129, + "learning_rate": 2.3416121850977056e-05, + "loss": 0.5919, + "step": 4412 + }, + { + "epoch": 0.7845333333333333, + "grad_norm": 0.35010437123247085, + "learning_rate": 2.3379108401772365e-05, + "loss": 0.6142, + "step": 4413 + }, + { + "epoch": 0.7847111111111111, + "grad_norm": 0.3816677020835402, + "learning_rate": 2.334212035633997e-05, + "loss": 0.5826, + "step": 4414 + }, + { + "epoch": 0.7848888888888889, + "grad_norm": 0.354311571518565, + "learning_rate": 2.3305157726943327e-05, + "loss": 0.5465, + "step": 4415 + }, + { + "epoch": 0.7850666666666667, + "grad_norm": 0.3994530359459713, + "learning_rate": 2.3268220525837437e-05, + "loss": 0.6093, + "step": 4416 + }, + { + "epoch": 0.7852444444444444, + "grad_norm": 0.34590691159766396, + "learning_rate": 2.3231308765268888e-05, + "loss": 0.5938, + "step": 4417 + }, + { + "epoch": 0.7854222222222222, + "grad_norm": 0.34528876416571636, + "learning_rate": 2.319442245747584e-05, + "loss": 0.564, + "step": 4418 + }, + { + "epoch": 0.7856, + "grad_norm": 0.36273041096313324, + "learning_rate": 2.3157561614687995e-05, + "loss": 0.5657, + "step": 4419 + }, + { + "epoch": 0.7857777777777778, + "grad_norm": 0.3627294462984221, + "learning_rate": 2.312072624912662e-05, + "loss": 0.5636, + "step": 4420 + }, + { + "epoch": 0.7859555555555555, + "grad_norm": 0.33780571528960485, + "learning_rate": 2.3083916373004577e-05, + "loss": 0.5602, + "step": 4421 + }, + { + "epoch": 0.7861333333333334, + "grad_norm": 0.3300356177276644, + "learning_rate": 2.3047131998526138e-05, + "loss": 0.561, + "step": 4422 + }, + { + "epoch": 0.7863111111111111, + "grad_norm": 0.3498848742243878, + "learning_rate": 2.301037313788733e-05, + "loss": 0.5564, + "step": 4423 + }, + { + "epoch": 0.7864888888888889, + "grad_norm": 0.37315436482006004, + "learning_rate": 2.2973639803275503e-05, + "loss": 0.5794, + "step": 4424 + }, + { + "epoch": 0.7866666666666666, + "grad_norm": 0.3518328979317764, + "learning_rate": 2.293693200686976e-05, + "loss": 0.533, + "step": 4425 + }, + { + "epoch": 0.7868444444444445, + "grad_norm": 0.3508973530782063, + "learning_rate": 2.290024976084052e-05, + "loss": 0.5176, + "step": 4426 + }, + { + "epoch": 0.7870222222222222, + "grad_norm": 0.33838435672805656, + "learning_rate": 2.2863593077349944e-05, + "loss": 0.5472, + "step": 4427 + }, + { + "epoch": 0.7872, + "grad_norm": 0.4071923342645993, + "learning_rate": 2.2826961968551486e-05, + "loss": 0.5441, + "step": 4428 + }, + { + "epoch": 0.7873777777777777, + "grad_norm": 0.35679042741907685, + "learning_rate": 2.2790356446590377e-05, + "loss": 0.5634, + "step": 4429 + }, + { + "epoch": 0.7875555555555556, + "grad_norm": 0.33146076529201013, + "learning_rate": 2.275377652360312e-05, + "loss": 0.5218, + "step": 4430 + }, + { + "epoch": 0.7877333333333333, + "grad_norm": 0.34986509909729746, + "learning_rate": 2.2717222211717935e-05, + "loss": 0.5217, + "step": 4431 + }, + { + "epoch": 0.7879111111111111, + "grad_norm": 0.3430071867379447, + "learning_rate": 2.2680693523054407e-05, + "loss": 0.549, + "step": 4432 + }, + { + "epoch": 0.7880888888888888, + "grad_norm": 0.3529690519671126, + "learning_rate": 2.264419046972368e-05, + "loss": 0.5343, + "step": 4433 + }, + { + "epoch": 0.7882666666666667, + "grad_norm": 0.33175317404133, + "learning_rate": 2.2607713063828394e-05, + "loss": 0.5063, + "step": 4434 + }, + { + "epoch": 0.7884444444444444, + "grad_norm": 0.3653405417984674, + "learning_rate": 2.2571261317462712e-05, + "loss": 0.5478, + "step": 4435 + }, + { + "epoch": 0.7886222222222222, + "grad_norm": 0.3520472676591341, + "learning_rate": 2.253483524271225e-05, + "loss": 0.6156, + "step": 4436 + }, + { + "epoch": 0.7888, + "grad_norm": 0.4013144518995045, + "learning_rate": 2.2498434851654126e-05, + "loss": 0.5401, + "step": 4437 + }, + { + "epoch": 0.7889777777777778, + "grad_norm": 0.34867366665252647, + "learning_rate": 2.2462060156356956e-05, + "loss": 0.5148, + "step": 4438 + }, + { + "epoch": 0.7891555555555556, + "grad_norm": 0.3658504765354854, + "learning_rate": 2.2425711168880814e-05, + "loss": 0.5599, + "step": 4439 + }, + { + "epoch": 0.7893333333333333, + "grad_norm": 0.3778239547034521, + "learning_rate": 2.238938790127727e-05, + "loss": 0.5778, + "step": 4440 + }, + { + "epoch": 0.7895111111111112, + "grad_norm": 0.34462757856381065, + "learning_rate": 2.2353090365589348e-05, + "loss": 0.5594, + "step": 4441 + }, + { + "epoch": 0.7896888888888889, + "grad_norm": 0.3513516798827274, + "learning_rate": 2.2316818573851563e-05, + "loss": 0.6061, + "step": 4442 + }, + { + "epoch": 0.7898666666666667, + "grad_norm": 0.3836923635197959, + "learning_rate": 2.2280572538089872e-05, + "loss": 0.5424, + "step": 4443 + }, + { + "epoch": 0.7900444444444444, + "grad_norm": 0.36243972197616764, + "learning_rate": 2.224435227032171e-05, + "loss": 0.5948, + "step": 4444 + }, + { + "epoch": 0.7902222222222223, + "grad_norm": 0.34519664165156827, + "learning_rate": 2.220815778255596e-05, + "loss": 0.5423, + "step": 4445 + }, + { + "epoch": 0.7904, + "grad_norm": 0.3537625173251878, + "learning_rate": 2.2171989086792956e-05, + "loss": 0.6044, + "step": 4446 + }, + { + "epoch": 0.7905777777777778, + "grad_norm": 0.37719169929058854, + "learning_rate": 2.2135846195024513e-05, + "loss": 0.5711, + "step": 4447 + }, + { + "epoch": 0.7907555555555555, + "grad_norm": 0.3487790925801628, + "learning_rate": 2.209972911923377e-05, + "loss": 0.5417, + "step": 4448 + }, + { + "epoch": 0.7909333333333334, + "grad_norm": 0.34635768874747586, + "learning_rate": 2.2063637871395527e-05, + "loss": 0.5523, + "step": 4449 + }, + { + "epoch": 0.7911111111111111, + "grad_norm": 0.34840796124579476, + "learning_rate": 2.2027572463475764e-05, + "loss": 0.5756, + "step": 4450 + }, + { + "epoch": 0.7912888888888889, + "grad_norm": 0.350523468511347, + "learning_rate": 2.1991532907432145e-05, + "loss": 0.5587, + "step": 4451 + }, + { + "epoch": 0.7914666666666667, + "grad_norm": 0.3508473901939546, + "learning_rate": 2.1955519215213527e-05, + "loss": 0.5143, + "step": 4452 + }, + { + "epoch": 0.7916444444444445, + "grad_norm": 0.344343383859007, + "learning_rate": 2.1919531398760408e-05, + "loss": 0.5938, + "step": 4453 + }, + { + "epoch": 0.7918222222222222, + "grad_norm": 0.3725443311322793, + "learning_rate": 2.1883569470004485e-05, + "loss": 0.5852, + "step": 4454 + }, + { + "epoch": 0.792, + "grad_norm": 0.35888135186522996, + "learning_rate": 2.184763344086912e-05, + "loss": 0.5959, + "step": 4455 + }, + { + "epoch": 0.7921777777777778, + "grad_norm": 0.359225900311012, + "learning_rate": 2.1811723323268863e-05, + "loss": 0.6027, + "step": 4456 + }, + { + "epoch": 0.7923555555555556, + "grad_norm": 0.333395479636865, + "learning_rate": 2.177583912910979e-05, + "loss": 0.5632, + "step": 4457 + }, + { + "epoch": 0.7925333333333333, + "grad_norm": 0.3278509639443351, + "learning_rate": 2.173998087028938e-05, + "loss": 0.5462, + "step": 4458 + }, + { + "epoch": 0.7927111111111111, + "grad_norm": 0.3479318303442947, + "learning_rate": 2.170414855869647e-05, + "loss": 0.5633, + "step": 4459 + }, + { + "epoch": 0.7928888888888889, + "grad_norm": 0.35204816863943117, + "learning_rate": 2.1668342206211322e-05, + "loss": 0.5601, + "step": 4460 + }, + { + "epoch": 0.7930666666666667, + "grad_norm": 0.3520211187152833, + "learning_rate": 2.16325618247056e-05, + "loss": 0.5489, + "step": 4461 + }, + { + "epoch": 0.7932444444444444, + "grad_norm": 0.368070055178014, + "learning_rate": 2.159680742604234e-05, + "loss": 0.5655, + "step": 4462 + }, + { + "epoch": 0.7934222222222223, + "grad_norm": 0.3408177353118567, + "learning_rate": 2.1561079022075947e-05, + "loss": 0.5553, + "step": 4463 + }, + { + "epoch": 0.7936, + "grad_norm": 0.35001564237225685, + "learning_rate": 2.152537662465226e-05, + "loss": 0.5618, + "step": 4464 + }, + { + "epoch": 0.7937777777777778, + "grad_norm": 0.3670652395227683, + "learning_rate": 2.1489700245608437e-05, + "loss": 0.5728, + "step": 4465 + }, + { + "epoch": 0.7939555555555555, + "grad_norm": 0.35997823203905266, + "learning_rate": 2.145404989677303e-05, + "loss": 0.54, + "step": 4466 + }, + { + "epoch": 0.7941333333333334, + "grad_norm": 0.36123954651578766, + "learning_rate": 2.1418425589965996e-05, + "loss": 0.5568, + "step": 4467 + }, + { + "epoch": 0.7943111111111111, + "grad_norm": 0.3359671692245637, + "learning_rate": 2.1382827336998602e-05, + "loss": 0.5395, + "step": 4468 + }, + { + "epoch": 0.7944888888888889, + "grad_norm": 0.3368593471730129, + "learning_rate": 2.1347255149673505e-05, + "loss": 0.5389, + "step": 4469 + }, + { + "epoch": 0.7946666666666666, + "grad_norm": 0.3534071868639474, + "learning_rate": 2.1311709039784734e-05, + "loss": 0.5515, + "step": 4470 + }, + { + "epoch": 0.7948444444444445, + "grad_norm": 0.3748811392799846, + "learning_rate": 2.1276189019117677e-05, + "loss": 0.5618, + "step": 4471 + }, + { + "epoch": 0.7950222222222222, + "grad_norm": 0.3360369108761957, + "learning_rate": 2.1240695099448947e-05, + "loss": 0.5413, + "step": 4472 + }, + { + "epoch": 0.7952, + "grad_norm": 0.3601178206035112, + "learning_rate": 2.1205227292546747e-05, + "loss": 0.5686, + "step": 4473 + }, + { + "epoch": 0.7953777777777777, + "grad_norm": 0.40071102305441203, + "learning_rate": 2.1169785610170356e-05, + "loss": 0.6048, + "step": 4474 + }, + { + "epoch": 0.7955555555555556, + "grad_norm": 0.32914596369978133, + "learning_rate": 2.113437006407062e-05, + "loss": 0.535, + "step": 4475 + }, + { + "epoch": 0.7957333333333333, + "grad_norm": 0.33948756585130024, + "learning_rate": 2.1098980665989532e-05, + "loss": 0.5461, + "step": 4476 + }, + { + "epoch": 0.7959111111111111, + "grad_norm": 1.6253968183443543, + "learning_rate": 2.1063617427660575e-05, + "loss": 0.5511, + "step": 4477 + }, + { + "epoch": 0.7960888888888888, + "grad_norm": 0.33518783280188064, + "learning_rate": 2.1028280360808407e-05, + "loss": 0.516, + "step": 4478 + }, + { + "epoch": 0.7962666666666667, + "grad_norm": 0.33894736764834865, + "learning_rate": 2.0992969477149183e-05, + "loss": 0.5607, + "step": 4479 + }, + { + "epoch": 0.7964444444444444, + "grad_norm": 0.3362547070357395, + "learning_rate": 2.0957684788390187e-05, + "loss": 0.5588, + "step": 4480 + }, + { + "epoch": 0.7966222222222222, + "grad_norm": 0.36097925977397793, + "learning_rate": 2.092242630623016e-05, + "loss": 0.6071, + "step": 4481 + }, + { + "epoch": 0.7968, + "grad_norm": 0.3672843359144088, + "learning_rate": 2.0887194042359083e-05, + "loss": 0.5533, + "step": 4482 + }, + { + "epoch": 0.7969777777777778, + "grad_norm": 0.36099009906248897, + "learning_rate": 2.0851988008458278e-05, + "loss": 0.5738, + "step": 4483 + }, + { + "epoch": 0.7971555555555555, + "grad_norm": 0.33563502326317596, + "learning_rate": 2.0816808216200358e-05, + "loss": 0.5555, + "step": 4484 + }, + { + "epoch": 0.7973333333333333, + "grad_norm": 0.3819935557582957, + "learning_rate": 2.078165467724924e-05, + "loss": 0.602, + "step": 4485 + }, + { + "epoch": 0.7975111111111111, + "grad_norm": 0.3716518783836378, + "learning_rate": 2.074652740326013e-05, + "loss": 0.5997, + "step": 4486 + }, + { + "epoch": 0.7976888888888889, + "grad_norm": 0.359379858939067, + "learning_rate": 2.071142640587952e-05, + "loss": 0.5673, + "step": 4487 + }, + { + "epoch": 0.7978666666666666, + "grad_norm": 0.3369182512690324, + "learning_rate": 2.06763516967452e-05, + "loss": 0.6033, + "step": 4488 + }, + { + "epoch": 0.7980444444444444, + "grad_norm": 0.3737307299317438, + "learning_rate": 2.064130328748626e-05, + "loss": 0.5042, + "step": 4489 + }, + { + "epoch": 0.7982222222222223, + "grad_norm": 0.3531128111052518, + "learning_rate": 2.060628118972303e-05, + "loss": 0.5788, + "step": 4490 + }, + { + "epoch": 0.7984, + "grad_norm": 0.3443491998389885, + "learning_rate": 2.0571285415067164e-05, + "loss": 0.5528, + "step": 4491 + }, + { + "epoch": 0.7985777777777778, + "grad_norm": 0.3538940388148492, + "learning_rate": 2.0536315975121544e-05, + "loss": 0.582, + "step": 4492 + }, + { + "epoch": 0.7987555555555556, + "grad_norm": 0.3615864338705043, + "learning_rate": 2.050137288148035e-05, + "loss": 0.5618, + "step": 4493 + }, + { + "epoch": 0.7989333333333334, + "grad_norm": 0.37572026154617716, + "learning_rate": 2.0466456145729007e-05, + "loss": 0.566, + "step": 4494 + }, + { + "epoch": 0.7991111111111111, + "grad_norm": 0.35576437549087414, + "learning_rate": 2.043156577944425e-05, + "loss": 0.5694, + "step": 4495 + }, + { + "epoch": 0.7992888888888889, + "grad_norm": 0.38746520990094374, + "learning_rate": 2.039670179419395e-05, + "loss": 0.6312, + "step": 4496 + }, + { + "epoch": 0.7994666666666667, + "grad_norm": 0.37006287789317643, + "learning_rate": 2.036186420153743e-05, + "loss": 0.5911, + "step": 4497 + }, + { + "epoch": 0.7996444444444445, + "grad_norm": 0.35691504053422074, + "learning_rate": 2.032705301302501e-05, + "loss": 0.5755, + "step": 4498 + }, + { + "epoch": 0.7998222222222222, + "grad_norm": 0.42304185796233584, + "learning_rate": 2.029226824019853e-05, + "loss": 0.5525, + "step": 4499 + }, + { + "epoch": 0.8, + "grad_norm": 0.33026419052579314, + "learning_rate": 2.025750989459081e-05, + "loss": 0.5484, + "step": 4500 + }, + { + "epoch": 0.8001777777777778, + "grad_norm": 0.3461808966618599, + "learning_rate": 2.022277798772614e-05, + "loss": 0.5496, + "step": 4501 + }, + { + "epoch": 0.8003555555555556, + "grad_norm": 0.4358772247852598, + "learning_rate": 2.018807253111984e-05, + "loss": 0.5523, + "step": 4502 + }, + { + "epoch": 0.8005333333333333, + "grad_norm": 0.35519831242472266, + "learning_rate": 2.0153393536278653e-05, + "loss": 0.5928, + "step": 4503 + }, + { + "epoch": 0.8007111111111112, + "grad_norm": 0.333958091319544, + "learning_rate": 2.0118741014700372e-05, + "loss": 0.5458, + "step": 4504 + }, + { + "epoch": 0.8008888888888889, + "grad_norm": 0.740108692807291, + "learning_rate": 2.0084114977874135e-05, + "loss": 0.5655, + "step": 4505 + }, + { + "epoch": 0.8010666666666667, + "grad_norm": 0.3508957376837852, + "learning_rate": 2.004951543728023e-05, + "loss": 0.5666, + "step": 4506 + }, + { + "epoch": 0.8012444444444444, + "grad_norm": 0.3360677832305794, + "learning_rate": 2.0014942404390214e-05, + "loss": 0.5376, + "step": 4507 + }, + { + "epoch": 0.8014222222222223, + "grad_norm": 0.3467867005964346, + "learning_rate": 1.99803958906668e-05, + "loss": 0.528, + "step": 4508 + }, + { + "epoch": 0.8016, + "grad_norm": 0.3712939416264039, + "learning_rate": 1.994587590756397e-05, + "loss": 0.6204, + "step": 4509 + }, + { + "epoch": 0.8017777777777778, + "grad_norm": 0.35065132253225106, + "learning_rate": 1.991138246652685e-05, + "loss": 0.5979, + "step": 4510 + }, + { + "epoch": 0.8019555555555555, + "grad_norm": 0.3396135587110741, + "learning_rate": 1.9876915578991808e-05, + "loss": 0.5266, + "step": 4511 + }, + { + "epoch": 0.8021333333333334, + "grad_norm": 0.3568987090779943, + "learning_rate": 1.9842475256386384e-05, + "loss": 0.5692, + "step": 4512 + }, + { + "epoch": 0.8023111111111111, + "grad_norm": 0.356532759147842, + "learning_rate": 1.9808061510129317e-05, + "loss": 0.5551, + "step": 4513 + }, + { + "epoch": 0.8024888888888889, + "grad_norm": 0.4643125194046535, + "learning_rate": 1.9773674351630545e-05, + "loss": 0.5584, + "step": 4514 + }, + { + "epoch": 0.8026666666666666, + "grad_norm": 0.35714688138901224, + "learning_rate": 1.973931379229118e-05, + "loss": 0.5629, + "step": 4515 + }, + { + "epoch": 0.8028444444444445, + "grad_norm": 0.4130592376490037, + "learning_rate": 1.970497984350351e-05, + "loss": 0.5994, + "step": 4516 + }, + { + "epoch": 0.8030222222222222, + "grad_norm": 0.34797770338119544, + "learning_rate": 1.967067251665101e-05, + "loss": 0.5304, + "step": 4517 + }, + { + "epoch": 0.8032, + "grad_norm": 0.33840676355723787, + "learning_rate": 1.9636391823108335e-05, + "loss": 0.5841, + "step": 4518 + }, + { + "epoch": 0.8033777777777777, + "grad_norm": 0.32757421432600875, + "learning_rate": 1.9602137774241326e-05, + "loss": 0.5391, + "step": 4519 + }, + { + "epoch": 0.8035555555555556, + "grad_norm": 0.39733954575909924, + "learning_rate": 1.9567910381406875e-05, + "loss": 0.5517, + "step": 4520 + }, + { + "epoch": 0.8037333333333333, + "grad_norm": 0.34193620603567504, + "learning_rate": 1.9533709655953235e-05, + "loss": 0.5303, + "step": 4521 + }, + { + "epoch": 0.8039111111111111, + "grad_norm": 0.3432225042695941, + "learning_rate": 1.94995356092196e-05, + "loss": 0.5446, + "step": 4522 + }, + { + "epoch": 0.8040888888888889, + "grad_norm": 0.3872453157838041, + "learning_rate": 1.9465388252536543e-05, + "loss": 0.5415, + "step": 4523 + }, + { + "epoch": 0.8042666666666667, + "grad_norm": 0.508534416595769, + "learning_rate": 1.9431267597225568e-05, + "loss": 0.5429, + "step": 4524 + }, + { + "epoch": 0.8044444444444444, + "grad_norm": 0.35620050998119435, + "learning_rate": 1.939717365459952e-05, + "loss": 0.5617, + "step": 4525 + }, + { + "epoch": 0.8046222222222222, + "grad_norm": 0.34453399955177694, + "learning_rate": 1.9363106435962197e-05, + "loss": 0.6012, + "step": 4526 + }, + { + "epoch": 0.8048, + "grad_norm": 0.3720643563949712, + "learning_rate": 1.932906595260874e-05, + "loss": 0.5632, + "step": 4527 + }, + { + "epoch": 0.8049777777777778, + "grad_norm": 0.35443001911900646, + "learning_rate": 1.9295052215825228e-05, + "loss": 0.5429, + "step": 4528 + }, + { + "epoch": 0.8051555555555555, + "grad_norm": 0.34949477323600175, + "learning_rate": 1.9261065236889066e-05, + "loss": 0.5337, + "step": 4529 + }, + { + "epoch": 0.8053333333333333, + "grad_norm": 0.36482230095932267, + "learning_rate": 1.9227105027068603e-05, + "loss": 0.5483, + "step": 4530 + }, + { + "epoch": 0.8055111111111111, + "grad_norm": 0.3606895951567287, + "learning_rate": 1.9193171597623437e-05, + "loss": 0.5604, + "step": 4531 + }, + { + "epoch": 0.8056888888888889, + "grad_norm": 0.3530758010834439, + "learning_rate": 1.9159264959804247e-05, + "loss": 0.5352, + "step": 4532 + }, + { + "epoch": 0.8058666666666666, + "grad_norm": 0.34552307378285196, + "learning_rate": 1.9125385124852813e-05, + "loss": 0.5537, + "step": 4533 + }, + { + "epoch": 0.8060444444444445, + "grad_norm": 0.3457387904227504, + "learning_rate": 1.9091532104002052e-05, + "loss": 0.5378, + "step": 4534 + }, + { + "epoch": 0.8062222222222222, + "grad_norm": 0.3760289220757086, + "learning_rate": 1.9057705908475998e-05, + "loss": 0.6071, + "step": 4535 + }, + { + "epoch": 0.8064, + "grad_norm": 0.3481072292087357, + "learning_rate": 1.9023906549489767e-05, + "loss": 0.5649, + "step": 4536 + }, + { + "epoch": 0.8065777777777777, + "grad_norm": 0.36904692154365754, + "learning_rate": 1.8990134038249585e-05, + "loss": 0.5675, + "step": 4537 + }, + { + "epoch": 0.8067555555555556, + "grad_norm": 0.34327763958380925, + "learning_rate": 1.8956388385952772e-05, + "loss": 0.5583, + "step": 4538 + }, + { + "epoch": 0.8069333333333333, + "grad_norm": 0.38008301820958273, + "learning_rate": 1.8922669603787778e-05, + "loss": 0.5433, + "step": 4539 + }, + { + "epoch": 0.8071111111111111, + "grad_norm": 0.35588886442620565, + "learning_rate": 1.8888977702934085e-05, + "loss": 0.5648, + "step": 4540 + }, + { + "epoch": 0.8072888888888888, + "grad_norm": 0.3612943294548691, + "learning_rate": 1.885531269456231e-05, + "loss": 0.5634, + "step": 4541 + }, + { + "epoch": 0.8074666666666667, + "grad_norm": 0.3463952388177529, + "learning_rate": 1.8821674589834136e-05, + "loss": 0.5301, + "step": 4542 + }, + { + "epoch": 0.8076444444444445, + "grad_norm": 0.3436568356015561, + "learning_rate": 1.8788063399902333e-05, + "loss": 0.5811, + "step": 4543 + }, + { + "epoch": 0.8078222222222222, + "grad_norm": 0.35952420836401094, + "learning_rate": 1.875447913591073e-05, + "loss": 0.6169, + "step": 4544 + }, + { + "epoch": 0.808, + "grad_norm": 0.3513013344849414, + "learning_rate": 1.8720921808994263e-05, + "loss": 0.5715, + "step": 4545 + }, + { + "epoch": 0.8081777777777778, + "grad_norm": 0.3301564649722469, + "learning_rate": 1.8687391430278845e-05, + "loss": 0.5266, + "step": 4546 + }, + { + "epoch": 0.8083555555555556, + "grad_norm": 0.3782404872532981, + "learning_rate": 1.8653888010881637e-05, + "loss": 0.5864, + "step": 4547 + }, + { + "epoch": 0.8085333333333333, + "grad_norm": 0.3601340175097413, + "learning_rate": 1.862041156191062e-05, + "loss": 0.5488, + "step": 4548 + }, + { + "epoch": 0.8087111111111112, + "grad_norm": 0.34018039796512534, + "learning_rate": 1.8586962094465098e-05, + "loss": 0.5577, + "step": 4549 + }, + { + "epoch": 0.8088888888888889, + "grad_norm": 0.3499380589292849, + "learning_rate": 1.8553539619635153e-05, + "loss": 0.599, + "step": 4550 + }, + { + "epoch": 0.8090666666666667, + "grad_norm": 0.3757626733167595, + "learning_rate": 1.852014414850218e-05, + "loss": 0.5762, + "step": 4551 + }, + { + "epoch": 0.8092444444444444, + "grad_norm": 0.349663344518537, + "learning_rate": 1.8486775692138403e-05, + "loss": 0.5639, + "step": 4552 + }, + { + "epoch": 0.8094222222222223, + "grad_norm": 0.359312478214904, + "learning_rate": 1.8453434261607273e-05, + "loss": 0.5801, + "step": 4553 + }, + { + "epoch": 0.8096, + "grad_norm": 0.3755293871556833, + "learning_rate": 1.8420119867963116e-05, + "loss": 0.5651, + "step": 4554 + }, + { + "epoch": 0.8097777777777778, + "grad_norm": 0.3546961413125564, + "learning_rate": 1.8386832522251397e-05, + "loss": 0.5486, + "step": 4555 + }, + { + "epoch": 0.8099555555555555, + "grad_norm": 0.36844581318373004, + "learning_rate": 1.8353572235508576e-05, + "loss": 0.5301, + "step": 4556 + }, + { + "epoch": 0.8101333333333334, + "grad_norm": 0.3474790977287327, + "learning_rate": 1.8320339018762167e-05, + "loss": 0.524, + "step": 4557 + }, + { + "epoch": 0.8103111111111111, + "grad_norm": 0.3495217777342104, + "learning_rate": 1.82871328830307e-05, + "loss": 0.5156, + "step": 4558 + }, + { + "epoch": 0.8104888888888889, + "grad_norm": 0.3431425975340414, + "learning_rate": 1.825395383932369e-05, + "loss": 0.5431, + "step": 4559 + }, + { + "epoch": 0.8106666666666666, + "grad_norm": 0.3639500557288192, + "learning_rate": 1.8220801898641726e-05, + "loss": 0.564, + "step": 4560 + }, + { + "epoch": 0.8108444444444445, + "grad_norm": 0.34495475604132514, + "learning_rate": 1.818767707197636e-05, + "loss": 0.5284, + "step": 4561 + }, + { + "epoch": 0.8110222222222222, + "grad_norm": 0.38018742762442376, + "learning_rate": 1.815457937031021e-05, + "loss": 0.5618, + "step": 4562 + }, + { + "epoch": 0.8112, + "grad_norm": 0.36094902973346266, + "learning_rate": 1.812150880461684e-05, + "loss": 0.5685, + "step": 4563 + }, + { + "epoch": 0.8113777777777778, + "grad_norm": 0.3945584192507859, + "learning_rate": 1.8088465385860854e-05, + "loss": 0.5818, + "step": 4564 + }, + { + "epoch": 0.8115555555555556, + "grad_norm": 0.35973079911753336, + "learning_rate": 1.805544912499786e-05, + "loss": 0.5541, + "step": 4565 + }, + { + "epoch": 0.8117333333333333, + "grad_norm": 0.3479356219760524, + "learning_rate": 1.802246003297443e-05, + "loss": 0.5107, + "step": 4566 + }, + { + "epoch": 0.8119111111111111, + "grad_norm": 0.3470924510788464, + "learning_rate": 1.7989498120728164e-05, + "loss": 0.5638, + "step": 4567 + }, + { + "epoch": 0.8120888888888889, + "grad_norm": 0.34803119178180775, + "learning_rate": 1.795656339918762e-05, + "loss": 0.5609, + "step": 4568 + }, + { + "epoch": 0.8122666666666667, + "grad_norm": 0.3407490409641714, + "learning_rate": 1.7923655879272393e-05, + "loss": 0.5312, + "step": 4569 + }, + { + "epoch": 0.8124444444444444, + "grad_norm": 0.33705007966929834, + "learning_rate": 1.7890775571892936e-05, + "loss": 0.5946, + "step": 4570 + }, + { + "epoch": 0.8126222222222222, + "grad_norm": 0.33340039400453747, + "learning_rate": 1.7857922487950874e-05, + "loss": 0.5092, + "step": 4571 + }, + { + "epoch": 0.8128, + "grad_norm": 0.36565609451371067, + "learning_rate": 1.782509663833858e-05, + "loss": 0.5415, + "step": 4572 + }, + { + "epoch": 0.8129777777777778, + "grad_norm": 0.3670837480893755, + "learning_rate": 1.7792298033939625e-05, + "loss": 0.5683, + "step": 4573 + }, + { + "epoch": 0.8131555555555555, + "grad_norm": 0.37623611333394774, + "learning_rate": 1.7759526685628335e-05, + "loss": 0.5958, + "step": 4574 + }, + { + "epoch": 0.8133333333333334, + "grad_norm": 0.34590026979819816, + "learning_rate": 1.772678260427021e-05, + "loss": 0.5395, + "step": 4575 + }, + { + "epoch": 0.8135111111111111, + "grad_norm": 0.3586870247485163, + "learning_rate": 1.7694065800721483e-05, + "loss": 0.5395, + "step": 4576 + }, + { + "epoch": 0.8136888888888889, + "grad_norm": 0.37581326706659135, + "learning_rate": 1.7661376285829568e-05, + "loss": 0.5226, + "step": 4577 + }, + { + "epoch": 0.8138666666666666, + "grad_norm": 0.37136573441191323, + "learning_rate": 1.762871407043264e-05, + "loss": 0.5617, + "step": 4578 + }, + { + "epoch": 0.8140444444444445, + "grad_norm": 0.374560526700098, + "learning_rate": 1.7596079165359935e-05, + "loss": 0.5139, + "step": 4579 + }, + { + "epoch": 0.8142222222222222, + "grad_norm": 0.4127415183735608, + "learning_rate": 1.7563471581431624e-05, + "loss": 0.6367, + "step": 4580 + }, + { + "epoch": 0.8144, + "grad_norm": 0.3746678968592116, + "learning_rate": 1.7530891329458764e-05, + "loss": 0.5527, + "step": 4581 + }, + { + "epoch": 0.8145777777777777, + "grad_norm": 0.3688458977138624, + "learning_rate": 1.7498338420243422e-05, + "loss": 0.6242, + "step": 4582 + }, + { + "epoch": 0.8147555555555556, + "grad_norm": 0.3508663867382437, + "learning_rate": 1.7465812864578534e-05, + "loss": 0.5451, + "step": 4583 + }, + { + "epoch": 0.8149333333333333, + "grad_norm": 0.3665796277621071, + "learning_rate": 1.7433314673248024e-05, + "loss": 0.5438, + "step": 4584 + }, + { + "epoch": 0.8151111111111111, + "grad_norm": 0.340318077502143, + "learning_rate": 1.7400843857026705e-05, + "loss": 0.572, + "step": 4585 + }, + { + "epoch": 0.8152888888888888, + "grad_norm": 0.37114039250681113, + "learning_rate": 1.736840042668032e-05, + "loss": 0.5812, + "step": 4586 + }, + { + "epoch": 0.8154666666666667, + "grad_norm": 0.3707407810397685, + "learning_rate": 1.7335984392965545e-05, + "loss": 0.5779, + "step": 4587 + }, + { + "epoch": 0.8156444444444444, + "grad_norm": 0.5453313783553445, + "learning_rate": 1.7303595766629955e-05, + "loss": 0.5474, + "step": 4588 + }, + { + "epoch": 0.8158222222222222, + "grad_norm": 0.3656498505838504, + "learning_rate": 1.7271234558412052e-05, + "loss": 0.6156, + "step": 4589 + }, + { + "epoch": 0.816, + "grad_norm": 0.3539140344372221, + "learning_rate": 1.7238900779041255e-05, + "loss": 0.5713, + "step": 4590 + }, + { + "epoch": 0.8161777777777778, + "grad_norm": 0.395913449562202, + "learning_rate": 1.7206594439237865e-05, + "loss": 0.5406, + "step": 4591 + }, + { + "epoch": 0.8163555555555555, + "grad_norm": 0.3410503372642445, + "learning_rate": 1.7174315549713104e-05, + "loss": 0.5618, + "step": 4592 + }, + { + "epoch": 0.8165333333333333, + "grad_norm": 0.3471925479117926, + "learning_rate": 1.714206412116911e-05, + "loss": 0.593, + "step": 4593 + }, + { + "epoch": 0.8167111111111112, + "grad_norm": 0.3226915829367458, + "learning_rate": 1.7109840164298807e-05, + "loss": 0.5123, + "step": 4594 + }, + { + "epoch": 0.8168888888888889, + "grad_norm": 0.3676559287810124, + "learning_rate": 1.7077643689786215e-05, + "loss": 0.596, + "step": 4595 + }, + { + "epoch": 0.8170666666666667, + "grad_norm": 0.36599664024044093, + "learning_rate": 1.704547470830601e-05, + "loss": 0.5846, + "step": 4596 + }, + { + "epoch": 0.8172444444444444, + "grad_norm": 0.3776913498622677, + "learning_rate": 1.7013333230523976e-05, + "loss": 0.5545, + "step": 4597 + }, + { + "epoch": 0.8174222222222223, + "grad_norm": 0.35568090088044013, + "learning_rate": 1.698121926709656e-05, + "loss": 0.5608, + "step": 4598 + }, + { + "epoch": 0.8176, + "grad_norm": 0.3556319602517602, + "learning_rate": 1.69491328286713e-05, + "loss": 0.5871, + "step": 4599 + }, + { + "epoch": 0.8177777777777778, + "grad_norm": 0.3726760571642367, + "learning_rate": 1.6917073925886406e-05, + "loss": 0.5893, + "step": 4600 + }, + { + "epoch": 0.8179555555555555, + "grad_norm": 0.36508890664754523, + "learning_rate": 1.6885042569371146e-05, + "loss": 0.5783, + "step": 4601 + }, + { + "epoch": 0.8181333333333334, + "grad_norm": 0.3568767846657704, + "learning_rate": 1.6853038769745467e-05, + "loss": 0.5191, + "step": 4602 + }, + { + "epoch": 0.8183111111111111, + "grad_norm": 0.37184830502349037, + "learning_rate": 1.6821062537620356e-05, + "loss": 0.559, + "step": 4603 + }, + { + "epoch": 0.8184888888888889, + "grad_norm": 0.33484449060124793, + "learning_rate": 1.6789113883597595e-05, + "loss": 0.5413, + "step": 4604 + }, + { + "epoch": 0.8186666666666667, + "grad_norm": 0.39855687864076433, + "learning_rate": 1.6757192818269708e-05, + "loss": 0.586, + "step": 4605 + }, + { + "epoch": 0.8188444444444445, + "grad_norm": 0.37639534985133716, + "learning_rate": 1.6725299352220282e-05, + "loss": 0.5533, + "step": 4606 + }, + { + "epoch": 0.8190222222222222, + "grad_norm": 0.33750942372096643, + "learning_rate": 1.6693433496023546e-05, + "loss": 0.5423, + "step": 4607 + }, + { + "epoch": 0.8192, + "grad_norm": 0.3447269393525857, + "learning_rate": 1.6661595260244767e-05, + "loss": 0.5476, + "step": 4608 + }, + { + "epoch": 0.8193777777777778, + "grad_norm": 0.35257298750261873, + "learning_rate": 1.6629784655439872e-05, + "loss": 0.5369, + "step": 4609 + }, + { + "epoch": 0.8195555555555556, + "grad_norm": 0.3715824000515777, + "learning_rate": 1.6598001692155807e-05, + "loss": 0.5388, + "step": 4610 + }, + { + "epoch": 0.8197333333333333, + "grad_norm": 0.3529065893526655, + "learning_rate": 1.656624638093016e-05, + "loss": 0.5671, + "step": 4611 + }, + { + "epoch": 0.8199111111111111, + "grad_norm": 0.3531961411263078, + "learning_rate": 1.653451873229156e-05, + "loss": 0.5724, + "step": 4612 + }, + { + "epoch": 0.8200888888888889, + "grad_norm": 0.35740476891326806, + "learning_rate": 1.6502818756759276e-05, + "loss": 0.5906, + "step": 4613 + }, + { + "epoch": 0.8202666666666667, + "grad_norm": 0.3598842387726135, + "learning_rate": 1.64711464648435e-05, + "loss": 0.5572, + "step": 4614 + }, + { + "epoch": 0.8204444444444444, + "grad_norm": 0.3616519161404355, + "learning_rate": 1.6439501867045236e-05, + "loss": 0.5886, + "step": 4615 + }, + { + "epoch": 0.8206222222222223, + "grad_norm": 0.33252267792247087, + "learning_rate": 1.6407884973856313e-05, + "loss": 0.5437, + "step": 4616 + }, + { + "epoch": 0.8208, + "grad_norm": 0.36718085592714184, + "learning_rate": 1.6376295795759333e-05, + "loss": 0.5955, + "step": 4617 + }, + { + "epoch": 0.8209777777777778, + "grad_norm": 0.3768386185088991, + "learning_rate": 1.634473434322775e-05, + "loss": 0.5769, + "step": 4618 + }, + { + "epoch": 0.8211555555555555, + "grad_norm": 0.6255451876089728, + "learning_rate": 1.6313200626725812e-05, + "loss": 0.5602, + "step": 4619 + }, + { + "epoch": 0.8213333333333334, + "grad_norm": 0.3602775393772923, + "learning_rate": 1.6281694656708568e-05, + "loss": 0.5261, + "step": 4620 + }, + { + "epoch": 0.8215111111111111, + "grad_norm": 0.35791091421792165, + "learning_rate": 1.6250216443621867e-05, + "loss": 0.5811, + "step": 4621 + }, + { + "epoch": 0.8216888888888889, + "grad_norm": 0.32343170089489137, + "learning_rate": 1.6218765997902362e-05, + "loss": 0.5004, + "step": 4622 + }, + { + "epoch": 0.8218666666666666, + "grad_norm": 0.35279187509927945, + "learning_rate": 1.61873433299775e-05, + "loss": 0.5708, + "step": 4623 + }, + { + "epoch": 0.8220444444444445, + "grad_norm": 0.3538295935007023, + "learning_rate": 1.61559484502655e-05, + "loss": 0.5079, + "step": 4624 + }, + { + "epoch": 0.8222222222222222, + "grad_norm": 0.3606148985539107, + "learning_rate": 1.6124581369175396e-05, + "loss": 0.5821, + "step": 4625 + }, + { + "epoch": 0.8224, + "grad_norm": 0.35663529012171563, + "learning_rate": 1.6093242097106986e-05, + "loss": 0.5112, + "step": 4626 + }, + { + "epoch": 0.8225777777777777, + "grad_norm": 0.3423582697935085, + "learning_rate": 1.606193064445085e-05, + "loss": 0.5808, + "step": 4627 + }, + { + "epoch": 0.8227555555555556, + "grad_norm": 0.35298797109406516, + "learning_rate": 1.6030647021588373e-05, + "loss": 0.5641, + "step": 4628 + }, + { + "epoch": 0.8229333333333333, + "grad_norm": 0.33778769108216783, + "learning_rate": 1.5999391238891616e-05, + "loss": 0.5288, + "step": 4629 + }, + { + "epoch": 0.8231111111111111, + "grad_norm": 0.3715260785403377, + "learning_rate": 1.5968163306723572e-05, + "loss": 0.5933, + "step": 4630 + }, + { + "epoch": 0.8232888888888888, + "grad_norm": 0.35288695134186454, + "learning_rate": 1.593696323543783e-05, + "loss": 0.5968, + "step": 4631 + }, + { + "epoch": 0.8234666666666667, + "grad_norm": 0.33637348736462946, + "learning_rate": 1.590579103537889e-05, + "loss": 0.5678, + "step": 4632 + }, + { + "epoch": 0.8236444444444444, + "grad_norm": 0.3543964142635851, + "learning_rate": 1.587464671688187e-05, + "loss": 0.579, + "step": 4633 + }, + { + "epoch": 0.8238222222222222, + "grad_norm": 0.3533556402221236, + "learning_rate": 1.58435302902728e-05, + "loss": 0.5574, + "step": 4634 + }, + { + "epoch": 0.824, + "grad_norm": 0.3824685468415375, + "learning_rate": 1.5812441765868292e-05, + "loss": 0.6465, + "step": 4635 + }, + { + "epoch": 0.8241777777777778, + "grad_norm": 0.35155479081329283, + "learning_rate": 1.578138115397587e-05, + "loss": 0.5558, + "step": 4636 + }, + { + "epoch": 0.8243555555555555, + "grad_norm": 0.3403565334633738, + "learning_rate": 1.5750348464893683e-05, + "loss": 0.5111, + "step": 4637 + }, + { + "epoch": 0.8245333333333333, + "grad_norm": 0.35526059669149385, + "learning_rate": 1.571934370891066e-05, + "loss": 0.5524, + "step": 4638 + }, + { + "epoch": 0.8247111111111111, + "grad_norm": 0.34540197164354736, + "learning_rate": 1.5688366896306494e-05, + "loss": 0.5879, + "step": 4639 + }, + { + "epoch": 0.8248888888888889, + "grad_norm": 0.34040304986408365, + "learning_rate": 1.565741803735159e-05, + "loss": 0.546, + "step": 4640 + }, + { + "epoch": 0.8250666666666666, + "grad_norm": 0.39170139567938245, + "learning_rate": 1.5626497142307084e-05, + "loss": 0.5821, + "step": 4641 + }, + { + "epoch": 0.8252444444444444, + "grad_norm": 0.338488145541669, + "learning_rate": 1.5595604221424852e-05, + "loss": 0.5558, + "step": 4642 + }, + { + "epoch": 0.8254222222222222, + "grad_norm": 0.3704198341446206, + "learning_rate": 1.5564739284947484e-05, + "loss": 0.5431, + "step": 4643 + }, + { + "epoch": 0.8256, + "grad_norm": 0.33742454014359957, + "learning_rate": 1.5533902343108286e-05, + "loss": 0.5433, + "step": 4644 + }, + { + "epoch": 0.8257777777777778, + "grad_norm": 0.35108012187141485, + "learning_rate": 1.550309340613132e-05, + "loss": 0.5702, + "step": 4645 + }, + { + "epoch": 0.8259555555555556, + "grad_norm": 0.36221545891823625, + "learning_rate": 1.547231248423132e-05, + "loss": 0.5193, + "step": 4646 + }, + { + "epoch": 0.8261333333333334, + "grad_norm": 0.4450807898838353, + "learning_rate": 1.544155958761374e-05, + "loss": 0.5527, + "step": 4647 + }, + { + "epoch": 0.8263111111111111, + "grad_norm": 0.3711891715977658, + "learning_rate": 1.5410834726474756e-05, + "loss": 0.5343, + "step": 4648 + }, + { + "epoch": 0.8264888888888889, + "grad_norm": 0.4097167895986613, + "learning_rate": 1.5380137911001248e-05, + "loss": 0.5802, + "step": 4649 + }, + { + "epoch": 0.8266666666666667, + "grad_norm": 0.36882713887124335, + "learning_rate": 1.5349469151370776e-05, + "loss": 0.588, + "step": 4650 + }, + { + "epoch": 0.8268444444444445, + "grad_norm": 0.35511304058788473, + "learning_rate": 1.5318828457751634e-05, + "loss": 0.5543, + "step": 4651 + }, + { + "epoch": 0.8270222222222222, + "grad_norm": 0.3357813184958634, + "learning_rate": 1.52882158403028e-05, + "loss": 0.5075, + "step": 4652 + }, + { + "epoch": 0.8272, + "grad_norm": 0.33540687207754266, + "learning_rate": 1.525763130917387e-05, + "loss": 0.5171, + "step": 4653 + }, + { + "epoch": 0.8273777777777778, + "grad_norm": 0.35509188796866664, + "learning_rate": 1.5227074874505276e-05, + "loss": 0.5958, + "step": 4654 + }, + { + "epoch": 0.8275555555555556, + "grad_norm": 0.36599668887657677, + "learning_rate": 1.519654654642796e-05, + "loss": 0.5607, + "step": 4655 + }, + { + "epoch": 0.8277333333333333, + "grad_norm": 0.3680427195730148, + "learning_rate": 1.5166046335063733e-05, + "loss": 0.5643, + "step": 4656 + }, + { + "epoch": 0.8279111111111112, + "grad_norm": 0.371580746616385, + "learning_rate": 1.5135574250524897e-05, + "loss": 0.5486, + "step": 4657 + }, + { + "epoch": 0.8280888888888889, + "grad_norm": 0.36303029268511805, + "learning_rate": 1.5105130302914594e-05, + "loss": 0.536, + "step": 4658 + }, + { + "epoch": 0.8282666666666667, + "grad_norm": 0.3422375504765315, + "learning_rate": 1.5074714502326492e-05, + "loss": 0.5249, + "step": 4659 + }, + { + "epoch": 0.8284444444444444, + "grad_norm": 0.35983951449013896, + "learning_rate": 1.504432685884506e-05, + "loss": 0.5779, + "step": 4660 + }, + { + "epoch": 0.8286222222222223, + "grad_norm": 0.47904230310632456, + "learning_rate": 1.5013967382545324e-05, + "loss": 0.5386, + "step": 4661 + }, + { + "epoch": 0.8288, + "grad_norm": 0.3622219941103378, + "learning_rate": 1.4983636083493014e-05, + "loss": 0.6015, + "step": 4662 + }, + { + "epoch": 0.8289777777777778, + "grad_norm": 0.32784605092850366, + "learning_rate": 1.4953332971744538e-05, + "loss": 0.5337, + "step": 4663 + }, + { + "epoch": 0.8291555555555555, + "grad_norm": 0.36675632596644536, + "learning_rate": 1.4923058057346929e-05, + "loss": 0.6006, + "step": 4664 + }, + { + "epoch": 0.8293333333333334, + "grad_norm": 0.34932092477537297, + "learning_rate": 1.4892811350337876e-05, + "loss": 0.5006, + "step": 4665 + }, + { + "epoch": 0.8295111111111111, + "grad_norm": 0.3709937673398102, + "learning_rate": 1.4862592860745728e-05, + "loss": 0.5573, + "step": 4666 + }, + { + "epoch": 0.8296888888888889, + "grad_norm": 0.3465718005692071, + "learning_rate": 1.4832402598589479e-05, + "loss": 0.5674, + "step": 4667 + }, + { + "epoch": 0.8298666666666666, + "grad_norm": 0.3766189999754378, + "learning_rate": 1.4802240573878733e-05, + "loss": 0.5824, + "step": 4668 + }, + { + "epoch": 0.8300444444444445, + "grad_norm": 0.3345696668177558, + "learning_rate": 1.4772106796613772e-05, + "loss": 0.4999, + "step": 4669 + }, + { + "epoch": 0.8302222222222222, + "grad_norm": 0.3530872814444731, + "learning_rate": 1.4742001276785488e-05, + "loss": 0.5865, + "step": 4670 + }, + { + "epoch": 0.8304, + "grad_norm": 0.3446780163398616, + "learning_rate": 1.4711924024375422e-05, + "loss": 0.5636, + "step": 4671 + }, + { + "epoch": 0.8305777777777777, + "grad_norm": 0.3295083179724757, + "learning_rate": 1.468187504935572e-05, + "loss": 0.5664, + "step": 4672 + }, + { + "epoch": 0.8307555555555556, + "grad_norm": 0.3577560216218309, + "learning_rate": 1.4651854361689178e-05, + "loss": 0.5677, + "step": 4673 + }, + { + "epoch": 0.8309333333333333, + "grad_norm": 0.33439144923605224, + "learning_rate": 1.4621861971329187e-05, + "loss": 0.5893, + "step": 4674 + }, + { + "epoch": 0.8311111111111111, + "grad_norm": 0.35681904113401597, + "learning_rate": 1.4591897888219764e-05, + "loss": 0.5512, + "step": 4675 + }, + { + "epoch": 0.8312888888888889, + "grad_norm": 0.3357108046587461, + "learning_rate": 1.4561962122295591e-05, + "loss": 0.5385, + "step": 4676 + }, + { + "epoch": 0.8314666666666667, + "grad_norm": 0.35755230277570965, + "learning_rate": 1.4532054683481832e-05, + "loss": 0.559, + "step": 4677 + }, + { + "epoch": 0.8316444444444444, + "grad_norm": 0.3579772552384265, + "learning_rate": 1.4502175581694443e-05, + "loss": 0.5535, + "step": 4678 + }, + { + "epoch": 0.8318222222222222, + "grad_norm": 0.34125427623228594, + "learning_rate": 1.447232482683979e-05, + "loss": 0.5493, + "step": 4679 + }, + { + "epoch": 0.832, + "grad_norm": 0.3413170267507868, + "learning_rate": 1.444250242881503e-05, + "loss": 0.5737, + "step": 4680 + }, + { + "epoch": 0.8321777777777778, + "grad_norm": 0.5051103282140549, + "learning_rate": 1.4412708397507724e-05, + "loss": 0.5698, + "step": 4681 + }, + { + "epoch": 0.8323555555555555, + "grad_norm": 0.3626556516412714, + "learning_rate": 1.4382942742796223e-05, + "loss": 0.5961, + "step": 4682 + }, + { + "epoch": 0.8325333333333333, + "grad_norm": 0.3549541391723931, + "learning_rate": 1.4353205474549291e-05, + "loss": 0.56, + "step": 4683 + }, + { + "epoch": 0.8327111111111111, + "grad_norm": 0.38606667689986, + "learning_rate": 1.4323496602626452e-05, + "loss": 0.5902, + "step": 4684 + }, + { + "epoch": 0.8328888888888889, + "grad_norm": 0.3426806560213806, + "learning_rate": 1.4293816136877637e-05, + "loss": 0.5625, + "step": 4685 + }, + { + "epoch": 0.8330666666666666, + "grad_norm": 0.38067762170165054, + "learning_rate": 1.4264164087143539e-05, + "loss": 0.5785, + "step": 4686 + }, + { + "epoch": 0.8332444444444445, + "grad_norm": 0.3456259157854561, + "learning_rate": 1.4234540463255263e-05, + "loss": 0.5616, + "step": 4687 + }, + { + "epoch": 0.8334222222222222, + "grad_norm": 0.3416700683997748, + "learning_rate": 1.4204945275034598e-05, + "loss": 0.5257, + "step": 4688 + }, + { + "epoch": 0.8336, + "grad_norm": 0.39219776182546007, + "learning_rate": 1.417537853229387e-05, + "loss": 0.5746, + "step": 4689 + }, + { + "epoch": 0.8337777777777777, + "grad_norm": 0.35035732783692075, + "learning_rate": 1.4145840244835983e-05, + "loss": 0.5536, + "step": 4690 + }, + { + "epoch": 0.8339555555555556, + "grad_norm": 0.8101289168537157, + "learning_rate": 1.4116330422454394e-05, + "loss": 0.5424, + "step": 4691 + }, + { + "epoch": 0.8341333333333333, + "grad_norm": 0.3731380113997993, + "learning_rate": 1.408684907493314e-05, + "loss": 0.5434, + "step": 4692 + }, + { + "epoch": 0.8343111111111111, + "grad_norm": 0.34209317962476193, + "learning_rate": 1.4057396212046791e-05, + "loss": 0.5847, + "step": 4693 + }, + { + "epoch": 0.8344888888888888, + "grad_norm": 0.36687853786095037, + "learning_rate": 1.4027971843560494e-05, + "loss": 0.5787, + "step": 4694 + }, + { + "epoch": 0.8346666666666667, + "grad_norm": 0.3708913832996026, + "learning_rate": 1.3998575979229944e-05, + "loss": 0.5504, + "step": 4695 + }, + { + "epoch": 0.8348444444444444, + "grad_norm": 0.3609894673592285, + "learning_rate": 1.3969208628801388e-05, + "loss": 0.5582, + "step": 4696 + }, + { + "epoch": 0.8350222222222222, + "grad_norm": 0.34447180561894797, + "learning_rate": 1.3939869802011618e-05, + "loss": 0.5484, + "step": 4697 + }, + { + "epoch": 0.8352, + "grad_norm": 0.4006985868252093, + "learning_rate": 1.391055950858795e-05, + "loss": 0.6479, + "step": 4698 + }, + { + "epoch": 0.8353777777777778, + "grad_norm": 0.3573360529017714, + "learning_rate": 1.3881277758248267e-05, + "loss": 0.5531, + "step": 4699 + }, + { + "epoch": 0.8355555555555556, + "grad_norm": 0.33213851664597305, + "learning_rate": 1.3852024560700982e-05, + "loss": 0.5513, + "step": 4700 + }, + { + "epoch": 0.8357333333333333, + "grad_norm": 0.41589753370762306, + "learning_rate": 1.3822799925645036e-05, + "loss": 0.5824, + "step": 4701 + }, + { + "epoch": 0.8359111111111112, + "grad_norm": 0.3976682442153046, + "learning_rate": 1.379360386276991e-05, + "loss": 0.5795, + "step": 4702 + }, + { + "epoch": 0.8360888888888889, + "grad_norm": 0.3763522462606709, + "learning_rate": 1.376443638175554e-05, + "loss": 0.5923, + "step": 4703 + }, + { + "epoch": 0.8362666666666667, + "grad_norm": 0.37430688082659974, + "learning_rate": 1.373529749227256e-05, + "loss": 0.604, + "step": 4704 + }, + { + "epoch": 0.8364444444444444, + "grad_norm": 0.3366737118690265, + "learning_rate": 1.370618720398189e-05, + "loss": 0.5547, + "step": 4705 + }, + { + "epoch": 0.8366222222222223, + "grad_norm": 0.33683852653720964, + "learning_rate": 1.3677105526535194e-05, + "loss": 0.5751, + "step": 4706 + }, + { + "epoch": 0.8368, + "grad_norm": 0.3478412773567568, + "learning_rate": 1.3648052469574469e-05, + "loss": 0.5882, + "step": 4707 + }, + { + "epoch": 0.8369777777777778, + "grad_norm": 0.34079237360477427, + "learning_rate": 1.3619028042732373e-05, + "loss": 0.5661, + "step": 4708 + }, + { + "epoch": 0.8371555555555555, + "grad_norm": 0.3692521434195599, + "learning_rate": 1.3590032255631912e-05, + "loss": 0.5918, + "step": 4709 + }, + { + "epoch": 0.8373333333333334, + "grad_norm": 0.34799369761895843, + "learning_rate": 1.3561065117886783e-05, + "loss": 0.5549, + "step": 4710 + }, + { + "epoch": 0.8375111111111111, + "grad_norm": 0.33730414740936115, + "learning_rate": 1.3532126639100995e-05, + "loss": 0.5323, + "step": 4711 + }, + { + "epoch": 0.8376888888888889, + "grad_norm": 0.3452876062552767, + "learning_rate": 1.3503216828869192e-05, + "loss": 0.5286, + "step": 4712 + }, + { + "epoch": 0.8378666666666666, + "grad_norm": 0.3535072696111852, + "learning_rate": 1.3474335696776453e-05, + "loss": 0.5694, + "step": 4713 + }, + { + "epoch": 0.8380444444444445, + "grad_norm": 0.35005022069257885, + "learning_rate": 1.344548325239835e-05, + "loss": 0.5851, + "step": 4714 + }, + { + "epoch": 0.8382222222222222, + "grad_norm": 0.3657775293257551, + "learning_rate": 1.3416659505300977e-05, + "loss": 0.5443, + "step": 4715 + }, + { + "epoch": 0.8384, + "grad_norm": 0.3453286227490814, + "learning_rate": 1.3387864465040877e-05, + "loss": 0.5563, + "step": 4716 + }, + { + "epoch": 0.8385777777777778, + "grad_norm": 0.3341158471072853, + "learning_rate": 1.3359098141165093e-05, + "loss": 0.5348, + "step": 4717 + }, + { + "epoch": 0.8387555555555556, + "grad_norm": 0.3546163479163679, + "learning_rate": 1.333036054321114e-05, + "loss": 0.5437, + "step": 4718 + }, + { + "epoch": 0.8389333333333333, + "grad_norm": 0.3737540011008875, + "learning_rate": 1.3301651680707018e-05, + "loss": 0.5805, + "step": 4719 + }, + { + "epoch": 0.8391111111111111, + "grad_norm": 0.3722643561649962, + "learning_rate": 1.3272971563171189e-05, + "loss": 0.5949, + "step": 4720 + }, + { + "epoch": 0.8392888888888889, + "grad_norm": 0.3707395251000393, + "learning_rate": 1.3244320200112592e-05, + "loss": 0.5432, + "step": 4721 + }, + { + "epoch": 0.8394666666666667, + "grad_norm": 0.37836128676167874, + "learning_rate": 1.321569760103063e-05, + "loss": 0.6212, + "step": 4722 + }, + { + "epoch": 0.8396444444444444, + "grad_norm": 0.3544421358498244, + "learning_rate": 1.3187103775415156e-05, + "loss": 0.5936, + "step": 4723 + }, + { + "epoch": 0.8398222222222222, + "grad_norm": 0.33515258921654983, + "learning_rate": 1.3158538732746517e-05, + "loss": 0.4993, + "step": 4724 + }, + { + "epoch": 0.84, + "grad_norm": 0.34699889471605555, + "learning_rate": 1.3130002482495485e-05, + "loss": 0.5541, + "step": 4725 + }, + { + "epoch": 0.8401777777777778, + "grad_norm": 0.35462966752122066, + "learning_rate": 1.3101495034123313e-05, + "loss": 0.6124, + "step": 4726 + }, + { + "epoch": 0.8403555555555555, + "grad_norm": 0.34788234745486424, + "learning_rate": 1.3073016397081638e-05, + "loss": 0.5097, + "step": 4727 + }, + { + "epoch": 0.8405333333333334, + "grad_norm": 0.3536679810195945, + "learning_rate": 1.3044566580812668e-05, + "loss": 0.556, + "step": 4728 + }, + { + "epoch": 0.8407111111111111, + "grad_norm": 0.35160306526637813, + "learning_rate": 1.3016145594748907e-05, + "loss": 0.5345, + "step": 4729 + }, + { + "epoch": 0.8408888888888889, + "grad_norm": 0.35094622854145036, + "learning_rate": 1.2987753448313456e-05, + "loss": 0.5662, + "step": 4730 + }, + { + "epoch": 0.8410666666666666, + "grad_norm": 0.3342374365740332, + "learning_rate": 1.2959390150919681e-05, + "loss": 0.571, + "step": 4731 + }, + { + "epoch": 0.8412444444444445, + "grad_norm": 0.3438614824877184, + "learning_rate": 1.2931055711971574e-05, + "loss": 0.537, + "step": 4732 + }, + { + "epoch": 0.8414222222222222, + "grad_norm": 0.37726351991721147, + "learning_rate": 1.2902750140863373e-05, + "loss": 0.5907, + "step": 4733 + }, + { + "epoch": 0.8416, + "grad_norm": 0.36288867754957466, + "learning_rate": 1.2874473446979918e-05, + "loss": 0.5746, + "step": 4734 + }, + { + "epoch": 0.8417777777777777, + "grad_norm": 0.38479088388874017, + "learning_rate": 1.2846225639696318e-05, + "loss": 0.5817, + "step": 4735 + }, + { + "epoch": 0.8419555555555556, + "grad_norm": 0.36044158051225755, + "learning_rate": 1.2818006728378219e-05, + "loss": 0.6072, + "step": 4736 + }, + { + "epoch": 0.8421333333333333, + "grad_norm": 0.3322545508931253, + "learning_rate": 1.278981672238161e-05, + "loss": 0.5309, + "step": 4737 + }, + { + "epoch": 0.8423111111111111, + "grad_norm": 0.34998865222203523, + "learning_rate": 1.276165563105296e-05, + "loss": 0.5469, + "step": 4738 + }, + { + "epoch": 0.8424888888888888, + "grad_norm": 0.3614983042391235, + "learning_rate": 1.2733523463729102e-05, + "loss": 0.5456, + "step": 4739 + }, + { + "epoch": 0.8426666666666667, + "grad_norm": 0.3621769330883001, + "learning_rate": 1.2705420229737307e-05, + "loss": 0.5717, + "step": 4740 + }, + { + "epoch": 0.8428444444444444, + "grad_norm": 0.3453553891206609, + "learning_rate": 1.2677345938395247e-05, + "loss": 0.5683, + "step": 4741 + }, + { + "epoch": 0.8430222222222222, + "grad_norm": 0.35642392507552145, + "learning_rate": 1.2649300599010993e-05, + "loss": 0.5589, + "step": 4742 + }, + { + "epoch": 0.8432, + "grad_norm": 0.36333954238460286, + "learning_rate": 1.2621284220883011e-05, + "loss": 0.6244, + "step": 4743 + }, + { + "epoch": 0.8433777777777778, + "grad_norm": 0.34318169765839596, + "learning_rate": 1.2593296813300193e-05, + "loss": 0.5744, + "step": 4744 + }, + { + "epoch": 0.8435555555555555, + "grad_norm": 0.3655856012168314, + "learning_rate": 1.2565338385541792e-05, + "loss": 0.5598, + "step": 4745 + }, + { + "epoch": 0.8437333333333333, + "grad_norm": 0.34238761557465225, + "learning_rate": 1.253740894687747e-05, + "loss": 0.5552, + "step": 4746 + }, + { + "epoch": 0.8439111111111111, + "grad_norm": 0.34912156269278777, + "learning_rate": 1.250950850656727e-05, + "loss": 0.521, + "step": 4747 + }, + { + "epoch": 0.8440888888888889, + "grad_norm": 0.3985107189538369, + "learning_rate": 1.248163707386163e-05, + "loss": 0.6066, + "step": 4748 + }, + { + "epoch": 0.8442666666666667, + "grad_norm": 0.33137349715867626, + "learning_rate": 1.2453794658001371e-05, + "loss": 0.544, + "step": 4749 + }, + { + "epoch": 0.8444444444444444, + "grad_norm": 0.35943001535811403, + "learning_rate": 1.242598126821769e-05, + "loss": 0.5482, + "step": 4750 + }, + { + "epoch": 0.8446222222222223, + "grad_norm": 0.351013040159158, + "learning_rate": 1.2398196913732118e-05, + "loss": 0.5661, + "step": 4751 + }, + { + "epoch": 0.8448, + "grad_norm": 0.3442254909732594, + "learning_rate": 1.2370441603756677e-05, + "loss": 0.57, + "step": 4752 + }, + { + "epoch": 0.8449777777777778, + "grad_norm": 0.399703745467623, + "learning_rate": 1.2342715347493594e-05, + "loss": 0.5219, + "step": 4753 + }, + { + "epoch": 0.8451555555555555, + "grad_norm": 0.33405618723654806, + "learning_rate": 1.2315018154135626e-05, + "loss": 0.542, + "step": 4754 + }, + { + "epoch": 0.8453333333333334, + "grad_norm": 0.4422806735381209, + "learning_rate": 1.2287350032865763e-05, + "loss": 0.5598, + "step": 4755 + }, + { + "epoch": 0.8455111111111111, + "grad_norm": 0.35883067303193905, + "learning_rate": 1.2259710992857465e-05, + "loss": 0.5566, + "step": 4756 + }, + { + "epoch": 0.8456888888888889, + "grad_norm": 0.3750495629289882, + "learning_rate": 1.2232101043274436e-05, + "loss": 0.5646, + "step": 4757 + }, + { + "epoch": 0.8458666666666667, + "grad_norm": 0.3679931369848527, + "learning_rate": 1.2204520193270863e-05, + "loss": 0.5854, + "step": 4758 + }, + { + "epoch": 0.8460444444444445, + "grad_norm": 0.3552643263327099, + "learning_rate": 1.2176968451991166e-05, + "loss": 0.5749, + "step": 4759 + }, + { + "epoch": 0.8462222222222222, + "grad_norm": 0.38476147490687185, + "learning_rate": 1.2149445828570195e-05, + "loss": 0.5859, + "step": 4760 + }, + { + "epoch": 0.8464, + "grad_norm": 0.3642212026187823, + "learning_rate": 1.2121952332133091e-05, + "loss": 0.5524, + "step": 4761 + }, + { + "epoch": 0.8465777777777778, + "grad_norm": 0.35584989368735986, + "learning_rate": 1.2094487971795398e-05, + "loss": 0.5268, + "step": 4762 + }, + { + "epoch": 0.8467555555555556, + "grad_norm": 0.34108832633769864, + "learning_rate": 1.2067052756662945e-05, + "loss": 0.507, + "step": 4763 + }, + { + "epoch": 0.8469333333333333, + "grad_norm": 0.34378925720418096, + "learning_rate": 1.2039646695831918e-05, + "loss": 0.5772, + "step": 4764 + }, + { + "epoch": 0.8471111111111111, + "grad_norm": 0.41049911433418446, + "learning_rate": 1.2012269798388842e-05, + "loss": 0.5434, + "step": 4765 + }, + { + "epoch": 0.8472888888888889, + "grad_norm": 0.3767381869755635, + "learning_rate": 1.1984922073410576e-05, + "loss": 0.5489, + "step": 4766 + }, + { + "epoch": 0.8474666666666667, + "grad_norm": 0.3721096722238437, + "learning_rate": 1.195760352996429e-05, + "loss": 0.5695, + "step": 4767 + }, + { + "epoch": 0.8476444444444444, + "grad_norm": 0.362848093784351, + "learning_rate": 1.1930314177107493e-05, + "loss": 0.5459, + "step": 4768 + }, + { + "epoch": 0.8478222222222223, + "grad_norm": 0.34717286324703983, + "learning_rate": 1.1903054023888017e-05, + "loss": 0.5899, + "step": 4769 + }, + { + "epoch": 0.848, + "grad_norm": 0.3545219124315328, + "learning_rate": 1.1875823079343996e-05, + "loss": 0.5747, + "step": 4770 + }, + { + "epoch": 0.8481777777777778, + "grad_norm": 0.34753265955345275, + "learning_rate": 1.1848621352503885e-05, + "loss": 0.5376, + "step": 4771 + }, + { + "epoch": 0.8483555555555555, + "grad_norm": 0.36568871995128516, + "learning_rate": 1.1821448852386475e-05, + "loss": 0.5473, + "step": 4772 + }, + { + "epoch": 0.8485333333333334, + "grad_norm": 0.4502738349367513, + "learning_rate": 1.1794305588000843e-05, + "loss": 0.6042, + "step": 4773 + }, + { + "epoch": 0.8487111111111111, + "grad_norm": 0.3670197534523983, + "learning_rate": 1.1767191568346392e-05, + "loss": 0.5677, + "step": 4774 + }, + { + "epoch": 0.8488888888888889, + "grad_norm": 0.35559272170060463, + "learning_rate": 1.1740106802412765e-05, + "loss": 0.5809, + "step": 4775 + }, + { + "epoch": 0.8490666666666666, + "grad_norm": 0.35541789322148615, + "learning_rate": 1.1713051299180044e-05, + "loss": 0.5343, + "step": 4776 + }, + { + "epoch": 0.8492444444444445, + "grad_norm": 0.35009086622697455, + "learning_rate": 1.1686025067618423e-05, + "loss": 0.5456, + "step": 4777 + }, + { + "epoch": 0.8494222222222222, + "grad_norm": 0.36433975410470026, + "learning_rate": 1.1659028116688575e-05, + "loss": 0.5535, + "step": 4778 + }, + { + "epoch": 0.8496, + "grad_norm": 0.6527544694719213, + "learning_rate": 1.163206045534131e-05, + "loss": 0.5668, + "step": 4779 + }, + { + "epoch": 0.8497777777777777, + "grad_norm": 0.34961369799225933, + "learning_rate": 1.1605122092517874e-05, + "loss": 0.5416, + "step": 4780 + }, + { + "epoch": 0.8499555555555556, + "grad_norm": 0.34719313568414933, + "learning_rate": 1.1578213037149633e-05, + "loss": 0.5738, + "step": 4781 + }, + { + "epoch": 0.8501333333333333, + "grad_norm": 0.3538687342981935, + "learning_rate": 1.1551333298158407e-05, + "loss": 0.5439, + "step": 4782 + }, + { + "epoch": 0.8503111111111111, + "grad_norm": 0.34614117005604167, + "learning_rate": 1.1524482884456146e-05, + "loss": 0.56, + "step": 4783 + }, + { + "epoch": 0.8504888888888888, + "grad_norm": 0.39264264525109654, + "learning_rate": 1.1497661804945215e-05, + "loss": 0.5929, + "step": 4784 + }, + { + "epoch": 0.8506666666666667, + "grad_norm": 0.3910874273180067, + "learning_rate": 1.1470870068518113e-05, + "loss": 0.5485, + "step": 4785 + }, + { + "epoch": 0.8508444444444444, + "grad_norm": 0.3608068510718258, + "learning_rate": 1.1444107684057725e-05, + "loss": 0.5767, + "step": 4786 + }, + { + "epoch": 0.8510222222222222, + "grad_norm": 0.37414130238485216, + "learning_rate": 1.1417374660437153e-05, + "loss": 0.5872, + "step": 4787 + }, + { + "epoch": 0.8512, + "grad_norm": 0.36434561219039396, + "learning_rate": 1.139067100651976e-05, + "loss": 0.5596, + "step": 4788 + }, + { + "epoch": 0.8513777777777778, + "grad_norm": 0.36067319438993584, + "learning_rate": 1.1363996731159188e-05, + "loss": 0.5985, + "step": 4789 + }, + { + "epoch": 0.8515555555555555, + "grad_norm": 0.35726958111512597, + "learning_rate": 1.1337351843199329e-05, + "loss": 0.5697, + "step": 4790 + }, + { + "epoch": 0.8517333333333333, + "grad_norm": 0.33676148330206906, + "learning_rate": 1.131073635147435e-05, + "loss": 0.5334, + "step": 4791 + }, + { + "epoch": 0.8519111111111111, + "grad_norm": 0.3504475677876654, + "learning_rate": 1.1284150264808647e-05, + "loss": 0.5481, + "step": 4792 + }, + { + "epoch": 0.8520888888888889, + "grad_norm": 0.34021128509758136, + "learning_rate": 1.1257593592016868e-05, + "loss": 0.5484, + "step": 4793 + }, + { + "epoch": 0.8522666666666666, + "grad_norm": 0.34753025891454314, + "learning_rate": 1.123106634190394e-05, + "loss": 0.5271, + "step": 4794 + }, + { + "epoch": 0.8524444444444444, + "grad_norm": 0.4117510292340522, + "learning_rate": 1.1204568523265002e-05, + "loss": 0.5446, + "step": 4795 + }, + { + "epoch": 0.8526222222222222, + "grad_norm": 0.3696257018451269, + "learning_rate": 1.117810014488544e-05, + "loss": 0.5705, + "step": 4796 + }, + { + "epoch": 0.8528, + "grad_norm": 0.33889651336059107, + "learning_rate": 1.1151661215540888e-05, + "loss": 0.5597, + "step": 4797 + }, + { + "epoch": 0.8529777777777777, + "grad_norm": 0.34895748706017543, + "learning_rate": 1.1125251743997223e-05, + "loss": 0.5562, + "step": 4798 + }, + { + "epoch": 0.8531555555555556, + "grad_norm": 0.34362043759441874, + "learning_rate": 1.109887173901053e-05, + "loss": 0.5306, + "step": 4799 + }, + { + "epoch": 0.8533333333333334, + "grad_norm": 0.3898415608484958, + "learning_rate": 1.107252120932717e-05, + "loss": 0.5661, + "step": 4800 + }, + { + "epoch": 0.8535111111111111, + "grad_norm": 0.3415374936095161, + "learning_rate": 1.104620016368364e-05, + "loss": 0.5626, + "step": 4801 + }, + { + "epoch": 0.8536888888888889, + "grad_norm": 0.37522158181667603, + "learning_rate": 1.1019908610806794e-05, + "loss": 0.6005, + "step": 4802 + }, + { + "epoch": 0.8538666666666667, + "grad_norm": 0.36131441899660666, + "learning_rate": 1.0993646559413572e-05, + "loss": 0.5891, + "step": 4803 + }, + { + "epoch": 0.8540444444444445, + "grad_norm": 0.34241938894189483, + "learning_rate": 1.0967414018211264e-05, + "loss": 0.5202, + "step": 4804 + }, + { + "epoch": 0.8542222222222222, + "grad_norm": 0.34719160311850034, + "learning_rate": 1.0941210995897223e-05, + "loss": 0.5645, + "step": 4805 + }, + { + "epoch": 0.8544, + "grad_norm": 0.34985427851070866, + "learning_rate": 1.0915037501159197e-05, + "loss": 0.5635, + "step": 4806 + }, + { + "epoch": 0.8545777777777778, + "grad_norm": 0.36366812435766566, + "learning_rate": 1.0888893542674949e-05, + "loss": 0.5814, + "step": 4807 + }, + { + "epoch": 0.8547555555555556, + "grad_norm": 0.366244830038732, + "learning_rate": 1.0862779129112654e-05, + "loss": 0.5615, + "step": 4808 + }, + { + "epoch": 0.8549333333333333, + "grad_norm": 0.40703197904146177, + "learning_rate": 1.0836694269130498e-05, + "loss": 0.5014, + "step": 4809 + }, + { + "epoch": 0.8551111111111112, + "grad_norm": 0.3506976385982676, + "learning_rate": 1.0810638971376996e-05, + "loss": 0.5623, + "step": 4810 + }, + { + "epoch": 0.8552888888888889, + "grad_norm": 0.3442271804689803, + "learning_rate": 1.0784613244490816e-05, + "loss": 0.5678, + "step": 4811 + }, + { + "epoch": 0.8554666666666667, + "grad_norm": 0.3392160167742886, + "learning_rate": 1.075861709710081e-05, + "loss": 0.5474, + "step": 4812 + }, + { + "epoch": 0.8556444444444444, + "grad_norm": 0.3332201752522239, + "learning_rate": 1.0732650537826061e-05, + "loss": 0.4998, + "step": 4813 + }, + { + "epoch": 0.8558222222222223, + "grad_norm": 0.3423359445476259, + "learning_rate": 1.0706713575275817e-05, + "loss": 0.5675, + "step": 4814 + }, + { + "epoch": 0.856, + "grad_norm": 0.36234805255293767, + "learning_rate": 1.068080621804951e-05, + "loss": 0.6019, + "step": 4815 + }, + { + "epoch": 0.8561777777777778, + "grad_norm": 0.35662999085142344, + "learning_rate": 1.065492847473677e-05, + "loss": 0.5682, + "step": 4816 + }, + { + "epoch": 0.8563555555555555, + "grad_norm": 0.3739820360524947, + "learning_rate": 1.0629080353917397e-05, + "loss": 0.5684, + "step": 4817 + }, + { + "epoch": 0.8565333333333334, + "grad_norm": 0.37954177131554695, + "learning_rate": 1.0603261864161384e-05, + "loss": 0.5715, + "step": 4818 + }, + { + "epoch": 0.8567111111111111, + "grad_norm": 0.356836286059798, + "learning_rate": 1.057747301402887e-05, + "loss": 0.5479, + "step": 4819 + }, + { + "epoch": 0.8568888888888889, + "grad_norm": 0.3871129480144662, + "learning_rate": 1.0551713812070207e-05, + "loss": 0.5694, + "step": 4820 + }, + { + "epoch": 0.8570666666666666, + "grad_norm": 0.3605913057905761, + "learning_rate": 1.0525984266825895e-05, + "loss": 0.5509, + "step": 4821 + }, + { + "epoch": 0.8572444444444445, + "grad_norm": 0.33998578271790825, + "learning_rate": 1.0500284386826597e-05, + "loss": 0.5954, + "step": 4822 + }, + { + "epoch": 0.8574222222222222, + "grad_norm": 0.34637790273855507, + "learning_rate": 1.0474614180593145e-05, + "loss": 0.5616, + "step": 4823 + }, + { + "epoch": 0.8576, + "grad_norm": 0.39437822277370416, + "learning_rate": 1.0448973656636562e-05, + "loss": 0.6021, + "step": 4824 + }, + { + "epoch": 0.8577777777777778, + "grad_norm": 0.3525446263767972, + "learning_rate": 1.0423362823457939e-05, + "loss": 0.6184, + "step": 4825 + }, + { + "epoch": 0.8579555555555556, + "grad_norm": 0.5661658862408048, + "learning_rate": 1.0397781689548669e-05, + "loss": 0.5526, + "step": 4826 + }, + { + "epoch": 0.8581333333333333, + "grad_norm": 0.35874014994062636, + "learning_rate": 1.0372230263390125e-05, + "loss": 0.5453, + "step": 4827 + }, + { + "epoch": 0.8583111111111111, + "grad_norm": 0.35445839982564686, + "learning_rate": 1.034670855345402e-05, + "loss": 0.5441, + "step": 4828 + }, + { + "epoch": 0.8584888888888889, + "grad_norm": 0.347106712554299, + "learning_rate": 1.032121656820202e-05, + "loss": 0.562, + "step": 4829 + }, + { + "epoch": 0.8586666666666667, + "grad_norm": 0.37276570010639615, + "learning_rate": 1.0295754316086114e-05, + "loss": 0.5663, + "step": 4830 + }, + { + "epoch": 0.8588444444444444, + "grad_norm": 0.3631701087001613, + "learning_rate": 1.0270321805548267e-05, + "loss": 0.5171, + "step": 4831 + }, + { + "epoch": 0.8590222222222222, + "grad_norm": 0.3589204793548821, + "learning_rate": 1.0244919045020763e-05, + "loss": 0.5349, + "step": 4832 + }, + { + "epoch": 0.8592, + "grad_norm": 0.34921926184657986, + "learning_rate": 1.0219546042925843e-05, + "loss": 0.5683, + "step": 4833 + }, + { + "epoch": 0.8593777777777778, + "grad_norm": 0.34527175198358123, + "learning_rate": 1.0194202807676e-05, + "loss": 0.5742, + "step": 4834 + }, + { + "epoch": 0.8595555555555555, + "grad_norm": 0.3613224906681506, + "learning_rate": 1.0168889347673816e-05, + "loss": 0.5722, + "step": 4835 + }, + { + "epoch": 0.8597333333333333, + "grad_norm": 0.37560789234311237, + "learning_rate": 1.0143605671312018e-05, + "loss": 0.5931, + "step": 4836 + }, + { + "epoch": 0.8599111111111111, + "grad_norm": 0.3442759842208663, + "learning_rate": 1.0118351786973423e-05, + "loss": 0.5554, + "step": 4837 + }, + { + "epoch": 0.8600888888888889, + "grad_norm": 0.33436241321031435, + "learning_rate": 1.0093127703031013e-05, + "loss": 0.5055, + "step": 4838 + }, + { + "epoch": 0.8602666666666666, + "grad_norm": 0.3747203433840497, + "learning_rate": 1.0067933427847864e-05, + "loss": 0.5545, + "step": 4839 + }, + { + "epoch": 0.8604444444444445, + "grad_norm": 0.34718261173951354, + "learning_rate": 1.0042768969777183e-05, + "loss": 0.5695, + "step": 4840 + }, + { + "epoch": 0.8606222222222222, + "grad_norm": 0.3634506145474074, + "learning_rate": 1.0017634337162275e-05, + "loss": 0.5901, + "step": 4841 + }, + { + "epoch": 0.8608, + "grad_norm": 0.34979519930162906, + "learning_rate": 9.992529538336571e-06, + "loss": 0.5664, + "step": 4842 + }, + { + "epoch": 0.8609777777777777, + "grad_norm": 0.34677175722720904, + "learning_rate": 9.967454581623603e-06, + "loss": 0.581, + "step": 4843 + }, + { + "epoch": 0.8611555555555556, + "grad_norm": 0.3930518351612051, + "learning_rate": 9.942409475337012e-06, + "loss": 0.5689, + "step": 4844 + }, + { + "epoch": 0.8613333333333333, + "grad_norm": 0.35651409477168866, + "learning_rate": 9.91739422778054e-06, + "loss": 0.5614, + "step": 4845 + }, + { + "epoch": 0.8615111111111111, + "grad_norm": 0.3559157065269722, + "learning_rate": 9.892408847248037e-06, + "loss": 0.548, + "step": 4846 + }, + { + "epoch": 0.8616888888888888, + "grad_norm": 0.3914772523093115, + "learning_rate": 9.867453342023437e-06, + "loss": 0.5753, + "step": 4847 + }, + { + "epoch": 0.8618666666666667, + "grad_norm": 0.35091384603381803, + "learning_rate": 9.84252772038079e-06, + "loss": 0.5464, + "step": 4848 + }, + { + "epoch": 0.8620444444444444, + "grad_norm": 0.38537027767745874, + "learning_rate": 9.817631990584165e-06, + "loss": 0.5881, + "step": 4849 + }, + { + "epoch": 0.8622222222222222, + "grad_norm": 0.3522400719219844, + "learning_rate": 9.792766160887868e-06, + "loss": 0.5615, + "step": 4850 + }, + { + "epoch": 0.8624, + "grad_norm": 0.3502685764426094, + "learning_rate": 9.767930239536115e-06, + "loss": 0.5788, + "step": 4851 + }, + { + "epoch": 0.8625777777777778, + "grad_norm": 0.3394684151860436, + "learning_rate": 9.74312423476338e-06, + "loss": 0.5599, + "step": 4852 + }, + { + "epoch": 0.8627555555555556, + "grad_norm": 0.3582072624629696, + "learning_rate": 9.718348154794044e-06, + "loss": 0.5519, + "step": 4853 + }, + { + "epoch": 0.8629333333333333, + "grad_norm": 0.37811560194121324, + "learning_rate": 9.69360200784274e-06, + "loss": 0.5697, + "step": 4854 + }, + { + "epoch": 0.8631111111111112, + "grad_norm": 0.3570092818149156, + "learning_rate": 9.668885802114003e-06, + "loss": 0.601, + "step": 4855 + }, + { + "epoch": 0.8632888888888889, + "grad_norm": 0.36027646511403677, + "learning_rate": 9.644199545802612e-06, + "loss": 0.5576, + "step": 4856 + }, + { + "epoch": 0.8634666666666667, + "grad_norm": 0.3392549055499985, + "learning_rate": 9.619543247093254e-06, + "loss": 0.5366, + "step": 4857 + }, + { + "epoch": 0.8636444444444444, + "grad_norm": 0.3648377579392886, + "learning_rate": 9.594916914160846e-06, + "loss": 0.5161, + "step": 4858 + }, + { + "epoch": 0.8638222222222223, + "grad_norm": 0.35677967372684327, + "learning_rate": 9.570320555170209e-06, + "loss": 0.5632, + "step": 4859 + }, + { + "epoch": 0.864, + "grad_norm": 0.4300638488986734, + "learning_rate": 9.545754178276344e-06, + "loss": 0.5627, + "step": 4860 + }, + { + "epoch": 0.8641777777777778, + "grad_norm": 0.3688963784810361, + "learning_rate": 9.52121779162426e-06, + "loss": 0.5917, + "step": 4861 + }, + { + "epoch": 0.8643555555555555, + "grad_norm": 0.3583213105927634, + "learning_rate": 9.496711403349034e-06, + "loss": 0.5708, + "step": 4862 + }, + { + "epoch": 0.8645333333333334, + "grad_norm": 0.36842538116585044, + "learning_rate": 9.472235021575792e-06, + "loss": 0.5701, + "step": 4863 + }, + { + "epoch": 0.8647111111111111, + "grad_norm": 0.36609889257191197, + "learning_rate": 9.44778865441972e-06, + "loss": 0.5553, + "step": 4864 + }, + { + "epoch": 0.8648888888888889, + "grad_norm": 0.3764202076089415, + "learning_rate": 9.423372309986056e-06, + "loss": 0.5559, + "step": 4865 + }, + { + "epoch": 0.8650666666666667, + "grad_norm": 0.3581394101407454, + "learning_rate": 9.398985996370058e-06, + "loss": 0.5998, + "step": 4866 + }, + { + "epoch": 0.8652444444444445, + "grad_norm": 0.37110784997986695, + "learning_rate": 9.374629721657058e-06, + "loss": 0.5732, + "step": 4867 + }, + { + "epoch": 0.8654222222222222, + "grad_norm": 0.3588890955589848, + "learning_rate": 9.350303493922407e-06, + "loss": 0.5822, + "step": 4868 + }, + { + "epoch": 0.8656, + "grad_norm": 0.38172146199191637, + "learning_rate": 9.326007321231522e-06, + "loss": 0.5895, + "step": 4869 + }, + { + "epoch": 0.8657777777777778, + "grad_norm": 0.41367249115110094, + "learning_rate": 9.301741211639803e-06, + "loss": 0.5201, + "step": 4870 + }, + { + "epoch": 0.8659555555555556, + "grad_norm": 0.3867458667685116, + "learning_rate": 9.277505173192746e-06, + "loss": 0.5669, + "step": 4871 + }, + { + "epoch": 0.8661333333333333, + "grad_norm": 0.3387990707696137, + "learning_rate": 9.253299213925847e-06, + "loss": 0.5638, + "step": 4872 + }, + { + "epoch": 0.8663111111111111, + "grad_norm": 0.4121068231758332, + "learning_rate": 9.229123341864577e-06, + "loss": 0.5368, + "step": 4873 + }, + { + "epoch": 0.8664888888888889, + "grad_norm": 0.40836474736598294, + "learning_rate": 9.204977565024564e-06, + "loss": 0.575, + "step": 4874 + }, + { + "epoch": 0.8666666666666667, + "grad_norm": 0.33781376916020694, + "learning_rate": 9.180861891411296e-06, + "loss": 0.5285, + "step": 4875 + }, + { + "epoch": 0.8668444444444444, + "grad_norm": 0.37513893030639056, + "learning_rate": 9.156776329020434e-06, + "loss": 0.5989, + "step": 4876 + }, + { + "epoch": 0.8670222222222222, + "grad_norm": 0.3459116859150284, + "learning_rate": 9.13272088583751e-06, + "loss": 0.5542, + "step": 4877 + }, + { + "epoch": 0.8672, + "grad_norm": 0.3748561217449105, + "learning_rate": 9.108695569838211e-06, + "loss": 0.6067, + "step": 4878 + }, + { + "epoch": 0.8673777777777778, + "grad_norm": 0.37166177876823725, + "learning_rate": 9.08470038898811e-06, + "loss": 0.5838, + "step": 4879 + }, + { + "epoch": 0.8675555555555555, + "grad_norm": 0.3639293533177192, + "learning_rate": 9.0607353512429e-06, + "loss": 0.5805, + "step": 4880 + }, + { + "epoch": 0.8677333333333334, + "grad_norm": 0.3524614450585736, + "learning_rate": 9.036800464548157e-06, + "loss": 0.5526, + "step": 4881 + }, + { + "epoch": 0.8679111111111111, + "grad_norm": 0.35216720341116686, + "learning_rate": 9.01289573683961e-06, + "loss": 0.5722, + "step": 4882 + }, + { + "epoch": 0.8680888888888889, + "grad_norm": 0.34458582148301775, + "learning_rate": 8.989021176042844e-06, + "loss": 0.5545, + "step": 4883 + }, + { + "epoch": 0.8682666666666666, + "grad_norm": 0.3535534854360052, + "learning_rate": 8.965176790073537e-06, + "loss": 0.5945, + "step": 4884 + }, + { + "epoch": 0.8684444444444445, + "grad_norm": 0.37994299135993975, + "learning_rate": 8.941362586837309e-06, + "loss": 0.5493, + "step": 4885 + }, + { + "epoch": 0.8686222222222222, + "grad_norm": 0.35331124290447796, + "learning_rate": 8.917578574229812e-06, + "loss": 0.5588, + "step": 4886 + }, + { + "epoch": 0.8688, + "grad_norm": 0.36356216034730354, + "learning_rate": 8.89382476013667e-06, + "loss": 0.4765, + "step": 4887 + }, + { + "epoch": 0.8689777777777777, + "grad_norm": 0.3455222872070977, + "learning_rate": 8.870101152433497e-06, + "loss": 0.5255, + "step": 4888 + }, + { + "epoch": 0.8691555555555556, + "grad_norm": 0.35327539618497183, + "learning_rate": 8.846407758985886e-06, + "loss": 0.5875, + "step": 4889 + }, + { + "epoch": 0.8693333333333333, + "grad_norm": 0.3814891567007966, + "learning_rate": 8.822744587649412e-06, + "loss": 0.5812, + "step": 4890 + }, + { + "epoch": 0.8695111111111111, + "grad_norm": 0.3462120641494648, + "learning_rate": 8.799111646269642e-06, + "loss": 0.5569, + "step": 4891 + }, + { + "epoch": 0.8696888888888888, + "grad_norm": 0.3302997355712388, + "learning_rate": 8.77550894268212e-06, + "loss": 0.5108, + "step": 4892 + }, + { + "epoch": 0.8698666666666667, + "grad_norm": 0.36296029776689936, + "learning_rate": 8.751936484712343e-06, + "loss": 0.5212, + "step": 4893 + }, + { + "epoch": 0.8700444444444444, + "grad_norm": 0.349799802592816, + "learning_rate": 8.728394280175812e-06, + "loss": 0.585, + "step": 4894 + }, + { + "epoch": 0.8702222222222222, + "grad_norm": 0.3568142073863479, + "learning_rate": 8.704882336877962e-06, + "loss": 0.5437, + "step": 4895 + }, + { + "epoch": 0.8704, + "grad_norm": 0.3608429759785507, + "learning_rate": 8.681400662614225e-06, + "loss": 0.5406, + "step": 4896 + }, + { + "epoch": 0.8705777777777778, + "grad_norm": 0.3566866548339779, + "learning_rate": 8.657949265169984e-06, + "loss": 0.6063, + "step": 4897 + }, + { + "epoch": 0.8707555555555555, + "grad_norm": 0.3419606827452609, + "learning_rate": 8.634528152320598e-06, + "loss": 0.5456, + "step": 4898 + }, + { + "epoch": 0.8709333333333333, + "grad_norm": 0.3383960381343903, + "learning_rate": 8.611137331831331e-06, + "loss": 0.5727, + "step": 4899 + }, + { + "epoch": 0.8711111111111111, + "grad_norm": 0.357649197456501, + "learning_rate": 8.587776811457505e-06, + "loss": 0.565, + "step": 4900 + }, + { + "epoch": 0.8712888888888889, + "grad_norm": 0.347940600328016, + "learning_rate": 8.564446598944276e-06, + "loss": 0.5898, + "step": 4901 + }, + { + "epoch": 0.8714666666666666, + "grad_norm": 0.34746348166072344, + "learning_rate": 8.541146702026859e-06, + "loss": 0.5326, + "step": 4902 + }, + { + "epoch": 0.8716444444444444, + "grad_norm": 0.35249845515358874, + "learning_rate": 8.51787712843033e-06, + "loss": 0.557, + "step": 4903 + }, + { + "epoch": 0.8718222222222223, + "grad_norm": 0.3526640751948099, + "learning_rate": 8.494637885869794e-06, + "loss": 0.5222, + "step": 4904 + }, + { + "epoch": 0.872, + "grad_norm": 0.3810496935385654, + "learning_rate": 8.4714289820502e-06, + "loss": 0.6027, + "step": 4905 + }, + { + "epoch": 0.8721777777777778, + "grad_norm": 0.3429711964810892, + "learning_rate": 8.44825042466657e-06, + "loss": 0.5641, + "step": 4906 + }, + { + "epoch": 0.8723555555555556, + "grad_norm": 0.3552161266510762, + "learning_rate": 8.425102221403725e-06, + "loss": 0.5364, + "step": 4907 + }, + { + "epoch": 0.8725333333333334, + "grad_norm": 0.3648316144392625, + "learning_rate": 8.401984379936523e-06, + "loss": 0.5722, + "step": 4908 + }, + { + "epoch": 0.8727111111111111, + "grad_norm": 0.36175111503660295, + "learning_rate": 8.37889690792969e-06, + "loss": 0.534, + "step": 4909 + }, + { + "epoch": 0.8728888888888889, + "grad_norm": 0.3464947372391652, + "learning_rate": 8.355839813037936e-06, + "loss": 0.564, + "step": 4910 + }, + { + "epoch": 0.8730666666666667, + "grad_norm": 0.34586843533029377, + "learning_rate": 8.332813102905868e-06, + "loss": 0.5159, + "step": 4911 + }, + { + "epoch": 0.8732444444444445, + "grad_norm": 0.3368854651890267, + "learning_rate": 8.309816785168034e-06, + "loss": 0.5341, + "step": 4912 + }, + { + "epoch": 0.8734222222222222, + "grad_norm": 0.440490177864888, + "learning_rate": 8.286850867448881e-06, + "loss": 0.5649, + "step": 4913 + }, + { + "epoch": 0.8736, + "grad_norm": 0.36037561290838116, + "learning_rate": 8.263915357362806e-06, + "loss": 0.5651, + "step": 4914 + }, + { + "epoch": 0.8737777777777778, + "grad_norm": 0.3596925635758404, + "learning_rate": 8.241010262514115e-06, + "loss": 0.551, + "step": 4915 + }, + { + "epoch": 0.8739555555555556, + "grad_norm": 0.40025486416109063, + "learning_rate": 8.218135590497023e-06, + "loss": 0.6128, + "step": 4916 + }, + { + "epoch": 0.8741333333333333, + "grad_norm": 0.3644076165648525, + "learning_rate": 8.19529134889565e-06, + "loss": 0.5695, + "step": 4917 + }, + { + "epoch": 0.8743111111111111, + "grad_norm": 0.3553574029414163, + "learning_rate": 8.172477545284052e-06, + "loss": 0.5573, + "step": 4918 + }, + { + "epoch": 0.8744888888888889, + "grad_norm": 0.3388988173482163, + "learning_rate": 8.149694187226187e-06, + "loss": 0.5336, + "step": 4919 + }, + { + "epoch": 0.8746666666666667, + "grad_norm": 0.3532443468499122, + "learning_rate": 8.12694128227589e-06, + "loss": 0.5565, + "step": 4920 + }, + { + "epoch": 0.8748444444444444, + "grad_norm": 0.37914642324948045, + "learning_rate": 8.10421883797694e-06, + "loss": 0.5475, + "step": 4921 + }, + { + "epoch": 0.8750222222222223, + "grad_norm": 0.3658351988938684, + "learning_rate": 8.081526861863008e-06, + "loss": 0.5268, + "step": 4922 + }, + { + "epoch": 0.8752, + "grad_norm": 0.35017344271945894, + "learning_rate": 8.058865361457601e-06, + "loss": 0.5732, + "step": 4923 + }, + { + "epoch": 0.8753777777777778, + "grad_norm": 0.37178827768644324, + "learning_rate": 8.03623434427424e-06, + "loss": 0.582, + "step": 4924 + }, + { + "epoch": 0.8755555555555555, + "grad_norm": 0.39488935566244054, + "learning_rate": 8.013633817816202e-06, + "loss": 0.5927, + "step": 4925 + }, + { + "epoch": 0.8757333333333334, + "grad_norm": 0.3343456484834358, + "learning_rate": 7.991063789576814e-06, + "loss": 0.5339, + "step": 4926 + }, + { + "epoch": 0.8759111111111111, + "grad_norm": 0.34469056161351913, + "learning_rate": 7.9685242670391e-06, + "loss": 0.5639, + "step": 4927 + }, + { + "epoch": 0.8760888888888889, + "grad_norm": 0.3601018622991615, + "learning_rate": 7.946015257676177e-06, + "loss": 0.5383, + "step": 4928 + }, + { + "epoch": 0.8762666666666666, + "grad_norm": 0.34341839122525364, + "learning_rate": 7.923536768950856e-06, + "loss": 0.5564, + "step": 4929 + }, + { + "epoch": 0.8764444444444445, + "grad_norm": 0.351305706174665, + "learning_rate": 7.901088808315971e-06, + "loss": 0.5291, + "step": 4930 + }, + { + "epoch": 0.8766222222222222, + "grad_norm": 0.3920813562978685, + "learning_rate": 7.878671383214153e-06, + "loss": 0.6173, + "step": 4931 + }, + { + "epoch": 0.8768, + "grad_norm": 0.3532992199156299, + "learning_rate": 7.856284501077926e-06, + "loss": 0.6039, + "step": 4932 + }, + { + "epoch": 0.8769777777777777, + "grad_norm": 0.3935488579757422, + "learning_rate": 7.833928169329695e-06, + "loss": 0.5772, + "step": 4933 + }, + { + "epoch": 0.8771555555555556, + "grad_norm": 0.40192098740588955, + "learning_rate": 7.811602395381756e-06, + "loss": 0.5287, + "step": 4934 + }, + { + "epoch": 0.8773333333333333, + "grad_norm": 0.3586012157960648, + "learning_rate": 7.789307186636242e-06, + "loss": 0.5664, + "step": 4935 + }, + { + "epoch": 0.8775111111111111, + "grad_norm": 0.3595743228683377, + "learning_rate": 7.76704255048516e-06, + "loss": 0.5746, + "step": 4936 + }, + { + "epoch": 0.8776888888888889, + "grad_norm": 0.37029370994889327, + "learning_rate": 7.744808494310386e-06, + "loss": 0.5313, + "step": 4937 + }, + { + "epoch": 0.8778666666666667, + "grad_norm": 0.4086261730048563, + "learning_rate": 7.722605025483654e-06, + "loss": 0.5068, + "step": 4938 + }, + { + "epoch": 0.8780444444444444, + "grad_norm": 0.36716075857822594, + "learning_rate": 7.700432151366554e-06, + "loss": 0.5586, + "step": 4939 + }, + { + "epoch": 0.8782222222222222, + "grad_norm": 0.3510213919722963, + "learning_rate": 7.678289879310541e-06, + "loss": 0.5629, + "step": 4940 + }, + { + "epoch": 0.8784, + "grad_norm": 0.34684632866221277, + "learning_rate": 7.656178216656928e-06, + "loss": 0.5524, + "step": 4941 + }, + { + "epoch": 0.8785777777777778, + "grad_norm": 0.33615084751081836, + "learning_rate": 7.634097170736853e-06, + "loss": 0.5312, + "step": 4942 + }, + { + "epoch": 0.8787555555555555, + "grad_norm": 0.3677593601431943, + "learning_rate": 7.612046748871327e-06, + "loss": 0.5424, + "step": 4943 + }, + { + "epoch": 0.8789333333333333, + "grad_norm": 0.39760225939574145, + "learning_rate": 7.590026958371199e-06, + "loss": 0.5503, + "step": 4944 + }, + { + "epoch": 0.8791111111111111, + "grad_norm": 0.33579010089194705, + "learning_rate": 7.568037806537176e-06, + "loss": 0.5351, + "step": 4945 + }, + { + "epoch": 0.8792888888888889, + "grad_norm": 0.34114272321824585, + "learning_rate": 7.5460793006597806e-06, + "loss": 0.5664, + "step": 4946 + }, + { + "epoch": 0.8794666666666666, + "grad_norm": 0.3782973478343387, + "learning_rate": 7.524151448019389e-06, + "loss": 0.5708, + "step": 4947 + }, + { + "epoch": 0.8796444444444445, + "grad_norm": 0.3676685406761337, + "learning_rate": 7.50225425588621e-06, + "loss": 0.5453, + "step": 4948 + }, + { + "epoch": 0.8798222222222222, + "grad_norm": 0.37242916093812084, + "learning_rate": 7.480387731520311e-06, + "loss": 0.5619, + "step": 4949 + }, + { + "epoch": 0.88, + "grad_norm": 0.3677117971216192, + "learning_rate": 7.458551882171549e-06, + "loss": 0.5897, + "step": 4950 + }, + { + "epoch": 0.8801777777777777, + "grad_norm": 0.3260294521421455, + "learning_rate": 7.436746715079645e-06, + "loss": 0.5244, + "step": 4951 + }, + { + "epoch": 0.8803555555555556, + "grad_norm": 0.3509058225477963, + "learning_rate": 7.414972237474138e-06, + "loss": 0.5557, + "step": 4952 + }, + { + "epoch": 0.8805333333333333, + "grad_norm": 0.4318547104465892, + "learning_rate": 7.393228456574374e-06, + "loss": 0.5878, + "step": 4953 + }, + { + "epoch": 0.8807111111111111, + "grad_norm": 0.37706201529314576, + "learning_rate": 7.371515379589555e-06, + "loss": 0.555, + "step": 4954 + }, + { + "epoch": 0.8808888888888889, + "grad_norm": 0.3569905939695716, + "learning_rate": 7.349833013718666e-06, + "loss": 0.5427, + "step": 4955 + }, + { + "epoch": 0.8810666666666667, + "grad_norm": 0.350934165811864, + "learning_rate": 7.328181366150533e-06, + "loss": 0.5355, + "step": 4956 + }, + { + "epoch": 0.8812444444444445, + "grad_norm": 0.36792800818312366, + "learning_rate": 7.306560444063826e-06, + "loss": 0.578, + "step": 4957 + }, + { + "epoch": 0.8814222222222222, + "grad_norm": 0.4169243524920248, + "learning_rate": 7.284970254626922e-06, + "loss": 0.5509, + "step": 4958 + }, + { + "epoch": 0.8816, + "grad_norm": 0.3385980146133746, + "learning_rate": 7.263410804998161e-06, + "loss": 0.5708, + "step": 4959 + }, + { + "epoch": 0.8817777777777778, + "grad_norm": 0.34399054053671324, + "learning_rate": 7.2418821023255365e-06, + "loss": 0.5555, + "step": 4960 + }, + { + "epoch": 0.8819555555555556, + "grad_norm": 0.3648413887850772, + "learning_rate": 7.220384153746995e-06, + "loss": 0.5937, + "step": 4961 + }, + { + "epoch": 0.8821333333333333, + "grad_norm": 0.3519060596881368, + "learning_rate": 7.198916966390146e-06, + "loss": 0.5648, + "step": 4962 + }, + { + "epoch": 0.8823111111111112, + "grad_norm": 0.3711091594921909, + "learning_rate": 7.177480547372528e-06, + "loss": 0.5203, + "step": 4963 + }, + { + "epoch": 0.8824888888888889, + "grad_norm": 0.35220582447695326, + "learning_rate": 7.156074903801369e-06, + "loss": 0.6006, + "step": 4964 + }, + { + "epoch": 0.8826666666666667, + "grad_norm": 0.34443800193528507, + "learning_rate": 7.13470004277379e-06, + "loss": 0.5139, + "step": 4965 + }, + { + "epoch": 0.8828444444444444, + "grad_norm": 0.40627494362402267, + "learning_rate": 7.113355971376612e-06, + "loss": 0.5353, + "step": 4966 + }, + { + "epoch": 0.8830222222222223, + "grad_norm": 0.32832377807936153, + "learning_rate": 7.092042696686518e-06, + "loss": 0.5507, + "step": 4967 + }, + { + "epoch": 0.8832, + "grad_norm": 0.34917033308837014, + "learning_rate": 7.0707602257699565e-06, + "loss": 0.5288, + "step": 4968 + }, + { + "epoch": 0.8833777777777778, + "grad_norm": 0.3508002356312644, + "learning_rate": 7.0495085656831495e-06, + "loss": 0.5461, + "step": 4969 + }, + { + "epoch": 0.8835555555555555, + "grad_norm": 0.3677926771044768, + "learning_rate": 7.028287723472138e-06, + "loss": 0.5103, + "step": 4970 + }, + { + "epoch": 0.8837333333333334, + "grad_norm": 0.3644173035666489, + "learning_rate": 7.007097706172705e-06, + "loss": 0.5426, + "step": 4971 + }, + { + "epoch": 0.8839111111111111, + "grad_norm": 0.3531911324846641, + "learning_rate": 6.985938520810442e-06, + "loss": 0.5653, + "step": 4972 + }, + { + "epoch": 0.8840888888888889, + "grad_norm": 0.34920737352624664, + "learning_rate": 6.964810174400705e-06, + "loss": 0.525, + "step": 4973 + }, + { + "epoch": 0.8842666666666666, + "grad_norm": 0.35519717281731944, + "learning_rate": 6.943712673948644e-06, + "loss": 0.6067, + "step": 4974 + }, + { + "epoch": 0.8844444444444445, + "grad_norm": 0.3579342074915068, + "learning_rate": 6.922646026449142e-06, + "loss": 0.5639, + "step": 4975 + }, + { + "epoch": 0.8846222222222222, + "grad_norm": 0.3569204557717254, + "learning_rate": 6.901610238886891e-06, + "loss": 0.5762, + "step": 4976 + }, + { + "epoch": 0.8848, + "grad_norm": 0.36271755784208287, + "learning_rate": 6.880605318236344e-06, + "loss": 0.556, + "step": 4977 + }, + { + "epoch": 0.8849777777777778, + "grad_norm": 0.34077007678852844, + "learning_rate": 6.859631271461708e-06, + "loss": 0.5374, + "step": 4978 + }, + { + "epoch": 0.8851555555555556, + "grad_norm": 0.35792751648345417, + "learning_rate": 6.838688105516955e-06, + "loss": 0.5599, + "step": 4979 + }, + { + "epoch": 0.8853333333333333, + "grad_norm": 0.3589913668513894, + "learning_rate": 6.817775827345829e-06, + "loss": 0.6089, + "step": 4980 + }, + { + "epoch": 0.8855111111111111, + "grad_norm": 0.34451711638140914, + "learning_rate": 6.7968944438818404e-06, + "loss": 0.5874, + "step": 4981 + }, + { + "epoch": 0.8856888888888889, + "grad_norm": 0.3691697573221582, + "learning_rate": 6.776043962048195e-06, + "loss": 0.5506, + "step": 4982 + }, + { + "epoch": 0.8858666666666667, + "grad_norm": 0.3580165336381914, + "learning_rate": 6.755224388757974e-06, + "loss": 0.554, + "step": 4983 + }, + { + "epoch": 0.8860444444444444, + "grad_norm": 0.3752294721934302, + "learning_rate": 6.734435730913868e-06, + "loss": 0.5866, + "step": 4984 + }, + { + "epoch": 0.8862222222222222, + "grad_norm": 0.36201086539983857, + "learning_rate": 6.713677995408452e-06, + "loss": 0.5987, + "step": 4985 + }, + { + "epoch": 0.8864, + "grad_norm": 0.3604756887631126, + "learning_rate": 6.692951189123919e-06, + "loss": 0.5511, + "step": 4986 + }, + { + "epoch": 0.8865777777777778, + "grad_norm": 0.3477636550268372, + "learning_rate": 6.672255318932341e-06, + "loss": 0.5414, + "step": 4987 + }, + { + "epoch": 0.8867555555555555, + "grad_norm": 0.3530341279912555, + "learning_rate": 6.651590391695395e-06, + "loss": 0.5888, + "step": 4988 + }, + { + "epoch": 0.8869333333333334, + "grad_norm": 0.35161616628429393, + "learning_rate": 6.630956414264644e-06, + "loss": 0.5576, + "step": 4989 + }, + { + "epoch": 0.8871111111111111, + "grad_norm": 0.4041927729714772, + "learning_rate": 6.61035339348125e-06, + "loss": 0.6343, + "step": 4990 + }, + { + "epoch": 0.8872888888888889, + "grad_norm": 0.33856019761784983, + "learning_rate": 6.589781336176204e-06, + "loss": 0.5374, + "step": 4991 + }, + { + "epoch": 0.8874666666666666, + "grad_norm": 0.3885284915670537, + "learning_rate": 6.569240249170206e-06, + "loss": 0.5794, + "step": 4992 + }, + { + "epoch": 0.8876444444444445, + "grad_norm": 0.35667795491131, + "learning_rate": 6.548730139273662e-06, + "loss": 0.5426, + "step": 4993 + }, + { + "epoch": 0.8878222222222222, + "grad_norm": 0.3638374981061401, + "learning_rate": 6.528251013286757e-06, + "loss": 0.5559, + "step": 4994 + }, + { + "epoch": 0.888, + "grad_norm": 0.3608979244833006, + "learning_rate": 6.507802877999369e-06, + "loss": 0.5468, + "step": 4995 + }, + { + "epoch": 0.8881777777777777, + "grad_norm": 0.32491895384219516, + "learning_rate": 6.4873857401910875e-06, + "loss": 0.5313, + "step": 4996 + }, + { + "epoch": 0.8883555555555556, + "grad_norm": 0.3528662823733003, + "learning_rate": 6.466999606631275e-06, + "loss": 0.5863, + "step": 4997 + }, + { + "epoch": 0.8885333333333333, + "grad_norm": 0.34073769137489074, + "learning_rate": 6.4466444840789674e-06, + "loss": 0.5222, + "step": 4998 + }, + { + "epoch": 0.8887111111111111, + "grad_norm": 0.36596257267612503, + "learning_rate": 6.426320379282946e-06, + "loss": 0.587, + "step": 4999 + }, + { + "epoch": 0.8888888888888888, + "grad_norm": 0.33637168534101186, + "learning_rate": 6.406027298981687e-06, + "loss": 0.5314, + "step": 5000 + }, + { + "epoch": 0.8890666666666667, + "grad_norm": 0.3654380980837454, + "learning_rate": 6.3857652499033974e-06, + "loss": 0.5839, + "step": 5001 + }, + { + "epoch": 0.8892444444444444, + "grad_norm": 0.3525552894867388, + "learning_rate": 6.365534238765991e-06, + "loss": 0.5226, + "step": 5002 + }, + { + "epoch": 0.8894222222222222, + "grad_norm": 0.3557091719077854, + "learning_rate": 6.345334272277092e-06, + "loss": 0.555, + "step": 5003 + }, + { + "epoch": 0.8896, + "grad_norm": 0.3389981183235064, + "learning_rate": 6.325165357134022e-06, + "loss": 0.5218, + "step": 5004 + }, + { + "epoch": 0.8897777777777778, + "grad_norm": 0.36210474787081404, + "learning_rate": 6.3050275000238414e-06, + "loss": 0.535, + "step": 5005 + }, + { + "epoch": 0.8899555555555556, + "grad_norm": 0.36732324809316436, + "learning_rate": 6.284920707623232e-06, + "loss": 0.6119, + "step": 5006 + }, + { + "epoch": 0.8901333333333333, + "grad_norm": 0.34123277302421423, + "learning_rate": 6.264844986598695e-06, + "loss": 0.5506, + "step": 5007 + }, + { + "epoch": 0.8903111111111112, + "grad_norm": 0.37968768960721694, + "learning_rate": 6.244800343606305e-06, + "loss": 0.5928, + "step": 5008 + }, + { + "epoch": 0.8904888888888889, + "grad_norm": 0.34169195159398585, + "learning_rate": 6.22478678529197e-06, + "loss": 0.5624, + "step": 5009 + }, + { + "epoch": 0.8906666666666667, + "grad_norm": 0.3560548797808062, + "learning_rate": 6.2048043182911245e-06, + "loss": 0.5951, + "step": 5010 + }, + { + "epoch": 0.8908444444444444, + "grad_norm": 0.340923851394203, + "learning_rate": 6.18485294922907e-06, + "loss": 0.5401, + "step": 5011 + }, + { + "epoch": 0.8910222222222223, + "grad_norm": 0.3610705594006183, + "learning_rate": 6.164932684720637e-06, + "loss": 0.5325, + "step": 5012 + }, + { + "epoch": 0.8912, + "grad_norm": 0.3466926729042091, + "learning_rate": 6.145043531370498e-06, + "loss": 0.5602, + "step": 5013 + }, + { + "epoch": 0.8913777777777778, + "grad_norm": 0.33891049189584005, + "learning_rate": 6.1251854957728445e-06, + "loss": 0.5703, + "step": 5014 + }, + { + "epoch": 0.8915555555555555, + "grad_norm": 0.36857645357246654, + "learning_rate": 6.105358584511733e-06, + "loss": 0.5749, + "step": 5015 + }, + { + "epoch": 0.8917333333333334, + "grad_norm": 0.3801085081840926, + "learning_rate": 6.085562804160727e-06, + "loss": 0.5948, + "step": 5016 + }, + { + "epoch": 0.8919111111111111, + "grad_norm": 0.3879828280407481, + "learning_rate": 6.065798161283187e-06, + "loss": 0.5884, + "step": 5017 + }, + { + "epoch": 0.8920888888888889, + "grad_norm": 0.38240214289773117, + "learning_rate": 6.046064662432105e-06, + "loss": 0.5467, + "step": 5018 + }, + { + "epoch": 0.8922666666666667, + "grad_norm": 0.33721482017331217, + "learning_rate": 6.026362314150136e-06, + "loss": 0.5289, + "step": 5019 + }, + { + "epoch": 0.8924444444444445, + "grad_norm": 0.3757898139758863, + "learning_rate": 6.006691122969643e-06, + "loss": 0.5474, + "step": 5020 + }, + { + "epoch": 0.8926222222222222, + "grad_norm": 0.35362258474386377, + "learning_rate": 5.987051095412632e-06, + "loss": 0.5597, + "step": 5021 + }, + { + "epoch": 0.8928, + "grad_norm": 0.3571203511671678, + "learning_rate": 5.967442237990783e-06, + "loss": 0.5639, + "step": 5022 + }, + { + "epoch": 0.8929777777777778, + "grad_norm": 0.3623767948783689, + "learning_rate": 5.9478645572054406e-06, + "loss": 0.5674, + "step": 5023 + }, + { + "epoch": 0.8931555555555556, + "grad_norm": 0.33966115668940144, + "learning_rate": 5.928318059547622e-06, + "loss": 0.5553, + "step": 5024 + }, + { + "epoch": 0.8933333333333333, + "grad_norm": 0.3467063412290638, + "learning_rate": 5.908802751497988e-06, + "loss": 0.5701, + "step": 5025 + }, + { + "epoch": 0.8935111111111111, + "grad_norm": 0.3346566742104518, + "learning_rate": 5.889318639526875e-06, + "loss": 0.5595, + "step": 5026 + }, + { + "epoch": 0.8936888888888889, + "grad_norm": 0.4296464805878177, + "learning_rate": 5.869865730094271e-06, + "loss": 0.5616, + "step": 5027 + }, + { + "epoch": 0.8938666666666667, + "grad_norm": 0.3465972038776893, + "learning_rate": 5.850444029649804e-06, + "loss": 0.5493, + "step": 5028 + }, + { + "epoch": 0.8940444444444444, + "grad_norm": 0.3565655259205237, + "learning_rate": 5.831053544632803e-06, + "loss": 0.5464, + "step": 5029 + }, + { + "epoch": 0.8942222222222223, + "grad_norm": 0.38441586711030085, + "learning_rate": 5.811694281472158e-06, + "loss": 0.6153, + "step": 5030 + }, + { + "epoch": 0.8944, + "grad_norm": 0.34304361941421097, + "learning_rate": 5.792366246586511e-06, + "loss": 0.5074, + "step": 5031 + }, + { + "epoch": 0.8945777777777778, + "grad_norm": 0.3453592481791633, + "learning_rate": 5.773069446384061e-06, + "loss": 0.5316, + "step": 5032 + }, + { + "epoch": 0.8947555555555555, + "grad_norm": 0.3374299219049813, + "learning_rate": 5.753803887262743e-06, + "loss": 0.5288, + "step": 5033 + }, + { + "epoch": 0.8949333333333334, + "grad_norm": 0.3228364987044134, + "learning_rate": 5.734569575610027e-06, + "loss": 0.5368, + "step": 5034 + }, + { + "epoch": 0.8951111111111111, + "grad_norm": 0.35533818079732377, + "learning_rate": 5.715366517803123e-06, + "loss": 0.5438, + "step": 5035 + }, + { + "epoch": 0.8952888888888889, + "grad_norm": 0.34821543939370586, + "learning_rate": 5.696194720208792e-06, + "loss": 0.5102, + "step": 5036 + }, + { + "epoch": 0.8954666666666666, + "grad_norm": 0.32735015436309456, + "learning_rate": 5.677054189183517e-06, + "loss": 0.5559, + "step": 5037 + }, + { + "epoch": 0.8956444444444445, + "grad_norm": 0.39232035417015443, + "learning_rate": 5.657944931073312e-06, + "loss": 0.5916, + "step": 5038 + }, + { + "epoch": 0.8958222222222222, + "grad_norm": 0.3423079683123103, + "learning_rate": 5.63886695221395e-06, + "loss": 0.5595, + "step": 5039 + }, + { + "epoch": 0.896, + "grad_norm": 0.35412465539752674, + "learning_rate": 5.619820258930719e-06, + "loss": 0.5691, + "step": 5040 + }, + { + "epoch": 0.8961777777777777, + "grad_norm": 0.36353787847930075, + "learning_rate": 5.600804857538588e-06, + "loss": 0.5709, + "step": 5041 + }, + { + "epoch": 0.8963555555555556, + "grad_norm": 0.3691614187546208, + "learning_rate": 5.581820754342137e-06, + "loss": 0.5655, + "step": 5042 + }, + { + "epoch": 0.8965333333333333, + "grad_norm": 0.375739456665465, + "learning_rate": 5.562867955635587e-06, + "loss": 0.5932, + "step": 5043 + }, + { + "epoch": 0.8967111111111111, + "grad_norm": 0.3595972612421251, + "learning_rate": 5.543946467702754e-06, + "loss": 0.5568, + "step": 5044 + }, + { + "epoch": 0.8968888888888888, + "grad_norm": 0.3767788880692, + "learning_rate": 5.525056296817099e-06, + "loss": 0.5587, + "step": 5045 + }, + { + "epoch": 0.8970666666666667, + "grad_norm": 0.34342813588366167, + "learning_rate": 5.506197449241679e-06, + "loss": 0.5368, + "step": 5046 + }, + { + "epoch": 0.8972444444444444, + "grad_norm": 0.35004629720539576, + "learning_rate": 5.4873699312291695e-06, + "loss": 0.5512, + "step": 5047 + }, + { + "epoch": 0.8974222222222222, + "grad_norm": 0.3386492511363029, + "learning_rate": 5.468573749021866e-06, + "loss": 0.518, + "step": 5048 + }, + { + "epoch": 0.8976, + "grad_norm": 0.38071086192975595, + "learning_rate": 5.449808908851673e-06, + "loss": 0.5635, + "step": 5049 + }, + { + "epoch": 0.8977777777777778, + "grad_norm": 0.36057247620962357, + "learning_rate": 5.431075416940101e-06, + "loss": 0.5088, + "step": 5050 + }, + { + "epoch": 0.8979555555555555, + "grad_norm": 0.34094275530983353, + "learning_rate": 5.412373279498273e-06, + "loss": 0.5662, + "step": 5051 + }, + { + "epoch": 0.8981333333333333, + "grad_norm": 0.3269356539111094, + "learning_rate": 5.393702502726905e-06, + "loss": 0.5343, + "step": 5052 + }, + { + "epoch": 0.8983111111111111, + "grad_norm": 0.4427265434685174, + "learning_rate": 5.375063092816313e-06, + "loss": 0.5559, + "step": 5053 + }, + { + "epoch": 0.8984888888888889, + "grad_norm": 0.3471073482109714, + "learning_rate": 5.356455055946441e-06, + "loss": 0.5647, + "step": 5054 + }, + { + "epoch": 0.8986666666666666, + "grad_norm": 0.3651122289531994, + "learning_rate": 5.337878398286799e-06, + "loss": 0.5623, + "step": 5055 + }, + { + "epoch": 0.8988444444444444, + "grad_norm": 0.3764993611257617, + "learning_rate": 5.319333125996495e-06, + "loss": 0.5717, + "step": 5056 + }, + { + "epoch": 0.8990222222222222, + "grad_norm": 0.3807105325911764, + "learning_rate": 5.300819245224275e-06, + "loss": 0.5927, + "step": 5057 + }, + { + "epoch": 0.8992, + "grad_norm": 0.3675944331230521, + "learning_rate": 5.282336762108398e-06, + "loss": 0.5673, + "step": 5058 + }, + { + "epoch": 0.8993777777777778, + "grad_norm": 0.36388847045961525, + "learning_rate": 5.263885682776804e-06, + "loss": 0.5561, + "step": 5059 + }, + { + "epoch": 0.8995555555555556, + "grad_norm": 0.49477231743873035, + "learning_rate": 5.245466013346945e-06, + "loss": 0.5667, + "step": 5060 + }, + { + "epoch": 0.8997333333333334, + "grad_norm": 0.33638623314391286, + "learning_rate": 5.2270777599259135e-06, + "loss": 0.5372, + "step": 5061 + }, + { + "epoch": 0.8999111111111111, + "grad_norm": 0.3521882897829024, + "learning_rate": 5.208720928610333e-06, + "loss": 0.5535, + "step": 5062 + }, + { + "epoch": 0.9000888888888889, + "grad_norm": 0.5628962500588736, + "learning_rate": 5.190395525486491e-06, + "loss": 0.5342, + "step": 5063 + }, + { + "epoch": 0.9002666666666667, + "grad_norm": 0.47380844913950926, + "learning_rate": 5.172101556630149e-06, + "loss": 0.5004, + "step": 5064 + }, + { + "epoch": 0.9004444444444445, + "grad_norm": 0.33969452465633604, + "learning_rate": 5.15383902810671e-06, + "loss": 0.5313, + "step": 5065 + }, + { + "epoch": 0.9006222222222222, + "grad_norm": 0.36288858740325197, + "learning_rate": 5.1356079459711655e-06, + "loss": 0.5517, + "step": 5066 + }, + { + "epoch": 0.9008, + "grad_norm": 0.3445956683850495, + "learning_rate": 5.1174083162680465e-06, + "loss": 0.5494, + "step": 5067 + }, + { + "epoch": 0.9009777777777778, + "grad_norm": 0.3400050859421058, + "learning_rate": 5.0992401450314584e-06, + "loss": 0.5579, + "step": 5068 + }, + { + "epoch": 0.9011555555555556, + "grad_norm": 0.34723176501871816, + "learning_rate": 5.0811034382850955e-06, + "loss": 0.5809, + "step": 5069 + }, + { + "epoch": 0.9013333333333333, + "grad_norm": 0.36678676466026955, + "learning_rate": 5.062998202042213e-06, + "loss": 0.575, + "step": 5070 + }, + { + "epoch": 0.9015111111111112, + "grad_norm": 0.3548915445767976, + "learning_rate": 5.044924442305621e-06, + "loss": 0.5429, + "step": 5071 + }, + { + "epoch": 0.9016888888888889, + "grad_norm": 0.34639744607076345, + "learning_rate": 5.026882165067703e-06, + "loss": 0.5578, + "step": 5072 + }, + { + "epoch": 0.9018666666666667, + "grad_norm": 0.3574891053512251, + "learning_rate": 5.008871376310409e-06, + "loss": 0.527, + "step": 5073 + }, + { + "epoch": 0.9020444444444444, + "grad_norm": 0.36545262702333997, + "learning_rate": 4.99089208200525e-06, + "loss": 0.5702, + "step": 5074 + }, + { + "epoch": 0.9022222222222223, + "grad_norm": 0.34434599716911823, + "learning_rate": 4.972944288113268e-06, + "loss": 0.5271, + "step": 5075 + }, + { + "epoch": 0.9024, + "grad_norm": 2.2598719635417917, + "learning_rate": 4.955028000585094e-06, + "loss": 0.5426, + "step": 5076 + }, + { + "epoch": 0.9025777777777778, + "grad_norm": 0.358430518318424, + "learning_rate": 4.937143225360896e-06, + "loss": 0.5485, + "step": 5077 + }, + { + "epoch": 0.9027555555555555, + "grad_norm": 0.34452523243508193, + "learning_rate": 4.9192899683703996e-06, + "loss": 0.5737, + "step": 5078 + }, + { + "epoch": 0.9029333333333334, + "grad_norm": 0.3750587258150285, + "learning_rate": 4.901468235532902e-06, + "loss": 0.5624, + "step": 5079 + }, + { + "epoch": 0.9031111111111111, + "grad_norm": 0.36812721003704635, + "learning_rate": 4.8836780327571664e-06, + "loss": 0.5759, + "step": 5080 + }, + { + "epoch": 0.9032888888888889, + "grad_norm": 0.3732436350062708, + "learning_rate": 4.865919365941629e-06, + "loss": 0.5814, + "step": 5081 + }, + { + "epoch": 0.9034666666666666, + "grad_norm": 0.35324019305978754, + "learning_rate": 4.8481922409741474e-06, + "loss": 0.5413, + "step": 5082 + }, + { + "epoch": 0.9036444444444445, + "grad_norm": 0.3618793458163752, + "learning_rate": 4.830496663732231e-06, + "loss": 0.5412, + "step": 5083 + }, + { + "epoch": 0.9038222222222222, + "grad_norm": 0.37846797630893925, + "learning_rate": 4.812832640082809e-06, + "loss": 0.5489, + "step": 5084 + }, + { + "epoch": 0.904, + "grad_norm": 0.34356591369838496, + "learning_rate": 4.795200175882486e-06, + "loss": 0.518, + "step": 5085 + }, + { + "epoch": 0.9041777777777777, + "grad_norm": 0.3508628832012701, + "learning_rate": 4.777599276977263e-06, + "loss": 0.5753, + "step": 5086 + }, + { + "epoch": 0.9043555555555556, + "grad_norm": 0.3452490825715901, + "learning_rate": 4.7600299492028155e-06, + "loss": 0.5501, + "step": 5087 + }, + { + "epoch": 0.9045333333333333, + "grad_norm": 0.36972176015346464, + "learning_rate": 4.74249219838423e-06, + "loss": 0.5615, + "step": 5088 + }, + { + "epoch": 0.9047111111111111, + "grad_norm": 0.3941148509553186, + "learning_rate": 4.7249860303361755e-06, + "loss": 0.6022, + "step": 5089 + }, + { + "epoch": 0.9048888888888889, + "grad_norm": 0.3220473714961397, + "learning_rate": 4.7075114508628785e-06, + "loss": 0.5456, + "step": 5090 + }, + { + "epoch": 0.9050666666666667, + "grad_norm": 0.34631849756345634, + "learning_rate": 4.690068465758035e-06, + "loss": 0.5409, + "step": 5091 + }, + { + "epoch": 0.9052444444444444, + "grad_norm": 0.3571419750704605, + "learning_rate": 4.6726570808049095e-06, + "loss": 0.5349, + "step": 5092 + }, + { + "epoch": 0.9054222222222222, + "grad_norm": 0.377215705994575, + "learning_rate": 4.6552773017762615e-06, + "loss": 0.5701, + "step": 5093 + }, + { + "epoch": 0.9056, + "grad_norm": 0.33026636751756705, + "learning_rate": 4.637929134434393e-06, + "loss": 0.5185, + "step": 5094 + }, + { + "epoch": 0.9057777777777778, + "grad_norm": 0.34404645691890035, + "learning_rate": 4.620612584531103e-06, + "loss": 0.4857, + "step": 5095 + }, + { + "epoch": 0.9059555555555555, + "grad_norm": 0.3636931261712703, + "learning_rate": 4.603327657807733e-06, + "loss": 0.5379, + "step": 5096 + }, + { + "epoch": 0.9061333333333333, + "grad_norm": 0.43522914914875505, + "learning_rate": 4.586074359995119e-06, + "loss": 0.5644, + "step": 5097 + }, + { + "epoch": 0.9063111111111111, + "grad_norm": 0.33286835169108125, + "learning_rate": 4.568852696813619e-06, + "loss": 0.525, + "step": 5098 + }, + { + "epoch": 0.9064888888888889, + "grad_norm": 0.35748639930805165, + "learning_rate": 4.551662673973101e-06, + "loss": 0.5726, + "step": 5099 + }, + { + "epoch": 0.9066666666666666, + "grad_norm": 0.3639371884789815, + "learning_rate": 4.534504297172948e-06, + "loss": 0.595, + "step": 5100 + }, + { + "epoch": 0.9068444444444445, + "grad_norm": 0.3693780307815125, + "learning_rate": 4.517377572102044e-06, + "loss": 0.6004, + "step": 5101 + }, + { + "epoch": 0.9070222222222222, + "grad_norm": 0.47298505997685597, + "learning_rate": 4.500282504438769e-06, + "loss": 0.5805, + "step": 5102 + }, + { + "epoch": 0.9072, + "grad_norm": 0.3657160369621279, + "learning_rate": 4.483219099851044e-06, + "loss": 0.6041, + "step": 5103 + }, + { + "epoch": 0.9073777777777777, + "grad_norm": 0.3452471411735382, + "learning_rate": 4.466187363996232e-06, + "loss": 0.5203, + "step": 5104 + }, + { + "epoch": 0.9075555555555556, + "grad_norm": 0.35058032752439017, + "learning_rate": 4.449187302521263e-06, + "loss": 0.5357, + "step": 5105 + }, + { + "epoch": 0.9077333333333333, + "grad_norm": 0.3677682480853437, + "learning_rate": 4.4322189210625034e-06, + "loss": 0.5266, + "step": 5106 + }, + { + "epoch": 0.9079111111111111, + "grad_norm": 0.3302307961448783, + "learning_rate": 4.415282225245887e-06, + "loss": 0.5327, + "step": 5107 + }, + { + "epoch": 0.9080888888888888, + "grad_norm": 0.36411783954271787, + "learning_rate": 4.398377220686745e-06, + "loss": 0.5944, + "step": 5108 + }, + { + "epoch": 0.9082666666666667, + "grad_norm": 0.3589049128132145, + "learning_rate": 4.381503912990015e-06, + "loss": 0.5704, + "step": 5109 + }, + { + "epoch": 0.9084444444444445, + "grad_norm": 0.3597884333599353, + "learning_rate": 4.364662307750012e-06, + "loss": 0.5703, + "step": 5110 + }, + { + "epoch": 0.9086222222222222, + "grad_norm": 0.35557627346990905, + "learning_rate": 4.347852410550645e-06, + "loss": 0.5376, + "step": 5111 + }, + { + "epoch": 0.9088, + "grad_norm": 0.35112168032411384, + "learning_rate": 4.331074226965226e-06, + "loss": 0.5384, + "step": 5112 + }, + { + "epoch": 0.9089777777777778, + "grad_norm": 0.36962589897003206, + "learning_rate": 4.314327762556624e-06, + "loss": 0.5645, + "step": 5113 + }, + { + "epoch": 0.9091555555555556, + "grad_norm": 0.3829102262889725, + "learning_rate": 4.297613022877111e-06, + "loss": 0.5597, + "step": 5114 + }, + { + "epoch": 0.9093333333333333, + "grad_norm": 0.3717325359685093, + "learning_rate": 4.2809300134685095e-06, + "loss": 0.5557, + "step": 5115 + }, + { + "epoch": 0.9095111111111112, + "grad_norm": 0.35710985862670386, + "learning_rate": 4.264278739862093e-06, + "loss": 0.5419, + "step": 5116 + }, + { + "epoch": 0.9096888888888889, + "grad_norm": 0.346335970137347, + "learning_rate": 4.247659207578614e-06, + "loss": 0.5112, + "step": 5117 + }, + { + "epoch": 0.9098666666666667, + "grad_norm": 0.3609406995265904, + "learning_rate": 4.231071422128308e-06, + "loss": 0.5641, + "step": 5118 + }, + { + "epoch": 0.9100444444444444, + "grad_norm": 0.35658533460421904, + "learning_rate": 4.214515389010865e-06, + "loss": 0.5114, + "step": 5119 + }, + { + "epoch": 0.9102222222222223, + "grad_norm": 0.3283001955079264, + "learning_rate": 4.1979911137154825e-06, + "loss": 0.5299, + "step": 5120 + }, + { + "epoch": 0.9104, + "grad_norm": 0.3793467949510737, + "learning_rate": 4.181498601720801e-06, + "loss": 0.5586, + "step": 5121 + }, + { + "epoch": 0.9105777777777778, + "grad_norm": 0.36781059822887613, + "learning_rate": 4.165037858494936e-06, + "loss": 0.5896, + "step": 5122 + }, + { + "epoch": 0.9107555555555555, + "grad_norm": 0.362023923353087, + "learning_rate": 4.148608889495475e-06, + "loss": 0.5894, + "step": 5123 + }, + { + "epoch": 0.9109333333333334, + "grad_norm": 0.350305332968479, + "learning_rate": 4.132211700169464e-06, + "loss": 0.5467, + "step": 5124 + }, + { + "epoch": 0.9111111111111111, + "grad_norm": 0.36825412032578164, + "learning_rate": 4.115846295953418e-06, + "loss": 0.5354, + "step": 5125 + }, + { + "epoch": 0.9112888888888889, + "grad_norm": 0.3579120497706332, + "learning_rate": 4.09951268227331e-06, + "loss": 0.5035, + "step": 5126 + }, + { + "epoch": 0.9114666666666666, + "grad_norm": 0.39524139650094203, + "learning_rate": 4.083210864544584e-06, + "loss": 0.5938, + "step": 5127 + }, + { + "epoch": 0.9116444444444445, + "grad_norm": 0.35276759412639686, + "learning_rate": 4.066940848172107e-06, + "loss": 0.5365, + "step": 5128 + }, + { + "epoch": 0.9118222222222222, + "grad_norm": 0.37276027967182973, + "learning_rate": 4.050702638550275e-06, + "loss": 0.5737, + "step": 5129 + }, + { + "epoch": 0.912, + "grad_norm": 0.3726299724967845, + "learning_rate": 4.034496241062824e-06, + "loss": 0.5705, + "step": 5130 + }, + { + "epoch": 0.9121777777777778, + "grad_norm": 0.38400336089017784, + "learning_rate": 4.01832166108308e-06, + "loss": 0.6242, + "step": 5131 + }, + { + "epoch": 0.9123555555555556, + "grad_norm": 0.36829262590511874, + "learning_rate": 4.002178903973674e-06, + "loss": 0.5993, + "step": 5132 + }, + { + "epoch": 0.9125333333333333, + "grad_norm": 0.36182386564401386, + "learning_rate": 3.986067975086838e-06, + "loss": 0.5526, + "step": 5133 + }, + { + "epoch": 0.9127111111111111, + "grad_norm": 0.37574686746649544, + "learning_rate": 3.9699888797641195e-06, + "loss": 0.5306, + "step": 5134 + }, + { + "epoch": 0.9128888888888889, + "grad_norm": 0.36029371274709715, + "learning_rate": 3.95394162333661e-06, + "loss": 0.5519, + "step": 5135 + }, + { + "epoch": 0.9130666666666667, + "grad_norm": 0.3493479019354734, + "learning_rate": 3.937926211124743e-06, + "loss": 0.5238, + "step": 5136 + }, + { + "epoch": 0.9132444444444444, + "grad_norm": 0.35914805914076947, + "learning_rate": 3.921942648438526e-06, + "loss": 0.5664, + "step": 5137 + }, + { + "epoch": 0.9134222222222222, + "grad_norm": 0.35601899050295577, + "learning_rate": 3.905990940577275e-06, + "loss": 0.5546, + "step": 5138 + }, + { + "epoch": 0.9136, + "grad_norm": 0.3422747728252129, + "learning_rate": 3.890071092829828e-06, + "loss": 0.5492, + "step": 5139 + }, + { + "epoch": 0.9137777777777778, + "grad_norm": 0.3520173603898526, + "learning_rate": 3.8741831104744274e-06, + "loss": 0.5019, + "step": 5140 + }, + { + "epoch": 0.9139555555555555, + "grad_norm": 0.3461354557766037, + "learning_rate": 3.858326998778761e-06, + "loss": 0.5447, + "step": 5141 + }, + { + "epoch": 0.9141333333333334, + "grad_norm": 0.4429906638018704, + "learning_rate": 3.842502762999944e-06, + "loss": 0.5964, + "step": 5142 + }, + { + "epoch": 0.9143111111111111, + "grad_norm": 0.519003317711788, + "learning_rate": 3.8267104083845265e-06, + "loss": 0.5187, + "step": 5143 + }, + { + "epoch": 0.9144888888888889, + "grad_norm": 0.48031557176649636, + "learning_rate": 3.8109499401684847e-06, + "loss": 0.5771, + "step": 5144 + }, + { + "epoch": 0.9146666666666666, + "grad_norm": 0.48631790121650526, + "learning_rate": 3.795221363577239e-06, + "loss": 0.5368, + "step": 5145 + }, + { + "epoch": 0.9148444444444445, + "grad_norm": 0.39271895500917803, + "learning_rate": 3.7795246838256084e-06, + "loss": 0.5386, + "step": 5146 + }, + { + "epoch": 0.9150222222222222, + "grad_norm": 0.34028015067040746, + "learning_rate": 3.7638599061178504e-06, + "loss": 0.5218, + "step": 5147 + }, + { + "epoch": 0.9152, + "grad_norm": 0.3507908403602419, + "learning_rate": 3.7482270356476557e-06, + "loss": 0.5986, + "step": 5148 + }, + { + "epoch": 0.9153777777777777, + "grad_norm": 0.35320509302855646, + "learning_rate": 3.7326260775981227e-06, + "loss": 0.5719, + "step": 5149 + }, + { + "epoch": 0.9155555555555556, + "grad_norm": 0.35151463145916706, + "learning_rate": 3.717057037141769e-06, + "loss": 0.543, + "step": 5150 + }, + { + "epoch": 0.9157333333333333, + "grad_norm": 0.3485543149978255, + "learning_rate": 3.7015199194405325e-06, + "loss": 0.5789, + "step": 5151 + }, + { + "epoch": 0.9159111111111111, + "grad_norm": 0.3681799012171259, + "learning_rate": 3.6860147296457816e-06, + "loss": 0.5334, + "step": 5152 + }, + { + "epoch": 0.9160888888888888, + "grad_norm": 0.3614557773301528, + "learning_rate": 3.67054147289827e-06, + "loss": 0.5648, + "step": 5153 + }, + { + "epoch": 0.9162666666666667, + "grad_norm": 0.3692548576572285, + "learning_rate": 3.6551001543281726e-06, + "loss": 0.5419, + "step": 5154 + }, + { + "epoch": 0.9164444444444444, + "grad_norm": 0.34956135439933617, + "learning_rate": 3.639690779055116e-06, + "loss": 0.5324, + "step": 5155 + }, + { + "epoch": 0.9166222222222222, + "grad_norm": 0.3452128245231248, + "learning_rate": 3.6243133521880577e-06, + "loss": 0.5241, + "step": 5156 + }, + { + "epoch": 0.9168, + "grad_norm": 0.3590939412493103, + "learning_rate": 3.6089678788254423e-06, + "loss": 0.5267, + "step": 5157 + }, + { + "epoch": 0.9169777777777778, + "grad_norm": 0.37138443221436024, + "learning_rate": 3.5936543640550547e-06, + "loss": 0.6133, + "step": 5158 + }, + { + "epoch": 0.9171555555555555, + "grad_norm": 0.331071153420671, + "learning_rate": 3.578372812954156e-06, + "loss": 0.5803, + "step": 5159 + }, + { + "epoch": 0.9173333333333333, + "grad_norm": 0.3542555192315465, + "learning_rate": 3.5631232305893046e-06, + "loss": 0.5239, + "step": 5160 + }, + { + "epoch": 0.9175111111111112, + "grad_norm": 0.3378798234336313, + "learning_rate": 3.547905622016601e-06, + "loss": 0.572, + "step": 5161 + }, + { + "epoch": 0.9176888888888889, + "grad_norm": 0.37154754473273177, + "learning_rate": 3.532719992281397e-06, + "loss": 0.5873, + "step": 5162 + }, + { + "epoch": 0.9178666666666667, + "grad_norm": 0.36367586620061826, + "learning_rate": 3.5175663464185436e-06, + "loss": 0.5447, + "step": 5163 + }, + { + "epoch": 0.9180444444444444, + "grad_norm": 0.3480227928922605, + "learning_rate": 3.5024446894522554e-06, + "loss": 0.5663, + "step": 5164 + }, + { + "epoch": 0.9182222222222223, + "grad_norm": 0.3470123907410669, + "learning_rate": 3.487355026396133e-06, + "loss": 0.5605, + "step": 5165 + }, + { + "epoch": 0.9184, + "grad_norm": 0.34286031286193924, + "learning_rate": 3.472297362253174e-06, + "loss": 0.582, + "step": 5166 + }, + { + "epoch": 0.9185777777777778, + "grad_norm": 0.3764571533169989, + "learning_rate": 3.4572717020157853e-06, + "loss": 0.5645, + "step": 5167 + }, + { + "epoch": 0.9187555555555555, + "grad_norm": 0.3541064327410212, + "learning_rate": 3.442278050665726e-06, + "loss": 0.566, + "step": 5168 + }, + { + "epoch": 0.9189333333333334, + "grad_norm": 0.3486433028154304, + "learning_rate": 3.4273164131741753e-06, + "loss": 0.5342, + "step": 5169 + }, + { + "epoch": 0.9191111111111111, + "grad_norm": 0.355460177148592, + "learning_rate": 3.4123867945016983e-06, + "loss": 0.53, + "step": 5170 + }, + { + "epoch": 0.9192888888888889, + "grad_norm": 0.33789411676419545, + "learning_rate": 3.3974891995982026e-06, + "loss": 0.5335, + "step": 5171 + }, + { + "epoch": 0.9194666666666667, + "grad_norm": 0.4733886870054763, + "learning_rate": 3.382623633403037e-06, + "loss": 0.5522, + "step": 5172 + }, + { + "epoch": 0.9196444444444445, + "grad_norm": 0.391565506036996, + "learning_rate": 3.367790100844892e-06, + "loss": 0.5526, + "step": 5173 + }, + { + "epoch": 0.9198222222222222, + "grad_norm": 0.35650829502658204, + "learning_rate": 3.3529886068418447e-06, + "loss": 0.5475, + "step": 5174 + }, + { + "epoch": 0.92, + "grad_norm": 0.3475378732928639, + "learning_rate": 3.3382191563013588e-06, + "loss": 0.5895, + "step": 5175 + }, + { + "epoch": 0.9201777777777778, + "grad_norm": 0.35491100687567145, + "learning_rate": 3.323481754120261e-06, + "loss": 0.5893, + "step": 5176 + }, + { + "epoch": 0.9203555555555556, + "grad_norm": 0.3355598005643602, + "learning_rate": 3.308776405184777e-06, + "loss": 0.5385, + "step": 5177 + }, + { + "epoch": 0.9205333333333333, + "grad_norm": 0.34152425953551774, + "learning_rate": 3.2941031143704503e-06, + "loss": 0.5859, + "step": 5178 + }, + { + "epoch": 0.9207111111111111, + "grad_norm": 0.35613888434743146, + "learning_rate": 3.2794618865422677e-06, + "loss": 0.556, + "step": 5179 + }, + { + "epoch": 0.9208888888888889, + "grad_norm": 0.35494124210847294, + "learning_rate": 3.264852726554535e-06, + "loss": 0.5477, + "step": 5180 + }, + { + "epoch": 0.9210666666666667, + "grad_norm": 0.3571771768448111, + "learning_rate": 3.250275639250955e-06, + "loss": 0.5715, + "step": 5181 + }, + { + "epoch": 0.9212444444444444, + "grad_norm": 0.3601177771043013, + "learning_rate": 3.235730629464551e-06, + "loss": 0.5803, + "step": 5182 + }, + { + "epoch": 0.9214222222222223, + "grad_norm": 0.37321635044932994, + "learning_rate": 3.221217702017787e-06, + "loss": 0.5376, + "step": 5183 + }, + { + "epoch": 0.9216, + "grad_norm": 0.3619891260147188, + "learning_rate": 3.2067368617223924e-06, + "loss": 0.559, + "step": 5184 + }, + { + "epoch": 0.9217777777777778, + "grad_norm": 0.37928917539145074, + "learning_rate": 3.1922881133795825e-06, + "loss": 0.5685, + "step": 5185 + }, + { + "epoch": 0.9219555555555555, + "grad_norm": 0.380841204242253, + "learning_rate": 3.177871461779791e-06, + "loss": 0.5712, + "step": 5186 + }, + { + "epoch": 0.9221333333333334, + "grad_norm": 0.3635877216519205, + "learning_rate": 3.163486911702929e-06, + "loss": 0.5689, + "step": 5187 + }, + { + "epoch": 0.9223111111111111, + "grad_norm": 0.4138488951733727, + "learning_rate": 3.149134467918191e-06, + "loss": 0.5978, + "step": 5188 + }, + { + "epoch": 0.9224888888888889, + "grad_norm": 0.34140945563192293, + "learning_rate": 3.134814135184161e-06, + "loss": 0.5212, + "step": 5189 + }, + { + "epoch": 0.9226666666666666, + "grad_norm": 0.34702298805572057, + "learning_rate": 3.1205259182487624e-06, + "loss": 0.5633, + "step": 5190 + }, + { + "epoch": 0.9228444444444445, + "grad_norm": 0.39947010219578805, + "learning_rate": 3.1062698218492724e-06, + "loss": 0.592, + "step": 5191 + }, + { + "epoch": 0.9230222222222222, + "grad_norm": 0.3318201473187928, + "learning_rate": 3.092045850712333e-06, + "loss": 0.4932, + "step": 5192 + }, + { + "epoch": 0.9232, + "grad_norm": 0.35191289602622544, + "learning_rate": 3.0778540095539156e-06, + "loss": 0.5549, + "step": 5193 + }, + { + "epoch": 0.9233777777777777, + "grad_norm": 0.3311251479849193, + "learning_rate": 3.063694303079345e-06, + "loss": 0.5548, + "step": 5194 + }, + { + "epoch": 0.9235555555555556, + "grad_norm": 0.3879932909834961, + "learning_rate": 3.049566735983289e-06, + "loss": 0.5521, + "step": 5195 + }, + { + "epoch": 0.9237333333333333, + "grad_norm": 0.43230928961559584, + "learning_rate": 3.035471312949778e-06, + "loss": 0.5311, + "step": 5196 + }, + { + "epoch": 0.9239111111111111, + "grad_norm": 0.3792734534789123, + "learning_rate": 3.0214080386521626e-06, + "loss": 0.5728, + "step": 5197 + }, + { + "epoch": 0.9240888888888888, + "grad_norm": 0.35889086322788477, + "learning_rate": 3.0073769177531463e-06, + "loss": 0.522, + "step": 5198 + }, + { + "epoch": 0.9242666666666667, + "grad_norm": 0.35540181518873853, + "learning_rate": 2.9933779549047636e-06, + "loss": 0.5185, + "step": 5199 + }, + { + "epoch": 0.9244444444444444, + "grad_norm": 0.3882251010141909, + "learning_rate": 2.9794111547483907e-06, + "loss": 0.6139, + "step": 5200 + }, + { + "epoch": 0.9246222222222222, + "grad_norm": 0.3745772284413269, + "learning_rate": 2.9654765219147563e-06, + "loss": 0.5977, + "step": 5201 + }, + { + "epoch": 0.9248, + "grad_norm": 0.3320636102677679, + "learning_rate": 2.9515740610238762e-06, + "loss": 0.5443, + "step": 5202 + }, + { + "epoch": 0.9249777777777778, + "grad_norm": 0.35092225519399317, + "learning_rate": 2.9377037766851747e-06, + "loss": 0.5364, + "step": 5203 + }, + { + "epoch": 0.9251555555555555, + "grad_norm": 0.7484364376340257, + "learning_rate": 2.9238656734973167e-06, + "loss": 0.5593, + "step": 5204 + }, + { + "epoch": 0.9253333333333333, + "grad_norm": 0.3602163416545684, + "learning_rate": 2.9100597560484e-06, + "loss": 0.5851, + "step": 5205 + }, + { + "epoch": 0.9255111111111111, + "grad_norm": 0.3500410601160329, + "learning_rate": 2.8962860289157513e-06, + "loss": 0.611, + "step": 5206 + }, + { + "epoch": 0.9256888888888889, + "grad_norm": 0.35074020947725, + "learning_rate": 2.8825444966661063e-06, + "loss": 0.5428, + "step": 5207 + }, + { + "epoch": 0.9258666666666666, + "grad_norm": 0.3735741411277997, + "learning_rate": 2.8688351638554543e-06, + "loss": 0.5177, + "step": 5208 + }, + { + "epoch": 0.9260444444444444, + "grad_norm": 0.35723305854311777, + "learning_rate": 2.8551580350291817e-06, + "loss": 0.5651, + "step": 5209 + }, + { + "epoch": 0.9262222222222222, + "grad_norm": 0.3559805056580754, + "learning_rate": 2.8415131147219276e-06, + "loss": 0.5723, + "step": 5210 + }, + { + "epoch": 0.9264, + "grad_norm": 0.3913503257383937, + "learning_rate": 2.8279004074577285e-06, + "loss": 0.5577, + "step": 5211 + }, + { + "epoch": 0.9265777777777777, + "grad_norm": 0.37114870341059303, + "learning_rate": 2.8143199177498525e-06, + "loss": 0.5638, + "step": 5212 + }, + { + "epoch": 0.9267555555555556, + "grad_norm": 0.3584507808312592, + "learning_rate": 2.800771650100964e-06, + "loss": 0.5291, + "step": 5213 + }, + { + "epoch": 0.9269333333333334, + "grad_norm": 0.423400141656338, + "learning_rate": 2.7872556090029923e-06, + "loss": 0.5934, + "step": 5214 + }, + { + "epoch": 0.9271111111111111, + "grad_norm": 0.35068976878422503, + "learning_rate": 2.773771798937208e-06, + "loss": 0.5833, + "step": 5215 + }, + { + "epoch": 0.9272888888888889, + "grad_norm": 0.3465886756047563, + "learning_rate": 2.760320224374191e-06, + "loss": 0.5708, + "step": 5216 + }, + { + "epoch": 0.9274666666666667, + "grad_norm": 0.3448903687595693, + "learning_rate": 2.746900889773829e-06, + "loss": 0.5809, + "step": 5217 + }, + { + "epoch": 0.9276444444444445, + "grad_norm": 0.34518172097961836, + "learning_rate": 2.7335137995853188e-06, + "loss": 0.5468, + "step": 5218 + }, + { + "epoch": 0.9278222222222222, + "grad_norm": 0.4172719503894572, + "learning_rate": 2.7201589582471763e-06, + "loss": 0.5447, + "step": 5219 + }, + { + "epoch": 0.928, + "grad_norm": 0.373077227511944, + "learning_rate": 2.7068363701872155e-06, + "loss": 0.5521, + "step": 5220 + }, + { + "epoch": 0.9281777777777778, + "grad_norm": 0.35812578476529233, + "learning_rate": 2.6935460398225697e-06, + "loss": 0.5521, + "step": 5221 + }, + { + "epoch": 0.9283555555555556, + "grad_norm": 0.41293663098170164, + "learning_rate": 2.6802879715596585e-06, + "loss": 0.5862, + "step": 5222 + }, + { + "epoch": 0.9285333333333333, + "grad_norm": 0.3697987938093545, + "learning_rate": 2.66706216979421e-06, + "loss": 0.5632, + "step": 5223 + }, + { + "epoch": 0.9287111111111112, + "grad_norm": 0.36477036202064156, + "learning_rate": 2.653868638911272e-06, + "loss": 0.5087, + "step": 5224 + }, + { + "epoch": 0.9288888888888889, + "grad_norm": 0.3717172288092181, + "learning_rate": 2.6407073832851682e-06, + "loss": 0.5755, + "step": 5225 + }, + { + "epoch": 0.9290666666666667, + "grad_norm": 0.3313904033496613, + "learning_rate": 2.6275784072795405e-06, + "loss": 0.5177, + "step": 5226 + }, + { + "epoch": 0.9292444444444444, + "grad_norm": 0.3541283506194142, + "learning_rate": 2.6144817152473298e-06, + "loss": 0.592, + "step": 5227 + }, + { + "epoch": 0.9294222222222223, + "grad_norm": 0.363546268306384, + "learning_rate": 2.6014173115307292e-06, + "loss": 0.5625, + "step": 5228 + }, + { + "epoch": 0.9296, + "grad_norm": 0.3441569867708462, + "learning_rate": 2.5883852004613074e-06, + "loss": 0.5288, + "step": 5229 + }, + { + "epoch": 0.9297777777777778, + "grad_norm": 0.3632864590675993, + "learning_rate": 2.57538538635983e-06, + "loss": 0.5287, + "step": 5230 + }, + { + "epoch": 0.9299555555555555, + "grad_norm": 0.3343882034342482, + "learning_rate": 2.56241787353646e-06, + "loss": 0.5204, + "step": 5231 + }, + { + "epoch": 0.9301333333333334, + "grad_norm": 0.35333098925624495, + "learning_rate": 2.549482666290537e-06, + "loss": 0.5661, + "step": 5232 + }, + { + "epoch": 0.9303111111111111, + "grad_norm": 0.35706572963037697, + "learning_rate": 2.536579768910818e-06, + "loss": 0.5253, + "step": 5233 + }, + { + "epoch": 0.9304888888888889, + "grad_norm": 0.3407031331535539, + "learning_rate": 2.523709185675205e-06, + "loss": 0.5645, + "step": 5234 + }, + { + "epoch": 0.9306666666666666, + "grad_norm": 0.3313501853633208, + "learning_rate": 2.510870920851016e-06, + "loss": 0.5886, + "step": 5235 + }, + { + "epoch": 0.9308444444444445, + "grad_norm": 0.3518301772257647, + "learning_rate": 2.4980649786947695e-06, + "loss": 0.505, + "step": 5236 + }, + { + "epoch": 0.9310222222222222, + "grad_norm": 0.33392710606804554, + "learning_rate": 2.4852913634523023e-06, + "loss": 0.5628, + "step": 5237 + }, + { + "epoch": 0.9312, + "grad_norm": 0.3273304965567965, + "learning_rate": 2.472550079358715e-06, + "loss": 0.5505, + "step": 5238 + }, + { + "epoch": 0.9313777777777777, + "grad_norm": 0.37751051695255455, + "learning_rate": 2.4598411306384185e-06, + "loss": 0.6323, + "step": 5239 + }, + { + "epoch": 0.9315555555555556, + "grad_norm": 0.3765049607333993, + "learning_rate": 2.4471645215050743e-06, + "loss": 0.5968, + "step": 5240 + }, + { + "epoch": 0.9317333333333333, + "grad_norm": 0.32908875140069194, + "learning_rate": 2.434520256161632e-06, + "loss": 0.534, + "step": 5241 + }, + { + "epoch": 0.9319111111111111, + "grad_norm": 0.36414118701144976, + "learning_rate": 2.421908338800305e-06, + "loss": 0.608, + "step": 5242 + }, + { + "epoch": 0.9320888888888889, + "grad_norm": 0.3767999681296671, + "learning_rate": 2.409328773602615e-06, + "loss": 0.5478, + "step": 5243 + }, + { + "epoch": 0.9322666666666667, + "grad_norm": 0.35039822210530885, + "learning_rate": 2.3967815647393256e-06, + "loss": 0.5453, + "step": 5244 + }, + { + "epoch": 0.9324444444444444, + "grad_norm": 0.37747047804756695, + "learning_rate": 2.384266716370476e-06, + "loss": 0.5697, + "step": 5245 + }, + { + "epoch": 0.9326222222222222, + "grad_norm": 0.35569079514058394, + "learning_rate": 2.371784232645391e-06, + "loss": 0.595, + "step": 5246 + }, + { + "epoch": 0.9328, + "grad_norm": 0.3634310013100204, + "learning_rate": 2.359334117702661e-06, + "loss": 0.5834, + "step": 5247 + }, + { + "epoch": 0.9329777777777778, + "grad_norm": 0.3526973892567933, + "learning_rate": 2.3469163756701273e-06, + "loss": 0.5702, + "step": 5248 + }, + { + "epoch": 0.9331555555555555, + "grad_norm": 0.35812201643996516, + "learning_rate": 2.334531010664931e-06, + "loss": 0.5904, + "step": 5249 + }, + { + "epoch": 0.9333333333333333, + "grad_norm": 0.35401127783065856, + "learning_rate": 2.322178026793431e-06, + "loss": 0.5568, + "step": 5250 + }, + { + "epoch": 0.9335111111111111, + "grad_norm": 0.7042918927917877, + "learning_rate": 2.3098574281513185e-06, + "loss": 0.5789, + "step": 5251 + }, + { + "epoch": 0.9336888888888889, + "grad_norm": 0.3892363301460388, + "learning_rate": 2.2975692188234475e-06, + "loss": 0.5717, + "step": 5252 + }, + { + "epoch": 0.9338666666666666, + "grad_norm": 0.3718016748977139, + "learning_rate": 2.2853134028840594e-06, + "loss": 0.5424, + "step": 5253 + }, + { + "epoch": 0.9340444444444445, + "grad_norm": 0.33675808435914883, + "learning_rate": 2.2730899843965257e-06, + "loss": 0.5762, + "step": 5254 + }, + { + "epoch": 0.9342222222222222, + "grad_norm": 0.35475447430044343, + "learning_rate": 2.260898967413594e-06, + "loss": 0.5699, + "step": 5255 + }, + { + "epoch": 0.9344, + "grad_norm": 0.3303972476591452, + "learning_rate": 2.2487403559771636e-06, + "loss": 0.5031, + "step": 5256 + }, + { + "epoch": 0.9345777777777777, + "grad_norm": 0.3774733281549121, + "learning_rate": 2.2366141541184883e-06, + "loss": 0.5487, + "step": 5257 + }, + { + "epoch": 0.9347555555555556, + "grad_norm": 0.3441550969788787, + "learning_rate": 2.2245203658579962e-06, + "loss": 0.5457, + "step": 5258 + }, + { + "epoch": 0.9349333333333333, + "grad_norm": 0.35148168347118697, + "learning_rate": 2.212458995205413e-06, + "loss": 0.5394, + "step": 5259 + }, + { + "epoch": 0.9351111111111111, + "grad_norm": 0.3492774525260916, + "learning_rate": 2.2004300461597073e-06, + "loss": 0.5505, + "step": 5260 + }, + { + "epoch": 0.9352888888888888, + "grad_norm": 0.36128489264301855, + "learning_rate": 2.188433522709088e-06, + "loss": 0.5588, + "step": 5261 + }, + { + "epoch": 0.9354666666666667, + "grad_norm": 0.32867752822948326, + "learning_rate": 2.1764694288310184e-06, + "loss": 0.5477, + "step": 5262 + }, + { + "epoch": 0.9356444444444444, + "grad_norm": 0.36191190158101877, + "learning_rate": 2.1645377684922252e-06, + "loss": 0.5624, + "step": 5263 + }, + { + "epoch": 0.9358222222222222, + "grad_norm": 0.5240451145392441, + "learning_rate": 2.152638545648644e-06, + "loss": 0.5367, + "step": 5264 + }, + { + "epoch": 0.936, + "grad_norm": 0.3240433096840552, + "learning_rate": 2.1407717642455082e-06, + "loss": 0.5262, + "step": 5265 + }, + { + "epoch": 0.9361777777777778, + "grad_norm": 0.3567531938105747, + "learning_rate": 2.128937428217259e-06, + "loss": 0.5374, + "step": 5266 + }, + { + "epoch": 0.9363555555555556, + "grad_norm": 0.35008253187215066, + "learning_rate": 2.117135541487569e-06, + "loss": 0.5425, + "step": 5267 + }, + { + "epoch": 0.9365333333333333, + "grad_norm": 0.3724918942605047, + "learning_rate": 2.1053661079693976e-06, + "loss": 0.5498, + "step": 5268 + }, + { + "epoch": 0.9367111111111112, + "grad_norm": 0.3488246745219126, + "learning_rate": 2.0936291315649113e-06, + "loss": 0.5358, + "step": 5269 + }, + { + "epoch": 0.9368888888888889, + "grad_norm": 0.36840778758020337, + "learning_rate": 2.0819246161655092e-06, + "loss": 0.5752, + "step": 5270 + }, + { + "epoch": 0.9370666666666667, + "grad_norm": 0.35457885943501455, + "learning_rate": 2.0702525656518534e-06, + "loss": 0.5374, + "step": 5271 + }, + { + "epoch": 0.9372444444444444, + "grad_norm": 0.3888329421261131, + "learning_rate": 2.0586129838938263e-06, + "loss": 0.5521, + "step": 5272 + }, + { + "epoch": 0.9374222222222223, + "grad_norm": 0.32556892946070465, + "learning_rate": 2.0470058747505516e-06, + "loss": 0.5056, + "step": 5273 + }, + { + "epoch": 0.9376, + "grad_norm": 0.37746947301796263, + "learning_rate": 2.0354312420703847e-06, + "loss": 0.5492, + "step": 5274 + }, + { + "epoch": 0.9377777777777778, + "grad_norm": 0.35591614326200327, + "learning_rate": 2.023889089690911e-06, + "loss": 0.5582, + "step": 5275 + }, + { + "epoch": 0.9379555555555555, + "grad_norm": 0.3285066271862678, + "learning_rate": 2.012379421438937e-06, + "loss": 0.5329, + "step": 5276 + }, + { + "epoch": 0.9381333333333334, + "grad_norm": 0.3605098620380117, + "learning_rate": 2.0009022411305313e-06, + "loss": 0.5952, + "step": 5277 + }, + { + "epoch": 0.9383111111111111, + "grad_norm": 0.3747214465320203, + "learning_rate": 1.989457552570939e-06, + "loss": 0.5555, + "step": 5278 + }, + { + "epoch": 0.9384888888888889, + "grad_norm": 0.35790797694265153, + "learning_rate": 1.9780453595547145e-06, + "loss": 0.5608, + "step": 5279 + }, + { + "epoch": 0.9386666666666666, + "grad_norm": 0.4175122800171325, + "learning_rate": 1.96666566586553e-06, + "loss": 0.5148, + "step": 5280 + }, + { + "epoch": 0.9388444444444445, + "grad_norm": 0.34609462319658957, + "learning_rate": 1.955318475276391e-06, + "loss": 0.5959, + "step": 5281 + }, + { + "epoch": 0.9390222222222222, + "grad_norm": 0.37011007137890656, + "learning_rate": 1.9440037915494316e-06, + "loss": 0.5464, + "step": 5282 + }, + { + "epoch": 0.9392, + "grad_norm": 0.36530700661180315, + "learning_rate": 1.9327216184360843e-06, + "loss": 0.593, + "step": 5283 + }, + { + "epoch": 0.9393777777777778, + "grad_norm": 0.3432745073859673, + "learning_rate": 1.921471959676957e-06, + "loss": 0.5825, + "step": 5284 + }, + { + "epoch": 0.9395555555555556, + "grad_norm": 0.37732265708241935, + "learning_rate": 1.9102548190018887e-06, + "loss": 0.5582, + "step": 5285 + }, + { + "epoch": 0.9397333333333333, + "grad_norm": 0.3440408926572364, + "learning_rate": 1.89907020012996e-06, + "loss": 0.537, + "step": 5286 + }, + { + "epoch": 0.9399111111111111, + "grad_norm": 0.39292759798214, + "learning_rate": 1.887918106769415e-06, + "loss": 0.555, + "step": 5287 + }, + { + "epoch": 0.9400888888888889, + "grad_norm": 0.35950334473642725, + "learning_rate": 1.8767985426177748e-06, + "loss": 0.5427, + "step": 5288 + }, + { + "epoch": 0.9402666666666667, + "grad_norm": 0.3440525012612628, + "learning_rate": 1.865711511361734e-06, + "loss": 0.5185, + "step": 5289 + }, + { + "epoch": 0.9404444444444444, + "grad_norm": 0.38243832953269186, + "learning_rate": 1.8546570166772193e-06, + "loss": 0.6008, + "step": 5290 + }, + { + "epoch": 0.9406222222222222, + "grad_norm": 0.3767184537860918, + "learning_rate": 1.843635062229354e-06, + "loss": 0.5689, + "step": 5291 + }, + { + "epoch": 0.9408, + "grad_norm": 0.3708024428732624, + "learning_rate": 1.8326456516725155e-06, + "loss": 0.5554, + "step": 5292 + }, + { + "epoch": 0.9409777777777778, + "grad_norm": 0.35507369216051266, + "learning_rate": 1.821688788650211e-06, + "loss": 0.5109, + "step": 5293 + }, + { + "epoch": 0.9411555555555555, + "grad_norm": 0.3511085498837757, + "learning_rate": 1.810764476795257e-06, + "loss": 0.5279, + "step": 5294 + }, + { + "epoch": 0.9413333333333334, + "grad_norm": 0.35301873355061625, + "learning_rate": 1.7998727197295784e-06, + "loss": 0.5562, + "step": 5295 + }, + { + "epoch": 0.9415111111111111, + "grad_norm": 0.3770329563922565, + "learning_rate": 1.7890135210643865e-06, + "loss": 0.5723, + "step": 5296 + }, + { + "epoch": 0.9416888888888889, + "grad_norm": 0.362297747041444, + "learning_rate": 1.778186884400046e-06, + "loss": 0.5586, + "step": 5297 + }, + { + "epoch": 0.9418666666666666, + "grad_norm": 0.33828352400251355, + "learning_rate": 1.767392813326163e-06, + "loss": 0.5137, + "step": 5298 + }, + { + "epoch": 0.9420444444444445, + "grad_norm": 0.40197056613897847, + "learning_rate": 1.7566313114215082e-06, + "loss": 0.536, + "step": 5299 + }, + { + "epoch": 0.9422222222222222, + "grad_norm": 0.3434618217131165, + "learning_rate": 1.7459023822540943e-06, + "loss": 0.5895, + "step": 5300 + }, + { + "epoch": 0.9424, + "grad_norm": 0.33019712985345034, + "learning_rate": 1.7352060293810868e-06, + "loss": 0.5477, + "step": 5301 + }, + { + "epoch": 0.9425777777777777, + "grad_norm": 0.3768755077792002, + "learning_rate": 1.7245422563489045e-06, + "loss": 0.5862, + "step": 5302 + }, + { + "epoch": 0.9427555555555556, + "grad_norm": 0.3507032336343113, + "learning_rate": 1.7139110666931191e-06, + "loss": 0.5116, + "step": 5303 + }, + { + "epoch": 0.9429333333333333, + "grad_norm": 0.3614892470305919, + "learning_rate": 1.7033124639385333e-06, + "loss": 0.5617, + "step": 5304 + }, + { + "epoch": 0.9431111111111111, + "grad_norm": 0.35151746024174557, + "learning_rate": 1.6927464515991142e-06, + "loss": 0.5633, + "step": 5305 + }, + { + "epoch": 0.9432888888888888, + "grad_norm": 0.3698253321825334, + "learning_rate": 1.6822130331780484e-06, + "loss": 0.5663, + "step": 5306 + }, + { + "epoch": 0.9434666666666667, + "grad_norm": 0.3507490856884903, + "learning_rate": 1.6717122121677088e-06, + "loss": 0.5314, + "step": 5307 + }, + { + "epoch": 0.9436444444444444, + "grad_norm": 0.33157682103988173, + "learning_rate": 1.6612439920496548e-06, + "loss": 0.5134, + "step": 5308 + }, + { + "epoch": 0.9438222222222222, + "grad_norm": 0.3450966457133679, + "learning_rate": 1.6508083762946324e-06, + "loss": 0.5132, + "step": 5309 + }, + { + "epoch": 0.944, + "grad_norm": 0.36970095565930994, + "learning_rate": 1.6404053683626076e-06, + "loss": 0.5912, + "step": 5310 + }, + { + "epoch": 0.9441777777777778, + "grad_norm": 0.38493115738719047, + "learning_rate": 1.6300349717026875e-06, + "loss": 0.5632, + "step": 5311 + }, + { + "epoch": 0.9443555555555555, + "grad_norm": 0.37204627315981625, + "learning_rate": 1.619697189753211e-06, + "loss": 0.5559, + "step": 5312 + }, + { + "epoch": 0.9445333333333333, + "grad_norm": 0.35743393176457144, + "learning_rate": 1.6093920259416696e-06, + "loss": 0.5259, + "step": 5313 + }, + { + "epoch": 0.9447111111111111, + "grad_norm": 0.3683940186535053, + "learning_rate": 1.5991194836847746e-06, + "loss": 0.5406, + "step": 5314 + }, + { + "epoch": 0.9448888888888889, + "grad_norm": 0.37002343244193114, + "learning_rate": 1.5888795663883904e-06, + "loss": 0.5309, + "step": 5315 + }, + { + "epoch": 0.9450666666666667, + "grad_norm": 0.3483337769977269, + "learning_rate": 1.5786722774475793e-06, + "loss": 0.5918, + "step": 5316 + }, + { + "epoch": 0.9452444444444444, + "grad_norm": 0.341713628604192, + "learning_rate": 1.5684976202465784e-06, + "loss": 0.526, + "step": 5317 + }, + { + "epoch": 0.9454222222222223, + "grad_norm": 0.36775993877312124, + "learning_rate": 1.5583555981588338e-06, + "loss": 0.5721, + "step": 5318 + }, + { + "epoch": 0.9456, + "grad_norm": 0.34590965148087716, + "learning_rate": 1.5482462145469224e-06, + "loss": 0.5485, + "step": 5319 + }, + { + "epoch": 0.9457777777777778, + "grad_norm": 0.35971688745675895, + "learning_rate": 1.5381694727626295e-06, + "loss": 0.5387, + "step": 5320 + }, + { + "epoch": 0.9459555555555555, + "grad_norm": 0.35607081447358163, + "learning_rate": 1.5281253761469161e-06, + "loss": 0.5649, + "step": 5321 + }, + { + "epoch": 0.9461333333333334, + "grad_norm": 0.3510700520739768, + "learning_rate": 1.5181139280299295e-06, + "loss": 0.5671, + "step": 5322 + }, + { + "epoch": 0.9463111111111111, + "grad_norm": 0.34212333227749525, + "learning_rate": 1.50813513173097e-06, + "loss": 0.5338, + "step": 5323 + }, + { + "epoch": 0.9464888888888889, + "grad_norm": 0.3619475089093974, + "learning_rate": 1.4981889905585134e-06, + "loss": 0.5446, + "step": 5324 + }, + { + "epoch": 0.9466666666666667, + "grad_norm": 0.3381827481499597, + "learning_rate": 1.4882755078102327e-06, + "loss": 0.5357, + "step": 5325 + }, + { + "epoch": 0.9468444444444445, + "grad_norm": 0.37066267828269134, + "learning_rate": 1.4783946867729547e-06, + "loss": 0.5777, + "step": 5326 + }, + { + "epoch": 0.9470222222222222, + "grad_norm": 0.3706286998566814, + "learning_rate": 1.468546530722681e-06, + "loss": 0.5785, + "step": 5327 + }, + { + "epoch": 0.9472, + "grad_norm": 0.3511841440117857, + "learning_rate": 1.4587310429245882e-06, + "loss": 0.5379, + "step": 5328 + }, + { + "epoch": 0.9473777777777778, + "grad_norm": 0.34150136023459365, + "learning_rate": 1.4489482266329956e-06, + "loss": 0.5693, + "step": 5329 + }, + { + "epoch": 0.9475555555555556, + "grad_norm": 0.36761504570399756, + "learning_rate": 1.4391980850914311e-06, + "loss": 0.5668, + "step": 5330 + }, + { + "epoch": 0.9477333333333333, + "grad_norm": 0.3423601140556568, + "learning_rate": 1.429480621532564e-06, + "loss": 0.5541, + "step": 5331 + }, + { + "epoch": 0.9479111111111111, + "grad_norm": 0.35062857338708975, + "learning_rate": 1.4197958391782284e-06, + "loss": 0.5273, + "step": 5332 + }, + { + "epoch": 0.9480888888888889, + "grad_norm": 0.3438328556286129, + "learning_rate": 1.4101437412394336e-06, + "loss": 0.5905, + "step": 5333 + }, + { + "epoch": 0.9482666666666667, + "grad_norm": 0.3473827178403967, + "learning_rate": 1.4005243309163418e-06, + "loss": 0.5686, + "step": 5334 + }, + { + "epoch": 0.9484444444444444, + "grad_norm": 0.3420838734297711, + "learning_rate": 1.3909376113982798e-06, + "loss": 0.5276, + "step": 5335 + }, + { + "epoch": 0.9486222222222223, + "grad_norm": 0.35300484746503047, + "learning_rate": 1.3813835858637715e-06, + "loss": 0.5747, + "step": 5336 + }, + { + "epoch": 0.9488, + "grad_norm": 0.34766411407479836, + "learning_rate": 1.3718622574804163e-06, + "loss": 0.546, + "step": 5337 + }, + { + "epoch": 0.9489777777777778, + "grad_norm": 0.3365621744039928, + "learning_rate": 1.362373629405067e-06, + "loss": 0.5386, + "step": 5338 + }, + { + "epoch": 0.9491555555555555, + "grad_norm": 0.34917634568600764, + "learning_rate": 1.3529177047836627e-06, + "loss": 0.5577, + "step": 5339 + }, + { + "epoch": 0.9493333333333334, + "grad_norm": 0.39029017879047967, + "learning_rate": 1.343494486751362e-06, + "loss": 0.5844, + "step": 5340 + }, + { + "epoch": 0.9495111111111111, + "grad_norm": 0.3558562257771291, + "learning_rate": 1.3341039784324106e-06, + "loss": 0.5465, + "step": 5341 + }, + { + "epoch": 0.9496888888888889, + "grad_norm": 0.35064301673146553, + "learning_rate": 1.3247461829402729e-06, + "loss": 0.4971, + "step": 5342 + }, + { + "epoch": 0.9498666666666666, + "grad_norm": 0.35849762798032986, + "learning_rate": 1.3154211033775344e-06, + "loss": 0.5518, + "step": 5343 + }, + { + "epoch": 0.9500444444444445, + "grad_norm": 0.34128929216535925, + "learning_rate": 1.3061287428359325e-06, + "loss": 0.5721, + "step": 5344 + }, + { + "epoch": 0.9502222222222222, + "grad_norm": 0.3579953394538537, + "learning_rate": 1.2968691043963699e-06, + "loss": 0.5519, + "step": 5345 + }, + { + "epoch": 0.9504, + "grad_norm": 0.3708075142823777, + "learning_rate": 1.2876421911288905e-06, + "loss": 0.5986, + "step": 5346 + }, + { + "epoch": 0.9505777777777777, + "grad_norm": 0.3574258124770667, + "learning_rate": 1.2784480060926919e-06, + "loss": 0.5623, + "step": 5347 + }, + { + "epoch": 0.9507555555555556, + "grad_norm": 0.3741393739641284, + "learning_rate": 1.269286552336113e-06, + "loss": 0.5228, + "step": 5348 + }, + { + "epoch": 0.9509333333333333, + "grad_norm": 0.3350966867099206, + "learning_rate": 1.2601578328966578e-06, + "loss": 0.5701, + "step": 5349 + }, + { + "epoch": 0.9511111111111111, + "grad_norm": 0.3436924377359069, + "learning_rate": 1.2510618508009608e-06, + "loss": 0.5448, + "step": 5350 + }, + { + "epoch": 0.9512888888888889, + "grad_norm": 0.34746060247246097, + "learning_rate": 1.2419986090648205e-06, + "loss": 0.5868, + "step": 5351 + }, + { + "epoch": 0.9514666666666667, + "grad_norm": 0.36130259692645544, + "learning_rate": 1.2329681106931557e-06, + "loss": 0.5166, + "step": 5352 + }, + { + "epoch": 0.9516444444444444, + "grad_norm": 0.3537993097331322, + "learning_rate": 1.2239703586800378e-06, + "loss": 0.5582, + "step": 5353 + }, + { + "epoch": 0.9518222222222222, + "grad_norm": 0.35351759198379995, + "learning_rate": 1.2150053560087026e-06, + "loss": 0.55, + "step": 5354 + }, + { + "epoch": 0.952, + "grad_norm": 0.36201929801838933, + "learning_rate": 1.2060731056514951e-06, + "loss": 0.5512, + "step": 5355 + }, + { + "epoch": 0.9521777777777778, + "grad_norm": 0.3339019435790876, + "learning_rate": 1.197173610569924e-06, + "loss": 0.5098, + "step": 5356 + }, + { + "epoch": 0.9523555555555555, + "grad_norm": 0.34992317516829263, + "learning_rate": 1.1883068737146285e-06, + "loss": 0.5486, + "step": 5357 + }, + { + "epoch": 0.9525333333333333, + "grad_norm": 0.40767003048670447, + "learning_rate": 1.1794728980253911e-06, + "loss": 0.5959, + "step": 5358 + }, + { + "epoch": 0.9527111111111111, + "grad_norm": 0.38108034839002447, + "learning_rate": 1.170671686431124e-06, + "loss": 0.5604, + "step": 5359 + }, + { + "epoch": 0.9528888888888889, + "grad_norm": 0.35078210040558905, + "learning_rate": 1.161903241849882e-06, + "loss": 0.5381, + "step": 5360 + }, + { + "epoch": 0.9530666666666666, + "grad_norm": 0.3374556292462107, + "learning_rate": 1.1531675671888619e-06, + "loss": 0.5258, + "step": 5361 + }, + { + "epoch": 0.9532444444444444, + "grad_norm": 0.36605471136271917, + "learning_rate": 1.1444646653443914e-06, + "loss": 0.5297, + "step": 5362 + }, + { + "epoch": 0.9534222222222222, + "grad_norm": 0.36669112939208975, + "learning_rate": 1.1357945392019064e-06, + "loss": 0.5897, + "step": 5363 + }, + { + "epoch": 0.9536, + "grad_norm": 0.35287350856307304, + "learning_rate": 1.1271571916360413e-06, + "loss": 0.5862, + "step": 5364 + }, + { + "epoch": 0.9537777777777777, + "grad_norm": 0.34656218670688343, + "learning_rate": 1.1185526255104938e-06, + "loss": 0.5632, + "step": 5365 + }, + { + "epoch": 0.9539555555555556, + "grad_norm": 0.3578602751675772, + "learning_rate": 1.1099808436781378e-06, + "loss": 0.5384, + "step": 5366 + }, + { + "epoch": 0.9541333333333334, + "grad_norm": 0.3444475857913409, + "learning_rate": 1.1014418489809331e-06, + "loss": 0.5026, + "step": 5367 + }, + { + "epoch": 0.9543111111111111, + "grad_norm": 0.3378381163817232, + "learning_rate": 1.092935644250026e-06, + "loss": 0.5372, + "step": 5368 + }, + { + "epoch": 0.9544888888888889, + "grad_norm": 0.3604527420605561, + "learning_rate": 1.0844622323056387e-06, + "loss": 0.5737, + "step": 5369 + }, + { + "epoch": 0.9546666666666667, + "grad_norm": 0.3582489023230677, + "learning_rate": 1.0760216159571679e-06, + "loss": 0.5639, + "step": 5370 + }, + { + "epoch": 0.9548444444444445, + "grad_norm": 0.34267321561581965, + "learning_rate": 1.0676137980030864e-06, + "loss": 0.5314, + "step": 5371 + }, + { + "epoch": 0.9550222222222222, + "grad_norm": 0.3544674141514693, + "learning_rate": 1.0592387812310311e-06, + "loss": 0.5389, + "step": 5372 + }, + { + "epoch": 0.9552, + "grad_norm": 0.3400652746822241, + "learning_rate": 1.0508965684177586e-06, + "loss": 0.5424, + "step": 5373 + }, + { + "epoch": 0.9553777777777778, + "grad_norm": 0.3479779916557745, + "learning_rate": 1.042587162329134e-06, + "loss": 0.5617, + "step": 5374 + }, + { + "epoch": 0.9555555555555556, + "grad_norm": 0.37529025937367, + "learning_rate": 1.0343105657201534e-06, + "loss": 0.5922, + "step": 5375 + }, + { + "epoch": 0.9557333333333333, + "grad_norm": 0.36627646800943486, + "learning_rate": 1.0260667813349445e-06, + "loss": 0.5446, + "step": 5376 + }, + { + "epoch": 0.9559111111111112, + "grad_norm": 0.3521949148309049, + "learning_rate": 1.0178558119067315e-06, + "loss": 0.5546, + "step": 5377 + }, + { + "epoch": 0.9560888888888889, + "grad_norm": 0.3477477444889299, + "learning_rate": 1.0096776601578705e-06, + "loss": 0.5468, + "step": 5378 + }, + { + "epoch": 0.9562666666666667, + "grad_norm": 0.34647139462035736, + "learning_rate": 1.0015323287998702e-06, + "loss": 0.5633, + "step": 5379 + }, + { + "epoch": 0.9564444444444444, + "grad_norm": 0.37700557480211677, + "learning_rate": 9.934198205332924e-07, + "loss": 0.5719, + "step": 5380 + }, + { + "epoch": 0.9566222222222223, + "grad_norm": 0.3616641937202924, + "learning_rate": 9.853401380478743e-07, + "loss": 0.5722, + "step": 5381 + }, + { + "epoch": 0.9568, + "grad_norm": 0.3774046038469421, + "learning_rate": 9.772932840224292e-07, + "loss": 0.5725, + "step": 5382 + }, + { + "epoch": 0.9569777777777778, + "grad_norm": 0.34431839569775835, + "learning_rate": 9.692792611249224e-07, + "loss": 0.5755, + "step": 5383 + }, + { + "epoch": 0.9571555555555555, + "grad_norm": 0.3513637012455241, + "learning_rate": 9.612980720124065e-07, + "loss": 0.5193, + "step": 5384 + }, + { + "epoch": 0.9573333333333334, + "grad_norm": 0.37272470585199746, + "learning_rate": 9.533497193310537e-07, + "loss": 0.5405, + "step": 5385 + }, + { + "epoch": 0.9575111111111111, + "grad_norm": 0.3572153740789805, + "learning_rate": 9.454342057161558e-07, + "loss": 0.5659, + "step": 5386 + }, + { + "epoch": 0.9576888888888889, + "grad_norm": 0.3734913505550753, + "learning_rate": 9.375515337921136e-07, + "loss": 0.5467, + "step": 5387 + }, + { + "epoch": 0.9578666666666666, + "grad_norm": 0.3718162208776986, + "learning_rate": 9.297017061724367e-07, + "loss": 0.5842, + "step": 5388 + }, + { + "epoch": 0.9580444444444445, + "grad_norm": 0.3950648245050587, + "learning_rate": 9.218847254597429e-07, + "loss": 0.5884, + "step": 5389 + }, + { + "epoch": 0.9582222222222222, + "grad_norm": 0.3432289642551042, + "learning_rate": 9.141005942457814e-07, + "loss": 0.5358, + "step": 5390 + }, + { + "epoch": 0.9584, + "grad_norm": 0.39974622614959526, + "learning_rate": 9.063493151113655e-07, + "loss": 0.521, + "step": 5391 + }, + { + "epoch": 0.9585777777777778, + "grad_norm": 0.37147636089316044, + "learning_rate": 8.98630890626484e-07, + "loss": 0.5613, + "step": 5392 + }, + { + "epoch": 0.9587555555555556, + "grad_norm": 0.35183537377008933, + "learning_rate": 8.909453233501452e-07, + "loss": 0.5463, + "step": 5393 + }, + { + "epoch": 0.9589333333333333, + "grad_norm": 0.3921724110598052, + "learning_rate": 8.832926158305444e-07, + "loss": 0.5355, + "step": 5394 + }, + { + "epoch": 0.9591111111111111, + "grad_norm": 0.35512229464074563, + "learning_rate": 8.756727706049295e-07, + "loss": 0.5189, + "step": 5395 + }, + { + "epoch": 0.9592888888888889, + "grad_norm": 0.3991940619770938, + "learning_rate": 8.680857901996798e-07, + "loss": 0.5858, + "step": 5396 + }, + { + "epoch": 0.9594666666666667, + "grad_norm": 0.3633625172871414, + "learning_rate": 8.605316771302719e-07, + "loss": 0.5401, + "step": 5397 + }, + { + "epoch": 0.9596444444444444, + "grad_norm": 0.35451917904652025, + "learning_rate": 8.530104339012801e-07, + "loss": 0.586, + "step": 5398 + }, + { + "epoch": 0.9598222222222222, + "grad_norm": 0.3806410750706742, + "learning_rate": 8.455220630063764e-07, + "loss": 0.5293, + "step": 5399 + }, + { + "epoch": 0.96, + "grad_norm": 0.35760426167700343, + "learning_rate": 8.380665669283527e-07, + "loss": 0.55, + "step": 5400 + }, + { + "epoch": 0.9601777777777778, + "grad_norm": 0.3679085796450376, + "learning_rate": 8.30643948139087e-07, + "loss": 0.5874, + "step": 5401 + }, + { + "epoch": 0.9603555555555555, + "grad_norm": 0.3528354724167527, + "learning_rate": 8.232542090995665e-07, + "loss": 0.5475, + "step": 5402 + }, + { + "epoch": 0.9605333333333334, + "grad_norm": 0.3616565251982588, + "learning_rate": 8.158973522598534e-07, + "loss": 0.5467, + "step": 5403 + }, + { + "epoch": 0.9607111111111111, + "grad_norm": 0.3425461512134011, + "learning_rate": 8.085733800591411e-07, + "loss": 0.5392, + "step": 5404 + }, + { + "epoch": 0.9608888888888889, + "grad_norm": 0.34906939838943474, + "learning_rate": 8.012822949256982e-07, + "loss": 0.5302, + "step": 5405 + }, + { + "epoch": 0.9610666666666666, + "grad_norm": 0.3694655899937999, + "learning_rate": 7.94024099276891e-07, + "loss": 0.5304, + "step": 5406 + }, + { + "epoch": 0.9612444444444445, + "grad_norm": 0.35474902290277344, + "learning_rate": 7.867987955191947e-07, + "loss": 0.573, + "step": 5407 + }, + { + "epoch": 0.9614222222222222, + "grad_norm": 0.35602761656784154, + "learning_rate": 7.796063860481595e-07, + "loss": 0.6127, + "step": 5408 + }, + { + "epoch": 0.9616, + "grad_norm": 0.34456720727142676, + "learning_rate": 7.724468732484336e-07, + "loss": 0.5302, + "step": 5409 + }, + { + "epoch": 0.9617777777777777, + "grad_norm": 0.3421583240788226, + "learning_rate": 7.653202594937848e-07, + "loss": 0.5596, + "step": 5410 + }, + { + "epoch": 0.9619555555555556, + "grad_norm": 0.3356554085273903, + "learning_rate": 7.58226547147034e-07, + "loss": 0.5638, + "step": 5411 + }, + { + "epoch": 0.9621333333333333, + "grad_norm": 0.37827977989320466, + "learning_rate": 7.511657385601223e-07, + "loss": 0.5751, + "step": 5412 + }, + { + "epoch": 0.9623111111111111, + "grad_norm": 0.36017266466342873, + "learning_rate": 7.441378360740659e-07, + "loss": 0.5442, + "step": 5413 + }, + { + "epoch": 0.9624888888888888, + "grad_norm": 0.35887186855440556, + "learning_rate": 7.371428420189896e-07, + "loss": 0.587, + "step": 5414 + }, + { + "epoch": 0.9626666666666667, + "grad_norm": 0.36722490455094403, + "learning_rate": 7.301807587140718e-07, + "loss": 0.5263, + "step": 5415 + }, + { + "epoch": 0.9628444444444444, + "grad_norm": 0.357107058037139, + "learning_rate": 7.232515884676328e-07, + "loss": 0.5606, + "step": 5416 + }, + { + "epoch": 0.9630222222222222, + "grad_norm": 0.3466909570255286, + "learning_rate": 7.163553335770123e-07, + "loss": 0.5609, + "step": 5417 + }, + { + "epoch": 0.9632, + "grad_norm": 0.3561257401402519, + "learning_rate": 7.094919963287039e-07, + "loss": 0.5529, + "step": 5418 + }, + { + "epoch": 0.9633777777777778, + "grad_norm": 0.35009188376432077, + "learning_rate": 7.026615789982426e-07, + "loss": 0.5494, + "step": 5419 + }, + { + "epoch": 0.9635555555555556, + "grad_norm": 0.38536900595610263, + "learning_rate": 6.958640838502617e-07, + "loss": 0.5822, + "step": 5420 + }, + { + "epoch": 0.9637333333333333, + "grad_norm": 0.3401491511918444, + "learning_rate": 6.890995131384914e-07, + "loss": 0.5695, + "step": 5421 + }, + { + "epoch": 0.9639111111111112, + "grad_norm": 0.3474105035663054, + "learning_rate": 6.823678691057378e-07, + "loss": 0.531, + "step": 5422 + }, + { + "epoch": 0.9640888888888889, + "grad_norm": 0.46784446071911057, + "learning_rate": 6.756691539838711e-07, + "loss": 0.6284, + "step": 5423 + }, + { + "epoch": 0.9642666666666667, + "grad_norm": 0.3593624699289564, + "learning_rate": 6.690033699938703e-07, + "loss": 0.565, + "step": 5424 + }, + { + "epoch": 0.9644444444444444, + "grad_norm": 0.36142721943378503, + "learning_rate": 6.623705193457897e-07, + "loss": 0.581, + "step": 5425 + }, + { + "epoch": 0.9646222222222223, + "grad_norm": 0.3502868676050175, + "learning_rate": 6.557706042387479e-07, + "loss": 0.5577, + "step": 5426 + }, + { + "epoch": 0.9648, + "grad_norm": 0.3521697660218038, + "learning_rate": 6.492036268609725e-07, + "loss": 0.5597, + "step": 5427 + }, + { + "epoch": 0.9649777777777778, + "grad_norm": 0.37050049798356277, + "learning_rate": 6.426695893897439e-07, + "loss": 0.5546, + "step": 5428 + }, + { + "epoch": 0.9651555555555555, + "grad_norm": 0.5402142513844844, + "learning_rate": 6.361684939914403e-07, + "loss": 0.5531, + "step": 5429 + }, + { + "epoch": 0.9653333333333334, + "grad_norm": 0.387153544310888, + "learning_rate": 6.297003428215043e-07, + "loss": 0.5899, + "step": 5430 + }, + { + "epoch": 0.9655111111111111, + "grad_norm": 0.37677745294000703, + "learning_rate": 6.232651380244536e-07, + "loss": 0.567, + "step": 5431 + }, + { + "epoch": 0.9656888888888889, + "grad_norm": 0.4231264377129029, + "learning_rate": 6.168628817339151e-07, + "loss": 0.5372, + "step": 5432 + }, + { + "epoch": 0.9658666666666667, + "grad_norm": 0.3568154017915155, + "learning_rate": 6.10493576072535e-07, + "loss": 0.5471, + "step": 5433 + }, + { + "epoch": 0.9660444444444445, + "grad_norm": 0.36449396883116014, + "learning_rate": 6.041572231520909e-07, + "loss": 0.5841, + "step": 5434 + }, + { + "epoch": 0.9662222222222222, + "grad_norm": 0.3602882630383994, + "learning_rate": 5.978538250733912e-07, + "loss": 0.5757, + "step": 5435 + }, + { + "epoch": 0.9664, + "grad_norm": 0.35712339853398994, + "learning_rate": 5.91583383926353e-07, + "loss": 0.5585, + "step": 5436 + }, + { + "epoch": 0.9665777777777778, + "grad_norm": 0.3877252415612054, + "learning_rate": 5.853459017899465e-07, + "loss": 0.5801, + "step": 5437 + }, + { + "epoch": 0.9667555555555556, + "grad_norm": 0.3512628538980189, + "learning_rate": 5.791413807322066e-07, + "loss": 0.5716, + "step": 5438 + }, + { + "epoch": 0.9669333333333333, + "grad_norm": 0.36079640159897963, + "learning_rate": 5.729698228102653e-07, + "loss": 0.562, + "step": 5439 + }, + { + "epoch": 0.9671111111111111, + "grad_norm": 0.34757703685942626, + "learning_rate": 5.668312300703193e-07, + "loss": 0.5626, + "step": 5440 + }, + { + "epoch": 0.9672888888888889, + "grad_norm": 0.3408285438004156, + "learning_rate": 5.607256045475961e-07, + "loss": 0.5145, + "step": 5441 + }, + { + "epoch": 0.9674666666666667, + "grad_norm": 0.3526492581847657, + "learning_rate": 5.546529482664542e-07, + "loss": 0.513, + "step": 5442 + }, + { + "epoch": 0.9676444444444444, + "grad_norm": 0.3901065834536196, + "learning_rate": 5.48613263240283e-07, + "loss": 0.5591, + "step": 5443 + }, + { + "epoch": 0.9678222222222223, + "grad_norm": 0.3759670880425791, + "learning_rate": 5.426065514715583e-07, + "loss": 0.563, + "step": 5444 + }, + { + "epoch": 0.968, + "grad_norm": 0.34732805944771866, + "learning_rate": 5.366328149517985e-07, + "loss": 0.5679, + "step": 5445 + }, + { + "epoch": 0.9681777777777778, + "grad_norm": 0.3288592844304263, + "learning_rate": 5.306920556616079e-07, + "loss": 0.5314, + "step": 5446 + }, + { + "epoch": 0.9683555555555555, + "grad_norm": 0.4675801502975746, + "learning_rate": 5.247842755706556e-07, + "loss": 0.5856, + "step": 5447 + }, + { + "epoch": 0.9685333333333334, + "grad_norm": 0.35493248860792603, + "learning_rate": 5.189094766376857e-07, + "loss": 0.5149, + "step": 5448 + }, + { + "epoch": 0.9687111111111111, + "grad_norm": 0.348665135333137, + "learning_rate": 5.130676608104845e-07, + "loss": 0.5358, + "step": 5449 + }, + { + "epoch": 0.9688888888888889, + "grad_norm": 0.33541154220408803, + "learning_rate": 5.072588300259251e-07, + "loss": 0.5423, + "step": 5450 + }, + { + "epoch": 0.9690666666666666, + "grad_norm": 0.36040820853791644, + "learning_rate": 5.014829862099224e-07, + "loss": 0.5491, + "step": 5451 + }, + { + "epoch": 0.9692444444444445, + "grad_norm": 0.360746446168027, + "learning_rate": 4.957401312774668e-07, + "loss": 0.5153, + "step": 5452 + }, + { + "epoch": 0.9694222222222222, + "grad_norm": 0.3569077457181339, + "learning_rate": 4.90030267132624e-07, + "loss": 0.57, + "step": 5453 + }, + { + "epoch": 0.9696, + "grad_norm": 0.35158743579374396, + "learning_rate": 4.84353395668502e-07, + "loss": 0.5571, + "step": 5454 + }, + { + "epoch": 0.9697777777777777, + "grad_norm": 0.3793438134930771, + "learning_rate": 4.787095187672619e-07, + "loss": 0.5853, + "step": 5455 + }, + { + "epoch": 0.9699555555555556, + "grad_norm": 0.35437261290400834, + "learning_rate": 4.7309863830016233e-07, + "loss": 0.5223, + "step": 5456 + }, + { + "epoch": 0.9701333333333333, + "grad_norm": 0.3512962838910777, + "learning_rate": 4.6752075612748194e-07, + "loss": 0.5175, + "step": 5457 + }, + { + "epoch": 0.9703111111111111, + "grad_norm": 0.34509414671064076, + "learning_rate": 4.6197587409858577e-07, + "loss": 0.5693, + "step": 5458 + }, + { + "epoch": 0.9704888888888888, + "grad_norm": 0.3688843346545426, + "learning_rate": 4.564639940518811e-07, + "loss": 0.5393, + "step": 5459 + }, + { + "epoch": 0.9706666666666667, + "grad_norm": 0.3556890760854046, + "learning_rate": 4.509851178148505e-07, + "loss": 0.5421, + "step": 5460 + }, + { + "epoch": 0.9708444444444444, + "grad_norm": 0.3516577003029395, + "learning_rate": 4.4553924720400765e-07, + "loss": 0.5952, + "step": 5461 + }, + { + "epoch": 0.9710222222222222, + "grad_norm": 0.3467120078411363, + "learning_rate": 4.4012638402495255e-07, + "loss": 0.5165, + "step": 5462 + }, + { + "epoch": 0.9712, + "grad_norm": 0.3656582330315011, + "learning_rate": 4.3474653007231635e-07, + "loss": 0.5489, + "step": 5463 + }, + { + "epoch": 0.9713777777777778, + "grad_norm": 0.3527899949266385, + "learning_rate": 4.293996871298167e-07, + "loss": 0.5644, + "step": 5464 + }, + { + "epoch": 0.9715555555555555, + "grad_norm": 0.35307655352209605, + "learning_rate": 4.240858569701911e-07, + "loss": 0.5387, + "step": 5465 + }, + { + "epoch": 0.9717333333333333, + "grad_norm": 0.48136338695401615, + "learning_rate": 4.1880504135525243e-07, + "loss": 0.5612, + "step": 5466 + }, + { + "epoch": 0.9719111111111111, + "grad_norm": 0.37135339383803606, + "learning_rate": 4.135572420358669e-07, + "loss": 0.5407, + "step": 5467 + }, + { + "epoch": 0.9720888888888889, + "grad_norm": 0.36097237660409126, + "learning_rate": 4.083424607519426e-07, + "loss": 0.5288, + "step": 5468 + }, + { + "epoch": 0.9722666666666666, + "grad_norm": 0.34795068746128205, + "learning_rate": 4.0316069923245216e-07, + "loss": 0.5696, + "step": 5469 + }, + { + "epoch": 0.9724444444444444, + "grad_norm": 0.3494201776097614, + "learning_rate": 3.9801195919541014e-07, + "loss": 0.5696, + "step": 5470 + }, + { + "epoch": 0.9726222222222223, + "grad_norm": 0.3676729833698882, + "learning_rate": 3.9289624234790656e-07, + "loss": 0.5714, + "step": 5471 + }, + { + "epoch": 0.9728, + "grad_norm": 0.35009172317281645, + "learning_rate": 3.878135503860403e-07, + "loss": 0.5597, + "step": 5472 + }, + { + "epoch": 0.9729777777777778, + "grad_norm": 0.41742279189550324, + "learning_rate": 3.827638849950077e-07, + "loss": 0.5498, + "step": 5473 + }, + { + "epoch": 0.9731555555555556, + "grad_norm": 0.3503718331337725, + "learning_rate": 3.7774724784902514e-07, + "loss": 0.5441, + "step": 5474 + }, + { + "epoch": 0.9733333333333334, + "grad_norm": 0.3622794728247527, + "learning_rate": 3.7276364061137327e-07, + "loss": 0.5834, + "step": 5475 + }, + { + "epoch": 0.9735111111111111, + "grad_norm": 0.34447645874557425, + "learning_rate": 3.678130649343525e-07, + "loss": 0.5443, + "step": 5476 + }, + { + "epoch": 0.9736888888888889, + "grad_norm": 0.33317952899149395, + "learning_rate": 3.6289552245935e-07, + "loss": 0.5133, + "step": 5477 + }, + { + "epoch": 0.9738666666666667, + "grad_norm": 0.34268997990101807, + "learning_rate": 3.5801101481679476e-07, + "loss": 0.5353, + "step": 5478 + }, + { + "epoch": 0.9740444444444445, + "grad_norm": 0.34421877002071316, + "learning_rate": 3.531595436261248e-07, + "loss": 0.5289, + "step": 5479 + }, + { + "epoch": 0.9742222222222222, + "grad_norm": 0.3739710723491484, + "learning_rate": 3.483411104958756e-07, + "loss": 0.6048, + "step": 5480 + }, + { + "epoch": 0.9744, + "grad_norm": 0.35903856974801834, + "learning_rate": 3.435557170236026e-07, + "loss": 0.5873, + "step": 5481 + }, + { + "epoch": 0.9745777777777778, + "grad_norm": 0.3577075506589855, + "learning_rate": 3.3880336479590325e-07, + "loss": 0.5417, + "step": 5482 + }, + { + "epoch": 0.9747555555555556, + "grad_norm": 0.360848677637396, + "learning_rate": 3.340840553884284e-07, + "loss": 0.5737, + "step": 5483 + }, + { + "epoch": 0.9749333333333333, + "grad_norm": 0.345084974472568, + "learning_rate": 3.293977903658707e-07, + "loss": 0.5234, + "step": 5484 + }, + { + "epoch": 0.9751111111111112, + "grad_norm": 0.34056513458405746, + "learning_rate": 3.247445712819763e-07, + "loss": 0.5447, + "step": 5485 + }, + { + "epoch": 0.9752888888888889, + "grad_norm": 0.36877192195042774, + "learning_rate": 3.2012439967952224e-07, + "loss": 0.5773, + "step": 5486 + }, + { + "epoch": 0.9754666666666667, + "grad_norm": 0.3448205492016093, + "learning_rate": 3.1553727709032754e-07, + "loss": 0.5162, + "step": 5487 + }, + { + "epoch": 0.9756444444444444, + "grad_norm": 0.4726100505417255, + "learning_rate": 3.1098320503527567e-07, + "loss": 0.5562, + "step": 5488 + }, + { + "epoch": 0.9758222222222223, + "grad_norm": 0.3762643669284478, + "learning_rate": 3.0646218502425886e-07, + "loss": 0.5429, + "step": 5489 + }, + { + "epoch": 0.976, + "grad_norm": 0.3499516844482082, + "learning_rate": 3.0197421855624463e-07, + "loss": 0.5609, + "step": 5490 + }, + { + "epoch": 0.9761777777777778, + "grad_norm": 0.37564789925550124, + "learning_rate": 2.975193071191984e-07, + "loss": 0.5668, + "step": 5491 + }, + { + "epoch": 0.9763555555555555, + "grad_norm": 0.368144490085935, + "learning_rate": 2.9309745219018306e-07, + "loss": 0.5916, + "step": 5492 + }, + { + "epoch": 0.9765333333333334, + "grad_norm": 0.3679616236587089, + "learning_rate": 2.8870865523525915e-07, + "loss": 0.5615, + "step": 5493 + }, + { + "epoch": 0.9767111111111111, + "grad_norm": 0.3690715790947065, + "learning_rate": 2.8435291770952945e-07, + "loss": 0.5981, + "step": 5494 + }, + { + "epoch": 0.9768888888888889, + "grad_norm": 0.3614304588670998, + "learning_rate": 2.8003024105716093e-07, + "loss": 0.5291, + "step": 5495 + }, + { + "epoch": 0.9770666666666666, + "grad_norm": 0.33791120472386843, + "learning_rate": 2.757406267113294e-07, + "loss": 0.5246, + "step": 5496 + }, + { + "epoch": 0.9772444444444445, + "grad_norm": 0.36684205925729624, + "learning_rate": 2.7148407609427493e-07, + "loss": 0.5573, + "step": 5497 + }, + { + "epoch": 0.9774222222222222, + "grad_norm": 0.3695401872898539, + "learning_rate": 2.6726059061725763e-07, + "loss": 0.5401, + "step": 5498 + }, + { + "epoch": 0.9776, + "grad_norm": 0.4072693653699395, + "learning_rate": 2.6307017168057946e-07, + "loss": 0.55, + "step": 5499 + }, + { + "epoch": 0.9777777777777777, + "grad_norm": 0.3692389070393495, + "learning_rate": 2.589128206735847e-07, + "loss": 0.558, + "step": 5500 + }, + { + "epoch": 0.9779555555555556, + "grad_norm": 0.3391823078749012, + "learning_rate": 2.547885389746485e-07, + "loss": 0.5675, + "step": 5501 + }, + { + "epoch": 0.9781333333333333, + "grad_norm": 0.3200204008227983, + "learning_rate": 2.5069732795117706e-07, + "loss": 0.5569, + "step": 5502 + }, + { + "epoch": 0.9783111111111111, + "grad_norm": 0.3384277355644874, + "learning_rate": 2.4663918895961867e-07, + "loss": 0.5312, + "step": 5503 + }, + { + "epoch": 0.9784888888888889, + "grad_norm": 0.3283979876855168, + "learning_rate": 2.4261412334546376e-07, + "loss": 0.4883, + "step": 5504 + }, + { + "epoch": 0.9786666666666667, + "grad_norm": 0.32243300836604566, + "learning_rate": 2.386221324432225e-07, + "loss": 0.5303, + "step": 5505 + }, + { + "epoch": 0.9788444444444444, + "grad_norm": 0.35356185578856764, + "learning_rate": 2.3466321757644738e-07, + "loss": 0.5891, + "step": 5506 + }, + { + "epoch": 0.9790222222222222, + "grad_norm": 0.35582005274556755, + "learning_rate": 2.3073738005771062e-07, + "loss": 0.5725, + "step": 5507 + }, + { + "epoch": 0.9792, + "grad_norm": 0.37710228725285483, + "learning_rate": 2.268446211886599e-07, + "loss": 0.5685, + "step": 5508 + }, + { + "epoch": 0.9793777777777778, + "grad_norm": 0.41710705963907313, + "learning_rate": 2.229849422599073e-07, + "loss": 0.5389, + "step": 5509 + }, + { + "epoch": 0.9795555555555555, + "grad_norm": 0.37425483809417903, + "learning_rate": 2.1915834455116247e-07, + "loss": 0.5837, + "step": 5510 + }, + { + "epoch": 0.9797333333333333, + "grad_norm": 0.3627831290090467, + "learning_rate": 2.1536482933113277e-07, + "loss": 0.5549, + "step": 5511 + }, + { + "epoch": 0.9799111111111111, + "grad_norm": 0.38079303882544413, + "learning_rate": 2.116043978575566e-07, + "loss": 0.5439, + "step": 5512 + }, + { + "epoch": 0.9800888888888889, + "grad_norm": 0.3730370819581776, + "learning_rate": 2.0787705137721437e-07, + "loss": 0.5575, + "step": 5513 + }, + { + "epoch": 0.9802666666666666, + "grad_norm": 0.3686317814718325, + "learning_rate": 2.0418279112592863e-07, + "loss": 0.5495, + "step": 5514 + }, + { + "epoch": 0.9804444444444445, + "grad_norm": 0.37282254742977555, + "learning_rate": 2.0052161832850856e-07, + "loss": 0.5477, + "step": 5515 + }, + { + "epoch": 0.9806222222222222, + "grad_norm": 0.3452617630183563, + "learning_rate": 1.9689353419884982e-07, + "loss": 0.5481, + "step": 5516 + }, + { + "epoch": 0.9808, + "grad_norm": 0.3420643396652723, + "learning_rate": 1.9329853993982349e-07, + "loss": 0.5355, + "step": 5517 + }, + { + "epoch": 0.9809777777777777, + "grad_norm": 0.3615547780550409, + "learning_rate": 1.8973663674337616e-07, + "loss": 0.5506, + "step": 5518 + }, + { + "epoch": 0.9811555555555556, + "grad_norm": 0.3732584748243469, + "learning_rate": 1.8620782579045204e-07, + "loss": 0.5777, + "step": 5519 + }, + { + "epoch": 0.9813333333333333, + "grad_norm": 0.3613884100980496, + "learning_rate": 1.8271210825102636e-07, + "loss": 0.5288, + "step": 5520 + }, + { + "epoch": 0.9815111111111111, + "grad_norm": 0.3445004688165788, + "learning_rate": 1.7924948528412755e-07, + "loss": 0.5623, + "step": 5521 + }, + { + "epoch": 0.9816888888888889, + "grad_norm": 0.3513575965184217, + "learning_rate": 1.7581995803778172e-07, + "loss": 0.5608, + "step": 5522 + }, + { + "epoch": 0.9818666666666667, + "grad_norm": 0.3621118407549723, + "learning_rate": 1.7242352764905712e-07, + "loss": 0.5891, + "step": 5523 + }, + { + "epoch": 0.9820444444444445, + "grad_norm": 0.3460912215034074, + "learning_rate": 1.6906019524405293e-07, + "loss": 0.5587, + "step": 5524 + }, + { + "epoch": 0.9822222222222222, + "grad_norm": 0.3550422459197319, + "learning_rate": 1.6572996193786604e-07, + "loss": 0.5338, + "step": 5525 + }, + { + "epoch": 0.9824, + "grad_norm": 0.34640932884176445, + "learning_rate": 1.624328288346577e-07, + "loss": 0.5376, + "step": 5526 + }, + { + "epoch": 0.9825777777777778, + "grad_norm": 0.39468485751583904, + "learning_rate": 1.5916879702759791e-07, + "loss": 0.5826, + "step": 5527 + }, + { + "epoch": 0.9827555555555556, + "grad_norm": 0.33847566406756374, + "learning_rate": 1.5593786759886542e-07, + "loss": 0.5406, + "step": 5528 + }, + { + "epoch": 0.9829333333333333, + "grad_norm": 0.3711218036793891, + "learning_rate": 1.5274004161970335e-07, + "loss": 0.588, + "step": 5529 + }, + { + "epoch": 0.9831111111111112, + "grad_norm": 0.33041180679215765, + "learning_rate": 1.4957532015034137e-07, + "loss": 0.5202, + "step": 5530 + }, + { + "epoch": 0.9832888888888889, + "grad_norm": 0.35892403257342553, + "learning_rate": 1.4644370424004016e-07, + "loss": 0.5539, + "step": 5531 + }, + { + "epoch": 0.9834666666666667, + "grad_norm": 0.3589863036505411, + "learning_rate": 1.4334519492711362e-07, + "loss": 0.5687, + "step": 5532 + }, + { + "epoch": 0.9836444444444444, + "grad_norm": 0.4011601762703607, + "learning_rate": 1.402797932388511e-07, + "loss": 0.5445, + "step": 5533 + }, + { + "epoch": 0.9838222222222223, + "grad_norm": 0.3518349011977445, + "learning_rate": 1.3724750019161735e-07, + "loss": 0.5492, + "step": 5534 + }, + { + "epoch": 0.984, + "grad_norm": 0.33824305101839586, + "learning_rate": 1.3424831679075267e-07, + "loss": 0.5498, + "step": 5535 + }, + { + "epoch": 0.9841777777777778, + "grad_norm": 0.36741881281851074, + "learning_rate": 1.3128224403065048e-07, + "loss": 0.6051, + "step": 5536 + }, + { + "epoch": 0.9843555555555555, + "grad_norm": 0.3368415234087323, + "learning_rate": 1.2834928289472416e-07, + "loss": 0.547, + "step": 5537 + }, + { + "epoch": 0.9845333333333334, + "grad_norm": 0.3551130166564082, + "learning_rate": 1.254494343553847e-07, + "loss": 0.5621, + "step": 5538 + }, + { + "epoch": 0.9847111111111111, + "grad_norm": 0.36301317101754615, + "learning_rate": 1.225826993740853e-07, + "loss": 0.5236, + "step": 5539 + }, + { + "epoch": 0.9848888888888889, + "grad_norm": 0.33436272750414997, + "learning_rate": 1.1974907890131004e-07, + "loss": 0.5203, + "step": 5540 + }, + { + "epoch": 0.9850666666666666, + "grad_norm": 0.348942392334364, + "learning_rate": 1.1694857387652969e-07, + "loss": 0.5744, + "step": 5541 + }, + { + "epoch": 0.9852444444444445, + "grad_norm": 0.3777856821159728, + "learning_rate": 1.1418118522826814e-07, + "loss": 0.6073, + "step": 5542 + }, + { + "epoch": 0.9854222222222222, + "grad_norm": 0.3966836904203387, + "learning_rate": 1.1144691387405815e-07, + "loss": 0.5415, + "step": 5543 + }, + { + "epoch": 0.9856, + "grad_norm": 0.343002093494387, + "learning_rate": 1.0874576072045228e-07, + "loss": 0.5301, + "step": 5544 + }, + { + "epoch": 0.9857777777777778, + "grad_norm": 0.36895986842017847, + "learning_rate": 1.0607772666302306e-07, + "loss": 0.563, + "step": 5545 + }, + { + "epoch": 0.9859555555555556, + "grad_norm": 0.35532454730600016, + "learning_rate": 1.0344281258634069e-07, + "loss": 0.543, + "step": 5546 + }, + { + "epoch": 0.9861333333333333, + "grad_norm": 0.4179300742293262, + "learning_rate": 1.0084101936403967e-07, + "loss": 0.5685, + "step": 5547 + }, + { + "epoch": 0.9863111111111111, + "grad_norm": 0.34776762268395794, + "learning_rate": 9.827234785874107e-08, + "loss": 0.5414, + "step": 5548 + }, + { + "epoch": 0.9864888888888889, + "grad_norm": 0.3565924009116993, + "learning_rate": 9.573679892209697e-08, + "loss": 0.5365, + "step": 5549 + }, + { + "epoch": 0.9866666666666667, + "grad_norm": 0.3945472781566639, + "learning_rate": 9.323437339475715e-08, + "loss": 0.5597, + "step": 5550 + }, + { + "epoch": 0.9868444444444444, + "grad_norm": 0.36569837114274645, + "learning_rate": 9.076507210641349e-08, + "loss": 0.5806, + "step": 5551 + }, + { + "epoch": 0.9870222222222222, + "grad_norm": 0.39186677212776977, + "learning_rate": 8.832889587576665e-08, + "loss": 0.533, + "step": 5552 + }, + { + "epoch": 0.9872, + "grad_norm": 0.3523771507508271, + "learning_rate": 8.592584551053718e-08, + "loss": 0.5568, + "step": 5553 + }, + { + "epoch": 0.9873777777777778, + "grad_norm": 0.3620987427501741, + "learning_rate": 8.355592180745442e-08, + "loss": 0.5443, + "step": 5554 + }, + { + "epoch": 0.9875555555555555, + "grad_norm": 0.3531817696270951, + "learning_rate": 8.121912555226762e-08, + "loss": 0.5704, + "step": 5555 + }, + { + "epoch": 0.9877333333333334, + "grad_norm": 0.43565048600195005, + "learning_rate": 7.891545751975704e-08, + "loss": 0.5439, + "step": 5556 + }, + { + "epoch": 0.9879111111111111, + "grad_norm": 0.35796229388168166, + "learning_rate": 7.664491847370058e-08, + "loss": 0.5362, + "step": 5557 + }, + { + "epoch": 0.9880888888888889, + "grad_norm": 0.370210252698021, + "learning_rate": 7.44075091669072e-08, + "loss": 0.5699, + "step": 5558 + }, + { + "epoch": 0.9882666666666666, + "grad_norm": 0.35624343892069227, + "learning_rate": 7.220323034117238e-08, + "loss": 0.5281, + "step": 5559 + }, + { + "epoch": 0.9884444444444445, + "grad_norm": 0.341902259729492, + "learning_rate": 7.003208272734484e-08, + "loss": 0.6027, + "step": 5560 + }, + { + "epoch": 0.9886222222222222, + "grad_norm": 0.3759589580980951, + "learning_rate": 6.789406704527102e-08, + "loss": 0.5688, + "step": 5561 + }, + { + "epoch": 0.9888, + "grad_norm": 0.35435420877715273, + "learning_rate": 6.578918400380608e-08, + "loss": 0.5493, + "step": 5562 + }, + { + "epoch": 0.9889777777777777, + "grad_norm": 0.3550138379215988, + "learning_rate": 6.37174343008251e-08, + "loss": 0.5589, + "step": 5563 + }, + { + "epoch": 0.9891555555555556, + "grad_norm": 0.34494653894257016, + "learning_rate": 6.167881862324531e-08, + "loss": 0.5377, + "step": 5564 + }, + { + "epoch": 0.9893333333333333, + "grad_norm": 0.36600237996417695, + "learning_rate": 5.967333764693716e-08, + "loss": 0.5789, + "step": 5565 + }, + { + "epoch": 0.9895111111111111, + "grad_norm": 0.3561551322628878, + "learning_rate": 5.770099203683543e-08, + "loss": 0.5734, + "step": 5566 + }, + { + "epoch": 0.9896888888888888, + "grad_norm": 0.3582113555832297, + "learning_rate": 5.576178244688368e-08, + "loss": 0.5892, + "step": 5567 + }, + { + "epoch": 0.9898666666666667, + "grad_norm": 0.3497097442952161, + "learning_rate": 5.3855709520023165e-08, + "loss": 0.5303, + "step": 5568 + }, + { + "epoch": 0.9900444444444444, + "grad_norm": 0.3539917363723397, + "learning_rate": 5.198277388821504e-08, + "loss": 0.5463, + "step": 5569 + }, + { + "epoch": 0.9902222222222222, + "grad_norm": 0.34107607348128277, + "learning_rate": 5.0142976172429246e-08, + "loss": 0.5312, + "step": 5570 + }, + { + "epoch": 0.9904, + "grad_norm": 0.3567626598098884, + "learning_rate": 4.833631698265562e-08, + "loss": 0.5709, + "step": 5571 + }, + { + "epoch": 0.9905777777777778, + "grad_norm": 0.3373312123765254, + "learning_rate": 4.656279691789278e-08, + "loss": 0.5684, + "step": 5572 + }, + { + "epoch": 0.9907555555555555, + "grad_norm": 0.34874024947239707, + "learning_rate": 4.4822416566170364e-08, + "loss": 0.5836, + "step": 5573 + }, + { + "epoch": 0.9909333333333333, + "grad_norm": 0.3682855535615194, + "learning_rate": 4.311517650449348e-08, + "loss": 0.5652, + "step": 5574 + }, + { + "epoch": 0.9911111111111112, + "grad_norm": 0.3379455688319691, + "learning_rate": 4.144107729890934e-08, + "loss": 0.5751, + "step": 5575 + }, + { + "epoch": 0.9912888888888889, + "grad_norm": 0.35474321916641693, + "learning_rate": 3.980011950446283e-08, + "loss": 0.5935, + "step": 5576 + }, + { + "epoch": 0.9914666666666667, + "grad_norm": 0.3433754359340062, + "learning_rate": 3.819230366521875e-08, + "loss": 0.5111, + "step": 5577 + }, + { + "epoch": 0.9916444444444444, + "grad_norm": 0.3533057673642881, + "learning_rate": 3.6617630314261795e-08, + "loss": 0.5314, + "step": 5578 + }, + { + "epoch": 0.9918222222222223, + "grad_norm": 0.3790287238107738, + "learning_rate": 3.507609997366323e-08, + "loss": 0.5096, + "step": 5579 + }, + { + "epoch": 0.992, + "grad_norm": 0.3378724837986974, + "learning_rate": 3.3567713154525337e-08, + "loss": 0.5486, + "step": 5580 + }, + { + "epoch": 0.9921777777777778, + "grad_norm": 0.34264666190762966, + "learning_rate": 3.2092470356948066e-08, + "loss": 0.5473, + "step": 5581 + }, + { + "epoch": 0.9923555555555555, + "grad_norm": 0.36422873822245894, + "learning_rate": 3.065037207006238e-08, + "loss": 0.5816, + "step": 5582 + }, + { + "epoch": 0.9925333333333334, + "grad_norm": 0.35290101452688716, + "learning_rate": 2.924141877198583e-08, + "loss": 0.5676, + "step": 5583 + }, + { + "epoch": 0.9927111111111111, + "grad_norm": 0.3414961394944708, + "learning_rate": 2.786561092987805e-08, + "loss": 0.5611, + "step": 5584 + }, + { + "epoch": 0.9928888888888889, + "grad_norm": 0.35658471742544945, + "learning_rate": 2.6522948999874175e-08, + "loss": 0.5427, + "step": 5585 + }, + { + "epoch": 0.9930666666666667, + "grad_norm": 0.35565346844172646, + "learning_rate": 2.5213433427140333e-08, + "loss": 0.5786, + "step": 5586 + }, + { + "epoch": 0.9932444444444445, + "grad_norm": 0.3635110179537637, + "learning_rate": 2.3937064645840333e-08, + "loss": 0.56, + "step": 5587 + }, + { + "epoch": 0.9934222222222222, + "grad_norm": 0.3828405268996862, + "learning_rate": 2.2693843079168997e-08, + "loss": 0.5637, + "step": 5588 + }, + { + "epoch": 0.9936, + "grad_norm": 0.36113016022733274, + "learning_rate": 2.1483769139318823e-08, + "loss": 0.5789, + "step": 5589 + }, + { + "epoch": 0.9937777777777778, + "grad_norm": 0.3768477476221962, + "learning_rate": 2.030684322748e-08, + "loss": 0.5278, + "step": 5590 + }, + { + "epoch": 0.9939555555555556, + "grad_norm": 0.3406123490543402, + "learning_rate": 1.9163065733873718e-08, + "loss": 0.5495, + "step": 5591 + }, + { + "epoch": 0.9941333333333333, + "grad_norm": 0.35930426196369447, + "learning_rate": 1.8052437037707758e-08, + "loss": 0.589, + "step": 5592 + }, + { + "epoch": 0.9943111111111111, + "grad_norm": 0.36027880279376373, + "learning_rate": 1.6974957507231993e-08, + "loss": 0.5629, + "step": 5593 + }, + { + "epoch": 0.9944888888888889, + "grad_norm": 0.35053229120586743, + "learning_rate": 1.593062749967178e-08, + "loss": 0.5709, + "step": 5594 + }, + { + "epoch": 0.9946666666666667, + "grad_norm": 0.36176302718256115, + "learning_rate": 1.4919447361283477e-08, + "loss": 0.5147, + "step": 5595 + }, + { + "epoch": 0.9948444444444444, + "grad_norm": 0.3479530557886071, + "learning_rate": 1.3941417427321135e-08, + "loss": 0.5932, + "step": 5596 + }, + { + "epoch": 0.9950222222222223, + "grad_norm": 0.35241188990158684, + "learning_rate": 1.299653802205869e-08, + "loss": 0.5368, + "step": 5597 + }, + { + "epoch": 0.9952, + "grad_norm": 0.3477666321556078, + "learning_rate": 1.2084809458756675e-08, + "loss": 0.5283, + "step": 5598 + }, + { + "epoch": 0.9953777777777778, + "grad_norm": 0.3511916467782138, + "learning_rate": 1.1206232039728814e-08, + "loss": 0.5763, + "step": 5599 + }, + { + "epoch": 0.9955555555555555, + "grad_norm": 0.33903781649015696, + "learning_rate": 1.0360806056242123e-08, + "loss": 0.5349, + "step": 5600 + }, + { + "epoch": 0.9957333333333334, + "grad_norm": 0.3908329648053974, + "learning_rate": 9.548531788605707e-09, + "loss": 0.5953, + "step": 5601 + }, + { + "epoch": 0.9959111111111111, + "grad_norm": 0.33430626933594315, + "learning_rate": 8.76940950612637e-09, + "loss": 0.5574, + "step": 5602 + }, + { + "epoch": 0.9960888888888889, + "grad_norm": 0.33532419227127347, + "learning_rate": 8.023439467141902e-09, + "loss": 0.536, + "step": 5603 + }, + { + "epoch": 0.9962666666666666, + "grad_norm": 0.339786407813064, + "learning_rate": 7.3106219189655875e-09, + "loss": 0.5509, + "step": 5604 + }, + { + "epoch": 0.9964444444444445, + "grad_norm": 0.4177124318949463, + "learning_rate": 6.630957097930601e-09, + "loss": 0.5578, + "step": 5605 + }, + { + "epoch": 0.9966222222222222, + "grad_norm": 0.3462871505625693, + "learning_rate": 5.984445229390012e-09, + "loss": 0.5053, + "step": 5606 + }, + { + "epoch": 0.9968, + "grad_norm": 0.3344964090228748, + "learning_rate": 5.371086527683478e-09, + "loss": 0.5295, + "step": 5607 + }, + { + "epoch": 0.9969777777777777, + "grad_norm": 0.35909450849651425, + "learning_rate": 4.7908811961816514e-09, + "loss": 0.552, + "step": 5608 + }, + { + "epoch": 0.9971555555555556, + "grad_norm": 0.3575909896274203, + "learning_rate": 4.2438294272528765e-09, + "loss": 0.5351, + "step": 5609 + }, + { + "epoch": 0.9973333333333333, + "grad_norm": 0.34417726966370205, + "learning_rate": 3.7299314022631874e-09, + "loss": 0.5107, + "step": 5610 + }, + { + "epoch": 0.9975111111111111, + "grad_norm": 0.3617979529916427, + "learning_rate": 3.249187291609612e-09, + "loss": 0.5871, + "step": 5611 + }, + { + "epoch": 0.9976888888888888, + "grad_norm": 0.3860283996651874, + "learning_rate": 2.8015972546646674e-09, + "loss": 0.5862, + "step": 5612 + }, + { + "epoch": 0.9978666666666667, + "grad_norm": 0.34794627497920005, + "learning_rate": 2.387161439854069e-09, + "loss": 0.5626, + "step": 5613 + }, + { + "epoch": 0.9980444444444444, + "grad_norm": 0.3702338315420281, + "learning_rate": 2.005879984556813e-09, + "loss": 0.556, + "step": 5614 + }, + { + "epoch": 0.9982222222222222, + "grad_norm": 0.3715422126141103, + "learning_rate": 1.657753015205099e-09, + "loss": 0.5546, + "step": 5615 + }, + { + "epoch": 0.9984, + "grad_norm": 0.35147150675439054, + "learning_rate": 1.3427806472177118e-09, + "loss": 0.572, + "step": 5616 + }, + { + "epoch": 0.9985777777777778, + "grad_norm": 0.3374125581857282, + "learning_rate": 1.0609629850222292e-09, + "loss": 0.5381, + "step": 5617 + }, + { + "epoch": 0.9987555555555555, + "grad_norm": 0.3728159392608189, + "learning_rate": 8.123001220550208e-10, + "loss": 0.5776, + "step": 5618 + }, + { + "epoch": 0.9989333333333333, + "grad_norm": 0.35312986373587263, + "learning_rate": 5.967921407612487e-10, + "loss": 0.5096, + "step": 5619 + }, + { + "epoch": 0.9991111111111111, + "grad_norm": 0.3860609285919614, + "learning_rate": 4.144391126059688e-10, + "loss": 0.5974, + "step": 5620 + }, + { + "epoch": 0.9992888888888889, + "grad_norm": 0.33070673654349286, + "learning_rate": 2.652410980186204e-10, + "loss": 0.5489, + "step": 5621 + }, + { + "epoch": 0.9994666666666666, + "grad_norm": 0.3257573690291597, + "learning_rate": 1.4919814649294594e-10, + "loss": 0.5492, + "step": 5622 + }, + { + "epoch": 0.9996444444444444, + "grad_norm": 0.3290364798907909, + "learning_rate": 6.631029648707099e-11, + "loss": 0.5302, + "step": 5623 + }, + { + "epoch": 0.9998222222222222, + "grad_norm": 0.3601730077323053, + "learning_rate": 1.6577575501219854e-11, + "loss": 0.5653, + "step": 5624 + }, + { + "epoch": 1.0, + "grad_norm": 0.35874852658182327, + "learning_rate": 0.0, + "loss": 0.541, + "step": 5625 + }, + { + "epoch": 1.0, + "step": 5625, + "total_flos": 5002338659401728.0, + "train_loss": 0.6259202656692928, + "train_runtime": 89183.5058, + "train_samples_per_second": 1.009, + "train_steps_per_second": 0.063 + } + ], + "logging_steps": 1.0, + "max_steps": 5625, + "num_input_tokens_seen": 0, + "num_train_epochs": 1, + "save_steps": 500, + "stateful_callbacks": { + "TrainerControl": { + "args": { + "should_epoch_stop": false, + "should_evaluate": false, + "should_log": false, + "should_save": true, + "should_training_stop": false + }, + "attributes": {} + } + }, + "total_flos": 5002338659401728.0, + "train_batch_size": 8, + "trial_name": null, + "trial_params": null +} diff --git a/single_dataset/img2json/bugsBunny-Llama-3-8B-V-img2json_dataset_10000_epochs_1_lora/README.md b/single_dataset/img2json/bugsBunny-Llama-3-8B-V-img2json_dataset_10000_epochs_1_lora/README.md new file mode 100644 index 0000000000000000000000000000000000000000..53481dd51193a0e71928271293246738288877dc --- /dev/null +++ b/single_dataset/img2json/bugsBunny-Llama-3-8B-V-img2json_dataset_10000_epochs_1_lora/README.md @@ -0,0 +1,202 @@ +--- +library_name: peft +base_model: ./weights/Bunny-Llama-3-8B-V +--- + +# Model Card for Model ID + + + + + +## Model Details + +### Model Description + + + + + +- **Developed by:** [More Information Needed] +- **Funded by [optional]:** [More Information Needed] +- **Shared by [optional]:** [More Information Needed] +- **Model type:** [More Information Needed] +- **Language(s) (NLP):** [More Information Needed] +- **License:** [More Information Needed] +- **Finetuned from model [optional]:** [More Information Needed] + +### Model Sources [optional] + + + +- **Repository:** [More Information Needed] +- **Paper [optional]:** [More Information Needed] +- **Demo [optional]:** [More Information Needed] + +## Uses + + + +### Direct Use + + + +[More Information Needed] + +### Downstream Use [optional] + + + +[More Information Needed] + +### Out-of-Scope Use + + + +[More Information Needed] + +## Bias, Risks, and Limitations + + + +[More Information Needed] + +### Recommendations + + + +Users (both direct and downstream) should be made aware of the risks, biases and limitations of the model. More information needed for further recommendations. + +## How to Get Started with the Model + +Use the code below to get started with the model. + +[More Information Needed] + +## Training Details + +### Training Data + + + +[More Information Needed] + +### Training Procedure + + + +#### Preprocessing [optional] + +[More Information Needed] + + +#### Training Hyperparameters + +- **Training regime:** [More Information Needed] + +#### Speeds, Sizes, Times [optional] + + + +[More Information Needed] + +## Evaluation + + + +### Testing Data, Factors & Metrics + +#### Testing Data + + + +[More Information Needed] + +#### Factors + + + +[More Information Needed] + +#### Metrics + + + +[More Information Needed] + +### Results + +[More Information Needed] + +#### Summary + + + +## Model Examination [optional] + + + +[More Information Needed] + +## Environmental Impact + + + +Carbon emissions can be estimated using the [Machine Learning Impact calculator](https://mlco2.github.io/impact#compute) presented in [Lacoste et al. (2019)](https://arxiv.org/abs/1910.09700). + +- **Hardware Type:** [More Information Needed] +- **Hours used:** [More Information Needed] +- **Cloud Provider:** [More Information Needed] +- **Compute Region:** [More Information Needed] +- **Carbon Emitted:** [More Information Needed] + +## Technical Specifications [optional] + +### Model Architecture and Objective + +[More Information Needed] + +### Compute Infrastructure + +[More Information Needed] + +#### Hardware + +[More Information Needed] + +#### Software + +[More Information Needed] + +## Citation [optional] + + + +**BibTeX:** + +[More Information Needed] + +**APA:** + +[More Information Needed] + +## Glossary [optional] + + + +[More Information Needed] + +## More Information [optional] + +[More Information Needed] + +## Model Card Authors [optional] + +[More Information Needed] + +## Model Card Contact + +[More Information Needed] +### Framework versions + +- PEFT 0.11.1 \ No newline at end of file diff --git a/single_dataset/img2json/bugsBunny-Llama-3-8B-V-img2json_dataset_10000_epochs_1_lora/adapter_config.json b/single_dataset/img2json/bugsBunny-Llama-3-8B-V-img2json_dataset_10000_epochs_1_lora/adapter_config.json new file mode 100644 index 0000000000000000000000000000000000000000..f02a35e8a576e842bc12654048b6e9d5e4215ef0 --- /dev/null +++ b/single_dataset/img2json/bugsBunny-Llama-3-8B-V-img2json_dataset_10000_epochs_1_lora/adapter_config.json @@ -0,0 +1,34 @@ +{ + "alpha_pattern": {}, + "auto_mapping": null, + "base_model_name_or_path": "./weights/Bunny-Llama-3-8B-V", + "bias": "none", + "fan_in_fan_out": false, + "inference_mode": true, + "init_lora_weights": true, + "layer_replication": null, + "layers_pattern": null, + "layers_to_transform": null, + "loftq_config": {}, + "lora_alpha": 256, + "lora_dropout": 0.1, + "megatron_config": null, + "megatron_core": "megatron.core", + "modules_to_save": null, + "peft_type": "LORA", + "r": 128, + "rank_pattern": {}, + "revision": null, + "target_modules": [ + "v_proj", + "q_proj", + "gate_proj", + "o_proj", + "up_proj", + "k_proj", + "down_proj" + ], + "task_type": "CAUSAL_LM", + "use_dora": false, + "use_rslora": false +} \ No newline at end of file diff --git a/single_dataset/img2json/bugsBunny-Llama-3-8B-V-img2json_dataset_10000_epochs_1_lora/adapter_model.safetensors b/single_dataset/img2json/bugsBunny-Llama-3-8B-V-img2json_dataset_10000_epochs_1_lora/adapter_model.safetensors new file mode 100644 index 0000000000000000000000000000000000000000..8eed616562ddec43e9cb7b0568fc4e34492df209 --- /dev/null +++ b/single_dataset/img2json/bugsBunny-Llama-3-8B-V-img2json_dataset_10000_epochs_1_lora/adapter_model.safetensors @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:f23be53bc331681a0eba4d7cd4a5068be16c7d7fde77a0cb8283b026b9c3940a +size 671150064 diff --git a/single_dataset/img2json/bugsBunny-Llama-3-8B-V-img2json_dataset_10000_epochs_1_lora/config.json b/single_dataset/img2json/bugsBunny-Llama-3-8B-V-img2json_dataset_10000_epochs_1_lora/config.json new file mode 100644 index 0000000000000000000000000000000000000000..14d0036f2d6ef7a43e27dd6ab3975619d8bb57a4 --- /dev/null +++ b/single_dataset/img2json/bugsBunny-Llama-3-8B-V-img2json_dataset_10000_epochs_1_lora/config.json @@ -0,0 +1,45 @@ +{ + "_name_or_path": "./weights/Bunny-Llama-3-8B-V", + "architectures": [ + "BunnyLlamaForCausalLM" + ], + "attention_bias": false, + "attention_dropout": 0.0, + "auto_map": { + "AutoConfig": "configuration_bunny_llama.BunnyLlamaConfig", + "AutoModelForCausalLM": "modeling_bunny_llama.BunnyLlamaForCausalLM" + }, + "bos_token_id": 128000, + "continuous_training": false, + "eos_token_id": 128001, + "freeze_mm_mlp_adapter": false, + "hidden_act": "silu", + "hidden_size": 4096, + "image_aspect_ratio": "pad", + "initializer_range": 0.02, + "intermediate_size": 14336, + "max_position_embeddings": 8192, + "mm_hidden_size": 1152, + "mm_projector_lr": null, + "mm_projector_type": "mlp2x_gelu", + "mm_vision_tower": "./weights/siglip-so400m-patch14-384", + "model_type": "bunny-llama", + "num_attention_heads": 32, + "num_hidden_layers": 32, + "num_key_value_heads": 8, + "pretraining_tp": 1, + "rms_norm_eps": 1e-05, + "rope_scaling": null, + "rope_theta": 500000.0, + "tie_word_embeddings": false, + "tokenizer_model_max_length": 2048, + "tokenizer_padding_side": "right", + "torch_dtype": "float16", + "transformers_version": "4.41.2", + "tune_mm_mlp_adapter": false, + "unfreeze_vision_tower": true, + "use_cache": true, + "use_mm_proj": true, + "use_s2": false, + "vocab_size": 128256 +} diff --git a/single_dataset/img2json/bugsBunny-Llama-3-8B-V-img2json_dataset_10000_epochs_1_lora/non_lora_trainables.bin b/single_dataset/img2json/bugsBunny-Llama-3-8B-V-img2json_dataset_10000_epochs_1_lora/non_lora_trainables.bin new file mode 100644 index 0000000000000000000000000000000000000000..36f429b81781b66e27ce12b99b246e24626f811a --- /dev/null +++ b/single_dataset/img2json/bugsBunny-Llama-3-8B-V-img2json_dataset_10000_epochs_1_lora/non_lora_trainables.bin @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:a1389ce5936565078a758a0a88966971911a10e44b104a48ad0fce7876622673 +size 899633034 diff --git a/single_dataset/img2json/bugsBunny-Llama-3-8B-V-img2json_dataset_10000_epochs_1_lora/trainer_state.json b/single_dataset/img2json/bugsBunny-Llama-3-8B-V-img2json_dataset_10000_epochs_1_lora/trainer_state.json new file mode 100644 index 0000000000000000000000000000000000000000..faf7d29d8e68afc9227e643496ccf27c8e88977f --- /dev/null +++ b/single_dataset/img2json/bugsBunny-Llama-3-8B-V-img2json_dataset_10000_epochs_1_lora/trainer_state.json @@ -0,0 +1,4417 @@ +{ + "best_metric": null, + "best_model_checkpoint": null, + "epoch": 1.0, + "eval_steps": 500, + "global_step": 625, + "is_hyper_param_search": false, + "is_local_process_zero": true, + "is_world_process_zero": true, + "log_history": [ + { + "epoch": 0.0016, + "grad_norm": 0.9033404973257572, + "learning_rate": 1.0526315789473684e-05, + "loss": 1.3702, + "step": 1 + }, + { + "epoch": 0.0032, + "grad_norm": 0.9200655478154655, + "learning_rate": 2.105263157894737e-05, + "loss": 1.4091, + "step": 2 + }, + { + "epoch": 0.0048, + "grad_norm": 0.8568115900101086, + "learning_rate": 3.157894736842105e-05, + "loss": 1.3421, + "step": 3 + }, + { + "epoch": 0.0064, + "grad_norm": 0.7276561318273437, + "learning_rate": 4.210526315789474e-05, + "loss": 1.2378, + "step": 4 + }, + { + "epoch": 0.008, + "grad_norm": 0.80630356176694, + "learning_rate": 5.2631578947368424e-05, + "loss": 1.1796, + "step": 5 + }, + { + "epoch": 0.0096, + "grad_norm": 0.8324188030112533, + "learning_rate": 6.31578947368421e-05, + "loss": 1.1077, + "step": 6 + }, + { + "epoch": 0.0112, + "grad_norm": 0.8239631251583869, + "learning_rate": 7.368421052631579e-05, + "loss": 0.9959, + "step": 7 + }, + { + "epoch": 0.0128, + "grad_norm": 0.7639673779925863, + "learning_rate": 8.421052631578948e-05, + "loss": 0.9213, + "step": 8 + }, + { + "epoch": 0.0144, + "grad_norm": 0.8024484626222647, + "learning_rate": 9.473684210526316e-05, + "loss": 0.853, + "step": 9 + }, + { + "epoch": 0.016, + "grad_norm": 0.6531227626647346, + "learning_rate": 0.00010526315789473685, + "loss": 0.8696, + "step": 10 + }, + { + "epoch": 0.0176, + "grad_norm": 0.5003114081934658, + "learning_rate": 0.00011578947368421053, + "loss": 0.8229, + "step": 11 + }, + { + "epoch": 0.0192, + "grad_norm": 0.43361106978087643, + "learning_rate": 0.0001263157894736842, + "loss": 0.8606, + "step": 12 + }, + { + "epoch": 0.0208, + "grad_norm": 0.3989391164413557, + "learning_rate": 0.0001368421052631579, + "loss": 0.8001, + "step": 13 + }, + { + "epoch": 0.0224, + "grad_norm": 0.37635276210072166, + "learning_rate": 0.00014736842105263158, + "loss": 0.736, + "step": 14 + }, + { + "epoch": 0.024, + "grad_norm": 0.3749720016620839, + "learning_rate": 0.00015789473684210527, + "loss": 0.7812, + "step": 15 + }, + { + "epoch": 0.0256, + "grad_norm": 0.38822191417536056, + "learning_rate": 0.00016842105263157895, + "loss": 0.8408, + "step": 16 + }, + { + "epoch": 0.0272, + "grad_norm": 0.33905688002757933, + "learning_rate": 0.00017894736842105264, + "loss": 0.7776, + "step": 17 + }, + { + "epoch": 0.0288, + "grad_norm": 0.4009310777468807, + "learning_rate": 0.00018947368421052632, + "loss": 0.8082, + "step": 18 + }, + { + "epoch": 0.0304, + "grad_norm": 0.396512227244631, + "learning_rate": 0.0002, + "loss": 0.7607, + "step": 19 + }, + { + "epoch": 0.032, + "grad_norm": 0.38131463269530463, + "learning_rate": 0.00019999865623437013, + "loss": 0.8209, + "step": 20 + }, + { + "epoch": 0.0336, + "grad_norm": 0.388771662310261, + "learning_rate": 0.00019999462497359466, + "loss": 0.737, + "step": 21 + }, + { + "epoch": 0.0352, + "grad_norm": 0.37535124351711646, + "learning_rate": 0.00019998790632601496, + "loss": 0.7578, + "step": 22 + }, + { + "epoch": 0.0368, + "grad_norm": 0.33381879122801084, + "learning_rate": 0.0001999785004721968, + "loss": 0.7329, + "step": 23 + }, + { + "epoch": 0.0384, + "grad_norm": 0.34652640125583617, + "learning_rate": 0.00019996640766492543, + "loss": 0.7115, + "step": 24 + }, + { + "epoch": 0.04, + "grad_norm": 0.3449929315808586, + "learning_rate": 0.00019995162822919883, + "loss": 0.7175, + "step": 25 + }, + { + "epoch": 0.0416, + "grad_norm": 0.3458815755530234, + "learning_rate": 0.00019993416256221895, + "loss": 0.8123, + "step": 26 + }, + { + "epoch": 0.0432, + "grad_norm": 0.342032875398569, + "learning_rate": 0.00019991401113338104, + "loss": 0.7321, + "step": 27 + }, + { + "epoch": 0.0448, + "grad_norm": 0.31766514898039433, + "learning_rate": 0.00019989117448426108, + "loss": 0.7129, + "step": 28 + }, + { + "epoch": 0.0464, + "grad_norm": 0.32126953003960074, + "learning_rate": 0.00019986565322860115, + "loss": 0.7274, + "step": 29 + }, + { + "epoch": 0.048, + "grad_norm": 0.32536127409631893, + "learning_rate": 0.00019983744805229296, + "loss": 0.7637, + "step": 30 + }, + { + "epoch": 0.0496, + "grad_norm": 0.314487984166115, + "learning_rate": 0.00019980655971335945, + "loss": 0.6898, + "step": 31 + }, + { + "epoch": 0.0512, + "grad_norm": 0.31136998311660374, + "learning_rate": 0.00019977298904193437, + "loss": 0.684, + "step": 32 + }, + { + "epoch": 0.0528, + "grad_norm": 0.32482632417484913, + "learning_rate": 0.00019973673694024, + "loss": 0.7235, + "step": 33 + }, + { + "epoch": 0.0544, + "grad_norm": 0.3099515319366386, + "learning_rate": 0.00019969780438256293, + "loss": 0.6873, + "step": 34 + }, + { + "epoch": 0.056, + "grad_norm": 0.30107064068065836, + "learning_rate": 0.0001996561924152278, + "loss": 0.6785, + "step": 35 + }, + { + "epoch": 0.0576, + "grad_norm": 0.2941824903305334, + "learning_rate": 0.0001996119021565693, + "loss": 0.6513, + "step": 36 + }, + { + "epoch": 0.0592, + "grad_norm": 0.30994923914853884, + "learning_rate": 0.0001995649347969019, + "loss": 0.7031, + "step": 37 + }, + { + "epoch": 0.0608, + "grad_norm": 0.3037113147667233, + "learning_rate": 0.00019951529159848805, + "loss": 0.7109, + "step": 38 + }, + { + "epoch": 0.0624, + "grad_norm": 0.3088199641547337, + "learning_rate": 0.00019946297389550433, + "loss": 0.7703, + "step": 39 + }, + { + "epoch": 0.064, + "grad_norm": 0.30294503947884, + "learning_rate": 0.00019940798309400526, + "loss": 0.7064, + "step": 40 + }, + { + "epoch": 0.0656, + "grad_norm": 0.3009478698388032, + "learning_rate": 0.0001993503206718859, + "loss": 0.7027, + "step": 41 + }, + { + "epoch": 0.0672, + "grad_norm": 0.32279287397339806, + "learning_rate": 0.00019928998817884182, + "loss": 0.6876, + "step": 42 + }, + { + "epoch": 0.0688, + "grad_norm": 0.30000672326660394, + "learning_rate": 0.00019922698723632767, + "loss": 0.6574, + "step": 43 + }, + { + "epoch": 0.0704, + "grad_norm": 0.3013197784172169, + "learning_rate": 0.00019916131953751342, + "loss": 0.6766, + "step": 44 + }, + { + "epoch": 0.072, + "grad_norm": 0.30003465545797525, + "learning_rate": 0.00019909298684723904, + "loss": 0.718, + "step": 45 + }, + { + "epoch": 0.0736, + "grad_norm": 0.28608785901924544, + "learning_rate": 0.00019902199100196697, + "loss": 0.6536, + "step": 46 + }, + { + "epoch": 0.0752, + "grad_norm": 0.29444853278829763, + "learning_rate": 0.00019894833390973266, + "loss": 0.6514, + "step": 47 + }, + { + "epoch": 0.0768, + "grad_norm": 0.2962330894917321, + "learning_rate": 0.00019887201755009357, + "loss": 0.6802, + "step": 48 + }, + { + "epoch": 0.0784, + "grad_norm": 0.29757697902300934, + "learning_rate": 0.0001987930439740757, + "loss": 0.6645, + "step": 49 + }, + { + "epoch": 0.08, + "grad_norm": 0.3219996792532637, + "learning_rate": 0.00019871141530411853, + "loss": 0.6974, + "step": 50 + }, + { + "epoch": 0.0816, + "grad_norm": 0.2885633090251933, + "learning_rate": 0.0001986271337340182, + "loss": 0.683, + "step": 51 + }, + { + "epoch": 0.0832, + "grad_norm": 0.3032543344053866, + "learning_rate": 0.00019854020152886814, + "loss": 0.712, + "step": 52 + }, + { + "epoch": 0.0848, + "grad_norm": 0.2906786212439371, + "learning_rate": 0.0001984506210249986, + "loss": 0.7192, + "step": 53 + }, + { + "epoch": 0.0864, + "grad_norm": 0.30456504810978957, + "learning_rate": 0.00019835839462991361, + "loss": 0.7153, + "step": 54 + }, + { + "epoch": 0.088, + "grad_norm": 0.3109362660245635, + "learning_rate": 0.00019826352482222638, + "loss": 0.6616, + "step": 55 + }, + { + "epoch": 0.0896, + "grad_norm": 0.29946421248714533, + "learning_rate": 0.00019816601415159263, + "loss": 0.6651, + "step": 56 + }, + { + "epoch": 0.0912, + "grad_norm": 0.2741698159306603, + "learning_rate": 0.0001980658652386421, + "loss": 0.5766, + "step": 57 + }, + { + "epoch": 0.0928, + "grad_norm": 0.2932323716173052, + "learning_rate": 0.00019796308077490817, + "loss": 0.6815, + "step": 58 + }, + { + "epoch": 0.0944, + "grad_norm": 0.3039664118741775, + "learning_rate": 0.00019785766352275542, + "loss": 0.7063, + "step": 59 + }, + { + "epoch": 0.096, + "grad_norm": 0.2835424116558143, + "learning_rate": 0.00019774961631530545, + "loss": 0.6568, + "step": 60 + }, + { + "epoch": 0.0976, + "grad_norm": 0.2903373260317771, + "learning_rate": 0.00019763894205636072, + "loss": 0.6829, + "step": 61 + }, + { + "epoch": 0.0992, + "grad_norm": 0.2908879079771421, + "learning_rate": 0.00019752564372032657, + "loss": 0.6827, + "step": 62 + }, + { + "epoch": 0.1008, + "grad_norm": 0.2820705964805036, + "learning_rate": 0.00019740972435213115, + "loss": 0.6353, + "step": 63 + }, + { + "epoch": 0.1024, + "grad_norm": 0.282779416359093, + "learning_rate": 0.00019729118706714375, + "loss": 0.6722, + "step": 64 + }, + { + "epoch": 0.104, + "grad_norm": 0.2968739512906537, + "learning_rate": 0.00019717003505109095, + "loss": 0.7101, + "step": 65 + }, + { + "epoch": 0.1056, + "grad_norm": 0.307347798581539, + "learning_rate": 0.00019704627155997108, + "loss": 0.6958, + "step": 66 + }, + { + "epoch": 0.1072, + "grad_norm": 0.2884637033507374, + "learning_rate": 0.00019691989991996663, + "loss": 0.6534, + "step": 67 + }, + { + "epoch": 0.1088, + "grad_norm": 0.30038576733393246, + "learning_rate": 0.0001967909235273549, + "loss": 0.6998, + "step": 68 + }, + { + "epoch": 0.1104, + "grad_norm": 0.29192806208266164, + "learning_rate": 0.00019665934584841682, + "loss": 0.6843, + "step": 69 + }, + { + "epoch": 0.112, + "grad_norm": 0.2828143400360726, + "learning_rate": 0.00019652517041934356, + "loss": 0.644, + "step": 70 + }, + { + "epoch": 0.1136, + "grad_norm": 0.28909264028318, + "learning_rate": 0.00019638840084614182, + "loss": 0.6869, + "step": 71 + }, + { + "epoch": 0.1152, + "grad_norm": 0.30696934408671855, + "learning_rate": 0.00019624904080453655, + "loss": 0.6412, + "step": 72 + }, + { + "epoch": 0.1168, + "grad_norm": 0.28023002743316044, + "learning_rate": 0.00019610709403987246, + "loss": 0.6529, + "step": 73 + }, + { + "epoch": 0.1184, + "grad_norm": 0.28117857965998544, + "learning_rate": 0.00019596256436701324, + "loss": 0.6315, + "step": 74 + }, + { + "epoch": 0.12, + "grad_norm": 0.28029665183934693, + "learning_rate": 0.000195815455670239, + "loss": 0.6506, + "step": 75 + }, + { + "epoch": 0.1216, + "grad_norm": 0.2949384146250226, + "learning_rate": 0.00019566577190314197, + "loss": 0.6907, + "step": 76 + }, + { + "epoch": 0.1232, + "grad_norm": 0.28744268327217054, + "learning_rate": 0.0001955135170885202, + "loss": 0.6445, + "step": 77 + }, + { + "epoch": 0.1248, + "grad_norm": 0.29983300948861297, + "learning_rate": 0.00019535869531826937, + "loss": 0.6671, + "step": 78 + }, + { + "epoch": 0.1264, + "grad_norm": 0.3038698684581471, + "learning_rate": 0.00019520131075327298, + "loss": 0.6499, + "step": 79 + }, + { + "epoch": 0.128, + "grad_norm": 0.2856930064996507, + "learning_rate": 0.00019504136762329047, + "loss": 0.6292, + "step": 80 + }, + { + "epoch": 0.1296, + "grad_norm": 0.28690016312392796, + "learning_rate": 0.00019487887022684336, + "loss": 0.6734, + "step": 81 + }, + { + "epoch": 0.1312, + "grad_norm": 0.29270353198941823, + "learning_rate": 0.00019471382293110003, + "loss": 0.6628, + "step": 82 + }, + { + "epoch": 0.1328, + "grad_norm": 0.28376580045137256, + "learning_rate": 0.00019454623017175812, + "loss": 0.6208, + "step": 83 + }, + { + "epoch": 0.1344, + "grad_norm": 0.2970281168568997, + "learning_rate": 0.00019437609645292546, + "loss": 0.6468, + "step": 84 + }, + { + "epoch": 0.136, + "grad_norm": 0.2971345990174716, + "learning_rate": 0.0001942034263469989, + "loss": 0.6821, + "step": 85 + }, + { + "epoch": 0.1376, + "grad_norm": 0.29653412404879576, + "learning_rate": 0.00019402822449454153, + "loss": 0.6553, + "step": 86 + }, + { + "epoch": 0.1392, + "grad_norm": 0.27626070918423495, + "learning_rate": 0.00019385049560415794, + "loss": 0.6238, + "step": 87 + }, + { + "epoch": 0.1408, + "grad_norm": 0.2876338029240312, + "learning_rate": 0.00019367024445236754, + "loss": 0.6563, + "step": 88 + }, + { + "epoch": 0.1424, + "grad_norm": 0.2965477447426151, + "learning_rate": 0.00019348747588347637, + "loss": 0.6786, + "step": 89 + }, + { + "epoch": 0.144, + "grad_norm": 0.3487212794023846, + "learning_rate": 0.00019330219480944694, + "loss": 0.7069, + "step": 90 + }, + { + "epoch": 0.1456, + "grad_norm": 0.2878640107558607, + "learning_rate": 0.00019311440620976597, + "loss": 0.6559, + "step": 91 + }, + { + "epoch": 0.1472, + "grad_norm": 0.28113466605686666, + "learning_rate": 0.0001929241151313108, + "loss": 0.6509, + "step": 92 + }, + { + "epoch": 0.1488, + "grad_norm": 0.29288843234343526, + "learning_rate": 0.00019273132668821364, + "loss": 0.6917, + "step": 93 + }, + { + "epoch": 0.1504, + "grad_norm": 0.2849405008631043, + "learning_rate": 0.00019253604606172417, + "loss": 0.6801, + "step": 94 + }, + { + "epoch": 0.152, + "grad_norm": 0.2860251354270671, + "learning_rate": 0.00019233827850007027, + "loss": 0.6768, + "step": 95 + }, + { + "epoch": 0.1536, + "grad_norm": 0.28640334084107516, + "learning_rate": 0.00019213802931831696, + "loss": 0.6338, + "step": 96 + }, + { + "epoch": 0.1552, + "grad_norm": 0.28274623890495537, + "learning_rate": 0.00019193530389822363, + "loss": 0.6042, + "step": 97 + }, + { + "epoch": 0.1568, + "grad_norm": 0.29635263771652154, + "learning_rate": 0.00019173010768809933, + "loss": 0.6534, + "step": 98 + }, + { + "epoch": 0.1584, + "grad_norm": 0.28085511906044697, + "learning_rate": 0.0001915224462026563, + "loss": 0.6577, + "step": 99 + }, + { + "epoch": 0.16, + "grad_norm": 0.2994093719296395, + "learning_rate": 0.00019131232502286188, + "loss": 0.6635, + "step": 100 + }, + { + "epoch": 0.1616, + "grad_norm": 0.28016135788866825, + "learning_rate": 0.0001910997497957885, + "loss": 0.6306, + "step": 101 + }, + { + "epoch": 0.1632, + "grad_norm": 0.2859276049903214, + "learning_rate": 0.00019088472623446183, + "loss": 0.6573, + "step": 102 + }, + { + "epoch": 0.1648, + "grad_norm": 0.2949529432094483, + "learning_rate": 0.00019066726011770726, + "loss": 0.6707, + "step": 103 + }, + { + "epoch": 0.1664, + "grad_norm": 0.2952858148380289, + "learning_rate": 0.0001904473572899947, + "loss": 0.6688, + "step": 104 + }, + { + "epoch": 0.168, + "grad_norm": 0.29687433263724833, + "learning_rate": 0.00019022502366128135, + "loss": 0.6557, + "step": 105 + }, + { + "epoch": 0.1696, + "grad_norm": 0.3274603736840435, + "learning_rate": 0.00019000026520685302, + "loss": 0.6333, + "step": 106 + }, + { + "epoch": 0.1712, + "grad_norm": 0.2856127394350284, + "learning_rate": 0.0001897730879671634, + "loss": 0.6575, + "step": 107 + }, + { + "epoch": 0.1728, + "grad_norm": 0.2909543989200667, + "learning_rate": 0.00018954349804767184, + "loss": 0.6626, + "step": 108 + }, + { + "epoch": 0.1744, + "grad_norm": 0.3050216754760076, + "learning_rate": 0.00018931150161867916, + "loss": 0.6738, + "step": 109 + }, + { + "epoch": 0.176, + "grad_norm": 0.2977253243133897, + "learning_rate": 0.00018907710491516199, + "loss": 0.6509, + "step": 110 + }, + { + "epoch": 0.1776, + "grad_norm": 0.2830623583930099, + "learning_rate": 0.0001888403142366049, + "loss": 0.6083, + "step": 111 + }, + { + "epoch": 0.1792, + "grad_norm": 0.2778721417914392, + "learning_rate": 0.00018860113594683148, + "loss": 0.5844, + "step": 112 + }, + { + "epoch": 0.1808, + "grad_norm": 0.2887698595117564, + "learning_rate": 0.00018835957647383303, + "loss": 0.6406, + "step": 113 + }, + { + "epoch": 0.1824, + "grad_norm": 0.29644839745372237, + "learning_rate": 0.00018811564230959588, + "loss": 0.6733, + "step": 114 + }, + { + "epoch": 0.184, + "grad_norm": 0.29019043045865145, + "learning_rate": 0.00018786934000992688, + "loss": 0.6343, + "step": 115 + }, + { + "epoch": 0.1856, + "grad_norm": 0.31156049249994183, + "learning_rate": 0.00018762067619427746, + "loss": 0.7005, + "step": 116 + }, + { + "epoch": 0.1872, + "grad_norm": 0.29428332863315004, + "learning_rate": 0.00018736965754556528, + "loss": 0.6335, + "step": 117 + }, + { + "epoch": 0.1888, + "grad_norm": 0.31497995062962403, + "learning_rate": 0.00018711629080999504, + "loss": 0.7361, + "step": 118 + }, + { + "epoch": 0.1904, + "grad_norm": 0.2890976895301539, + "learning_rate": 0.00018686058279687698, + "loss": 0.6126, + "step": 119 + }, + { + "epoch": 0.192, + "grad_norm": 0.2954134281814498, + "learning_rate": 0.00018660254037844388, + "loss": 0.6535, + "step": 120 + }, + { + "epoch": 0.1936, + "grad_norm": 0.28112456697322546, + "learning_rate": 0.00018634217048966637, + "loss": 0.6482, + "step": 121 + }, + { + "epoch": 0.1952, + "grad_norm": 0.28861115880222316, + "learning_rate": 0.0001860794801280666, + "loss": 0.6562, + "step": 122 + }, + { + "epoch": 0.1968, + "grad_norm": 0.30494694193499206, + "learning_rate": 0.0001858144763535302, + "loss": 0.6658, + "step": 123 + }, + { + "epoch": 0.1984, + "grad_norm": 0.3004982208010861, + "learning_rate": 0.0001855471662881164, + "loss": 0.634, + "step": 124 + }, + { + "epoch": 0.2, + "grad_norm": 0.2901946576962155, + "learning_rate": 0.00018527755711586678, + "loss": 0.6663, + "step": 125 + }, + { + "epoch": 0.2016, + "grad_norm": 0.30503788029150036, + "learning_rate": 0.00018500565608261214, + "loss": 0.5757, + "step": 126 + }, + { + "epoch": 0.2032, + "grad_norm": 0.2891040387420322, + "learning_rate": 0.00018473147049577774, + "loss": 0.6926, + "step": 127 + }, + { + "epoch": 0.2048, + "grad_norm": 0.271266483580161, + "learning_rate": 0.00018445500772418697, + "loss": 0.6163, + "step": 128 + }, + { + "epoch": 0.2064, + "grad_norm": 0.27626211178457927, + "learning_rate": 0.00018417627519786315, + "loss": 0.6008, + "step": 129 + }, + { + "epoch": 0.208, + "grad_norm": 0.28647957333813995, + "learning_rate": 0.00018389528040783012, + "loss": 0.6492, + "step": 130 + }, + { + "epoch": 0.2096, + "grad_norm": 0.29056701940711743, + "learning_rate": 0.00018361203090591071, + "loss": 0.6904, + "step": 131 + }, + { + "epoch": 0.2112, + "grad_norm": 0.29207159590757975, + "learning_rate": 0.00018332653430452376, + "loss": 0.6857, + "step": 132 + }, + { + "epoch": 0.2128, + "grad_norm": 0.29451399058695804, + "learning_rate": 0.00018303879827647975, + "loss": 0.6291, + "step": 133 + }, + { + "epoch": 0.2144, + "grad_norm": 0.2885154068087507, + "learning_rate": 0.00018274883055477436, + "loss": 0.6329, + "step": 134 + }, + { + "epoch": 0.216, + "grad_norm": 0.26861913834773543, + "learning_rate": 0.00018245663893238075, + "loss": 0.6224, + "step": 135 + }, + { + "epoch": 0.2176, + "grad_norm": 0.28048011104639864, + "learning_rate": 0.00018216223126204007, + "loss": 0.6187, + "step": 136 + }, + { + "epoch": 0.2192, + "grad_norm": 0.29326686568989035, + "learning_rate": 0.00018186561545605054, + "loss": 0.7009, + "step": 137 + }, + { + "epoch": 0.2208, + "grad_norm": 0.29280070964817745, + "learning_rate": 0.00018156679948605467, + "loss": 0.65, + "step": 138 + }, + { + "epoch": 0.2224, + "grad_norm": 0.27106551349366576, + "learning_rate": 0.00018126579138282503, + "loss": 0.6213, + "step": 139 + }, + { + "epoch": 0.224, + "grad_norm": 0.2943347329126408, + "learning_rate": 0.0001809625992360485, + "loss": 0.6451, + "step": 140 + }, + { + "epoch": 0.2256, + "grad_norm": 0.2891926431972801, + "learning_rate": 0.00018065723119410884, + "loss": 0.6589, + "step": 141 + }, + { + "epoch": 0.2272, + "grad_norm": 0.2805113538689055, + "learning_rate": 0.00018034969546386757, + "loss": 0.6387, + "step": 142 + }, + { + "epoch": 0.2288, + "grad_norm": 0.29902427992148634, + "learning_rate": 0.0001800400003104436, + "loss": 0.6303, + "step": 143 + }, + { + "epoch": 0.2304, + "grad_norm": 0.3011522421150295, + "learning_rate": 0.00017972815405699103, + "loss": 0.6969, + "step": 144 + }, + { + "epoch": 0.232, + "grad_norm": 0.2800238928479077, + "learning_rate": 0.00017941416508447536, + "loss": 0.5755, + "step": 145 + }, + { + "epoch": 0.2336, + "grad_norm": 0.27143926253293926, + "learning_rate": 0.0001790980418314484, + "loss": 0.6028, + "step": 146 + }, + { + "epoch": 0.2352, + "grad_norm": 0.2943161871981876, + "learning_rate": 0.00017877979279382135, + "loss": 0.6681, + "step": 147 + }, + { + "epoch": 0.2368, + "grad_norm": 0.289334392820797, + "learning_rate": 0.0001784594265246366, + "loss": 0.67, + "step": 148 + }, + { + "epoch": 0.2384, + "grad_norm": 0.28660574491977214, + "learning_rate": 0.0001781369516338378, + "loss": 0.6533, + "step": 149 + }, + { + "epoch": 0.24, + "grad_norm": 0.28302827878206543, + "learning_rate": 0.00017781237678803847, + "loss": 0.674, + "step": 150 + }, + { + "epoch": 0.2416, + "grad_norm": 0.29249601311566104, + "learning_rate": 0.000177485710710289, + "loss": 0.6481, + "step": 151 + }, + { + "epoch": 0.2432, + "grad_norm": 0.291768170270842, + "learning_rate": 0.00017715696217984235, + "loss": 0.651, + "step": 152 + }, + { + "epoch": 0.2448, + "grad_norm": 0.28391259096195254, + "learning_rate": 0.00017682614003191807, + "loss": 0.6293, + "step": 153 + }, + { + "epoch": 0.2464, + "grad_norm": 0.2810392291736171, + "learning_rate": 0.00017649325315746478, + "loss": 0.6175, + "step": 154 + }, + { + "epoch": 0.248, + "grad_norm": 0.2867065084778468, + "learning_rate": 0.0001761583105029213, + "loss": 0.6527, + "step": 155 + }, + { + "epoch": 0.2496, + "grad_norm": 0.2770673446738684, + "learning_rate": 0.00017582132106997616, + "loss": 0.651, + "step": 156 + }, + { + "epoch": 0.2512, + "grad_norm": 0.27039777357068207, + "learning_rate": 0.00017548229391532572, + "loss": 0.6344, + "step": 157 + }, + { + "epoch": 0.2528, + "grad_norm": 0.2804212430044406, + "learning_rate": 0.00017514123815043074, + "loss": 0.6521, + "step": 158 + }, + { + "epoch": 0.2544, + "grad_norm": 0.2853133905615482, + "learning_rate": 0.00017479816294127152, + "loss": 0.6763, + "step": 159 + }, + { + "epoch": 0.256, + "grad_norm": 0.2786207325539522, + "learning_rate": 0.0001744530775081015, + "loss": 0.6077, + "step": 160 + }, + { + "epoch": 0.2576, + "grad_norm": 0.28095395945426455, + "learning_rate": 0.0001741059911251997, + "loss": 0.6288, + "step": 161 + }, + { + "epoch": 0.2592, + "grad_norm": 0.28815654977920335, + "learning_rate": 0.000173756913120621, + "loss": 0.5976, + "step": 162 + }, + { + "epoch": 0.2608, + "grad_norm": 0.28660888249754035, + "learning_rate": 0.00017340585287594604, + "loss": 0.6374, + "step": 163 + }, + { + "epoch": 0.2624, + "grad_norm": 0.40957193545608156, + "learning_rate": 0.0001730528198260285, + "loss": 0.5966, + "step": 164 + }, + { + "epoch": 0.264, + "grad_norm": 0.2898913111516385, + "learning_rate": 0.00017269782345874203, + "loss": 0.6482, + "step": 165 + }, + { + "epoch": 0.2656, + "grad_norm": 0.2786245395654543, + "learning_rate": 0.00017234087331472497, + "loss": 0.6125, + "step": 166 + }, + { + "epoch": 0.2672, + "grad_norm": 0.2837721281604903, + "learning_rate": 0.00017198197898712404, + "loss": 0.6363, + "step": 167 + }, + { + "epoch": 0.2688, + "grad_norm": 0.2753352987391294, + "learning_rate": 0.00017162115012133643, + "loss": 0.6374, + "step": 168 + }, + { + "epoch": 0.2704, + "grad_norm": 0.26440385164137004, + "learning_rate": 0.00017125839641475072, + "loss": 0.557, + "step": 169 + }, + { + "epoch": 0.272, + "grad_norm": 0.2857385564623794, + "learning_rate": 0.00017089372761648616, + "loss": 0.6179, + "step": 170 + }, + { + "epoch": 0.2736, + "grad_norm": 0.2874021322021662, + "learning_rate": 0.00017052715352713075, + "loss": 0.6356, + "step": 171 + }, + { + "epoch": 0.2752, + "grad_norm": 0.27466595495457424, + "learning_rate": 0.00017015868399847768, + "loss": 0.6043, + "step": 172 + }, + { + "epoch": 0.2768, + "grad_norm": 0.28062479869313933, + "learning_rate": 0.00016978832893326074, + "loss": 0.5984, + "step": 173 + }, + { + "epoch": 0.2784, + "grad_norm": 0.26942030615250845, + "learning_rate": 0.00016941609828488807, + "loss": 0.5955, + "step": 174 + }, + { + "epoch": 0.28, + "grad_norm": 0.295863709429359, + "learning_rate": 0.0001690420020571747, + "loss": 0.6662, + "step": 175 + }, + { + "epoch": 0.2816, + "grad_norm": 0.28680145914254224, + "learning_rate": 0.0001686660503040737, + "loss": 0.6586, + "step": 176 + }, + { + "epoch": 0.2832, + "grad_norm": 0.26897503141668383, + "learning_rate": 0.00016828825312940592, + "loss": 0.6134, + "step": 177 + }, + { + "epoch": 0.2848, + "grad_norm": 0.26840146894603445, + "learning_rate": 0.0001679086206865886, + "loss": 0.6016, + "step": 178 + }, + { + "epoch": 0.2864, + "grad_norm": 0.26929199365588663, + "learning_rate": 0.00016752716317836229, + "loss": 0.5914, + "step": 179 + }, + { + "epoch": 0.288, + "grad_norm": 0.27139751196806866, + "learning_rate": 0.0001671438908565167, + "loss": 0.6303, + "step": 180 + }, + { + "epoch": 0.2896, + "grad_norm": 0.3021205383636816, + "learning_rate": 0.00016675881402161536, + "loss": 0.6159, + "step": 181 + }, + { + "epoch": 0.2912, + "grad_norm": 0.28120367032543214, + "learning_rate": 0.0001663719430227186, + "loss": 0.6679, + "step": 182 + }, + { + "epoch": 0.2928, + "grad_norm": 0.2914956157054564, + "learning_rate": 0.00016598328825710533, + "loss": 0.685, + "step": 183 + }, + { + "epoch": 0.2944, + "grad_norm": 0.2711325731767127, + "learning_rate": 0.000165592860169994, + "loss": 0.595, + "step": 184 + }, + { + "epoch": 0.296, + "grad_norm": 0.27653328851479647, + "learning_rate": 0.00016520066925426144, + "loss": 0.6172, + "step": 185 + }, + { + "epoch": 0.2976, + "grad_norm": 0.2705518127553975, + "learning_rate": 0.0001648067260501611, + "loss": 0.6154, + "step": 186 + }, + { + "epoch": 0.2992, + "grad_norm": 0.27396647728634504, + "learning_rate": 0.0001644110411450398, + "loss": 0.5925, + "step": 187 + }, + { + "epoch": 0.3008, + "grad_norm": 0.29077363793367056, + "learning_rate": 0.00016401362517305296, + "loss": 0.6423, + "step": 188 + }, + { + "epoch": 0.3024, + "grad_norm": 0.2670728836626214, + "learning_rate": 0.00016361448881487914, + "loss": 0.5596, + "step": 189 + }, + { + "epoch": 0.304, + "grad_norm": 0.3084907250837108, + "learning_rate": 0.00016321364279743266, + "loss": 0.693, + "step": 190 + }, + { + "epoch": 0.3056, + "grad_norm": 0.28989918915509644, + "learning_rate": 0.0001628110978935756, + "loss": 0.6512, + "step": 191 + }, + { + "epoch": 0.3072, + "grad_norm": 0.26930064508030843, + "learning_rate": 0.00016240686492182804, + "loss": 0.6083, + "step": 192 + }, + { + "epoch": 0.3088, + "grad_norm": 0.2786463373437597, + "learning_rate": 0.00016200095474607753, + "loss": 0.6663, + "step": 193 + }, + { + "epoch": 0.3104, + "grad_norm": 0.2617740825176673, + "learning_rate": 0.00016159337827528685, + "loss": 0.5798, + "step": 194 + }, + { + "epoch": 0.312, + "grad_norm": 0.2655433417635973, + "learning_rate": 0.0001611841464632011, + "loss": 0.6103, + "step": 195 + }, + { + "epoch": 0.3136, + "grad_norm": 0.2772930381756503, + "learning_rate": 0.0001607732703080532, + "loss": 0.6662, + "step": 196 + }, + { + "epoch": 0.3152, + "grad_norm": 0.2739856896351615, + "learning_rate": 0.00016036076085226814, + "loss": 0.6097, + "step": 197 + }, + { + "epoch": 0.3168, + "grad_norm": 0.27557629883711054, + "learning_rate": 0.0001599466291821666, + "loss": 0.6375, + "step": 198 + }, + { + "epoch": 0.3184, + "grad_norm": 0.27001717421979243, + "learning_rate": 0.0001595308864276666, + "loss": 0.6071, + "step": 199 + }, + { + "epoch": 0.32, + "grad_norm": 0.27751369294324196, + "learning_rate": 0.0001591135437619847, + "loss": 0.6477, + "step": 200 + }, + { + "epoch": 0.3216, + "grad_norm": 0.2615767149282083, + "learning_rate": 0.0001586946124013354, + "loss": 0.5764, + "step": 201 + }, + { + "epoch": 0.3232, + "grad_norm": 0.27820796119436153, + "learning_rate": 0.0001582741036046301, + "loss": 0.6181, + "step": 202 + }, + { + "epoch": 0.3248, + "grad_norm": 0.26386382755500865, + "learning_rate": 0.00015785202867317407, + "loss": 0.5821, + "step": 203 + }, + { + "epoch": 0.3264, + "grad_norm": 0.2731116018516801, + "learning_rate": 0.00015742839895036305, + "loss": 0.6133, + "step": 204 + }, + { + "epoch": 0.328, + "grad_norm": 0.26748852975928117, + "learning_rate": 0.00015700322582137827, + "loss": 0.5941, + "step": 205 + }, + { + "epoch": 0.3296, + "grad_norm": 0.2936091441118259, + "learning_rate": 0.0001565765207128805, + "loss": 0.6236, + "step": 206 + }, + { + "epoch": 0.3312, + "grad_norm": 0.28627359457145607, + "learning_rate": 0.0001561482950927029, + "loss": 0.6127, + "step": 207 + }, + { + "epoch": 0.3328, + "grad_norm": 0.2738706815025388, + "learning_rate": 0.00015571856046954285, + "loss": 0.6183, + "step": 208 + }, + { + "epoch": 0.3344, + "grad_norm": 0.2678852345808725, + "learning_rate": 0.00015528732839265272, + "loss": 0.5853, + "step": 209 + }, + { + "epoch": 0.336, + "grad_norm": 0.26977041336993773, + "learning_rate": 0.0001548546104515294, + "loss": 0.6083, + "step": 210 + }, + { + "epoch": 0.3376, + "grad_norm": 0.28709476977831255, + "learning_rate": 0.00015442041827560274, + "loss": 0.6405, + "step": 211 + }, + { + "epoch": 0.3392, + "grad_norm": 0.3023809565549881, + "learning_rate": 0.00015398476353392323, + "loss": 0.7043, + "step": 212 + }, + { + "epoch": 0.3408, + "grad_norm": 0.2747892811127097, + "learning_rate": 0.00015354765793484834, + "loss": 0.6532, + "step": 213 + }, + { + "epoch": 0.3424, + "grad_norm": 0.2769456334862002, + "learning_rate": 0.00015310911322572753, + "loss": 0.6155, + "step": 214 + }, + { + "epoch": 0.344, + "grad_norm": 0.26813200787234687, + "learning_rate": 0.000152669141192587, + "loss": 0.6019, + "step": 215 + }, + { + "epoch": 0.3456, + "grad_norm": 0.2750495110055987, + "learning_rate": 0.00015222775365981273, + "loss": 0.6278, + "step": 216 + }, + { + "epoch": 0.3472, + "grad_norm": 0.27039140796375544, + "learning_rate": 0.00015178496248983254, + "loss": 0.6389, + "step": 217 + }, + { + "epoch": 0.3488, + "grad_norm": 0.2670339648572964, + "learning_rate": 0.00015134077958279765, + "loss": 0.6254, + "step": 218 + }, + { + "epoch": 0.3504, + "grad_norm": 0.26648845002200316, + "learning_rate": 0.00015089521687626243, + "loss": 0.5973, + "step": 219 + }, + { + "epoch": 0.352, + "grad_norm": 0.2751017457817988, + "learning_rate": 0.000150448286344864, + "loss": 0.6513, + "step": 220 + }, + { + "epoch": 0.3536, + "grad_norm": 0.28732132512944286, + "learning_rate": 0.00015000000000000001, + "loss": 0.6752, + "step": 221 + }, + { + "epoch": 0.3552, + "grad_norm": 0.27886124697137193, + "learning_rate": 0.00014955036988950618, + "loss": 0.6554, + "step": 222 + }, + { + "epoch": 0.3568, + "grad_norm": 0.2632289008248714, + "learning_rate": 0.00014909940809733222, + "loss": 0.5678, + "step": 223 + }, + { + "epoch": 0.3584, + "grad_norm": 0.2766947840759987, + "learning_rate": 0.00014864712674321734, + "loss": 0.6319, + "step": 224 + }, + { + "epoch": 0.36, + "grad_norm": 0.2709193107694985, + "learning_rate": 0.00014819353798236427, + "loss": 0.6424, + "step": 225 + }, + { + "epoch": 0.3616, + "grad_norm": 0.26920447067686964, + "learning_rate": 0.00014773865400511272, + "loss": 0.579, + "step": 226 + }, + { + "epoch": 0.3632, + "grad_norm": 0.26282569011013873, + "learning_rate": 0.00014728248703661182, + "loss": 0.5803, + "step": 227 + }, + { + "epoch": 0.3648, + "grad_norm": 0.30253062757654425, + "learning_rate": 0.00014682504933649144, + "loss": 0.6894, + "step": 228 + }, + { + "epoch": 0.3664, + "grad_norm": 0.28185798122986966, + "learning_rate": 0.00014636635319853275, + "loss": 0.6701, + "step": 229 + }, + { + "epoch": 0.368, + "grad_norm": 0.2781631405974303, + "learning_rate": 0.00014590641095033787, + "loss": 0.6409, + "step": 230 + }, + { + "epoch": 0.3696, + "grad_norm": 0.2783668408558492, + "learning_rate": 0.00014544523495299842, + "loss": 0.6235, + "step": 231 + }, + { + "epoch": 0.3712, + "grad_norm": 0.27110724885666837, + "learning_rate": 0.0001449828376007636, + "loss": 0.6276, + "step": 232 + }, + { + "epoch": 0.3728, + "grad_norm": 0.27314657189067826, + "learning_rate": 0.0001445192313207067, + "loss": 0.6402, + "step": 233 + }, + { + "epoch": 0.3744, + "grad_norm": 0.27542798995603934, + "learning_rate": 0.0001440544285723915, + "loss": 0.6218, + "step": 234 + }, + { + "epoch": 0.376, + "grad_norm": 0.2596868094828067, + "learning_rate": 0.00014358844184753712, + "loss": 0.5767, + "step": 235 + }, + { + "epoch": 0.3776, + "grad_norm": 0.28129216441253807, + "learning_rate": 0.00014312128366968243, + "loss": 0.61, + "step": 236 + }, + { + "epoch": 0.3792, + "grad_norm": 0.26316051294531795, + "learning_rate": 0.00014265296659384956, + "loss": 0.6103, + "step": 237 + }, + { + "epoch": 0.3808, + "grad_norm": 0.2625379397079638, + "learning_rate": 0.00014218350320620624, + "loss": 0.5954, + "step": 238 + }, + { + "epoch": 0.3824, + "grad_norm": 0.26252034826320153, + "learning_rate": 0.0001417129061237278, + "loss": 0.6072, + "step": 239 + }, + { + "epoch": 0.384, + "grad_norm": 0.2697403042950616, + "learning_rate": 0.00014124118799385796, + "loss": 0.6324, + "step": 240 + }, + { + "epoch": 0.3856, + "grad_norm": 0.2694590499091851, + "learning_rate": 0.00014076836149416887, + "loss": 0.5831, + "step": 241 + }, + { + "epoch": 0.3872, + "grad_norm": 0.2799199854667609, + "learning_rate": 0.0001402944393320206, + "loss": 0.6431, + "step": 242 + }, + { + "epoch": 0.3888, + "grad_norm": 0.2743505833018257, + "learning_rate": 0.00013981943424421932, + "loss": 0.5958, + "step": 243 + }, + { + "epoch": 0.3904, + "grad_norm": 0.2651616201833635, + "learning_rate": 0.00013934335899667527, + "loss": 0.6146, + "step": 244 + }, + { + "epoch": 0.392, + "grad_norm": 0.2752066748441881, + "learning_rate": 0.00013886622638405952, + "loss": 0.5988, + "step": 245 + }, + { + "epoch": 0.3936, + "grad_norm": 0.2654586037334672, + "learning_rate": 0.00013838804922946027, + "loss": 0.6081, + "step": 246 + }, + { + "epoch": 0.3952, + "grad_norm": 0.26528475777384997, + "learning_rate": 0.00013790884038403795, + "loss": 0.6025, + "step": 247 + }, + { + "epoch": 0.3968, + "grad_norm": 0.2770333595628385, + "learning_rate": 0.00013742861272668012, + "loss": 0.6071, + "step": 248 + }, + { + "epoch": 0.3984, + "grad_norm": 0.27275921881990134, + "learning_rate": 0.00013694737916365517, + "loss": 0.5878, + "step": 249 + }, + { + "epoch": 0.4, + "grad_norm": 0.27460484347510516, + "learning_rate": 0.00013646515262826552, + "loss": 0.6185, + "step": 250 + }, + { + "epoch": 0.4016, + "grad_norm": 0.28627779272830833, + "learning_rate": 0.0001359819460805001, + "loss": 0.6642, + "step": 251 + }, + { + "epoch": 0.4032, + "grad_norm": 0.2843553197115947, + "learning_rate": 0.0001354977725066859, + "loss": 0.6002, + "step": 252 + }, + { + "epoch": 0.4048, + "grad_norm": 0.2710438183016861, + "learning_rate": 0.00013501264491913906, + "loss": 0.5952, + "step": 253 + }, + { + "epoch": 0.4064, + "grad_norm": 0.28027455054015277, + "learning_rate": 0.0001345265763558152, + "loss": 0.6283, + "step": 254 + }, + { + "epoch": 0.408, + "grad_norm": 0.2888905069397006, + "learning_rate": 0.00013403957987995882, + "loss": 0.6298, + "step": 255 + }, + { + "epoch": 0.4096, + "grad_norm": 0.2706277103676121, + "learning_rate": 0.0001335516685797525, + "loss": 0.6021, + "step": 256 + }, + { + "epoch": 0.4112, + "grad_norm": 0.2651883807306946, + "learning_rate": 0.00013306285556796495, + "loss": 0.62, + "step": 257 + }, + { + "epoch": 0.4128, + "grad_norm": 0.28550563597822987, + "learning_rate": 0.00013257315398159864, + "loss": 0.6325, + "step": 258 + }, + { + "epoch": 0.4144, + "grad_norm": 0.27094358518408534, + "learning_rate": 0.00013208257698153677, + "loss": 0.5908, + "step": 259 + }, + { + "epoch": 0.416, + "grad_norm": 0.2696764320997584, + "learning_rate": 0.00013159113775218964, + "loss": 0.5853, + "step": 260 + }, + { + "epoch": 0.4176, + "grad_norm": 0.2657302789360468, + "learning_rate": 0.00013109884950114007, + "loss": 0.6118, + "step": 261 + }, + { + "epoch": 0.4192, + "grad_norm": 0.25881513219930835, + "learning_rate": 0.00013060572545878875, + "loss": 0.5788, + "step": 262 + }, + { + "epoch": 0.4208, + "grad_norm": 0.2692447885970564, + "learning_rate": 0.00013011177887799845, + "loss": 0.5949, + "step": 263 + }, + { + "epoch": 0.4224, + "grad_norm": 0.2718587692319407, + "learning_rate": 0.00012961702303373795, + "loss": 0.5973, + "step": 264 + }, + { + "epoch": 0.424, + "grad_norm": 0.30912170033120123, + "learning_rate": 0.00012912147122272523, + "loss": 0.7001, + "step": 265 + }, + { + "epoch": 0.4256, + "grad_norm": 0.2895514425791462, + "learning_rate": 0.00012862513676307008, + "loss": 0.5999, + "step": 266 + }, + { + "epoch": 0.4272, + "grad_norm": 0.2716240374125864, + "learning_rate": 0.00012812803299391628, + "loss": 0.6194, + "step": 267 + }, + { + "epoch": 0.4288, + "grad_norm": 0.26039229890804566, + "learning_rate": 0.00012763017327508305, + "loss": 0.5755, + "step": 268 + }, + { + "epoch": 0.4304, + "grad_norm": 0.26328712426525025, + "learning_rate": 0.0001271315709867059, + "loss": 0.6225, + "step": 269 + }, + { + "epoch": 0.432, + "grad_norm": 0.2632352487950469, + "learning_rate": 0.00012663223952887723, + "loss": 0.6173, + "step": 270 + }, + { + "epoch": 0.4336, + "grad_norm": 0.2679355024595054, + "learning_rate": 0.00012613219232128608, + "loss": 0.5877, + "step": 271 + }, + { + "epoch": 0.4352, + "grad_norm": 0.2764596288907185, + "learning_rate": 0.00012563144280285741, + "loss": 0.6295, + "step": 272 + }, + { + "epoch": 0.4368, + "grad_norm": 0.26393403511341645, + "learning_rate": 0.00012513000443139112, + "loss": 0.5606, + "step": 273 + }, + { + "epoch": 0.4384, + "grad_norm": 0.27432367106489686, + "learning_rate": 0.00012462789068320017, + "loss": 0.6311, + "step": 274 + }, + { + "epoch": 0.44, + "grad_norm": 0.2705500655990583, + "learning_rate": 0.00012412511505274844, + "loss": 0.58, + "step": 275 + }, + { + "epoch": 0.4416, + "grad_norm": 0.259559547111268, + "learning_rate": 0.00012362169105228826, + "loss": 0.5805, + "step": 276 + }, + { + "epoch": 0.4432, + "grad_norm": 0.25098071679480816, + "learning_rate": 0.000123117632211497, + "loss": 0.5676, + "step": 277 + }, + { + "epoch": 0.4448, + "grad_norm": 0.2601878941499058, + "learning_rate": 0.00012261295207711346, + "loss": 0.585, + "step": 278 + }, + { + "epoch": 0.4464, + "grad_norm": 0.2621421870871061, + "learning_rate": 0.0001221076642125742, + "loss": 0.6069, + "step": 279 + }, + { + "epoch": 0.448, + "grad_norm": 0.26963854802455367, + "learning_rate": 0.00012160178219764837, + "loss": 0.6408, + "step": 280 + }, + { + "epoch": 0.4496, + "grad_norm": 0.2585727486396966, + "learning_rate": 0.00012109531962807332, + "loss": 0.5798, + "step": 281 + }, + { + "epoch": 0.4512, + "grad_norm": 0.2722397010778486, + "learning_rate": 0.00012058829011518896, + "loss": 0.6109, + "step": 282 + }, + { + "epoch": 0.4528, + "grad_norm": 0.26035612594536284, + "learning_rate": 0.00012008070728557186, + "loss": 0.5784, + "step": 283 + }, + { + "epoch": 0.4544, + "grad_norm": 0.26856540487699954, + "learning_rate": 0.00011957258478066931, + "loss": 0.6243, + "step": 284 + }, + { + "epoch": 0.456, + "grad_norm": 0.26833969016804327, + "learning_rate": 0.00011906393625643244, + "loss": 0.6047, + "step": 285 + }, + { + "epoch": 0.4576, + "grad_norm": 0.26631996087657145, + "learning_rate": 0.00011855477538294935, + "loss": 0.6033, + "step": 286 + }, + { + "epoch": 0.4592, + "grad_norm": 0.25471868704875356, + "learning_rate": 0.00011804511584407763, + "loss": 0.5583, + "step": 287 + }, + { + "epoch": 0.4608, + "grad_norm": 0.26038533587620166, + "learning_rate": 0.00011753497133707679, + "loss": 0.5813, + "step": 288 + }, + { + "epoch": 0.4624, + "grad_norm": 0.2680253814911395, + "learning_rate": 0.00011702435557223987, + "loss": 0.6196, + "step": 289 + }, + { + "epoch": 0.464, + "grad_norm": 0.26645922323423354, + "learning_rate": 0.00011651328227252517, + "loss": 0.634, + "step": 290 + }, + { + "epoch": 0.4656, + "grad_norm": 0.26714834506150237, + "learning_rate": 0.00011600176517318741, + "loss": 0.6113, + "step": 291 + }, + { + "epoch": 0.4672, + "grad_norm": 0.27089839537865856, + "learning_rate": 0.00011548981802140848, + "loss": 0.6277, + "step": 292 + }, + { + "epoch": 0.4688, + "grad_norm": 0.2694898217685149, + "learning_rate": 0.00011497745457592816, + "loss": 0.5931, + "step": 293 + }, + { + "epoch": 0.4704, + "grad_norm": 0.25363892775400476, + "learning_rate": 0.00011446468860667421, + "loss": 0.5465, + "step": 294 + }, + { + "epoch": 0.472, + "grad_norm": 0.25334437860494907, + "learning_rate": 0.00011395153389439233, + "loss": 0.5622, + "step": 295 + }, + { + "epoch": 0.4736, + "grad_norm": 0.2649852486157058, + "learning_rate": 0.00011343800423027582, + "loss": 0.5981, + "step": 296 + }, + { + "epoch": 0.4752, + "grad_norm": 0.2835099156763055, + "learning_rate": 0.0001129241134155949, + "loss": 0.6665, + "step": 297 + }, + { + "epoch": 0.4768, + "grad_norm": 0.26830434990486457, + "learning_rate": 0.00011240987526132594, + "loss": 0.6173, + "step": 298 + }, + { + "epoch": 0.4784, + "grad_norm": 0.267287012411873, + "learning_rate": 0.00011189530358778005, + "loss": 0.6243, + "step": 299 + }, + { + "epoch": 0.48, + "grad_norm": 0.2609447173808279, + "learning_rate": 0.00011138041222423177, + "loss": 0.5885, + "step": 300 + }, + { + "epoch": 0.4816, + "grad_norm": 0.25543369285777234, + "learning_rate": 0.00011086521500854745, + "loss": 0.5573, + "step": 301 + }, + { + "epoch": 0.4832, + "grad_norm": 0.2671858794438797, + "learning_rate": 0.00011034972578681338, + "loss": 0.5943, + "step": 302 + }, + { + "epoch": 0.4848, + "grad_norm": 0.2593955077079989, + "learning_rate": 0.00010983395841296348, + "loss": 0.5934, + "step": 303 + }, + { + "epoch": 0.4864, + "grad_norm": 0.2823053871701031, + "learning_rate": 0.00010931792674840718, + "loss": 0.6346, + "step": 304 + }, + { + "epoch": 0.488, + "grad_norm": 0.26018937907020806, + "learning_rate": 0.00010880164466165674, + "loss": 0.6209, + "step": 305 + }, + { + "epoch": 0.4896, + "grad_norm": 0.2675372716966033, + "learning_rate": 0.00010828512602795462, + "loss": 0.6518, + "step": 306 + }, + { + "epoch": 0.4912, + "grad_norm": 0.26163986747743156, + "learning_rate": 0.00010776838472890065, + "loss": 0.643, + "step": 307 + }, + { + "epoch": 0.4928, + "grad_norm": 0.2658825302064493, + "learning_rate": 0.00010725143465207867, + "loss": 0.6278, + "step": 308 + }, + { + "epoch": 0.4944, + "grad_norm": 0.2642568671878393, + "learning_rate": 0.00010673428969068364, + "loss": 0.6121, + "step": 309 + }, + { + "epoch": 0.496, + "grad_norm": 0.26205105488837227, + "learning_rate": 0.00010621696374314807, + "loss": 0.5926, + "step": 310 + }, + { + "epoch": 0.4976, + "grad_norm": 0.24945629808113587, + "learning_rate": 0.00010569947071276847, + "loss": 0.5439, + "step": 311 + }, + { + "epoch": 0.4992, + "grad_norm": 0.2705335916411653, + "learning_rate": 0.00010518182450733186, + "loss": 0.624, + "step": 312 + }, + { + "epoch": 0.5008, + "grad_norm": 0.2645126634344591, + "learning_rate": 0.00010466403903874176, + "loss": 0.6149, + "step": 313 + }, + { + "epoch": 0.5024, + "grad_norm": 0.2610460612633549, + "learning_rate": 0.00010414612822264455, + "loss": 0.6336, + "step": 314 + }, + { + "epoch": 0.504, + "grad_norm": 0.26002849948292583, + "learning_rate": 0.00010362810597805526, + "loss": 0.5677, + "step": 315 + }, + { + "epoch": 0.5056, + "grad_norm": 0.25609352645097033, + "learning_rate": 0.0001031099862269837, + "loss": 0.5538, + "step": 316 + }, + { + "epoch": 0.5072, + "grad_norm": 0.27158974314716067, + "learning_rate": 0.00010259178289406011, + "loss": 0.6088, + "step": 317 + }, + { + "epoch": 0.5088, + "grad_norm": 0.2574000836503734, + "learning_rate": 0.00010207350990616107, + "loss": 0.5901, + "step": 318 + }, + { + "epoch": 0.5104, + "grad_norm": 0.2748624667047697, + "learning_rate": 0.0001015551811920351, + "loss": 0.6442, + "step": 319 + }, + { + "epoch": 0.512, + "grad_norm": 0.26669487646166096, + "learning_rate": 0.00010103681068192845, + "loss": 0.6091, + "step": 320 + }, + { + "epoch": 0.5136, + "grad_norm": 0.2777401885135166, + "learning_rate": 0.00010051841230721065, + "loss": 0.5698, + "step": 321 + }, + { + "epoch": 0.5152, + "grad_norm": 0.2593552962147439, + "learning_rate": 0.0001, + "loss": 0.5972, + "step": 322 + }, + { + "epoch": 0.5168, + "grad_norm": 0.2591542124686353, + "learning_rate": 9.948158769278939e-05, + "loss": 0.5676, + "step": 323 + }, + { + "epoch": 0.5184, + "grad_norm": 0.25631941791655843, + "learning_rate": 9.896318931807155e-05, + "loss": 0.5971, + "step": 324 + }, + { + "epoch": 0.52, + "grad_norm": 0.3016872466120396, + "learning_rate": 9.844481880796491e-05, + "loss": 0.6017, + "step": 325 + }, + { + "epoch": 0.5216, + "grad_norm": 0.2654014532106818, + "learning_rate": 9.792649009383899e-05, + "loss": 0.6278, + "step": 326 + }, + { + "epoch": 0.5232, + "grad_norm": 0.2544257649303829, + "learning_rate": 9.740821710593989e-05, + "loss": 0.5565, + "step": 327 + }, + { + "epoch": 0.5248, + "grad_norm": 0.255499489168914, + "learning_rate": 9.689001377301633e-05, + "loss": 0.584, + "step": 328 + }, + { + "epoch": 0.5264, + "grad_norm": 0.25565563033270083, + "learning_rate": 9.637189402194476e-05, + "loss": 0.606, + "step": 329 + }, + { + "epoch": 0.528, + "grad_norm": 0.25498787311023613, + "learning_rate": 9.585387177735547e-05, + "loss": 0.5737, + "step": 330 + }, + { + "epoch": 0.5296, + "grad_norm": 0.2619218882769582, + "learning_rate": 9.533596096125825e-05, + "loss": 0.5824, + "step": 331 + }, + { + "epoch": 0.5312, + "grad_norm": 0.2638213417602552, + "learning_rate": 9.481817549266817e-05, + "loss": 0.6056, + "step": 332 + }, + { + "epoch": 0.5328, + "grad_norm": 0.26995009893117944, + "learning_rate": 9.430052928723153e-05, + "loss": 0.6139, + "step": 333 + }, + { + "epoch": 0.5344, + "grad_norm": 0.2542149578732742, + "learning_rate": 9.378303625685195e-05, + "loss": 0.5476, + "step": 334 + }, + { + "epoch": 0.536, + "grad_norm": 0.2513914064071582, + "learning_rate": 9.326571030931637e-05, + "loss": 0.5626, + "step": 335 + }, + { + "epoch": 0.5376, + "grad_norm": 0.2680264121730716, + "learning_rate": 9.274856534792138e-05, + "loss": 0.5889, + "step": 336 + }, + { + "epoch": 0.5392, + "grad_norm": 0.2555106002284551, + "learning_rate": 9.223161527109937e-05, + "loss": 0.5809, + "step": 337 + }, + { + "epoch": 0.5408, + "grad_norm": 0.25875832684290795, + "learning_rate": 9.171487397204539e-05, + "loss": 0.5749, + "step": 338 + }, + { + "epoch": 0.5424, + "grad_norm": 0.2556305902909127, + "learning_rate": 9.119835533834331e-05, + "loss": 0.5632, + "step": 339 + }, + { + "epoch": 0.544, + "grad_norm": 0.26185715283400635, + "learning_rate": 9.068207325159284e-05, + "loss": 0.5792, + "step": 340 + }, + { + "epoch": 0.5456, + "grad_norm": 0.263891085361733, + "learning_rate": 9.016604158703654e-05, + "loss": 0.5838, + "step": 341 + }, + { + "epoch": 0.5472, + "grad_norm": 0.24763138436301804, + "learning_rate": 8.965027421318665e-05, + "loss": 0.5698, + "step": 342 + }, + { + "epoch": 0.5488, + "grad_norm": 0.2632830918798873, + "learning_rate": 8.913478499145254e-05, + "loss": 0.598, + "step": 343 + }, + { + "epoch": 0.5504, + "grad_norm": 0.29082792629350307, + "learning_rate": 8.861958777576827e-05, + "loss": 0.6009, + "step": 344 + }, + { + "epoch": 0.552, + "grad_norm": 0.2528825949585721, + "learning_rate": 8.810469641222001e-05, + "loss": 0.5542, + "step": 345 + }, + { + "epoch": 0.5536, + "grad_norm": 0.25513868134702117, + "learning_rate": 8.759012473867407e-05, + "loss": 0.5803, + "step": 346 + }, + { + "epoch": 0.5552, + "grad_norm": 0.2606526761809016, + "learning_rate": 8.707588658440511e-05, + "loss": 0.6091, + "step": 347 + }, + { + "epoch": 0.5568, + "grad_norm": 0.2644594971115932, + "learning_rate": 8.656199576972423e-05, + "loss": 0.6165, + "step": 348 + }, + { + "epoch": 0.5584, + "grad_norm": 0.26082804034908924, + "learning_rate": 8.604846610560771e-05, + "loss": 0.6031, + "step": 349 + }, + { + "epoch": 0.56, + "grad_norm": 0.25984517935489154, + "learning_rate": 8.553531139332582e-05, + "loss": 0.6031, + "step": 350 + }, + { + "epoch": 0.5616, + "grad_norm": 0.26762902286838963, + "learning_rate": 8.502254542407186e-05, + "loss": 0.6284, + "step": 351 + }, + { + "epoch": 0.5632, + "grad_norm": 0.2586268717313893, + "learning_rate": 8.451018197859153e-05, + "loss": 0.5758, + "step": 352 + }, + { + "epoch": 0.5648, + "grad_norm": 0.26965726852970207, + "learning_rate": 8.399823482681262e-05, + "loss": 0.606, + "step": 353 + }, + { + "epoch": 0.5664, + "grad_norm": 0.27752151533234787, + "learning_rate": 8.348671772747487e-05, + "loss": 0.6414, + "step": 354 + }, + { + "epoch": 0.568, + "grad_norm": 0.2524249669020979, + "learning_rate": 8.297564442776014e-05, + "loss": 0.562, + "step": 355 + }, + { + "epoch": 0.5696, + "grad_norm": 0.259208248804178, + "learning_rate": 8.246502866292324e-05, + "loss": 0.5783, + "step": 356 + }, + { + "epoch": 0.5712, + "grad_norm": 0.2629825216619024, + "learning_rate": 8.195488415592238e-05, + "loss": 0.6019, + "step": 357 + }, + { + "epoch": 0.5728, + "grad_norm": 0.27395230361912726, + "learning_rate": 8.144522461705067e-05, + "loss": 0.6126, + "step": 358 + }, + { + "epoch": 0.5744, + "grad_norm": 0.26373286997161893, + "learning_rate": 8.093606374356759e-05, + "loss": 0.6125, + "step": 359 + }, + { + "epoch": 0.576, + "grad_norm": 0.25465433672334553, + "learning_rate": 8.042741521933071e-05, + "loss": 0.5687, + "step": 360 + }, + { + "epoch": 0.5776, + "grad_norm": 0.2559530815925589, + "learning_rate": 7.991929271442817e-05, + "loss": 0.5918, + "step": 361 + }, + { + "epoch": 0.5792, + "grad_norm": 0.258837584421683, + "learning_rate": 7.941170988481108e-05, + "loss": 0.6003, + "step": 362 + }, + { + "epoch": 0.5808, + "grad_norm": 0.26429023731078133, + "learning_rate": 7.89046803719267e-05, + "loss": 0.6279, + "step": 363 + }, + { + "epoch": 0.5824, + "grad_norm": 0.2548865767012274, + "learning_rate": 7.839821780235168e-05, + "loss": 0.5805, + "step": 364 + }, + { + "epoch": 0.584, + "grad_norm": 0.33139314641727463, + "learning_rate": 7.789233578742582e-05, + "loss": 0.673, + "step": 365 + }, + { + "epoch": 0.5856, + "grad_norm": 0.25474566299712065, + "learning_rate": 7.738704792288655e-05, + "loss": 0.6018, + "step": 366 + }, + { + "epoch": 0.5872, + "grad_norm": 0.27405703687372945, + "learning_rate": 7.688236778850306e-05, + "loss": 0.6131, + "step": 367 + }, + { + "epoch": 0.5888, + "grad_norm": 0.25215109009251463, + "learning_rate": 7.637830894771175e-05, + "loss": 0.5759, + "step": 368 + }, + { + "epoch": 0.5904, + "grad_norm": 0.2624619027250661, + "learning_rate": 7.587488494725157e-05, + "loss": 0.5563, + "step": 369 + }, + { + "epoch": 0.592, + "grad_norm": 0.2757839250369891, + "learning_rate": 7.537210931679987e-05, + "loss": 0.6195, + "step": 370 + }, + { + "epoch": 0.5936, + "grad_norm": 0.2702293320510406, + "learning_rate": 7.48699955686089e-05, + "loss": 0.6358, + "step": 371 + }, + { + "epoch": 0.5952, + "grad_norm": 0.2633118638050873, + "learning_rate": 7.43685571971426e-05, + "loss": 0.6259, + "step": 372 + }, + { + "epoch": 0.5968, + "grad_norm": 0.26060980759540464, + "learning_rate": 7.386780767871397e-05, + "loss": 0.6125, + "step": 373 + }, + { + "epoch": 0.5984, + "grad_norm": 0.261668096986087, + "learning_rate": 7.336776047112276e-05, + "loss": 0.5948, + "step": 374 + }, + { + "epoch": 0.6, + "grad_norm": 0.2622527701047893, + "learning_rate": 7.286842901329412e-05, + "loss": 0.5977, + "step": 375 + }, + { + "epoch": 0.6016, + "grad_norm": 0.25217138102728326, + "learning_rate": 7.236982672491698e-05, + "loss": 0.5847, + "step": 376 + }, + { + "epoch": 0.6032, + "grad_norm": 0.25792026179637567, + "learning_rate": 7.187196700608373e-05, + "loss": 0.599, + "step": 377 + }, + { + "epoch": 0.6048, + "grad_norm": 0.2574842732152005, + "learning_rate": 7.137486323692995e-05, + "loss": 0.5711, + "step": 378 + }, + { + "epoch": 0.6064, + "grad_norm": 0.25043437723529727, + "learning_rate": 7.087852877727481e-05, + "loss": 0.5556, + "step": 379 + }, + { + "epoch": 0.608, + "grad_norm": 0.24459799810234237, + "learning_rate": 7.038297696626206e-05, + "loss": 0.5551, + "step": 380 + }, + { + "epoch": 0.6096, + "grad_norm": 0.2597607979330205, + "learning_rate": 6.988822112200156e-05, + "loss": 0.5947, + "step": 381 + }, + { + "epoch": 0.6112, + "grad_norm": 0.2609330599899115, + "learning_rate": 6.939427454121128e-05, + "loss": 0.6233, + "step": 382 + }, + { + "epoch": 0.6128, + "grad_norm": 0.26861033183050814, + "learning_rate": 6.890115049885994e-05, + "loss": 0.6213, + "step": 383 + }, + { + "epoch": 0.6144, + "grad_norm": 0.25914606292067016, + "learning_rate": 6.84088622478104e-05, + "loss": 0.5887, + "step": 384 + }, + { + "epoch": 0.616, + "grad_norm": 0.2522124490494597, + "learning_rate": 6.791742301846326e-05, + "loss": 0.5696, + "step": 385 + }, + { + "epoch": 0.6176, + "grad_norm": 0.2638978941366862, + "learning_rate": 6.742684601840141e-05, + "loss": 0.604, + "step": 386 + }, + { + "epoch": 0.6192, + "grad_norm": 0.2482187971750229, + "learning_rate": 6.693714443203507e-05, + "loss": 0.5346, + "step": 387 + }, + { + "epoch": 0.6208, + "grad_norm": 0.2594744324645565, + "learning_rate": 6.644833142024751e-05, + "loss": 0.5865, + "step": 388 + }, + { + "epoch": 0.6224, + "grad_norm": 0.25836648727917544, + "learning_rate": 6.59604201200412e-05, + "loss": 0.58, + "step": 389 + }, + { + "epoch": 0.624, + "grad_norm": 0.25013205186206017, + "learning_rate": 6.547342364418481e-05, + "loss": 0.5724, + "step": 390 + }, + { + "epoch": 0.6256, + "grad_norm": 0.2684922316211901, + "learning_rate": 6.498735508086093e-05, + "loss": 0.5715, + "step": 391 + }, + { + "epoch": 0.6272, + "grad_norm": 0.2507704655034135, + "learning_rate": 6.450222749331414e-05, + "loss": 0.5731, + "step": 392 + }, + { + "epoch": 0.6288, + "grad_norm": 0.2604261604890052, + "learning_rate": 6.40180539194999e-05, + "loss": 0.6268, + "step": 393 + }, + { + "epoch": 0.6304, + "grad_norm": 0.25478837808916244, + "learning_rate": 6.35348473717345e-05, + "loss": 0.5484, + "step": 394 + }, + { + "epoch": 0.632, + "grad_norm": 0.2618405043606778, + "learning_rate": 6.305262083634488e-05, + "loss": 0.5823, + "step": 395 + }, + { + "epoch": 0.6336, + "grad_norm": 0.2647994946291963, + "learning_rate": 6.25713872733199e-05, + "loss": 0.6142, + "step": 396 + }, + { + "epoch": 0.6352, + "grad_norm": 0.2504158827408613, + "learning_rate": 6.209115961596208e-05, + "loss": 0.5829, + "step": 397 + }, + { + "epoch": 0.6368, + "grad_norm": 0.2643682028445015, + "learning_rate": 6.161195077053976e-05, + "loss": 0.6289, + "step": 398 + }, + { + "epoch": 0.6384, + "grad_norm": 0.2726027118055879, + "learning_rate": 6.113377361594049e-05, + "loss": 0.6087, + "step": 399 + }, + { + "epoch": 0.64, + "grad_norm": 0.2656399134442502, + "learning_rate": 6.065664100332478e-05, + "loss": 0.6232, + "step": 400 + }, + { + "epoch": 0.6416, + "grad_norm": 0.2890321879342334, + "learning_rate": 6.018056575578075e-05, + "loss": 0.611, + "step": 401 + }, + { + "epoch": 0.6432, + "grad_norm": 0.2500196584931281, + "learning_rate": 5.970556066797941e-05, + "loss": 0.5766, + "step": 402 + }, + { + "epoch": 0.6448, + "grad_norm": 0.24714116207627002, + "learning_rate": 5.923163850583113e-05, + "loss": 0.5577, + "step": 403 + }, + { + "epoch": 0.6464, + "grad_norm": 0.25915690622659465, + "learning_rate": 5.875881200614207e-05, + "loss": 0.5897, + "step": 404 + }, + { + "epoch": 0.648, + "grad_norm": 0.2582280893466348, + "learning_rate": 5.828709387627218e-05, + "loss": 0.5353, + "step": 405 + }, + { + "epoch": 0.6496, + "grad_norm": 0.2493600118309958, + "learning_rate": 5.781649679379378e-05, + "loss": 0.5764, + "step": 406 + }, + { + "epoch": 0.6512, + "grad_norm": 0.2696840274791407, + "learning_rate": 5.73470334061505e-05, + "loss": 0.6043, + "step": 407 + }, + { + "epoch": 0.6528, + "grad_norm": 0.2556281813318383, + "learning_rate": 5.687871633031754e-05, + "loss": 0.5931, + "step": 408 + }, + { + "epoch": 0.6544, + "grad_norm": 0.262313158083901, + "learning_rate": 5.6411558152462894e-05, + "loss": 0.6144, + "step": 409 + }, + { + "epoch": 0.656, + "grad_norm": 0.26452950042359274, + "learning_rate": 5.5945571427608526e-05, + "loss": 0.5894, + "step": 410 + }, + { + "epoch": 0.6576, + "grad_norm": 0.2668491364046421, + "learning_rate": 5.54807686792933e-05, + "loss": 0.6079, + "step": 411 + }, + { + "epoch": 0.6592, + "grad_norm": 0.26169529088395, + "learning_rate": 5.501716239923642e-05, + "loss": 0.5893, + "step": 412 + }, + { + "epoch": 0.6608, + "grad_norm": 0.2639409061834857, + "learning_rate": 5.4554765047001613e-05, + "loss": 0.6301, + "step": 413 + }, + { + "epoch": 0.6624, + "grad_norm": 0.26920981548851625, + "learning_rate": 5.4093589049662175e-05, + "loss": 0.6425, + "step": 414 + }, + { + "epoch": 0.664, + "grad_norm": 0.24533060603048004, + "learning_rate": 5.363364680146725e-05, + "loss": 0.5441, + "step": 415 + }, + { + "epoch": 0.6656, + "grad_norm": 0.24769714300264078, + "learning_rate": 5.31749506635086e-05, + "loss": 0.5517, + "step": 416 + }, + { + "epoch": 0.6672, + "grad_norm": 0.2559740780545734, + "learning_rate": 5.271751296338823e-05, + "loss": 0.5569, + "step": 417 + }, + { + "epoch": 0.6688, + "grad_norm": 0.2567723917323897, + "learning_rate": 5.226134599488728e-05, + "loss": 0.5722, + "step": 418 + }, + { + "epoch": 0.6704, + "grad_norm": 0.2563199159419971, + "learning_rate": 5.180646201763577e-05, + "loss": 0.6036, + "step": 419 + }, + { + "epoch": 0.672, + "grad_norm": 0.2541037005115942, + "learning_rate": 5.135287325678271e-05, + "loss": 0.5595, + "step": 420 + }, + { + "epoch": 0.6736, + "grad_norm": 0.2589425304040646, + "learning_rate": 5.090059190266779e-05, + "loss": 0.5767, + "step": 421 + }, + { + "epoch": 0.6752, + "grad_norm": 0.2584911309515902, + "learning_rate": 5.0449630110493836e-05, + "loss": 0.5766, + "step": 422 + }, + { + "epoch": 0.6768, + "grad_norm": 0.2506728798970084, + "learning_rate": 5.000000000000002e-05, + "loss": 0.5728, + "step": 423 + }, + { + "epoch": 0.6784, + "grad_norm": 0.26960684664060885, + "learning_rate": 4.955171365513603e-05, + "loss": 0.5557, + "step": 424 + }, + { + "epoch": 0.68, + "grad_norm": 0.24805931464523356, + "learning_rate": 4.9104783123737566e-05, + "loss": 0.5457, + "step": 425 + }, + { + "epoch": 0.6816, + "grad_norm": 0.25987190280249384, + "learning_rate": 4.865922041720239e-05, + "loss": 0.5746, + "step": 426 + }, + { + "epoch": 0.6832, + "grad_norm": 0.2573451608183604, + "learning_rate": 4.821503751016746e-05, + "loss": 0.5799, + "step": 427 + }, + { + "epoch": 0.6848, + "grad_norm": 0.2654122685222725, + "learning_rate": 4.777224634018732e-05, + "loss": 0.5924, + "step": 428 + }, + { + "epoch": 0.6864, + "grad_norm": 0.2597690654249084, + "learning_rate": 4.733085880741301e-05, + "loss": 0.6055, + "step": 429 + }, + { + "epoch": 0.688, + "grad_norm": 0.25243145661314076, + "learning_rate": 4.689088677427249e-05, + "loss": 0.5289, + "step": 430 + }, + { + "epoch": 0.6896, + "grad_norm": 0.25824439286544515, + "learning_rate": 4.645234206515171e-05, + "loss": 0.5814, + "step": 431 + }, + { + "epoch": 0.6912, + "grad_norm": 0.2544582038004977, + "learning_rate": 4.6015236466076747e-05, + "loss": 0.5619, + "step": 432 + }, + { + "epoch": 0.6928, + "grad_norm": 0.27471726182067324, + "learning_rate": 4.5579581724397255e-05, + "loss": 0.6207, + "step": 433 + }, + { + "epoch": 0.6944, + "grad_norm": 0.26012264505830507, + "learning_rate": 4.514538954847064e-05, + "loss": 0.6023, + "step": 434 + }, + { + "epoch": 0.696, + "grad_norm": 0.24092441320395502, + "learning_rate": 4.471267160734731e-05, + "loss": 0.5172, + "step": 435 + }, + { + "epoch": 0.6976, + "grad_norm": 0.25352004959955127, + "learning_rate": 4.428143953045717e-05, + "loss": 0.5565, + "step": 436 + }, + { + "epoch": 0.6992, + "grad_norm": 0.25212776924369507, + "learning_rate": 4.385170490729712e-05, + "loss": 0.5687, + "step": 437 + }, + { + "epoch": 0.7008, + "grad_norm": 0.2616338146981191, + "learning_rate": 4.342347928711953e-05, + "loss": 0.6115, + "step": 438 + }, + { + "epoch": 0.7024, + "grad_norm": 0.2552747385884955, + "learning_rate": 4.2996774178621736e-05, + "loss": 0.5545, + "step": 439 + }, + { + "epoch": 0.704, + "grad_norm": 0.29390560725250053, + "learning_rate": 4.257160104963696e-05, + "loss": 0.5344, + "step": 440 + }, + { + "epoch": 0.7056, + "grad_norm": 0.24986156430741233, + "learning_rate": 4.2147971326825966e-05, + "loss": 0.5611, + "step": 441 + }, + { + "epoch": 0.7072, + "grad_norm": 0.25614139660011026, + "learning_rate": 4.172589639536991e-05, + "loss": 0.5625, + "step": 442 + }, + { + "epoch": 0.7088, + "grad_norm": 0.2538962500697107, + "learning_rate": 4.130538759866457e-05, + "loss": 0.5843, + "step": 443 + }, + { + "epoch": 0.7104, + "grad_norm": 0.2722022908484456, + "learning_rate": 4.088645623801534e-05, + "loss": 0.6219, + "step": 444 + }, + { + "epoch": 0.712, + "grad_norm": 0.25999053783595505, + "learning_rate": 4.046911357233343e-05, + "loss": 0.5811, + "step": 445 + }, + { + "epoch": 0.7136, + "grad_norm": 0.2632170630137, + "learning_rate": 4.00533708178334e-05, + "loss": 0.5454, + "step": 446 + }, + { + "epoch": 0.7152, + "grad_norm": 0.2589898494047925, + "learning_rate": 3.963923914773187e-05, + "loss": 0.5568, + "step": 447 + }, + { + "epoch": 0.7168, + "grad_norm": 0.24241528073093044, + "learning_rate": 3.922672969194686e-05, + "loss": 0.5541, + "step": 448 + }, + { + "epoch": 0.7184, + "grad_norm": 0.2792473797078321, + "learning_rate": 3.8815853536798904e-05, + "loss": 0.5895, + "step": 449 + }, + { + "epoch": 0.72, + "grad_norm": 0.2493853268888264, + "learning_rate": 3.840662172471315e-05, + "loss": 0.5597, + "step": 450 + }, + { + "epoch": 0.7216, + "grad_norm": 0.2587460910617057, + "learning_rate": 3.79990452539225e-05, + "loss": 0.6024, + "step": 451 + }, + { + "epoch": 0.7232, + "grad_norm": 0.25970986514457856, + "learning_rate": 3.759313507817196e-05, + "loss": 0.6028, + "step": 452 + }, + { + "epoch": 0.7248, + "grad_norm": 0.2627758245134774, + "learning_rate": 3.7188902106424416e-05, + "loss": 0.5857, + "step": 453 + }, + { + "epoch": 0.7264, + "grad_norm": 0.25925746428700547, + "learning_rate": 3.678635720256737e-05, + "loss": 0.5724, + "step": 454 + }, + { + "epoch": 0.728, + "grad_norm": 0.41712529798668274, + "learning_rate": 3.638551118512089e-05, + "loss": 0.5884, + "step": 455 + }, + { + "epoch": 0.7296, + "grad_norm": 0.25078015135979354, + "learning_rate": 3.5986374826947066e-05, + "loss": 0.5775, + "step": 456 + }, + { + "epoch": 0.7312, + "grad_norm": 0.2565385394651964, + "learning_rate": 3.558895885496023e-05, + "loss": 0.5995, + "step": 457 + }, + { + "epoch": 0.7328, + "grad_norm": 0.25156874540996954, + "learning_rate": 3.519327394983888e-05, + "loss": 0.5586, + "step": 458 + }, + { + "epoch": 0.7344, + "grad_norm": 0.24487007483044448, + "learning_rate": 3.479933074573858e-05, + "loss": 0.5574, + "step": 459 + }, + { + "epoch": 0.736, + "grad_norm": 0.2598900854989058, + "learning_rate": 3.440713983000601e-05, + "loss": 0.5817, + "step": 460 + }, + { + "epoch": 0.7376, + "grad_norm": 0.24895228375299977, + "learning_rate": 3.401671174289469e-05, + "loss": 0.5347, + "step": 461 + }, + { + "epoch": 0.7392, + "grad_norm": 0.2661519054238711, + "learning_rate": 3.362805697728145e-05, + "loss": 0.6037, + "step": 462 + }, + { + "epoch": 0.7408, + "grad_norm": 0.24630003568969208, + "learning_rate": 3.324118597838464e-05, + "loss": 0.5472, + "step": 463 + }, + { + "epoch": 0.7424, + "grad_norm": 0.25625016668788175, + "learning_rate": 3.285610914348332e-05, + "loss": 0.5795, + "step": 464 + }, + { + "epoch": 0.744, + "grad_norm": 0.2505624784270415, + "learning_rate": 3.2472836821637744e-05, + "loss": 0.5897, + "step": 465 + }, + { + "epoch": 0.7456, + "grad_norm": 0.43546402945514173, + "learning_rate": 3.209137931341143e-05, + "loss": 0.5565, + "step": 466 + }, + { + "epoch": 0.7472, + "grad_norm": 0.2626787083120136, + "learning_rate": 3.1711746870594086e-05, + "loss": 0.6334, + "step": 467 + }, + { + "epoch": 0.7488, + "grad_norm": 0.25090989395570357, + "learning_rate": 3.1333949695926324e-05, + "loss": 0.5973, + "step": 468 + }, + { + "epoch": 0.7504, + "grad_norm": 0.25336725221102824, + "learning_rate": 3.0957997942825336e-05, + "loss": 0.5828, + "step": 469 + }, + { + "epoch": 0.752, + "grad_norm": 0.24057783653954204, + "learning_rate": 3.058390171511196e-05, + "loss": 0.5532, + "step": 470 + }, + { + "epoch": 0.7536, + "grad_norm": 0.23959198782404773, + "learning_rate": 3.021167106673928e-05, + "loss": 0.5277, + "step": 471 + }, + { + "epoch": 0.7552, + "grad_norm": 0.24774383573640013, + "learning_rate": 2.9841316001522347e-05, + "loss": 0.5675, + "step": 472 + }, + { + "epoch": 0.7568, + "grad_norm": 0.24960252681386738, + "learning_rate": 2.9472846472869298e-05, + "loss": 0.5678, + "step": 473 + }, + { + "epoch": 0.7584, + "grad_norm": 0.2484795625510042, + "learning_rate": 2.9106272383513835e-05, + "loss": 0.5662, + "step": 474 + }, + { + "epoch": 0.76, + "grad_norm": 0.24545186743564423, + "learning_rate": 2.874160358524931e-05, + "loss": 0.5524, + "step": 475 + }, + { + "epoch": 0.7616, + "grad_norm": 0.2516789090041251, + "learning_rate": 2.8378849878663628e-05, + "loss": 0.5912, + "step": 476 + }, + { + "epoch": 0.7632, + "grad_norm": 0.2607401899793407, + "learning_rate": 2.8018021012875994e-05, + "loss": 0.6134, + "step": 477 + }, + { + "epoch": 0.7648, + "grad_norm": 0.2585203137406373, + "learning_rate": 2.7659126685275027e-05, + "loss": 0.5772, + "step": 478 + }, + { + "epoch": 0.7664, + "grad_norm": 0.25385753554872703, + "learning_rate": 2.7302176541257986e-05, + "loss": 0.5921, + "step": 479 + }, + { + "epoch": 0.768, + "grad_norm": 0.24600250850794042, + "learning_rate": 2.6947180173971508e-05, + "loss": 0.5698, + "step": 480 + }, + { + "epoch": 0.7696, + "grad_norm": 0.2576103052865085, + "learning_rate": 2.659414712405398e-05, + "loss": 0.5909, + "step": 481 + }, + { + "epoch": 0.7712, + "grad_norm": 0.25480879506443094, + "learning_rate": 2.6243086879379e-05, + "loss": 0.5544, + "step": 482 + }, + { + "epoch": 0.7728, + "grad_norm": 0.2503529255141808, + "learning_rate": 2.5894008874800325e-05, + "loss": 0.5267, + "step": 483 + }, + { + "epoch": 0.7744, + "grad_norm": 0.2586930060114874, + "learning_rate": 2.5546922491898495e-05, + "loss": 0.5903, + "step": 484 + }, + { + "epoch": 0.776, + "grad_norm": 0.2530021008014392, + "learning_rate": 2.5201837058728505e-05, + "loss": 0.5622, + "step": 485 + }, + { + "epoch": 0.7776, + "grad_norm": 0.25162412800145234, + "learning_rate": 2.485876184956928e-05, + "loss": 0.5582, + "step": 486 + }, + { + "epoch": 0.7792, + "grad_norm": 0.2531143883528878, + "learning_rate": 2.451770608467432e-05, + "loss": 0.5705, + "step": 487 + }, + { + "epoch": 0.7808, + "grad_norm": 0.2438321700064606, + "learning_rate": 2.417867893002387e-05, + "loss": 0.5522, + "step": 488 + }, + { + "epoch": 0.7824, + "grad_norm": 0.26017372144360845, + "learning_rate": 2.3841689497078746e-05, + "loss": 0.5923, + "step": 489 + }, + { + "epoch": 0.784, + "grad_norm": 0.2592013861597092, + "learning_rate": 2.3506746842535242e-05, + "loss": 0.5751, + "step": 490 + }, + { + "epoch": 0.7856, + "grad_norm": 0.2604178559013015, + "learning_rate": 2.3173859968081944e-05, + "loss": 0.6108, + "step": 491 + }, + { + "epoch": 0.7872, + "grad_norm": 0.24911407942551256, + "learning_rate": 2.2843037820157675e-05, + "loss": 0.5424, + "step": 492 + }, + { + "epoch": 0.7888, + "grad_norm": 0.2738283985167626, + "learning_rate": 2.251428928971102e-05, + "loss": 0.5523, + "step": 493 + }, + { + "epoch": 0.7904, + "grad_norm": 0.2425227355627501, + "learning_rate": 2.2187623211961562e-05, + "loss": 0.5421, + "step": 494 + }, + { + "epoch": 0.792, + "grad_norm": 0.2656925026244092, + "learning_rate": 2.1863048366162208e-05, + "loss": 0.602, + "step": 495 + }, + { + "epoch": 0.7936, + "grad_norm": 0.24927991955892181, + "learning_rate": 2.1540573475363402e-05, + "loss": 0.5562, + "step": 496 + }, + { + "epoch": 0.7952, + "grad_norm": 0.2517727024587019, + "learning_rate": 2.1220207206178688e-05, + "loss": 0.5613, + "step": 497 + }, + { + "epoch": 0.7968, + "grad_norm": 0.2539945051749779, + "learning_rate": 2.0901958168551638e-05, + "loss": 0.5609, + "step": 498 + }, + { + "epoch": 0.7984, + "grad_norm": 0.26460778710159644, + "learning_rate": 2.058583491552465e-05, + "loss": 0.5789, + "step": 499 + }, + { + "epoch": 0.8, + "grad_norm": 0.2501401457440102, + "learning_rate": 2.027184594300898e-05, + "loss": 0.5767, + "step": 500 + }, + { + "epoch": 0.8016, + "grad_norm": 0.25108695158570427, + "learning_rate": 1.995999968955641e-05, + "loss": 0.5748, + "step": 501 + }, + { + "epoch": 0.8032, + "grad_norm": 0.26144078355869815, + "learning_rate": 1.9650304536132426e-05, + "loss": 0.5559, + "step": 502 + }, + { + "epoch": 0.8048, + "grad_norm": 0.2424811775551495, + "learning_rate": 1.9342768805891178e-05, + "loss": 0.5232, + "step": 503 + }, + { + "epoch": 0.8064, + "grad_norm": 0.2629144195906486, + "learning_rate": 1.903740076395151e-05, + "loss": 0.5582, + "step": 504 + }, + { + "epoch": 0.808, + "grad_norm": 0.2632536996658287, + "learning_rate": 1.8734208617174988e-05, + "loss": 0.5965, + "step": 505 + }, + { + "epoch": 0.8096, + "grad_norm": 0.2578948318399331, + "learning_rate": 1.8433200513945337e-05, + "loss": 0.5688, + "step": 506 + }, + { + "epoch": 0.8112, + "grad_norm": 0.25383747050695094, + "learning_rate": 1.8134384543949478e-05, + "loss": 0.5569, + "step": 507 + }, + { + "epoch": 0.8128, + "grad_norm": 0.2574345951818969, + "learning_rate": 1.783776873795994e-05, + "loss": 0.547, + "step": 508 + }, + { + "epoch": 0.8144, + "grad_norm": 0.27222780005809916, + "learning_rate": 1.754336106761927e-05, + "loss": 0.5861, + "step": 509 + }, + { + "epoch": 0.816, + "grad_norm": 0.2606781785106011, + "learning_rate": 1.7251169445225657e-05, + "loss": 0.5884, + "step": 510 + }, + { + "epoch": 0.8176, + "grad_norm": 0.2551876492978511, + "learning_rate": 1.696120172352025e-05, + "loss": 0.5734, + "step": 511 + }, + { + "epoch": 0.8192, + "grad_norm": 0.24618973056746155, + "learning_rate": 1.6673465695476232e-05, + "loss": 0.5319, + "step": 512 + }, + { + "epoch": 0.8208, + "grad_norm": 0.2591979469770761, + "learning_rate": 1.6387969094089316e-05, + "loss": 0.5937, + "step": 513 + }, + { + "epoch": 0.8224, + "grad_norm": 0.2573694155924506, + "learning_rate": 1.6104719592169902e-05, + "loss": 0.5453, + "step": 514 + }, + { + "epoch": 0.824, + "grad_norm": 0.2504361140343124, + "learning_rate": 1.5823724802136865e-05, + "loss": 0.5583, + "step": 515 + }, + { + "epoch": 0.8256, + "grad_norm": 0.2469292086578109, + "learning_rate": 1.5544992275813053e-05, + "loss": 0.5438, + "step": 516 + }, + { + "epoch": 0.8272, + "grad_norm": 0.25818318752576547, + "learning_rate": 1.526852950422226e-05, + "loss": 0.5827, + "step": 517 + }, + { + "epoch": 0.8288, + "grad_norm": 0.2671447662861101, + "learning_rate": 1.4994343917387854e-05, + "loss": 0.5438, + "step": 518 + }, + { + "epoch": 0.8304, + "grad_norm": 0.26518579024890976, + "learning_rate": 1.4722442884133214e-05, + "loss": 0.5935, + "step": 519 + }, + { + "epoch": 0.832, + "grad_norm": 0.2682839201056169, + "learning_rate": 1.4452833711883628e-05, + "loss": 0.6102, + "step": 520 + }, + { + "epoch": 0.8336, + "grad_norm": 0.2668231948844124, + "learning_rate": 1.4185523646469822e-05, + "loss": 0.6063, + "step": 521 + }, + { + "epoch": 0.8352, + "grad_norm": 0.2568029936082383, + "learning_rate": 1.3920519871933424e-05, + "loss": 0.5416, + "step": 522 + }, + { + "epoch": 0.8368, + "grad_norm": 0.2490028841911143, + "learning_rate": 1.3657829510333654e-05, + "loss": 0.5445, + "step": 523 + }, + { + "epoch": 0.8384, + "grad_norm": 0.25650024875764577, + "learning_rate": 1.339745962155613e-05, + "loss": 0.545, + "step": 524 + }, + { + "epoch": 0.84, + "grad_norm": 0.25741474548439974, + "learning_rate": 1.3139417203123027e-05, + "loss": 0.5447, + "step": 525 + }, + { + "epoch": 0.8416, + "grad_norm": 0.2562124919206727, + "learning_rate": 1.2883709190004955e-05, + "loss": 0.6024, + "step": 526 + }, + { + "epoch": 0.8432, + "grad_norm": 0.26154778091279046, + "learning_rate": 1.263034245443473e-05, + "loss": 0.5936, + "step": 527 + }, + { + "epoch": 0.8448, + "grad_norm": 0.2655385936068879, + "learning_rate": 1.2379323805722576e-05, + "loss": 0.6057, + "step": 528 + }, + { + "epoch": 0.8464, + "grad_norm": 0.25352887609445324, + "learning_rate": 1.2130659990073146e-05, + "loss": 0.5775, + "step": 529 + }, + { + "epoch": 0.848, + "grad_norm": 0.25670604310461265, + "learning_rate": 1.1884357690404158e-05, + "loss": 0.5839, + "step": 530 + }, + { + "epoch": 0.8496, + "grad_norm": 0.2581312700128164, + "learning_rate": 1.1640423526166988e-05, + "loss": 0.6202, + "step": 531 + }, + { + "epoch": 0.8512, + "grad_norm": 0.2587743828479023, + "learning_rate": 1.1398864053168534e-05, + "loss": 0.578, + "step": 532 + }, + { + "epoch": 0.8528, + "grad_norm": 0.25323698738285116, + "learning_rate": 1.1159685763395111e-05, + "loss": 0.5811, + "step": 533 + }, + { + "epoch": 0.8544, + "grad_norm": 0.26379924381872405, + "learning_rate": 1.0922895084838037e-05, + "loss": 0.6065, + "step": 534 + }, + { + "epoch": 0.856, + "grad_norm": 0.25970803047318636, + "learning_rate": 1.0688498381320855e-05, + "loss": 0.5775, + "step": 535 + }, + { + "epoch": 0.8576, + "grad_norm": 0.24513668859607107, + "learning_rate": 1.045650195232819e-05, + "loss": 0.5411, + "step": 536 + }, + { + "epoch": 0.8592, + "grad_norm": 0.2593808847300968, + "learning_rate": 1.0226912032836611e-05, + "loss": 0.5887, + "step": 537 + }, + { + "epoch": 0.8608, + "grad_norm": 0.26069262793188025, + "learning_rate": 9.999734793146998e-06, + "loss": 0.5884, + "step": 538 + }, + { + "epoch": 0.8624, + "grad_norm": 0.255292393884229, + "learning_rate": 9.774976338718677e-06, + "loss": 0.5643, + "step": 539 + }, + { + "epoch": 0.864, + "grad_norm": 0.28751175707868937, + "learning_rate": 9.552642710005299e-06, + "loss": 0.5643, + "step": 540 + }, + { + "epoch": 0.8656, + "grad_norm": 0.25959765605546553, + "learning_rate": 9.332739882292752e-06, + "loss": 0.5339, + "step": 541 + }, + { + "epoch": 0.8672, + "grad_norm": 0.24980709115022895, + "learning_rate": 9.115273765538202e-06, + "loss": 0.5695, + "step": 542 + }, + { + "epoch": 0.8688, + "grad_norm": 0.2452209267199703, + "learning_rate": 8.900250204211514e-06, + "loss": 0.5165, + "step": 543 + }, + { + "epoch": 0.8704, + "grad_norm": 0.2524793924892976, + "learning_rate": 8.687674977138116e-06, + "loss": 0.5597, + "step": 544 + }, + { + "epoch": 0.872, + "grad_norm": 0.26606410256191265, + "learning_rate": 8.47755379734373e-06, + "loss": 0.6186, + "step": 545 + }, + { + "epoch": 0.8736, + "grad_norm": 0.2533467970780035, + "learning_rate": 8.269892311900696e-06, + "loss": 0.5616, + "step": 546 + }, + { + "epoch": 0.8752, + "grad_norm": 0.25755411819862256, + "learning_rate": 8.064696101776358e-06, + "loss": 0.5627, + "step": 547 + }, + { + "epoch": 0.8768, + "grad_norm": 0.2575217298135753, + "learning_rate": 7.861970681683051e-06, + "loss": 0.581, + "step": 548 + }, + { + "epoch": 0.8784, + "grad_norm": 0.24982665016048144, + "learning_rate": 7.661721499929753e-06, + "loss": 0.5396, + "step": 549 + }, + { + "epoch": 0.88, + "grad_norm": 0.2641747404541249, + "learning_rate": 7.463953938275858e-06, + "loss": 0.5777, + "step": 550 + }, + { + "epoch": 0.8816, + "grad_norm": 0.2756926446548946, + "learning_rate": 7.2686733117863784e-06, + "loss": 0.5419, + "step": 551 + }, + { + "epoch": 0.8832, + "grad_norm": 0.24519930818122074, + "learning_rate": 7.07588486868922e-06, + "loss": 0.5412, + "step": 552 + }, + { + "epoch": 0.8848, + "grad_norm": 0.25527219219142355, + "learning_rate": 6.8855937902340576e-06, + "loss": 0.5671, + "step": 553 + }, + { + "epoch": 0.8864, + "grad_norm": 0.24507775526680298, + "learning_rate": 6.6978051905530855e-06, + "loss": 0.5435, + "step": 554 + }, + { + "epoch": 0.888, + "grad_norm": 0.25279856928031735, + "learning_rate": 6.512524116523633e-06, + "loss": 0.5232, + "step": 555 + }, + { + "epoch": 0.8896, + "grad_norm": 0.24229897895278965, + "learning_rate": 6.329755547632499e-06, + "loss": 0.4994, + "step": 556 + }, + { + "epoch": 0.8912, + "grad_norm": 0.24367438318292856, + "learning_rate": 6.149504395842087e-06, + "loss": 0.5668, + "step": 557 + }, + { + "epoch": 0.8928, + "grad_norm": 0.24611400184096086, + "learning_rate": 5.971775505458444e-06, + "loss": 0.5576, + "step": 558 + }, + { + "epoch": 0.8944, + "grad_norm": 0.25894143808589626, + "learning_rate": 5.7965736530010916e-06, + "loss": 0.5907, + "step": 559 + }, + { + "epoch": 0.896, + "grad_norm": 0.2544556139308709, + "learning_rate": 5.623903547074549e-06, + "loss": 0.5649, + "step": 560 + }, + { + "epoch": 0.8976, + "grad_norm": 0.25409370996140834, + "learning_rate": 5.453769828241872e-06, + "loss": 0.5291, + "step": 561 + }, + { + "epoch": 0.8992, + "grad_norm": 0.24278223953313688, + "learning_rate": 5.286177068899989e-06, + "loss": 0.5242, + "step": 562 + }, + { + "epoch": 0.9008, + "grad_norm": 0.2692566239437579, + "learning_rate": 5.121129773156663e-06, + "loss": 0.6204, + "step": 563 + }, + { + "epoch": 0.9024, + "grad_norm": 0.24806440013057024, + "learning_rate": 4.95863237670956e-06, + "loss": 0.5665, + "step": 564 + }, + { + "epoch": 0.904, + "grad_norm": 0.2461628864242156, + "learning_rate": 4.798689246727006e-06, + "loss": 0.5538, + "step": 565 + }, + { + "epoch": 0.9056, + "grad_norm": 0.2559122982021189, + "learning_rate": 4.641304681730641e-06, + "loss": 0.604, + "step": 566 + }, + { + "epoch": 0.9072, + "grad_norm": 0.2606246789860794, + "learning_rate": 4.486482911479839e-06, + "loss": 0.5897, + "step": 567 + }, + { + "epoch": 0.9088, + "grad_norm": 0.2531046625803253, + "learning_rate": 4.3342280968580285e-06, + "loss": 0.5381, + "step": 568 + }, + { + "epoch": 0.9104, + "grad_norm": 0.2644909172549217, + "learning_rate": 4.184544329761009e-06, + "loss": 0.6065, + "step": 569 + }, + { + "epoch": 0.912, + "grad_norm": 0.25998843134511657, + "learning_rate": 4.037435632986786e-06, + "loss": 0.5756, + "step": 570 + }, + { + "epoch": 0.9136, + "grad_norm": 0.2438607876632911, + "learning_rate": 3.892905960127546e-06, + "loss": 0.5519, + "step": 571 + }, + { + "epoch": 0.9152, + "grad_norm": 0.2557809493211503, + "learning_rate": 3.750959195463466e-06, + "loss": 0.5794, + "step": 572 + }, + { + "epoch": 0.9168, + "grad_norm": 0.24356844865025518, + "learning_rate": 3.611599153858214e-06, + "loss": 0.5457, + "step": 573 + }, + { + "epoch": 0.9184, + "grad_norm": 0.2554892701473439, + "learning_rate": 3.4748295806564356e-06, + "loss": 0.5605, + "step": 574 + }, + { + "epoch": 0.92, + "grad_norm": 0.2463181662462567, + "learning_rate": 3.3406541515832003e-06, + "loss": 0.5622, + "step": 575 + }, + { + "epoch": 0.9216, + "grad_norm": 0.24714707374932277, + "learning_rate": 3.209076472645112e-06, + "loss": 0.5722, + "step": 576 + }, + { + "epoch": 0.9232, + "grad_norm": 0.2523035198417324, + "learning_rate": 3.0801000800333877e-06, + "loss": 0.5323, + "step": 577 + }, + { + "epoch": 0.9248, + "grad_norm": 0.2446822675297152, + "learning_rate": 2.9537284400289355e-06, + "loss": 0.5344, + "step": 578 + }, + { + "epoch": 0.9264, + "grad_norm": 0.257524947253814, + "learning_rate": 2.8299649489090475e-06, + "loss": 0.593, + "step": 579 + }, + { + "epoch": 0.928, + "grad_norm": 0.26661384217269185, + "learning_rate": 2.708812932856253e-06, + "loss": 0.571, + "step": 580 + }, + { + "epoch": 0.9296, + "grad_norm": 0.2626126806242866, + "learning_rate": 2.590275647868867e-06, + "loss": 0.5903, + "step": 581 + }, + { + "epoch": 0.9312, + "grad_norm": 0.25084123396603797, + "learning_rate": 2.4743562796734622e-06, + "loss": 0.5972, + "step": 582 + }, + { + "epoch": 0.9328, + "grad_norm": 0.24678763420634603, + "learning_rate": 2.3610579436393e-06, + "loss": 0.5488, + "step": 583 + }, + { + "epoch": 0.9344, + "grad_norm": 0.2548720457798887, + "learning_rate": 2.250383684694579e-06, + "loss": 0.5851, + "step": 584 + }, + { + "epoch": 0.936, + "grad_norm": 0.25074968600043734, + "learning_rate": 2.1423364772445887e-06, + "loss": 0.5592, + "step": 585 + }, + { + "epoch": 0.9376, + "grad_norm": 0.25708420318674485, + "learning_rate": 2.036919225091827e-06, + "loss": 0.584, + "step": 586 + }, + { + "epoch": 0.9392, + "grad_norm": 0.2589322326993553, + "learning_rate": 1.9341347613579087e-06, + "loss": 0.6178, + "step": 587 + }, + { + "epoch": 0.9408, + "grad_norm": 0.25656778306444317, + "learning_rate": 1.8339858484073935e-06, + "loss": 0.5435, + "step": 588 + }, + { + "epoch": 0.9424, + "grad_norm": 0.2558065441349567, + "learning_rate": 1.7364751777736332e-06, + "loss": 0.5498, + "step": 589 + }, + { + "epoch": 0.944, + "grad_norm": 0.24053570452087028, + "learning_rate": 1.6416053700863964e-06, + "loss": 0.536, + "step": 590 + }, + { + "epoch": 0.9456, + "grad_norm": 0.2716175789734306, + "learning_rate": 1.5493789750014031e-06, + "loss": 0.5876, + "step": 591 + }, + { + "epoch": 0.9472, + "grad_norm": 0.2525541128663851, + "learning_rate": 1.459798471131868e-06, + "loss": 0.5904, + "step": 592 + }, + { + "epoch": 0.9488, + "grad_norm": 0.25817865905570425, + "learning_rate": 1.3728662659818204e-06, + "loss": 0.5889, + "step": 593 + }, + { + "epoch": 0.9504, + "grad_norm": 0.26151537175840095, + "learning_rate": 1.2885846958814673e-06, + "loss": 0.5729, + "step": 594 + }, + { + "epoch": 0.952, + "grad_norm": 0.24225587612941454, + "learning_rate": 1.2069560259243328e-06, + "loss": 0.5125, + "step": 595 + }, + { + "epoch": 0.9536, + "grad_norm": 0.2628493753306178, + "learning_rate": 1.1279824499064396e-06, + "loss": 0.5848, + "step": 596 + }, + { + "epoch": 0.9552, + "grad_norm": 0.24049741007521916, + "learning_rate": 1.0516660902673448e-06, + "loss": 0.5325, + "step": 597 + }, + { + "epoch": 0.9568, + "grad_norm": 0.25287426448549716, + "learning_rate": 9.780089980330642e-07, + "loss": 0.5807, + "step": 598 + }, + { + "epoch": 0.9584, + "grad_norm": 0.2552367256720679, + "learning_rate": 9.070131527609604e-07, + "loss": 0.5416, + "step": 599 + }, + { + "epoch": 0.96, + "grad_norm": 0.24544619556081892, + "learning_rate": 8.386804624865851e-07, + "loss": 0.5677, + "step": 600 + }, + { + "epoch": 0.9616, + "grad_norm": 0.30400369196431554, + "learning_rate": 7.730127636723539e-07, + "loss": 0.5902, + "step": 601 + }, + { + "epoch": 0.9632, + "grad_norm": 0.24800737902624073, + "learning_rate": 7.100118211581852e-07, + "loss": 0.5535, + "step": 602 + }, + { + "epoch": 0.9648, + "grad_norm": 0.25712522301314844, + "learning_rate": 6.496793281141056e-07, + "loss": 0.5875, + "step": 603 + }, + { + "epoch": 0.9664, + "grad_norm": 0.2528708741278371, + "learning_rate": 5.920169059947411e-07, + "loss": 0.5716, + "step": 604 + }, + { + "epoch": 0.968, + "grad_norm": 0.25681466531883856, + "learning_rate": 5.370261044956971e-07, + "loss": 0.5543, + "step": 605 + }, + { + "epoch": 0.9696, + "grad_norm": 0.24032602027009492, + "learning_rate": 4.847084015119574e-07, + "loss": 0.5213, + "step": 606 + }, + { + "epoch": 0.9712, + "grad_norm": 0.25652917793532737, + "learning_rate": 4.3506520309813947e-07, + "loss": 0.5792, + "step": 607 + }, + { + "epoch": 0.9728, + "grad_norm": 0.26091177722146935, + "learning_rate": 3.8809784343072366e-07, + "loss": 0.6043, + "step": 608 + }, + { + "epoch": 0.9744, + "grad_norm": 0.25003264999584973, + "learning_rate": 3.4380758477219333e-07, + "loss": 0.5345, + "step": 609 + }, + { + "epoch": 0.976, + "grad_norm": 0.24675687713551328, + "learning_rate": 3.0219561743707326e-07, + "loss": 0.5462, + "step": 610 + }, + { + "epoch": 0.9776, + "grad_norm": 0.25019378660615477, + "learning_rate": 2.6326305976001055e-07, + "loss": 0.5657, + "step": 611 + }, + { + "epoch": 0.9792, + "grad_norm": 0.2528106317795277, + "learning_rate": 2.2701095806565432e-07, + "loss": 0.555, + "step": 612 + }, + { + "epoch": 0.9808, + "grad_norm": 0.26073189355575477, + "learning_rate": 1.9344028664056713e-07, + "loss": 0.5597, + "step": 613 + }, + { + "epoch": 0.9824, + "grad_norm": 0.25137663391032267, + "learning_rate": 1.6255194770704586e-07, + "loss": 0.5659, + "step": 614 + }, + { + "epoch": 0.984, + "grad_norm": 0.2488256428995093, + "learning_rate": 1.3434677139885222e-07, + "loss": 0.574, + "step": 615 + }, + { + "epoch": 0.9856, + "grad_norm": 0.24954010199025753, + "learning_rate": 1.0882551573891953e-07, + "loss": 0.5641, + "step": 616 + }, + { + "epoch": 0.9872, + "grad_norm": 0.23994659654705344, + "learning_rate": 8.598886661895788e-08, + "loss": 0.5419, + "step": 617 + }, + { + "epoch": 0.9888, + "grad_norm": 0.2550508858357467, + "learning_rate": 6.583743778106887e-08, + "loss": 0.5831, + "step": 618 + }, + { + "epoch": 0.9904, + "grad_norm": 0.24443138036583872, + "learning_rate": 4.837177080119215e-08, + "loss": 0.5389, + "step": 619 + }, + { + "epoch": 0.992, + "grad_norm": 0.2607132224613469, + "learning_rate": 3.359233507459481e-08, + "loss": 0.5493, + "step": 620 + }, + { + "epoch": 0.9936, + "grad_norm": 0.25489244073480916, + "learning_rate": 2.1499527803214846e-08, + "loss": 0.5677, + "step": 621 + }, + { + "epoch": 0.9952, + "grad_norm": 0.23953609103058476, + "learning_rate": 1.209367398504746e-08, + "loss": 0.5311, + "step": 622 + }, + { + "epoch": 0.9968, + "grad_norm": 0.23741179011189767, + "learning_rate": 5.375026405352035e-09, + "loss": 0.523, + "step": 623 + }, + { + "epoch": 0.9984, + "grad_norm": 0.251135351613199, + "learning_rate": 1.3437656298687097e-09, + "loss": 0.5472, + "step": 624 + }, + { + "epoch": 1.0, + "grad_norm": 0.24728636723210387, + "learning_rate": 0.0, + "loss": 0.5395, + "step": 625 + }, + { + "epoch": 1.0, + "step": 625, + "total_flos": 372789103198208.0, + "train_loss": 0.6182057282924652, + "train_runtime": 8257.1583, + "train_samples_per_second": 1.211, + "train_steps_per_second": 0.076 + } + ], + "logging_steps": 1.0, + "max_steps": 625, + "num_input_tokens_seen": 0, + "num_train_epochs": 1, + "save_steps": 500, + "stateful_callbacks": { + "TrainerControl": { + "args": { + "should_epoch_stop": false, + "should_evaluate": false, + "should_log": false, + "should_save": true, + "should_training_stop": false + }, + "attributes": {} + } + }, + "total_flos": 372789103198208.0, + "train_batch_size": 8, + "trial_name": null, + "trial_params": null +} diff --git a/single_dataset/img2json/bugsBunny-Llama-3-8B-V-img2json_dataset_5000_epochs_1_lora/README.md b/single_dataset/img2json/bugsBunny-Llama-3-8B-V-img2json_dataset_5000_epochs_1_lora/README.md new file mode 100644 index 0000000000000000000000000000000000000000..53481dd51193a0e71928271293246738288877dc --- /dev/null +++ b/single_dataset/img2json/bugsBunny-Llama-3-8B-V-img2json_dataset_5000_epochs_1_lora/README.md @@ -0,0 +1,202 @@ +--- +library_name: peft +base_model: ./weights/Bunny-Llama-3-8B-V +--- + +# Model Card for Model ID + + + + + +## Model Details + +### Model Description + + + + + +- **Developed by:** [More Information Needed] +- **Funded by [optional]:** [More Information Needed] +- **Shared by [optional]:** [More Information Needed] +- **Model type:** [More Information Needed] +- **Language(s) (NLP):** [More Information Needed] +- **License:** [More Information Needed] +- **Finetuned from model [optional]:** [More Information Needed] + +### Model Sources [optional] + + + +- **Repository:** [More Information Needed] +- **Paper [optional]:** [More Information Needed] +- **Demo [optional]:** [More Information Needed] + +## Uses + + + +### Direct Use + + + +[More Information Needed] + +### Downstream Use [optional] + + + +[More Information Needed] + +### Out-of-Scope Use + + + +[More Information Needed] + +## Bias, Risks, and Limitations + + + +[More Information Needed] + +### Recommendations + + + +Users (both direct and downstream) should be made aware of the risks, biases and limitations of the model. More information needed for further recommendations. + +## How to Get Started with the Model + +Use the code below to get started with the model. + +[More Information Needed] + +## Training Details + +### Training Data + + + +[More Information Needed] + +### Training Procedure + + + +#### Preprocessing [optional] + +[More Information Needed] + + +#### Training Hyperparameters + +- **Training regime:** [More Information Needed] + +#### Speeds, Sizes, Times [optional] + + + +[More Information Needed] + +## Evaluation + + + +### Testing Data, Factors & Metrics + +#### Testing Data + + + +[More Information Needed] + +#### Factors + + + +[More Information Needed] + +#### Metrics + + + +[More Information Needed] + +### Results + +[More Information Needed] + +#### Summary + + + +## Model Examination [optional] + + + +[More Information Needed] + +## Environmental Impact + + + +Carbon emissions can be estimated using the [Machine Learning Impact calculator](https://mlco2.github.io/impact#compute) presented in [Lacoste et al. (2019)](https://arxiv.org/abs/1910.09700). + +- **Hardware Type:** [More Information Needed] +- **Hours used:** [More Information Needed] +- **Cloud Provider:** [More Information Needed] +- **Compute Region:** [More Information Needed] +- **Carbon Emitted:** [More Information Needed] + +## Technical Specifications [optional] + +### Model Architecture and Objective + +[More Information Needed] + +### Compute Infrastructure + +[More Information Needed] + +#### Hardware + +[More Information Needed] + +#### Software + +[More Information Needed] + +## Citation [optional] + + + +**BibTeX:** + +[More Information Needed] + +**APA:** + +[More Information Needed] + +## Glossary [optional] + + + +[More Information Needed] + +## More Information [optional] + +[More Information Needed] + +## Model Card Authors [optional] + +[More Information Needed] + +## Model Card Contact + +[More Information Needed] +### Framework versions + +- PEFT 0.11.1 \ No newline at end of file diff --git a/single_dataset/img2json/bugsBunny-Llama-3-8B-V-img2json_dataset_5000_epochs_1_lora/adapter_config.json b/single_dataset/img2json/bugsBunny-Llama-3-8B-V-img2json_dataset_5000_epochs_1_lora/adapter_config.json new file mode 100644 index 0000000000000000000000000000000000000000..6964ef6c9145e5d4c10ba065397534061cd2c5fb --- /dev/null +++ b/single_dataset/img2json/bugsBunny-Llama-3-8B-V-img2json_dataset_5000_epochs_1_lora/adapter_config.json @@ -0,0 +1,34 @@ +{ + "alpha_pattern": {}, + "auto_mapping": null, + "base_model_name_or_path": "./weights/Bunny-Llama-3-8B-V", + "bias": "none", + "fan_in_fan_out": false, + "inference_mode": true, + "init_lora_weights": true, + "layer_replication": null, + "layers_pattern": null, + "layers_to_transform": null, + "loftq_config": {}, + "lora_alpha": 256, + "lora_dropout": 0.1, + "megatron_config": null, + "megatron_core": "megatron.core", + "modules_to_save": null, + "peft_type": "LORA", + "r": 128, + "rank_pattern": {}, + "revision": null, + "target_modules": [ + "up_proj", + "gate_proj", + "v_proj", + "down_proj", + "q_proj", + "o_proj", + "k_proj" + ], + "task_type": "CAUSAL_LM", + "use_dora": false, + "use_rslora": false +} \ No newline at end of file diff --git a/single_dataset/img2json/bugsBunny-Llama-3-8B-V-img2json_dataset_5000_epochs_1_lora/adapter_model.safetensors b/single_dataset/img2json/bugsBunny-Llama-3-8B-V-img2json_dataset_5000_epochs_1_lora/adapter_model.safetensors new file mode 100644 index 0000000000000000000000000000000000000000..853514ea06ae28826f5e3262aa0a07c76d6eb9a1 --- /dev/null +++ b/single_dataset/img2json/bugsBunny-Llama-3-8B-V-img2json_dataset_5000_epochs_1_lora/adapter_model.safetensors @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:65ff6fb22dfaec71c00b932805e5dee92f9b92d22dc3cd5f6f58dcdcafd3d949 +size 671150064 diff --git a/single_dataset/img2json/bugsBunny-Llama-3-8B-V-img2json_dataset_5000_epochs_1_lora/config.json b/single_dataset/img2json/bugsBunny-Llama-3-8B-V-img2json_dataset_5000_epochs_1_lora/config.json new file mode 100644 index 0000000000000000000000000000000000000000..14d0036f2d6ef7a43e27dd6ab3975619d8bb57a4 --- /dev/null +++ b/single_dataset/img2json/bugsBunny-Llama-3-8B-V-img2json_dataset_5000_epochs_1_lora/config.json @@ -0,0 +1,45 @@ +{ + "_name_or_path": "./weights/Bunny-Llama-3-8B-V", + "architectures": [ + "BunnyLlamaForCausalLM" + ], + "attention_bias": false, + "attention_dropout": 0.0, + "auto_map": { + "AutoConfig": "configuration_bunny_llama.BunnyLlamaConfig", + "AutoModelForCausalLM": "modeling_bunny_llama.BunnyLlamaForCausalLM" + }, + "bos_token_id": 128000, + "continuous_training": false, + "eos_token_id": 128001, + "freeze_mm_mlp_adapter": false, + "hidden_act": "silu", + "hidden_size": 4096, + "image_aspect_ratio": "pad", + "initializer_range": 0.02, + "intermediate_size": 14336, + "max_position_embeddings": 8192, + "mm_hidden_size": 1152, + "mm_projector_lr": null, + "mm_projector_type": "mlp2x_gelu", + "mm_vision_tower": "./weights/siglip-so400m-patch14-384", + "model_type": "bunny-llama", + "num_attention_heads": 32, + "num_hidden_layers": 32, + "num_key_value_heads": 8, + "pretraining_tp": 1, + "rms_norm_eps": 1e-05, + "rope_scaling": null, + "rope_theta": 500000.0, + "tie_word_embeddings": false, + "tokenizer_model_max_length": 2048, + "tokenizer_padding_side": "right", + "torch_dtype": "float16", + "transformers_version": "4.41.2", + "tune_mm_mlp_adapter": false, + "unfreeze_vision_tower": true, + "use_cache": true, + "use_mm_proj": true, + "use_s2": false, + "vocab_size": 128256 +} diff --git a/single_dataset/img2json/bugsBunny-Llama-3-8B-V-img2json_dataset_5000_epochs_1_lora/non_lora_trainables.bin b/single_dataset/img2json/bugsBunny-Llama-3-8B-V-img2json_dataset_5000_epochs_1_lora/non_lora_trainables.bin new file mode 100644 index 0000000000000000000000000000000000000000..02c09000e50f126169efa2be79b44cc6f2d03acd --- /dev/null +++ b/single_dataset/img2json/bugsBunny-Llama-3-8B-V-img2json_dataset_5000_epochs_1_lora/non_lora_trainables.bin @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:2d7005b204a93bd5f662d33d61974e83298a58f3195ae20f0ace5eb2d7251256 +size 899633034 diff --git a/single_dataset/img2json/bugsBunny-Llama-3-8B-V-img2json_dataset_5000_epochs_1_lora/trainer_state.json b/single_dataset/img2json/bugsBunny-Llama-3-8B-V-img2json_dataset_5000_epochs_1_lora/trainer_state.json new file mode 100644 index 0000000000000000000000000000000000000000..b7cd300a9de528940536bda1a8551389bb37fc5e --- /dev/null +++ b/single_dataset/img2json/bugsBunny-Llama-3-8B-V-img2json_dataset_5000_epochs_1_lora/trainer_state.json @@ -0,0 +1,2226 @@ +{ + "best_metric": null, + "best_model_checkpoint": null, + "epoch": 0.9984, + "eval_steps": 500, + "global_step": 312, + "is_hyper_param_search": false, + "is_local_process_zero": true, + "is_world_process_zero": true, + "log_history": [ + { + "epoch": 0.0032, + "grad_norm": 0.9456431410164449, + "learning_rate": 2e-05, + "loss": 1.4684, + "step": 1 + }, + { + "epoch": 0.0064, + "grad_norm": 0.9595554488188345, + "learning_rate": 4e-05, + "loss": 1.3749, + "step": 2 + }, + { + "epoch": 0.0096, + "grad_norm": 0.7819824872221672, + "learning_rate": 6e-05, + "loss": 1.3507, + "step": 3 + }, + { + "epoch": 0.0128, + "grad_norm": 0.9167023815526777, + "learning_rate": 8e-05, + "loss": 1.2165, + "step": 4 + }, + { + "epoch": 0.016, + "grad_norm": 0.847495342554959, + "learning_rate": 0.0001, + "loss": 1.1304, + "step": 5 + }, + { + "epoch": 0.0192, + "grad_norm": 0.8818427732884414, + "learning_rate": 0.00012, + "loss": 0.9715, + "step": 6 + }, + { + "epoch": 0.0224, + "grad_norm": 0.7726751380491141, + "learning_rate": 0.00014, + "loss": 0.9336, + "step": 7 + }, + { + "epoch": 0.0256, + "grad_norm": 0.6483561690537337, + "learning_rate": 0.00016, + "loss": 0.8903, + "step": 8 + }, + { + "epoch": 0.0288, + "grad_norm": 0.5687921838840364, + "learning_rate": 0.00018, + "loss": 0.7977, + "step": 9 + }, + { + "epoch": 0.032, + "grad_norm": 0.4217197943554311, + "learning_rate": 0.0002, + "loss": 0.8058, + "step": 10 + }, + { + "epoch": 0.0352, + "grad_norm": 0.39301658901144887, + "learning_rate": 0.00019999458931878073, + "loss": 0.8494, + "step": 11 + }, + { + "epoch": 0.0384, + "grad_norm": 0.4590183081730723, + "learning_rate": 0.0001999783578606323, + "loss": 0.8636, + "step": 12 + }, + { + "epoch": 0.0416, + "grad_norm": 0.41874721337638304, + "learning_rate": 0.00019995130738201966, + "loss": 0.8045, + "step": 13 + }, + { + "epoch": 0.0448, + "grad_norm": 0.36156401408882166, + "learning_rate": 0.0001999134408101731, + "loss": 0.7129, + "step": 14 + }, + { + "epoch": 0.048, + "grad_norm": 0.43359389552236977, + "learning_rate": 0.00019986476224277165, + "loss": 0.737, + "step": 15 + }, + { + "epoch": 0.0512, + "grad_norm": 0.3732292263108806, + "learning_rate": 0.00019980527694749952, + "loss": 0.721, + "step": 16 + }, + { + "epoch": 0.0544, + "grad_norm": 0.3672574865848022, + "learning_rate": 0.00019973499136147606, + "loss": 0.768, + "step": 17 + }, + { + "epoch": 0.0576, + "grad_norm": 0.3788969898154259, + "learning_rate": 0.0001996539130905593, + "loss": 0.6994, + "step": 18 + }, + { + "epoch": 0.0608, + "grad_norm": 0.3591018825770351, + "learning_rate": 0.0001995620509085228, + "loss": 0.7546, + "step": 19 + }, + { + "epoch": 0.064, + "grad_norm": 0.3347980700973235, + "learning_rate": 0.00019945941475610623, + "loss": 0.7799, + "step": 20 + }, + { + "epoch": 0.0672, + "grad_norm": 0.3176137835338356, + "learning_rate": 0.0001993460157399396, + "loss": 0.718, + "step": 21 + }, + { + "epoch": 0.0704, + "grad_norm": 0.3110224161054545, + "learning_rate": 0.0001992218661313415, + "loss": 0.6712, + "step": 22 + }, + { + "epoch": 0.0736, + "grad_norm": 0.3176740651877855, + "learning_rate": 0.00019908697936499103, + "loss": 0.7517, + "step": 23 + }, + { + "epoch": 0.0768, + "grad_norm": 0.31311011452703175, + "learning_rate": 0.00019894137003747403, + "loss": 0.7431, + "step": 24 + }, + { + "epoch": 0.08, + "grad_norm": 0.31035422091689385, + "learning_rate": 0.00019878505390570362, + "loss": 0.7121, + "step": 25 + }, + { + "epoch": 0.0832, + "grad_norm": 0.2986181957125755, + "learning_rate": 0.00019861804788521493, + "loss": 0.6987, + "step": 26 + }, + { + "epoch": 0.0864, + "grad_norm": 0.32611739837441994, + "learning_rate": 0.00019844037004833473, + "loss": 0.7732, + "step": 27 + }, + { + "epoch": 0.0896, + "grad_norm": 0.290949601128612, + "learning_rate": 0.00019825203962222572, + "loss": 0.6783, + "step": 28 + }, + { + "epoch": 0.0928, + "grad_norm": 0.30951784533907173, + "learning_rate": 0.0001980530769868059, + "loss": 0.7126, + "step": 29 + }, + { + "epoch": 0.096, + "grad_norm": 0.30642887130649027, + "learning_rate": 0.00019784350367254322, + "loss": 0.6716, + "step": 30 + }, + { + "epoch": 0.0992, + "grad_norm": 0.3253752205567512, + "learning_rate": 0.0001976233423581255, + "loss": 0.7618, + "step": 31 + }, + { + "epoch": 0.1024, + "grad_norm": 0.3074017490574547, + "learning_rate": 0.0001973926168680066, + "loss": 0.6863, + "step": 32 + }, + { + "epoch": 0.1056, + "grad_norm": 0.2991620525987948, + "learning_rate": 0.00019715135216982798, + "loss": 0.7061, + "step": 33 + }, + { + "epoch": 0.1088, + "grad_norm": 0.3226295279455581, + "learning_rate": 0.0001968995743717171, + "loss": 0.7164, + "step": 34 + }, + { + "epoch": 0.112, + "grad_norm": 0.2942063456789797, + "learning_rate": 0.00019663731071946206, + "loss": 0.6732, + "step": 35 + }, + { + "epoch": 0.1152, + "grad_norm": 0.30145830255706224, + "learning_rate": 0.00019636458959356316, + "loss": 0.6917, + "step": 36 + }, + { + "epoch": 0.1184, + "grad_norm": 0.2976180783817878, + "learning_rate": 0.0001960814405061619, + "loss": 0.6688, + "step": 37 + }, + { + "epoch": 0.1216, + "grad_norm": 0.28814304785348366, + "learning_rate": 0.00019578789409784727, + "loss": 0.6852, + "step": 38 + }, + { + "epoch": 0.1248, + "grad_norm": 0.2968046184063759, + "learning_rate": 0.00019548398213434007, + "loss": 0.6827, + "step": 39 + }, + { + "epoch": 0.128, + "grad_norm": 0.2848652475342455, + "learning_rate": 0.00019516973750305532, + "loss": 0.6905, + "step": 40 + }, + { + "epoch": 0.1312, + "grad_norm": 0.29541131745521493, + "learning_rate": 0.00019484519420954354, + "loss": 0.7409, + "step": 41 + }, + { + "epoch": 0.1344, + "grad_norm": 0.30472745054693, + "learning_rate": 0.00019451038737381077, + "loss": 0.6976, + "step": 42 + }, + { + "epoch": 0.1376, + "grad_norm": 0.287978936057206, + "learning_rate": 0.00019416535322651818, + "loss": 0.6889, + "step": 43 + }, + { + "epoch": 0.1408, + "grad_norm": 0.2917374421163095, + "learning_rate": 0.00019381012910506146, + "loss": 0.7072, + "step": 44 + }, + { + "epoch": 0.144, + "grad_norm": 0.3014118067717161, + "learning_rate": 0.00019344475344953012, + "loss": 0.736, + "step": 45 + }, + { + "epoch": 0.1472, + "grad_norm": 0.27940318160556293, + "learning_rate": 0.00019306926579854821, + "loss": 0.6561, + "step": 46 + }, + { + "epoch": 0.1504, + "grad_norm": 0.2764033621334266, + "learning_rate": 0.00019268370678499533, + "loss": 0.6425, + "step": 47 + }, + { + "epoch": 0.1536, + "grad_norm": 0.286603211032781, + "learning_rate": 0.0001922881181316097, + "loss": 0.6984, + "step": 48 + }, + { + "epoch": 0.1568, + "grad_norm": 0.29296348821168333, + "learning_rate": 0.00019188254264647337, + "loss": 0.6898, + "step": 49 + }, + { + "epoch": 0.16, + "grad_norm": 0.28097584337963444, + "learning_rate": 0.0001914670242183795, + "loss": 0.6858, + "step": 50 + }, + { + "epoch": 0.1632, + "grad_norm": 0.30154798580564207, + "learning_rate": 0.0001910416078120832, + "loss": 0.7145, + "step": 51 + }, + { + "epoch": 0.1664, + "grad_norm": 0.30795671266773783, + "learning_rate": 0.0001906063394634356, + "loss": 0.7126, + "step": 52 + }, + { + "epoch": 0.1696, + "grad_norm": 0.28661331080986213, + "learning_rate": 0.00019016126627440237, + "loss": 0.6722, + "step": 53 + }, + { + "epoch": 0.1728, + "grad_norm": 0.27995432204171594, + "learning_rate": 0.00018970643640796642, + "loss": 0.6533, + "step": 54 + }, + { + "epoch": 0.176, + "grad_norm": 0.2881701437861025, + "learning_rate": 0.000189241899082916, + "loss": 0.6864, + "step": 55 + }, + { + "epoch": 0.1792, + "grad_norm": 0.2863440297333805, + "learning_rate": 0.00018876770456851877, + "loss": 0.6545, + "step": 56 + }, + { + "epoch": 0.1824, + "grad_norm": 0.2880700843682891, + "learning_rate": 0.0001882839041790818, + "loss": 0.6694, + "step": 57 + }, + { + "epoch": 0.1856, + "grad_norm": 0.2774792629615745, + "learning_rate": 0.00018779055026839868, + "loss": 0.6505, + "step": 58 + }, + { + "epoch": 0.1888, + "grad_norm": 0.2737154598370057, + "learning_rate": 0.00018728769622408423, + "loss": 0.6388, + "step": 59 + }, + { + "epoch": 0.192, + "grad_norm": 0.27952240531043854, + "learning_rate": 0.00018677539646179707, + "loss": 0.6802, + "step": 60 + }, + { + "epoch": 0.1952, + "grad_norm": 0.28225508446141556, + "learning_rate": 0.00018625370641935129, + "loss": 0.6638, + "step": 61 + }, + { + "epoch": 0.1984, + "grad_norm": 0.28445391301141404, + "learning_rate": 0.00018572268255071718, + "loss": 0.6997, + "step": 62 + }, + { + "epoch": 0.2016, + "grad_norm": 0.27249304099350813, + "learning_rate": 0.00018518238231991218, + "loss": 0.665, + "step": 63 + }, + { + "epoch": 0.2048, + "grad_norm": 0.27827800737681285, + "learning_rate": 0.00018463286419478255, + "loss": 0.6747, + "step": 64 + }, + { + "epoch": 0.208, + "grad_norm": 0.28236239197955565, + "learning_rate": 0.00018407418764067627, + "loss": 0.6804, + "step": 65 + }, + { + "epoch": 0.2112, + "grad_norm": 0.27711713330173016, + "learning_rate": 0.00018350641311400812, + "loss": 0.6825, + "step": 66 + }, + { + "epoch": 0.2144, + "grad_norm": 0.27559978427300996, + "learning_rate": 0.0001829296020557174, + "loss": 0.6893, + "step": 67 + }, + { + "epoch": 0.2176, + "grad_norm": 0.28871955197128163, + "learning_rate": 0.00018234381688461942, + "loss": 0.6574, + "step": 68 + }, + { + "epoch": 0.2208, + "grad_norm": 0.2840675164080103, + "learning_rate": 0.0001817491209906506, + "loss": 0.7286, + "step": 69 + }, + { + "epoch": 0.224, + "grad_norm": 0.3083968957018995, + "learning_rate": 0.00018114557872800905, + "loss": 0.6578, + "step": 70 + }, + { + "epoch": 0.2272, + "grad_norm": 0.2843871593471055, + "learning_rate": 0.00018053325540819045, + "loss": 0.6792, + "step": 71 + }, + { + "epoch": 0.2304, + "grad_norm": 0.2791531418567351, + "learning_rate": 0.0001799122172929206, + "loss": 0.6779, + "step": 72 + }, + { + "epoch": 0.2336, + "grad_norm": 0.28436440881242775, + "learning_rate": 0.00017928253158698473, + "loss": 0.6461, + "step": 73 + }, + { + "epoch": 0.2368, + "grad_norm": 0.28210657946106504, + "learning_rate": 0.0001786442664309554, + "loss": 0.6376, + "step": 74 + }, + { + "epoch": 0.24, + "grad_norm": 0.27471101989657526, + "learning_rate": 0.0001779974908938184, + "loss": 0.6589, + "step": 75 + }, + { + "epoch": 0.2432, + "grad_norm": 0.27255559553885733, + "learning_rate": 0.0001773422749654988, + "loss": 0.6624, + "step": 76 + }, + { + "epoch": 0.2464, + "grad_norm": 0.28238069396965426, + "learning_rate": 0.00017667868954928694, + "loss": 0.665, + "step": 77 + }, + { + "epoch": 0.2496, + "grad_norm": 0.28091292280312435, + "learning_rate": 0.00017600680645416583, + "loss": 0.6582, + "step": 78 + }, + { + "epoch": 0.2528, + "grad_norm": 0.2819117395356436, + "learning_rate": 0.00017532669838704035, + "loss": 0.681, + "step": 79 + }, + { + "epoch": 0.256, + "grad_norm": 0.2808167623131368, + "learning_rate": 0.00017463843894486937, + "loss": 0.6635, + "step": 80 + }, + { + "epoch": 0.2592, + "grad_norm": 0.2727980766577742, + "learning_rate": 0.0001739421026067017, + "loss": 0.6387, + "step": 81 + }, + { + "epoch": 0.2624, + "grad_norm": 0.27946610199292726, + "learning_rate": 0.00017323776472561627, + "loss": 0.6636, + "step": 82 + }, + { + "epoch": 0.2656, + "grad_norm": 0.2776930025762807, + "learning_rate": 0.00017252550152056795, + "loss": 0.6629, + "step": 83 + }, + { + "epoch": 0.2688, + "grad_norm": 0.29547305795196493, + "learning_rate": 0.0001718053900681397, + "loss": 0.7105, + "step": 84 + }, + { + "epoch": 0.272, + "grad_norm": 0.27992559053351135, + "learning_rate": 0.00017107750829420176, + "loss": 0.6217, + "step": 85 + }, + { + "epoch": 0.2752, + "grad_norm": 0.2836978378018261, + "learning_rate": 0.00017034193496547902, + "loss": 0.6364, + "step": 86 + }, + { + "epoch": 0.2784, + "grad_norm": 0.2859793795996647, + "learning_rate": 0.00016959874968102735, + "loss": 0.6611, + "step": 87 + }, + { + "epoch": 0.2816, + "grad_norm": 0.27670508997854193, + "learning_rate": 0.00016884803286362, + "loss": 0.6321, + "step": 88 + }, + { + "epoch": 0.2848, + "grad_norm": 0.2804552643700331, + "learning_rate": 0.00016808986575104465, + "loss": 0.654, + "step": 89 + }, + { + "epoch": 0.288, + "grad_norm": 0.2988662193267764, + "learning_rate": 0.00016732433038731242, + "loss": 0.6631, + "step": 90 + }, + { + "epoch": 0.2912, + "grad_norm": 0.2850945559743476, + "learning_rate": 0.0001665515096137797, + "loss": 0.6546, + "step": 91 + }, + { + "epoch": 0.2944, + "grad_norm": 0.34508854455612903, + "learning_rate": 0.00016577148706018328, + "loss": 0.6181, + "step": 92 + }, + { + "epoch": 0.2976, + "grad_norm": 0.29772302651197313, + "learning_rate": 0.00016498434713559088, + "loss": 0.6953, + "step": 93 + }, + { + "epoch": 0.3008, + "grad_norm": 0.2814403372374441, + "learning_rate": 0.00016419017501926656, + "loss": 0.6711, + "step": 94 + }, + { + "epoch": 0.304, + "grad_norm": 0.2666134049420953, + "learning_rate": 0.0001633890566514535, + "loss": 0.6384, + "step": 95 + }, + { + "epoch": 0.3072, + "grad_norm": 0.2759217127531887, + "learning_rate": 0.00016258107872407375, + "loss": 0.5927, + "step": 96 + }, + { + "epoch": 0.3104, + "grad_norm": 0.27234087211717334, + "learning_rate": 0.0001617663286713474, + "loss": 0.6047, + "step": 97 + }, + { + "epoch": 0.3136, + "grad_norm": 0.28929394588800267, + "learning_rate": 0.00016094489466033043, + "loss": 0.709, + "step": 98 + }, + { + "epoch": 0.3168, + "grad_norm": 0.27285248521549677, + "learning_rate": 0.00016011686558137448, + "loss": 0.6275, + "step": 99 + }, + { + "epoch": 0.32, + "grad_norm": 0.27087214179135655, + "learning_rate": 0.0001592823310385073, + "loss": 0.6288, + "step": 100 + }, + { + "epoch": 0.3232, + "grad_norm": 0.2759835535783692, + "learning_rate": 0.0001584413813397364, + "loss": 0.6529, + "step": 101 + }, + { + "epoch": 0.3264, + "grad_norm": 0.2732937178984582, + "learning_rate": 0.00015759410748727662, + "loss": 0.6443, + "step": 102 + }, + { + "epoch": 0.3296, + "grad_norm": 0.27150370023008635, + "learning_rate": 0.00015674060116770236, + "loss": 0.6401, + "step": 103 + }, + { + "epoch": 0.3328, + "grad_norm": 0.288480165883741, + "learning_rate": 0.00015588095474202595, + "loss": 0.661, + "step": 104 + }, + { + "epoch": 0.336, + "grad_norm": 0.2715185186712244, + "learning_rate": 0.00015501526123570277, + "loss": 0.5861, + "step": 105 + }, + { + "epoch": 0.3392, + "grad_norm": 0.2890522618399928, + "learning_rate": 0.00015414361432856475, + "loss": 0.6547, + "step": 106 + }, + { + "epoch": 0.3424, + "grad_norm": 0.2809942944669504, + "learning_rate": 0.0001532661083446829, + "loss": 0.6502, + "step": 107 + }, + { + "epoch": 0.3456, + "grad_norm": 0.2770621559504025, + "learning_rate": 0.00015238283824216015, + "loss": 0.651, + "step": 108 + }, + { + "epoch": 0.3488, + "grad_norm": 0.28643434801423096, + "learning_rate": 0.00015149389960285558, + "loss": 0.6717, + "step": 109 + }, + { + "epoch": 0.352, + "grad_norm": 0.2838267721875221, + "learning_rate": 0.00015059938862204127, + "loss": 0.6666, + "step": 110 + }, + { + "epoch": 0.3552, + "grad_norm": 0.2904515511293253, + "learning_rate": 0.00014969940209799248, + "loss": 0.6788, + "step": 111 + }, + { + "epoch": 0.3584, + "grad_norm": 0.27521059578174806, + "learning_rate": 0.00014879403742151283, + "loss": 0.6421, + "step": 112 + }, + { + "epoch": 0.3616, + "grad_norm": 0.28435477767209977, + "learning_rate": 0.00014788339256539544, + "loss": 0.6806, + "step": 113 + }, + { + "epoch": 0.3648, + "grad_norm": 0.2836689862948711, + "learning_rate": 0.0001469675660738206, + "loss": 0.6633, + "step": 114 + }, + { + "epoch": 0.368, + "grad_norm": 0.27993720267562205, + "learning_rate": 0.00014604665705169237, + "loss": 0.6031, + "step": 115 + }, + { + "epoch": 0.3712, + "grad_norm": 0.26823803322012507, + "learning_rate": 0.00014512076515391375, + "loss": 0.6258, + "step": 116 + }, + { + "epoch": 0.3744, + "grad_norm": 0.2755753433763202, + "learning_rate": 0.00014418999057460276, + "loss": 0.6238, + "step": 117 + }, + { + "epoch": 0.3776, + "grad_norm": 0.26378813575701704, + "learning_rate": 0.0001432544340362501, + "loss": 0.6105, + "step": 118 + }, + { + "epoch": 0.3808, + "grad_norm": 0.27807647013178804, + "learning_rate": 0.00014231419677881966, + "loss": 0.634, + "step": 119 + }, + { + "epoch": 0.384, + "grad_norm": 0.2850302043584264, + "learning_rate": 0.00014136938054879283, + "loss": 0.6473, + "step": 120 + }, + { + "epoch": 0.3872, + "grad_norm": 0.2842021607325469, + "learning_rate": 0.00014042008758815818, + "loss": 0.6666, + "step": 121 + }, + { + "epoch": 0.3904, + "grad_norm": 0.2646241757350502, + "learning_rate": 0.00013946642062334766, + "loss": 0.5882, + "step": 122 + }, + { + "epoch": 0.3936, + "grad_norm": 0.27553697595802684, + "learning_rate": 0.00013850848285411994, + "loss": 0.6553, + "step": 123 + }, + { + "epoch": 0.3968, + "grad_norm": 0.266869763753527, + "learning_rate": 0.000137546377942393, + "loss": 0.6334, + "step": 124 + }, + { + "epoch": 0.4, + "grad_norm": 0.2891373420552467, + "learning_rate": 0.00013658021000102636, + "loss": 0.6232, + "step": 125 + }, + { + "epoch": 0.4032, + "grad_norm": 0.29054049719448544, + "learning_rate": 0.00013561008358255468, + "loss": 0.6685, + "step": 126 + }, + { + "epoch": 0.4064, + "grad_norm": 0.26991808658939265, + "learning_rate": 0.00013463610366787392, + "loss": 0.6451, + "step": 127 + }, + { + "epoch": 0.4096, + "grad_norm": 0.2826019428481463, + "learning_rate": 0.00013365837565488064, + "loss": 0.6748, + "step": 128 + }, + { + "epoch": 0.4128, + "grad_norm": 0.26989180549288744, + "learning_rate": 0.0001326770053470668, + "loss": 0.6366, + "step": 129 + }, + { + "epoch": 0.416, + "grad_norm": 0.2693716385431619, + "learning_rate": 0.0001316920989420703, + "loss": 0.6365, + "step": 130 + }, + { + "epoch": 0.4192, + "grad_norm": 0.2587716538530605, + "learning_rate": 0.00013070376302018287, + "loss": 0.5821, + "step": 131 + }, + { + "epoch": 0.4224, + "grad_norm": 0.2733425696050926, + "learning_rate": 0.00012971210453281674, + "loss": 0.6601, + "step": 132 + }, + { + "epoch": 0.4256, + "grad_norm": 0.28260898546044794, + "learning_rate": 0.000128717230790931, + "loss": 0.6598, + "step": 133 + }, + { + "epoch": 0.4288, + "grad_norm": 0.26073598361962874, + "learning_rate": 0.00012771924945341906, + "loss": 0.6062, + "step": 134 + }, + { + "epoch": 0.432, + "grad_norm": 0.26672014153172025, + "learning_rate": 0.00012671826851545851, + "loss": 0.6664, + "step": 135 + }, + { + "epoch": 0.4352, + "grad_norm": 0.27064468874961567, + "learning_rate": 0.0001257143962968246, + "loss": 0.6409, + "step": 136 + }, + { + "epoch": 0.4384, + "grad_norm": 0.2678777921010367, + "learning_rate": 0.00012470774143016853, + "loss": 0.6146, + "step": 137 + }, + { + "epoch": 0.4416, + "grad_norm": 0.2831646298939026, + "learning_rate": 0.00012369841284926188, + "loss": 0.6641, + "step": 138 + }, + { + "epoch": 0.4448, + "grad_norm": 0.2863253592057525, + "learning_rate": 0.00012268651977720866, + "loss": 0.6653, + "step": 139 + }, + { + "epoch": 0.448, + "grad_norm": 0.26496566477700495, + "learning_rate": 0.00012167217171462566, + "loss": 0.6061, + "step": 140 + }, + { + "epoch": 0.4512, + "grad_norm": 0.2741590428796881, + "learning_rate": 0.0001206554784277931, + "loss": 0.683, + "step": 141 + }, + { + "epoch": 0.4544, + "grad_norm": 0.274967394883945, + "learning_rate": 0.00011963654993677645, + "loss": 0.6738, + "step": 142 + }, + { + "epoch": 0.4576, + "grad_norm": 0.26748184911275685, + "learning_rate": 0.00011861549650352069, + "loss": 0.6259, + "step": 143 + }, + { + "epoch": 0.4608, + "grad_norm": 0.27571463833666354, + "learning_rate": 0.00011759242861991855, + "loss": 0.6824, + "step": 144 + }, + { + "epoch": 0.464, + "grad_norm": 0.28204383254583004, + "learning_rate": 0.00011656745699585371, + "loss": 0.654, + "step": 145 + }, + { + "epoch": 0.4672, + "grad_norm": 0.2737273526998308, + "learning_rate": 0.00011554069254722051, + "loss": 0.6383, + "step": 146 + }, + { + "epoch": 0.4704, + "grad_norm": 0.26718453878335136, + "learning_rate": 0.00011451224638392129, + "loss": 0.6336, + "step": 147 + }, + { + "epoch": 0.4736, + "grad_norm": 0.28928334642647074, + "learning_rate": 0.00011348222979784289, + "loss": 0.6328, + "step": 148 + }, + { + "epoch": 0.4768, + "grad_norm": 0.27269344035826515, + "learning_rate": 0.00011245075425081328, + "loss": 0.6261, + "step": 149 + }, + { + "epoch": 0.48, + "grad_norm": 0.27361132880394723, + "learning_rate": 0.00011141793136253986, + "loss": 0.6423, + "step": 150 + }, + { + "epoch": 0.4832, + "grad_norm": 0.2666674720378228, + "learning_rate": 0.0001103838728985307, + "loss": 0.6397, + "step": 151 + }, + { + "epoch": 0.4864, + "grad_norm": 0.26339311415617495, + "learning_rate": 0.000109348690758, + "loss": 0.6184, + "step": 152 + }, + { + "epoch": 0.4896, + "grad_norm": 0.30511330160542777, + "learning_rate": 0.00010831249696175918, + "loss": 0.631, + "step": 153 + }, + { + "epoch": 0.4928, + "grad_norm": 0.27393530074277894, + "learning_rate": 0.0001072754036400944, + "loss": 0.64, + "step": 154 + }, + { + "epoch": 0.496, + "grad_norm": 0.2630950654462822, + "learning_rate": 0.00010623752302063283, + "loss": 0.6256, + "step": 155 + }, + { + "epoch": 0.4992, + "grad_norm": 0.26558776558003316, + "learning_rate": 0.00010519896741619803, + "loss": 0.6329, + "step": 156 + }, + { + "epoch": 0.5024, + "grad_norm": 0.27737305767773224, + "learning_rate": 0.00010415984921265609, + "loss": 0.6808, + "step": 157 + }, + { + "epoch": 0.5056, + "grad_norm": 0.34438277093807235, + "learning_rate": 0.00010312028085675391, + "loss": 0.628, + "step": 158 + }, + { + "epoch": 0.5088, + "grad_norm": 0.2716378054950986, + "learning_rate": 0.00010208037484395114, + "loss": 0.6197, + "step": 159 + }, + { + "epoch": 0.512, + "grad_norm": 0.25713458936107125, + "learning_rate": 0.00010104024370624644, + "loss": 0.6055, + "step": 160 + }, + { + "epoch": 0.5152, + "grad_norm": 0.2556414802851472, + "learning_rate": 0.0001, + "loss": 0.6088, + "step": 161 + }, + { + "epoch": 0.5184, + "grad_norm": 0.9117970631393475, + "learning_rate": 9.895975629375359e-05, + "loss": 0.6301, + "step": 162 + }, + { + "epoch": 0.5216, + "grad_norm": 0.26669052461677767, + "learning_rate": 9.791962515604887e-05, + "loss": 0.6326, + "step": 163 + }, + { + "epoch": 0.5248, + "grad_norm": 0.27925439381355327, + "learning_rate": 9.687971914324607e-05, + "loss": 0.6681, + "step": 164 + }, + { + "epoch": 0.528, + "grad_norm": 0.25468990353229803, + "learning_rate": 9.584015078734395e-05, + "loss": 0.5743, + "step": 165 + }, + { + "epoch": 0.5312, + "grad_norm": 0.2629677375763853, + "learning_rate": 9.480103258380198e-05, + "loss": 0.6055, + "step": 166 + }, + { + "epoch": 0.5344, + "grad_norm": 0.26370207518366157, + "learning_rate": 9.376247697936719e-05, + "loss": 0.6102, + "step": 167 + }, + { + "epoch": 0.5376, + "grad_norm": 0.26441992972242784, + "learning_rate": 9.272459635990562e-05, + "loss": 0.6238, + "step": 168 + }, + { + "epoch": 0.5408, + "grad_norm": 0.2798821979607459, + "learning_rate": 9.168750303824084e-05, + "loss": 0.6568, + "step": 169 + }, + { + "epoch": 0.544, + "grad_norm": 0.2748187095456481, + "learning_rate": 9.065130924199998e-05, + "loss": 0.6068, + "step": 170 + }, + { + "epoch": 0.5472, + "grad_norm": 0.26830989542228445, + "learning_rate": 8.961612710146934e-05, + "loss": 0.6217, + "step": 171 + }, + { + "epoch": 0.5504, + "grad_norm": 0.2624748827277877, + "learning_rate": 8.858206863746018e-05, + "loss": 0.6203, + "step": 172 + }, + { + "epoch": 0.5536, + "grad_norm": 0.2552606780949764, + "learning_rate": 8.754924574918675e-05, + "loss": 0.5663, + "step": 173 + }, + { + "epoch": 0.5568, + "grad_norm": 0.2557632710956968, + "learning_rate": 8.651777020215712e-05, + "loss": 0.5918, + "step": 174 + }, + { + "epoch": 0.56, + "grad_norm": 0.2666361550200854, + "learning_rate": 8.548775361607872e-05, + "loss": 0.625, + "step": 175 + }, + { + "epoch": 0.5632, + "grad_norm": 0.2560883378063079, + "learning_rate": 8.445930745277953e-05, + "loss": 0.6032, + "step": 176 + }, + { + "epoch": 0.5664, + "grad_norm": 0.2583406590237059, + "learning_rate": 8.343254300414628e-05, + "loss": 0.6075, + "step": 177 + }, + { + "epoch": 0.5696, + "grad_norm": 0.27012077672072304, + "learning_rate": 8.240757138008149e-05, + "loss": 0.645, + "step": 178 + }, + { + "epoch": 0.5728, + "grad_norm": 0.2697472104569865, + "learning_rate": 8.138450349647936e-05, + "loss": 0.6357, + "step": 179 + }, + { + "epoch": 0.576, + "grad_norm": 0.2704698732844997, + "learning_rate": 8.036345006322359e-05, + "loss": 0.6278, + "step": 180 + }, + { + "epoch": 0.5792, + "grad_norm": 0.27596503294944713, + "learning_rate": 7.934452157220694e-05, + "loss": 0.655, + "step": 181 + }, + { + "epoch": 0.5824, + "grad_norm": 0.26527906505737747, + "learning_rate": 7.832782828537437e-05, + "loss": 0.568, + "step": 182 + }, + { + "epoch": 0.5856, + "grad_norm": 0.2697094402719354, + "learning_rate": 7.731348022279134e-05, + "loss": 0.6172, + "step": 183 + }, + { + "epoch": 0.5888, + "grad_norm": 0.2738859813879391, + "learning_rate": 7.630158715073813e-05, + "loss": 0.6275, + "step": 184 + }, + { + "epoch": 0.592, + "grad_norm": 0.25939271181980195, + "learning_rate": 7.52922585698315e-05, + "loss": 0.5639, + "step": 185 + }, + { + "epoch": 0.5952, + "grad_norm": 0.2671651992817328, + "learning_rate": 7.428560370317542e-05, + "loss": 0.5919, + "step": 186 + }, + { + "epoch": 0.5984, + "grad_norm": 0.2688565475090851, + "learning_rate": 7.328173148454151e-05, + "loss": 0.6312, + "step": 187 + }, + { + "epoch": 0.6016, + "grad_norm": 0.2784248260332949, + "learning_rate": 7.228075054658096e-05, + "loss": 0.6124, + "step": 188 + }, + { + "epoch": 0.6048, + "grad_norm": 0.26997057247712153, + "learning_rate": 7.1282769209069e-05, + "loss": 0.648, + "step": 189 + }, + { + "epoch": 0.608, + "grad_norm": 0.2707092694992218, + "learning_rate": 7.028789546718326e-05, + "loss": 0.6108, + "step": 190 + }, + { + "epoch": 0.6112, + "grad_norm": 0.2694317279781858, + "learning_rate": 6.929623697981718e-05, + "loss": 0.6399, + "step": 191 + }, + { + "epoch": 0.6144, + "grad_norm": 0.26041601563873007, + "learning_rate": 6.830790105792973e-05, + "loss": 0.5889, + "step": 192 + }, + { + "epoch": 0.6176, + "grad_norm": 0.25291835995919404, + "learning_rate": 6.732299465293322e-05, + "loss": 0.5911, + "step": 193 + }, + { + "epoch": 0.6208, + "grad_norm": 0.26551700863248473, + "learning_rate": 6.63416243451194e-05, + "loss": 0.6089, + "step": 194 + }, + { + "epoch": 0.624, + "grad_norm": 0.25882948329790545, + "learning_rate": 6.536389633212609e-05, + "loss": 0.6298, + "step": 195 + }, + { + "epoch": 0.6272, + "grad_norm": 0.2529947012716399, + "learning_rate": 6.43899164174453e-05, + "loss": 0.5914, + "step": 196 + }, + { + "epoch": 0.6304, + "grad_norm": 0.32497365798332417, + "learning_rate": 6.341978999897365e-05, + "loss": 0.638, + "step": 197 + }, + { + "epoch": 0.6336, + "grad_norm": 0.26296553022742203, + "learning_rate": 6.245362205760704e-05, + "loss": 0.6258, + "step": 198 + }, + { + "epoch": 0.6368, + "grad_norm": 0.2657709305402464, + "learning_rate": 6.149151714588009e-05, + "loss": 0.6495, + "step": 199 + }, + { + "epoch": 0.64, + "grad_norm": 0.25801033104181925, + "learning_rate": 6.053357937665237e-05, + "loss": 0.6019, + "step": 200 + }, + { + "epoch": 0.6432, + "grad_norm": 0.25805367525126394, + "learning_rate": 5.957991241184184e-05, + "loss": 0.5931, + "step": 201 + }, + { + "epoch": 0.6464, + "grad_norm": 0.26487929148998474, + "learning_rate": 5.863061945120719e-05, + "loss": 0.6711, + "step": 202 + }, + { + "epoch": 0.6496, + "grad_norm": 0.25255797052501494, + "learning_rate": 5.768580322118034e-05, + "loss": 0.6088, + "step": 203 + }, + { + "epoch": 0.6528, + "grad_norm": 0.25317945487768007, + "learning_rate": 5.6745565963749925e-05, + "loss": 0.5703, + "step": 204 + }, + { + "epoch": 0.656, + "grad_norm": 0.2545495908249795, + "learning_rate": 5.5810009425397294e-05, + "loss": 0.5878, + "step": 205 + }, + { + "epoch": 0.6592, + "grad_norm": 0.32318784276902335, + "learning_rate": 5.487923484608629e-05, + "loss": 0.6273, + "step": 206 + }, + { + "epoch": 0.6624, + "grad_norm": 0.25499189403717754, + "learning_rate": 5.395334294830765e-05, + "loss": 0.576, + "step": 207 + }, + { + "epoch": 0.6656, + "grad_norm": 0.26227335079319297, + "learning_rate": 5.3032433926179395e-05, + "loss": 0.5812, + "step": 208 + }, + { + "epoch": 0.6688, + "grad_norm": 0.2514697436323961, + "learning_rate": 5.211660743460458e-05, + "loss": 0.5734, + "step": 209 + }, + { + "epoch": 0.672, + "grad_norm": 0.27670627140391063, + "learning_rate": 5.1205962578487155e-05, + "loss": 0.6589, + "step": 210 + }, + { + "epoch": 0.6752, + "grad_norm": 0.26233287953524004, + "learning_rate": 5.030059790200756e-05, + "loss": 0.5956, + "step": 211 + }, + { + "epoch": 0.6784, + "grad_norm": 0.25899185821758536, + "learning_rate": 4.940061137795876e-05, + "loss": 0.5981, + "step": 212 + }, + { + "epoch": 0.6816, + "grad_norm": 0.2695534934217965, + "learning_rate": 4.850610039714444e-05, + "loss": 0.5788, + "step": 213 + }, + { + "epoch": 0.6848, + "grad_norm": 0.2622202021562537, + "learning_rate": 4.761716175783989e-05, + "loss": 0.6263, + "step": 214 + }, + { + "epoch": 0.688, + "grad_norm": 0.26561851569401834, + "learning_rate": 4.673389165531714e-05, + "loss": 0.6423, + "step": 215 + }, + { + "epoch": 0.6912, + "grad_norm": 0.2519941661268092, + "learning_rate": 4.585638567143529e-05, + "loss": 0.602, + "step": 216 + }, + { + "epoch": 0.6944, + "grad_norm": 0.2585815299560701, + "learning_rate": 4.498473876429726e-05, + "loss": 0.6187, + "step": 217 + }, + { + "epoch": 0.6976, + "grad_norm": 0.2579931313485816, + "learning_rate": 4.411904525797408e-05, + "loss": 0.6252, + "step": 218 + }, + { + "epoch": 0.7008, + "grad_norm": 0.26138279560269373, + "learning_rate": 4.325939883229766e-05, + "loss": 0.6191, + "step": 219 + }, + { + "epoch": 0.704, + "grad_norm": 0.2652328876562207, + "learning_rate": 4.240589251272342e-05, + "loss": 0.6341, + "step": 220 + }, + { + "epoch": 0.7072, + "grad_norm": 0.26814443178673353, + "learning_rate": 4.155861866026364e-05, + "loss": 0.6069, + "step": 221 + }, + { + "epoch": 0.7104, + "grad_norm": 0.25716580395548105, + "learning_rate": 4.071766896149273e-05, + "loss": 0.5923, + "step": 222 + }, + { + "epoch": 0.7136, + "grad_norm": 0.25262697851315913, + "learning_rate": 3.988313441862553e-05, + "loss": 0.598, + "step": 223 + }, + { + "epoch": 0.7168, + "grad_norm": 0.25536363817600527, + "learning_rate": 3.9055105339669595e-05, + "loss": 0.5786, + "step": 224 + }, + { + "epoch": 0.72, + "grad_norm": 0.2515414352273644, + "learning_rate": 3.823367132865265e-05, + "loss": 0.5973, + "step": 225 + }, + { + "epoch": 0.7232, + "grad_norm": 0.3082923451611295, + "learning_rate": 3.741892127592625e-05, + "loss": 0.5738, + "step": 226 + }, + { + "epoch": 0.7264, + "grad_norm": 0.2665831077382451, + "learning_rate": 3.6610943348546526e-05, + "loss": 0.6079, + "step": 227 + }, + { + "epoch": 0.7296, + "grad_norm": 0.2570054423385765, + "learning_rate": 3.580982498073344e-05, + "loss": 0.5884, + "step": 228 + }, + { + "epoch": 0.7328, + "grad_norm": 0.2764118800977991, + "learning_rate": 3.501565286440914e-05, + "loss": 0.5664, + "step": 229 + }, + { + "epoch": 0.736, + "grad_norm": 0.2659015981177506, + "learning_rate": 3.422851293981676e-05, + "loss": 0.5945, + "step": 230 + }, + { + "epoch": 0.7392, + "grad_norm": 0.25185649916492603, + "learning_rate": 3.3448490386220355e-05, + "loss": 0.5709, + "step": 231 + }, + { + "epoch": 0.7424, + "grad_norm": 0.26390095033023064, + "learning_rate": 3.2675669612687565e-05, + "loss": 0.5989, + "step": 232 + }, + { + "epoch": 0.7456, + "grad_norm": 0.2530850728794403, + "learning_rate": 3.191013424895536e-05, + "loss": 0.593, + "step": 233 + }, + { + "epoch": 0.7488, + "grad_norm": 0.2565488992653013, + "learning_rate": 3.115196713638e-05, + "loss": 0.5769, + "step": 234 + }, + { + "epoch": 0.752, + "grad_norm": 0.2611020063864013, + "learning_rate": 3.040125031897264e-05, + "loss": 0.6358, + "step": 235 + }, + { + "epoch": 0.7552, + "grad_norm": 0.2464939099364865, + "learning_rate": 2.9658065034520978e-05, + "loss": 0.5717, + "step": 236 + }, + { + "epoch": 0.7584, + "grad_norm": 0.25190580400469725, + "learning_rate": 2.892249170579826e-05, + "loss": 0.6154, + "step": 237 + }, + { + "epoch": 0.7616, + "grad_norm": 0.2624308866231954, + "learning_rate": 2.8194609931860316e-05, + "loss": 0.6333, + "step": 238 + }, + { + "epoch": 0.7648, + "grad_norm": 0.2542363438650353, + "learning_rate": 2.7474498479432087e-05, + "loss": 0.6209, + "step": 239 + }, + { + "epoch": 0.768, + "grad_norm": 0.25002614059735057, + "learning_rate": 2.6762235274383772e-05, + "loss": 0.5733, + "step": 240 + }, + { + "epoch": 0.7712, + "grad_norm": 0.2522774493843005, + "learning_rate": 2.6057897393298324e-05, + "loss": 0.5709, + "step": 241 + }, + { + "epoch": 0.7744, + "grad_norm": 0.2592098056140366, + "learning_rate": 2.536156105513062e-05, + "loss": 0.6038, + "step": 242 + }, + { + "epoch": 0.7776, + "grad_norm": 0.2545528496860185, + "learning_rate": 2.4673301612959654e-05, + "loss": 0.6015, + "step": 243 + }, + { + "epoch": 0.7808, + "grad_norm": 0.26574395128852707, + "learning_rate": 2.399319354583418e-05, + "loss": 0.6239, + "step": 244 + }, + { + "epoch": 0.784, + "grad_norm": 0.2624335087446576, + "learning_rate": 2.3321310450713062e-05, + "loss": 0.6181, + "step": 245 + }, + { + "epoch": 0.7872, + "grad_norm": 0.2586614704436218, + "learning_rate": 2.265772503450122e-05, + "loss": 0.6334, + "step": 246 + }, + { + "epoch": 0.7904, + "grad_norm": 0.31003500089156544, + "learning_rate": 2.2002509106181624e-05, + "loss": 0.6078, + "step": 247 + }, + { + "epoch": 0.7936, + "grad_norm": 0.24237023956212553, + "learning_rate": 2.1355733569044635e-05, + "loss": 0.5494, + "step": 248 + }, + { + "epoch": 0.7968, + "grad_norm": 0.2625073020569576, + "learning_rate": 2.0717468413015283e-05, + "loss": 0.6057, + "step": 249 + }, + { + "epoch": 0.8, + "grad_norm": 0.2539419606355253, + "learning_rate": 2.008778270707944e-05, + "loss": 0.6047, + "step": 250 + }, + { + "epoch": 0.8032, + "grad_norm": 0.25884298262354777, + "learning_rate": 1.946674459180955e-05, + "loss": 0.5951, + "step": 251 + }, + { + "epoch": 0.8064, + "grad_norm": 0.2660197122315272, + "learning_rate": 1.8854421271990964e-05, + "loss": 0.6259, + "step": 252 + }, + { + "epoch": 0.8096, + "grad_norm": 0.25655988921277334, + "learning_rate": 1.8250879009349398e-05, + "loss": 0.568, + "step": 253 + }, + { + "epoch": 0.8128, + "grad_norm": 0.2457259860690368, + "learning_rate": 1.7656183115380577e-05, + "loss": 0.5566, + "step": 254 + }, + { + "epoch": 0.816, + "grad_norm": 0.2647745370183451, + "learning_rate": 1.707039794428259e-05, + "loss": 0.6137, + "step": 255 + }, + { + "epoch": 0.8192, + "grad_norm": 0.2526743886541512, + "learning_rate": 1.649358688599191e-05, + "loss": 0.5973, + "step": 256 + }, + { + "epoch": 0.8224, + "grad_norm": 0.26287332654405005, + "learning_rate": 1.5925812359323745e-05, + "loss": 0.5934, + "step": 257 + }, + { + "epoch": 0.8256, + "grad_norm": 0.2614410753690634, + "learning_rate": 1.5367135805217458e-05, + "loss": 0.6114, + "step": 258 + }, + { + "epoch": 0.8288, + "grad_norm": 0.259783723422659, + "learning_rate": 1.4817617680087825e-05, + "loss": 0.5961, + "step": 259 + }, + { + "epoch": 0.832, + "grad_norm": 0.24965207576444098, + "learning_rate": 1.4277317449282834e-05, + "loss": 0.5706, + "step": 260 + }, + { + "epoch": 0.8352, + "grad_norm": 0.2958426590398693, + "learning_rate": 1.3746293580648717e-05, + "loss": 0.6272, + "step": 261 + }, + { + "epoch": 0.8384, + "grad_norm": 0.25258297001548335, + "learning_rate": 1.3224603538202929e-05, + "loss": 0.6006, + "step": 262 + }, + { + "epoch": 0.8416, + "grad_norm": 0.2611674989165653, + "learning_rate": 1.2712303775915802e-05, + "loss": 0.6272, + "step": 263 + }, + { + "epoch": 0.8448, + "grad_norm": 0.25524659220422524, + "learning_rate": 1.220944973160133e-05, + "loss": 0.6174, + "step": 264 + }, + { + "epoch": 0.848, + "grad_norm": 0.2575177709932516, + "learning_rate": 1.1716095820918216e-05, + "loss": 0.5956, + "step": 265 + }, + { + "epoch": 0.8512, + "grad_norm": 0.2561395979203794, + "learning_rate": 1.1232295431481222e-05, + "loss": 0.5667, + "step": 266 + }, + { + "epoch": 0.8544, + "grad_norm": 0.25141728122486434, + "learning_rate": 1.0758100917083991e-05, + "loss": 0.572, + "step": 267 + }, + { + "epoch": 0.8576, + "grad_norm": 0.25964673762588375, + "learning_rate": 1.0293563592033595e-05, + "loss": 0.6372, + "step": 268 + }, + { + "epoch": 0.8608, + "grad_norm": 0.256690467072949, + "learning_rate": 9.838733725597615e-06, + "loss": 0.5884, + "step": 269 + }, + { + "epoch": 0.864, + "grad_norm": 0.2605600488715863, + "learning_rate": 9.393660536564408e-06, + "loss": 0.6151, + "step": 270 + }, + { + "epoch": 0.8672, + "grad_norm": 0.2520719685895574, + "learning_rate": 8.958392187916841e-06, + "loss": 0.586, + "step": 271 + }, + { + "epoch": 0.8704, + "grad_norm": 0.2544316858533973, + "learning_rate": 8.532975781620512e-06, + "loss": 0.5852, + "step": 272 + }, + { + "epoch": 0.8736, + "grad_norm": 0.25086193401636275, + "learning_rate": 8.117457353526625e-06, + "loss": 0.545, + "step": 273 + }, + { + "epoch": 0.8768, + "grad_norm": 0.2717178693247369, + "learning_rate": 7.711881868390291e-06, + "loss": 0.6135, + "step": 274 + }, + { + "epoch": 0.88, + "grad_norm": 0.26446663858778996, + "learning_rate": 7.3162932150046885e-06, + "loss": 0.654, + "step": 275 + }, + { + "epoch": 0.8832, + "grad_norm": 0.25710288834015854, + "learning_rate": 6.930734201451816e-06, + "loss": 0.6077, + "step": 276 + }, + { + "epoch": 0.8864, + "grad_norm": 0.2614053987061522, + "learning_rate": 6.555246550469907e-06, + "loss": 0.5887, + "step": 277 + }, + { + "epoch": 0.8896, + "grad_norm": 0.25803580837193296, + "learning_rate": 6.189870894938587e-06, + "loss": 0.5922, + "step": 278 + }, + { + "epoch": 0.8928, + "grad_norm": 0.25650163916574076, + "learning_rate": 5.834646773481811e-06, + "loss": 0.583, + "step": 279 + }, + { + "epoch": 0.896, + "grad_norm": 0.25422924226179444, + "learning_rate": 5.489612626189245e-06, + "loss": 0.5697, + "step": 280 + }, + { + "epoch": 0.8992, + "grad_norm": 0.24957575142600866, + "learning_rate": 5.154805790456485e-06, + "loss": 0.5912, + "step": 281 + }, + { + "epoch": 0.9024, + "grad_norm": 0.2515411538551669, + "learning_rate": 4.830262496944693e-06, + "loss": 0.5846, + "step": 282 + }, + { + "epoch": 0.9056, + "grad_norm": 0.2601071856992811, + "learning_rate": 4.516017865659949e-06, + "loss": 0.5616, + "step": 283 + }, + { + "epoch": 0.9088, + "grad_norm": 0.2639229721318898, + "learning_rate": 4.21210590215273e-06, + "loss": 0.6368, + "step": 284 + }, + { + "epoch": 0.912, + "grad_norm": 0.24857817483163056, + "learning_rate": 3.918559493838114e-06, + "loss": 0.5593, + "step": 285 + }, + { + "epoch": 0.9152, + "grad_norm": 0.2810784592988802, + "learning_rate": 3.6354104064368566e-06, + "loss": 0.7009, + "step": 286 + }, + { + "epoch": 0.9184, + "grad_norm": 0.25502077384196564, + "learning_rate": 3.3626892805379562e-06, + "loss": 0.5788, + "step": 287 + }, + { + "epoch": 0.9216, + "grad_norm": 0.24704174727298683, + "learning_rate": 3.100425628282899e-06, + "loss": 0.5657, + "step": 288 + }, + { + "epoch": 0.9248, + "grad_norm": 0.26397141045569056, + "learning_rate": 2.848647830172024e-06, + "loss": 0.6363, + "step": 289 + }, + { + "epoch": 0.928, + "grad_norm": 0.2524305059499819, + "learning_rate": 2.607383131993424e-06, + "loss": 0.5921, + "step": 290 + }, + { + "epoch": 0.9312, + "grad_norm": 0.2697223617632766, + "learning_rate": 2.3766576418745022e-06, + "loss": 0.6327, + "step": 291 + }, + { + "epoch": 0.9344, + "grad_norm": 0.26555534553448457, + "learning_rate": 2.1564963274568027e-06, + "loss": 0.608, + "step": 292 + }, + { + "epoch": 0.9376, + "grad_norm": 0.25409868011892073, + "learning_rate": 1.9469230131940907e-06, + "loss": 0.5747, + "step": 293 + }, + { + "epoch": 0.9408, + "grad_norm": 0.252487846427678, + "learning_rate": 1.7479603777742938e-06, + "loss": 0.5735, + "step": 294 + }, + { + "epoch": 0.944, + "grad_norm": 0.2571510334874647, + "learning_rate": 1.559629951665298e-06, + "loss": 0.5865, + "step": 295 + }, + { + "epoch": 0.9472, + "grad_norm": 0.4760901811369371, + "learning_rate": 1.3819521147851123e-06, + "loss": 0.542, + "step": 296 + }, + { + "epoch": 0.9504, + "grad_norm": 0.26372826176481556, + "learning_rate": 1.2149460942964098e-06, + "loss": 0.5884, + "step": 297 + }, + { + "epoch": 0.9536, + "grad_norm": 0.26175507028814576, + "learning_rate": 1.05862996252597e-06, + "loss": 0.6167, + "step": 298 + }, + { + "epoch": 0.9568, + "grad_norm": 0.26047362323447143, + "learning_rate": 9.130206350089765e-07, + "loss": 0.6128, + "step": 299 + }, + { + "epoch": 0.96, + "grad_norm": 0.2454540274824027, + "learning_rate": 7.781338686584927e-07, + "loss": 0.5388, + "step": 300 + }, + { + "epoch": 0.9632, + "grad_norm": 0.2476784963430826, + "learning_rate": 6.539842600603918e-07, + "loss": 0.5393, + "step": 301 + }, + { + "epoch": 0.9664, + "grad_norm": 0.24671440407493223, + "learning_rate": 5.405852438937764e-07, + "loss": 0.5966, + "step": 302 + }, + { + "epoch": 0.9696, + "grad_norm": 0.2571397406086264, + "learning_rate": 4.3794909147720773e-07, + "loss": 0.6121, + "step": 303 + }, + { + "epoch": 0.9728, + "grad_norm": 0.2895940448351629, + "learning_rate": 3.4608690944071263e-07, + "loss": 0.5805, + "step": 304 + }, + { + "epoch": 0.976, + "grad_norm": 0.27720052526167127, + "learning_rate": 2.6500863852395584e-07, + "loss": 0.6128, + "step": 305 + }, + { + "epoch": 0.9792, + "grad_norm": 0.2602744150458922, + "learning_rate": 1.947230525005006e-07, + "loss": 0.6157, + "step": 306 + }, + { + "epoch": 0.9824, + "grad_norm": 0.2565662022949563, + "learning_rate": 1.3523775722834587e-07, + "loss": 0.616, + "step": 307 + }, + { + "epoch": 0.9856, + "grad_norm": 0.2562137901406146, + "learning_rate": 8.655918982689581e-08, + "loss": 0.6134, + "step": 308 + }, + { + "epoch": 0.9888, + "grad_norm": 0.2525816370267746, + "learning_rate": 4.8692617980350406e-08, + "loss": 0.5834, + "step": 309 + }, + { + "epoch": 0.992, + "grad_norm": 0.2528167501756759, + "learning_rate": 2.164213936770576e-08, + "loss": 0.5784, + "step": 310 + }, + { + "epoch": 0.9952, + "grad_norm": 0.2557747985616951, + "learning_rate": 5.410681219286673e-09, + "loss": 0.6049, + "step": 311 + }, + { + "epoch": 0.9984, + "grad_norm": 0.2608436955663202, + "learning_rate": 0.0, + "loss": 0.5618, + "step": 312 + }, + { + "epoch": 0.9984, + "step": 312, + "total_flos": 185984439222272.0, + "train_loss": 0.6505028414420593, + "train_runtime": 4096.0965, + "train_samples_per_second": 1.221, + "train_steps_per_second": 0.076 + } + ], + "logging_steps": 1.0, + "max_steps": 312, + "num_input_tokens_seen": 0, + "num_train_epochs": 1, + "save_steps": 500, + "stateful_callbacks": { + "TrainerControl": { + "args": { + "should_epoch_stop": false, + "should_evaluate": false, + "should_log": false, + "should_save": false, + "should_training_stop": false + }, + "attributes": {} + } + }, + "total_flos": 185984439222272.0, + "train_batch_size": 8, + "trial_name": null, + "trial_params": null +} diff --git a/single_dataset/llama3_qa/VideoGameBunny_v1_1-Llama-3-8B-V-llama3_qa_dataset_10000_epochs_1_lora/README.md b/single_dataset/llama3_qa/VideoGameBunny_v1_1-Llama-3-8B-V-llama3_qa_dataset_10000_epochs_1_lora/README.md new file mode 100644 index 0000000000000000000000000000000000000000..4f291d8d6fdb82fcff274f83bcc52b834347797c --- /dev/null +++ b/single_dataset/llama3_qa/VideoGameBunny_v1_1-Llama-3-8B-V-llama3_qa_dataset_10000_epochs_1_lora/README.md @@ -0,0 +1,202 @@ +--- +library_name: peft +base_model: ./weights/Bunny-v1_1-Llama-3-8B-V +--- + +# Model Card for Model ID + + + + + +## Model Details + +### Model Description + + + + + +- **Developed by:** [More Information Needed] +- **Funded by [optional]:** [More Information Needed] +- **Shared by [optional]:** [More Information Needed] +- **Model type:** [More Information Needed] +- **Language(s) (NLP):** [More Information Needed] +- **License:** [More Information Needed] +- **Finetuned from model [optional]:** [More Information Needed] + +### Model Sources [optional] + + + +- **Repository:** [More Information Needed] +- **Paper [optional]:** [More Information Needed] +- **Demo [optional]:** [More Information Needed] + +## Uses + + + +### Direct Use + + + +[More Information Needed] + +### Downstream Use [optional] + + + +[More Information Needed] + +### Out-of-Scope Use + + + +[More Information Needed] + +## Bias, Risks, and Limitations + + + +[More Information Needed] + +### Recommendations + + + +Users (both direct and downstream) should be made aware of the risks, biases and limitations of the model. More information needed for further recommendations. + +## How to Get Started with the Model + +Use the code below to get started with the model. + +[More Information Needed] + +## Training Details + +### Training Data + + + +[More Information Needed] + +### Training Procedure + + + +#### Preprocessing [optional] + +[More Information Needed] + + +#### Training Hyperparameters + +- **Training regime:** [More Information Needed] + +#### Speeds, Sizes, Times [optional] + + + +[More Information Needed] + +## Evaluation + + + +### Testing Data, Factors & Metrics + +#### Testing Data + + + +[More Information Needed] + +#### Factors + + + +[More Information Needed] + +#### Metrics + + + +[More Information Needed] + +### Results + +[More Information Needed] + +#### Summary + + + +## Model Examination [optional] + + + +[More Information Needed] + +## Environmental Impact + + + +Carbon emissions can be estimated using the [Machine Learning Impact calculator](https://mlco2.github.io/impact#compute) presented in [Lacoste et al. (2019)](https://arxiv.org/abs/1910.09700). + +- **Hardware Type:** [More Information Needed] +- **Hours used:** [More Information Needed] +- **Cloud Provider:** [More Information Needed] +- **Compute Region:** [More Information Needed] +- **Carbon Emitted:** [More Information Needed] + +## Technical Specifications [optional] + +### Model Architecture and Objective + +[More Information Needed] + +### Compute Infrastructure + +[More Information Needed] + +#### Hardware + +[More Information Needed] + +#### Software + +[More Information Needed] + +## Citation [optional] + + + +**BibTeX:** + +[More Information Needed] + +**APA:** + +[More Information Needed] + +## Glossary [optional] + + + +[More Information Needed] + +## More Information [optional] + +[More Information Needed] + +## Model Card Authors [optional] + +[More Information Needed] + +## Model Card Contact + +[More Information Needed] +### Framework versions + +- PEFT 0.11.1 \ No newline at end of file diff --git a/single_dataset/llama3_qa/VideoGameBunny_v1_1-Llama-3-8B-V-llama3_qa_dataset_10000_epochs_1_lora/adapter_config.json b/single_dataset/llama3_qa/VideoGameBunny_v1_1-Llama-3-8B-V-llama3_qa_dataset_10000_epochs_1_lora/adapter_config.json new file mode 100644 index 0000000000000000000000000000000000000000..83dcfcd3ed6419e6ca1ff0b755b820036a3c711e --- /dev/null +++ b/single_dataset/llama3_qa/VideoGameBunny_v1_1-Llama-3-8B-V-llama3_qa_dataset_10000_epochs_1_lora/adapter_config.json @@ -0,0 +1,34 @@ +{ + "alpha_pattern": {}, + "auto_mapping": null, + "base_model_name_or_path": "./weights/Bunny-v1_1-Llama-3-8B-V", + "bias": "none", + "fan_in_fan_out": false, + "inference_mode": true, + "init_lora_weights": true, + "layer_replication": null, + "layers_pattern": null, + "layers_to_transform": null, + "loftq_config": {}, + "lora_alpha": 256, + "lora_dropout": 0.1, + "megatron_config": null, + "megatron_core": "megatron.core", + "modules_to_save": null, + "peft_type": "LORA", + "r": 128, + "rank_pattern": {}, + "revision": null, + "target_modules": [ + "k_proj", + "v_proj", + "down_proj", + "o_proj", + "q_proj", + "up_proj", + "gate_proj" + ], + "task_type": "CAUSAL_LM", + "use_dora": false, + "use_rslora": false +} \ No newline at end of file diff --git a/single_dataset/llama3_qa/VideoGameBunny_v1_1-Llama-3-8B-V-llama3_qa_dataset_10000_epochs_1_lora/adapter_model.safetensors b/single_dataset/llama3_qa/VideoGameBunny_v1_1-Llama-3-8B-V-llama3_qa_dataset_10000_epochs_1_lora/adapter_model.safetensors new file mode 100644 index 0000000000000000000000000000000000000000..2b44934230d56ecc96849e5d3a81e9d5c2c1fd21 --- /dev/null +++ b/single_dataset/llama3_qa/VideoGameBunny_v1_1-Llama-3-8B-V-llama3_qa_dataset_10000_epochs_1_lora/adapter_model.safetensors @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:59ba24de4775f1291ab252b12fb0b3396485600c1e19fb4a466d9300e8b3c920 +size 671150064 diff --git a/single_dataset/llama3_qa/VideoGameBunny_v1_1-Llama-3-8B-V-llama3_qa_dataset_10000_epochs_1_lora/config.json b/single_dataset/llama3_qa/VideoGameBunny_v1_1-Llama-3-8B-V-llama3_qa_dataset_10000_epochs_1_lora/config.json new file mode 100644 index 0000000000000000000000000000000000000000..ee7ed86f4ff6f3381a6bb24981934be89aa376e7 --- /dev/null +++ b/single_dataset/llama3_qa/VideoGameBunny_v1_1-Llama-3-8B-V-llama3_qa_dataset_10000_epochs_1_lora/config.json @@ -0,0 +1,45 @@ +{ + "_name_or_path": "./weights/Bunny-v1_1-Llama-3-8B-V", + "architectures": [ + "BunnyLlamaForCausalLM" + ], + "attention_bias": false, + "attention_dropout": 0.0, + "auto_map": { + "AutoConfig": "configuration_bunny_llama.BunnyLlamaConfig", + "AutoModelForCausalLM": "modeling_bunny_llama.BunnyLlamaForCausalLM" + }, + "bos_token_id": 128000, + "continuous_training": false, + "eos_token_id": 128001, + "freeze_mm_mlp_adapter": false, + "hidden_act": "silu", + "hidden_size": 4096, + "image_aspect_ratio": "pad", + "initializer_range": 0.02, + "intermediate_size": 14336, + "max_position_embeddings": 8192, + "mm_hidden_size": 3456, + "mm_projector_lr": null, + "mm_projector_type": "mlp2x_gelu", + "mm_vision_tower": "./weights/siglip-so400m-patch14-384", + "model_type": "bunny-llama", + "num_attention_heads": 32, + "num_hidden_layers": 32, + "num_key_value_heads": 8, + "pretraining_tp": 1, + "rms_norm_eps": 1e-05, + "rope_scaling": null, + "rope_theta": 500000.0, + "tie_word_embeddings": false, + "tokenizer_model_max_length": 2048, + "tokenizer_padding_side": "right", + "torch_dtype": "float16", + "transformers_version": "4.41.2", + "tune_mm_mlp_adapter": false, + "unfreeze_vision_tower": true, + "use_cache": true, + "use_mm_proj": true, + "use_s2": true, + "vocab_size": 128256 +} diff --git a/single_dataset/llama3_qa/VideoGameBunny_v1_1-Llama-3-8B-V-llama3_qa_dataset_10000_epochs_1_lora/non_lora_trainables.bin b/single_dataset/llama3_qa/VideoGameBunny_v1_1-Llama-3-8B-V-llama3_qa_dataset_10000_epochs_1_lora/non_lora_trainables.bin new file mode 100644 index 0000000000000000000000000000000000000000..5a6bc71bf97737b8a369ad3a1c8bd22681dad003 --- /dev/null +++ b/single_dataset/llama3_qa/VideoGameBunny_v1_1-Llama-3-8B-V-llama3_qa_dataset_10000_epochs_1_lora/non_lora_trainables.bin @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:97cb23a344d78154074f4de89e6139a9a8c1a763b1c7b376fe682fb0d79a642e +size 918507402 diff --git a/single_dataset/llama3_qa/VideoGameBunny_v1_1-Llama-3-8B-V-llama3_qa_dataset_10000_epochs_1_lora/trainer_state.json b/single_dataset/llama3_qa/VideoGameBunny_v1_1-Llama-3-8B-V-llama3_qa_dataset_10000_epochs_1_lora/trainer_state.json new file mode 100644 index 0000000000000000000000000000000000000000..2d7655452f071ab8a74d711529d7b43f18b6dad4 --- /dev/null +++ b/single_dataset/llama3_qa/VideoGameBunny_v1_1-Llama-3-8B-V-llama3_qa_dataset_10000_epochs_1_lora/trainer_state.json @@ -0,0 +1,4417 @@ +{ + "best_metric": null, + "best_model_checkpoint": null, + "epoch": 1.0, + "eval_steps": 500, + "global_step": 625, + "is_hyper_param_search": false, + "is_local_process_zero": true, + "is_world_process_zero": true, + "log_history": [ + { + "epoch": 0.0016, + "grad_norm": 1.9138792865490715, + "learning_rate": 1.0526315789473684e-05, + "loss": 1.6749, + "step": 1 + }, + { + "epoch": 0.0032, + "grad_norm": 2.0368399430231263, + "learning_rate": 2.105263157894737e-05, + "loss": 1.8516, + "step": 2 + }, + { + "epoch": 0.0048, + "grad_norm": 1.8090355867591892, + "learning_rate": 3.157894736842105e-05, + "loss": 1.6712, + "step": 3 + }, + { + "epoch": 0.0064, + "grad_norm": 2.762713388884416, + "learning_rate": 4.210526315789474e-05, + "loss": 1.6305, + "step": 4 + }, + { + "epoch": 0.008, + "grad_norm": 1.2794292650844834, + "learning_rate": 5.2631578947368424e-05, + "loss": 1.5747, + "step": 5 + }, + { + "epoch": 0.0096, + "grad_norm": 1.4340577236198235, + "learning_rate": 6.31578947368421e-05, + "loss": 1.6697, + "step": 6 + }, + { + "epoch": 0.0112, + "grad_norm": 1.4329234538490498, + "learning_rate": 7.368421052631579e-05, + "loss": 1.5619, + "step": 7 + }, + { + "epoch": 0.0128, + "grad_norm": 1.2892622484612177, + "learning_rate": 8.421052631578948e-05, + "loss": 1.3965, + "step": 8 + }, + { + "epoch": 0.0144, + "grad_norm": 1.18604697598696, + "learning_rate": 9.473684210526316e-05, + "loss": 1.5188, + "step": 9 + }, + { + "epoch": 0.016, + "grad_norm": 1.1720977115105928, + "learning_rate": 0.00010526315789473685, + "loss": 1.3695, + "step": 10 + }, + { + "epoch": 0.0176, + "grad_norm": 1.091981284001823, + "learning_rate": 0.00011578947368421053, + "loss": 1.3952, + "step": 11 + }, + { + "epoch": 0.0192, + "grad_norm": 1.0817636217239512, + "learning_rate": 0.0001263157894736842, + "loss": 1.2642, + "step": 12 + }, + { + "epoch": 0.0208, + "grad_norm": 0.8318883186657168, + "learning_rate": 0.0001368421052631579, + "loss": 1.3037, + "step": 13 + }, + { + "epoch": 0.0224, + "grad_norm": 1.0485130732801884, + "learning_rate": 0.00014736842105263158, + "loss": 1.3812, + "step": 14 + }, + { + "epoch": 0.024, + "grad_norm": 1.2774578260059815, + "learning_rate": 0.00015789473684210527, + "loss": 1.519, + "step": 15 + }, + { + "epoch": 0.0256, + "grad_norm": 1.0273239813225716, + "learning_rate": 0.00016842105263157895, + "loss": 1.2408, + "step": 16 + }, + { + "epoch": 0.0272, + "grad_norm": 1.2360823664444458, + "learning_rate": 0.00017894736842105264, + "loss": 1.4999, + "step": 17 + }, + { + "epoch": 0.0288, + "grad_norm": 0.9587869083216982, + "learning_rate": 0.00018947368421052632, + "loss": 1.4482, + "step": 18 + }, + { + "epoch": 0.0304, + "grad_norm": 0.9650065379953744, + "learning_rate": 0.0002, + "loss": 1.2293, + "step": 19 + }, + { + "epoch": 0.032, + "grad_norm": 1.0048012835563842, + "learning_rate": 0.00019999865623437013, + "loss": 1.374, + "step": 20 + }, + { + "epoch": 0.0336, + "grad_norm": 0.9973326787801201, + "learning_rate": 0.00019999462497359466, + "loss": 1.4009, + "step": 21 + }, + { + "epoch": 0.0352, + "grad_norm": 1.0109602492648537, + "learning_rate": 0.00019998790632601496, + "loss": 1.3765, + "step": 22 + }, + { + "epoch": 0.0368, + "grad_norm": 1.1201615977846267, + "learning_rate": 0.0001999785004721968, + "loss": 1.4827, + "step": 23 + }, + { + "epoch": 0.0384, + "grad_norm": 1.088872492346361, + "learning_rate": 0.00019996640766492543, + "loss": 1.2704, + "step": 24 + }, + { + "epoch": 0.04, + "grad_norm": 0.941299634470907, + "learning_rate": 0.00019995162822919883, + "loss": 1.242, + "step": 25 + }, + { + "epoch": 0.0416, + "grad_norm": 1.0378299622697194, + "learning_rate": 0.00019993416256221895, + "loss": 1.3798, + "step": 26 + }, + { + "epoch": 0.0432, + "grad_norm": 1.0659314149162376, + "learning_rate": 0.00019991401113338104, + "loss": 1.4877, + "step": 27 + }, + { + "epoch": 0.0448, + "grad_norm": 1.0260255286150333, + "learning_rate": 0.00019989117448426108, + "loss": 1.384, + "step": 28 + }, + { + "epoch": 0.0464, + "grad_norm": 1.2156029529215326, + "learning_rate": 0.00019986565322860115, + "loss": 1.402, + "step": 29 + }, + { + "epoch": 0.048, + "grad_norm": 0.924110135826946, + "learning_rate": 0.00019983744805229296, + "loss": 1.4026, + "step": 30 + }, + { + "epoch": 0.0496, + "grad_norm": 0.9627079859103094, + "learning_rate": 0.00019980655971335945, + "loss": 1.2848, + "step": 31 + }, + { + "epoch": 0.0512, + "grad_norm": 1.0238335211957148, + "learning_rate": 0.00019977298904193437, + "loss": 1.2772, + "step": 32 + }, + { + "epoch": 0.0528, + "grad_norm": 0.9429351205237804, + "learning_rate": 0.00019973673694024, + "loss": 1.3437, + "step": 33 + }, + { + "epoch": 0.0544, + "grad_norm": 0.9525674623335832, + "learning_rate": 0.00019969780438256293, + "loss": 1.3149, + "step": 34 + }, + { + "epoch": 0.056, + "grad_norm": 1.010878868364398, + "learning_rate": 0.0001996561924152278, + "loss": 1.4853, + "step": 35 + }, + { + "epoch": 0.0576, + "grad_norm": 1.232779975593512, + "learning_rate": 0.0001996119021565693, + "loss": 1.4739, + "step": 36 + }, + { + "epoch": 0.0592, + "grad_norm": 0.9746473280525739, + "learning_rate": 0.0001995649347969019, + "loss": 1.3523, + "step": 37 + }, + { + "epoch": 0.0608, + "grad_norm": 1.0390083937732542, + "learning_rate": 0.00019951529159848805, + "loss": 1.3505, + "step": 38 + }, + { + "epoch": 0.0624, + "grad_norm": 1.0323141306052472, + "learning_rate": 0.00019946297389550433, + "loss": 1.4415, + "step": 39 + }, + { + "epoch": 0.064, + "grad_norm": 0.9607178707827044, + "learning_rate": 0.00019940798309400526, + "loss": 1.2618, + "step": 40 + }, + { + "epoch": 0.0656, + "grad_norm": 0.9680073099797065, + "learning_rate": 0.0001993503206718859, + "loss": 1.3179, + "step": 41 + }, + { + "epoch": 0.0672, + "grad_norm": 1.2077366195171064, + "learning_rate": 0.00019928998817884182, + "loss": 1.3976, + "step": 42 + }, + { + "epoch": 0.0688, + "grad_norm": 0.846005489716849, + "learning_rate": 0.00019922698723632767, + "loss": 1.1975, + "step": 43 + }, + { + "epoch": 0.0704, + "grad_norm": 0.9943262552019229, + "learning_rate": 0.00019916131953751342, + "loss": 1.3532, + "step": 44 + }, + { + "epoch": 0.072, + "grad_norm": 1.1057388030171886, + "learning_rate": 0.00019909298684723904, + "loss": 1.3299, + "step": 45 + }, + { + "epoch": 0.0736, + "grad_norm": 0.962002465600493, + "learning_rate": 0.00019902199100196697, + "loss": 1.3698, + "step": 46 + }, + { + "epoch": 0.0752, + "grad_norm": 1.1248573964326298, + "learning_rate": 0.00019894833390973266, + "loss": 1.4142, + "step": 47 + }, + { + "epoch": 0.0768, + "grad_norm": 0.9802161923960824, + "learning_rate": 0.00019887201755009357, + "loss": 1.3274, + "step": 48 + }, + { + "epoch": 0.0784, + "grad_norm": 0.8727888982064798, + "learning_rate": 0.0001987930439740757, + "loss": 1.3386, + "step": 49 + }, + { + "epoch": 0.08, + "grad_norm": 1.0204152929412373, + "learning_rate": 0.00019871141530411853, + "loss": 1.4171, + "step": 50 + }, + { + "epoch": 0.0816, + "grad_norm": 0.8940110524050955, + "learning_rate": 0.0001986271337340182, + "loss": 1.3084, + "step": 51 + }, + { + "epoch": 0.0832, + "grad_norm": 0.9124671150793904, + "learning_rate": 0.00019854020152886814, + "loss": 1.3393, + "step": 52 + }, + { + "epoch": 0.0848, + "grad_norm": 1.1643847828013019, + "learning_rate": 0.0001984506210249986, + "loss": 1.5376, + "step": 53 + }, + { + "epoch": 0.0864, + "grad_norm": 0.868101186153599, + "learning_rate": 0.00019835839462991361, + "loss": 1.1898, + "step": 54 + }, + { + "epoch": 0.088, + "grad_norm": 0.9478830042552752, + "learning_rate": 0.00019826352482222638, + "loss": 1.3806, + "step": 55 + }, + { + "epoch": 0.0896, + "grad_norm": 1.451106724764284, + "learning_rate": 0.00019816601415159263, + "loss": 1.3543, + "step": 56 + }, + { + "epoch": 0.0912, + "grad_norm": 1.0230592284365352, + "learning_rate": 0.0001980658652386421, + "loss": 1.4803, + "step": 57 + }, + { + "epoch": 0.0928, + "grad_norm": 1.0204098395320889, + "learning_rate": 0.00019796308077490817, + "loss": 1.4614, + "step": 58 + }, + { + "epoch": 0.0944, + "grad_norm": 0.8651716760169919, + "learning_rate": 0.00019785766352275542, + "loss": 1.3444, + "step": 59 + }, + { + "epoch": 0.096, + "grad_norm": 0.9703079648386799, + "learning_rate": 0.00019774961631530545, + "loss": 1.3144, + "step": 60 + }, + { + "epoch": 0.0976, + "grad_norm": 0.9979421069739579, + "learning_rate": 0.00019763894205636072, + "loss": 1.4109, + "step": 61 + }, + { + "epoch": 0.0992, + "grad_norm": 1.0324424303171396, + "learning_rate": 0.00019752564372032657, + "loss": 1.3287, + "step": 62 + }, + { + "epoch": 0.1008, + "grad_norm": 0.9785106362003406, + "learning_rate": 0.00019740972435213115, + "loss": 1.3706, + "step": 63 + }, + { + "epoch": 0.1024, + "grad_norm": 0.9420094371561448, + "learning_rate": 0.00019729118706714375, + "loss": 1.3212, + "step": 64 + }, + { + "epoch": 0.104, + "grad_norm": 0.8905383774972145, + "learning_rate": 0.00019717003505109095, + "loss": 1.2238, + "step": 65 + }, + { + "epoch": 0.1056, + "grad_norm": 0.9378780481865792, + "learning_rate": 0.00019704627155997108, + "loss": 1.2911, + "step": 66 + }, + { + "epoch": 0.1072, + "grad_norm": 1.014808075570759, + "learning_rate": 0.00019691989991996663, + "loss": 1.3628, + "step": 67 + }, + { + "epoch": 0.1088, + "grad_norm": 0.9984699648320681, + "learning_rate": 0.0001967909235273549, + "loss": 1.3013, + "step": 68 + }, + { + "epoch": 0.1104, + "grad_norm": 1.0586654354662475, + "learning_rate": 0.00019665934584841682, + "loss": 1.4085, + "step": 69 + }, + { + "epoch": 0.112, + "grad_norm": 0.9799763176865066, + "learning_rate": 0.00019652517041934356, + "loss": 1.3189, + "step": 70 + }, + { + "epoch": 0.1136, + "grad_norm": 1.0439261227003327, + "learning_rate": 0.00019638840084614182, + "loss": 1.3016, + "step": 71 + }, + { + "epoch": 0.1152, + "grad_norm": 0.9661114000480542, + "learning_rate": 0.00019624904080453655, + "loss": 1.1563, + "step": 72 + }, + { + "epoch": 0.1168, + "grad_norm": 0.9538595171022275, + "learning_rate": 0.00019610709403987246, + "loss": 1.3644, + "step": 73 + }, + { + "epoch": 0.1184, + "grad_norm": 0.9279761643712138, + "learning_rate": 0.00019596256436701324, + "loss": 1.3325, + "step": 74 + }, + { + "epoch": 0.12, + "grad_norm": 1.0814585612185719, + "learning_rate": 0.000195815455670239, + "loss": 1.4022, + "step": 75 + }, + { + "epoch": 0.1216, + "grad_norm": 1.0666505125215753, + "learning_rate": 0.00019566577190314197, + "loss": 1.3003, + "step": 76 + }, + { + "epoch": 0.1232, + "grad_norm": 0.9049371013277258, + "learning_rate": 0.0001955135170885202, + "loss": 1.2489, + "step": 77 + }, + { + "epoch": 0.1248, + "grad_norm": 0.920091059105391, + "learning_rate": 0.00019535869531826937, + "loss": 1.2869, + "step": 78 + }, + { + "epoch": 0.1264, + "grad_norm": 0.8923587994762451, + "learning_rate": 0.00019520131075327298, + "loss": 1.277, + "step": 79 + }, + { + "epoch": 0.128, + "grad_norm": 0.8899519924175298, + "learning_rate": 0.00019504136762329047, + "loss": 1.3673, + "step": 80 + }, + { + "epoch": 0.1296, + "grad_norm": 0.9471778221939827, + "learning_rate": 0.00019487887022684336, + "loss": 1.364, + "step": 81 + }, + { + "epoch": 0.1312, + "grad_norm": 0.9079412832884837, + "learning_rate": 0.00019471382293110003, + "loss": 1.2632, + "step": 82 + }, + { + "epoch": 0.1328, + "grad_norm": 0.9870969446400838, + "learning_rate": 0.00019454623017175812, + "loss": 1.3941, + "step": 83 + }, + { + "epoch": 0.1344, + "grad_norm": 0.8702555425831592, + "learning_rate": 0.00019437609645292546, + "loss": 1.2527, + "step": 84 + }, + { + "epoch": 0.136, + "grad_norm": 0.9895405215588456, + "learning_rate": 0.0001942034263469989, + "loss": 1.3044, + "step": 85 + }, + { + "epoch": 0.1376, + "grad_norm": 0.9884095855899157, + "learning_rate": 0.00019402822449454153, + "loss": 1.294, + "step": 86 + }, + { + "epoch": 0.1392, + "grad_norm": 0.9092570528379283, + "learning_rate": 0.00019385049560415794, + "loss": 1.3049, + "step": 87 + }, + { + "epoch": 0.1408, + "grad_norm": 0.8905401474373852, + "learning_rate": 0.00019367024445236754, + "loss": 1.2914, + "step": 88 + }, + { + "epoch": 0.1424, + "grad_norm": 0.9502646619662192, + "learning_rate": 0.00019348747588347637, + "loss": 1.3151, + "step": 89 + }, + { + "epoch": 0.144, + "grad_norm": 0.929852852986334, + "learning_rate": 0.00019330219480944694, + "loss": 1.2692, + "step": 90 + }, + { + "epoch": 0.1456, + "grad_norm": 1.0467930508496202, + "learning_rate": 0.00019311440620976597, + "loss": 1.3036, + "step": 91 + }, + { + "epoch": 0.1472, + "grad_norm": 0.9318355455106352, + "learning_rate": 0.0001929241151313108, + "loss": 1.3428, + "step": 92 + }, + { + "epoch": 0.1488, + "grad_norm": 0.9581321473639015, + "learning_rate": 0.00019273132668821364, + "loss": 1.1883, + "step": 93 + }, + { + "epoch": 0.1504, + "grad_norm": 0.9096907067626209, + "learning_rate": 0.00019253604606172417, + "loss": 1.3583, + "step": 94 + }, + { + "epoch": 0.152, + "grad_norm": 0.888898125705102, + "learning_rate": 0.00019233827850007027, + "loss": 1.2552, + "step": 95 + }, + { + "epoch": 0.1536, + "grad_norm": 0.9996772116461426, + "learning_rate": 0.00019213802931831696, + "loss": 1.2999, + "step": 96 + }, + { + "epoch": 0.1552, + "grad_norm": 0.9944042594023413, + "learning_rate": 0.00019193530389822363, + "loss": 1.2563, + "step": 97 + }, + { + "epoch": 0.1568, + "grad_norm": 0.7927854886824046, + "learning_rate": 0.00019173010768809933, + "loss": 1.0828, + "step": 98 + }, + { + "epoch": 0.1584, + "grad_norm": 0.9063951522976151, + "learning_rate": 0.0001915224462026563, + "loss": 1.2878, + "step": 99 + }, + { + "epoch": 0.16, + "grad_norm": 1.0005464752308368, + "learning_rate": 0.00019131232502286188, + "loss": 1.346, + "step": 100 + }, + { + "epoch": 0.1616, + "grad_norm": 1.0773036588879836, + "learning_rate": 0.0001910997497957885, + "loss": 1.4719, + "step": 101 + }, + { + "epoch": 0.1632, + "grad_norm": 0.9902341156894181, + "learning_rate": 0.00019088472623446183, + "loss": 1.3133, + "step": 102 + }, + { + "epoch": 0.1648, + "grad_norm": 0.9202584867433755, + "learning_rate": 0.00019066726011770726, + "loss": 1.2159, + "step": 103 + }, + { + "epoch": 0.1664, + "grad_norm": 0.8928202430331159, + "learning_rate": 0.0001904473572899947, + "loss": 1.2781, + "step": 104 + }, + { + "epoch": 0.168, + "grad_norm": 0.8786162241735016, + "learning_rate": 0.00019022502366128135, + "loss": 1.2906, + "step": 105 + }, + { + "epoch": 0.1696, + "grad_norm": 0.8818318634341324, + "learning_rate": 0.00019000026520685302, + "loss": 1.2183, + "step": 106 + }, + { + "epoch": 0.1712, + "grad_norm": 1.016850976151942, + "learning_rate": 0.0001897730879671634, + "loss": 1.2798, + "step": 107 + }, + { + "epoch": 0.1728, + "grad_norm": 1.1125718086502039, + "learning_rate": 0.00018954349804767184, + "loss": 1.4232, + "step": 108 + }, + { + "epoch": 0.1744, + "grad_norm": 1.0636099998751212, + "learning_rate": 0.00018931150161867916, + "loss": 1.3642, + "step": 109 + }, + { + "epoch": 0.176, + "grad_norm": 0.9373493361047722, + "learning_rate": 0.00018907710491516199, + "loss": 1.3849, + "step": 110 + }, + { + "epoch": 0.1776, + "grad_norm": 0.9688474159342598, + "learning_rate": 0.0001888403142366049, + "loss": 1.2488, + "step": 111 + }, + { + "epoch": 0.1792, + "grad_norm": 0.8412981721836017, + "learning_rate": 0.00018860113594683148, + "loss": 1.2149, + "step": 112 + }, + { + "epoch": 0.1808, + "grad_norm": 1.0455522755060047, + "learning_rate": 0.00018835957647383303, + "loss": 1.4969, + "step": 113 + }, + { + "epoch": 0.1824, + "grad_norm": 0.8819959687210541, + "learning_rate": 0.00018811564230959588, + "loss": 1.2709, + "step": 114 + }, + { + "epoch": 0.184, + "grad_norm": 1.0390606100176032, + "learning_rate": 0.00018786934000992688, + "loss": 1.2512, + "step": 115 + }, + { + "epoch": 0.1856, + "grad_norm": 0.8902083482060692, + "learning_rate": 0.00018762067619427746, + "loss": 1.2918, + "step": 116 + }, + { + "epoch": 0.1872, + "grad_norm": 0.8468591087899736, + "learning_rate": 0.00018736965754556528, + "loss": 1.2147, + "step": 117 + }, + { + "epoch": 0.1888, + "grad_norm": 0.9723033959967523, + "learning_rate": 0.00018711629080999504, + "loss": 1.3743, + "step": 118 + }, + { + "epoch": 0.1904, + "grad_norm": 1.0455319231677251, + "learning_rate": 0.00018686058279687698, + "loss": 1.2743, + "step": 119 + }, + { + "epoch": 0.192, + "grad_norm": 0.9408439301893431, + "learning_rate": 0.00018660254037844388, + "loss": 1.3922, + "step": 120 + }, + { + "epoch": 0.1936, + "grad_norm": 1.0263620281862005, + "learning_rate": 0.00018634217048966637, + "loss": 1.2693, + "step": 121 + }, + { + "epoch": 0.1952, + "grad_norm": 0.9836156376006685, + "learning_rate": 0.0001860794801280666, + "loss": 1.2372, + "step": 122 + }, + { + "epoch": 0.1968, + "grad_norm": 0.8441357232790682, + "learning_rate": 0.0001858144763535302, + "loss": 1.1687, + "step": 123 + }, + { + "epoch": 0.1984, + "grad_norm": 0.8881386393138146, + "learning_rate": 0.0001855471662881164, + "loss": 1.2011, + "step": 124 + }, + { + "epoch": 0.2, + "grad_norm": 0.8483056979385163, + "learning_rate": 0.00018527755711586678, + "loss": 1.4101, + "step": 125 + }, + { + "epoch": 0.2016, + "grad_norm": 0.894842476652938, + "learning_rate": 0.00018500565608261214, + "loss": 1.3273, + "step": 126 + }, + { + "epoch": 0.2032, + "grad_norm": 0.8948620187331842, + "learning_rate": 0.00018473147049577774, + "loss": 1.2945, + "step": 127 + }, + { + "epoch": 0.2048, + "grad_norm": 0.8603505640754253, + "learning_rate": 0.00018445500772418697, + "loss": 1.2164, + "step": 128 + }, + { + "epoch": 0.2064, + "grad_norm": 0.9260794605533464, + "learning_rate": 0.00018417627519786315, + "loss": 1.3015, + "step": 129 + }, + { + "epoch": 0.208, + "grad_norm": 0.924634616629723, + "learning_rate": 0.00018389528040783012, + "loss": 1.3371, + "step": 130 + }, + { + "epoch": 0.2096, + "grad_norm": 0.8829903828411027, + "learning_rate": 0.00018361203090591071, + "loss": 1.3814, + "step": 131 + }, + { + "epoch": 0.2112, + "grad_norm": 0.9791894951236914, + "learning_rate": 0.00018332653430452376, + "loss": 1.1984, + "step": 132 + }, + { + "epoch": 0.2128, + "grad_norm": 0.8856840640474363, + "learning_rate": 0.00018303879827647975, + "loss": 1.2842, + "step": 133 + }, + { + "epoch": 0.2144, + "grad_norm": 0.8671119979857622, + "learning_rate": 0.00018274883055477436, + "loss": 1.1748, + "step": 134 + }, + { + "epoch": 0.216, + "grad_norm": 0.8591943456089417, + "learning_rate": 0.00018245663893238075, + "loss": 1.2471, + "step": 135 + }, + { + "epoch": 0.2176, + "grad_norm": 1.0462864848081617, + "learning_rate": 0.00018216223126204007, + "loss": 1.4317, + "step": 136 + }, + { + "epoch": 0.2192, + "grad_norm": 0.879590039265427, + "learning_rate": 0.00018186561545605054, + "loss": 1.2346, + "step": 137 + }, + { + "epoch": 0.2208, + "grad_norm": 0.9258746466555289, + "learning_rate": 0.00018156679948605467, + "loss": 1.2666, + "step": 138 + }, + { + "epoch": 0.2224, + "grad_norm": 1.026660297097013, + "learning_rate": 0.00018126579138282503, + "loss": 1.3158, + "step": 139 + }, + { + "epoch": 0.224, + "grad_norm": 0.9437344711094847, + "learning_rate": 0.0001809625992360485, + "loss": 1.1704, + "step": 140 + }, + { + "epoch": 0.2256, + "grad_norm": 0.9068493066951973, + "learning_rate": 0.00018065723119410884, + "loss": 1.2705, + "step": 141 + }, + { + "epoch": 0.2272, + "grad_norm": 0.8872777903966959, + "learning_rate": 0.00018034969546386757, + "loss": 1.266, + "step": 142 + }, + { + "epoch": 0.2288, + "grad_norm": 0.9348855211277328, + "learning_rate": 0.0001800400003104436, + "loss": 1.3649, + "step": 143 + }, + { + "epoch": 0.2304, + "grad_norm": 0.9697822035173204, + "learning_rate": 0.00017972815405699103, + "loss": 1.2889, + "step": 144 + }, + { + "epoch": 0.232, + "grad_norm": 1.0179014202581467, + "learning_rate": 0.00017941416508447536, + "loss": 1.288, + "step": 145 + }, + { + "epoch": 0.2336, + "grad_norm": 0.981204112418224, + "learning_rate": 0.0001790980418314484, + "loss": 1.2186, + "step": 146 + }, + { + "epoch": 0.2352, + "grad_norm": 0.9115720707069629, + "learning_rate": 0.00017877979279382135, + "loss": 1.2282, + "step": 147 + }, + { + "epoch": 0.2368, + "grad_norm": 0.8356969676801423, + "learning_rate": 0.0001784594265246366, + "loss": 1.1469, + "step": 148 + }, + { + "epoch": 0.2384, + "grad_norm": 1.1175490899050453, + "learning_rate": 0.0001781369516338378, + "loss": 1.4246, + "step": 149 + }, + { + "epoch": 0.24, + "grad_norm": 0.9341528226020359, + "learning_rate": 0.00017781237678803847, + "loss": 1.3495, + "step": 150 + }, + { + "epoch": 0.2416, + "grad_norm": 0.9584956431596955, + "learning_rate": 0.000177485710710289, + "loss": 1.2838, + "step": 151 + }, + { + "epoch": 0.2432, + "grad_norm": 0.9478525671237141, + "learning_rate": 0.00017715696217984235, + "loss": 1.2857, + "step": 152 + }, + { + "epoch": 0.2448, + "grad_norm": 1.011731532566276, + "learning_rate": 0.00017682614003191807, + "loss": 1.295, + "step": 153 + }, + { + "epoch": 0.2464, + "grad_norm": 0.9212288319149741, + "learning_rate": 0.00017649325315746478, + "loss": 1.2974, + "step": 154 + }, + { + "epoch": 0.248, + "grad_norm": 0.8918183647310739, + "learning_rate": 0.0001761583105029213, + "loss": 1.3973, + "step": 155 + }, + { + "epoch": 0.2496, + "grad_norm": 0.876633321580276, + "learning_rate": 0.00017582132106997616, + "loss": 1.099, + "step": 156 + }, + { + "epoch": 0.2512, + "grad_norm": 0.8734092115955532, + "learning_rate": 0.00017548229391532572, + "loss": 1.2548, + "step": 157 + }, + { + "epoch": 0.2528, + "grad_norm": 0.9509936530005848, + "learning_rate": 0.00017514123815043074, + "loss": 1.3433, + "step": 158 + }, + { + "epoch": 0.2544, + "grad_norm": 0.9876147790154572, + "learning_rate": 0.00017479816294127152, + "loss": 1.3589, + "step": 159 + }, + { + "epoch": 0.256, + "grad_norm": 0.8404573991625482, + "learning_rate": 0.0001744530775081015, + "loss": 1.2323, + "step": 160 + }, + { + "epoch": 0.2576, + "grad_norm": 0.8718197594420647, + "learning_rate": 0.0001741059911251997, + "loss": 1.303, + "step": 161 + }, + { + "epoch": 0.2592, + "grad_norm": 0.854075594306203, + "learning_rate": 0.000173756913120621, + "loss": 1.245, + "step": 162 + }, + { + "epoch": 0.2608, + "grad_norm": 0.8719731050503491, + "learning_rate": 0.00017340585287594604, + "loss": 1.2371, + "step": 163 + }, + { + "epoch": 0.2624, + "grad_norm": 0.9174901722888825, + "learning_rate": 0.0001730528198260285, + "loss": 1.2911, + "step": 164 + }, + { + "epoch": 0.264, + "grad_norm": 0.843494724446203, + "learning_rate": 0.00017269782345874203, + "loss": 1.1555, + "step": 165 + }, + { + "epoch": 0.2656, + "grad_norm": 0.8269647090580537, + "learning_rate": 0.00017234087331472497, + "loss": 1.2305, + "step": 166 + }, + { + "epoch": 0.2672, + "grad_norm": 0.8538009131480417, + "learning_rate": 0.00017198197898712404, + "loss": 1.2288, + "step": 167 + }, + { + "epoch": 0.2688, + "grad_norm": 0.8200571038773096, + "learning_rate": 0.00017162115012133643, + "loss": 1.1042, + "step": 168 + }, + { + "epoch": 0.2704, + "grad_norm": 0.9985090587395257, + "learning_rate": 0.00017125839641475072, + "loss": 1.2814, + "step": 169 + }, + { + "epoch": 0.272, + "grad_norm": 0.9043088861061557, + "learning_rate": 0.00017089372761648616, + "loss": 1.2904, + "step": 170 + }, + { + "epoch": 0.2736, + "grad_norm": 0.9720144762524422, + "learning_rate": 0.00017052715352713075, + "loss": 1.2934, + "step": 171 + }, + { + "epoch": 0.2752, + "grad_norm": 0.9429071983526343, + "learning_rate": 0.00017015868399847768, + "loss": 1.2195, + "step": 172 + }, + { + "epoch": 0.2768, + "grad_norm": 0.9922714848946325, + "learning_rate": 0.00016978832893326074, + "loss": 1.386, + "step": 173 + }, + { + "epoch": 0.2784, + "grad_norm": 0.8416804433344276, + "learning_rate": 0.00016941609828488807, + "loss": 1.2287, + "step": 174 + }, + { + "epoch": 0.28, + "grad_norm": 0.9113976056217002, + "learning_rate": 0.0001690420020571747, + "loss": 1.28, + "step": 175 + }, + { + "epoch": 0.2816, + "grad_norm": 0.883839472667245, + "learning_rate": 0.0001686660503040737, + "loss": 1.2327, + "step": 176 + }, + { + "epoch": 0.2832, + "grad_norm": 0.7691670822483967, + "learning_rate": 0.00016828825312940592, + "loss": 1.1303, + "step": 177 + }, + { + "epoch": 0.2848, + "grad_norm": 0.9882033137881157, + "learning_rate": 0.0001679086206865886, + "loss": 1.3492, + "step": 178 + }, + { + "epoch": 0.2864, + "grad_norm": 0.9363276145321529, + "learning_rate": 0.00016752716317836229, + "loss": 1.3594, + "step": 179 + }, + { + "epoch": 0.288, + "grad_norm": 1.044069691358691, + "learning_rate": 0.0001671438908565167, + "loss": 1.2119, + "step": 180 + }, + { + "epoch": 0.2896, + "grad_norm": 0.8122773071076308, + "learning_rate": 0.00016675881402161536, + "loss": 1.0944, + "step": 181 + }, + { + "epoch": 0.2912, + "grad_norm": 0.8127517020671314, + "learning_rate": 0.0001663719430227186, + "loss": 1.2359, + "step": 182 + }, + { + "epoch": 0.2928, + "grad_norm": 0.941263554003775, + "learning_rate": 0.00016598328825710533, + "loss": 1.2414, + "step": 183 + }, + { + "epoch": 0.2944, + "grad_norm": 0.8393462710847754, + "learning_rate": 0.000165592860169994, + "loss": 1.1938, + "step": 184 + }, + { + "epoch": 0.296, + "grad_norm": 0.9969899499563986, + "learning_rate": 0.00016520066925426144, + "loss": 1.2476, + "step": 185 + }, + { + "epoch": 0.2976, + "grad_norm": 0.8763384503459395, + "learning_rate": 0.0001648067260501611, + "loss": 1.3152, + "step": 186 + }, + { + "epoch": 0.2992, + "grad_norm": 0.8489189147455701, + "learning_rate": 0.0001644110411450398, + "loss": 1.348, + "step": 187 + }, + { + "epoch": 0.3008, + "grad_norm": 0.816074167030023, + "learning_rate": 0.00016401362517305296, + "loss": 1.1743, + "step": 188 + }, + { + "epoch": 0.3024, + "grad_norm": 0.8654989851443282, + "learning_rate": 0.00016361448881487914, + "loss": 1.2716, + "step": 189 + }, + { + "epoch": 0.304, + "grad_norm": 0.9377395643561406, + "learning_rate": 0.00016321364279743266, + "loss": 1.2408, + "step": 190 + }, + { + "epoch": 0.3056, + "grad_norm": 0.8243605857754032, + "learning_rate": 0.0001628110978935756, + "loss": 1.2268, + "step": 191 + }, + { + "epoch": 0.3072, + "grad_norm": 0.9870712994522676, + "learning_rate": 0.00016240686492182804, + "loss": 1.2249, + "step": 192 + }, + { + "epoch": 0.3088, + "grad_norm": 0.9089766343458752, + "learning_rate": 0.00016200095474607753, + "loss": 1.282, + "step": 193 + }, + { + "epoch": 0.3104, + "grad_norm": 0.8809977197819951, + "learning_rate": 0.00016159337827528685, + "loss": 1.2828, + "step": 194 + }, + { + "epoch": 0.312, + "grad_norm": 0.8972386215910175, + "learning_rate": 0.0001611841464632011, + "loss": 1.244, + "step": 195 + }, + { + "epoch": 0.3136, + "grad_norm": 0.8800981648292031, + "learning_rate": 0.0001607732703080532, + "loss": 1.1939, + "step": 196 + }, + { + "epoch": 0.3152, + "grad_norm": 0.9217855347492754, + "learning_rate": 0.00016036076085226814, + "loss": 1.1995, + "step": 197 + }, + { + "epoch": 0.3168, + "grad_norm": 0.9350718448332347, + "learning_rate": 0.0001599466291821666, + "loss": 1.3191, + "step": 198 + }, + { + "epoch": 0.3184, + "grad_norm": 0.8388711214458865, + "learning_rate": 0.0001595308864276666, + "loss": 1.2952, + "step": 199 + }, + { + "epoch": 0.32, + "grad_norm": 0.9667195028151565, + "learning_rate": 0.0001591135437619847, + "loss": 1.2687, + "step": 200 + }, + { + "epoch": 0.3216, + "grad_norm": 0.947491495591094, + "learning_rate": 0.0001586946124013354, + "loss": 1.2749, + "step": 201 + }, + { + "epoch": 0.3232, + "grad_norm": 0.9418623165919426, + "learning_rate": 0.0001582741036046301, + "loss": 1.2651, + "step": 202 + }, + { + "epoch": 0.3248, + "grad_norm": 0.8811125597170616, + "learning_rate": 0.00015785202867317407, + "loss": 1.331, + "step": 203 + }, + { + "epoch": 0.3264, + "grad_norm": 0.8239429953850183, + "learning_rate": 0.00015742839895036305, + "loss": 1.1908, + "step": 204 + }, + { + "epoch": 0.328, + "grad_norm": 0.8218011237910456, + "learning_rate": 0.00015700322582137827, + "loss": 1.1712, + "step": 205 + }, + { + "epoch": 0.3296, + "grad_norm": 0.792556863148284, + "learning_rate": 0.0001565765207128805, + "loss": 1.158, + "step": 206 + }, + { + "epoch": 0.3312, + "grad_norm": 0.7591037692200457, + "learning_rate": 0.0001561482950927029, + "loss": 1.2378, + "step": 207 + }, + { + "epoch": 0.3328, + "grad_norm": 0.9134765422918922, + "learning_rate": 0.00015571856046954285, + "loss": 1.2695, + "step": 208 + }, + { + "epoch": 0.3344, + "grad_norm": 0.9396029761811607, + "learning_rate": 0.00015528732839265272, + "loss": 1.3359, + "step": 209 + }, + { + "epoch": 0.336, + "grad_norm": 0.9700687750462483, + "learning_rate": 0.0001548546104515294, + "loss": 1.2743, + "step": 210 + }, + { + "epoch": 0.3376, + "grad_norm": 0.972971356427299, + "learning_rate": 0.00015442041827560274, + "loss": 1.2327, + "step": 211 + }, + { + "epoch": 0.3392, + "grad_norm": 0.865753023900172, + "learning_rate": 0.00015398476353392323, + "loss": 1.2224, + "step": 212 + }, + { + "epoch": 0.3408, + "grad_norm": 0.8791308977470417, + "learning_rate": 0.00015354765793484834, + "loss": 1.2124, + "step": 213 + }, + { + "epoch": 0.3424, + "grad_norm": 0.8645115188012078, + "learning_rate": 0.00015310911322572753, + "loss": 1.1832, + "step": 214 + }, + { + "epoch": 0.344, + "grad_norm": 0.8394381140725087, + "learning_rate": 0.000152669141192587, + "loss": 1.2047, + "step": 215 + }, + { + "epoch": 0.3456, + "grad_norm": 0.8556568829596263, + "learning_rate": 0.00015222775365981273, + "loss": 1.2568, + "step": 216 + }, + { + "epoch": 0.3472, + "grad_norm": 0.7871574639241793, + "learning_rate": 0.00015178496248983254, + "loss": 1.1903, + "step": 217 + }, + { + "epoch": 0.3488, + "grad_norm": 0.9051963356204792, + "learning_rate": 0.00015134077958279765, + "loss": 1.2939, + "step": 218 + }, + { + "epoch": 0.3504, + "grad_norm": 0.9181618868204299, + "learning_rate": 0.00015089521687626243, + "loss": 1.2466, + "step": 219 + }, + { + "epoch": 0.352, + "grad_norm": 0.8190508823611149, + "learning_rate": 0.000150448286344864, + "loss": 1.2468, + "step": 220 + }, + { + "epoch": 0.3536, + "grad_norm": 0.8681848868003743, + "learning_rate": 0.00015000000000000001, + "loss": 1.1276, + "step": 221 + }, + { + "epoch": 0.3552, + "grad_norm": 1.0010246288434874, + "learning_rate": 0.00014955036988950618, + "loss": 1.3548, + "step": 222 + }, + { + "epoch": 0.3568, + "grad_norm": 0.9054763292752849, + "learning_rate": 0.00014909940809733222, + "loss": 1.1697, + "step": 223 + }, + { + "epoch": 0.3584, + "grad_norm": 0.8853715938791729, + "learning_rate": 0.00014864712674321734, + "loss": 1.1584, + "step": 224 + }, + { + "epoch": 0.36, + "grad_norm": 0.8465500878684764, + "learning_rate": 0.00014819353798236427, + "loss": 1.2424, + "step": 225 + }, + { + "epoch": 0.3616, + "grad_norm": 0.9212138041787676, + "learning_rate": 0.00014773865400511272, + "loss": 1.1762, + "step": 226 + }, + { + "epoch": 0.3632, + "grad_norm": 0.9611719452243532, + "learning_rate": 0.00014728248703661182, + "loss": 1.3087, + "step": 227 + }, + { + "epoch": 0.3648, + "grad_norm": 0.8827786916757613, + "learning_rate": 0.00014682504933649144, + "loss": 1.2625, + "step": 228 + }, + { + "epoch": 0.3664, + "grad_norm": 0.8387676346336079, + "learning_rate": 0.00014636635319853275, + "loss": 1.1485, + "step": 229 + }, + { + "epoch": 0.368, + "grad_norm": 0.8494532672088838, + "learning_rate": 0.00014590641095033787, + "loss": 1.1955, + "step": 230 + }, + { + "epoch": 0.3696, + "grad_norm": 0.870513532778789, + "learning_rate": 0.00014544523495299842, + "loss": 1.2725, + "step": 231 + }, + { + "epoch": 0.3712, + "grad_norm": 0.8910503853194656, + "learning_rate": 0.0001449828376007636, + "loss": 1.1638, + "step": 232 + }, + { + "epoch": 0.3728, + "grad_norm": 0.9040214513192539, + "learning_rate": 0.0001445192313207067, + "loss": 1.2113, + "step": 233 + }, + { + "epoch": 0.3744, + "grad_norm": 0.8851553164826061, + "learning_rate": 0.0001440544285723915, + "loss": 1.3086, + "step": 234 + }, + { + "epoch": 0.376, + "grad_norm": 0.8234234579800335, + "learning_rate": 0.00014358844184753712, + "loss": 1.2142, + "step": 235 + }, + { + "epoch": 0.3776, + "grad_norm": 0.8892509071021439, + "learning_rate": 0.00014312128366968243, + "loss": 1.1671, + "step": 236 + }, + { + "epoch": 0.3792, + "grad_norm": 0.899688472903065, + "learning_rate": 0.00014265296659384956, + "loss": 1.2334, + "step": 237 + }, + { + "epoch": 0.3808, + "grad_norm": 0.8903533861688723, + "learning_rate": 0.00014218350320620624, + "loss": 1.2552, + "step": 238 + }, + { + "epoch": 0.3824, + "grad_norm": 0.9535230774328497, + "learning_rate": 0.0001417129061237278, + "loss": 1.3028, + "step": 239 + }, + { + "epoch": 0.384, + "grad_norm": 0.9120573275838028, + "learning_rate": 0.00014124118799385796, + "loss": 1.2507, + "step": 240 + }, + { + "epoch": 0.3856, + "grad_norm": 0.8068854936446114, + "learning_rate": 0.00014076836149416887, + "loss": 1.0953, + "step": 241 + }, + { + "epoch": 0.3872, + "grad_norm": 0.8715233148733574, + "learning_rate": 0.0001402944393320206, + "loss": 1.2744, + "step": 242 + }, + { + "epoch": 0.3888, + "grad_norm": 0.8099131715159511, + "learning_rate": 0.00013981943424421932, + "loss": 1.1879, + "step": 243 + }, + { + "epoch": 0.3904, + "grad_norm": 0.9613678265565011, + "learning_rate": 0.00013934335899667527, + "loss": 1.2067, + "step": 244 + }, + { + "epoch": 0.392, + "grad_norm": 1.2594485201695729, + "learning_rate": 0.00013886622638405952, + "loss": 1.0867, + "step": 245 + }, + { + "epoch": 0.3936, + "grad_norm": 0.881910095099135, + "learning_rate": 0.00013838804922946027, + "loss": 1.2392, + "step": 246 + }, + { + "epoch": 0.3952, + "grad_norm": 2.5785824113387066, + "learning_rate": 0.00013790884038403795, + "loss": 1.2812, + "step": 247 + }, + { + "epoch": 0.3968, + "grad_norm": 0.8581277392188488, + "learning_rate": 0.00013742861272668012, + "loss": 1.2019, + "step": 248 + }, + { + "epoch": 0.3984, + "grad_norm": 0.8908049815241303, + "learning_rate": 0.00013694737916365517, + "loss": 1.2827, + "step": 249 + }, + { + "epoch": 0.4, + "grad_norm": 0.9328597459880543, + "learning_rate": 0.00013646515262826552, + "loss": 1.2972, + "step": 250 + }, + { + "epoch": 0.4016, + "grad_norm": 0.8819384667780155, + "learning_rate": 0.0001359819460805001, + "loss": 1.2383, + "step": 251 + }, + { + "epoch": 0.4032, + "grad_norm": 0.923991271431393, + "learning_rate": 0.0001354977725066859, + "loss": 1.2378, + "step": 252 + }, + { + "epoch": 0.4048, + "grad_norm": 0.852856759306124, + "learning_rate": 0.00013501264491913906, + "loss": 1.2021, + "step": 253 + }, + { + "epoch": 0.4064, + "grad_norm": 0.9517124148223832, + "learning_rate": 0.0001345265763558152, + "loss": 1.2537, + "step": 254 + }, + { + "epoch": 0.408, + "grad_norm": 0.9943489920406982, + "learning_rate": 0.00013403957987995882, + "loss": 1.298, + "step": 255 + }, + { + "epoch": 0.4096, + "grad_norm": 0.9424839438430717, + "learning_rate": 0.0001335516685797525, + "loss": 1.3916, + "step": 256 + }, + { + "epoch": 0.4112, + "grad_norm": 0.8623889662495645, + "learning_rate": 0.00013306285556796495, + "loss": 1.2428, + "step": 257 + }, + { + "epoch": 0.4128, + "grad_norm": 0.8969029098876905, + "learning_rate": 0.00013257315398159864, + "loss": 1.258, + "step": 258 + }, + { + "epoch": 0.4144, + "grad_norm": 0.8685679563263606, + "learning_rate": 0.00013208257698153677, + "loss": 1.2611, + "step": 259 + }, + { + "epoch": 0.416, + "grad_norm": 0.9380453581806747, + "learning_rate": 0.00013159113775218964, + "loss": 1.2899, + "step": 260 + }, + { + "epoch": 0.4176, + "grad_norm": 0.9128492251598493, + "learning_rate": 0.00013109884950114007, + "loss": 1.1631, + "step": 261 + }, + { + "epoch": 0.4192, + "grad_norm": 0.7921675182825618, + "learning_rate": 0.00013060572545878875, + "loss": 1.1727, + "step": 262 + }, + { + "epoch": 0.4208, + "grad_norm": 0.8741226372243694, + "learning_rate": 0.00013011177887799845, + "loss": 1.1796, + "step": 263 + }, + { + "epoch": 0.4224, + "grad_norm": 0.9624066895852776, + "learning_rate": 0.00012961702303373795, + "loss": 1.2753, + "step": 264 + }, + { + "epoch": 0.424, + "grad_norm": 0.8623924847866271, + "learning_rate": 0.00012912147122272523, + "loss": 1.2239, + "step": 265 + }, + { + "epoch": 0.4256, + "grad_norm": 0.8754993437924429, + "learning_rate": 0.00012862513676307008, + "loss": 1.2303, + "step": 266 + }, + { + "epoch": 0.4272, + "grad_norm": 0.9196818395200124, + "learning_rate": 0.00012812803299391628, + "loss": 1.2629, + "step": 267 + }, + { + "epoch": 0.4288, + "grad_norm": 0.7725792736098621, + "learning_rate": 0.00012763017327508305, + "loss": 1.1907, + "step": 268 + }, + { + "epoch": 0.4304, + "grad_norm": 0.882188597768767, + "learning_rate": 0.0001271315709867059, + "loss": 1.15, + "step": 269 + }, + { + "epoch": 0.432, + "grad_norm": 0.8440779606854073, + "learning_rate": 0.00012663223952887723, + "loss": 1.1982, + "step": 270 + }, + { + "epoch": 0.4336, + "grad_norm": 0.8230773009080197, + "learning_rate": 0.00012613219232128608, + "loss": 1.2513, + "step": 271 + }, + { + "epoch": 0.4352, + "grad_norm": 0.8372385893798061, + "learning_rate": 0.00012563144280285741, + "loss": 1.2188, + "step": 272 + }, + { + "epoch": 0.4368, + "grad_norm": 0.8872124615795473, + "learning_rate": 0.00012513000443139112, + "loss": 1.18, + "step": 273 + }, + { + "epoch": 0.4384, + "grad_norm": 0.9019740107544314, + "learning_rate": 0.00012462789068320017, + "loss": 1.2596, + "step": 274 + }, + { + "epoch": 0.44, + "grad_norm": 0.8844802528802836, + "learning_rate": 0.00012412511505274844, + "loss": 1.1859, + "step": 275 + }, + { + "epoch": 0.4416, + "grad_norm": 0.8145917051737542, + "learning_rate": 0.00012362169105228826, + "loss": 1.1844, + "step": 276 + }, + { + "epoch": 0.4432, + "grad_norm": 0.8532928487511529, + "learning_rate": 0.000123117632211497, + "loss": 1.1622, + "step": 277 + }, + { + "epoch": 0.4448, + "grad_norm": 0.8620582995155897, + "learning_rate": 0.00012261295207711346, + "loss": 1.1533, + "step": 278 + }, + { + "epoch": 0.4464, + "grad_norm": 0.8594768364941002, + "learning_rate": 0.0001221076642125742, + "loss": 1.2915, + "step": 279 + }, + { + "epoch": 0.448, + "grad_norm": 0.7413865897201313, + "learning_rate": 0.00012160178219764837, + "loss": 1.0711, + "step": 280 + }, + { + "epoch": 0.4496, + "grad_norm": 0.7847519476510114, + "learning_rate": 0.00012109531962807332, + "loss": 1.0843, + "step": 281 + }, + { + "epoch": 0.4512, + "grad_norm": 0.8831366875265851, + "learning_rate": 0.00012058829011518896, + "loss": 1.246, + "step": 282 + }, + { + "epoch": 0.4528, + "grad_norm": 0.8985205839545358, + "learning_rate": 0.00012008070728557186, + "loss": 1.2101, + "step": 283 + }, + { + "epoch": 0.4544, + "grad_norm": 0.8917026992840591, + "learning_rate": 0.00011957258478066931, + "loss": 1.1728, + "step": 284 + }, + { + "epoch": 0.456, + "grad_norm": 0.89096820779557, + "learning_rate": 0.00011906393625643244, + "loss": 1.2552, + "step": 285 + }, + { + "epoch": 0.4576, + "grad_norm": 0.8296500927521173, + "learning_rate": 0.00011855477538294935, + "loss": 1.1848, + "step": 286 + }, + { + "epoch": 0.4592, + "grad_norm": 0.8037711878867846, + "learning_rate": 0.00011804511584407763, + "loss": 1.2396, + "step": 287 + }, + { + "epoch": 0.4608, + "grad_norm": 0.9103068328992188, + "learning_rate": 0.00011753497133707679, + "loss": 1.1845, + "step": 288 + }, + { + "epoch": 0.4624, + "grad_norm": 0.8647079909838531, + "learning_rate": 0.00011702435557223987, + "loss": 1.0828, + "step": 289 + }, + { + "epoch": 0.464, + "grad_norm": 0.8298068370580297, + "learning_rate": 0.00011651328227252517, + "loss": 1.2582, + "step": 290 + }, + { + "epoch": 0.4656, + "grad_norm": 0.9292697985594613, + "learning_rate": 0.00011600176517318741, + "loss": 1.2346, + "step": 291 + }, + { + "epoch": 0.4672, + "grad_norm": 0.7646422463520625, + "learning_rate": 0.00011548981802140848, + "loss": 1.0674, + "step": 292 + }, + { + "epoch": 0.4688, + "grad_norm": 0.8290176537474588, + "learning_rate": 0.00011497745457592816, + "loss": 1.128, + "step": 293 + }, + { + "epoch": 0.4704, + "grad_norm": 0.8399824958737899, + "learning_rate": 0.00011446468860667421, + "loss": 1.1764, + "step": 294 + }, + { + "epoch": 0.472, + "grad_norm": 0.8971948834731232, + "learning_rate": 0.00011395153389439233, + "loss": 1.214, + "step": 295 + }, + { + "epoch": 0.4736, + "grad_norm": 0.9782768537494978, + "learning_rate": 0.00011343800423027582, + "loss": 1.1912, + "step": 296 + }, + { + "epoch": 0.4752, + "grad_norm": 1.23581044725264, + "learning_rate": 0.0001129241134155949, + "loss": 1.2208, + "step": 297 + }, + { + "epoch": 0.4768, + "grad_norm": 0.7668650057527652, + "learning_rate": 0.00011240987526132594, + "loss": 1.1246, + "step": 298 + }, + { + "epoch": 0.4784, + "grad_norm": 0.8836310045333498, + "learning_rate": 0.00011189530358778005, + "loss": 1.2389, + "step": 299 + }, + { + "epoch": 0.48, + "grad_norm": 0.8199386285851565, + "learning_rate": 0.00011138041222423177, + "loss": 1.0902, + "step": 300 + }, + { + "epoch": 0.4816, + "grad_norm": 0.8995441990206351, + "learning_rate": 0.00011086521500854745, + "loss": 1.2153, + "step": 301 + }, + { + "epoch": 0.4832, + "grad_norm": 0.7953824562393018, + "learning_rate": 0.00011034972578681338, + "loss": 1.1021, + "step": 302 + }, + { + "epoch": 0.4848, + "grad_norm": 0.8041206611379477, + "learning_rate": 0.00010983395841296348, + "loss": 1.1319, + "step": 303 + }, + { + "epoch": 0.4864, + "grad_norm": 0.760043694320985, + "learning_rate": 0.00010931792674840718, + "loss": 1.0802, + "step": 304 + }, + { + "epoch": 0.488, + "grad_norm": 0.8402613615689775, + "learning_rate": 0.00010880164466165674, + "loss": 1.1854, + "step": 305 + }, + { + "epoch": 0.4896, + "grad_norm": 0.8370506536169573, + "learning_rate": 0.00010828512602795462, + "loss": 1.3297, + "step": 306 + }, + { + "epoch": 0.4912, + "grad_norm": 0.8756542853970231, + "learning_rate": 0.00010776838472890065, + "loss": 1.167, + "step": 307 + }, + { + "epoch": 0.4928, + "grad_norm": 0.9887192919613506, + "learning_rate": 0.00010725143465207867, + "loss": 1.2035, + "step": 308 + }, + { + "epoch": 0.4944, + "grad_norm": 0.8777343614550543, + "learning_rate": 0.00010673428969068364, + "loss": 1.1611, + "step": 309 + }, + { + "epoch": 0.496, + "grad_norm": 0.8756411084607579, + "learning_rate": 0.00010621696374314807, + "loss": 1.2932, + "step": 310 + }, + { + "epoch": 0.4976, + "grad_norm": 0.7933193873769244, + "learning_rate": 0.00010569947071276847, + "loss": 1.1609, + "step": 311 + }, + { + "epoch": 0.4992, + "grad_norm": 0.8848359151078187, + "learning_rate": 0.00010518182450733186, + "loss": 1.1338, + "step": 312 + }, + { + "epoch": 0.5008, + "grad_norm": 0.7547677514371438, + "learning_rate": 0.00010466403903874176, + "loss": 1.13, + "step": 313 + }, + { + "epoch": 0.5024, + "grad_norm": 0.9183224690124575, + "learning_rate": 0.00010414612822264455, + "loss": 1.2251, + "step": 314 + }, + { + "epoch": 0.504, + "grad_norm": 0.7649045744367469, + "learning_rate": 0.00010362810597805526, + "loss": 1.1016, + "step": 315 + }, + { + "epoch": 0.5056, + "grad_norm": 0.7984049291501666, + "learning_rate": 0.0001031099862269837, + "loss": 1.1782, + "step": 316 + }, + { + "epoch": 0.5072, + "grad_norm": 0.8430512686337509, + "learning_rate": 0.00010259178289406011, + "loss": 1.2102, + "step": 317 + }, + { + "epoch": 0.5088, + "grad_norm": 0.7590493446338831, + "learning_rate": 0.00010207350990616107, + "loss": 1.2445, + "step": 318 + }, + { + "epoch": 0.5104, + "grad_norm": 1.010969864158178, + "learning_rate": 0.0001015551811920351, + "loss": 1.163, + "step": 319 + }, + { + "epoch": 0.512, + "grad_norm": 0.906697269700168, + "learning_rate": 0.00010103681068192845, + "loss": 1.1266, + "step": 320 + }, + { + "epoch": 0.5136, + "grad_norm": 0.7181242074258878, + "learning_rate": 0.00010051841230721065, + "loss": 1.1142, + "step": 321 + }, + { + "epoch": 0.5152, + "grad_norm": 0.8219401840378927, + "learning_rate": 0.0001, + "loss": 1.1711, + "step": 322 + }, + { + "epoch": 0.5168, + "grad_norm": 0.9611267122420954, + "learning_rate": 9.948158769278939e-05, + "loss": 1.2323, + "step": 323 + }, + { + "epoch": 0.5184, + "grad_norm": 0.9877413453035904, + "learning_rate": 9.896318931807155e-05, + "loss": 1.2141, + "step": 324 + }, + { + "epoch": 0.52, + "grad_norm": 0.8368823554633094, + "learning_rate": 9.844481880796491e-05, + "loss": 1.1423, + "step": 325 + }, + { + "epoch": 0.5216, + "grad_norm": 0.8556826186373597, + "learning_rate": 9.792649009383899e-05, + "loss": 1.2504, + "step": 326 + }, + { + "epoch": 0.5232, + "grad_norm": 0.8420396185143431, + "learning_rate": 9.740821710593989e-05, + "loss": 1.1729, + "step": 327 + }, + { + "epoch": 0.5248, + "grad_norm": 0.948304811546288, + "learning_rate": 9.689001377301633e-05, + "loss": 1.2101, + "step": 328 + }, + { + "epoch": 0.5264, + "grad_norm": 0.8559182631572183, + "learning_rate": 9.637189402194476e-05, + "loss": 1.1533, + "step": 329 + }, + { + "epoch": 0.528, + "grad_norm": 0.8954018911520171, + "learning_rate": 9.585387177735547e-05, + "loss": 1.1143, + "step": 330 + }, + { + "epoch": 0.5296, + "grad_norm": 0.9147431991804071, + "learning_rate": 9.533596096125825e-05, + "loss": 1.1952, + "step": 331 + }, + { + "epoch": 0.5312, + "grad_norm": 0.8148360981912695, + "learning_rate": 9.481817549266817e-05, + "loss": 1.1811, + "step": 332 + }, + { + "epoch": 0.5328, + "grad_norm": 0.9031070907439837, + "learning_rate": 9.430052928723153e-05, + "loss": 1.227, + "step": 333 + }, + { + "epoch": 0.5344, + "grad_norm": 0.860370451267779, + "learning_rate": 9.378303625685195e-05, + "loss": 1.1484, + "step": 334 + }, + { + "epoch": 0.536, + "grad_norm": 0.8152521797221018, + "learning_rate": 9.326571030931637e-05, + "loss": 1.1894, + "step": 335 + }, + { + "epoch": 0.5376, + "grad_norm": 0.8398257018415263, + "learning_rate": 9.274856534792138e-05, + "loss": 1.2223, + "step": 336 + }, + { + "epoch": 0.5392, + "grad_norm": 0.9008393901320019, + "learning_rate": 9.223161527109937e-05, + "loss": 1.2366, + "step": 337 + }, + { + "epoch": 0.5408, + "grad_norm": 0.7753201517758531, + "learning_rate": 9.171487397204539e-05, + "loss": 1.1435, + "step": 338 + }, + { + "epoch": 0.5424, + "grad_norm": 0.9073977985666707, + "learning_rate": 9.119835533834331e-05, + "loss": 1.2315, + "step": 339 + }, + { + "epoch": 0.544, + "grad_norm": 0.871600245938443, + "learning_rate": 9.068207325159284e-05, + "loss": 1.2583, + "step": 340 + }, + { + "epoch": 0.5456, + "grad_norm": 0.8841066100524158, + "learning_rate": 9.016604158703654e-05, + "loss": 1.2344, + "step": 341 + }, + { + "epoch": 0.5472, + "grad_norm": 0.8143250478949455, + "learning_rate": 8.965027421318665e-05, + "loss": 1.1382, + "step": 342 + }, + { + "epoch": 0.5488, + "grad_norm": 0.7426915405897001, + "learning_rate": 8.913478499145254e-05, + "loss": 1.1406, + "step": 343 + }, + { + "epoch": 0.5504, + "grad_norm": 0.8864003234551969, + "learning_rate": 8.861958777576827e-05, + "loss": 1.1746, + "step": 344 + }, + { + "epoch": 0.552, + "grad_norm": 0.7776617832611498, + "learning_rate": 8.810469641222001e-05, + "loss": 1.1841, + "step": 345 + }, + { + "epoch": 0.5536, + "grad_norm": 0.9102059489215345, + "learning_rate": 8.759012473867407e-05, + "loss": 1.1214, + "step": 346 + }, + { + "epoch": 0.5552, + "grad_norm": 0.9094619727242491, + "learning_rate": 8.707588658440511e-05, + "loss": 1.218, + "step": 347 + }, + { + "epoch": 0.5568, + "grad_norm": 0.8033485035147858, + "learning_rate": 8.656199576972423e-05, + "loss": 1.1593, + "step": 348 + }, + { + "epoch": 0.5584, + "grad_norm": 0.9253378190577354, + "learning_rate": 8.604846610560771e-05, + "loss": 1.3105, + "step": 349 + }, + { + "epoch": 0.56, + "grad_norm": 0.8591530532228647, + "learning_rate": 8.553531139332582e-05, + "loss": 1.2892, + "step": 350 + }, + { + "epoch": 0.5616, + "grad_norm": 0.7988519283164964, + "learning_rate": 8.502254542407186e-05, + "loss": 1.161, + "step": 351 + }, + { + "epoch": 0.5632, + "grad_norm": 0.9708577800367942, + "learning_rate": 8.451018197859153e-05, + "loss": 1.2504, + "step": 352 + }, + { + "epoch": 0.5648, + "grad_norm": 0.848809978081281, + "learning_rate": 8.399823482681262e-05, + "loss": 1.2201, + "step": 353 + }, + { + "epoch": 0.5664, + "grad_norm": 0.7685979840925905, + "learning_rate": 8.348671772747487e-05, + "loss": 1.1189, + "step": 354 + }, + { + "epoch": 0.568, + "grad_norm": 0.869108252327602, + "learning_rate": 8.297564442776014e-05, + "loss": 1.2034, + "step": 355 + }, + { + "epoch": 0.5696, + "grad_norm": 0.7859812284213198, + "learning_rate": 8.246502866292324e-05, + "loss": 1.0982, + "step": 356 + }, + { + "epoch": 0.5712, + "grad_norm": 0.8967511969562878, + "learning_rate": 8.195488415592238e-05, + "loss": 1.2392, + "step": 357 + }, + { + "epoch": 0.5728, + "grad_norm": 0.7783806511972009, + "learning_rate": 8.144522461705067e-05, + "loss": 1.1966, + "step": 358 + }, + { + "epoch": 0.5744, + "grad_norm": 0.7643003598307078, + "learning_rate": 8.093606374356759e-05, + "loss": 1.1042, + "step": 359 + }, + { + "epoch": 0.576, + "grad_norm": 0.8427919724796142, + "learning_rate": 8.042741521933071e-05, + "loss": 1.1576, + "step": 360 + }, + { + "epoch": 0.5776, + "grad_norm": 0.902025440967138, + "learning_rate": 7.991929271442817e-05, + "loss": 1.2136, + "step": 361 + }, + { + "epoch": 0.5792, + "grad_norm": 0.7404334436478929, + "learning_rate": 7.941170988481108e-05, + "loss": 1.083, + "step": 362 + }, + { + "epoch": 0.5808, + "grad_norm": 0.9751026591922226, + "learning_rate": 7.89046803719267e-05, + "loss": 1.3347, + "step": 363 + }, + { + "epoch": 0.5824, + "grad_norm": 0.8691363843427901, + "learning_rate": 7.839821780235168e-05, + "loss": 1.2633, + "step": 364 + }, + { + "epoch": 0.584, + "grad_norm": 0.9154939493330465, + "learning_rate": 7.789233578742582e-05, + "loss": 1.166, + "step": 365 + }, + { + "epoch": 0.5856, + "grad_norm": 0.8451961940267377, + "learning_rate": 7.738704792288655e-05, + "loss": 1.0749, + "step": 366 + }, + { + "epoch": 0.5872, + "grad_norm": 0.8398861442241642, + "learning_rate": 7.688236778850306e-05, + "loss": 1.1373, + "step": 367 + }, + { + "epoch": 0.5888, + "grad_norm": 0.8172425329442209, + "learning_rate": 7.637830894771175e-05, + "loss": 1.1056, + "step": 368 + }, + { + "epoch": 0.5904, + "grad_norm": 1.5755117690872584, + "learning_rate": 7.587488494725157e-05, + "loss": 1.1983, + "step": 369 + }, + { + "epoch": 0.592, + "grad_norm": 0.8784668339403303, + "learning_rate": 7.537210931679987e-05, + "loss": 1.1689, + "step": 370 + }, + { + "epoch": 0.5936, + "grad_norm": 0.8906976548396875, + "learning_rate": 7.48699955686089e-05, + "loss": 1.2108, + "step": 371 + }, + { + "epoch": 0.5952, + "grad_norm": 0.825483148194111, + "learning_rate": 7.43685571971426e-05, + "loss": 1.3095, + "step": 372 + }, + { + "epoch": 0.5968, + "grad_norm": 0.7898151196765646, + "learning_rate": 7.386780767871397e-05, + "loss": 1.1102, + "step": 373 + }, + { + "epoch": 0.5984, + "grad_norm": 0.7905430424069427, + "learning_rate": 7.336776047112276e-05, + "loss": 1.0784, + "step": 374 + }, + { + "epoch": 0.6, + "grad_norm": 0.8196770468772703, + "learning_rate": 7.286842901329412e-05, + "loss": 1.1636, + "step": 375 + }, + { + "epoch": 0.6016, + "grad_norm": 0.888862504227803, + "learning_rate": 7.236982672491698e-05, + "loss": 1.2867, + "step": 376 + }, + { + "epoch": 0.6032, + "grad_norm": 0.824844609674756, + "learning_rate": 7.187196700608373e-05, + "loss": 1.2041, + "step": 377 + }, + { + "epoch": 0.6048, + "grad_norm": 0.9152932771920264, + "learning_rate": 7.137486323692995e-05, + "loss": 1.1682, + "step": 378 + }, + { + "epoch": 0.6064, + "grad_norm": 0.8516991165662066, + "learning_rate": 7.087852877727481e-05, + "loss": 1.1531, + "step": 379 + }, + { + "epoch": 0.608, + "grad_norm": 0.8253284733353577, + "learning_rate": 7.038297696626206e-05, + "loss": 1.066, + "step": 380 + }, + { + "epoch": 0.6096, + "grad_norm": 0.808754642859673, + "learning_rate": 6.988822112200156e-05, + "loss": 1.2327, + "step": 381 + }, + { + "epoch": 0.6112, + "grad_norm": 0.8684342530267641, + "learning_rate": 6.939427454121128e-05, + "loss": 1.1757, + "step": 382 + }, + { + "epoch": 0.6128, + "grad_norm": 1.0676860994053174, + "learning_rate": 6.890115049885994e-05, + "loss": 1.2155, + "step": 383 + }, + { + "epoch": 0.6144, + "grad_norm": 0.7701555681194645, + "learning_rate": 6.84088622478104e-05, + "loss": 1.2048, + "step": 384 + }, + { + "epoch": 0.616, + "grad_norm": 0.9048160455852856, + "learning_rate": 6.791742301846326e-05, + "loss": 1.3376, + "step": 385 + }, + { + "epoch": 0.6176, + "grad_norm": 0.7957470102026036, + "learning_rate": 6.742684601840141e-05, + "loss": 1.0878, + "step": 386 + }, + { + "epoch": 0.6192, + "grad_norm": 0.8093378397441714, + "learning_rate": 6.693714443203507e-05, + "loss": 1.1519, + "step": 387 + }, + { + "epoch": 0.6208, + "grad_norm": 0.8203721996256516, + "learning_rate": 6.644833142024751e-05, + "loss": 1.1611, + "step": 388 + }, + { + "epoch": 0.6224, + "grad_norm": 0.811712204943522, + "learning_rate": 6.59604201200412e-05, + "loss": 1.158, + "step": 389 + }, + { + "epoch": 0.624, + "grad_norm": 0.8877301877619368, + "learning_rate": 6.547342364418481e-05, + "loss": 1.1956, + "step": 390 + }, + { + "epoch": 0.6256, + "grad_norm": 0.7740209396211473, + "learning_rate": 6.498735508086093e-05, + "loss": 1.1664, + "step": 391 + }, + { + "epoch": 0.6272, + "grad_norm": 0.8041506425838244, + "learning_rate": 6.450222749331414e-05, + "loss": 1.0035, + "step": 392 + }, + { + "epoch": 0.6288, + "grad_norm": 0.6850965549453459, + "learning_rate": 6.40180539194999e-05, + "loss": 0.9761, + "step": 393 + }, + { + "epoch": 0.6304, + "grad_norm": 0.7357434070457943, + "learning_rate": 6.35348473717345e-05, + "loss": 1.144, + "step": 394 + }, + { + "epoch": 0.632, + "grad_norm": 0.778451008261011, + "learning_rate": 6.305262083634488e-05, + "loss": 1.1921, + "step": 395 + }, + { + "epoch": 0.6336, + "grad_norm": 0.8680554616182264, + "learning_rate": 6.25713872733199e-05, + "loss": 1.1567, + "step": 396 + }, + { + "epoch": 0.6352, + "grad_norm": 0.7610705771800518, + "learning_rate": 6.209115961596208e-05, + "loss": 1.1568, + "step": 397 + }, + { + "epoch": 0.6368, + "grad_norm": 0.8059528339762222, + "learning_rate": 6.161195077053976e-05, + "loss": 1.2073, + "step": 398 + }, + { + "epoch": 0.6384, + "grad_norm": 0.8428663715678696, + "learning_rate": 6.113377361594049e-05, + "loss": 1.1649, + "step": 399 + }, + { + "epoch": 0.64, + "grad_norm": 0.9188786174992472, + "learning_rate": 6.065664100332478e-05, + "loss": 1.29, + "step": 400 + }, + { + "epoch": 0.6416, + "grad_norm": 0.8756111438473054, + "learning_rate": 6.018056575578075e-05, + "loss": 1.2236, + "step": 401 + }, + { + "epoch": 0.6432, + "grad_norm": 0.8733922333991385, + "learning_rate": 5.970556066797941e-05, + "loss": 1.1981, + "step": 402 + }, + { + "epoch": 0.6448, + "grad_norm": 0.7711392324119829, + "learning_rate": 5.923163850583113e-05, + "loss": 1.1458, + "step": 403 + }, + { + "epoch": 0.6464, + "grad_norm": 0.8062835770218935, + "learning_rate": 5.875881200614207e-05, + "loss": 1.1183, + "step": 404 + }, + { + "epoch": 0.648, + "grad_norm": 0.8747371063888001, + "learning_rate": 5.828709387627218e-05, + "loss": 1.2223, + "step": 405 + }, + { + "epoch": 0.6496, + "grad_norm": 0.8759256057695535, + "learning_rate": 5.781649679379378e-05, + "loss": 1.141, + "step": 406 + }, + { + "epoch": 0.6512, + "grad_norm": 0.7665422523949811, + "learning_rate": 5.73470334061505e-05, + "loss": 1.0992, + "step": 407 + }, + { + "epoch": 0.6528, + "grad_norm": 0.8298592894443804, + "learning_rate": 5.687871633031754e-05, + "loss": 1.0957, + "step": 408 + }, + { + "epoch": 0.6544, + "grad_norm": 0.8080989355798822, + "learning_rate": 5.6411558152462894e-05, + "loss": 1.1065, + "step": 409 + }, + { + "epoch": 0.656, + "grad_norm": 0.9143197985164898, + "learning_rate": 5.5945571427608526e-05, + "loss": 1.2077, + "step": 410 + }, + { + "epoch": 0.6576, + "grad_norm": 0.7882212969542681, + "learning_rate": 5.54807686792933e-05, + "loss": 1.123, + "step": 411 + }, + { + "epoch": 0.6592, + "grad_norm": 0.7852271835858652, + "learning_rate": 5.501716239923642e-05, + "loss": 1.1432, + "step": 412 + }, + { + "epoch": 0.6608, + "grad_norm": 0.8526592754741675, + "learning_rate": 5.4554765047001613e-05, + "loss": 1.086, + "step": 413 + }, + { + "epoch": 0.6624, + "grad_norm": 0.8495245556948786, + "learning_rate": 5.4093589049662175e-05, + "loss": 1.2061, + "step": 414 + }, + { + "epoch": 0.664, + "grad_norm": 0.9870834284353387, + "learning_rate": 5.363364680146725e-05, + "loss": 1.1034, + "step": 415 + }, + { + "epoch": 0.6656, + "grad_norm": 0.8040489882141855, + "learning_rate": 5.31749506635086e-05, + "loss": 1.1533, + "step": 416 + }, + { + "epoch": 0.6672, + "grad_norm": 0.7964631759376904, + "learning_rate": 5.271751296338823e-05, + "loss": 1.1348, + "step": 417 + }, + { + "epoch": 0.6688, + "grad_norm": 1.0023008023943585, + "learning_rate": 5.226134599488728e-05, + "loss": 1.1666, + "step": 418 + }, + { + "epoch": 0.6704, + "grad_norm": 0.7331303069308008, + "learning_rate": 5.180646201763577e-05, + "loss": 1.0718, + "step": 419 + }, + { + "epoch": 0.672, + "grad_norm": 0.8663504773008042, + "learning_rate": 5.135287325678271e-05, + "loss": 1.2721, + "step": 420 + }, + { + "epoch": 0.6736, + "grad_norm": 0.7492676947718987, + "learning_rate": 5.090059190266779e-05, + "loss": 1.0432, + "step": 421 + }, + { + "epoch": 0.6752, + "grad_norm": 0.7401120500297591, + "learning_rate": 5.0449630110493836e-05, + "loss": 1.1536, + "step": 422 + }, + { + "epoch": 0.6768, + "grad_norm": 0.8640112094653735, + "learning_rate": 5.000000000000002e-05, + "loss": 1.2418, + "step": 423 + }, + { + "epoch": 0.6784, + "grad_norm": 0.834158086197472, + "learning_rate": 4.955171365513603e-05, + "loss": 1.1254, + "step": 424 + }, + { + "epoch": 0.68, + "grad_norm": 0.8096342566815984, + "learning_rate": 4.9104783123737566e-05, + "loss": 1.1253, + "step": 425 + }, + { + "epoch": 0.6816, + "grad_norm": 0.7767589861691669, + "learning_rate": 4.865922041720239e-05, + "loss": 1.2038, + "step": 426 + }, + { + "epoch": 0.6832, + "grad_norm": 0.7587020261672197, + "learning_rate": 4.821503751016746e-05, + "loss": 1.0089, + "step": 427 + }, + { + "epoch": 0.6848, + "grad_norm": 0.8011720535678255, + "learning_rate": 4.777224634018732e-05, + "loss": 1.191, + "step": 428 + }, + { + "epoch": 0.6864, + "grad_norm": 0.6753522844900665, + "learning_rate": 4.733085880741301e-05, + "loss": 1.0858, + "step": 429 + }, + { + "epoch": 0.688, + "grad_norm": 0.7410365911986037, + "learning_rate": 4.689088677427249e-05, + "loss": 1.1445, + "step": 430 + }, + { + "epoch": 0.6896, + "grad_norm": 0.8060757851701591, + "learning_rate": 4.645234206515171e-05, + "loss": 1.2159, + "step": 431 + }, + { + "epoch": 0.6912, + "grad_norm": 0.8420427461230551, + "learning_rate": 4.6015236466076747e-05, + "loss": 1.1538, + "step": 432 + }, + { + "epoch": 0.6928, + "grad_norm": 0.8784266723731003, + "learning_rate": 4.5579581724397255e-05, + "loss": 1.2057, + "step": 433 + }, + { + "epoch": 0.6944, + "grad_norm": 0.8013200004781331, + "learning_rate": 4.514538954847064e-05, + "loss": 1.1869, + "step": 434 + }, + { + "epoch": 0.696, + "grad_norm": 0.8583871642996956, + "learning_rate": 4.471267160734731e-05, + "loss": 1.1606, + "step": 435 + }, + { + "epoch": 0.6976, + "grad_norm": 0.7462787557206001, + "learning_rate": 4.428143953045717e-05, + "loss": 1.0587, + "step": 436 + }, + { + "epoch": 0.6992, + "grad_norm": 0.9324702224054695, + "learning_rate": 4.385170490729712e-05, + "loss": 1.1917, + "step": 437 + }, + { + "epoch": 0.7008, + "grad_norm": 0.825952426854641, + "learning_rate": 4.342347928711953e-05, + "loss": 1.145, + "step": 438 + }, + { + "epoch": 0.7024, + "grad_norm": 1.03145563184466, + "learning_rate": 4.2996774178621736e-05, + "loss": 1.1587, + "step": 439 + }, + { + "epoch": 0.704, + "grad_norm": 0.7296815917745646, + "learning_rate": 4.257160104963696e-05, + "loss": 1.1109, + "step": 440 + }, + { + "epoch": 0.7056, + "grad_norm": 0.7828315182565447, + "learning_rate": 4.2147971326825966e-05, + "loss": 1.1362, + "step": 441 + }, + { + "epoch": 0.7072, + "grad_norm": 0.8661288824834599, + "learning_rate": 4.172589639536991e-05, + "loss": 1.1877, + "step": 442 + }, + { + "epoch": 0.7088, + "grad_norm": 0.8784570468123847, + "learning_rate": 4.130538759866457e-05, + "loss": 1.1941, + "step": 443 + }, + { + "epoch": 0.7104, + "grad_norm": 0.8287026908615008, + "learning_rate": 4.088645623801534e-05, + "loss": 1.1556, + "step": 444 + }, + { + "epoch": 0.712, + "grad_norm": 0.8357885647659248, + "learning_rate": 4.046911357233343e-05, + "loss": 1.1265, + "step": 445 + }, + { + "epoch": 0.7136, + "grad_norm": 0.9117249328448946, + "learning_rate": 4.00533708178334e-05, + "loss": 1.1627, + "step": 446 + }, + { + "epoch": 0.7152, + "grad_norm": 0.8301997254234771, + "learning_rate": 3.963923914773187e-05, + "loss": 1.1352, + "step": 447 + }, + { + "epoch": 0.7168, + "grad_norm": 0.8634018249078147, + "learning_rate": 3.922672969194686e-05, + "loss": 1.1479, + "step": 448 + }, + { + "epoch": 0.7184, + "grad_norm": 0.8329077897134307, + "learning_rate": 3.8815853536798904e-05, + "loss": 1.018, + "step": 449 + }, + { + "epoch": 0.72, + "grad_norm": 0.850231493224787, + "learning_rate": 3.840662172471315e-05, + "loss": 1.1742, + "step": 450 + }, + { + "epoch": 0.7216, + "grad_norm": 0.8570417611972676, + "learning_rate": 3.79990452539225e-05, + "loss": 1.1468, + "step": 451 + }, + { + "epoch": 0.7232, + "grad_norm": 0.7855160206399039, + "learning_rate": 3.759313507817196e-05, + "loss": 1.016, + "step": 452 + }, + { + "epoch": 0.7248, + "grad_norm": 0.8057271657502297, + "learning_rate": 3.7188902106424416e-05, + "loss": 1.1755, + "step": 453 + }, + { + "epoch": 0.7264, + "grad_norm": 0.8720646033479583, + "learning_rate": 3.678635720256737e-05, + "loss": 1.1208, + "step": 454 + }, + { + "epoch": 0.728, + "grad_norm": 0.8174618395150179, + "learning_rate": 3.638551118512089e-05, + "loss": 1.2449, + "step": 455 + }, + { + "epoch": 0.7296, + "grad_norm": 0.8984493070072344, + "learning_rate": 3.5986374826947066e-05, + "loss": 1.1483, + "step": 456 + }, + { + "epoch": 0.7312, + "grad_norm": 0.8676622249630999, + "learning_rate": 3.558895885496023e-05, + "loss": 1.029, + "step": 457 + }, + { + "epoch": 0.7328, + "grad_norm": 0.9174839212355238, + "learning_rate": 3.519327394983888e-05, + "loss": 1.1039, + "step": 458 + }, + { + "epoch": 0.7344, + "grad_norm": 0.7786142188089832, + "learning_rate": 3.479933074573858e-05, + "loss": 1.0746, + "step": 459 + }, + { + "epoch": 0.736, + "grad_norm": 0.9909890263435749, + "learning_rate": 3.440713983000601e-05, + "loss": 1.2087, + "step": 460 + }, + { + "epoch": 0.7376, + "grad_norm": 1.0823583993413264, + "learning_rate": 3.401671174289469e-05, + "loss": 1.1421, + "step": 461 + }, + { + "epoch": 0.7392, + "grad_norm": 0.7866294988212094, + "learning_rate": 3.362805697728145e-05, + "loss": 1.1617, + "step": 462 + }, + { + "epoch": 0.7408, + "grad_norm": 0.7980845840357095, + "learning_rate": 3.324118597838464e-05, + "loss": 1.1537, + "step": 463 + }, + { + "epoch": 0.7424, + "grad_norm": 0.8382061072113046, + "learning_rate": 3.285610914348332e-05, + "loss": 1.1559, + "step": 464 + }, + { + "epoch": 0.744, + "grad_norm": 0.7508694928613053, + "learning_rate": 3.2472836821637744e-05, + "loss": 1.092, + "step": 465 + }, + { + "epoch": 0.7456, + "grad_norm": 0.8447023248602636, + "learning_rate": 3.209137931341143e-05, + "loss": 1.2368, + "step": 466 + }, + { + "epoch": 0.7472, + "grad_norm": 0.8602381239185524, + "learning_rate": 3.1711746870594086e-05, + "loss": 1.1196, + "step": 467 + }, + { + "epoch": 0.7488, + "grad_norm": 0.8853054458029322, + "learning_rate": 3.1333949695926324e-05, + "loss": 1.2423, + "step": 468 + }, + { + "epoch": 0.7504, + "grad_norm": 0.8939026391356107, + "learning_rate": 3.0957997942825336e-05, + "loss": 1.1277, + "step": 469 + }, + { + "epoch": 0.752, + "grad_norm": 0.8807142157738884, + "learning_rate": 3.058390171511196e-05, + "loss": 1.2526, + "step": 470 + }, + { + "epoch": 0.7536, + "grad_norm": 0.8324583410129928, + "learning_rate": 3.021167106673928e-05, + "loss": 1.0743, + "step": 471 + }, + { + "epoch": 0.7552, + "grad_norm": 0.7490330913568131, + "learning_rate": 2.9841316001522347e-05, + "loss": 1.1246, + "step": 472 + }, + { + "epoch": 0.7568, + "grad_norm": 0.8487037505899271, + "learning_rate": 2.9472846472869298e-05, + "loss": 1.2339, + "step": 473 + }, + { + "epoch": 0.7584, + "grad_norm": 0.8188830641183935, + "learning_rate": 2.9106272383513835e-05, + "loss": 1.1554, + "step": 474 + }, + { + "epoch": 0.76, + "grad_norm": 0.8660395885091415, + "learning_rate": 2.874160358524931e-05, + "loss": 1.1092, + "step": 475 + }, + { + "epoch": 0.7616, + "grad_norm": 0.8736838218162293, + "learning_rate": 2.8378849878663628e-05, + "loss": 1.1988, + "step": 476 + }, + { + "epoch": 0.7632, + "grad_norm": 0.8310946470978122, + "learning_rate": 2.8018021012875994e-05, + "loss": 1.1999, + "step": 477 + }, + { + "epoch": 0.7648, + "grad_norm": 0.8990152018181198, + "learning_rate": 2.7659126685275027e-05, + "loss": 1.1053, + "step": 478 + }, + { + "epoch": 0.7664, + "grad_norm": 0.7284122434882887, + "learning_rate": 2.7302176541257986e-05, + "loss": 1.0132, + "step": 479 + }, + { + "epoch": 0.768, + "grad_norm": 0.8273713313369169, + "learning_rate": 2.6947180173971508e-05, + "loss": 1.1439, + "step": 480 + }, + { + "epoch": 0.7696, + "grad_norm": 0.798738933773401, + "learning_rate": 2.659414712405398e-05, + "loss": 1.0502, + "step": 481 + }, + { + "epoch": 0.7712, + "grad_norm": 0.9731526718979308, + "learning_rate": 2.6243086879379e-05, + "loss": 1.145, + "step": 482 + }, + { + "epoch": 0.7728, + "grad_norm": 0.7761081087955582, + "learning_rate": 2.5894008874800325e-05, + "loss": 1.1722, + "step": 483 + }, + { + "epoch": 0.7744, + "grad_norm": 0.7799187025271342, + "learning_rate": 2.5546922491898495e-05, + "loss": 1.1349, + "step": 484 + }, + { + "epoch": 0.776, + "grad_norm": 0.7739626130943175, + "learning_rate": 2.5201837058728505e-05, + "loss": 1.0719, + "step": 485 + }, + { + "epoch": 0.7776, + "grad_norm": 0.809233978771651, + "learning_rate": 2.485876184956928e-05, + "loss": 1.1558, + "step": 486 + }, + { + "epoch": 0.7792, + "grad_norm": 0.8605355086619487, + "learning_rate": 2.451770608467432e-05, + "loss": 1.244, + "step": 487 + }, + { + "epoch": 0.7808, + "grad_norm": 0.78285209419252, + "learning_rate": 2.417867893002387e-05, + "loss": 1.1517, + "step": 488 + }, + { + "epoch": 0.7824, + "grad_norm": 0.9576645934425764, + "learning_rate": 2.3841689497078746e-05, + "loss": 1.2088, + "step": 489 + }, + { + "epoch": 0.784, + "grad_norm": 0.8630992051401077, + "learning_rate": 2.3506746842535242e-05, + "loss": 1.0503, + "step": 490 + }, + { + "epoch": 0.7856, + "grad_norm": 0.7960993720786123, + "learning_rate": 2.3173859968081944e-05, + "loss": 1.0288, + "step": 491 + }, + { + "epoch": 0.7872, + "grad_norm": 0.8196348688283756, + "learning_rate": 2.2843037820157675e-05, + "loss": 1.183, + "step": 492 + }, + { + "epoch": 0.7888, + "grad_norm": 0.7687564956603108, + "learning_rate": 2.251428928971102e-05, + "loss": 1.1636, + "step": 493 + }, + { + "epoch": 0.7904, + "grad_norm": 0.8018095926887172, + "learning_rate": 2.2187623211961562e-05, + "loss": 1.0865, + "step": 494 + }, + { + "epoch": 0.792, + "grad_norm": 0.7964486540536525, + "learning_rate": 2.1863048366162208e-05, + "loss": 1.0577, + "step": 495 + }, + { + "epoch": 0.7936, + "grad_norm": 0.9073721975513531, + "learning_rate": 2.1540573475363402e-05, + "loss": 1.2657, + "step": 496 + }, + { + "epoch": 0.7952, + "grad_norm": 0.7342122892244989, + "learning_rate": 2.1220207206178688e-05, + "loss": 1.0574, + "step": 497 + }, + { + "epoch": 0.7968, + "grad_norm": 0.9811847783919604, + "learning_rate": 2.0901958168551638e-05, + "loss": 1.2359, + "step": 498 + }, + { + "epoch": 0.7984, + "grad_norm": 0.7612781244581198, + "learning_rate": 2.058583491552465e-05, + "loss": 1.0913, + "step": 499 + }, + { + "epoch": 0.8, + "grad_norm": 0.8234719284665559, + "learning_rate": 2.027184594300898e-05, + "loss": 1.1103, + "step": 500 + }, + { + "epoch": 0.8016, + "grad_norm": 0.7695635781722373, + "learning_rate": 1.995999968955641e-05, + "loss": 1.089, + "step": 501 + }, + { + "epoch": 0.8032, + "grad_norm": 0.7788976075736265, + "learning_rate": 1.9650304536132426e-05, + "loss": 1.1197, + "step": 502 + }, + { + "epoch": 0.8048, + "grad_norm": 0.8206755048873753, + "learning_rate": 1.9342768805891178e-05, + "loss": 1.1847, + "step": 503 + }, + { + "epoch": 0.8064, + "grad_norm": 0.7634961509457562, + "learning_rate": 1.903740076395151e-05, + "loss": 1.0367, + "step": 504 + }, + { + "epoch": 0.808, + "grad_norm": 0.8036025900599119, + "learning_rate": 1.8734208617174988e-05, + "loss": 1.1318, + "step": 505 + }, + { + "epoch": 0.8096, + "grad_norm": 0.7062609717479594, + "learning_rate": 1.8433200513945337e-05, + "loss": 1.0694, + "step": 506 + }, + { + "epoch": 0.8112, + "grad_norm": 0.7701703193316796, + "learning_rate": 1.8134384543949478e-05, + "loss": 1.0875, + "step": 507 + }, + { + "epoch": 0.8128, + "grad_norm": 0.9649938639591058, + "learning_rate": 1.783776873795994e-05, + "loss": 1.208, + "step": 508 + }, + { + "epoch": 0.8144, + "grad_norm": 0.7555581523592024, + "learning_rate": 1.754336106761927e-05, + "loss": 1.1051, + "step": 509 + }, + { + "epoch": 0.816, + "grad_norm": 0.8368832116248573, + "learning_rate": 1.7251169445225657e-05, + "loss": 1.1158, + "step": 510 + }, + { + "epoch": 0.8176, + "grad_norm": 0.7286985238454796, + "learning_rate": 1.696120172352025e-05, + "loss": 1.1732, + "step": 511 + }, + { + "epoch": 0.8192, + "grad_norm": 0.870187735043427, + "learning_rate": 1.6673465695476232e-05, + "loss": 1.0843, + "step": 512 + }, + { + "epoch": 0.8208, + "grad_norm": 0.7341923862261764, + "learning_rate": 1.6387969094089316e-05, + "loss": 1.1035, + "step": 513 + }, + { + "epoch": 0.8224, + "grad_norm": 0.78122639104198, + "learning_rate": 1.6104719592169902e-05, + "loss": 1.0914, + "step": 514 + }, + { + "epoch": 0.824, + "grad_norm": 0.7325166160566458, + "learning_rate": 1.5823724802136865e-05, + "loss": 1.1732, + "step": 515 + }, + { + "epoch": 0.8256, + "grad_norm": 0.7922310272884773, + "learning_rate": 1.5544992275813053e-05, + "loss": 1.1122, + "step": 516 + }, + { + "epoch": 0.8272, + "grad_norm": 0.8753271390663528, + "learning_rate": 1.526852950422226e-05, + "loss": 1.1513, + "step": 517 + }, + { + "epoch": 0.8288, + "grad_norm": 0.9131560606826612, + "learning_rate": 1.4994343917387854e-05, + "loss": 1.0925, + "step": 518 + }, + { + "epoch": 0.8304, + "grad_norm": 0.7698842542187385, + "learning_rate": 1.4722442884133214e-05, + "loss": 1.1716, + "step": 519 + }, + { + "epoch": 0.832, + "grad_norm": 0.8426807492438998, + "learning_rate": 1.4452833711883628e-05, + "loss": 1.1362, + "step": 520 + }, + { + "epoch": 0.8336, + "grad_norm": 0.783156835957247, + "learning_rate": 1.4185523646469822e-05, + "loss": 1.069, + "step": 521 + }, + { + "epoch": 0.8352, + "grad_norm": 0.779858408711323, + "learning_rate": 1.3920519871933424e-05, + "loss": 1.1112, + "step": 522 + }, + { + "epoch": 0.8368, + "grad_norm": 0.8247142532205438, + "learning_rate": 1.3657829510333654e-05, + "loss": 1.0457, + "step": 523 + }, + { + "epoch": 0.8384, + "grad_norm": 0.8560843223223137, + "learning_rate": 1.339745962155613e-05, + "loss": 1.2159, + "step": 524 + }, + { + "epoch": 0.84, + "grad_norm": 0.7845474164901701, + "learning_rate": 1.3139417203123027e-05, + "loss": 1.1098, + "step": 525 + }, + { + "epoch": 0.8416, + "grad_norm": 0.7449190731629902, + "learning_rate": 1.2883709190004955e-05, + "loss": 1.0733, + "step": 526 + }, + { + "epoch": 0.8432, + "grad_norm": 0.8630092715184896, + "learning_rate": 1.263034245443473e-05, + "loss": 1.1833, + "step": 527 + }, + { + "epoch": 0.8448, + "grad_norm": 0.7867104021920172, + "learning_rate": 1.2379323805722576e-05, + "loss": 1.1309, + "step": 528 + }, + { + "epoch": 0.8464, + "grad_norm": 0.8244636276482338, + "learning_rate": 1.2130659990073146e-05, + "loss": 1.234, + "step": 529 + }, + { + "epoch": 0.848, + "grad_norm": 0.8138146321719298, + "learning_rate": 1.1884357690404158e-05, + "loss": 1.1515, + "step": 530 + }, + { + "epoch": 0.8496, + "grad_norm": 0.8425524497553843, + "learning_rate": 1.1640423526166988e-05, + "loss": 1.1644, + "step": 531 + }, + { + "epoch": 0.8512, + "grad_norm": 0.8547259869847009, + "learning_rate": 1.1398864053168534e-05, + "loss": 1.1667, + "step": 532 + }, + { + "epoch": 0.8528, + "grad_norm": 0.7244799287511947, + "learning_rate": 1.1159685763395111e-05, + "loss": 1.0842, + "step": 533 + }, + { + "epoch": 0.8544, + "grad_norm": 0.7524698818608209, + "learning_rate": 1.0922895084838037e-05, + "loss": 1.1178, + "step": 534 + }, + { + "epoch": 0.856, + "grad_norm": 0.9881487441349832, + "learning_rate": 1.0688498381320855e-05, + "loss": 1.3182, + "step": 535 + }, + { + "epoch": 0.8576, + "grad_norm": 0.7576640311695998, + "learning_rate": 1.045650195232819e-05, + "loss": 1.0744, + "step": 536 + }, + { + "epoch": 0.8592, + "grad_norm": 0.9096052404769226, + "learning_rate": 1.0226912032836611e-05, + "loss": 1.1455, + "step": 537 + }, + { + "epoch": 0.8608, + "grad_norm": 0.7379987979671901, + "learning_rate": 9.999734793146998e-06, + "loss": 1.208, + "step": 538 + }, + { + "epoch": 0.8624, + "grad_norm": 0.7905102802551862, + "learning_rate": 9.774976338718677e-06, + "loss": 1.055, + "step": 539 + }, + { + "epoch": 0.864, + "grad_norm": 0.8126527688723885, + "learning_rate": 9.552642710005299e-06, + "loss": 1.1168, + "step": 540 + }, + { + "epoch": 0.8656, + "grad_norm": 0.8383644272793868, + "learning_rate": 9.332739882292752e-06, + "loss": 1.2018, + "step": 541 + }, + { + "epoch": 0.8672, + "grad_norm": 0.8729393506383202, + "learning_rate": 9.115273765538202e-06, + "loss": 1.0735, + "step": 542 + }, + { + "epoch": 0.8688, + "grad_norm": 0.7951320488942527, + "learning_rate": 8.900250204211514e-06, + "loss": 1.2413, + "step": 543 + }, + { + "epoch": 0.8704, + "grad_norm": 0.8324150961444802, + "learning_rate": 8.687674977138116e-06, + "loss": 1.1574, + "step": 544 + }, + { + "epoch": 0.872, + "grad_norm": 0.9271836872000464, + "learning_rate": 8.47755379734373e-06, + "loss": 1.1051, + "step": 545 + }, + { + "epoch": 0.8736, + "grad_norm": 0.8522926371827139, + "learning_rate": 8.269892311900696e-06, + "loss": 1.0549, + "step": 546 + }, + { + "epoch": 0.8752, + "grad_norm": 0.7377990293794885, + "learning_rate": 8.064696101776358e-06, + "loss": 1.1044, + "step": 547 + }, + { + "epoch": 0.8768, + "grad_norm": 0.7792385782548877, + "learning_rate": 7.861970681683051e-06, + "loss": 1.0408, + "step": 548 + }, + { + "epoch": 0.8784, + "grad_norm": 0.7603970847804558, + "learning_rate": 7.661721499929753e-06, + "loss": 1.2524, + "step": 549 + }, + { + "epoch": 0.88, + "grad_norm": 0.8857803738015847, + "learning_rate": 7.463953938275858e-06, + "loss": 1.0188, + "step": 550 + }, + { + "epoch": 0.8816, + "grad_norm": 0.7357070656600264, + "learning_rate": 7.2686733117863784e-06, + "loss": 1.0963, + "step": 551 + }, + { + "epoch": 0.8832, + "grad_norm": 0.8473491473253034, + "learning_rate": 7.07588486868922e-06, + "loss": 1.0699, + "step": 552 + }, + { + "epoch": 0.8848, + "grad_norm": 0.8231635478717917, + "learning_rate": 6.8855937902340576e-06, + "loss": 1.018, + "step": 553 + }, + { + "epoch": 0.8864, + "grad_norm": 0.8622047371192733, + "learning_rate": 6.6978051905530855e-06, + "loss": 1.1781, + "step": 554 + }, + { + "epoch": 0.888, + "grad_norm": 0.8044949620138342, + "learning_rate": 6.512524116523633e-06, + "loss": 1.0902, + "step": 555 + }, + { + "epoch": 0.8896, + "grad_norm": 0.7789169141323721, + "learning_rate": 6.329755547632499e-06, + "loss": 1.1454, + "step": 556 + }, + { + "epoch": 0.8912, + "grad_norm": 0.8896580003141731, + "learning_rate": 6.149504395842087e-06, + "loss": 1.1064, + "step": 557 + }, + { + "epoch": 0.8928, + "grad_norm": 0.783026117101944, + "learning_rate": 5.971775505458444e-06, + "loss": 1.1428, + "step": 558 + }, + { + "epoch": 0.8944, + "grad_norm": 0.8550674103842599, + "learning_rate": 5.7965736530010916e-06, + "loss": 1.2133, + "step": 559 + }, + { + "epoch": 0.896, + "grad_norm": 0.7838714937143493, + "learning_rate": 5.623903547074549e-06, + "loss": 1.0849, + "step": 560 + }, + { + "epoch": 0.8976, + "grad_norm": 0.8338129750479177, + "learning_rate": 5.453769828241872e-06, + "loss": 1.1078, + "step": 561 + }, + { + "epoch": 0.8992, + "grad_norm": 0.8534721867473872, + "learning_rate": 5.286177068899989e-06, + "loss": 1.0528, + "step": 562 + }, + { + "epoch": 0.9008, + "grad_norm": 0.7947880348906006, + "learning_rate": 5.121129773156663e-06, + "loss": 1.0495, + "step": 563 + }, + { + "epoch": 0.9024, + "grad_norm": 0.8090524858380864, + "learning_rate": 4.95863237670956e-06, + "loss": 1.1755, + "step": 564 + }, + { + "epoch": 0.904, + "grad_norm": 0.6974919434108282, + "learning_rate": 4.798689246727006e-06, + "loss": 1.0676, + "step": 565 + }, + { + "epoch": 0.9056, + "grad_norm": 0.6628997531757636, + "learning_rate": 4.641304681730641e-06, + "loss": 0.9407, + "step": 566 + }, + { + "epoch": 0.9072, + "grad_norm": 0.7623591178079464, + "learning_rate": 4.486482911479839e-06, + "loss": 1.043, + "step": 567 + }, + { + "epoch": 0.9088, + "grad_norm": 0.7233254791162921, + "learning_rate": 4.3342280968580285e-06, + "loss": 1.1492, + "step": 568 + }, + { + "epoch": 0.9104, + "grad_norm": 0.8506229695632855, + "learning_rate": 4.184544329761009e-06, + "loss": 1.0879, + "step": 569 + }, + { + "epoch": 0.912, + "grad_norm": 0.9005950117591424, + "learning_rate": 4.037435632986786e-06, + "loss": 1.0699, + "step": 570 + }, + { + "epoch": 0.9136, + "grad_norm": 0.6814512621221664, + "learning_rate": 3.892905960127546e-06, + "loss": 1.0912, + "step": 571 + }, + { + "epoch": 0.9152, + "grad_norm": 0.7992049769516041, + "learning_rate": 3.750959195463466e-06, + "loss": 1.0538, + "step": 572 + }, + { + "epoch": 0.9168, + "grad_norm": 0.8192409966078561, + "learning_rate": 3.611599153858214e-06, + "loss": 1.116, + "step": 573 + }, + { + "epoch": 0.9184, + "grad_norm": 0.8669150974095468, + "learning_rate": 3.4748295806564356e-06, + "loss": 1.1496, + "step": 574 + }, + { + "epoch": 0.92, + "grad_norm": 0.8239210079645767, + "learning_rate": 3.3406541515832003e-06, + "loss": 1.1311, + "step": 575 + }, + { + "epoch": 0.9216, + "grad_norm": 0.9021066280304881, + "learning_rate": 3.209076472645112e-06, + "loss": 1.2361, + "step": 576 + }, + { + "epoch": 0.9232, + "grad_norm": 0.8013447776589625, + "learning_rate": 3.0801000800333877e-06, + "loss": 1.0475, + "step": 577 + }, + { + "epoch": 0.9248, + "grad_norm": 0.7673623371996414, + "learning_rate": 2.9537284400289355e-06, + "loss": 1.0197, + "step": 578 + }, + { + "epoch": 0.9264, + "grad_norm": 0.8324137919874324, + "learning_rate": 2.8299649489090475e-06, + "loss": 1.088, + "step": 579 + }, + { + "epoch": 0.928, + "grad_norm": 0.7144867605916154, + "learning_rate": 2.708812932856253e-06, + "loss": 1.0847, + "step": 580 + }, + { + "epoch": 0.9296, + "grad_norm": 0.8302290746064859, + "learning_rate": 2.590275647868867e-06, + "loss": 1.094, + "step": 581 + }, + { + "epoch": 0.9312, + "grad_norm": 0.915754184283194, + "learning_rate": 2.4743562796734622e-06, + "loss": 1.1402, + "step": 582 + }, + { + "epoch": 0.9328, + "grad_norm": 0.8174700752216338, + "learning_rate": 2.3610579436393e-06, + "loss": 1.2386, + "step": 583 + }, + { + "epoch": 0.9344, + "grad_norm": 0.7545048285638264, + "learning_rate": 2.250383684694579e-06, + "loss": 0.9968, + "step": 584 + }, + { + "epoch": 0.936, + "grad_norm": 0.7523805546706704, + "learning_rate": 2.1423364772445887e-06, + "loss": 1.0887, + "step": 585 + }, + { + "epoch": 0.9376, + "grad_norm": 0.8255564351449471, + "learning_rate": 2.036919225091827e-06, + "loss": 1.1428, + "step": 586 + }, + { + "epoch": 0.9392, + "grad_norm": 0.7682235946251612, + "learning_rate": 1.9341347613579087e-06, + "loss": 1.1986, + "step": 587 + }, + { + "epoch": 0.9408, + "grad_norm": 0.7508838695782295, + "learning_rate": 1.8339858484073935e-06, + "loss": 1.1666, + "step": 588 + }, + { + "epoch": 0.9424, + "grad_norm": 0.7272082061501186, + "learning_rate": 1.7364751777736332e-06, + "loss": 1.0427, + "step": 589 + }, + { + "epoch": 0.944, + "grad_norm": 0.8136674796372391, + "learning_rate": 1.6416053700863964e-06, + "loss": 1.1624, + "step": 590 + }, + { + "epoch": 0.9456, + "grad_norm": 0.9471246322793946, + "learning_rate": 1.5493789750014031e-06, + "loss": 1.1944, + "step": 591 + }, + { + "epoch": 0.9472, + "grad_norm": 0.830793594639552, + "learning_rate": 1.459798471131868e-06, + "loss": 1.1162, + "step": 592 + }, + { + "epoch": 0.9488, + "grad_norm": 0.7829685338313231, + "learning_rate": 1.3728662659818204e-06, + "loss": 1.0509, + "step": 593 + }, + { + "epoch": 0.9504, + "grad_norm": 0.7677342082631756, + "learning_rate": 1.2885846958814673e-06, + "loss": 1.1581, + "step": 594 + }, + { + "epoch": 0.952, + "grad_norm": 0.8580155229574348, + "learning_rate": 1.2069560259243328e-06, + "loss": 1.2314, + "step": 595 + }, + { + "epoch": 0.9536, + "grad_norm": 0.8315893544832171, + "learning_rate": 1.1279824499064396e-06, + "loss": 1.1727, + "step": 596 + }, + { + "epoch": 0.9552, + "grad_norm": 0.790733072519165, + "learning_rate": 1.0516660902673448e-06, + "loss": 1.0775, + "step": 597 + }, + { + "epoch": 0.9568, + "grad_norm": 0.7797569176739247, + "learning_rate": 9.780089980330642e-07, + "loss": 1.0605, + "step": 598 + }, + { + "epoch": 0.9584, + "grad_norm": 0.7558158376183497, + "learning_rate": 9.070131527609604e-07, + "loss": 1.0332, + "step": 599 + }, + { + "epoch": 0.96, + "grad_norm": 0.8161929460899198, + "learning_rate": 8.386804624865851e-07, + "loss": 1.1533, + "step": 600 + }, + { + "epoch": 0.9616, + "grad_norm": 0.8758741339277274, + "learning_rate": 7.730127636723539e-07, + "loss": 1.2084, + "step": 601 + }, + { + "epoch": 0.9632, + "grad_norm": 0.8491665046048456, + "learning_rate": 7.100118211581852e-07, + "loss": 1.1098, + "step": 602 + }, + { + "epoch": 0.9648, + "grad_norm": 0.8289959874690686, + "learning_rate": 6.496793281141056e-07, + "loss": 1.113, + "step": 603 + }, + { + "epoch": 0.9664, + "grad_norm": 0.8859879330617857, + "learning_rate": 5.920169059947411e-07, + "loss": 1.2009, + "step": 604 + }, + { + "epoch": 0.968, + "grad_norm": 0.7944978619957024, + "learning_rate": 5.370261044956971e-07, + "loss": 1.2376, + "step": 605 + }, + { + "epoch": 0.9696, + "grad_norm": 0.8411628207305331, + "learning_rate": 4.847084015119574e-07, + "loss": 1.0887, + "step": 606 + }, + { + "epoch": 0.9712, + "grad_norm": 0.7866710464305886, + "learning_rate": 4.3506520309813947e-07, + "loss": 1.0701, + "step": 607 + }, + { + "epoch": 0.9728, + "grad_norm": 0.7260546876982716, + "learning_rate": 3.8809784343072366e-07, + "loss": 1.1078, + "step": 608 + }, + { + "epoch": 0.9744, + "grad_norm": 0.9615683500825383, + "learning_rate": 3.4380758477219333e-07, + "loss": 1.0852, + "step": 609 + }, + { + "epoch": 0.976, + "grad_norm": 0.9611345980076806, + "learning_rate": 3.0219561743707326e-07, + "loss": 1.2499, + "step": 610 + }, + { + "epoch": 0.9776, + "grad_norm": 0.8612297012148447, + "learning_rate": 2.6326305976001055e-07, + "loss": 1.1793, + "step": 611 + }, + { + "epoch": 0.9792, + "grad_norm": 0.8216178579217599, + "learning_rate": 2.2701095806565432e-07, + "loss": 1.1088, + "step": 612 + }, + { + "epoch": 0.9808, + "grad_norm": 0.7562499369168286, + "learning_rate": 1.9344028664056713e-07, + "loss": 1.1269, + "step": 613 + }, + { + "epoch": 0.9824, + "grad_norm": 0.777584454108545, + "learning_rate": 1.6255194770704586e-07, + "loss": 1.1456, + "step": 614 + }, + { + "epoch": 0.984, + "grad_norm": 0.82330199497272, + "learning_rate": 1.3434677139885222e-07, + "loss": 1.1404, + "step": 615 + }, + { + "epoch": 0.9856, + "grad_norm": 0.8305045181413623, + "learning_rate": 1.0882551573891953e-07, + "loss": 1.1219, + "step": 616 + }, + { + "epoch": 0.9872, + "grad_norm": 0.826489086493862, + "learning_rate": 8.598886661895788e-08, + "loss": 1.1179, + "step": 617 + }, + { + "epoch": 0.9888, + "grad_norm": 0.8079475819208309, + "learning_rate": 6.583743778106887e-08, + "loss": 1.0196, + "step": 618 + }, + { + "epoch": 0.9904, + "grad_norm": 0.8318839031216161, + "learning_rate": 4.837177080119215e-08, + "loss": 1.1863, + "step": 619 + }, + { + "epoch": 0.992, + "grad_norm": 0.7703937170177031, + "learning_rate": 3.359233507459481e-08, + "loss": 1.1027, + "step": 620 + }, + { + "epoch": 0.9936, + "grad_norm": 0.70822487933415, + "learning_rate": 2.1499527803214846e-08, + "loss": 1.028, + "step": 621 + }, + { + "epoch": 0.9952, + "grad_norm": 0.8026825675935317, + "learning_rate": 1.209367398504746e-08, + "loss": 1.092, + "step": 622 + }, + { + "epoch": 0.9968, + "grad_norm": 0.7721965398534467, + "learning_rate": 5.375026405352035e-09, + "loss": 0.9474, + "step": 623 + }, + { + "epoch": 0.9984, + "grad_norm": 0.8220840958840194, + "learning_rate": 1.3437656298687097e-09, + "loss": 1.2077, + "step": 624 + }, + { + "epoch": 1.0, + "grad_norm": 0.7661086225593968, + "learning_rate": 0.0, + "loss": 1.0666, + "step": 625 + }, + { + "epoch": 1.0, + "step": 625, + "total_flos": 203897771933696.0, + "train_loss": 1.2123233310699464, + "train_runtime": 6268.0756, + "train_samples_per_second": 1.595, + "train_steps_per_second": 0.1 + } + ], + "logging_steps": 1.0, + "max_steps": 625, + "num_input_tokens_seen": 0, + "num_train_epochs": 1, + "save_steps": 500, + "stateful_callbacks": { + "TrainerControl": { + "args": { + "should_epoch_stop": false, + "should_evaluate": false, + "should_log": false, + "should_save": true, + "should_training_stop": false + }, + "attributes": {} + } + }, + "total_flos": 203897771933696.0, + "train_batch_size": 8, + "trial_name": null, + "trial_params": null +} diff --git a/single_dataset/llama3_qa/VideoGameBunny_v1_1-Llama-3-8B-V-llama3_qa_dataset_20000_epochs_1_lora/README.md b/single_dataset/llama3_qa/VideoGameBunny_v1_1-Llama-3-8B-V-llama3_qa_dataset_20000_epochs_1_lora/README.md new file mode 100644 index 0000000000000000000000000000000000000000..4f291d8d6fdb82fcff274f83bcc52b834347797c --- /dev/null +++ b/single_dataset/llama3_qa/VideoGameBunny_v1_1-Llama-3-8B-V-llama3_qa_dataset_20000_epochs_1_lora/README.md @@ -0,0 +1,202 @@ +--- +library_name: peft +base_model: ./weights/Bunny-v1_1-Llama-3-8B-V +--- + +# Model Card for Model ID + + + + + +## Model Details + +### Model Description + + + + + +- **Developed by:** [More Information Needed] +- **Funded by [optional]:** [More Information Needed] +- **Shared by [optional]:** [More Information Needed] +- **Model type:** [More Information Needed] +- **Language(s) (NLP):** [More Information Needed] +- **License:** [More Information Needed] +- **Finetuned from model [optional]:** [More Information Needed] + +### Model Sources [optional] + + + +- **Repository:** [More Information Needed] +- **Paper [optional]:** [More Information Needed] +- **Demo [optional]:** [More Information Needed] + +## Uses + + + +### Direct Use + + + +[More Information Needed] + +### Downstream Use [optional] + + + +[More Information Needed] + +### Out-of-Scope Use + + + +[More Information Needed] + +## Bias, Risks, and Limitations + + + +[More Information Needed] + +### Recommendations + + + +Users (both direct and downstream) should be made aware of the risks, biases and limitations of the model. More information needed for further recommendations. + +## How to Get Started with the Model + +Use the code below to get started with the model. + +[More Information Needed] + +## Training Details + +### Training Data + + + +[More Information Needed] + +### Training Procedure + + + +#### Preprocessing [optional] + +[More Information Needed] + + +#### Training Hyperparameters + +- **Training regime:** [More Information Needed] + +#### Speeds, Sizes, Times [optional] + + + +[More Information Needed] + +## Evaluation + + + +### Testing Data, Factors & Metrics + +#### Testing Data + + + +[More Information Needed] + +#### Factors + + + +[More Information Needed] + +#### Metrics + + + +[More Information Needed] + +### Results + +[More Information Needed] + +#### Summary + + + +## Model Examination [optional] + + + +[More Information Needed] + +## Environmental Impact + + + +Carbon emissions can be estimated using the [Machine Learning Impact calculator](https://mlco2.github.io/impact#compute) presented in [Lacoste et al. (2019)](https://arxiv.org/abs/1910.09700). + +- **Hardware Type:** [More Information Needed] +- **Hours used:** [More Information Needed] +- **Cloud Provider:** [More Information Needed] +- **Compute Region:** [More Information Needed] +- **Carbon Emitted:** [More Information Needed] + +## Technical Specifications [optional] + +### Model Architecture and Objective + +[More Information Needed] + +### Compute Infrastructure + +[More Information Needed] + +#### Hardware + +[More Information Needed] + +#### Software + +[More Information Needed] + +## Citation [optional] + + + +**BibTeX:** + +[More Information Needed] + +**APA:** + +[More Information Needed] + +## Glossary [optional] + + + +[More Information Needed] + +## More Information [optional] + +[More Information Needed] + +## Model Card Authors [optional] + +[More Information Needed] + +## Model Card Contact + +[More Information Needed] +### Framework versions + +- PEFT 0.11.1 \ No newline at end of file diff --git a/single_dataset/llama3_qa/VideoGameBunny_v1_1-Llama-3-8B-V-llama3_qa_dataset_20000_epochs_1_lora/adapter_config.json b/single_dataset/llama3_qa/VideoGameBunny_v1_1-Llama-3-8B-V-llama3_qa_dataset_20000_epochs_1_lora/adapter_config.json new file mode 100644 index 0000000000000000000000000000000000000000..cbb7e48d87741e690140e127f0c8e291b49c2200 --- /dev/null +++ b/single_dataset/llama3_qa/VideoGameBunny_v1_1-Llama-3-8B-V-llama3_qa_dataset_20000_epochs_1_lora/adapter_config.json @@ -0,0 +1,34 @@ +{ + "alpha_pattern": {}, + "auto_mapping": null, + "base_model_name_or_path": "./weights/Bunny-v1_1-Llama-3-8B-V", + "bias": "none", + "fan_in_fan_out": false, + "inference_mode": true, + "init_lora_weights": true, + "layer_replication": null, + "layers_pattern": null, + "layers_to_transform": null, + "loftq_config": {}, + "lora_alpha": 256, + "lora_dropout": 0.1, + "megatron_config": null, + "megatron_core": "megatron.core", + "modules_to_save": null, + "peft_type": "LORA", + "r": 128, + "rank_pattern": {}, + "revision": null, + "target_modules": [ + "down_proj", + "gate_proj", + "q_proj", + "k_proj", + "o_proj", + "v_proj", + "up_proj" + ], + "task_type": "CAUSAL_LM", + "use_dora": false, + "use_rslora": false +} \ No newline at end of file diff --git a/single_dataset/llama3_qa/VideoGameBunny_v1_1-Llama-3-8B-V-llama3_qa_dataset_20000_epochs_1_lora/adapter_model.safetensors b/single_dataset/llama3_qa/VideoGameBunny_v1_1-Llama-3-8B-V-llama3_qa_dataset_20000_epochs_1_lora/adapter_model.safetensors new file mode 100644 index 0000000000000000000000000000000000000000..95612d5387273527cd475667c07dd6094e46d4c8 --- /dev/null +++ b/single_dataset/llama3_qa/VideoGameBunny_v1_1-Llama-3-8B-V-llama3_qa_dataset_20000_epochs_1_lora/adapter_model.safetensors @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:f6b5a0f19c62ddf44c3b3f88a80303cd30828b6a23c325502b62d35ab90c98de +size 671150064 diff --git a/single_dataset/llama3_qa/VideoGameBunny_v1_1-Llama-3-8B-V-llama3_qa_dataset_20000_epochs_1_lora/config.json b/single_dataset/llama3_qa/VideoGameBunny_v1_1-Llama-3-8B-V-llama3_qa_dataset_20000_epochs_1_lora/config.json new file mode 100644 index 0000000000000000000000000000000000000000..ee7ed86f4ff6f3381a6bb24981934be89aa376e7 --- /dev/null +++ b/single_dataset/llama3_qa/VideoGameBunny_v1_1-Llama-3-8B-V-llama3_qa_dataset_20000_epochs_1_lora/config.json @@ -0,0 +1,45 @@ +{ + "_name_or_path": "./weights/Bunny-v1_1-Llama-3-8B-V", + "architectures": [ + "BunnyLlamaForCausalLM" + ], + "attention_bias": false, + "attention_dropout": 0.0, + "auto_map": { + "AutoConfig": "configuration_bunny_llama.BunnyLlamaConfig", + "AutoModelForCausalLM": "modeling_bunny_llama.BunnyLlamaForCausalLM" + }, + "bos_token_id": 128000, + "continuous_training": false, + "eos_token_id": 128001, + "freeze_mm_mlp_adapter": false, + "hidden_act": "silu", + "hidden_size": 4096, + "image_aspect_ratio": "pad", + "initializer_range": 0.02, + "intermediate_size": 14336, + "max_position_embeddings": 8192, + "mm_hidden_size": 3456, + "mm_projector_lr": null, + "mm_projector_type": "mlp2x_gelu", + "mm_vision_tower": "./weights/siglip-so400m-patch14-384", + "model_type": "bunny-llama", + "num_attention_heads": 32, + "num_hidden_layers": 32, + "num_key_value_heads": 8, + "pretraining_tp": 1, + "rms_norm_eps": 1e-05, + "rope_scaling": null, + "rope_theta": 500000.0, + "tie_word_embeddings": false, + "tokenizer_model_max_length": 2048, + "tokenizer_padding_side": "right", + "torch_dtype": "float16", + "transformers_version": "4.41.2", + "tune_mm_mlp_adapter": false, + "unfreeze_vision_tower": true, + "use_cache": true, + "use_mm_proj": true, + "use_s2": true, + "vocab_size": 128256 +} diff --git a/single_dataset/llama3_qa/VideoGameBunny_v1_1-Llama-3-8B-V-llama3_qa_dataset_20000_epochs_1_lora/non_lora_trainables.bin b/single_dataset/llama3_qa/VideoGameBunny_v1_1-Llama-3-8B-V-llama3_qa_dataset_20000_epochs_1_lora/non_lora_trainables.bin new file mode 100644 index 0000000000000000000000000000000000000000..8721a47207e4d1a2a764da69a07a0e0167fbdd63 --- /dev/null +++ b/single_dataset/llama3_qa/VideoGameBunny_v1_1-Llama-3-8B-V-llama3_qa_dataset_20000_epochs_1_lora/non_lora_trainables.bin @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:e0572d0914e749199035e01a17b945934ce0ce81d1c0d1e9cc044983fb4e5172 +size 918507402 diff --git a/single_dataset/llama3_qa/VideoGameBunny_v1_1-Llama-3-8B-V-llama3_qa_dataset_20000_epochs_1_lora/trainer_state.json b/single_dataset/llama3_qa/VideoGameBunny_v1_1-Llama-3-8B-V-llama3_qa_dataset_20000_epochs_1_lora/trainer_state.json new file mode 100644 index 0000000000000000000000000000000000000000..d96842d70d28f0be419a5b5dd303b9d9ae9650c1 --- /dev/null +++ b/single_dataset/llama3_qa/VideoGameBunny_v1_1-Llama-3-8B-V-llama3_qa_dataset_20000_epochs_1_lora/trainer_state.json @@ -0,0 +1,8792 @@ +{ + "best_metric": null, + "best_model_checkpoint": null, + "epoch": 1.0, + "eval_steps": 500, + "global_step": 1250, + "is_hyper_param_search": false, + "is_local_process_zero": true, + "is_world_process_zero": true, + "log_history": [ + { + "epoch": 0.0008, + "grad_norm": 2.167579092516437, + "learning_rate": 5.263157894736842e-06, + "loss": 1.828, + "step": 1 + }, + { + "epoch": 0.0016, + "grad_norm": 2.340629280044389, + "learning_rate": 1.0526315789473684e-05, + "loss": 1.9176, + "step": 2 + }, + { + "epoch": 0.0024, + "grad_norm": 1.811463906297539, + "learning_rate": 1.5789473684210526e-05, + "loss": 1.7397, + "step": 3 + }, + { + "epoch": 0.0032, + "grad_norm": 1.747010319714731, + "learning_rate": 2.105263157894737e-05, + "loss": 1.586, + "step": 4 + }, + { + "epoch": 0.004, + "grad_norm": 1.520092988296709, + "learning_rate": 2.6315789473684212e-05, + "loss": 1.7585, + "step": 5 + }, + { + "epoch": 0.0048, + "grad_norm": 1.3268982929526139, + "learning_rate": 3.157894736842105e-05, + "loss": 1.6026, + "step": 6 + }, + { + "epoch": 0.0056, + "grad_norm": 1.4764488386549275, + "learning_rate": 3.6842105263157895e-05, + "loss": 1.6617, + "step": 7 + }, + { + "epoch": 0.0064, + "grad_norm": 1.6279768120710967, + "learning_rate": 4.210526315789474e-05, + "loss": 1.573, + "step": 8 + }, + { + "epoch": 0.0072, + "grad_norm": 1.4270807326087176, + "learning_rate": 4.736842105263158e-05, + "loss": 1.5528, + "step": 9 + }, + { + "epoch": 0.008, + "grad_norm": 1.2428428678982786, + "learning_rate": 5.2631578947368424e-05, + "loss": 1.3892, + "step": 10 + }, + { + "epoch": 0.0088, + "grad_norm": 1.2401580956271006, + "learning_rate": 5.789473684210527e-05, + "loss": 1.3806, + "step": 11 + }, + { + "epoch": 0.0096, + "grad_norm": 1.2819503399116507, + "learning_rate": 6.31578947368421e-05, + "loss": 1.396, + "step": 12 + }, + { + "epoch": 0.0104, + "grad_norm": 1.1882915733386645, + "learning_rate": 6.842105263157895e-05, + "loss": 1.4601, + "step": 13 + }, + { + "epoch": 0.0112, + "grad_norm": 1.0903858341710697, + "learning_rate": 7.368421052631579e-05, + "loss": 1.344, + "step": 14 + }, + { + "epoch": 0.012, + "grad_norm": 1.0596846531636785, + "learning_rate": 7.894736842105263e-05, + "loss": 1.3555, + "step": 15 + }, + { + "epoch": 0.0128, + "grad_norm": 1.0634815912562547, + "learning_rate": 8.421052631578948e-05, + "loss": 1.4242, + "step": 16 + }, + { + "epoch": 0.0136, + "grad_norm": 1.0878252629197922, + "learning_rate": 8.947368421052632e-05, + "loss": 1.3516, + "step": 17 + }, + { + "epoch": 0.0144, + "grad_norm": 1.1800010976164466, + "learning_rate": 9.473684210526316e-05, + "loss": 1.5403, + "step": 18 + }, + { + "epoch": 0.0152, + "grad_norm": 1.1640427666915913, + "learning_rate": 0.0001, + "loss": 1.4014, + "step": 19 + }, + { + "epoch": 0.016, + "grad_norm": 1.266209495132437, + "learning_rate": 0.00010526315789473685, + "loss": 1.5235, + "step": 20 + }, + { + "epoch": 0.0168, + "grad_norm": 0.9978913092746845, + "learning_rate": 0.0001105263157894737, + "loss": 1.2294, + "step": 21 + }, + { + "epoch": 0.0176, + "grad_norm": 1.1342616642751515, + "learning_rate": 0.00011578947368421053, + "loss": 1.4345, + "step": 22 + }, + { + "epoch": 0.0184, + "grad_norm": 1.215812237535266, + "learning_rate": 0.00012105263157894738, + "loss": 1.3902, + "step": 23 + }, + { + "epoch": 0.0192, + "grad_norm": 1.0844024644780883, + "learning_rate": 0.0001263157894736842, + "loss": 1.2559, + "step": 24 + }, + { + "epoch": 0.02, + "grad_norm": 0.9876516028052728, + "learning_rate": 0.00013157894736842108, + "loss": 1.3326, + "step": 25 + }, + { + "epoch": 0.0208, + "grad_norm": 1.0035043823088348, + "learning_rate": 0.0001368421052631579, + "loss": 1.4357, + "step": 26 + }, + { + "epoch": 0.0216, + "grad_norm": 0.9948732657309445, + "learning_rate": 0.00014210526315789474, + "loss": 1.3949, + "step": 27 + }, + { + "epoch": 0.0224, + "grad_norm": 1.044231166970953, + "learning_rate": 0.00014736842105263158, + "loss": 1.3937, + "step": 28 + }, + { + "epoch": 0.0232, + "grad_norm": 1.0001653312399295, + "learning_rate": 0.00015263157894736845, + "loss": 1.3577, + "step": 29 + }, + { + "epoch": 0.024, + "grad_norm": 1.1236353066110405, + "learning_rate": 0.00015789473684210527, + "loss": 1.387, + "step": 30 + }, + { + "epoch": 0.0248, + "grad_norm": 0.9847793391832763, + "learning_rate": 0.0001631578947368421, + "loss": 1.3356, + "step": 31 + }, + { + "epoch": 0.0256, + "grad_norm": 1.0734795709259823, + "learning_rate": 0.00016842105263157895, + "loss": 1.345, + "step": 32 + }, + { + "epoch": 0.0264, + "grad_norm": 1.0018645283773786, + "learning_rate": 0.0001736842105263158, + "loss": 1.3844, + "step": 33 + }, + { + "epoch": 0.0272, + "grad_norm": 0.9741280531955702, + "learning_rate": 0.00017894736842105264, + "loss": 1.2712, + "step": 34 + }, + { + "epoch": 0.028, + "grad_norm": 1.019580278753891, + "learning_rate": 0.00018421052631578948, + "loss": 1.3357, + "step": 35 + }, + { + "epoch": 0.0288, + "grad_norm": 0.9806417285455257, + "learning_rate": 0.00018947368421052632, + "loss": 1.3431, + "step": 36 + }, + { + "epoch": 0.0296, + "grad_norm": 0.9902082070954336, + "learning_rate": 0.00019473684210526317, + "loss": 1.4262, + "step": 37 + }, + { + "epoch": 0.0304, + "grad_norm": 0.8904579435910817, + "learning_rate": 0.0002, + "loss": 1.3145, + "step": 38 + }, + { + "epoch": 0.0312, + "grad_norm": 0.9830855617998796, + "learning_rate": 0.00019999966405802826, + "loss": 1.3633, + "step": 39 + }, + { + "epoch": 0.032, + "grad_norm": 1.0024726704595865, + "learning_rate": 0.00019999865623437013, + "loss": 1.3568, + "step": 40 + }, + { + "epoch": 0.0328, + "grad_norm": 0.9739528854859765, + "learning_rate": 0.00019999697653579705, + "loss": 1.3164, + "step": 41 + }, + { + "epoch": 0.0336, + "grad_norm": 0.8801591109479502, + "learning_rate": 0.00019999462497359466, + "loss": 1.2211, + "step": 42 + }, + { + "epoch": 0.0344, + "grad_norm": 1.0260757591954461, + "learning_rate": 0.0001999916015635627, + "loss": 1.291, + "step": 43 + }, + { + "epoch": 0.0352, + "grad_norm": 1.1031534942231496, + "learning_rate": 0.00019998790632601496, + "loss": 1.4759, + "step": 44 + }, + { + "epoch": 0.036, + "grad_norm": 1.0606001512863072, + "learning_rate": 0.00019998353928577919, + "loss": 1.4111, + "step": 45 + }, + { + "epoch": 0.0368, + "grad_norm": 0.9840640012343169, + "learning_rate": 0.0001999785004721968, + "loss": 1.2385, + "step": 46 + }, + { + "epoch": 0.0376, + "grad_norm": 1.1052577461032573, + "learning_rate": 0.0001999727899191228, + "loss": 1.285, + "step": 47 + }, + { + "epoch": 0.0384, + "grad_norm": 0.9118307388774117, + "learning_rate": 0.00019996640766492543, + "loss": 1.2741, + "step": 48 + }, + { + "epoch": 0.0392, + "grad_norm": 1.004695540398904, + "learning_rate": 0.00019995935375248606, + "loss": 1.3496, + "step": 49 + }, + { + "epoch": 0.04, + "grad_norm": 1.0407289757019198, + "learning_rate": 0.00019995162822919883, + "loss": 1.3497, + "step": 50 + }, + { + "epoch": 0.0408, + "grad_norm": 0.957036327448944, + "learning_rate": 0.00019994323114697022, + "loss": 1.2609, + "step": 51 + }, + { + "epoch": 0.0416, + "grad_norm": 1.0044911323790189, + "learning_rate": 0.00019993416256221895, + "loss": 1.3899, + "step": 52 + }, + { + "epoch": 0.0424, + "grad_norm": 1.0623053762303933, + "learning_rate": 0.0001999244225358753, + "loss": 1.2557, + "step": 53 + }, + { + "epoch": 0.0432, + "grad_norm": 0.9863645586943295, + "learning_rate": 0.00019991401113338104, + "loss": 1.3906, + "step": 54 + }, + { + "epoch": 0.044, + "grad_norm": 1.049058703594308, + "learning_rate": 0.00019990292842468868, + "loss": 1.3907, + "step": 55 + }, + { + "epoch": 0.0448, + "grad_norm": 1.0807933613915983, + "learning_rate": 0.00019989117448426108, + "loss": 1.2593, + "step": 56 + }, + { + "epoch": 0.0456, + "grad_norm": 0.8880685140279246, + "learning_rate": 0.0001998787493910712, + "loss": 1.298, + "step": 57 + }, + { + "epoch": 0.0464, + "grad_norm": 1.0984359847011123, + "learning_rate": 0.00019986565322860115, + "loss": 1.3714, + "step": 58 + }, + { + "epoch": 0.0472, + "grad_norm": 1.0017796656325524, + "learning_rate": 0.000199851886084842, + "loss": 1.4406, + "step": 59 + }, + { + "epoch": 0.048, + "grad_norm": 0.9497230746477366, + "learning_rate": 0.00019983744805229296, + "loss": 1.252, + "step": 60 + }, + { + "epoch": 0.0488, + "grad_norm": 1.0207357838139706, + "learning_rate": 0.00019982233922796085, + "loss": 1.2045, + "step": 61 + }, + { + "epoch": 0.0496, + "grad_norm": 1.1052157003992444, + "learning_rate": 0.00019980655971335945, + "loss": 1.3368, + "step": 62 + }, + { + "epoch": 0.0504, + "grad_norm": 1.15820775048525, + "learning_rate": 0.00019979010961450878, + "loss": 1.3323, + "step": 63 + }, + { + "epoch": 0.0512, + "grad_norm": 1.0393811648711768, + "learning_rate": 0.00019977298904193437, + "loss": 1.4127, + "step": 64 + }, + { + "epoch": 0.052, + "grad_norm": 0.9454950054465096, + "learning_rate": 0.00019975519811066663, + "loss": 1.2724, + "step": 65 + }, + { + "epoch": 0.0528, + "grad_norm": 0.8458612707334442, + "learning_rate": 0.00019973673694024, + "loss": 1.1787, + "step": 66 + }, + { + "epoch": 0.0536, + "grad_norm": 1.175921001498925, + "learning_rate": 0.0001997176056546921, + "loss": 1.4735, + "step": 67 + }, + { + "epoch": 0.0544, + "grad_norm": 0.9928286314149518, + "learning_rate": 0.00019969780438256293, + "loss": 1.2568, + "step": 68 + }, + { + "epoch": 0.0552, + "grad_norm": 0.9383971568954167, + "learning_rate": 0.0001996773332568941, + "loss": 1.3237, + "step": 69 + }, + { + "epoch": 0.056, + "grad_norm": 1.0146098396729828, + "learning_rate": 0.0001996561924152278, + "loss": 1.3568, + "step": 70 + }, + { + "epoch": 0.0568, + "grad_norm": 0.9449158480327011, + "learning_rate": 0.00019963438199960599, + "loss": 1.2383, + "step": 71 + }, + { + "epoch": 0.0576, + "grad_norm": 0.9737420940105619, + "learning_rate": 0.0001996119021565693, + "loss": 1.3425, + "step": 72 + }, + { + "epoch": 0.0584, + "grad_norm": 1.0702776392118603, + "learning_rate": 0.00019958875303715615, + "loss": 1.2921, + "step": 73 + }, + { + "epoch": 0.0592, + "grad_norm": 0.8978325266237281, + "learning_rate": 0.0001995649347969019, + "loss": 1.3866, + "step": 74 + }, + { + "epoch": 0.06, + "grad_norm": 1.0761678411573041, + "learning_rate": 0.0001995404475958373, + "loss": 1.3481, + "step": 75 + }, + { + "epoch": 0.0608, + "grad_norm": 1.053564226804702, + "learning_rate": 0.00019951529159848805, + "loss": 1.3846, + "step": 76 + }, + { + "epoch": 0.0616, + "grad_norm": 0.9685816836019822, + "learning_rate": 0.0001994894669738732, + "loss": 1.2801, + "step": 77 + }, + { + "epoch": 0.0624, + "grad_norm": 1.2255497280368406, + "learning_rate": 0.00019946297389550433, + "loss": 1.4378, + "step": 78 + }, + { + "epoch": 0.0632, + "grad_norm": 1.0024763065869442, + "learning_rate": 0.0001994358125413841, + "loss": 1.2852, + "step": 79 + }, + { + "epoch": 0.064, + "grad_norm": 0.9526992946632433, + "learning_rate": 0.00019940798309400526, + "loss": 1.2434, + "step": 80 + }, + { + "epoch": 0.0648, + "grad_norm": 0.8944689402544411, + "learning_rate": 0.0001993794857403495, + "loss": 1.2831, + "step": 81 + }, + { + "epoch": 0.0656, + "grad_norm": 1.0642179704409769, + "learning_rate": 0.0001993503206718859, + "loss": 1.3311, + "step": 82 + }, + { + "epoch": 0.0664, + "grad_norm": 0.957225504372996, + "learning_rate": 0.0001993204880845699, + "loss": 1.3431, + "step": 83 + }, + { + "epoch": 0.0672, + "grad_norm": 1.0173050328208, + "learning_rate": 0.00019928998817884182, + "loss": 1.3324, + "step": 84 + }, + { + "epoch": 0.068, + "grad_norm": 1.020810730538368, + "learning_rate": 0.00019925882115962568, + "loss": 1.3101, + "step": 85 + }, + { + "epoch": 0.0688, + "grad_norm": 0.9494397858523248, + "learning_rate": 0.00019922698723632767, + "loss": 1.2692, + "step": 86 + }, + { + "epoch": 0.0696, + "grad_norm": 0.8914234539743199, + "learning_rate": 0.00019919448662283478, + "loss": 1.2945, + "step": 87 + }, + { + "epoch": 0.0704, + "grad_norm": 0.9344342732976146, + "learning_rate": 0.00019916131953751342, + "loss": 1.3109, + "step": 88 + }, + { + "epoch": 0.0712, + "grad_norm": 0.985271247374639, + "learning_rate": 0.00019912748620320794, + "loss": 1.4008, + "step": 89 + }, + { + "epoch": 0.072, + "grad_norm": 0.9247507759436328, + "learning_rate": 0.00019909298684723904, + "loss": 1.2126, + "step": 90 + }, + { + "epoch": 0.0728, + "grad_norm": 0.9521122192113574, + "learning_rate": 0.00019905782170140238, + "loss": 1.3568, + "step": 91 + }, + { + "epoch": 0.0736, + "grad_norm": 0.919839449603137, + "learning_rate": 0.00019902199100196697, + "loss": 1.272, + "step": 92 + }, + { + "epoch": 0.0744, + "grad_norm": 0.9313738933256456, + "learning_rate": 0.00019898549498967343, + "loss": 1.3046, + "step": 93 + }, + { + "epoch": 0.0752, + "grad_norm": 0.9274040344808508, + "learning_rate": 0.00019894833390973266, + "loss": 1.3611, + "step": 94 + }, + { + "epoch": 0.076, + "grad_norm": 0.932158258853334, + "learning_rate": 0.000198910508011824, + "loss": 1.2141, + "step": 95 + }, + { + "epoch": 0.0768, + "grad_norm": 1.2689889165504267, + "learning_rate": 0.00019887201755009357, + "loss": 1.4466, + "step": 96 + }, + { + "epoch": 0.0776, + "grad_norm": 0.981717390104989, + "learning_rate": 0.00019883286278315262, + "loss": 1.2913, + "step": 97 + }, + { + "epoch": 0.0784, + "grad_norm": 1.042409120435065, + "learning_rate": 0.0001987930439740757, + "loss": 1.3284, + "step": 98 + }, + { + "epoch": 0.0792, + "grad_norm": 1.0110680752501544, + "learning_rate": 0.00019875256139039902, + "loss": 1.4257, + "step": 99 + }, + { + "epoch": 0.08, + "grad_norm": 0.8952232892799127, + "learning_rate": 0.00019871141530411853, + "loss": 1.1767, + "step": 100 + }, + { + "epoch": 0.0808, + "grad_norm": 1.06830466342513, + "learning_rate": 0.00019866960599168826, + "loss": 1.3548, + "step": 101 + }, + { + "epoch": 0.0816, + "grad_norm": 1.2282347155104663, + "learning_rate": 0.0001986271337340182, + "loss": 1.4166, + "step": 102 + }, + { + "epoch": 0.0824, + "grad_norm": 0.8658617487000834, + "learning_rate": 0.0001985839988164726, + "loss": 1.2353, + "step": 103 + }, + { + "epoch": 0.0832, + "grad_norm": 0.9595117376855367, + "learning_rate": 0.00019854020152886814, + "loss": 1.2504, + "step": 104 + }, + { + "epoch": 0.084, + "grad_norm": 0.8922764304904715, + "learning_rate": 0.00019849574216547171, + "loss": 1.2383, + "step": 105 + }, + { + "epoch": 0.0848, + "grad_norm": 0.9569832187758008, + "learning_rate": 0.0001984506210249986, + "loss": 1.2801, + "step": 106 + }, + { + "epoch": 0.0856, + "grad_norm": 0.9279072735190277, + "learning_rate": 0.00019840483841061058, + "loss": 1.3134, + "step": 107 + }, + { + "epoch": 0.0864, + "grad_norm": 0.9343160937396461, + "learning_rate": 0.00019835839462991361, + "loss": 1.3676, + "step": 108 + }, + { + "epoch": 0.0872, + "grad_norm": 0.9729683033277049, + "learning_rate": 0.00019831128999495606, + "loss": 1.313, + "step": 109 + }, + { + "epoch": 0.088, + "grad_norm": 1.0499300246022472, + "learning_rate": 0.00019826352482222638, + "loss": 1.3321, + "step": 110 + }, + { + "epoch": 0.0888, + "grad_norm": 0.9508681720875867, + "learning_rate": 0.0001982150994326511, + "loss": 1.2798, + "step": 111 + }, + { + "epoch": 0.0896, + "grad_norm": 0.9677292544725961, + "learning_rate": 0.00019816601415159263, + "loss": 1.4221, + "step": 112 + }, + { + "epoch": 0.0904, + "grad_norm": 1.0112443013225598, + "learning_rate": 0.0001981162693088471, + "loss": 1.263, + "step": 113 + }, + { + "epoch": 0.0912, + "grad_norm": 1.014832326054943, + "learning_rate": 0.0001980658652386421, + "loss": 1.3264, + "step": 114 + }, + { + "epoch": 0.092, + "grad_norm": 1.013409537107313, + "learning_rate": 0.0001980148022796345, + "loss": 1.3215, + "step": 115 + }, + { + "epoch": 0.0928, + "grad_norm": 1.0274324617959099, + "learning_rate": 0.00019796308077490817, + "loss": 1.3629, + "step": 116 + }, + { + "epoch": 0.0936, + "grad_norm": 1.0713761880177823, + "learning_rate": 0.00019791070107197153, + "loss": 1.2939, + "step": 117 + }, + { + "epoch": 0.0944, + "grad_norm": 1.068251234875904, + "learning_rate": 0.00019785766352275542, + "loss": 1.447, + "step": 118 + }, + { + "epoch": 0.0952, + "grad_norm": 1.0559978592286707, + "learning_rate": 0.0001978039684836106, + "loss": 1.3106, + "step": 119 + }, + { + "epoch": 0.096, + "grad_norm": 0.9247675055087754, + "learning_rate": 0.00019774961631530545, + "loss": 1.3401, + "step": 120 + }, + { + "epoch": 0.0968, + "grad_norm": 0.9264122038131352, + "learning_rate": 0.0001976946073830234, + "loss": 1.3146, + "step": 121 + }, + { + "epoch": 0.0976, + "grad_norm": 0.914783829048558, + "learning_rate": 0.00019763894205636072, + "loss": 1.3073, + "step": 122 + }, + { + "epoch": 0.0984, + "grad_norm": 1.0154466441114909, + "learning_rate": 0.00019758262070932375, + "loss": 1.3817, + "step": 123 + }, + { + "epoch": 0.0992, + "grad_norm": 0.910123856772834, + "learning_rate": 0.00019752564372032657, + "loss": 1.234, + "step": 124 + }, + { + "epoch": 0.1, + "grad_norm": 1.1034973206566288, + "learning_rate": 0.00019746801147218842, + "loss": 1.3984, + "step": 125 + }, + { + "epoch": 0.1008, + "grad_norm": 1.0167835565624685, + "learning_rate": 0.00019740972435213115, + "loss": 1.3065, + "step": 126 + }, + { + "epoch": 0.1016, + "grad_norm": 0.9541771221806903, + "learning_rate": 0.00019735078275177654, + "loss": 1.311, + "step": 127 + }, + { + "epoch": 0.1024, + "grad_norm": 1.072005491293411, + "learning_rate": 0.00019729118706714375, + "loss": 1.2649, + "step": 128 + }, + { + "epoch": 0.1032, + "grad_norm": 0.9222742105301948, + "learning_rate": 0.00019723093769864663, + "loss": 1.2939, + "step": 129 + }, + { + "epoch": 0.104, + "grad_norm": 0.9182690156303601, + "learning_rate": 0.00019717003505109095, + "loss": 1.2296, + "step": 130 + }, + { + "epoch": 0.1048, + "grad_norm": 0.873328914275884, + "learning_rate": 0.0001971084795336719, + "loss": 1.14, + "step": 131 + }, + { + "epoch": 0.1056, + "grad_norm": 0.89808663605913, + "learning_rate": 0.00019704627155997108, + "loss": 1.2603, + "step": 132 + }, + { + "epoch": 0.1064, + "grad_norm": 0.8942697931590334, + "learning_rate": 0.00019698341154795389, + "loss": 1.283, + "step": 133 + }, + { + "epoch": 0.1072, + "grad_norm": 0.9863497164075619, + "learning_rate": 0.00019691989991996663, + "loss": 1.3498, + "step": 134 + }, + { + "epoch": 0.108, + "grad_norm": 1.065122487379581, + "learning_rate": 0.00019685573710273376, + "loss": 1.4105, + "step": 135 + }, + { + "epoch": 0.1088, + "grad_norm": 0.9588226639761218, + "learning_rate": 0.0001967909235273549, + "loss": 1.3338, + "step": 136 + }, + { + "epoch": 0.1096, + "grad_norm": 1.1423314012184513, + "learning_rate": 0.00019672545962930215, + "loss": 1.2981, + "step": 137 + }, + { + "epoch": 0.1104, + "grad_norm": 0.9877321537388655, + "learning_rate": 0.00019665934584841682, + "loss": 1.1558, + "step": 138 + }, + { + "epoch": 0.1112, + "grad_norm": 0.9823929913907383, + "learning_rate": 0.00019659258262890683, + "loss": 1.2956, + "step": 139 + }, + { + "epoch": 0.112, + "grad_norm": 1.0129620806874113, + "learning_rate": 0.00019652517041934356, + "loss": 1.3666, + "step": 140 + }, + { + "epoch": 0.1128, + "grad_norm": 1.1160502930942982, + "learning_rate": 0.00019645710967265882, + "loss": 1.2756, + "step": 141 + }, + { + "epoch": 0.1136, + "grad_norm": 0.9959529425803797, + "learning_rate": 0.00019638840084614182, + "loss": 1.3355, + "step": 142 + }, + { + "epoch": 0.1144, + "grad_norm": 0.9717112645235756, + "learning_rate": 0.00019631904440143612, + "loss": 1.2239, + "step": 143 + }, + { + "epoch": 0.1152, + "grad_norm": 0.9263133806847209, + "learning_rate": 0.00019624904080453655, + "loss": 1.2585, + "step": 144 + }, + { + "epoch": 0.116, + "grad_norm": 1.1246517730989027, + "learning_rate": 0.00019617839052578603, + "loss": 1.1816, + "step": 145 + }, + { + "epoch": 0.1168, + "grad_norm": 1.0210343295751094, + "learning_rate": 0.00019610709403987246, + "loss": 1.2572, + "step": 146 + }, + { + "epoch": 0.1176, + "grad_norm": 1.1435629451680012, + "learning_rate": 0.0001960351518258255, + "loss": 1.2142, + "step": 147 + }, + { + "epoch": 0.1184, + "grad_norm": 0.8973726237006526, + "learning_rate": 0.00019596256436701324, + "loss": 1.2267, + "step": 148 + }, + { + "epoch": 0.1192, + "grad_norm": 0.9375307020959045, + "learning_rate": 0.00019588933215113926, + "loss": 1.1957, + "step": 149 + }, + { + "epoch": 0.12, + "grad_norm": 0.8877654959306899, + "learning_rate": 0.000195815455670239, + "loss": 1.3156, + "step": 150 + }, + { + "epoch": 0.1208, + "grad_norm": 0.9237560489461866, + "learning_rate": 0.00019574093542067673, + "loss": 1.219, + "step": 151 + }, + { + "epoch": 0.1216, + "grad_norm": 0.9940201908718649, + "learning_rate": 0.00019566577190314197, + "loss": 1.3696, + "step": 152 + }, + { + "epoch": 0.1224, + "grad_norm": 0.9504882303629454, + "learning_rate": 0.0001955899656226464, + "loss": 1.2064, + "step": 153 + }, + { + "epoch": 0.1232, + "grad_norm": 0.9333353719171309, + "learning_rate": 0.0001955135170885202, + "loss": 1.2968, + "step": 154 + }, + { + "epoch": 0.124, + "grad_norm": 0.9698434067734869, + "learning_rate": 0.0001954364268144088, + "loss": 1.34, + "step": 155 + }, + { + "epoch": 0.1248, + "grad_norm": 0.9553092369735782, + "learning_rate": 0.00019535869531826937, + "loss": 1.2127, + "step": 156 + }, + { + "epoch": 0.1256, + "grad_norm": 0.9009311579341852, + "learning_rate": 0.00019528032312236736, + "loss": 1.2811, + "step": 157 + }, + { + "epoch": 0.1264, + "grad_norm": 0.9870694380783888, + "learning_rate": 0.00019520131075327298, + "loss": 1.3883, + "step": 158 + }, + { + "epoch": 0.1272, + "grad_norm": 0.9961378055276119, + "learning_rate": 0.00019512165874185767, + "loss": 1.4485, + "step": 159 + }, + { + "epoch": 0.128, + "grad_norm": 1.0449183624088734, + "learning_rate": 0.00019504136762329047, + "loss": 1.3765, + "step": 160 + }, + { + "epoch": 0.1288, + "grad_norm": 0.8615043474531041, + "learning_rate": 0.0001949604379370345, + "loss": 1.0767, + "step": 161 + }, + { + "epoch": 0.1296, + "grad_norm": 1.0528187542828291, + "learning_rate": 0.00019487887022684336, + "loss": 1.2594, + "step": 162 + }, + { + "epoch": 0.1304, + "grad_norm": 1.1340945053685607, + "learning_rate": 0.00019479666504075736, + "loss": 1.3687, + "step": 163 + }, + { + "epoch": 0.1312, + "grad_norm": 0.9501450942419837, + "learning_rate": 0.00019471382293110003, + "loss": 1.2395, + "step": 164 + }, + { + "epoch": 0.132, + "grad_norm": 1.045675073511293, + "learning_rate": 0.0001946303444544741, + "loss": 1.4314, + "step": 165 + }, + { + "epoch": 0.1328, + "grad_norm": 0.8775108529320873, + "learning_rate": 0.00019454623017175812, + "loss": 1.233, + "step": 166 + }, + { + "epoch": 0.1336, + "grad_norm": 1.0583587466802307, + "learning_rate": 0.00019446148064810242, + "loss": 1.3102, + "step": 167 + }, + { + "epoch": 0.1344, + "grad_norm": 0.9559765272138694, + "learning_rate": 0.00019437609645292546, + "loss": 1.2742, + "step": 168 + }, + { + "epoch": 0.1352, + "grad_norm": 0.9777314716016529, + "learning_rate": 0.00019429007815990993, + "loss": 1.3153, + "step": 169 + }, + { + "epoch": 0.136, + "grad_norm": 1.002553842293866, + "learning_rate": 0.0001942034263469989, + "loss": 1.2063, + "step": 170 + }, + { + "epoch": 0.1368, + "grad_norm": 0.9095180233066996, + "learning_rate": 0.00019411614159639204, + "loss": 1.1509, + "step": 171 + }, + { + "epoch": 0.1376, + "grad_norm": 0.9465664389556642, + "learning_rate": 0.00019402822449454153, + "loss": 1.2949, + "step": 172 + }, + { + "epoch": 0.1384, + "grad_norm": 0.9679809142755188, + "learning_rate": 0.00019393967563214833, + "loss": 1.2996, + "step": 173 + }, + { + "epoch": 0.1392, + "grad_norm": 1.0395885787436914, + "learning_rate": 0.00019385049560415794, + "loss": 1.4069, + "step": 174 + }, + { + "epoch": 0.14, + "grad_norm": 0.8891529964391287, + "learning_rate": 0.00019376068500975667, + "loss": 1.2993, + "step": 175 + }, + { + "epoch": 0.1408, + "grad_norm": 0.9225541873816892, + "learning_rate": 0.00019367024445236754, + "loss": 1.219, + "step": 176 + }, + { + "epoch": 0.1416, + "grad_norm": 0.8655317895338619, + "learning_rate": 0.000193579174539646, + "loss": 1.2356, + "step": 177 + }, + { + "epoch": 0.1424, + "grad_norm": 0.9155349545526035, + "learning_rate": 0.00019348747588347637, + "loss": 1.2478, + "step": 178 + }, + { + "epoch": 0.1432, + "grad_norm": 0.9919240587073496, + "learning_rate": 0.00019339514909996706, + "loss": 1.2888, + "step": 179 + }, + { + "epoch": 0.144, + "grad_norm": 0.9112547413510824, + "learning_rate": 0.00019330219480944694, + "loss": 1.2623, + "step": 180 + }, + { + "epoch": 0.1448, + "grad_norm": 0.9308186740534311, + "learning_rate": 0.00019320861363646095, + "loss": 1.2761, + "step": 181 + }, + { + "epoch": 0.1456, + "grad_norm": 0.9196186440867572, + "learning_rate": 0.00019311440620976597, + "loss": 1.254, + "step": 182 + }, + { + "epoch": 0.1464, + "grad_norm": 0.9142485565040559, + "learning_rate": 0.00019301957316232658, + "loss": 1.2618, + "step": 183 + }, + { + "epoch": 0.1472, + "grad_norm": 0.848240080243922, + "learning_rate": 0.0001929241151313108, + "loss": 1.2474, + "step": 184 + }, + { + "epoch": 0.148, + "grad_norm": 1.010522331993527, + "learning_rate": 0.0001928280327580858, + "loss": 1.2613, + "step": 185 + }, + { + "epoch": 0.1488, + "grad_norm": 0.9711072419826322, + "learning_rate": 0.00019273132668821364, + "loss": 1.3189, + "step": 186 + }, + { + "epoch": 0.1496, + "grad_norm": 0.9538559181814769, + "learning_rate": 0.00019263399757144683, + "loss": 1.3093, + "step": 187 + }, + { + "epoch": 0.1504, + "grad_norm": 1.0971831439930473, + "learning_rate": 0.00019253604606172417, + "loss": 1.4779, + "step": 188 + }, + { + "epoch": 0.1512, + "grad_norm": 0.8851820843384431, + "learning_rate": 0.000192437472817166, + "loss": 1.1911, + "step": 189 + }, + { + "epoch": 0.152, + "grad_norm": 0.9551242000889022, + "learning_rate": 0.00019233827850007027, + "loss": 1.3758, + "step": 190 + }, + { + "epoch": 0.1528, + "grad_norm": 0.9441705161533531, + "learning_rate": 0.00019223846377690754, + "loss": 1.2271, + "step": 191 + }, + { + "epoch": 0.1536, + "grad_norm": 0.9836464031073223, + "learning_rate": 0.00019213802931831696, + "loss": 1.2756, + "step": 192 + }, + { + "epoch": 0.1544, + "grad_norm": 0.908853294679305, + "learning_rate": 0.00019203697579910154, + "loss": 1.1848, + "step": 193 + }, + { + "epoch": 0.1552, + "grad_norm": 0.8494105866506043, + "learning_rate": 0.00019193530389822363, + "loss": 1.1351, + "step": 194 + }, + { + "epoch": 0.156, + "grad_norm": 0.9247520768609205, + "learning_rate": 0.00019183301429880043, + "loss": 1.3106, + "step": 195 + }, + { + "epoch": 0.1568, + "grad_norm": 0.9433884410895655, + "learning_rate": 0.00019173010768809933, + "loss": 1.3105, + "step": 196 + }, + { + "epoch": 0.1576, + "grad_norm": 1.0114886408840793, + "learning_rate": 0.00019162658475753327, + "loss": 1.4155, + "step": 197 + }, + { + "epoch": 0.1584, + "grad_norm": 0.9676678745331979, + "learning_rate": 0.0001915224462026563, + "loss": 1.313, + "step": 198 + }, + { + "epoch": 0.1592, + "grad_norm": 1.1155683911355467, + "learning_rate": 0.00019141769272315858, + "loss": 1.2428, + "step": 199 + }, + { + "epoch": 0.16, + "grad_norm": 0.879055283601944, + "learning_rate": 0.00019131232502286188, + "loss": 1.1823, + "step": 200 + }, + { + "epoch": 0.1608, + "grad_norm": 1.0153817814809485, + "learning_rate": 0.00019120634380971496, + "loss": 1.2951, + "step": 201 + }, + { + "epoch": 0.1616, + "grad_norm": 0.9830999073990369, + "learning_rate": 0.0001910997497957885, + "loss": 1.2347, + "step": 202 + }, + { + "epoch": 0.1624, + "grad_norm": 0.9347508555457964, + "learning_rate": 0.0001909925436972706, + "loss": 1.2941, + "step": 203 + }, + { + "epoch": 0.1632, + "grad_norm": 0.936274005635992, + "learning_rate": 0.00019088472623446183, + "loss": 1.2885, + "step": 204 + }, + { + "epoch": 0.164, + "grad_norm": 1.104866636661514, + "learning_rate": 0.00019077629813177036, + "loss": 1.2579, + "step": 205 + }, + { + "epoch": 0.1648, + "grad_norm": 0.9673565382699655, + "learning_rate": 0.00019066726011770726, + "loss": 1.2739, + "step": 206 + }, + { + "epoch": 0.1656, + "grad_norm": 1.0272681517131834, + "learning_rate": 0.00019055761292488142, + "loss": 1.4015, + "step": 207 + }, + { + "epoch": 0.1664, + "grad_norm": 0.908957962895597, + "learning_rate": 0.0001904473572899947, + "loss": 1.1708, + "step": 208 + }, + { + "epoch": 0.1672, + "grad_norm": 0.9940800186635145, + "learning_rate": 0.00019033649395383702, + "loss": 1.2064, + "step": 209 + }, + { + "epoch": 0.168, + "grad_norm": 0.9269814183548376, + "learning_rate": 0.00019022502366128135, + "loss": 1.1778, + "step": 210 + }, + { + "epoch": 0.1688, + "grad_norm": 0.9549520056237327, + "learning_rate": 0.00019011294716127867, + "loss": 1.3004, + "step": 211 + }, + { + "epoch": 0.1696, + "grad_norm": 0.9302851719343455, + "learning_rate": 0.00019000026520685302, + "loss": 1.2855, + "step": 212 + }, + { + "epoch": 0.1704, + "grad_norm": 0.9543168060402824, + "learning_rate": 0.0001898869785550963, + "loss": 1.2612, + "step": 213 + }, + { + "epoch": 0.1712, + "grad_norm": 0.9215352284559458, + "learning_rate": 0.0001897730879671634, + "loss": 1.3013, + "step": 214 + }, + { + "epoch": 0.172, + "grad_norm": 0.9373405519430387, + "learning_rate": 0.00018965859420826684, + "loss": 1.3069, + "step": 215 + }, + { + "epoch": 0.1728, + "grad_norm": 0.9109028462151921, + "learning_rate": 0.00018954349804767184, + "loss": 1.289, + "step": 216 + }, + { + "epoch": 0.1736, + "grad_norm": 1.0417375130960227, + "learning_rate": 0.00018942780025869098, + "loss": 1.3804, + "step": 217 + }, + { + "epoch": 0.1744, + "grad_norm": 0.8992185336285579, + "learning_rate": 0.00018931150161867916, + "loss": 1.2244, + "step": 218 + }, + { + "epoch": 0.1752, + "grad_norm": 0.9498360222597024, + "learning_rate": 0.00018919460290902826, + "loss": 1.2513, + "step": 219 + }, + { + "epoch": 0.176, + "grad_norm": 0.9137276089471331, + "learning_rate": 0.00018907710491516199, + "loss": 1.2456, + "step": 220 + }, + { + "epoch": 0.1768, + "grad_norm": 0.9343334801114195, + "learning_rate": 0.0001889590084265304, + "loss": 1.2453, + "step": 221 + }, + { + "epoch": 0.1776, + "grad_norm": 0.902300229292226, + "learning_rate": 0.0001888403142366049, + "loss": 1.2514, + "step": 222 + }, + { + "epoch": 0.1784, + "grad_norm": 0.9627526933565559, + "learning_rate": 0.0001887210231428727, + "loss": 1.2534, + "step": 223 + }, + { + "epoch": 0.1792, + "grad_norm": 0.9707956350087426, + "learning_rate": 0.00018860113594683148, + "loss": 1.1498, + "step": 224 + }, + { + "epoch": 0.18, + "grad_norm": 1.1040611555514872, + "learning_rate": 0.0001884806534539841, + "loss": 1.3959, + "step": 225 + }, + { + "epoch": 0.1808, + "grad_norm": 0.957089113833274, + "learning_rate": 0.00018835957647383303, + "loss": 1.2859, + "step": 226 + }, + { + "epoch": 0.1816, + "grad_norm": 0.8774015715700482, + "learning_rate": 0.0001882379058198751, + "loss": 1.1675, + "step": 227 + }, + { + "epoch": 0.1824, + "grad_norm": 0.9433319527481238, + "learning_rate": 0.00018811564230959588, + "loss": 1.143, + "step": 228 + }, + { + "epoch": 0.1832, + "grad_norm": 1.0214888795909183, + "learning_rate": 0.00018799278676446423, + "loss": 1.2955, + "step": 229 + }, + { + "epoch": 0.184, + "grad_norm": 0.9616667839156496, + "learning_rate": 0.00018786934000992688, + "loss": 1.2678, + "step": 230 + }, + { + "epoch": 0.1848, + "grad_norm": 0.8891852551275778, + "learning_rate": 0.00018774530287540278, + "loss": 1.2647, + "step": 231 + }, + { + "epoch": 0.1856, + "grad_norm": 0.8802953650380777, + "learning_rate": 0.00018762067619427746, + "loss": 1.1692, + "step": 232 + }, + { + "epoch": 0.1864, + "grad_norm": 0.8313635670238767, + "learning_rate": 0.00018749546080389757, + "loss": 1.2354, + "step": 233 + }, + { + "epoch": 0.1872, + "grad_norm": 1.0199744757784006, + "learning_rate": 0.00018736965754556528, + "loss": 1.1035, + "step": 234 + }, + { + "epoch": 0.188, + "grad_norm": 0.8611064620021869, + "learning_rate": 0.00018724326726453244, + "loss": 1.2388, + "step": 235 + }, + { + "epoch": 0.1888, + "grad_norm": 0.8735673609655413, + "learning_rate": 0.00018711629080999504, + "loss": 1.1727, + "step": 236 + }, + { + "epoch": 0.1896, + "grad_norm": 0.8901145078375076, + "learning_rate": 0.00018698872903508755, + "loss": 1.2797, + "step": 237 + }, + { + "epoch": 0.1904, + "grad_norm": 0.9652231294142449, + "learning_rate": 0.00018686058279687698, + "loss": 1.3121, + "step": 238 + }, + { + "epoch": 0.1912, + "grad_norm": 0.9104393378234763, + "learning_rate": 0.0001867318529563574, + "loss": 1.1945, + "step": 239 + }, + { + "epoch": 0.192, + "grad_norm": 1.0112227771918163, + "learning_rate": 0.00018660254037844388, + "loss": 1.2843, + "step": 240 + }, + { + "epoch": 0.1928, + "grad_norm": 0.8630469893675877, + "learning_rate": 0.00018647264593196688, + "loss": 1.2034, + "step": 241 + }, + { + "epoch": 0.1936, + "grad_norm": 0.9188347046864217, + "learning_rate": 0.00018634217048966637, + "loss": 1.1626, + "step": 242 + }, + { + "epoch": 0.1944, + "grad_norm": 0.9278100650647194, + "learning_rate": 0.00018621111492818585, + "loss": 1.1024, + "step": 243 + }, + { + "epoch": 0.1952, + "grad_norm": 1.0464323384986105, + "learning_rate": 0.0001860794801280666, + "loss": 1.3174, + "step": 244 + }, + { + "epoch": 0.196, + "grad_norm": 0.9851443925974323, + "learning_rate": 0.00018594726697374175, + "loss": 1.2532, + "step": 245 + }, + { + "epoch": 0.1968, + "grad_norm": 1.0030662498592748, + "learning_rate": 0.0001858144763535302, + "loss": 1.3867, + "step": 246 + }, + { + "epoch": 0.1976, + "grad_norm": 0.8551821476650675, + "learning_rate": 0.0001856811091596308, + "loss": 1.1021, + "step": 247 + }, + { + "epoch": 0.1984, + "grad_norm": 0.916813509295243, + "learning_rate": 0.0001855471662881164, + "loss": 1.1815, + "step": 248 + }, + { + "epoch": 0.1992, + "grad_norm": 0.9293644866629288, + "learning_rate": 0.00018541264863892754, + "loss": 1.2278, + "step": 249 + }, + { + "epoch": 0.2, + "grad_norm": 0.9157513564772256, + "learning_rate": 0.00018527755711586678, + "loss": 1.342, + "step": 250 + }, + { + "epoch": 0.2008, + "grad_norm": 0.882577045734966, + "learning_rate": 0.00018514189262659235, + "loss": 1.2078, + "step": 251 + }, + { + "epoch": 0.2016, + "grad_norm": 1.0411934134353433, + "learning_rate": 0.00018500565608261214, + "loss": 1.3107, + "step": 252 + }, + { + "epoch": 0.2024, + "grad_norm": 1.0487036682648194, + "learning_rate": 0.00018486884839927768, + "loss": 1.2408, + "step": 253 + }, + { + "epoch": 0.2032, + "grad_norm": 0.996003176964268, + "learning_rate": 0.00018473147049577774, + "loss": 1.3839, + "step": 254 + }, + { + "epoch": 0.204, + "grad_norm": 0.9993503556381033, + "learning_rate": 0.0001845935232951325, + "loss": 1.2378, + "step": 255 + }, + { + "epoch": 0.2048, + "grad_norm": 0.9735271694798517, + "learning_rate": 0.00018445500772418697, + "loss": 1.2637, + "step": 256 + }, + { + "epoch": 0.2056, + "grad_norm": 1.0260426667918034, + "learning_rate": 0.00018431592471360503, + "loss": 1.295, + "step": 257 + }, + { + "epoch": 0.2064, + "grad_norm": 1.8934073314300326, + "learning_rate": 0.00018417627519786315, + "loss": 1.2806, + "step": 258 + }, + { + "epoch": 0.2072, + "grad_norm": 1.1055103869913467, + "learning_rate": 0.000184036060115244, + "loss": 1.1872, + "step": 259 + }, + { + "epoch": 0.208, + "grad_norm": 0.9182298549291064, + "learning_rate": 0.00018389528040783012, + "loss": 1.2763, + "step": 260 + }, + { + "epoch": 0.2088, + "grad_norm": 0.9957870148143394, + "learning_rate": 0.00018375393702149787, + "loss": 1.2213, + "step": 261 + }, + { + "epoch": 0.2096, + "grad_norm": 1.0231771013527717, + "learning_rate": 0.00018361203090591071, + "loss": 1.3007, + "step": 262 + }, + { + "epoch": 0.2104, + "grad_norm": 1.053869176624131, + "learning_rate": 0.00018346956301451304, + "loss": 1.228, + "step": 263 + }, + { + "epoch": 0.2112, + "grad_norm": 1.0541928753871113, + "learning_rate": 0.00018332653430452376, + "loss": 1.3133, + "step": 264 + }, + { + "epoch": 0.212, + "grad_norm": 1.048461099911668, + "learning_rate": 0.00018318294573692985, + "loss": 1.2627, + "step": 265 + }, + { + "epoch": 0.2128, + "grad_norm": 0.9680026687318783, + "learning_rate": 0.00018303879827647975, + "loss": 1.2683, + "step": 266 + }, + { + "epoch": 0.2136, + "grad_norm": 0.9159924136404415, + "learning_rate": 0.0001828940928916772, + "loss": 1.1734, + "step": 267 + }, + { + "epoch": 0.2144, + "grad_norm": 0.9493743000419643, + "learning_rate": 0.00018274883055477436, + "loss": 1.3051, + "step": 268 + }, + { + "epoch": 0.2152, + "grad_norm": 0.7984846876543369, + "learning_rate": 0.00018260301224176558, + "loss": 1.2875, + "step": 269 + }, + { + "epoch": 0.216, + "grad_norm": 1.0588105456539831, + "learning_rate": 0.00018245663893238075, + "loss": 1.3482, + "step": 270 + }, + { + "epoch": 0.2168, + "grad_norm": 0.8617306722077073, + "learning_rate": 0.00018230971161007853, + "loss": 1.2347, + "step": 271 + }, + { + "epoch": 0.2176, + "grad_norm": 0.9201580389065354, + "learning_rate": 0.00018216223126204007, + "loss": 1.3656, + "step": 272 + }, + { + "epoch": 0.2184, + "grad_norm": 0.9632315380099041, + "learning_rate": 0.00018201419887916214, + "loss": 1.3028, + "step": 273 + }, + { + "epoch": 0.2192, + "grad_norm": 1.036405616549462, + "learning_rate": 0.00018186561545605054, + "loss": 1.2665, + "step": 274 + }, + { + "epoch": 0.22, + "grad_norm": 0.8722582297782859, + "learning_rate": 0.00018171648199101346, + "loss": 1.2154, + "step": 275 + }, + { + "epoch": 0.2208, + "grad_norm": 0.9598056669866919, + "learning_rate": 0.00018156679948605467, + "loss": 1.2157, + "step": 276 + }, + { + "epoch": 0.2216, + "grad_norm": 0.9093669858230348, + "learning_rate": 0.00018141656894686689, + "loss": 1.217, + "step": 277 + }, + { + "epoch": 0.2224, + "grad_norm": 0.9301820924793702, + "learning_rate": 0.00018126579138282503, + "loss": 1.099, + "step": 278 + }, + { + "epoch": 0.2232, + "grad_norm": 0.8515405402797095, + "learning_rate": 0.00018111446780697929, + "loss": 1.1085, + "step": 279 + }, + { + "epoch": 0.224, + "grad_norm": 0.9886837697985009, + "learning_rate": 0.0001809625992360485, + "loss": 1.2912, + "step": 280 + }, + { + "epoch": 0.2248, + "grad_norm": 0.9108321566956064, + "learning_rate": 0.00018081018669041324, + "loss": 1.3845, + "step": 281 + }, + { + "epoch": 0.2256, + "grad_norm": 1.020152530259783, + "learning_rate": 0.00018065723119410884, + "loss": 1.2758, + "step": 282 + }, + { + "epoch": 0.2264, + "grad_norm": 0.9135337960724216, + "learning_rate": 0.00018050373377481878, + "loss": 1.2935, + "step": 283 + }, + { + "epoch": 0.2272, + "grad_norm": 1.003246670931465, + "learning_rate": 0.00018034969546386757, + "loss": 1.2846, + "step": 284 + }, + { + "epoch": 0.228, + "grad_norm": 0.9908671594294807, + "learning_rate": 0.0001801951172962139, + "loss": 1.2798, + "step": 285 + }, + { + "epoch": 0.2288, + "grad_norm": 1.0046666460518803, + "learning_rate": 0.0001800400003104436, + "loss": 1.3153, + "step": 286 + }, + { + "epoch": 0.2296, + "grad_norm": 1.0050270897576201, + "learning_rate": 0.0001798843455487629, + "loss": 1.2318, + "step": 287 + }, + { + "epoch": 0.2304, + "grad_norm": 0.8898448753827383, + "learning_rate": 0.00017972815405699103, + "loss": 1.2443, + "step": 288 + }, + { + "epoch": 0.2312, + "grad_norm": 0.8456349163049246, + "learning_rate": 0.00017957142688455362, + "loss": 1.1138, + "step": 289 + }, + { + "epoch": 0.232, + "grad_norm": 0.9006103289307575, + "learning_rate": 0.00017941416508447536, + "loss": 1.3603, + "step": 290 + }, + { + "epoch": 0.2328, + "grad_norm": 1.1364506597841737, + "learning_rate": 0.00017925636971337304, + "loss": 1.3077, + "step": 291 + }, + { + "epoch": 0.2336, + "grad_norm": 0.891233671680888, + "learning_rate": 0.0001790980418314484, + "loss": 1.2842, + "step": 292 + }, + { + "epoch": 0.2344, + "grad_norm": 0.9822838216631534, + "learning_rate": 0.00017893918250248104, + "loss": 1.1698, + "step": 293 + }, + { + "epoch": 0.2352, + "grad_norm": 0.9004032270433734, + "learning_rate": 0.00017877979279382135, + "loss": 1.3045, + "step": 294 + }, + { + "epoch": 0.236, + "grad_norm": 0.8809100040469414, + "learning_rate": 0.00017861987377638312, + "loss": 1.3591, + "step": 295 + }, + { + "epoch": 0.2368, + "grad_norm": 1.0363156271259157, + "learning_rate": 0.0001784594265246366, + "loss": 1.2574, + "step": 296 + }, + { + "epoch": 0.2376, + "grad_norm": 0.8941089337345295, + "learning_rate": 0.0001782984521166011, + "loss": 1.1797, + "step": 297 + }, + { + "epoch": 0.2384, + "grad_norm": 0.9854339490702365, + "learning_rate": 0.0001781369516338378, + "loss": 1.1502, + "step": 298 + }, + { + "epoch": 0.2392, + "grad_norm": 1.162057797328878, + "learning_rate": 0.00017797492616144256, + "loss": 1.4165, + "step": 299 + }, + { + "epoch": 0.24, + "grad_norm": 0.8667128774735741, + "learning_rate": 0.00017781237678803847, + "loss": 1.0879, + "step": 300 + }, + { + "epoch": 0.2408, + "grad_norm": 0.8755236610189415, + "learning_rate": 0.00017764930460576866, + "loss": 1.1483, + "step": 301 + }, + { + "epoch": 0.2416, + "grad_norm": 1.0760108468427834, + "learning_rate": 0.000177485710710289, + "loss": 1.3106, + "step": 302 + }, + { + "epoch": 0.2424, + "grad_norm": 1.0103325474819502, + "learning_rate": 0.00017732159620076053, + "loss": 1.2281, + "step": 303 + }, + { + "epoch": 0.2432, + "grad_norm": 0.8605645733888942, + "learning_rate": 0.00017715696217984235, + "loss": 1.2049, + "step": 304 + }, + { + "epoch": 0.244, + "grad_norm": 0.8892076278648906, + "learning_rate": 0.00017699180975368396, + "loss": 1.3148, + "step": 305 + }, + { + "epoch": 0.2448, + "grad_norm": 0.9676847990497317, + "learning_rate": 0.00017682614003191807, + "loss": 1.2684, + "step": 306 + }, + { + "epoch": 0.2456, + "grad_norm": 1.0896990899045311, + "learning_rate": 0.00017665995412765285, + "loss": 1.3011, + "step": 307 + }, + { + "epoch": 0.2464, + "grad_norm": 0.948038100131218, + "learning_rate": 0.00017649325315746478, + "loss": 1.2795, + "step": 308 + }, + { + "epoch": 0.2472, + "grad_norm": 0.9585751723182749, + "learning_rate": 0.00017632603824139085, + "loss": 1.2842, + "step": 309 + }, + { + "epoch": 0.248, + "grad_norm": 0.8352758228615104, + "learning_rate": 0.0001761583105029213, + "loss": 1.1673, + "step": 310 + }, + { + "epoch": 0.2488, + "grad_norm": 0.9235985259875558, + "learning_rate": 0.0001759900710689918, + "loss": 1.1964, + "step": 311 + }, + { + "epoch": 0.2496, + "grad_norm": 0.9045244602385455, + "learning_rate": 0.00017582132106997616, + "loss": 1.2193, + "step": 312 + }, + { + "epoch": 0.2504, + "grad_norm": 0.8395526289182972, + "learning_rate": 0.00017565206163967846, + "loss": 1.1979, + "step": 313 + }, + { + "epoch": 0.2512, + "grad_norm": 0.9196580961351944, + "learning_rate": 0.00017548229391532572, + "loss": 1.1456, + "step": 314 + }, + { + "epoch": 0.252, + "grad_norm": 0.8143014058708408, + "learning_rate": 0.00017531201903755994, + "loss": 1.08, + "step": 315 + }, + { + "epoch": 0.2528, + "grad_norm": 1.3447886704942074, + "learning_rate": 0.00017514123815043074, + "loss": 1.3336, + "step": 316 + }, + { + "epoch": 0.2536, + "grad_norm": 0.9149130861662886, + "learning_rate": 0.00017496995240138744, + "loss": 1.2199, + "step": 317 + }, + { + "epoch": 0.2544, + "grad_norm": 0.9363131784977247, + "learning_rate": 0.00017479816294127152, + "loss": 1.3458, + "step": 318 + }, + { + "epoch": 0.2552, + "grad_norm": 0.9245883146194218, + "learning_rate": 0.00017462587092430875, + "loss": 1.2124, + "step": 319 + }, + { + "epoch": 0.256, + "grad_norm": 0.8623535081047469, + "learning_rate": 0.0001744530775081015, + "loss": 1.1537, + "step": 320 + }, + { + "epoch": 0.2568, + "grad_norm": 0.8261285070810973, + "learning_rate": 0.00017427978385362112, + "loss": 1.2583, + "step": 321 + }, + { + "epoch": 0.2576, + "grad_norm": 1.0443091644941611, + "learning_rate": 0.0001741059911251997, + "loss": 1.3473, + "step": 322 + }, + { + "epoch": 0.2584, + "grad_norm": 1.1936136897937293, + "learning_rate": 0.0001739317004905227, + "loss": 1.3646, + "step": 323 + }, + { + "epoch": 0.2592, + "grad_norm": 0.9618183784456393, + "learning_rate": 0.000173756913120621, + "loss": 1.195, + "step": 324 + }, + { + "epoch": 0.26, + "grad_norm": 0.8834444187506799, + "learning_rate": 0.00017358163018986282, + "loss": 1.1462, + "step": 325 + }, + { + "epoch": 0.2608, + "grad_norm": 1.0644742450260856, + "learning_rate": 0.00017340585287594604, + "loss": 1.2692, + "step": 326 + }, + { + "epoch": 0.2616, + "grad_norm": 0.9763081988106593, + "learning_rate": 0.00017322958235989016, + "loss": 1.3289, + "step": 327 + }, + { + "epoch": 0.2624, + "grad_norm": 0.9067380840876519, + "learning_rate": 0.0001730528198260285, + "loss": 1.2101, + "step": 328 + }, + { + "epoch": 0.2632, + "grad_norm": 0.9877674117491254, + "learning_rate": 0.00017287556646200018, + "loss": 1.2473, + "step": 329 + }, + { + "epoch": 0.264, + "grad_norm": 1.0166560289485393, + "learning_rate": 0.00017269782345874203, + "loss": 1.2879, + "step": 330 + }, + { + "epoch": 0.2648, + "grad_norm": 0.8867253833662876, + "learning_rate": 0.00017251959201048083, + "loss": 1.204, + "step": 331 + }, + { + "epoch": 0.2656, + "grad_norm": 1.0222895737290478, + "learning_rate": 0.00017234087331472497, + "loss": 1.3303, + "step": 332 + }, + { + "epoch": 0.2664, + "grad_norm": 0.9297052904091985, + "learning_rate": 0.00017216166857225674, + "loss": 1.2158, + "step": 333 + }, + { + "epoch": 0.2672, + "grad_norm": 1.113950668125169, + "learning_rate": 0.00017198197898712404, + "loss": 1.4063, + "step": 334 + }, + { + "epoch": 0.268, + "grad_norm": 0.8315094003529999, + "learning_rate": 0.00017180180576663228, + "loss": 1.2375, + "step": 335 + }, + { + "epoch": 0.2688, + "grad_norm": 1.0132794538892895, + "learning_rate": 0.00017162115012133643, + "loss": 1.3325, + "step": 336 + }, + { + "epoch": 0.2696, + "grad_norm": 0.9301648106998512, + "learning_rate": 0.00017144001326503273, + "loss": 1.314, + "step": 337 + }, + { + "epoch": 0.2704, + "grad_norm": 0.8387166059271942, + "learning_rate": 0.00017125839641475072, + "loss": 1.2933, + "step": 338 + }, + { + "epoch": 0.2712, + "grad_norm": 0.9491040601324949, + "learning_rate": 0.00017107630079074478, + "loss": 1.3023, + "step": 339 + }, + { + "epoch": 0.272, + "grad_norm": 0.9589786429040578, + "learning_rate": 0.00017089372761648616, + "loss": 1.1919, + "step": 340 + }, + { + "epoch": 0.2728, + "grad_norm": 0.8211356124008726, + "learning_rate": 0.00017071067811865476, + "loss": 1.1531, + "step": 341 + }, + { + "epoch": 0.2736, + "grad_norm": 0.8651844248084547, + "learning_rate": 0.00017052715352713075, + "loss": 1.1853, + "step": 342 + }, + { + "epoch": 0.2744, + "grad_norm": 0.9587234574931144, + "learning_rate": 0.00017034315507498635, + "loss": 1.2009, + "step": 343 + }, + { + "epoch": 0.2752, + "grad_norm": 0.9638887805121406, + "learning_rate": 0.00017015868399847768, + "loss": 1.3002, + "step": 344 + }, + { + "epoch": 0.276, + "grad_norm": 0.9884632423837139, + "learning_rate": 0.00016997374153703625, + "loss": 1.2294, + "step": 345 + }, + { + "epoch": 0.2768, + "grad_norm": 0.8617371245052429, + "learning_rate": 0.00016978832893326074, + "loss": 1.1756, + "step": 346 + }, + { + "epoch": 0.2776, + "grad_norm": 0.8570029343254111, + "learning_rate": 0.00016960244743290868, + "loss": 1.1458, + "step": 347 + }, + { + "epoch": 0.2784, + "grad_norm": 1.0403499538130299, + "learning_rate": 0.00016941609828488807, + "loss": 1.2649, + "step": 348 + }, + { + "epoch": 0.2792, + "grad_norm": 1.0701402360745733, + "learning_rate": 0.00016922928274124886, + "loss": 1.3516, + "step": 349 + }, + { + "epoch": 0.28, + "grad_norm": 0.8046004799379116, + "learning_rate": 0.0001690420020571747, + "loss": 1.1243, + "step": 350 + }, + { + "epoch": 0.2808, + "grad_norm": 1.0896556455414892, + "learning_rate": 0.00016885425749097444, + "loss": 1.3964, + "step": 351 + }, + { + "epoch": 0.2816, + "grad_norm": 0.8051493078826016, + "learning_rate": 0.0001686660503040737, + "loss": 1.1317, + "step": 352 + }, + { + "epoch": 0.2824, + "grad_norm": 0.8861789311545909, + "learning_rate": 0.00016847738176100632, + "loss": 1.2176, + "step": 353 + }, + { + "epoch": 0.2832, + "grad_norm": 0.9581457318097438, + "learning_rate": 0.00016828825312940592, + "loss": 1.2752, + "step": 354 + }, + { + "epoch": 0.284, + "grad_norm": 0.9500815430533931, + "learning_rate": 0.0001680986656799975, + "loss": 1.229, + "step": 355 + }, + { + "epoch": 0.2848, + "grad_norm": 0.8983829589570962, + "learning_rate": 0.0001679086206865886, + "loss": 1.1718, + "step": 356 + }, + { + "epoch": 0.2856, + "grad_norm": 0.9563371854485535, + "learning_rate": 0.00016771811942606108, + "loss": 1.2362, + "step": 357 + }, + { + "epoch": 0.2864, + "grad_norm": 0.8612559432875038, + "learning_rate": 0.00016752716317836229, + "loss": 1.2893, + "step": 358 + }, + { + "epoch": 0.2872, + "grad_norm": 0.9028864105321798, + "learning_rate": 0.00016733575322649657, + "loss": 1.1828, + "step": 359 + }, + { + "epoch": 0.288, + "grad_norm": 0.8451192551757782, + "learning_rate": 0.0001671438908565167, + "loss": 1.1265, + "step": 360 + }, + { + "epoch": 0.2888, + "grad_norm": 0.9533224377402716, + "learning_rate": 0.00016695157735751513, + "loss": 1.209, + "step": 361 + }, + { + "epoch": 0.2896, + "grad_norm": 1.0147529765631986, + "learning_rate": 0.00016675881402161536, + "loss": 1.2257, + "step": 362 + }, + { + "epoch": 0.2904, + "grad_norm": 1.031808782707111, + "learning_rate": 0.0001665656021439633, + "loss": 1.3177, + "step": 363 + }, + { + "epoch": 0.2912, + "grad_norm": 0.911147895751784, + "learning_rate": 0.0001663719430227186, + "loss": 1.2467, + "step": 364 + }, + { + "epoch": 0.292, + "grad_norm": 0.7817909097309222, + "learning_rate": 0.00016617783795904565, + "loss": 1.1432, + "step": 365 + }, + { + "epoch": 0.2928, + "grad_norm": 0.7444889968847276, + "learning_rate": 0.00016598328825710533, + "loss": 1.0799, + "step": 366 + }, + { + "epoch": 0.2936, + "grad_norm": 0.8492631593144773, + "learning_rate": 0.00016578829522404583, + "loss": 1.1427, + "step": 367 + }, + { + "epoch": 0.2944, + "grad_norm": 0.9450214645654191, + "learning_rate": 0.000165592860169994, + "loss": 1.2868, + "step": 368 + }, + { + "epoch": 0.2952, + "grad_norm": 0.8397588768973042, + "learning_rate": 0.00016539698440804661, + "loss": 1.2199, + "step": 369 + }, + { + "epoch": 0.296, + "grad_norm": 0.8322979893800689, + "learning_rate": 0.00016520066925426144, + "loss": 1.2057, + "step": 370 + }, + { + "epoch": 0.2968, + "grad_norm": 0.8670377342574328, + "learning_rate": 0.0001650039160276485, + "loss": 1.1538, + "step": 371 + }, + { + "epoch": 0.2976, + "grad_norm": 0.8914144967857235, + "learning_rate": 0.0001648067260501611, + "loss": 1.1952, + "step": 372 + }, + { + "epoch": 0.2984, + "grad_norm": 0.9502443612778761, + "learning_rate": 0.0001646091006466871, + "loss": 1.2334, + "step": 373 + }, + { + "epoch": 0.2992, + "grad_norm": 0.98482104661955, + "learning_rate": 0.0001644110411450398, + "loss": 1.249, + "step": 374 + }, + { + "epoch": 0.3, + "grad_norm": 0.8645467629715614, + "learning_rate": 0.00016421254887594917, + "loss": 1.174, + "step": 375 + }, + { + "epoch": 0.3008, + "grad_norm": 0.8561391927441676, + "learning_rate": 0.00016401362517305296, + "loss": 1.1494, + "step": 376 + }, + { + "epoch": 0.3016, + "grad_norm": 0.9171929313181237, + "learning_rate": 0.00016381427137288754, + "loss": 1.2708, + "step": 377 + }, + { + "epoch": 0.3024, + "grad_norm": 1.0114481827984823, + "learning_rate": 0.00016361448881487914, + "loss": 1.262, + "step": 378 + }, + { + "epoch": 0.3032, + "grad_norm": 0.7901886201121449, + "learning_rate": 0.0001634142788413346, + "loss": 0.9509, + "step": 379 + }, + { + "epoch": 0.304, + "grad_norm": 0.9286085733043485, + "learning_rate": 0.00016321364279743266, + "loss": 1.2093, + "step": 380 + }, + { + "epoch": 0.3048, + "grad_norm": 0.8525243507575246, + "learning_rate": 0.00016301258203121462, + "loss": 1.2247, + "step": 381 + }, + { + "epoch": 0.3056, + "grad_norm": 0.8049172689583466, + "learning_rate": 0.0001628110978935756, + "loss": 1.1541, + "step": 382 + }, + { + "epoch": 0.3064, + "grad_norm": 0.898689792997307, + "learning_rate": 0.00016260919173825508, + "loss": 1.1567, + "step": 383 + }, + { + "epoch": 0.3072, + "grad_norm": 0.944213975316119, + "learning_rate": 0.00016240686492182804, + "loss": 1.1869, + "step": 384 + }, + { + "epoch": 0.308, + "grad_norm": 0.9682693375087824, + "learning_rate": 0.00016220411880369601, + "loss": 1.2967, + "step": 385 + }, + { + "epoch": 0.3088, + "grad_norm": 0.9018898556139062, + "learning_rate": 0.00016200095474607753, + "loss": 1.1955, + "step": 386 + }, + { + "epoch": 0.3096, + "grad_norm": 0.8668345825876101, + "learning_rate": 0.00016179737411399926, + "loss": 1.3118, + "step": 387 + }, + { + "epoch": 0.3104, + "grad_norm": 0.9774777295478445, + "learning_rate": 0.00016159337827528685, + "loss": 1.219, + "step": 388 + }, + { + "epoch": 0.3112, + "grad_norm": 0.8041179042413573, + "learning_rate": 0.00016138896860055555, + "loss": 1.1725, + "step": 389 + }, + { + "epoch": 0.312, + "grad_norm": 0.9130745985984293, + "learning_rate": 0.0001611841464632011, + "loss": 1.1321, + "step": 390 + }, + { + "epoch": 0.3128, + "grad_norm": 0.8711746954473047, + "learning_rate": 0.00016097891323939062, + "loss": 1.1676, + "step": 391 + }, + { + "epoch": 0.3136, + "grad_norm": 0.9128330506885919, + "learning_rate": 0.0001607732703080532, + "loss": 1.2505, + "step": 392 + }, + { + "epoch": 0.3144, + "grad_norm": 0.845758888302076, + "learning_rate": 0.00016056721905087056, + "loss": 1.2259, + "step": 393 + }, + { + "epoch": 0.3152, + "grad_norm": 0.9059778839638238, + "learning_rate": 0.00016036076085226814, + "loss": 1.2066, + "step": 394 + }, + { + "epoch": 0.316, + "grad_norm": 0.8653822503261192, + "learning_rate": 0.00016015389709940538, + "loss": 1.071, + "step": 395 + }, + { + "epoch": 0.3168, + "grad_norm": 0.8687364719367553, + "learning_rate": 0.0001599466291821666, + "loss": 1.2495, + "step": 396 + }, + { + "epoch": 0.3176, + "grad_norm": 0.8680808216538957, + "learning_rate": 0.0001597389584931517, + "loss": 1.2001, + "step": 397 + }, + { + "epoch": 0.3184, + "grad_norm": 0.9619686817861596, + "learning_rate": 0.0001595308864276666, + "loss": 1.2892, + "step": 398 + }, + { + "epoch": 0.3192, + "grad_norm": 0.9611654419173977, + "learning_rate": 0.0001593224143837142, + "loss": 1.2643, + "step": 399 + }, + { + "epoch": 0.32, + "grad_norm": 0.7915960551513423, + "learning_rate": 0.0001591135437619847, + "loss": 1.0126, + "step": 400 + }, + { + "epoch": 0.3208, + "grad_norm": 0.8065933118797741, + "learning_rate": 0.00015890427596584617, + "loss": 1.1837, + "step": 401 + }, + { + "epoch": 0.3216, + "grad_norm": 0.7630648072457108, + "learning_rate": 0.0001586946124013354, + "loss": 0.9984, + "step": 402 + }, + { + "epoch": 0.3224, + "grad_norm": 0.8008835614895283, + "learning_rate": 0.00015848455447714822, + "loss": 1.1827, + "step": 403 + }, + { + "epoch": 0.3232, + "grad_norm": 0.8181476801131583, + "learning_rate": 0.0001582741036046301, + "loss": 1.1794, + "step": 404 + }, + { + "epoch": 0.324, + "grad_norm": 0.9518258795235218, + "learning_rate": 0.00015806326119776663, + "loss": 1.3664, + "step": 405 + }, + { + "epoch": 0.3248, + "grad_norm": 0.9740622816903404, + "learning_rate": 0.00015785202867317407, + "loss": 1.2787, + "step": 406 + }, + { + "epoch": 0.3256, + "grad_norm": 0.8894350998025817, + "learning_rate": 0.00015764040745008988, + "loss": 1.2131, + "step": 407 + }, + { + "epoch": 0.3264, + "grad_norm": 0.8033594520828705, + "learning_rate": 0.00015742839895036305, + "loss": 1.1828, + "step": 408 + }, + { + "epoch": 0.3272, + "grad_norm": 0.9186622687647027, + "learning_rate": 0.00015721600459844468, + "loss": 1.2594, + "step": 409 + }, + { + "epoch": 0.328, + "grad_norm": 0.9134888956592484, + "learning_rate": 0.00015700322582137827, + "loss": 1.2057, + "step": 410 + }, + { + "epoch": 0.3288, + "grad_norm": 0.9500637042175568, + "learning_rate": 0.00015679006404879033, + "loss": 1.207, + "step": 411 + }, + { + "epoch": 0.3296, + "grad_norm": 0.849598717746634, + "learning_rate": 0.0001565765207128805, + "loss": 1.0906, + "step": 412 + }, + { + "epoch": 0.3304, + "grad_norm": 1.0259657028423845, + "learning_rate": 0.00015636259724841222, + "loss": 1.3111, + "step": 413 + }, + { + "epoch": 0.3312, + "grad_norm": 1.0309066909033666, + "learning_rate": 0.0001561482950927029, + "loss": 1.2006, + "step": 414 + }, + { + "epoch": 0.332, + "grad_norm": 0.9460127280243473, + "learning_rate": 0.00015593361568561428, + "loss": 1.2872, + "step": 415 + }, + { + "epoch": 0.3328, + "grad_norm": 0.9494818972639643, + "learning_rate": 0.00015571856046954285, + "loss": 1.0666, + "step": 416 + }, + { + "epoch": 0.3336, + "grad_norm": 0.9948125878007591, + "learning_rate": 0.0001555031308894101, + "loss": 1.1886, + "step": 417 + }, + { + "epoch": 0.3344, + "grad_norm": 1.0347143496320934, + "learning_rate": 0.00015528732839265272, + "loss": 1.1577, + "step": 418 + }, + { + "epoch": 0.3352, + "grad_norm": 0.8971181852404114, + "learning_rate": 0.0001550711544292131, + "loss": 1.1707, + "step": 419 + }, + { + "epoch": 0.336, + "grad_norm": 0.8744963963866264, + "learning_rate": 0.0001548546104515294, + "loss": 1.1817, + "step": 420 + }, + { + "epoch": 0.3368, + "grad_norm": 0.873626962980643, + "learning_rate": 0.00015463769791452574, + "loss": 1.173, + "step": 421 + }, + { + "epoch": 0.3376, + "grad_norm": 0.8742899718656414, + "learning_rate": 0.00015442041827560274, + "loss": 1.1749, + "step": 422 + }, + { + "epoch": 0.3384, + "grad_norm": 1.030382215523112, + "learning_rate": 0.00015420277299462736, + "loss": 1.4224, + "step": 423 + }, + { + "epoch": 0.3392, + "grad_norm": 0.9538569427641035, + "learning_rate": 0.00015398476353392323, + "loss": 1.236, + "step": 424 + }, + { + "epoch": 0.34, + "grad_norm": 0.925636229189722, + "learning_rate": 0.00015376639135826107, + "loss": 1.2947, + "step": 425 + }, + { + "epoch": 0.3408, + "grad_norm": 0.9716062996409484, + "learning_rate": 0.00015354765793484834, + "loss": 1.2085, + "step": 426 + }, + { + "epoch": 0.3416, + "grad_norm": 0.817533441455966, + "learning_rate": 0.00015332856473331978, + "loss": 1.1791, + "step": 427 + }, + { + "epoch": 0.3424, + "grad_norm": 0.7932440760120276, + "learning_rate": 0.00015310911322572753, + "loss": 1.1963, + "step": 428 + }, + { + "epoch": 0.3432, + "grad_norm": 0.943311651505502, + "learning_rate": 0.00015288930488653094, + "loss": 1.3386, + "step": 429 + }, + { + "epoch": 0.344, + "grad_norm": 0.977028135598457, + "learning_rate": 0.000152669141192587, + "loss": 1.3633, + "step": 430 + }, + { + "epoch": 0.3448, + "grad_norm": 1.0906113450970434, + "learning_rate": 0.0001524486236231402, + "loss": 1.2432, + "step": 431 + }, + { + "epoch": 0.3456, + "grad_norm": 0.8424851045981453, + "learning_rate": 0.00015222775365981273, + "loss": 1.168, + "step": 432 + }, + { + "epoch": 0.3464, + "grad_norm": 0.8615709350842159, + "learning_rate": 0.00015200653278659432, + "loss": 1.1567, + "step": 433 + }, + { + "epoch": 0.3472, + "grad_norm": 1.0404299995470772, + "learning_rate": 0.00015178496248983254, + "loss": 1.3156, + "step": 434 + }, + { + "epoch": 0.348, + "grad_norm": 0.8639625636960955, + "learning_rate": 0.00015156304425822267, + "loss": 1.2002, + "step": 435 + }, + { + "epoch": 0.3488, + "grad_norm": 1.0379110675975693, + "learning_rate": 0.00015134077958279765, + "loss": 1.2018, + "step": 436 + }, + { + "epoch": 0.3496, + "grad_norm": 0.9839761304214002, + "learning_rate": 0.00015111816995691809, + "loss": 1.0979, + "step": 437 + }, + { + "epoch": 0.3504, + "grad_norm": 0.9624304807291328, + "learning_rate": 0.00015089521687626243, + "loss": 1.3084, + "step": 438 + }, + { + "epoch": 0.3512, + "grad_norm": 0.900987750667034, + "learning_rate": 0.00015067192183881658, + "loss": 1.2027, + "step": 439 + }, + { + "epoch": 0.352, + "grad_norm": 0.8888875144514918, + "learning_rate": 0.000150448286344864, + "loss": 1.3207, + "step": 440 + }, + { + "epoch": 0.3528, + "grad_norm": 0.8850569429393079, + "learning_rate": 0.00015022431189697568, + "loss": 1.2416, + "step": 441 + }, + { + "epoch": 0.3536, + "grad_norm": 0.9003696024963973, + "learning_rate": 0.00015000000000000001, + "loss": 1.1479, + "step": 442 + }, + { + "epoch": 0.3544, + "grad_norm": 1.0064359203785187, + "learning_rate": 0.0001497753521610526, + "loss": 1.1769, + "step": 443 + }, + { + "epoch": 0.3552, + "grad_norm": 0.9014492959862608, + "learning_rate": 0.00014955036988950618, + "loss": 1.2437, + "step": 444 + }, + { + "epoch": 0.356, + "grad_norm": 0.8582665088283009, + "learning_rate": 0.00014932505469698052, + "loss": 1.2826, + "step": 445 + }, + { + "epoch": 0.3568, + "grad_norm": 0.9378752094731411, + "learning_rate": 0.00014909940809733222, + "loss": 1.276, + "step": 446 + }, + { + "epoch": 0.3576, + "grad_norm": 0.9320151471539153, + "learning_rate": 0.0001488734316066446, + "loss": 1.1669, + "step": 447 + }, + { + "epoch": 0.3584, + "grad_norm": 0.8645625988589353, + "learning_rate": 0.00014864712674321734, + "loss": 1.1414, + "step": 448 + }, + { + "epoch": 0.3592, + "grad_norm": 0.9461016811211961, + "learning_rate": 0.0001484204950275565, + "loss": 1.1583, + "step": 449 + }, + { + "epoch": 0.36, + "grad_norm": 1.170475064084964, + "learning_rate": 0.00014819353798236427, + "loss": 1.2634, + "step": 450 + }, + { + "epoch": 0.3608, + "grad_norm": 0.8528015522455052, + "learning_rate": 0.00014796625713252848, + "loss": 1.1828, + "step": 451 + }, + { + "epoch": 0.3616, + "grad_norm": 0.8948649989942181, + "learning_rate": 0.00014773865400511272, + "loss": 1.1555, + "step": 452 + }, + { + "epoch": 0.3624, + "grad_norm": 0.956391413560586, + "learning_rate": 0.00014751073012934587, + "loss": 1.0984, + "step": 453 + }, + { + "epoch": 0.3632, + "grad_norm": 0.8815982203572552, + "learning_rate": 0.00014728248703661182, + "loss": 1.1565, + "step": 454 + }, + { + "epoch": 0.364, + "grad_norm": 0.9366137358254796, + "learning_rate": 0.0001470539262604393, + "loss": 1.1641, + "step": 455 + }, + { + "epoch": 0.3648, + "grad_norm": 0.9463767041942464, + "learning_rate": 0.00014682504933649144, + "loss": 1.2412, + "step": 456 + }, + { + "epoch": 0.3656, + "grad_norm": 0.8635682429287176, + "learning_rate": 0.00014659585780255556, + "loss": 1.1995, + "step": 457 + }, + { + "epoch": 0.3664, + "grad_norm": 0.9412634696119908, + "learning_rate": 0.00014636635319853275, + "loss": 1.256, + "step": 458 + }, + { + "epoch": 0.3672, + "grad_norm": 0.894710543323146, + "learning_rate": 0.0001461365370664276, + "loss": 1.1971, + "step": 459 + }, + { + "epoch": 0.368, + "grad_norm": 0.9849238469492475, + "learning_rate": 0.00014590641095033787, + "loss": 1.0918, + "step": 460 + }, + { + "epoch": 0.3688, + "grad_norm": 0.9372613714430934, + "learning_rate": 0.00014567597639644387, + "loss": 1.304, + "step": 461 + }, + { + "epoch": 0.3696, + "grad_norm": 0.8366575987171679, + "learning_rate": 0.00014544523495299842, + "loss": 1.1435, + "step": 462 + }, + { + "epoch": 0.3704, + "grad_norm": 1.0183573724764445, + "learning_rate": 0.00014521418817031628, + "loss": 1.372, + "step": 463 + }, + { + "epoch": 0.3712, + "grad_norm": 0.921436388687488, + "learning_rate": 0.0001449828376007636, + "loss": 1.2401, + "step": 464 + }, + { + "epoch": 0.372, + "grad_norm": 1.0126133537347326, + "learning_rate": 0.00014475118479874774, + "loss": 1.2933, + "step": 465 + }, + { + "epoch": 0.3728, + "grad_norm": 0.9366176126930088, + "learning_rate": 0.0001445192313207067, + "loss": 1.2144, + "step": 466 + }, + { + "epoch": 0.3736, + "grad_norm": 1.039845091753781, + "learning_rate": 0.0001442869787250987, + "loss": 1.2504, + "step": 467 + }, + { + "epoch": 0.3744, + "grad_norm": 0.831029380673922, + "learning_rate": 0.0001440544285723915, + "loss": 1.1508, + "step": 468 + }, + { + "epoch": 0.3752, + "grad_norm": 0.8257437837880253, + "learning_rate": 0.00014382158242505234, + "loss": 1.1706, + "step": 469 + }, + { + "epoch": 0.376, + "grad_norm": 0.8023515201237144, + "learning_rate": 0.00014358844184753712, + "loss": 1.1508, + "step": 470 + }, + { + "epoch": 0.3768, + "grad_norm": 0.9545116084936521, + "learning_rate": 0.00014335500840627986, + "loss": 1.3051, + "step": 471 + }, + { + "epoch": 0.3776, + "grad_norm": 0.9717442585412174, + "learning_rate": 0.00014312128366968243, + "loss": 1.3213, + "step": 472 + }, + { + "epoch": 0.3784, + "grad_norm": 0.9015444991867029, + "learning_rate": 0.0001428872692081038, + "loss": 1.1661, + "step": 473 + }, + { + "epoch": 0.3792, + "grad_norm": 0.8453162594038274, + "learning_rate": 0.00014265296659384956, + "loss": 1.2304, + "step": 474 + }, + { + "epoch": 0.38, + "grad_norm": 0.8760218476506566, + "learning_rate": 0.00014241837740116132, + "loss": 1.2186, + "step": 475 + }, + { + "epoch": 0.3808, + "grad_norm": 0.7876474094502752, + "learning_rate": 0.00014218350320620624, + "loss": 1.075, + "step": 476 + }, + { + "epoch": 0.3816, + "grad_norm": 0.7993246547885368, + "learning_rate": 0.00014194834558706632, + "loss": 1.231, + "step": 477 + }, + { + "epoch": 0.3824, + "grad_norm": 0.7907090498056094, + "learning_rate": 0.0001417129061237278, + "loss": 1.1443, + "step": 478 + }, + { + "epoch": 0.3832, + "grad_norm": 0.7511377628701228, + "learning_rate": 0.0001414771863980707, + "loss": 1.1022, + "step": 479 + }, + { + "epoch": 0.384, + "grad_norm": 1.0602160883969263, + "learning_rate": 0.00014124118799385796, + "loss": 1.1419, + "step": 480 + }, + { + "epoch": 0.3848, + "grad_norm": 0.8499695191433811, + "learning_rate": 0.00014100491249672498, + "loss": 1.1788, + "step": 481 + }, + { + "epoch": 0.3856, + "grad_norm": 0.8540057141934249, + "learning_rate": 0.00014076836149416887, + "loss": 1.105, + "step": 482 + }, + { + "epoch": 0.3864, + "grad_norm": 0.8102920946458256, + "learning_rate": 0.0001405315365755379, + "loss": 1.1683, + "step": 483 + }, + { + "epoch": 0.3872, + "grad_norm": 0.9072627599036039, + "learning_rate": 0.0001402944393320206, + "loss": 1.088, + "step": 484 + }, + { + "epoch": 0.388, + "grad_norm": 0.9586398821405642, + "learning_rate": 0.00014005707135663527, + "loss": 1.2484, + "step": 485 + }, + { + "epoch": 0.3888, + "grad_norm": 0.9290000247371769, + "learning_rate": 0.00013981943424421932, + "loss": 1.1843, + "step": 486 + }, + { + "epoch": 0.3896, + "grad_norm": 0.8375789473026685, + "learning_rate": 0.00013958152959141825, + "loss": 1.0358, + "step": 487 + }, + { + "epoch": 0.3904, + "grad_norm": 0.7635571324142661, + "learning_rate": 0.00013934335899667527, + "loss": 1.1445, + "step": 488 + }, + { + "epoch": 0.3912, + "grad_norm": 1.045797503899567, + "learning_rate": 0.00013910492406022033, + "loss": 1.2421, + "step": 489 + }, + { + "epoch": 0.392, + "grad_norm": 1.0109307529063931, + "learning_rate": 0.00013886622638405952, + "loss": 1.2873, + "step": 490 + }, + { + "epoch": 0.3928, + "grad_norm": 0.797021878143829, + "learning_rate": 0.0001386272675719642, + "loss": 1.2203, + "step": 491 + }, + { + "epoch": 0.3936, + "grad_norm": 0.9700917177296354, + "learning_rate": 0.00013838804922946027, + "loss": 1.1761, + "step": 492 + }, + { + "epoch": 0.3944, + "grad_norm": 0.890026042051327, + "learning_rate": 0.00013814857296381728, + "loss": 1.1883, + "step": 493 + }, + { + "epoch": 0.3952, + "grad_norm": 0.765185289338175, + "learning_rate": 0.00013790884038403795, + "loss": 1.1012, + "step": 494 + }, + { + "epoch": 0.396, + "grad_norm": 0.8140561721789563, + "learning_rate": 0.00013766885310084688, + "loss": 1.1017, + "step": 495 + }, + { + "epoch": 0.3968, + "grad_norm": 0.9666521023648389, + "learning_rate": 0.00013742861272668012, + "loss": 1.1211, + "step": 496 + }, + { + "epoch": 0.3976, + "grad_norm": 0.9393299199746825, + "learning_rate": 0.00013718812087567414, + "loss": 1.2397, + "step": 497 + }, + { + "epoch": 0.3984, + "grad_norm": 0.894655604863808, + "learning_rate": 0.00013694737916365517, + "loss": 1.1175, + "step": 498 + }, + { + "epoch": 0.3992, + "grad_norm": 0.8959521328951345, + "learning_rate": 0.000136706389208128, + "loss": 1.138, + "step": 499 + }, + { + "epoch": 0.4, + "grad_norm": 0.9818068524694171, + "learning_rate": 0.00013646515262826552, + "loss": 1.137, + "step": 500 + }, + { + "epoch": 0.4008, + "grad_norm": 1.027296109586849, + "learning_rate": 0.00013622367104489756, + "loss": 1.1507, + "step": 501 + }, + { + "epoch": 0.4016, + "grad_norm": 0.8678449288401437, + "learning_rate": 0.0001359819460805001, + "loss": 1.1469, + "step": 502 + }, + { + "epoch": 0.4024, + "grad_norm": 1.0031997037428155, + "learning_rate": 0.0001357399793591844, + "loss": 1.2433, + "step": 503 + }, + { + "epoch": 0.4032, + "grad_norm": 1.0515492628817857, + "learning_rate": 0.0001354977725066859, + "loss": 1.2729, + "step": 504 + }, + { + "epoch": 0.404, + "grad_norm": 0.8885796845124188, + "learning_rate": 0.00013525532715035366, + "loss": 1.1275, + "step": 505 + }, + { + "epoch": 0.4048, + "grad_norm": 0.8641143500851984, + "learning_rate": 0.00013501264491913906, + "loss": 1.2576, + "step": 506 + }, + { + "epoch": 0.4056, + "grad_norm": 0.8154811162618586, + "learning_rate": 0.00013476972744358507, + "loss": 1.0765, + "step": 507 + }, + { + "epoch": 0.4064, + "grad_norm": 0.9568018074614199, + "learning_rate": 0.0001345265763558152, + "loss": 1.1195, + "step": 508 + }, + { + "epoch": 0.4072, + "grad_norm": 0.8108302299524088, + "learning_rate": 0.00013428319328952253, + "loss": 1.053, + "step": 509 + }, + { + "epoch": 0.408, + "grad_norm": 0.9409675732775402, + "learning_rate": 0.00013403957987995882, + "loss": 1.127, + "step": 510 + }, + { + "epoch": 0.4088, + "grad_norm": 0.7781864660886626, + "learning_rate": 0.0001337957377639235, + "loss": 1.1329, + "step": 511 + }, + { + "epoch": 0.4096, + "grad_norm": 1.0199127709160234, + "learning_rate": 0.0001335516685797525, + "loss": 1.2385, + "step": 512 + }, + { + "epoch": 0.4104, + "grad_norm": 0.8221811253224244, + "learning_rate": 0.0001333073739673076, + "loss": 1.2236, + "step": 513 + }, + { + "epoch": 0.4112, + "grad_norm": 0.9463573458104121, + "learning_rate": 0.00013306285556796495, + "loss": 1.2651, + "step": 514 + }, + { + "epoch": 0.412, + "grad_norm": 0.8597680591743613, + "learning_rate": 0.0001328181150246045, + "loss": 1.0963, + "step": 515 + }, + { + "epoch": 0.4128, + "grad_norm": 0.9237614005652235, + "learning_rate": 0.00013257315398159864, + "loss": 1.1679, + "step": 516 + }, + { + "epoch": 0.4136, + "grad_norm": 0.9161415729139808, + "learning_rate": 0.00013232797408480127, + "loss": 1.1218, + "step": 517 + }, + { + "epoch": 0.4144, + "grad_norm": 0.7773862816198334, + "learning_rate": 0.00013208257698153677, + "loss": 1.2071, + "step": 518 + }, + { + "epoch": 0.4152, + "grad_norm": 0.8763697877745098, + "learning_rate": 0.00013183696432058888, + "loss": 1.1922, + "step": 519 + }, + { + "epoch": 0.416, + "grad_norm": 0.8436430947992818, + "learning_rate": 0.00013159113775218964, + "loss": 1.1839, + "step": 520 + }, + { + "epoch": 0.4168, + "grad_norm": 0.8542536722190046, + "learning_rate": 0.00013134509892800822, + "loss": 1.2099, + "step": 521 + }, + { + "epoch": 0.4176, + "grad_norm": 0.8718154209836556, + "learning_rate": 0.00013109884950114007, + "loss": 1.1767, + "step": 522 + }, + { + "epoch": 0.4184, + "grad_norm": 0.814140019880975, + "learning_rate": 0.00013085239112609547, + "loss": 1.1537, + "step": 523 + }, + { + "epoch": 0.4192, + "grad_norm": 0.8889221338176805, + "learning_rate": 0.00013060572545878875, + "loss": 1.2299, + "step": 524 + }, + { + "epoch": 0.42, + "grad_norm": 0.7808170171112502, + "learning_rate": 0.00013035885415652685, + "loss": 1.1895, + "step": 525 + }, + { + "epoch": 0.4208, + "grad_norm": 0.8232553022619731, + "learning_rate": 0.00013011177887799845, + "loss": 1.2091, + "step": 526 + }, + { + "epoch": 0.4216, + "grad_norm": 0.8391444436897774, + "learning_rate": 0.00012986450128326266, + "loss": 1.1803, + "step": 527 + }, + { + "epoch": 0.4224, + "grad_norm": 0.8753444917553742, + "learning_rate": 0.00012961702303373795, + "loss": 1.1082, + "step": 528 + }, + { + "epoch": 0.4232, + "grad_norm": 0.7924003048200808, + "learning_rate": 0.00012936934579219094, + "loss": 1.1342, + "step": 529 + }, + { + "epoch": 0.424, + "grad_norm": 0.8983735111525624, + "learning_rate": 0.00012912147122272523, + "loss": 1.2484, + "step": 530 + }, + { + "epoch": 0.4248, + "grad_norm": 0.895102530864573, + "learning_rate": 0.00012887340099077024, + "loss": 1.1044, + "step": 531 + }, + { + "epoch": 0.4256, + "grad_norm": 0.9445868985759234, + "learning_rate": 0.00012862513676307008, + "loss": 1.121, + "step": 532 + }, + { + "epoch": 0.4264, + "grad_norm": 0.9149240583640527, + "learning_rate": 0.0001283766802076722, + "loss": 1.1474, + "step": 533 + }, + { + "epoch": 0.4272, + "grad_norm": 0.8571405322515444, + "learning_rate": 0.00012812803299391628, + "loss": 1.1563, + "step": 534 + }, + { + "epoch": 0.428, + "grad_norm": 0.8481113794861626, + "learning_rate": 0.00012787919679242306, + "loss": 1.1171, + "step": 535 + }, + { + "epoch": 0.4288, + "grad_norm": 0.8603903731508595, + "learning_rate": 0.00012763017327508305, + "loss": 1.1545, + "step": 536 + }, + { + "epoch": 0.4296, + "grad_norm": 0.7662421829165785, + "learning_rate": 0.00012738096411504522, + "loss": 1.048, + "step": 537 + }, + { + "epoch": 0.4304, + "grad_norm": 0.8706681219444926, + "learning_rate": 0.0001271315709867059, + "loss": 1.1907, + "step": 538 + }, + { + "epoch": 0.4312, + "grad_norm": 0.875195678660521, + "learning_rate": 0.00012688199556569753, + "loss": 1.1865, + "step": 539 + }, + { + "epoch": 0.432, + "grad_norm": 0.797748206572489, + "learning_rate": 0.00012663223952887723, + "loss": 1.2685, + "step": 540 + }, + { + "epoch": 0.4328, + "grad_norm": 0.8273035537862294, + "learning_rate": 0.0001263823045543158, + "loss": 1.2624, + "step": 541 + }, + { + "epoch": 0.4336, + "grad_norm": 0.8766792540528516, + "learning_rate": 0.00012613219232128608, + "loss": 1.0499, + "step": 542 + }, + { + "epoch": 0.4344, + "grad_norm": 0.9423730978153396, + "learning_rate": 0.00012588190451025207, + "loss": 1.174, + "step": 543 + }, + { + "epoch": 0.4352, + "grad_norm": 0.8276737854919352, + "learning_rate": 0.00012563144280285741, + "loss": 1.1171, + "step": 544 + }, + { + "epoch": 0.436, + "grad_norm": 0.8108921482599494, + "learning_rate": 0.00012538080888191408, + "loss": 1.0962, + "step": 545 + }, + { + "epoch": 0.4368, + "grad_norm": 0.8387867133920371, + "learning_rate": 0.00012513000443139112, + "loss": 1.1382, + "step": 546 + }, + { + "epoch": 0.4376, + "grad_norm": 0.9413873346510828, + "learning_rate": 0.00012487903113640337, + "loss": 1.2573, + "step": 547 + }, + { + "epoch": 0.4384, + "grad_norm": 0.883006129224719, + "learning_rate": 0.00012462789068320017, + "loss": 1.1496, + "step": 548 + }, + { + "epoch": 0.4392, + "grad_norm": 0.8199461868571233, + "learning_rate": 0.00012437658475915377, + "loss": 1.143, + "step": 549 + }, + { + "epoch": 0.44, + "grad_norm": 0.8553685705620094, + "learning_rate": 0.00012412511505274844, + "loss": 1.2265, + "step": 550 + }, + { + "epoch": 0.4408, + "grad_norm": 0.8820477217672932, + "learning_rate": 0.00012387348325356874, + "loss": 1.2634, + "step": 551 + }, + { + "epoch": 0.4416, + "grad_norm": 0.8601839000066415, + "learning_rate": 0.00012362169105228826, + "loss": 1.247, + "step": 552 + }, + { + "epoch": 0.4424, + "grad_norm": 0.8527697759820766, + "learning_rate": 0.00012336974014065844, + "loss": 1.1594, + "step": 553 + }, + { + "epoch": 0.4432, + "grad_norm": 0.8123714400910506, + "learning_rate": 0.000123117632211497, + "loss": 1.1541, + "step": 554 + }, + { + "epoch": 0.444, + "grad_norm": 0.8388929535371012, + "learning_rate": 0.00012286536895867654, + "loss": 1.2318, + "step": 555 + }, + { + "epoch": 0.4448, + "grad_norm": 0.9072499746597118, + "learning_rate": 0.00012261295207711346, + "loss": 1.2327, + "step": 556 + }, + { + "epoch": 0.4456, + "grad_norm": 0.745225852061896, + "learning_rate": 0.00012236038326275626, + "loss": 1.1105, + "step": 557 + }, + { + "epoch": 0.4464, + "grad_norm": 0.8784362755933546, + "learning_rate": 0.0001221076642125742, + "loss": 1.2133, + "step": 558 + }, + { + "epoch": 0.4472, + "grad_norm": 0.8153133519550086, + "learning_rate": 0.00012185479662454595, + "loss": 1.1971, + "step": 559 + }, + { + "epoch": 0.448, + "grad_norm": 0.7811186057897588, + "learning_rate": 0.00012160178219764837, + "loss": 1.0931, + "step": 560 + }, + { + "epoch": 0.4488, + "grad_norm": 0.9326536326980271, + "learning_rate": 0.00012134862263184467, + "loss": 1.1517, + "step": 561 + }, + { + "epoch": 0.4496, + "grad_norm": 0.8216914978062639, + "learning_rate": 0.00012109531962807332, + "loss": 1.0912, + "step": 562 + }, + { + "epoch": 0.4504, + "grad_norm": 0.8476962507001223, + "learning_rate": 0.00012084187488823657, + "loss": 1.1957, + "step": 563 + }, + { + "epoch": 0.4512, + "grad_norm": 0.8596187835809099, + "learning_rate": 0.00012058829011518896, + "loss": 1.1555, + "step": 564 + }, + { + "epoch": 0.452, + "grad_norm": 0.8298184772484226, + "learning_rate": 0.00012033456701272576, + "loss": 1.1074, + "step": 565 + }, + { + "epoch": 0.4528, + "grad_norm": 0.9609701936506492, + "learning_rate": 0.00012008070728557186, + "loss": 1.2726, + "step": 566 + }, + { + "epoch": 0.4536, + "grad_norm": 0.8680508025657486, + "learning_rate": 0.00011982671263936995, + "loss": 1.1339, + "step": 567 + }, + { + "epoch": 0.4544, + "grad_norm": 0.9925839781085051, + "learning_rate": 0.00011957258478066931, + "loss": 1.2028, + "step": 568 + }, + { + "epoch": 0.4552, + "grad_norm": 0.8125320750678805, + "learning_rate": 0.00011931832541691418, + "loss": 1.0625, + "step": 569 + }, + { + "epoch": 0.456, + "grad_norm": 0.8768511805834935, + "learning_rate": 0.00011906393625643244, + "loss": 1.2186, + "step": 570 + }, + { + "epoch": 0.4568, + "grad_norm": 0.8345742788681829, + "learning_rate": 0.00011880941900842397, + "loss": 1.1813, + "step": 571 + }, + { + "epoch": 0.4576, + "grad_norm": 0.8959461578622999, + "learning_rate": 0.00011855477538294935, + "loss": 1.1638, + "step": 572 + }, + { + "epoch": 0.4584, + "grad_norm": 0.9924411869619153, + "learning_rate": 0.00011830000709091815, + "loss": 1.1712, + "step": 573 + }, + { + "epoch": 0.4592, + "grad_norm": 0.8943053158039393, + "learning_rate": 0.00011804511584407763, + "loss": 1.1679, + "step": 574 + }, + { + "epoch": 0.46, + "grad_norm": 0.805038370805271, + "learning_rate": 0.0001177901033550012, + "loss": 1.2985, + "step": 575 + }, + { + "epoch": 0.4608, + "grad_norm": 0.8873055710491184, + "learning_rate": 0.00011753497133707679, + "loss": 1.197, + "step": 576 + }, + { + "epoch": 0.4616, + "grad_norm": 0.7816032860712533, + "learning_rate": 0.00011727972150449544, + "loss": 1.0355, + "step": 577 + }, + { + "epoch": 0.4624, + "grad_norm": 0.906300282180546, + "learning_rate": 0.00011702435557223987, + "loss": 1.1816, + "step": 578 + }, + { + "epoch": 0.4632, + "grad_norm": 0.7925010420648496, + "learning_rate": 0.00011676887525607271, + "loss": 1.1191, + "step": 579 + }, + { + "epoch": 0.464, + "grad_norm": 0.8376692937905297, + "learning_rate": 0.00011651328227252517, + "loss": 1.1698, + "step": 580 + }, + { + "epoch": 0.4648, + "grad_norm": 0.8948600707756753, + "learning_rate": 0.00011625757833888551, + "loss": 1.2028, + "step": 581 + }, + { + "epoch": 0.4656, + "grad_norm": 0.8952116689846026, + "learning_rate": 0.00011600176517318741, + "loss": 1.1844, + "step": 582 + }, + { + "epoch": 0.4664, + "grad_norm": 0.9094589278333447, + "learning_rate": 0.0001157458444941984, + "loss": 1.1354, + "step": 583 + }, + { + "epoch": 0.4672, + "grad_norm": 0.8536043917642891, + "learning_rate": 0.00011548981802140848, + "loss": 1.1392, + "step": 584 + }, + { + "epoch": 0.468, + "grad_norm": 0.9733923853117847, + "learning_rate": 0.00011523368747501839, + "loss": 1.2189, + "step": 585 + }, + { + "epoch": 0.4688, + "grad_norm": 0.8964503801179634, + "learning_rate": 0.00011497745457592816, + "loss": 1.1615, + "step": 586 + }, + { + "epoch": 0.4696, + "grad_norm": 0.8877921864629532, + "learning_rate": 0.00011472112104572547, + "loss": 1.1435, + "step": 587 + }, + { + "epoch": 0.4704, + "grad_norm": 0.8372328560031221, + "learning_rate": 0.00011446468860667421, + "loss": 1.1934, + "step": 588 + }, + { + "epoch": 0.4712, + "grad_norm": 0.9079557906978756, + "learning_rate": 0.0001142081589817027, + "loss": 1.1476, + "step": 589 + }, + { + "epoch": 0.472, + "grad_norm": 0.8510474966571356, + "learning_rate": 0.00011395153389439233, + "loss": 1.2275, + "step": 590 + }, + { + "epoch": 0.4728, + "grad_norm": 0.8408931790931111, + "learning_rate": 0.00011369481506896582, + "loss": 1.1917, + "step": 591 + }, + { + "epoch": 0.4736, + "grad_norm": 0.8343733813573859, + "learning_rate": 0.00011343800423027582, + "loss": 1.1732, + "step": 592 + }, + { + "epoch": 0.4744, + "grad_norm": 0.8289961257458175, + "learning_rate": 0.00011318110310379301, + "loss": 1.1986, + "step": 593 + }, + { + "epoch": 0.4752, + "grad_norm": 0.8555001028611945, + "learning_rate": 0.0001129241134155949, + "loss": 1.1986, + "step": 594 + }, + { + "epoch": 0.476, + "grad_norm": 0.8264825870352727, + "learning_rate": 0.00011266703689235394, + "loss": 1.1929, + "step": 595 + }, + { + "epoch": 0.4768, + "grad_norm": 0.8553948588686492, + "learning_rate": 0.00011240987526132594, + "loss": 1.1033, + "step": 596 + }, + { + "epoch": 0.4776, + "grad_norm": 0.8380899044644874, + "learning_rate": 0.00011215263025033869, + "loss": 1.1686, + "step": 597 + }, + { + "epoch": 0.4784, + "grad_norm": 0.8727679175975, + "learning_rate": 0.00011189530358778005, + "loss": 1.041, + "step": 598 + }, + { + "epoch": 0.4792, + "grad_norm": 0.8180328740580982, + "learning_rate": 0.00011163789700258655, + "loss": 1.1672, + "step": 599 + }, + { + "epoch": 0.48, + "grad_norm": 0.8400486300979757, + "learning_rate": 0.00011138041222423177, + "loss": 1.2248, + "step": 600 + }, + { + "epoch": 0.4808, + "grad_norm": 0.8668030633717136, + "learning_rate": 0.00011112285098271451, + "loss": 1.1847, + "step": 601 + }, + { + "epoch": 0.4816, + "grad_norm": 0.9183371992913884, + "learning_rate": 0.00011086521500854745, + "loss": 1.1679, + "step": 602 + }, + { + "epoch": 0.4824, + "grad_norm": 0.759898728911129, + "learning_rate": 0.00011060750603274535, + "loss": 1.0188, + "step": 603 + }, + { + "epoch": 0.4832, + "grad_norm": 0.8840110162639573, + "learning_rate": 0.00011034972578681338, + "loss": 1.0226, + "step": 604 + }, + { + "epoch": 0.484, + "grad_norm": 0.7914148929556654, + "learning_rate": 0.00011009187600273566, + "loss": 1.0652, + "step": 605 + }, + { + "epoch": 0.4848, + "grad_norm": 0.7403012262197173, + "learning_rate": 0.00010983395841296348, + "loss": 1.115, + "step": 606 + }, + { + "epoch": 0.4856, + "grad_norm": 0.8253809044516809, + "learning_rate": 0.00010957597475040373, + "loss": 1.1122, + "step": 607 + }, + { + "epoch": 0.4864, + "grad_norm": 0.8853790879504476, + "learning_rate": 0.00010931792674840718, + "loss": 1.1727, + "step": 608 + }, + { + "epoch": 0.4872, + "grad_norm": 0.909555754043428, + "learning_rate": 0.00010905981614075693, + "loss": 1.0891, + "step": 609 + }, + { + "epoch": 0.488, + "grad_norm": 0.8425311868030075, + "learning_rate": 0.00010880164466165674, + "loss": 1.1894, + "step": 610 + }, + { + "epoch": 0.4888, + "grad_norm": 0.812821876426759, + "learning_rate": 0.00010854341404571928, + "loss": 1.1576, + "step": 611 + }, + { + "epoch": 0.4896, + "grad_norm": 0.9936089741508148, + "learning_rate": 0.00010828512602795462, + "loss": 1.2062, + "step": 612 + }, + { + "epoch": 0.4904, + "grad_norm": 0.8209545933895037, + "learning_rate": 0.00010802678234375851, + "loss": 1.2334, + "step": 613 + }, + { + "epoch": 0.4912, + "grad_norm": 0.9796965099618715, + "learning_rate": 0.00010776838472890065, + "loss": 1.2203, + "step": 614 + }, + { + "epoch": 0.492, + "grad_norm": 0.8950705321197276, + "learning_rate": 0.0001075099349195131, + "loss": 1.1832, + "step": 615 + }, + { + "epoch": 0.4928, + "grad_norm": 0.8249629348907459, + "learning_rate": 0.00010725143465207867, + "loss": 1.1365, + "step": 616 + }, + { + "epoch": 0.4936, + "grad_norm": 0.8558753794538538, + "learning_rate": 0.00010699288566341914, + "loss": 1.2335, + "step": 617 + }, + { + "epoch": 0.4944, + "grad_norm": 0.8764834607191381, + "learning_rate": 0.00010673428969068364, + "loss": 1.0876, + "step": 618 + }, + { + "epoch": 0.4952, + "grad_norm": 0.8968033214264838, + "learning_rate": 0.000106475648471337, + "loss": 1.2158, + "step": 619 + }, + { + "epoch": 0.496, + "grad_norm": 0.8854900296778185, + "learning_rate": 0.00010621696374314807, + "loss": 1.1501, + "step": 620 + }, + { + "epoch": 0.4968, + "grad_norm": 0.9247912025755864, + "learning_rate": 0.00010595823724417795, + "loss": 1.2254, + "step": 621 + }, + { + "epoch": 0.4976, + "grad_norm": 0.8296803216440574, + "learning_rate": 0.00010569947071276847, + "loss": 1.1098, + "step": 622 + }, + { + "epoch": 0.4984, + "grad_norm": 0.8445302120263739, + "learning_rate": 0.00010544066588753044, + "loss": 1.175, + "step": 623 + }, + { + "epoch": 0.4992, + "grad_norm": 0.8526662821322857, + "learning_rate": 0.00010518182450733186, + "loss": 1.2137, + "step": 624 + }, + { + "epoch": 0.5, + "grad_norm": 0.8573450684787427, + "learning_rate": 0.00010492294831128641, + "loss": 1.1198, + "step": 625 + }, + { + "epoch": 0.5008, + "grad_norm": 0.9704132936461313, + "learning_rate": 0.00010466403903874176, + "loss": 1.2565, + "step": 626 + }, + { + "epoch": 0.5016, + "grad_norm": 0.9094218556866875, + "learning_rate": 0.00010440509842926767, + "loss": 1.2282, + "step": 627 + }, + { + "epoch": 0.5024, + "grad_norm": 0.7589102419386903, + "learning_rate": 0.00010414612822264455, + "loss": 1.0726, + "step": 628 + }, + { + "epoch": 0.5032, + "grad_norm": 0.8568552824329363, + "learning_rate": 0.00010388713015885161, + "loss": 1.2225, + "step": 629 + }, + { + "epoch": 0.504, + "grad_norm": 0.9099573473568132, + "learning_rate": 0.00010362810597805526, + "loss": 1.2192, + "step": 630 + }, + { + "epoch": 0.5048, + "grad_norm": 0.8663868216092737, + "learning_rate": 0.00010336905742059742, + "loss": 1.1698, + "step": 631 + }, + { + "epoch": 0.5056, + "grad_norm": 0.9012087460682382, + "learning_rate": 0.0001031099862269837, + "loss": 1.1963, + "step": 632 + }, + { + "epoch": 0.5064, + "grad_norm": 0.8542139256758909, + "learning_rate": 0.0001028508941378719, + "loss": 1.157, + "step": 633 + }, + { + "epoch": 0.5072, + "grad_norm": 0.7399476383108098, + "learning_rate": 0.00010259178289406011, + "loss": 1.0874, + "step": 634 + }, + { + "epoch": 0.508, + "grad_norm": 0.890064848208203, + "learning_rate": 0.00010233265423647523, + "loss": 1.2032, + "step": 635 + }, + { + "epoch": 0.5088, + "grad_norm": 0.8900187384119745, + "learning_rate": 0.00010207350990616107, + "loss": 1.1205, + "step": 636 + }, + { + "epoch": 0.5096, + "grad_norm": 0.8774863747578419, + "learning_rate": 0.00010181435164426676, + "loss": 1.1616, + "step": 637 + }, + { + "epoch": 0.5104, + "grad_norm": 0.8024417773551542, + "learning_rate": 0.0001015551811920351, + "loss": 1.0815, + "step": 638 + }, + { + "epoch": 0.5112, + "grad_norm": 0.8365325260082347, + "learning_rate": 0.00010129600029079072, + "loss": 1.1137, + "step": 639 + }, + { + "epoch": 0.512, + "grad_norm": 0.7979893484558706, + "learning_rate": 0.00010103681068192845, + "loss": 1.092, + "step": 640 + }, + { + "epoch": 0.5128, + "grad_norm": 0.8700261231040217, + "learning_rate": 0.00010077761410690172, + "loss": 1.2301, + "step": 641 + }, + { + "epoch": 0.5136, + "grad_norm": 0.8760299325080985, + "learning_rate": 0.00010051841230721065, + "loss": 1.0199, + "step": 642 + }, + { + "epoch": 0.5144, + "grad_norm": 0.8407610443896854, + "learning_rate": 0.00010025920702439051, + "loss": 1.1225, + "step": 643 + }, + { + "epoch": 0.5152, + "grad_norm": 0.9984053835752241, + "learning_rate": 0.0001, + "loss": 1.1345, + "step": 644 + }, + { + "epoch": 0.516, + "grad_norm": 0.8939606929722101, + "learning_rate": 9.97407929756095e-05, + "loss": 1.125, + "step": 645 + }, + { + "epoch": 0.5168, + "grad_norm": 0.9033370249042589, + "learning_rate": 9.948158769278939e-05, + "loss": 1.201, + "step": 646 + }, + { + "epoch": 0.5176, + "grad_norm": 0.787837536241076, + "learning_rate": 9.92223858930983e-05, + "loss": 1.1558, + "step": 647 + }, + { + "epoch": 0.5184, + "grad_norm": 0.8355913231040456, + "learning_rate": 9.896318931807155e-05, + "loss": 1.1463, + "step": 648 + }, + { + "epoch": 0.5192, + "grad_norm": 0.8169983302610252, + "learning_rate": 9.870399970920932e-05, + "loss": 1.1265, + "step": 649 + }, + { + "epoch": 0.52, + "grad_norm": 0.7757282371794076, + "learning_rate": 9.844481880796491e-05, + "loss": 1.1897, + "step": 650 + }, + { + "epoch": 0.5208, + "grad_norm": 0.7989135286082059, + "learning_rate": 9.818564835573323e-05, + "loss": 1.1708, + "step": 651 + }, + { + "epoch": 0.5216, + "grad_norm": 0.9121822189834736, + "learning_rate": 9.792649009383899e-05, + "loss": 1.1805, + "step": 652 + }, + { + "epoch": 0.5224, + "grad_norm": 0.82789990823343, + "learning_rate": 9.766734576352478e-05, + "loss": 1.1776, + "step": 653 + }, + { + "epoch": 0.5232, + "grad_norm": 0.930793369669878, + "learning_rate": 9.740821710593989e-05, + "loss": 1.3292, + "step": 654 + }, + { + "epoch": 0.524, + "grad_norm": 0.8918823860727169, + "learning_rate": 9.714910586212816e-05, + "loss": 1.1502, + "step": 655 + }, + { + "epoch": 0.5248, + "grad_norm": 0.8902864870451087, + "learning_rate": 9.689001377301633e-05, + "loss": 1.2186, + "step": 656 + }, + { + "epoch": 0.5256, + "grad_norm": 0.8781025509894115, + "learning_rate": 9.663094257940258e-05, + "loss": 1.2224, + "step": 657 + }, + { + "epoch": 0.5264, + "grad_norm": 0.8121841814461802, + "learning_rate": 9.637189402194476e-05, + "loss": 1.2021, + "step": 658 + }, + { + "epoch": 0.5272, + "grad_norm": 0.7886764492758879, + "learning_rate": 9.611286984114841e-05, + "loss": 1.0356, + "step": 659 + }, + { + "epoch": 0.528, + "grad_norm": 0.7686029066214254, + "learning_rate": 9.585387177735547e-05, + "loss": 1.0723, + "step": 660 + }, + { + "epoch": 0.5288, + "grad_norm": 0.8703794048947431, + "learning_rate": 9.559490157073236e-05, + "loss": 1.075, + "step": 661 + }, + { + "epoch": 0.5296, + "grad_norm": 0.8989762785964018, + "learning_rate": 9.533596096125825e-05, + "loss": 1.1801, + "step": 662 + }, + { + "epoch": 0.5304, + "grad_norm": 0.8864219915448953, + "learning_rate": 9.507705168871358e-05, + "loss": 1.1061, + "step": 663 + }, + { + "epoch": 0.5312, + "grad_norm": 0.8086124406622477, + "learning_rate": 9.481817549266817e-05, + "loss": 1.1339, + "step": 664 + }, + { + "epoch": 0.532, + "grad_norm": 0.8818612647451756, + "learning_rate": 9.455933411246958e-05, + "loss": 1.2096, + "step": 665 + }, + { + "epoch": 0.5328, + "grad_norm": 0.9279341204076482, + "learning_rate": 9.430052928723153e-05, + "loss": 1.0718, + "step": 666 + }, + { + "epoch": 0.5336, + "grad_norm": 0.8212583197585621, + "learning_rate": 9.404176275582208e-05, + "loss": 1.1684, + "step": 667 + }, + { + "epoch": 0.5344, + "grad_norm": 0.9224561529108475, + "learning_rate": 9.378303625685195e-05, + "loss": 1.0358, + "step": 668 + }, + { + "epoch": 0.5352, + "grad_norm": 1.0488626132449583, + "learning_rate": 9.352435152866298e-05, + "loss": 1.1282, + "step": 669 + }, + { + "epoch": 0.536, + "grad_norm": 0.789184234952121, + "learning_rate": 9.326571030931637e-05, + "loss": 1.1243, + "step": 670 + }, + { + "epoch": 0.5368, + "grad_norm": 1.1532029459770452, + "learning_rate": 9.300711433658087e-05, + "loss": 1.1105, + "step": 671 + }, + { + "epoch": 0.5376, + "grad_norm": 0.8583297822926123, + "learning_rate": 9.274856534792138e-05, + "loss": 1.1302, + "step": 672 + }, + { + "epoch": 0.5384, + "grad_norm": 0.8746865884006344, + "learning_rate": 9.249006508048694e-05, + "loss": 1.1228, + "step": 673 + }, + { + "epoch": 0.5392, + "grad_norm": 0.8308540377755012, + "learning_rate": 9.223161527109937e-05, + "loss": 1.0653, + "step": 674 + }, + { + "epoch": 0.54, + "grad_norm": 0.9080365422518107, + "learning_rate": 9.197321765624152e-05, + "loss": 1.136, + "step": 675 + }, + { + "epoch": 0.5408, + "grad_norm": 0.8244965498672249, + "learning_rate": 9.171487397204539e-05, + "loss": 1.1526, + "step": 676 + }, + { + "epoch": 0.5416, + "grad_norm": 0.8778465204353496, + "learning_rate": 9.145658595428074e-05, + "loss": 1.191, + "step": 677 + }, + { + "epoch": 0.5424, + "grad_norm": 0.899596371232968, + "learning_rate": 9.119835533834331e-05, + "loss": 1.2633, + "step": 678 + }, + { + "epoch": 0.5432, + "grad_norm": 0.846027449549948, + "learning_rate": 9.09401838592431e-05, + "loss": 1.1045, + "step": 679 + }, + { + "epoch": 0.544, + "grad_norm": 0.875587666265761, + "learning_rate": 9.068207325159284e-05, + "loss": 1.1452, + "step": 680 + }, + { + "epoch": 0.5448, + "grad_norm": 0.8182419596546981, + "learning_rate": 9.04240252495963e-05, + "loss": 1.0371, + "step": 681 + }, + { + "epoch": 0.5456, + "grad_norm": 0.883399913373032, + "learning_rate": 9.016604158703654e-05, + "loss": 1.2016, + "step": 682 + }, + { + "epoch": 0.5464, + "grad_norm": 0.8182133900424698, + "learning_rate": 8.990812399726435e-05, + "loss": 1.134, + "step": 683 + }, + { + "epoch": 0.5472, + "grad_norm": 0.7773559052370546, + "learning_rate": 8.965027421318665e-05, + "loss": 1.1416, + "step": 684 + }, + { + "epoch": 0.548, + "grad_norm": 0.8151636516880102, + "learning_rate": 8.939249396725467e-05, + "loss": 1.0544, + "step": 685 + }, + { + "epoch": 0.5488, + "grad_norm": 0.8119234078524113, + "learning_rate": 8.913478499145254e-05, + "loss": 1.1735, + "step": 686 + }, + { + "epoch": 0.5496, + "grad_norm": 0.7939689877289976, + "learning_rate": 8.887714901728551e-05, + "loss": 1.1403, + "step": 687 + }, + { + "epoch": 0.5504, + "grad_norm": 0.8701602064505436, + "learning_rate": 8.861958777576827e-05, + "loss": 1.2373, + "step": 688 + }, + { + "epoch": 0.5512, + "grad_norm": 0.8750979269151778, + "learning_rate": 8.836210299741346e-05, + "loss": 1.1367, + "step": 689 + }, + { + "epoch": 0.552, + "grad_norm": 0.8368484768425418, + "learning_rate": 8.810469641222001e-05, + "loss": 1.0717, + "step": 690 + }, + { + "epoch": 0.5528, + "grad_norm": 0.7722128748638069, + "learning_rate": 8.784736974966135e-05, + "loss": 1.0568, + "step": 691 + }, + { + "epoch": 0.5536, + "grad_norm": 0.8650625917734309, + "learning_rate": 8.759012473867407e-05, + "loss": 1.0874, + "step": 692 + }, + { + "epoch": 0.5544, + "grad_norm": 0.868758473046753, + "learning_rate": 8.733296310764611e-05, + "loss": 1.1982, + "step": 693 + }, + { + "epoch": 0.5552, + "grad_norm": 0.7333969388904342, + "learning_rate": 8.707588658440511e-05, + "loss": 1.0735, + "step": 694 + }, + { + "epoch": 0.556, + "grad_norm": 0.9342906386330464, + "learning_rate": 8.6818896896207e-05, + "loss": 1.1111, + "step": 695 + }, + { + "epoch": 0.5568, + "grad_norm": 0.9329161005373878, + "learning_rate": 8.656199576972423e-05, + "loss": 1.1205, + "step": 696 + }, + { + "epoch": 0.5576, + "grad_norm": 0.9165208035013336, + "learning_rate": 8.63051849310342e-05, + "loss": 1.1113, + "step": 697 + }, + { + "epoch": 0.5584, + "grad_norm": 0.8801790291991601, + "learning_rate": 8.604846610560771e-05, + "loss": 1.0137, + "step": 698 + }, + { + "epoch": 0.5592, + "grad_norm": 0.9601549553046083, + "learning_rate": 8.579184101829734e-05, + "loss": 1.1841, + "step": 699 + }, + { + "epoch": 0.56, + "grad_norm": 0.8379558504679411, + "learning_rate": 8.553531139332582e-05, + "loss": 1.1055, + "step": 700 + }, + { + "epoch": 0.5608, + "grad_norm": 0.9093649112724251, + "learning_rate": 8.527887895427454e-05, + "loss": 1.2555, + "step": 701 + }, + { + "epoch": 0.5616, + "grad_norm": 0.7687625068151095, + "learning_rate": 8.502254542407186e-05, + "loss": 0.9919, + "step": 702 + }, + { + "epoch": 0.5624, + "grad_norm": 0.7714325328233042, + "learning_rate": 8.476631252498162e-05, + "loss": 1.1593, + "step": 703 + }, + { + "epoch": 0.5632, + "grad_norm": 0.8718743778557958, + "learning_rate": 8.451018197859153e-05, + "loss": 1.2377, + "step": 704 + }, + { + "epoch": 0.564, + "grad_norm": 0.861892519002274, + "learning_rate": 8.425415550580162e-05, + "loss": 1.1547, + "step": 705 + }, + { + "epoch": 0.5648, + "grad_norm": 0.8471051334080404, + "learning_rate": 8.399823482681262e-05, + "loss": 1.2204, + "step": 706 + }, + { + "epoch": 0.5656, + "grad_norm": 0.7800037231529334, + "learning_rate": 8.374242166111448e-05, + "loss": 1.1018, + "step": 707 + }, + { + "epoch": 0.5664, + "grad_norm": 0.8564345124089149, + "learning_rate": 8.348671772747487e-05, + "loss": 1.1518, + "step": 708 + }, + { + "epoch": 0.5672, + "grad_norm": 0.9264361623337496, + "learning_rate": 8.323112474392731e-05, + "loss": 1.0757, + "step": 709 + }, + { + "epoch": 0.568, + "grad_norm": 0.8712899167429417, + "learning_rate": 8.297564442776014e-05, + "loss": 1.1096, + "step": 710 + }, + { + "epoch": 0.5688, + "grad_norm": 0.7795794260093907, + "learning_rate": 8.272027849550457e-05, + "loss": 1.0942, + "step": 711 + }, + { + "epoch": 0.5696, + "grad_norm": 0.8750109638854078, + "learning_rate": 8.246502866292324e-05, + "loss": 1.0867, + "step": 712 + }, + { + "epoch": 0.5704, + "grad_norm": 0.7642708387310234, + "learning_rate": 8.220989664499878e-05, + "loss": 1.1202, + "step": 713 + }, + { + "epoch": 0.5712, + "grad_norm": 0.8697644696800666, + "learning_rate": 8.195488415592238e-05, + "loss": 1.1798, + "step": 714 + }, + { + "epoch": 0.572, + "grad_norm": 0.7282315195593572, + "learning_rate": 8.169999290908188e-05, + "loss": 0.9475, + "step": 715 + }, + { + "epoch": 0.5728, + "grad_norm": 0.8771028965759182, + "learning_rate": 8.144522461705067e-05, + "loss": 1.0596, + "step": 716 + }, + { + "epoch": 0.5736, + "grad_norm": 0.8276268545588975, + "learning_rate": 8.119058099157604e-05, + "loss": 1.0337, + "step": 717 + }, + { + "epoch": 0.5744, + "grad_norm": 0.9149360167307691, + "learning_rate": 8.093606374356759e-05, + "loss": 1.2154, + "step": 718 + }, + { + "epoch": 0.5752, + "grad_norm": 0.8458772152757161, + "learning_rate": 8.068167458308582e-05, + "loss": 1.1974, + "step": 719 + }, + { + "epoch": 0.576, + "grad_norm": 0.9427927985836158, + "learning_rate": 8.042741521933071e-05, + "loss": 1.2551, + "step": 720 + }, + { + "epoch": 0.5768, + "grad_norm": 0.8554309991838364, + "learning_rate": 8.017328736063006e-05, + "loss": 1.0821, + "step": 721 + }, + { + "epoch": 0.5776, + "grad_norm": 0.810579604922163, + "learning_rate": 7.991929271442817e-05, + "loss": 1.0999, + "step": 722 + }, + { + "epoch": 0.5784, + "grad_norm": 0.8458766496688266, + "learning_rate": 7.966543298727425e-05, + "loss": 1.194, + "step": 723 + }, + { + "epoch": 0.5792, + "grad_norm": 1.041488539269024, + "learning_rate": 7.941170988481108e-05, + "loss": 1.0821, + "step": 724 + }, + { + "epoch": 0.58, + "grad_norm": 1.0716270770436451, + "learning_rate": 7.915812511176347e-05, + "loss": 1.0159, + "step": 725 + }, + { + "epoch": 0.5808, + "grad_norm": 0.7889165860745851, + "learning_rate": 7.89046803719267e-05, + "loss": 1.0456, + "step": 726 + }, + { + "epoch": 0.5816, + "grad_norm": 0.8041429168861712, + "learning_rate": 7.865137736815535e-05, + "loss": 1.1367, + "step": 727 + }, + { + "epoch": 0.5824, + "grad_norm": 0.8154198808415671, + "learning_rate": 7.839821780235168e-05, + "loss": 1.1029, + "step": 728 + }, + { + "epoch": 0.5832, + "grad_norm": 0.8118014580044097, + "learning_rate": 7.814520337545406e-05, + "loss": 1.0941, + "step": 729 + }, + { + "epoch": 0.584, + "grad_norm": 0.792146530511344, + "learning_rate": 7.789233578742582e-05, + "loss": 1.12, + "step": 730 + }, + { + "epoch": 0.5848, + "grad_norm": 0.8868178855674158, + "learning_rate": 7.763961673724379e-05, + "loss": 1.1412, + "step": 731 + }, + { + "epoch": 0.5856, + "grad_norm": 0.8017281660103419, + "learning_rate": 7.738704792288655e-05, + "loss": 1.0368, + "step": 732 + }, + { + "epoch": 0.5864, + "grad_norm": 0.8000691024627, + "learning_rate": 7.713463104132345e-05, + "loss": 1.0888, + "step": 733 + }, + { + "epoch": 0.5872, + "grad_norm": 0.7451097291353643, + "learning_rate": 7.688236778850306e-05, + "loss": 1.0572, + "step": 734 + }, + { + "epoch": 0.588, + "grad_norm": 0.8729647871873557, + "learning_rate": 7.663025985934158e-05, + "loss": 1.1053, + "step": 735 + }, + { + "epoch": 0.5888, + "grad_norm": 0.8788176333054569, + "learning_rate": 7.637830894771175e-05, + "loss": 1.0999, + "step": 736 + }, + { + "epoch": 0.5896, + "grad_norm": 0.7532952898180028, + "learning_rate": 7.61265167464313e-05, + "loss": 1.0621, + "step": 737 + }, + { + "epoch": 0.5904, + "grad_norm": 0.8344408583571611, + "learning_rate": 7.587488494725157e-05, + "loss": 1.1367, + "step": 738 + }, + { + "epoch": 0.5912, + "grad_norm": 0.7920384181717828, + "learning_rate": 7.562341524084623e-05, + "loss": 1.0681, + "step": 739 + }, + { + "epoch": 0.592, + "grad_norm": 0.8010517078779921, + "learning_rate": 7.537210931679987e-05, + "loss": 1.1865, + "step": 740 + }, + { + "epoch": 0.5928, + "grad_norm": 0.8010838919456496, + "learning_rate": 7.512096886359664e-05, + "loss": 1.1533, + "step": 741 + }, + { + "epoch": 0.5936, + "grad_norm": 0.933129482023698, + "learning_rate": 7.48699955686089e-05, + "loss": 1.1516, + "step": 742 + }, + { + "epoch": 0.5944, + "grad_norm": 0.8468459182650288, + "learning_rate": 7.461919111808595e-05, + "loss": 1.1326, + "step": 743 + }, + { + "epoch": 0.5952, + "grad_norm": 0.9162291392781863, + "learning_rate": 7.43685571971426e-05, + "loss": 1.1588, + "step": 744 + }, + { + "epoch": 0.596, + "grad_norm": 0.7736452217785802, + "learning_rate": 7.411809548974792e-05, + "loss": 0.9918, + "step": 745 + }, + { + "epoch": 0.5968, + "grad_norm": 0.8909484010627318, + "learning_rate": 7.386780767871397e-05, + "loss": 1.0871, + "step": 746 + }, + { + "epoch": 0.5976, + "grad_norm": 0.830245407983687, + "learning_rate": 7.361769544568425e-05, + "loss": 1.1392, + "step": 747 + }, + { + "epoch": 0.5984, + "grad_norm": 0.7938151018111826, + "learning_rate": 7.336776047112276e-05, + "loss": 1.0767, + "step": 748 + }, + { + "epoch": 0.5992, + "grad_norm": 0.8050749300928102, + "learning_rate": 7.311800443430251e-05, + "loss": 1.0704, + "step": 749 + }, + { + "epoch": 0.6, + "grad_norm": 0.8422094038655485, + "learning_rate": 7.286842901329412e-05, + "loss": 1.0566, + "step": 750 + }, + { + "epoch": 0.6008, + "grad_norm": 0.8125604137667185, + "learning_rate": 7.26190358849548e-05, + "loss": 1.077, + "step": 751 + }, + { + "epoch": 0.6016, + "grad_norm": 0.8215269206004515, + "learning_rate": 7.236982672491698e-05, + "loss": 1.104, + "step": 752 + }, + { + "epoch": 0.6024, + "grad_norm": 1.080505997510848, + "learning_rate": 7.212080320757695e-05, + "loss": 1.2943, + "step": 753 + }, + { + "epoch": 0.6032, + "grad_norm": 0.8299123958305165, + "learning_rate": 7.187196700608373e-05, + "loss": 1.035, + "step": 754 + }, + { + "epoch": 0.604, + "grad_norm": 0.8292512699924797, + "learning_rate": 7.162331979232783e-05, + "loss": 1.0974, + "step": 755 + }, + { + "epoch": 0.6048, + "grad_norm": 0.9041423336155675, + "learning_rate": 7.137486323692995e-05, + "loss": 1.2077, + "step": 756 + }, + { + "epoch": 0.6056, + "grad_norm": 0.8351246439627844, + "learning_rate": 7.112659900922976e-05, + "loss": 1.2475, + "step": 757 + }, + { + "epoch": 0.6064, + "grad_norm": 0.7916680319988713, + "learning_rate": 7.087852877727481e-05, + "loss": 1.1293, + "step": 758 + }, + { + "epoch": 0.6072, + "grad_norm": 0.8148732530602424, + "learning_rate": 7.06306542078091e-05, + "loss": 1.0803, + "step": 759 + }, + { + "epoch": 0.608, + "grad_norm": 0.8352478784169337, + "learning_rate": 7.038297696626206e-05, + "loss": 1.1248, + "step": 760 + }, + { + "epoch": 0.6088, + "grad_norm": 0.848558267850434, + "learning_rate": 7.013549871673736e-05, + "loss": 1.1294, + "step": 761 + }, + { + "epoch": 0.6096, + "grad_norm": 0.886575182677277, + "learning_rate": 6.988822112200156e-05, + "loss": 1.196, + "step": 762 + }, + { + "epoch": 0.6104, + "grad_norm": 0.8400239260638604, + "learning_rate": 6.964114584347316e-05, + "loss": 1.2257, + "step": 763 + }, + { + "epoch": 0.6112, + "grad_norm": 0.716802554303067, + "learning_rate": 6.939427454121128e-05, + "loss": 1.083, + "step": 764 + }, + { + "epoch": 0.612, + "grad_norm": 0.9234326795779665, + "learning_rate": 6.914760887390452e-05, + "loss": 1.0735, + "step": 765 + }, + { + "epoch": 0.6128, + "grad_norm": 0.8945019976496457, + "learning_rate": 6.890115049885994e-05, + "loss": 1.1687, + "step": 766 + }, + { + "epoch": 0.6136, + "grad_norm": 0.8081601399019864, + "learning_rate": 6.865490107199181e-05, + "loss": 1.1061, + "step": 767 + }, + { + "epoch": 0.6144, + "grad_norm": 0.768131579857086, + "learning_rate": 6.84088622478104e-05, + "loss": 1.0402, + "step": 768 + }, + { + "epoch": 0.6152, + "grad_norm": 0.9024593058339698, + "learning_rate": 6.816303567941112e-05, + "loss": 1.1213, + "step": 769 + }, + { + "epoch": 0.616, + "grad_norm": 0.7386998610259434, + "learning_rate": 6.791742301846326e-05, + "loss": 1.0713, + "step": 770 + }, + { + "epoch": 0.6168, + "grad_norm": 0.7213578892564265, + "learning_rate": 6.767202591519875e-05, + "loss": 1.0458, + "step": 771 + }, + { + "epoch": 0.6176, + "grad_norm": 0.8547907731081595, + "learning_rate": 6.742684601840141e-05, + "loss": 1.0748, + "step": 772 + }, + { + "epoch": 0.6184, + "grad_norm": 0.8885372244769414, + "learning_rate": 6.718188497539554e-05, + "loss": 1.1719, + "step": 773 + }, + { + "epoch": 0.6192, + "grad_norm": 0.759216817677016, + "learning_rate": 6.693714443203507e-05, + "loss": 1.2117, + "step": 774 + }, + { + "epoch": 0.62, + "grad_norm": 0.8640646688740952, + "learning_rate": 6.669262603269246e-05, + "loss": 1.015, + "step": 775 + }, + { + "epoch": 0.6208, + "grad_norm": 0.9505908649729419, + "learning_rate": 6.644833142024751e-05, + "loss": 1.194, + "step": 776 + }, + { + "epoch": 0.6216, + "grad_norm": 0.8414799777767897, + "learning_rate": 6.620426223607654e-05, + "loss": 1.1219, + "step": 777 + }, + { + "epoch": 0.6224, + "grad_norm": 0.7801317291815592, + "learning_rate": 6.59604201200412e-05, + "loss": 1.0187, + "step": 778 + }, + { + "epoch": 0.6232, + "grad_norm": 0.8597928560121036, + "learning_rate": 6.571680671047749e-05, + "loss": 1.1172, + "step": 779 + }, + { + "epoch": 0.624, + "grad_norm": 0.7722221121410255, + "learning_rate": 6.547342364418481e-05, + "loss": 0.9957, + "step": 780 + }, + { + "epoch": 0.6248, + "grad_norm": 0.9123633740913779, + "learning_rate": 6.523027255641493e-05, + "loss": 1.2027, + "step": 781 + }, + { + "epoch": 0.6256, + "grad_norm": 0.8121400103829539, + "learning_rate": 6.498735508086093e-05, + "loss": 1.0574, + "step": 782 + }, + { + "epoch": 0.6264, + "grad_norm": 0.9259710217409359, + "learning_rate": 6.474467284964634e-05, + "loss": 1.1166, + "step": 783 + }, + { + "epoch": 0.6272, + "grad_norm": 0.8962231282015487, + "learning_rate": 6.450222749331414e-05, + "loss": 1.2077, + "step": 784 + }, + { + "epoch": 0.628, + "grad_norm": 0.7736903065729944, + "learning_rate": 6.426002064081565e-05, + "loss": 1.1006, + "step": 785 + }, + { + "epoch": 0.6288, + "grad_norm": 0.905468806571558, + "learning_rate": 6.40180539194999e-05, + "loss": 1.1467, + "step": 786 + }, + { + "epoch": 0.6296, + "grad_norm": 0.8409825776664667, + "learning_rate": 6.377632895510248e-05, + "loss": 1.08, + "step": 787 + }, + { + "epoch": 0.6304, + "grad_norm": 0.8617982497771295, + "learning_rate": 6.35348473717345e-05, + "loss": 1.1161, + "step": 788 + }, + { + "epoch": 0.6312, + "grad_norm": 0.9258322580589109, + "learning_rate": 6.329361079187199e-05, + "loss": 1.1235, + "step": 789 + }, + { + "epoch": 0.632, + "grad_norm": 0.9526781011791231, + "learning_rate": 6.305262083634488e-05, + "loss": 1.1493, + "step": 790 + }, + { + "epoch": 0.6328, + "grad_norm": 0.7781631789182217, + "learning_rate": 6.281187912432587e-05, + "loss": 1.0735, + "step": 791 + }, + { + "epoch": 0.6336, + "grad_norm": 0.7092921491242238, + "learning_rate": 6.25713872733199e-05, + "loss": 1.0933, + "step": 792 + }, + { + "epoch": 0.6344, + "grad_norm": 0.7889253303946118, + "learning_rate": 6.233114689915316e-05, + "loss": 1.0977, + "step": 793 + }, + { + "epoch": 0.6352, + "grad_norm": 0.9442231893690635, + "learning_rate": 6.209115961596208e-05, + "loss": 1.095, + "step": 794 + }, + { + "epoch": 0.636, + "grad_norm": 0.9488461614190968, + "learning_rate": 6.18514270361827e-05, + "loss": 1.2477, + "step": 795 + }, + { + "epoch": 0.6368, + "grad_norm": 0.7714464171885789, + "learning_rate": 6.161195077053976e-05, + "loss": 1.145, + "step": 796 + }, + { + "epoch": 0.6376, + "grad_norm": 0.8367332889082589, + "learning_rate": 6.13727324280358e-05, + "loss": 1.0824, + "step": 797 + }, + { + "epoch": 0.6384, + "grad_norm": 0.755752037576069, + "learning_rate": 6.113377361594049e-05, + "loss": 1.0332, + "step": 798 + }, + { + "epoch": 0.6392, + "grad_norm": 0.8250639067334734, + "learning_rate": 6.08950759397797e-05, + "loss": 1.0402, + "step": 799 + }, + { + "epoch": 0.64, + "grad_norm": 0.7435904683490475, + "learning_rate": 6.065664100332478e-05, + "loss": 1.0871, + "step": 800 + }, + { + "epoch": 0.6408, + "grad_norm": 0.869656041067551, + "learning_rate": 6.0418470408581774e-05, + "loss": 1.1569, + "step": 801 + }, + { + "epoch": 0.6416, + "grad_norm": 0.8342729387941089, + "learning_rate": 6.018056575578075e-05, + "loss": 1.1291, + "step": 802 + }, + { + "epoch": 0.6424, + "grad_norm": 0.7476414619232458, + "learning_rate": 5.9942928643364724e-05, + "loss": 0.9765, + "step": 803 + }, + { + "epoch": 0.6432, + "grad_norm": 0.7805454973026643, + "learning_rate": 5.970556066797941e-05, + "loss": 1.1149, + "step": 804 + }, + { + "epoch": 0.644, + "grad_norm": 0.7940158861016194, + "learning_rate": 5.946846342446214e-05, + "loss": 1.1148, + "step": 805 + }, + { + "epoch": 0.6448, + "grad_norm": 0.816238837296542, + "learning_rate": 5.923163850583113e-05, + "loss": 1.1721, + "step": 806 + }, + { + "epoch": 0.6456, + "grad_norm": 0.8612039632403123, + "learning_rate": 5.899508750327501e-05, + "loss": 1.1537, + "step": 807 + }, + { + "epoch": 0.6464, + "grad_norm": 0.8704542256188401, + "learning_rate": 5.875881200614207e-05, + "loss": 1.1211, + "step": 808 + }, + { + "epoch": 0.6472, + "grad_norm": 0.8410500526207699, + "learning_rate": 5.8522813601929324e-05, + "loss": 1.0405, + "step": 809 + }, + { + "epoch": 0.648, + "grad_norm": 0.8437521301512859, + "learning_rate": 5.828709387627218e-05, + "loss": 1.0505, + "step": 810 + }, + { + "epoch": 0.6488, + "grad_norm": 0.7970168742278224, + "learning_rate": 5.80516544129337e-05, + "loss": 1.1339, + "step": 811 + }, + { + "epoch": 0.6496, + "grad_norm": 0.8056564142638072, + "learning_rate": 5.781649679379378e-05, + "loss": 0.9961, + "step": 812 + }, + { + "epoch": 0.6504, + "grad_norm": 0.8225667991518454, + "learning_rate": 5.758162259883867e-05, + "loss": 1.0822, + "step": 813 + }, + { + "epoch": 0.6512, + "grad_norm": 0.8054113821507712, + "learning_rate": 5.73470334061505e-05, + "loss": 1.0873, + "step": 814 + }, + { + "epoch": 0.652, + "grad_norm": 0.7807098345586613, + "learning_rate": 5.7112730791896207e-05, + "loss": 0.9968, + "step": 815 + }, + { + "epoch": 0.6528, + "grad_norm": 0.9033512475643545, + "learning_rate": 5.687871633031754e-05, + "loss": 1.1096, + "step": 816 + }, + { + "epoch": 0.6536, + "grad_norm": 0.8316099430383033, + "learning_rate": 5.664499159372017e-05, + "loss": 1.05, + "step": 817 + }, + { + "epoch": 0.6544, + "grad_norm": 0.8364325293651905, + "learning_rate": 5.6411558152462894e-05, + "loss": 1.1192, + "step": 818 + }, + { + "epoch": 0.6552, + "grad_norm": 0.8387897132078757, + "learning_rate": 5.617841757494762e-05, + "loss": 1.126, + "step": 819 + }, + { + "epoch": 0.656, + "grad_norm": 0.7943181258355784, + "learning_rate": 5.5945571427608526e-05, + "loss": 1.0905, + "step": 820 + }, + { + "epoch": 0.6568, + "grad_norm": 0.9194687690867345, + "learning_rate": 5.5713021274901335e-05, + "loss": 1.1612, + "step": 821 + }, + { + "epoch": 0.6576, + "grad_norm": 0.7659556619523961, + "learning_rate": 5.54807686792933e-05, + "loss": 0.9791, + "step": 822 + }, + { + "epoch": 0.6584, + "grad_norm": 0.8211925068553112, + "learning_rate": 5.524881520125229e-05, + "loss": 1.0956, + "step": 823 + }, + { + "epoch": 0.6592, + "grad_norm": 0.7689878142328473, + "learning_rate": 5.501716239923642e-05, + "loss": 0.9637, + "step": 824 + }, + { + "epoch": 0.66, + "grad_norm": 0.9475286657707129, + "learning_rate": 5.4785811829683764e-05, + "loss": 1.1987, + "step": 825 + }, + { + "epoch": 0.6608, + "grad_norm": 0.8798707840647109, + "learning_rate": 5.4554765047001613e-05, + "loss": 1.1092, + "step": 826 + }, + { + "epoch": 0.6616, + "grad_norm": 0.9678328660495827, + "learning_rate": 5.432402360355615e-05, + "loss": 1.2474, + "step": 827 + }, + { + "epoch": 0.6624, + "grad_norm": 0.9030882302098774, + "learning_rate": 5.4093589049662175e-05, + "loss": 0.9859, + "step": 828 + }, + { + "epoch": 0.6632, + "grad_norm": 0.8324833034351167, + "learning_rate": 5.386346293357242e-05, + "loss": 1.0637, + "step": 829 + }, + { + "epoch": 0.664, + "grad_norm": 0.8856226930234121, + "learning_rate": 5.363364680146725e-05, + "loss": 1.0574, + "step": 830 + }, + { + "epoch": 0.6648, + "grad_norm": 0.8589605102834983, + "learning_rate": 5.3404142197444506e-05, + "loss": 1.1582, + "step": 831 + }, + { + "epoch": 0.6656, + "grad_norm": 0.8788357210288834, + "learning_rate": 5.31749506635086e-05, + "loss": 1.03, + "step": 832 + }, + { + "epoch": 0.6664, + "grad_norm": 0.930444408144179, + "learning_rate": 5.2946073739560706e-05, + "loss": 1.083, + "step": 833 + }, + { + "epoch": 0.6672, + "grad_norm": 0.8435587500060738, + "learning_rate": 5.271751296338823e-05, + "loss": 1.0741, + "step": 834 + }, + { + "epoch": 0.668, + "grad_norm": 0.787078737860185, + "learning_rate": 5.248926987065417e-05, + "loss": 1.1415, + "step": 835 + }, + { + "epoch": 0.6688, + "grad_norm": 0.9096181402514936, + "learning_rate": 5.226134599488728e-05, + "loss": 1.0613, + "step": 836 + }, + { + "epoch": 0.6696, + "grad_norm": 0.7180597736990848, + "learning_rate": 5.203374286747158e-05, + "loss": 1.0081, + "step": 837 + }, + { + "epoch": 0.6704, + "grad_norm": 0.8183737596107125, + "learning_rate": 5.180646201763577e-05, + "loss": 1.0737, + "step": 838 + }, + { + "epoch": 0.6712, + "grad_norm": 0.7523623613392743, + "learning_rate": 5.15795049724435e-05, + "loss": 1.0959, + "step": 839 + }, + { + "epoch": 0.672, + "grad_norm": 0.8285998941738993, + "learning_rate": 5.135287325678271e-05, + "loss": 1.0251, + "step": 840 + }, + { + "epoch": 0.6728, + "grad_norm": 0.8296654676630226, + "learning_rate": 5.112656839335543e-05, + "loss": 1.1092, + "step": 841 + }, + { + "epoch": 0.6736, + "grad_norm": 0.7946138376139481, + "learning_rate": 5.090059190266779e-05, + "loss": 1.0882, + "step": 842 + }, + { + "epoch": 0.6744, + "grad_norm": 0.862786168494372, + "learning_rate": 5.0674945303019526e-05, + "loss": 0.978, + "step": 843 + }, + { + "epoch": 0.6752, + "grad_norm": 0.7851555341840509, + "learning_rate": 5.0449630110493836e-05, + "loss": 1.0435, + "step": 844 + }, + { + "epoch": 0.676, + "grad_norm": 0.843942554035457, + "learning_rate": 5.022464783894744e-05, + "loss": 1.0004, + "step": 845 + }, + { + "epoch": 0.6768, + "grad_norm": 0.9082315582270839, + "learning_rate": 5.000000000000002e-05, + "loss": 1.1232, + "step": 846 + }, + { + "epoch": 0.6776, + "grad_norm": 0.8716824703464001, + "learning_rate": 4.977568810302432e-05, + "loss": 1.0487, + "step": 847 + }, + { + "epoch": 0.6784, + "grad_norm": 0.8594414287342595, + "learning_rate": 4.955171365513603e-05, + "loss": 1.1496, + "step": 848 + }, + { + "epoch": 0.6792, + "grad_norm": 0.9502004091668262, + "learning_rate": 4.9328078161183464e-05, + "loss": 1.1618, + "step": 849 + }, + { + "epoch": 0.68, + "grad_norm": 0.8516762802902843, + "learning_rate": 4.9104783123737566e-05, + "loss": 1.0397, + "step": 850 + }, + { + "epoch": 0.6808, + "grad_norm": 0.8166207305907555, + "learning_rate": 4.88818300430819e-05, + "loss": 1.1019, + "step": 851 + }, + { + "epoch": 0.6816, + "grad_norm": 0.9482880117564721, + "learning_rate": 4.865922041720239e-05, + "loss": 1.0356, + "step": 852 + }, + { + "epoch": 0.6824, + "grad_norm": 0.7258036421019421, + "learning_rate": 4.843695574177737e-05, + "loss": 1.0178, + "step": 853 + }, + { + "epoch": 0.6832, + "grad_norm": 0.872814346090495, + "learning_rate": 4.821503751016746e-05, + "loss": 1.1421, + "step": 854 + }, + { + "epoch": 0.684, + "grad_norm": 0.8458393993914094, + "learning_rate": 4.7993467213405706e-05, + "loss": 1.0526, + "step": 855 + }, + { + "epoch": 0.6848, + "grad_norm": 0.910122283278521, + "learning_rate": 4.777224634018732e-05, + "loss": 1.1135, + "step": 856 + }, + { + "epoch": 0.6856, + "grad_norm": 0.7783447548656339, + "learning_rate": 4.755137637685979e-05, + "loss": 1.0034, + "step": 857 + }, + { + "epoch": 0.6864, + "grad_norm": 0.7694625021252215, + "learning_rate": 4.733085880741301e-05, + "loss": 1.1224, + "step": 858 + }, + { + "epoch": 0.6872, + "grad_norm": 0.7789699524240866, + "learning_rate": 4.7110695113469085e-05, + "loss": 1.0577, + "step": 859 + }, + { + "epoch": 0.688, + "grad_norm": 0.7966320829391377, + "learning_rate": 4.689088677427249e-05, + "loss": 1.0705, + "step": 860 + }, + { + "epoch": 0.6888, + "grad_norm": 0.7955706972132834, + "learning_rate": 4.6671435266680216e-05, + "loss": 1.0405, + "step": 861 + }, + { + "epoch": 0.6896, + "grad_norm": 1.0213874747735756, + "learning_rate": 4.645234206515171e-05, + "loss": 1.0886, + "step": 862 + }, + { + "epoch": 0.6904, + "grad_norm": 0.8363724959442057, + "learning_rate": 4.623360864173893e-05, + "loss": 0.9931, + "step": 863 + }, + { + "epoch": 0.6912, + "grad_norm": 0.7917579494315584, + "learning_rate": 4.6015236466076747e-05, + "loss": 1.127, + "step": 864 + }, + { + "epoch": 0.692, + "grad_norm": 0.832130475395553, + "learning_rate": 4.579722700537268e-05, + "loss": 1.1778, + "step": 865 + }, + { + "epoch": 0.6928, + "grad_norm": 0.8846275284126474, + "learning_rate": 4.5579581724397255e-05, + "loss": 1.1274, + "step": 866 + }, + { + "epoch": 0.6936, + "grad_norm": 0.82177266481349, + "learning_rate": 4.5362302085474254e-05, + "loss": 1.1013, + "step": 867 + }, + { + "epoch": 0.6944, + "grad_norm": 0.9400441406424015, + "learning_rate": 4.514538954847064e-05, + "loss": 1.1665, + "step": 868 + }, + { + "epoch": 0.6952, + "grad_norm": 0.8310350112225171, + "learning_rate": 4.492884557078688e-05, + "loss": 1.243, + "step": 869 + }, + { + "epoch": 0.696, + "grad_norm": 0.952079058372432, + "learning_rate": 4.471267160734731e-05, + "loss": 1.0113, + "step": 870 + }, + { + "epoch": 0.6968, + "grad_norm": 0.8088607074314236, + "learning_rate": 4.449686911058992e-05, + "loss": 1.0944, + "step": 871 + }, + { + "epoch": 0.6976, + "grad_norm": 0.828801567930494, + "learning_rate": 4.428143953045717e-05, + "loss": 1.1089, + "step": 872 + }, + { + "epoch": 0.6984, + "grad_norm": 0.9020366227162672, + "learning_rate": 4.406638431438576e-05, + "loss": 1.1116, + "step": 873 + }, + { + "epoch": 0.6992, + "grad_norm": 0.9277789986108305, + "learning_rate": 4.385170490729712e-05, + "loss": 1.1, + "step": 874 + }, + { + "epoch": 0.7, + "grad_norm": 0.8626346678476595, + "learning_rate": 4.36374027515878e-05, + "loss": 1.0016, + "step": 875 + }, + { + "epoch": 0.7008, + "grad_norm": 0.7966414003601904, + "learning_rate": 4.342347928711953e-05, + "loss": 0.9873, + "step": 876 + }, + { + "epoch": 0.7016, + "grad_norm": 0.8036041116068522, + "learning_rate": 4.320993595120969e-05, + "loss": 1.0645, + "step": 877 + }, + { + "epoch": 0.7024, + "grad_norm": 0.8720820634045482, + "learning_rate": 4.2996774178621736e-05, + "loss": 1.09, + "step": 878 + }, + { + "epoch": 0.7032, + "grad_norm": 0.875647722790824, + "learning_rate": 4.278399540155536e-05, + "loss": 1.1393, + "step": 879 + }, + { + "epoch": 0.704, + "grad_norm": 0.7897158665335176, + "learning_rate": 4.257160104963696e-05, + "loss": 1.1671, + "step": 880 + }, + { + "epoch": 0.7048, + "grad_norm": 0.8279757558420157, + "learning_rate": 4.2359592549910145e-05, + "loss": 1.0473, + "step": 881 + }, + { + "epoch": 0.7056, + "grad_norm": 0.7553070530855709, + "learning_rate": 4.2147971326825966e-05, + "loss": 1.0388, + "step": 882 + }, + { + "epoch": 0.7064, + "grad_norm": 0.7409780701505099, + "learning_rate": 4.193673880223339e-05, + "loss": 1.0441, + "step": 883 + }, + { + "epoch": 0.7072, + "grad_norm": 0.8813018133474086, + "learning_rate": 4.172589639536991e-05, + "loss": 1.1323, + "step": 884 + }, + { + "epoch": 0.708, + "grad_norm": 0.8916739966164384, + "learning_rate": 4.1515445522851784e-05, + "loss": 1.1638, + "step": 885 + }, + { + "epoch": 0.7088, + "grad_norm": 0.725246831952099, + "learning_rate": 4.130538759866457e-05, + "loss": 1.0564, + "step": 886 + }, + { + "epoch": 0.7096, + "grad_norm": 0.8551037087395035, + "learning_rate": 4.109572403415386e-05, + "loss": 1.0942, + "step": 887 + }, + { + "epoch": 0.7104, + "grad_norm": 0.7667994554139557, + "learning_rate": 4.088645623801534e-05, + "loss": 1.0806, + "step": 888 + }, + { + "epoch": 0.7112, + "grad_norm": 0.7787508174187104, + "learning_rate": 4.0677585616285774e-05, + "loss": 1.097, + "step": 889 + }, + { + "epoch": 0.712, + "grad_norm": 0.7269911908663984, + "learning_rate": 4.046911357233343e-05, + "loss": 1.0385, + "step": 890 + }, + { + "epoch": 0.7128, + "grad_norm": 0.8178400476588422, + "learning_rate": 4.026104150684835e-05, + "loss": 1.0532, + "step": 891 + }, + { + "epoch": 0.7136, + "grad_norm": 0.7338076719326018, + "learning_rate": 4.00533708178334e-05, + "loss": 1.1199, + "step": 892 + }, + { + "epoch": 0.7144, + "grad_norm": 0.9600653046668537, + "learning_rate": 3.984610290059467e-05, + "loss": 1.0884, + "step": 893 + }, + { + "epoch": 0.7152, + "grad_norm": 0.9693259464203958, + "learning_rate": 3.963923914773187e-05, + "loss": 1.121, + "step": 894 + }, + { + "epoch": 0.716, + "grad_norm": 0.8971313534056323, + "learning_rate": 3.943278094912946e-05, + "loss": 1.1076, + "step": 895 + }, + { + "epoch": 0.7168, + "grad_norm": 0.8505547848799991, + "learning_rate": 3.922672969194686e-05, + "loss": 1.1581, + "step": 896 + }, + { + "epoch": 0.7176, + "grad_norm": 0.901973898717887, + "learning_rate": 3.902108676060937e-05, + "loss": 1.114, + "step": 897 + }, + { + "epoch": 0.7184, + "grad_norm": 0.7732708517581511, + "learning_rate": 3.8815853536798904e-05, + "loss": 0.9983, + "step": 898 + }, + { + "epoch": 0.7192, + "grad_norm": 0.8376010427650987, + "learning_rate": 3.861103139944449e-05, + "loss": 1.1088, + "step": 899 + }, + { + "epoch": 0.72, + "grad_norm": 0.8869780495184847, + "learning_rate": 3.840662172471315e-05, + "loss": 1.1958, + "step": 900 + }, + { + "epoch": 0.7208, + "grad_norm": 0.7383696992151155, + "learning_rate": 3.820262588600074e-05, + "loss": 0.94, + "step": 901 + }, + { + "epoch": 0.7216, + "grad_norm": 1.008957136472109, + "learning_rate": 3.79990452539225e-05, + "loss": 1.1233, + "step": 902 + }, + { + "epoch": 0.7224, + "grad_norm": 0.7561674578801395, + "learning_rate": 3.7795881196303995e-05, + "loss": 1.0239, + "step": 903 + }, + { + "epoch": 0.7232, + "grad_norm": 0.7993430654708011, + "learning_rate": 3.759313507817196e-05, + "loss": 1.0965, + "step": 904 + }, + { + "epoch": 0.724, + "grad_norm": 0.9005324807571452, + "learning_rate": 3.739080826174498e-05, + "loss": 1.1309, + "step": 905 + }, + { + "epoch": 0.7248, + "grad_norm": 0.7688379670981546, + "learning_rate": 3.7188902106424416e-05, + "loss": 1.0644, + "step": 906 + }, + { + "epoch": 0.7256, + "grad_norm": 0.755342086444707, + "learning_rate": 3.6987417968785366e-05, + "loss": 1.0308, + "step": 907 + }, + { + "epoch": 0.7264, + "grad_norm": 0.9116587787303737, + "learning_rate": 3.678635720256737e-05, + "loss": 1.1085, + "step": 908 + }, + { + "epoch": 0.7272, + "grad_norm": 0.8840048457007753, + "learning_rate": 3.658572115866541e-05, + "loss": 1.0365, + "step": 909 + }, + { + "epoch": 0.728, + "grad_norm": 0.88369305991926, + "learning_rate": 3.638551118512089e-05, + "loss": 1.113, + "step": 910 + }, + { + "epoch": 0.7288, + "grad_norm": 0.8914593348589701, + "learning_rate": 3.618572862711247e-05, + "loss": 1.0857, + "step": 911 + }, + { + "epoch": 0.7296, + "grad_norm": 0.7202666269984003, + "learning_rate": 3.5986374826947066e-05, + "loss": 0.9599, + "step": 912 + }, + { + "epoch": 0.7304, + "grad_norm": 0.8609160051639316, + "learning_rate": 3.578745112405083e-05, + "loss": 1.0807, + "step": 913 + }, + { + "epoch": 0.7312, + "grad_norm": 0.875315044369369, + "learning_rate": 3.558895885496023e-05, + "loss": 1.0459, + "step": 914 + }, + { + "epoch": 0.732, + "grad_norm": 0.8046586030392038, + "learning_rate": 3.539089935331294e-05, + "loss": 0.9892, + "step": 915 + }, + { + "epoch": 0.7328, + "grad_norm": 0.8845577469864819, + "learning_rate": 3.519327394983888e-05, + "loss": 1.0744, + "step": 916 + }, + { + "epoch": 0.7336, + "grad_norm": 0.81407299976784, + "learning_rate": 3.4996083972351515e-05, + "loss": 1.1137, + "step": 917 + }, + { + "epoch": 0.7344, + "grad_norm": 0.8719550109634547, + "learning_rate": 3.479933074573858e-05, + "loss": 1.0937, + "step": 918 + }, + { + "epoch": 0.7352, + "grad_norm": 0.8078469055176236, + "learning_rate": 3.4603015591953395e-05, + "loss": 1.0418, + "step": 919 + }, + { + "epoch": 0.736, + "grad_norm": 0.8635161003549958, + "learning_rate": 3.440713983000601e-05, + "loss": 1.116, + "step": 920 + }, + { + "epoch": 0.7368, + "grad_norm": 0.8635724625565849, + "learning_rate": 3.421170477595419e-05, + "loss": 1.0166, + "step": 921 + }, + { + "epoch": 0.7376, + "grad_norm": 0.7958853087898546, + "learning_rate": 3.401671174289469e-05, + "loss": 1.0631, + "step": 922 + }, + { + "epoch": 0.7384, + "grad_norm": 0.8541261044016341, + "learning_rate": 3.3822162040954354e-05, + "loss": 1.1747, + "step": 923 + }, + { + "epoch": 0.7392, + "grad_norm": 0.7611045541610414, + "learning_rate": 3.362805697728145e-05, + "loss": 0.9826, + "step": 924 + }, + { + "epoch": 0.74, + "grad_norm": 0.8166688214584858, + "learning_rate": 3.34343978560367e-05, + "loss": 1.155, + "step": 925 + }, + { + "epoch": 0.7408, + "grad_norm": 0.8729492023257425, + "learning_rate": 3.324118597838464e-05, + "loss": 0.9985, + "step": 926 + }, + { + "epoch": 0.7416, + "grad_norm": 0.7338889411154411, + "learning_rate": 3.3048422642484886e-05, + "loss": 0.9764, + "step": 927 + }, + { + "epoch": 0.7424, + "grad_norm": 0.7841742476721919, + "learning_rate": 3.285610914348332e-05, + "loss": 1.0407, + "step": 928 + }, + { + "epoch": 0.7432, + "grad_norm": 0.8190648808105729, + "learning_rate": 3.266424677350346e-05, + "loss": 1.0059, + "step": 929 + }, + { + "epoch": 0.744, + "grad_norm": 1.189971966755002, + "learning_rate": 3.2472836821637744e-05, + "loss": 1.2784, + "step": 930 + }, + { + "epoch": 0.7448, + "grad_norm": 0.7747620673874669, + "learning_rate": 3.228188057393895e-05, + "loss": 1.0338, + "step": 931 + }, + { + "epoch": 0.7456, + "grad_norm": 0.8459819458124576, + "learning_rate": 3.209137931341143e-05, + "loss": 1.0603, + "step": 932 + }, + { + "epoch": 0.7464, + "grad_norm": 0.7299775773284684, + "learning_rate": 3.190133432000252e-05, + "loss": 0.9589, + "step": 933 + }, + { + "epoch": 0.7472, + "grad_norm": 0.873153295737133, + "learning_rate": 3.1711746870594086e-05, + "loss": 1.0276, + "step": 934 + }, + { + "epoch": 0.748, + "grad_norm": 0.7399229179029378, + "learning_rate": 3.1522618238993725e-05, + "loss": 0.9942, + "step": 935 + }, + { + "epoch": 0.7488, + "grad_norm": 0.8081689802013524, + "learning_rate": 3.1333949695926324e-05, + "loss": 0.9782, + "step": 936 + }, + { + "epoch": 0.7496, + "grad_norm": 0.8205043844461317, + "learning_rate": 3.114574250902558e-05, + "loss": 1.1347, + "step": 937 + }, + { + "epoch": 0.7504, + "grad_norm": 0.9300242973554271, + "learning_rate": 3.0957997942825336e-05, + "loss": 1.1063, + "step": 938 + }, + { + "epoch": 0.7512, + "grad_norm": 0.7955739825725661, + "learning_rate": 3.077071725875116e-05, + "loss": 0.9989, + "step": 939 + }, + { + "epoch": 0.752, + "grad_norm": 0.8369237179848538, + "learning_rate": 3.058390171511196e-05, + "loss": 1.0653, + "step": 940 + }, + { + "epoch": 0.7528, + "grad_norm": 0.8877315005007081, + "learning_rate": 3.0397552567091337e-05, + "loss": 1.139, + "step": 941 + }, + { + "epoch": 0.7536, + "grad_norm": 0.9063518978090884, + "learning_rate": 3.021167106673928e-05, + "loss": 1.1666, + "step": 942 + }, + { + "epoch": 0.7544, + "grad_norm": 0.8402301995330217, + "learning_rate": 3.0026258462963787e-05, + "loss": 1.0871, + "step": 943 + }, + { + "epoch": 0.7552, + "grad_norm": 0.7941155124027731, + "learning_rate": 2.9841316001522347e-05, + "loss": 1.0616, + "step": 944 + }, + { + "epoch": 0.756, + "grad_norm": 0.8086180722645121, + "learning_rate": 2.9656844925013637e-05, + "loss": 1.0831, + "step": 945 + }, + { + "epoch": 0.7568, + "grad_norm": 0.7301695836136674, + "learning_rate": 2.9472846472869298e-05, + "loss": 1.0564, + "step": 946 + }, + { + "epoch": 0.7576, + "grad_norm": 0.8824825301242152, + "learning_rate": 2.9289321881345254e-05, + "loss": 1.0591, + "step": 947 + }, + { + "epoch": 0.7584, + "grad_norm": 0.7608663915405037, + "learning_rate": 2.9106272383513835e-05, + "loss": 1.1637, + "step": 948 + }, + { + "epoch": 0.7592, + "grad_norm": 0.9562058298891669, + "learning_rate": 2.8923699209255284e-05, + "loss": 1.0107, + "step": 949 + }, + { + "epoch": 0.76, + "grad_norm": 0.8353655172198402, + "learning_rate": 2.874160358524931e-05, + "loss": 1.0556, + "step": 950 + }, + { + "epoch": 0.7608, + "grad_norm": 0.9446711176411489, + "learning_rate": 2.8559986734967282e-05, + "loss": 1.0598, + "step": 951 + }, + { + "epoch": 0.7616, + "grad_norm": 0.8373112011041507, + "learning_rate": 2.8378849878663628e-05, + "loss": 1.0998, + "step": 952 + }, + { + "epoch": 0.7624, + "grad_norm": 0.8393140098410127, + "learning_rate": 2.819819423336775e-05, + "loss": 1.0959, + "step": 953 + }, + { + "epoch": 0.7632, + "grad_norm": 1.0007850712037603, + "learning_rate": 2.8018021012875994e-05, + "loss": 1.0852, + "step": 954 + }, + { + "epoch": 0.764, + "grad_norm": 0.9547238570159858, + "learning_rate": 2.7838331427743282e-05, + "loss": 1.1677, + "step": 955 + }, + { + "epoch": 0.7648, + "grad_norm": 0.7670799358518546, + "learning_rate": 2.7659126685275027e-05, + "loss": 1.0491, + "step": 956 + }, + { + "epoch": 0.7656, + "grad_norm": 0.8069720746952094, + "learning_rate": 2.7480407989519198e-05, + "loss": 1.0403, + "step": 957 + }, + { + "epoch": 0.7664, + "grad_norm": 0.8323483451268169, + "learning_rate": 2.7302176541257986e-05, + "loss": 1.1721, + "step": 958 + }, + { + "epoch": 0.7672, + "grad_norm": 0.8665610531801702, + "learning_rate": 2.712443353799984e-05, + "loss": 1.0433, + "step": 959 + }, + { + "epoch": 0.768, + "grad_norm": 0.820120496167094, + "learning_rate": 2.6947180173971508e-05, + "loss": 1.0389, + "step": 960 + }, + { + "epoch": 0.7688, + "grad_norm": 0.8668907325122193, + "learning_rate": 2.677041764010988e-05, + "loss": 1.0486, + "step": 961 + }, + { + "epoch": 0.7696, + "grad_norm": 0.9215499864182681, + "learning_rate": 2.659414712405398e-05, + "loss": 1.2304, + "step": 962 + }, + { + "epoch": 0.7704, + "grad_norm": 0.8575033778345631, + "learning_rate": 2.6418369810137188e-05, + "loss": 1.1016, + "step": 963 + }, + { + "epoch": 0.7712, + "grad_norm": 0.7885383565624948, + "learning_rate": 2.6243086879379e-05, + "loss": 1.1444, + "step": 964 + }, + { + "epoch": 0.772, + "grad_norm": 0.9616420533689475, + "learning_rate": 2.6068299509477266e-05, + "loss": 1.0858, + "step": 965 + }, + { + "epoch": 0.7728, + "grad_norm": 0.8000352388587801, + "learning_rate": 2.5894008874800325e-05, + "loss": 1.0613, + "step": 966 + }, + { + "epoch": 0.7736, + "grad_norm": 0.9432878918791088, + "learning_rate": 2.5720216146378917e-05, + "loss": 1.083, + "step": 967 + }, + { + "epoch": 0.7744, + "grad_norm": 0.8581930522375921, + "learning_rate": 2.5546922491898495e-05, + "loss": 1.0763, + "step": 968 + }, + { + "epoch": 0.7752, + "grad_norm": 0.8477439535159955, + "learning_rate": 2.5374129075691265e-05, + "loss": 1.0084, + "step": 969 + }, + { + "epoch": 0.776, + "grad_norm": 0.8954126142945501, + "learning_rate": 2.5201837058728505e-05, + "loss": 1.1148, + "step": 970 + }, + { + "epoch": 0.7768, + "grad_norm": 0.8451424029281424, + "learning_rate": 2.503004759861258e-05, + "loss": 1.0159, + "step": 971 + }, + { + "epoch": 0.7776, + "grad_norm": 0.7687739108451231, + "learning_rate": 2.485876184956928e-05, + "loss": 1.0414, + "step": 972 + }, + { + "epoch": 0.7784, + "grad_norm": 0.8232387388107232, + "learning_rate": 2.4687980962440072e-05, + "loss": 1.0491, + "step": 973 + }, + { + "epoch": 0.7792, + "grad_norm": 0.8670984144963605, + "learning_rate": 2.451770608467432e-05, + "loss": 1.045, + "step": 974 + }, + { + "epoch": 0.78, + "grad_norm": 0.9207845432159518, + "learning_rate": 2.4347938360321566e-05, + "loss": 0.9877, + "step": 975 + }, + { + "epoch": 0.7808, + "grad_norm": 1.0445500951448323, + "learning_rate": 2.417867893002387e-05, + "loss": 1.1575, + "step": 976 + }, + { + "epoch": 0.7816, + "grad_norm": 0.8525396222453334, + "learning_rate": 2.400992893100822e-05, + "loss": 1.0176, + "step": 977 + }, + { + "epoch": 0.7824, + "grad_norm": 0.847576403477811, + "learning_rate": 2.3841689497078746e-05, + "loss": 1.1012, + "step": 978 + }, + { + "epoch": 0.7832, + "grad_norm": 0.8689298591511133, + "learning_rate": 2.3673961758609152e-05, + "loss": 1.138, + "step": 979 + }, + { + "epoch": 0.784, + "grad_norm": 0.8140926501993877, + "learning_rate": 2.3506746842535242e-05, + "loss": 1.0843, + "step": 980 + }, + { + "epoch": 0.7848, + "grad_norm": 0.7594070690792206, + "learning_rate": 2.334004587234717e-05, + "loss": 0.9893, + "step": 981 + }, + { + "epoch": 0.7856, + "grad_norm": 0.7554074010238546, + "learning_rate": 2.3173859968081944e-05, + "loss": 1.0272, + "step": 982 + }, + { + "epoch": 0.7864, + "grad_norm": 0.8250669999383188, + "learning_rate": 2.300819024631603e-05, + "loss": 1.0714, + "step": 983 + }, + { + "epoch": 0.7872, + "grad_norm": 0.8301229569066313, + "learning_rate": 2.2843037820157675e-05, + "loss": 1.0756, + "step": 984 + }, + { + "epoch": 0.788, + "grad_norm": 0.8601687215822483, + "learning_rate": 2.26784037992395e-05, + "loss": 1.0887, + "step": 985 + }, + { + "epoch": 0.7888, + "grad_norm": 0.8506451142903321, + "learning_rate": 2.251428928971102e-05, + "loss": 1.1066, + "step": 986 + }, + { + "epoch": 0.7896, + "grad_norm": 0.8198327681216107, + "learning_rate": 2.2350695394231345e-05, + "loss": 0.9676, + "step": 987 + }, + { + "epoch": 0.7904, + "grad_norm": 0.9012566805483494, + "learning_rate": 2.2187623211961562e-05, + "loss": 1.0499, + "step": 988 + }, + { + "epoch": 0.7912, + "grad_norm": 0.7889708330455324, + "learning_rate": 2.2025073838557454e-05, + "loss": 1.1374, + "step": 989 + }, + { + "epoch": 0.792, + "grad_norm": 0.7479780383891586, + "learning_rate": 2.1863048366162208e-05, + "loss": 1.1091, + "step": 990 + }, + { + "epoch": 0.7928, + "grad_norm": 0.8085409919807247, + "learning_rate": 2.1701547883398922e-05, + "loss": 1.0892, + "step": 991 + }, + { + "epoch": 0.7936, + "grad_norm": 0.8669689768248773, + "learning_rate": 2.1540573475363402e-05, + "loss": 1.0941, + "step": 992 + }, + { + "epoch": 0.7944, + "grad_norm": 0.7680590513678617, + "learning_rate": 2.138012622361689e-05, + "loss": 1.1078, + "step": 993 + }, + { + "epoch": 0.7952, + "grad_norm": 0.8266768713093456, + "learning_rate": 2.1220207206178688e-05, + "loss": 1.021, + "step": 994 + }, + { + "epoch": 0.796, + "grad_norm": 0.8507316837188268, + "learning_rate": 2.106081749751897e-05, + "loss": 1.0557, + "step": 995 + }, + { + "epoch": 0.7968, + "grad_norm": 0.7259233668247581, + "learning_rate": 2.0901958168551638e-05, + "loss": 0.9342, + "step": 996 + }, + { + "epoch": 0.7976, + "grad_norm": 0.9510904005306071, + "learning_rate": 2.0743630286627002e-05, + "loss": 1.0499, + "step": 997 + }, + { + "epoch": 0.7984, + "grad_norm": 0.8679214790847098, + "learning_rate": 2.058583491552465e-05, + "loss": 1.0864, + "step": 998 + }, + { + "epoch": 0.7992, + "grad_norm": 0.8388281221981571, + "learning_rate": 2.0428573115446392e-05, + "loss": 1.1251, + "step": 999 + }, + { + "epoch": 0.8, + "grad_norm": 0.7967302882328616, + "learning_rate": 2.027184594300898e-05, + "loss": 1.0881, + "step": 1000 + }, + { + "epoch": 0.8008, + "grad_norm": 0.8788941746699341, + "learning_rate": 2.011565445123711e-05, + "loss": 1.0843, + "step": 1001 + }, + { + "epoch": 0.8016, + "grad_norm": 0.8310993678093996, + "learning_rate": 1.995999968955641e-05, + "loss": 1.1267, + "step": 1002 + }, + { + "epoch": 0.8024, + "grad_norm": 0.7422918123056207, + "learning_rate": 1.980488270378612e-05, + "loss": 0.9686, + "step": 1003 + }, + { + "epoch": 0.8032, + "grad_norm": 0.8580935822649455, + "learning_rate": 1.9650304536132426e-05, + "loss": 1.1098, + "step": 1004 + }, + { + "epoch": 0.804, + "grad_norm": 0.9062482236530666, + "learning_rate": 1.9496266225181248e-05, + "loss": 1.1375, + "step": 1005 + }, + { + "epoch": 0.8048, + "grad_norm": 0.8078159675834186, + "learning_rate": 1.9342768805891178e-05, + "loss": 1.0216, + "step": 1006 + }, + { + "epoch": 0.8056, + "grad_norm": 0.8974067691172537, + "learning_rate": 1.918981330958678e-05, + "loss": 1.1616, + "step": 1007 + }, + { + "epoch": 0.8064, + "grad_norm": 0.7601753406737961, + "learning_rate": 1.903740076395151e-05, + "loss": 1.0938, + "step": 1008 + }, + { + "epoch": 0.8072, + "grad_norm": 0.833755893544062, + "learning_rate": 1.8885532193020704e-05, + "loss": 1.0737, + "step": 1009 + }, + { + "epoch": 0.808, + "grad_norm": 0.8417856272896627, + "learning_rate": 1.8734208617174988e-05, + "loss": 1.0216, + "step": 1010 + }, + { + "epoch": 0.8088, + "grad_norm": 0.7192037601325971, + "learning_rate": 1.8583431053133127e-05, + "loss": 0.9657, + "step": 1011 + }, + { + "epoch": 0.8096, + "grad_norm": 0.8325532445569533, + "learning_rate": 1.8433200513945337e-05, + "loss": 1.1143, + "step": 1012 + }, + { + "epoch": 0.8104, + "grad_norm": 0.8143603763208344, + "learning_rate": 1.8283518008986567e-05, + "loss": 1.0981, + "step": 1013 + }, + { + "epoch": 0.8112, + "grad_norm": 0.8315570305000287, + "learning_rate": 1.8134384543949478e-05, + "loss": 1.0619, + "step": 1014 + }, + { + "epoch": 0.812, + "grad_norm": 0.9431012095409678, + "learning_rate": 1.7985801120837865e-05, + "loss": 1.0707, + "step": 1015 + }, + { + "epoch": 0.8128, + "grad_norm": 1.0858550918279, + "learning_rate": 1.783776873795994e-05, + "loss": 1.1165, + "step": 1016 + }, + { + "epoch": 0.8136, + "grad_norm": 0.8236739587451712, + "learning_rate": 1.7690288389921493e-05, + "loss": 1.0156, + "step": 1017 + }, + { + "epoch": 0.8144, + "grad_norm": 0.797923169332508, + "learning_rate": 1.754336106761927e-05, + "loss": 1.0358, + "step": 1018 + }, + { + "epoch": 0.8152, + "grad_norm": 0.9566963752166892, + "learning_rate": 1.739698775823442e-05, + "loss": 1.1489, + "step": 1019 + }, + { + "epoch": 0.816, + "grad_norm": 0.7864660429156809, + "learning_rate": 1.7251169445225657e-05, + "loss": 1.0898, + "step": 1020 + }, + { + "epoch": 0.8168, + "grad_norm": 0.7712612996367878, + "learning_rate": 1.7105907108322816e-05, + "loss": 0.9535, + "step": 1021 + }, + { + "epoch": 0.8176, + "grad_norm": 0.7646883255608178, + "learning_rate": 1.696120172352025e-05, + "loss": 1.1222, + "step": 1022 + }, + { + "epoch": 0.8184, + "grad_norm": 0.8401577710670294, + "learning_rate": 1.6817054263070174e-05, + "loss": 1.2397, + "step": 1023 + }, + { + "epoch": 0.8192, + "grad_norm": 0.8497089754801997, + "learning_rate": 1.6673465695476232e-05, + "loss": 0.9819, + "step": 1024 + }, + { + "epoch": 0.82, + "grad_norm": 0.7901100406395174, + "learning_rate": 1.6530436985486996e-05, + "loss": 0.987, + "step": 1025 + }, + { + "epoch": 0.8208, + "grad_norm": 0.7882408403718572, + "learning_rate": 1.6387969094089316e-05, + "loss": 1.0182, + "step": 1026 + }, + { + "epoch": 0.8216, + "grad_norm": 0.7727435639026202, + "learning_rate": 1.6246062978502164e-05, + "loss": 0.9139, + "step": 1027 + }, + { + "epoch": 0.8224, + "grad_norm": 0.9867700550888403, + "learning_rate": 1.6104719592169902e-05, + "loss": 1.1207, + "step": 1028 + }, + { + "epoch": 0.8232, + "grad_norm": 0.8901041929563709, + "learning_rate": 1.5963939884756042e-05, + "loss": 1.0642, + "step": 1029 + }, + { + "epoch": 0.824, + "grad_norm": 0.8830215405712524, + "learning_rate": 1.5823724802136865e-05, + "loss": 1.0436, + "step": 1030 + }, + { + "epoch": 0.8248, + "grad_norm": 0.7917962858116705, + "learning_rate": 1.5684075286394985e-05, + "loss": 0.8976, + "step": 1031 + }, + { + "epoch": 0.8256, + "grad_norm": 0.8433163117805252, + "learning_rate": 1.5544992275813053e-05, + "loss": 1.0382, + "step": 1032 + }, + { + "epoch": 0.8264, + "grad_norm": 0.7716275597028938, + "learning_rate": 1.5406476704867524e-05, + "loss": 0.9649, + "step": 1033 + }, + { + "epoch": 0.8272, + "grad_norm": 0.8069655979220444, + "learning_rate": 1.526852950422226e-05, + "loss": 1.0419, + "step": 1034 + }, + { + "epoch": 0.828, + "grad_norm": 0.8930329120684376, + "learning_rate": 1.5131151600722337e-05, + "loss": 1.1139, + "step": 1035 + }, + { + "epoch": 0.8288, + "grad_norm": 0.7984848868984005, + "learning_rate": 1.4994343917387854e-05, + "loss": 0.9043, + "step": 1036 + }, + { + "epoch": 0.8296, + "grad_norm": 0.8793418989596367, + "learning_rate": 1.485810737340767e-05, + "loss": 0.9909, + "step": 1037 + }, + { + "epoch": 0.8304, + "grad_norm": 0.843670675814957, + "learning_rate": 1.4722442884133214e-05, + "loss": 1.0708, + "step": 1038 + }, + { + "epoch": 0.8312, + "grad_norm": 0.8266625193427682, + "learning_rate": 1.4587351361072454e-05, + "loss": 0.9907, + "step": 1039 + }, + { + "epoch": 0.832, + "grad_norm": 0.8455368846406135, + "learning_rate": 1.4452833711883628e-05, + "loss": 1.0169, + "step": 1040 + }, + { + "epoch": 0.8328, + "grad_norm": 0.8443310120368719, + "learning_rate": 1.4318890840369182e-05, + "loss": 0.9915, + "step": 1041 + }, + { + "epoch": 0.8336, + "grad_norm": 0.8460089788670055, + "learning_rate": 1.4185523646469822e-05, + "loss": 1.135, + "step": 1042 + }, + { + "epoch": 0.8344, + "grad_norm": 0.9201811307817429, + "learning_rate": 1.4052733026258281e-05, + "loss": 1.1082, + "step": 1043 + }, + { + "epoch": 0.8352, + "grad_norm": 0.8339010425588553, + "learning_rate": 1.3920519871933424e-05, + "loss": 1.0462, + "step": 1044 + }, + { + "epoch": 0.836, + "grad_norm": 0.7982008593372306, + "learning_rate": 1.3788885071814172e-05, + "loss": 1.0188, + "step": 1045 + }, + { + "epoch": 0.8368, + "grad_norm": 0.8085660300902544, + "learning_rate": 1.3657829510333654e-05, + "loss": 0.9638, + "step": 1046 + }, + { + "epoch": 0.8376, + "grad_norm": 0.8227448081758223, + "learning_rate": 1.3527354068033139e-05, + "loss": 1.1124, + "step": 1047 + }, + { + "epoch": 0.8384, + "grad_norm": 0.7519255247523404, + "learning_rate": 1.339745962155613e-05, + "loss": 0.9756, + "step": 1048 + }, + { + "epoch": 0.8392, + "grad_norm": 0.7723108828050917, + "learning_rate": 1.326814704364262e-05, + "loss": 1.0099, + "step": 1049 + }, + { + "epoch": 0.84, + "grad_norm": 0.848142675290011, + "learning_rate": 1.3139417203123027e-05, + "loss": 1.061, + "step": 1050 + }, + { + "epoch": 0.8408, + "grad_norm": 0.8548972885408882, + "learning_rate": 1.3011270964912459e-05, + "loss": 1.0671, + "step": 1051 + }, + { + "epoch": 0.8416, + "grad_norm": 0.8339445160342264, + "learning_rate": 1.2883709190004955e-05, + "loss": 0.9893, + "step": 1052 + }, + { + "epoch": 0.8424, + "grad_norm": 0.8766689327185713, + "learning_rate": 1.275673273546758e-05, + "loss": 1.1564, + "step": 1053 + }, + { + "epoch": 0.8432, + "grad_norm": 0.7853075453523916, + "learning_rate": 1.263034245443473e-05, + "loss": 1.0244, + "step": 1054 + }, + { + "epoch": 0.844, + "grad_norm": 0.8570147208703542, + "learning_rate": 1.2504539196102439e-05, + "loss": 1.0484, + "step": 1055 + }, + { + "epoch": 0.8448, + "grad_norm": 0.8136814107398292, + "learning_rate": 1.2379323805722576e-05, + "loss": 1.0179, + "step": 1056 + }, + { + "epoch": 0.8456, + "grad_norm": 0.769578688484213, + "learning_rate": 1.2254697124597237e-05, + "loss": 1.0155, + "step": 1057 + }, + { + "epoch": 0.8464, + "grad_norm": 0.7643325804101265, + "learning_rate": 1.2130659990073146e-05, + "loss": 1.0514, + "step": 1058 + }, + { + "epoch": 0.8472, + "grad_norm": 0.8210295279927137, + "learning_rate": 1.2007213235535786e-05, + "loss": 1.0158, + "step": 1059 + }, + { + "epoch": 0.848, + "grad_norm": 0.8542721125276475, + "learning_rate": 1.1884357690404158e-05, + "loss": 1.0017, + "step": 1060 + }, + { + "epoch": 0.8488, + "grad_norm": 0.7785321602297618, + "learning_rate": 1.176209418012495e-05, + "loss": 1.077, + "step": 1061 + }, + { + "epoch": 0.8496, + "grad_norm": 0.7314186123390642, + "learning_rate": 1.1640423526166988e-05, + "loss": 1.0268, + "step": 1062 + }, + { + "epoch": 0.8504, + "grad_norm": 0.8435403090839856, + "learning_rate": 1.1519346546015907e-05, + "loss": 1.096, + "step": 1063 + }, + { + "epoch": 0.8512, + "grad_norm": 0.8724304024662537, + "learning_rate": 1.1398864053168534e-05, + "loss": 1.0366, + "step": 1064 + }, + { + "epoch": 0.852, + "grad_norm": 0.7369835686536529, + "learning_rate": 1.1278976857127311e-05, + "loss": 1.0144, + "step": 1065 + }, + { + "epoch": 0.8528, + "grad_norm": 0.8181065216676432, + "learning_rate": 1.1159685763395111e-05, + "loss": 1.0245, + "step": 1066 + }, + { + "epoch": 0.8536, + "grad_norm": 0.8815239530451991, + "learning_rate": 1.1040991573469629e-05, + "loss": 1.1473, + "step": 1067 + }, + { + "epoch": 0.8544, + "grad_norm": 0.8032536779947032, + "learning_rate": 1.0922895084838037e-05, + "loss": 1.0423, + "step": 1068 + }, + { + "epoch": 0.8552, + "grad_norm": 0.7131358007552208, + "learning_rate": 1.0805397090971737e-05, + "loss": 1.0076, + "step": 1069 + }, + { + "epoch": 0.856, + "grad_norm": 0.8561235966926187, + "learning_rate": 1.0688498381320855e-05, + "loss": 0.9958, + "step": 1070 + }, + { + "epoch": 0.8568, + "grad_norm": 0.8337193626782963, + "learning_rate": 1.057219974130903e-05, + "loss": 1.0355, + "step": 1071 + }, + { + "epoch": 0.8576, + "grad_norm": 1.0554434463308862, + "learning_rate": 1.045650195232819e-05, + "loss": 1.0646, + "step": 1072 + }, + { + "epoch": 0.8584, + "grad_norm": 0.8834137892193425, + "learning_rate": 1.0341405791733183e-05, + "loss": 1.1266, + "step": 1073 + }, + { + "epoch": 0.8592, + "grad_norm": 0.8081061581358774, + "learning_rate": 1.0226912032836611e-05, + "loss": 0.9963, + "step": 1074 + }, + { + "epoch": 0.86, + "grad_norm": 0.7790424195012522, + "learning_rate": 1.0113021444903726e-05, + "loss": 0.9875, + "step": 1075 + }, + { + "epoch": 0.8608, + "grad_norm": 0.9447272471765829, + "learning_rate": 9.999734793146998e-06, + "loss": 1.18, + "step": 1076 + }, + { + "epoch": 0.8616, + "grad_norm": 0.7520148284219924, + "learning_rate": 9.887052838721322e-06, + "loss": 0.9588, + "step": 1077 + }, + { + "epoch": 0.8624, + "grad_norm": 0.8934743042970116, + "learning_rate": 9.774976338718677e-06, + "loss": 1.086, + "step": 1078 + }, + { + "epoch": 0.8632, + "grad_norm": 0.8616689862663275, + "learning_rate": 9.663506046162985e-06, + "loss": 1.0719, + "step": 1079 + }, + { + "epoch": 0.864, + "grad_norm": 0.7922685409060988, + "learning_rate": 9.552642710005299e-06, + "loss": 0.9262, + "step": 1080 + }, + { + "epoch": 0.8648, + "grad_norm": 0.7694082262129455, + "learning_rate": 9.44238707511862e-06, + "loss": 0.9479, + "step": 1081 + }, + { + "epoch": 0.8656, + "grad_norm": 0.8757202664560848, + "learning_rate": 9.332739882292752e-06, + "loss": 1.0836, + "step": 1082 + }, + { + "epoch": 0.8664, + "grad_norm": 0.8801430802878017, + "learning_rate": 9.22370186822965e-06, + "loss": 1.208, + "step": 1083 + }, + { + "epoch": 0.8672, + "grad_norm": 0.8873892163530365, + "learning_rate": 9.115273765538202e-06, + "loss": 1.005, + "step": 1084 + }, + { + "epoch": 0.868, + "grad_norm": 0.7935533814333376, + "learning_rate": 9.0074563027294e-06, + "loss": 1.0279, + "step": 1085 + }, + { + "epoch": 0.8688, + "grad_norm": 0.8438568284727476, + "learning_rate": 8.900250204211514e-06, + "loss": 1.0458, + "step": 1086 + }, + { + "epoch": 0.8696, + "grad_norm": 0.9029979799434774, + "learning_rate": 8.79365619028507e-06, + "loss": 1.1174, + "step": 1087 + }, + { + "epoch": 0.8704, + "grad_norm": 0.854865680023673, + "learning_rate": 8.687674977138116e-06, + "loss": 1.0298, + "step": 1088 + }, + { + "epoch": 0.8712, + "grad_norm": 0.9227965088124007, + "learning_rate": 8.582307276841462e-06, + "loss": 1.1905, + "step": 1089 + }, + { + "epoch": 0.872, + "grad_norm": 0.7845733244188352, + "learning_rate": 8.47755379734373e-06, + "loss": 1.0777, + "step": 1090 + }, + { + "epoch": 0.8728, + "grad_norm": 0.9076594402743925, + "learning_rate": 8.37341524246672e-06, + "loss": 1.1409, + "step": 1091 + }, + { + "epoch": 0.8736, + "grad_norm": 0.8343244746160255, + "learning_rate": 8.269892311900696e-06, + "loss": 1.0167, + "step": 1092 + }, + { + "epoch": 0.8744, + "grad_norm": 0.7413090833309477, + "learning_rate": 8.166985701199582e-06, + "loss": 1.0595, + "step": 1093 + }, + { + "epoch": 0.8752, + "grad_norm": 0.7810378713617924, + "learning_rate": 8.064696101776358e-06, + "loss": 1.0228, + "step": 1094 + }, + { + "epoch": 0.876, + "grad_norm": 0.8818267886496703, + "learning_rate": 7.963024200898462e-06, + "loss": 1.0787, + "step": 1095 + }, + { + "epoch": 0.8768, + "grad_norm": 0.9149056583819745, + "learning_rate": 7.861970681683051e-06, + "loss": 1.1618, + "step": 1096 + }, + { + "epoch": 0.8776, + "grad_norm": 0.7690539234652768, + "learning_rate": 7.761536223092458e-06, + "loss": 0.9936, + "step": 1097 + }, + { + "epoch": 0.8784, + "grad_norm": 0.8681153381027179, + "learning_rate": 7.661721499929753e-06, + "loss": 0.9168, + "step": 1098 + }, + { + "epoch": 0.8792, + "grad_norm": 0.7567542671180796, + "learning_rate": 7.562527182833978e-06, + "loss": 1.0454, + "step": 1099 + }, + { + "epoch": 0.88, + "grad_norm": 0.7510789582140142, + "learning_rate": 7.463953938275858e-06, + "loss": 0.9934, + "step": 1100 + }, + { + "epoch": 0.8808, + "grad_norm": 0.7077461773353635, + "learning_rate": 7.366002428553153e-06, + "loss": 0.9656, + "step": 1101 + }, + { + "epoch": 0.8816, + "grad_norm": 0.8122835526864932, + "learning_rate": 7.2686733117863784e-06, + "loss": 1.0685, + "step": 1102 + }, + { + "epoch": 0.8824, + "grad_norm": 0.8701275544447438, + "learning_rate": 7.171967241914224e-06, + "loss": 1.1241, + "step": 1103 + }, + { + "epoch": 0.8832, + "grad_norm": 0.8299484468653472, + "learning_rate": 7.07588486868922e-06, + "loss": 1.0042, + "step": 1104 + }, + { + "epoch": 0.884, + "grad_norm": 0.7762025442493229, + "learning_rate": 6.980426837673437e-06, + "loss": 0.9914, + "step": 1105 + }, + { + "epoch": 0.8848, + "grad_norm": 0.8600018089658686, + "learning_rate": 6.8855937902340576e-06, + "loss": 1.1116, + "step": 1106 + }, + { + "epoch": 0.8856, + "grad_norm": 0.7916195175227821, + "learning_rate": 6.791386363539065e-06, + "loss": 1.0024, + "step": 1107 + }, + { + "epoch": 0.8864, + "grad_norm": 0.7975427193955309, + "learning_rate": 6.6978051905530855e-06, + "loss": 1.0999, + "step": 1108 + }, + { + "epoch": 0.8872, + "grad_norm": 0.7203807251928327, + "learning_rate": 6.604850900032955e-06, + "loss": 0.9853, + "step": 1109 + }, + { + "epoch": 0.888, + "grad_norm": 0.7442501433407781, + "learning_rate": 6.512524116523633e-06, + "loss": 0.957, + "step": 1110 + }, + { + "epoch": 0.8888, + "grad_norm": 0.9649373646508272, + "learning_rate": 6.420825460353974e-06, + "loss": 1.1912, + "step": 1111 + }, + { + "epoch": 0.8896, + "grad_norm": 0.8221262344919317, + "learning_rate": 6.329755547632499e-06, + "loss": 1.0573, + "step": 1112 + }, + { + "epoch": 0.8904, + "grad_norm": 0.8817730038905364, + "learning_rate": 6.239314990243339e-06, + "loss": 1.0231, + "step": 1113 + }, + { + "epoch": 0.8912, + "grad_norm": 0.7588643695285561, + "learning_rate": 6.149504395842087e-06, + "loss": 0.9955, + "step": 1114 + }, + { + "epoch": 0.892, + "grad_norm": 0.7927470871468772, + "learning_rate": 6.0603243678516995e-06, + "loss": 0.9565, + "step": 1115 + }, + { + "epoch": 0.8928, + "grad_norm": 0.8365527062220788, + "learning_rate": 5.971775505458444e-06, + "loss": 0.9795, + "step": 1116 + }, + { + "epoch": 0.8936, + "grad_norm": 0.882373043265085, + "learning_rate": 5.883858403607967e-06, + "loss": 1.0722, + "step": 1117 + }, + { + "epoch": 0.8944, + "grad_norm": 0.8038726568288451, + "learning_rate": 5.7965736530010916e-06, + "loss": 0.9725, + "step": 1118 + }, + { + "epoch": 0.8952, + "grad_norm": 0.6968209369455347, + "learning_rate": 5.7099218400900716e-06, + "loss": 0.9518, + "step": 1119 + }, + { + "epoch": 0.896, + "grad_norm": 0.8053464432992384, + "learning_rate": 5.623903547074549e-06, + "loss": 1.1971, + "step": 1120 + }, + { + "epoch": 0.8968, + "grad_norm": 0.9487223791910389, + "learning_rate": 5.538519351897575e-06, + "loss": 1.143, + "step": 1121 + }, + { + "epoch": 0.8976, + "grad_norm": 0.8269540682182868, + "learning_rate": 5.453769828241872e-06, + "loss": 1.0476, + "step": 1122 + }, + { + "epoch": 0.8984, + "grad_norm": 0.8959769043876853, + "learning_rate": 5.369655545525909e-06, + "loss": 1.0322, + "step": 1123 + }, + { + "epoch": 0.8992, + "grad_norm": 0.8482885755213688, + "learning_rate": 5.286177068899989e-06, + "loss": 1.0814, + "step": 1124 + }, + { + "epoch": 0.9, + "grad_norm": 0.8430964156606374, + "learning_rate": 5.2033349592426335e-06, + "loss": 1.1773, + "step": 1125 + }, + { + "epoch": 0.9008, + "grad_norm": 1.0766306814995699, + "learning_rate": 5.121129773156663e-06, + "loss": 1.1936, + "step": 1126 + }, + { + "epoch": 0.9016, + "grad_norm": 0.8555670658867724, + "learning_rate": 5.039562062965508e-06, + "loss": 1.0445, + "step": 1127 + }, + { + "epoch": 0.9024, + "grad_norm": 0.7803847535248349, + "learning_rate": 4.95863237670956e-06, + "loss": 1.1091, + "step": 1128 + }, + { + "epoch": 0.9032, + "grad_norm": 0.8083442794644436, + "learning_rate": 4.87834125814235e-06, + "loss": 1.1041, + "step": 1129 + }, + { + "epoch": 0.904, + "grad_norm": 0.7675666250884017, + "learning_rate": 4.798689246727006e-06, + "loss": 0.9808, + "step": 1130 + }, + { + "epoch": 0.9048, + "grad_norm": 0.8455159865291129, + "learning_rate": 4.719676877632639e-06, + "loss": 1.0783, + "step": 1131 + }, + { + "epoch": 0.9056, + "grad_norm": 0.8961068833586694, + "learning_rate": 4.641304681730641e-06, + "loss": 1.0994, + "step": 1132 + }, + { + "epoch": 0.9064, + "grad_norm": 1.4506444620216155, + "learning_rate": 4.563573185591219e-06, + "loss": 1.1413, + "step": 1133 + }, + { + "epoch": 0.9072, + "grad_norm": 0.770061682050551, + "learning_rate": 4.486482911479839e-06, + "loss": 1.0944, + "step": 1134 + }, + { + "epoch": 0.908, + "grad_norm": 0.7891790037353675, + "learning_rate": 4.4100343773536225e-06, + "loss": 1.0574, + "step": 1135 + }, + { + "epoch": 0.9088, + "grad_norm": 0.9119231007900593, + "learning_rate": 4.3342280968580285e-06, + "loss": 1.0913, + "step": 1136 + }, + { + "epoch": 0.9096, + "grad_norm": 0.7950096502373967, + "learning_rate": 4.259064579323302e-06, + "loss": 1.103, + "step": 1137 + }, + { + "epoch": 0.9104, + "grad_norm": 0.8791042132117247, + "learning_rate": 4.184544329761009e-06, + "loss": 1.1572, + "step": 1138 + }, + { + "epoch": 0.9112, + "grad_norm": 0.7628036129054035, + "learning_rate": 4.1106678488607495e-06, + "loss": 1.0829, + "step": 1139 + }, + { + "epoch": 0.912, + "grad_norm": 0.8791188158176501, + "learning_rate": 4.037435632986786e-06, + "loss": 1.0875, + "step": 1140 + }, + { + "epoch": 0.9128, + "grad_norm": 0.7523148247789707, + "learning_rate": 3.964848174174541e-06, + "loss": 1.044, + "step": 1141 + }, + { + "epoch": 0.9136, + "grad_norm": 0.8113682212901249, + "learning_rate": 3.892905960127546e-06, + "loss": 0.9732, + "step": 1142 + }, + { + "epoch": 0.9144, + "grad_norm": 0.8147464631743097, + "learning_rate": 3.821609474213983e-06, + "loss": 1.1032, + "step": 1143 + }, + { + "epoch": 0.9152, + "grad_norm": 0.886241187198881, + "learning_rate": 3.750959195463466e-06, + "loss": 1.1017, + "step": 1144 + }, + { + "epoch": 0.916, + "grad_norm": 0.8386506464729159, + "learning_rate": 3.6809555985639068e-06, + "loss": 1.0944, + "step": 1145 + }, + { + "epoch": 0.9168, + "grad_norm": 0.7268472637888334, + "learning_rate": 3.611599153858214e-06, + "loss": 0.9468, + "step": 1146 + }, + { + "epoch": 0.9176, + "grad_norm": 0.9525044757276668, + "learning_rate": 3.5428903273411863e-06, + "loss": 1.0831, + "step": 1147 + }, + { + "epoch": 0.9184, + "grad_norm": 0.8053724140197336, + "learning_rate": 3.4748295806564356e-06, + "loss": 1.0737, + "step": 1148 + }, + { + "epoch": 0.9192, + "grad_norm": 0.8516042807221436, + "learning_rate": 3.40741737109318e-06, + "loss": 1.1573, + "step": 1149 + }, + { + "epoch": 0.92, + "grad_norm": 0.852134568484373, + "learning_rate": 3.3406541515832003e-06, + "loss": 1.0343, + "step": 1150 + }, + { + "epoch": 0.9208, + "grad_norm": 0.6990479370212819, + "learning_rate": 3.2745403706978872e-06, + "loss": 0.8985, + "step": 1151 + }, + { + "epoch": 0.9216, + "grad_norm": 0.7983754867840562, + "learning_rate": 3.209076472645112e-06, + "loss": 1.0917, + "step": 1152 + }, + { + "epoch": 0.9224, + "grad_norm": 0.9274473548553837, + "learning_rate": 3.1442628972662704e-06, + "loss": 1.0352, + "step": 1153 + }, + { + "epoch": 0.9232, + "grad_norm": 0.8188215523931848, + "learning_rate": 3.0801000800333877e-06, + "loss": 1.0683, + "step": 1154 + }, + { + "epoch": 0.924, + "grad_norm": 0.8538581769741597, + "learning_rate": 3.0165884520461316e-06, + "loss": 1.0381, + "step": 1155 + }, + { + "epoch": 0.9248, + "grad_norm": 1.0069134002810485, + "learning_rate": 2.9537284400289355e-06, + "loss": 1.079, + "step": 1156 + }, + { + "epoch": 0.9256, + "grad_norm": 0.8075925733220816, + "learning_rate": 2.8915204663281013e-06, + "loss": 1.0509, + "step": 1157 + }, + { + "epoch": 0.9264, + "grad_norm": 0.921754289470793, + "learning_rate": 2.8299649489090475e-06, + "loss": 1.1385, + "step": 1158 + }, + { + "epoch": 0.9272, + "grad_norm": 0.9552746253102438, + "learning_rate": 2.7690623013533976e-06, + "loss": 1.11, + "step": 1159 + }, + { + "epoch": 0.928, + "grad_norm": 0.8481665514412091, + "learning_rate": 2.708812932856253e-06, + "loss": 1.0993, + "step": 1160 + }, + { + "epoch": 0.9288, + "grad_norm": 0.8850663754196179, + "learning_rate": 2.649217248223468e-06, + "loss": 1.0511, + "step": 1161 + }, + { + "epoch": 0.9296, + "grad_norm": 0.8423309250860148, + "learning_rate": 2.590275647868867e-06, + "loss": 1.0887, + "step": 1162 + }, + { + "epoch": 0.9304, + "grad_norm": 0.9086622994893373, + "learning_rate": 2.5319885278115906e-06, + "loss": 1.0991, + "step": 1163 + }, + { + "epoch": 0.9312, + "grad_norm": 0.7743885704537332, + "learning_rate": 2.4743562796734622e-06, + "loss": 1.0021, + "step": 1164 + }, + { + "epoch": 0.932, + "grad_norm": 0.826099928526726, + "learning_rate": 2.4173792906762804e-06, + "loss": 1.117, + "step": 1165 + }, + { + "epoch": 0.9328, + "grad_norm": 0.8561322325727061, + "learning_rate": 2.3610579436393e-06, + "loss": 1.0156, + "step": 1166 + }, + { + "epoch": 0.9336, + "grad_norm": 0.8185299016872479, + "learning_rate": 2.3053926169765984e-06, + "loss": 1.0752, + "step": 1167 + }, + { + "epoch": 0.9344, + "grad_norm": 0.7788460498099669, + "learning_rate": 2.250383684694579e-06, + "loss": 1.0119, + "step": 1168 + }, + { + "epoch": 0.9352, + "grad_norm": 0.8686029111410445, + "learning_rate": 2.1960315163894075e-06, + "loss": 1.0305, + "step": 1169 + }, + { + "epoch": 0.936, + "grad_norm": 0.8330074771412815, + "learning_rate": 2.1423364772445887e-06, + "loss": 0.9842, + "step": 1170 + }, + { + "epoch": 0.9368, + "grad_norm": 0.7930170644993012, + "learning_rate": 2.0892989280284823e-06, + "loss": 1.0737, + "step": 1171 + }, + { + "epoch": 0.9376, + "grad_norm": 0.8665021364309282, + "learning_rate": 2.036919225091827e-06, + "loss": 1.1391, + "step": 1172 + }, + { + "epoch": 0.9384, + "grad_norm": 1.0265081407582635, + "learning_rate": 1.9851977203654835e-06, + "loss": 1.1065, + "step": 1173 + }, + { + "epoch": 0.9392, + "grad_norm": 0.8324763969279934, + "learning_rate": 1.9341347613579087e-06, + "loss": 1.1389, + "step": 1174 + }, + { + "epoch": 0.94, + "grad_norm": 0.7998666682996705, + "learning_rate": 1.8837306911529184e-06, + "loss": 1.0472, + "step": 1175 + }, + { + "epoch": 0.9408, + "grad_norm": 0.8810888210880106, + "learning_rate": 1.8339858484073935e-06, + "loss": 1.0541, + "step": 1176 + }, + { + "epoch": 0.9416, + "grad_norm": 0.9403217037232425, + "learning_rate": 1.7849005673489127e-06, + "loss": 1.0467, + "step": 1177 + }, + { + "epoch": 0.9424, + "grad_norm": 0.8300390467407558, + "learning_rate": 1.7364751777736332e-06, + "loss": 1.0533, + "step": 1178 + }, + { + "epoch": 0.9432, + "grad_norm": 0.9254647235536025, + "learning_rate": 1.6887100050439587e-06, + "loss": 1.0816, + "step": 1179 + }, + { + "epoch": 0.944, + "grad_norm": 0.7778474839752096, + "learning_rate": 1.6416053700863964e-06, + "loss": 1.0773, + "step": 1180 + }, + { + "epoch": 0.9448, + "grad_norm": 0.8003783399982549, + "learning_rate": 1.595161589389449e-06, + "loss": 1.007, + "step": 1181 + }, + { + "epoch": 0.9456, + "grad_norm": 0.8812466491710779, + "learning_rate": 1.5493789750014031e-06, + "loss": 1.16, + "step": 1182 + }, + { + "epoch": 0.9464, + "grad_norm": 0.8326312999003161, + "learning_rate": 1.5042578345283108e-06, + "loss": 1.0965, + "step": 1183 + }, + { + "epoch": 0.9472, + "grad_norm": 0.7563118996771208, + "learning_rate": 1.459798471131868e-06, + "loss": 1.0676, + "step": 1184 + }, + { + "epoch": 0.948, + "grad_norm": 1.036415448691471, + "learning_rate": 1.4160011835273934e-06, + "loss": 1.0978, + "step": 1185 + }, + { + "epoch": 0.9488, + "grad_norm": 0.8486359609408237, + "learning_rate": 1.3728662659818204e-06, + "loss": 1.0701, + "step": 1186 + }, + { + "epoch": 0.9496, + "grad_norm": 0.7640372589074247, + "learning_rate": 1.3303940083117527e-06, + "loss": 1.0699, + "step": 1187 + }, + { + "epoch": 0.9504, + "grad_norm": 0.8004358391415526, + "learning_rate": 1.2885846958814673e-06, + "loss": 0.9778, + "step": 1188 + }, + { + "epoch": 0.9512, + "grad_norm": 0.8674774175052101, + "learning_rate": 1.2474386096010039e-06, + "loss": 1.0817, + "step": 1189 + }, + { + "epoch": 0.952, + "grad_norm": 0.7729028526574891, + "learning_rate": 1.2069560259243328e-06, + "loss": 0.9437, + "step": 1190 + }, + { + "epoch": 0.9528, + "grad_norm": 0.9783238007096406, + "learning_rate": 1.1671372168474138e-06, + "loss": 1.1378, + "step": 1191 + }, + { + "epoch": 0.9536, + "grad_norm": 0.8522896936237008, + "learning_rate": 1.1279824499064396e-06, + "loss": 1.058, + "step": 1192 + }, + { + "epoch": 0.9544, + "grad_norm": 0.9505815733076248, + "learning_rate": 1.089491988176017e-06, + "loss": 1.093, + "step": 1193 + }, + { + "epoch": 0.9552, + "grad_norm": 0.8219084334556093, + "learning_rate": 1.0516660902673448e-06, + "loss": 1.1247, + "step": 1194 + }, + { + "epoch": 0.956, + "grad_norm": 0.7582367787081001, + "learning_rate": 1.014505010326583e-06, + "loss": 1.0931, + "step": 1195 + }, + { + "epoch": 0.9568, + "grad_norm": 0.8275786084042629, + "learning_rate": 9.780089980330642e-07, + "loss": 1.0781, + "step": 1196 + }, + { + "epoch": 0.9576, + "grad_norm": 0.7825227563832609, + "learning_rate": 9.421782985976068e-07, + "loss": 1.1064, + "step": 1197 + }, + { + "epoch": 0.9584, + "grad_norm": 0.9396717094080378, + "learning_rate": 9.070131527609604e-07, + "loss": 1.1058, + "step": 1198 + }, + { + "epoch": 0.9592, + "grad_norm": 0.8190377523928112, + "learning_rate": 8.725137967920738e-07, + "loss": 1.1167, + "step": 1199 + }, + { + "epoch": 0.96, + "grad_norm": 0.874112847760409, + "learning_rate": 8.386804624865851e-07, + "loss": 1.0229, + "step": 1200 + }, + { + "epoch": 0.9608, + "grad_norm": 0.8033519658511188, + "learning_rate": 8.055133771652345e-07, + "loss": 1.0412, + "step": 1201 + }, + { + "epoch": 0.9616, + "grad_norm": 0.8697321168989555, + "learning_rate": 7.730127636723539e-07, + "loss": 1.0862, + "step": 1202 + }, + { + "epoch": 0.9624, + "grad_norm": 0.8482848418241187, + "learning_rate": 7.411788403743237e-07, + "loss": 1.0522, + "step": 1203 + }, + { + "epoch": 0.9632, + "grad_norm": 0.8067862128846277, + "learning_rate": 7.100118211581852e-07, + "loss": 1.0957, + "step": 1204 + }, + { + "epoch": 0.964, + "grad_norm": 0.7349051364302464, + "learning_rate": 6.7951191543012e-07, + "loss": 1.008, + "step": 1205 + }, + { + "epoch": 0.9648, + "grad_norm": 0.8062767237675197, + "learning_rate": 6.496793281141056e-07, + "loss": 1.0228, + "step": 1206 + }, + { + "epoch": 0.9656, + "grad_norm": 0.8043382593470939, + "learning_rate": 6.205142596505176e-07, + "loss": 1.0865, + "step": 1207 + }, + { + "epoch": 0.9664, + "grad_norm": 0.777121153566047, + "learning_rate": 5.920169059947411e-07, + "loss": 1.0193, + "step": 1208 + }, + { + "epoch": 0.9672, + "grad_norm": 0.7720730274857394, + "learning_rate": 5.64187458615939e-07, + "loss": 0.9782, + "step": 1209 + }, + { + "epoch": 0.968, + "grad_norm": 0.8932139873483963, + "learning_rate": 5.370261044956971e-07, + "loss": 1.1671, + "step": 1210 + }, + { + "epoch": 0.9688, + "grad_norm": 0.8094519972185115, + "learning_rate": 5.105330261267916e-07, + "loss": 0.9897, + "step": 1211 + }, + { + "epoch": 0.9696, + "grad_norm": 0.8207848408700373, + "learning_rate": 4.847084015119574e-07, + "loss": 1.1209, + "step": 1212 + }, + { + "epoch": 0.9704, + "grad_norm": 0.9064750065862063, + "learning_rate": 4.5955240416271084e-07, + "loss": 1.0058, + "step": 1213 + }, + { + "epoch": 0.9712, + "grad_norm": 0.8136539983287625, + "learning_rate": 4.3506520309813947e-07, + "loss": 1.0715, + "step": 1214 + }, + { + "epoch": 0.972, + "grad_norm": 0.8037367517607537, + "learning_rate": 4.112469628438365e-07, + "loss": 1.066, + "step": 1215 + }, + { + "epoch": 0.9728, + "grad_norm": 0.8251132367630437, + "learning_rate": 3.8809784343072366e-07, + "loss": 0.9979, + "step": 1216 + }, + { + "epoch": 0.9736, + "grad_norm": 0.8110527503825463, + "learning_rate": 3.6561800039403016e-07, + "loss": 1.1374, + "step": 1217 + }, + { + "epoch": 0.9744, + "grad_norm": 0.8824191340178144, + "learning_rate": 3.4380758477219333e-07, + "loss": 1.084, + "step": 1218 + }, + { + "epoch": 0.9752, + "grad_norm": 0.9784790378200482, + "learning_rate": 3.2266674310589273e-07, + "loss": 1.112, + "step": 1219 + }, + { + "epoch": 0.976, + "grad_norm": 0.8108840654344787, + "learning_rate": 3.0219561743707326e-07, + "loss": 1.0889, + "step": 1220 + }, + { + "epoch": 0.9768, + "grad_norm": 0.8600919844190589, + "learning_rate": 2.8239434530792365e-07, + "loss": 1.2295, + "step": 1221 + }, + { + "epoch": 0.9776, + "grad_norm": 0.9574092537518714, + "learning_rate": 2.6326305976001055e-07, + "loss": 1.1282, + "step": 1222 + }, + { + "epoch": 0.9784, + "grad_norm": 0.8059799363006932, + "learning_rate": 2.448018893333681e-07, + "loss": 1.088, + "step": 1223 + }, + { + "epoch": 0.9792, + "grad_norm": 0.8699350493645416, + "learning_rate": 2.2701095806565432e-07, + "loss": 1.0753, + "step": 1224 + }, + { + "epoch": 0.98, + "grad_norm": 0.8706707491571822, + "learning_rate": 2.098903854912515e-07, + "loss": 1.1125, + "step": 1225 + }, + { + "epoch": 0.9808, + "grad_norm": 0.9403287413231343, + "learning_rate": 1.9344028664056713e-07, + "loss": 1.0149, + "step": 1226 + }, + { + "epoch": 0.9816, + "grad_norm": 0.7965770509441772, + "learning_rate": 1.7766077203915655e-07, + "loss": 1.0018, + "step": 1227 + }, + { + "epoch": 0.9824, + "grad_norm": 0.7904600168542316, + "learning_rate": 1.6255194770704586e-07, + "loss": 1.0288, + "step": 1228 + }, + { + "epoch": 0.9832, + "grad_norm": 0.9023844501222255, + "learning_rate": 1.481139151579991e-07, + "loss": 1.1748, + "step": 1229 + }, + { + "epoch": 0.984, + "grad_norm": 0.7628378514295081, + "learning_rate": 1.3434677139885222e-07, + "loss": 1.0356, + "step": 1230 + }, + { + "epoch": 0.9848, + "grad_norm": 0.8420363563080878, + "learning_rate": 1.2125060892881346e-07, + "loss": 1.082, + "step": 1231 + }, + { + "epoch": 0.9856, + "grad_norm": 0.7698524661911217, + "learning_rate": 1.0882551573891953e-07, + "loss": 0.9641, + "step": 1232 + }, + { + "epoch": 0.9864, + "grad_norm": 0.7667599742364795, + "learning_rate": 9.707157531134713e-08, + "loss": 1.0008, + "step": 1233 + }, + { + "epoch": 0.9872, + "grad_norm": 0.8424245996095795, + "learning_rate": 8.598886661895788e-08, + "loss": 1.0517, + "step": 1234 + }, + { + "epoch": 0.988, + "grad_norm": 0.8510353343099226, + "learning_rate": 7.557746412468758e-08, + "loss": 1.0016, + "step": 1235 + }, + { + "epoch": 0.9888, + "grad_norm": 0.7616975654904604, + "learning_rate": 6.583743778106887e-08, + "loss": 0.9691, + "step": 1236 + }, + { + "epoch": 0.9896, + "grad_norm": 0.8107876525232283, + "learning_rate": 5.6768853029787184e-08, + "loss": 1.026, + "step": 1237 + }, + { + "epoch": 0.9904, + "grad_norm": 0.7369625978002111, + "learning_rate": 4.837177080119215e-08, + "loss": 0.9834, + "step": 1238 + }, + { + "epoch": 0.9912, + "grad_norm": 0.8910867360427009, + "learning_rate": 4.064624751394242e-08, + "loss": 1.0972, + "step": 1239 + }, + { + "epoch": 0.992, + "grad_norm": 0.748674014255129, + "learning_rate": 3.359233507459481e-08, + "loss": 1.0393, + "step": 1240 + }, + { + "epoch": 0.9928, + "grad_norm": 0.8615963370022487, + "learning_rate": 2.7210080877237976e-08, + "loss": 1.094, + "step": 1241 + }, + { + "epoch": 0.9936, + "grad_norm": 0.875903473799594, + "learning_rate": 2.1499527803214846e-08, + "loss": 0.985, + "step": 1242 + }, + { + "epoch": 0.9944, + "grad_norm": 0.8894533001501406, + "learning_rate": 1.646071422083395e-08, + "loss": 1.056, + "step": 1243 + }, + { + "epoch": 0.9952, + "grad_norm": 0.8871935662849862, + "learning_rate": 1.209367398504746e-08, + "loss": 1.0659, + "step": 1244 + }, + { + "epoch": 0.996, + "grad_norm": 0.8146409426801687, + "learning_rate": 8.398436437317969e-09, + "loss": 1.1013, + "step": 1245 + }, + { + "epoch": 0.9968, + "grad_norm": 1.0705442277942085, + "learning_rate": 5.375026405352035e-09, + "loss": 1.0065, + "step": 1246 + }, + { + "epoch": 0.9976, + "grad_norm": 0.8415335059819268, + "learning_rate": 3.023464202944748e-09, + "loss": 1.0892, + "step": 1247 + }, + { + "epoch": 0.9984, + "grad_norm": 0.8828933279031737, + "learning_rate": 1.3437656298687097e-09, + "loss": 1.0625, + "step": 1248 + }, + { + "epoch": 0.9992, + "grad_norm": 0.8413043992024973, + "learning_rate": 3.3594197175190745e-10, + "loss": 1.1092, + "step": 1249 + }, + { + "epoch": 1.0, + "grad_norm": 0.8051131433252608, + "learning_rate": 0.0, + "loss": 1.133, + "step": 1250 + }, + { + "epoch": 1.0, + "step": 1250, + "total_flos": 409299338477568.0, + "train_loss": 1.1655625699996948, + "train_runtime": 12525.0909, + "train_samples_per_second": 1.597, + "train_steps_per_second": 0.1 + } + ], + "logging_steps": 1.0, + "max_steps": 1250, + "num_input_tokens_seen": 0, + "num_train_epochs": 1, + "save_steps": 500, + "stateful_callbacks": { + "TrainerControl": { + "args": { + "should_epoch_stop": false, + "should_evaluate": false, + "should_log": false, + "should_save": true, + "should_training_stop": false + }, + "attributes": {} + } + }, + "total_flos": 409299338477568.0, + "train_batch_size": 8, + "trial_name": null, + "trial_params": null +} diff --git a/single_dataset/llama3_qa/VideoGameBunny_v1_1-Llama-3-8B-V-llama3_qa_dataset_2000_epochs_1_lora/README.md b/single_dataset/llama3_qa/VideoGameBunny_v1_1-Llama-3-8B-V-llama3_qa_dataset_2000_epochs_1_lora/README.md new file mode 100644 index 0000000000000000000000000000000000000000..4f291d8d6fdb82fcff274f83bcc52b834347797c --- /dev/null +++ b/single_dataset/llama3_qa/VideoGameBunny_v1_1-Llama-3-8B-V-llama3_qa_dataset_2000_epochs_1_lora/README.md @@ -0,0 +1,202 @@ +--- +library_name: peft +base_model: ./weights/Bunny-v1_1-Llama-3-8B-V +--- + +# Model Card for Model ID + + + + + +## Model Details + +### Model Description + + + + + +- **Developed by:** [More Information Needed] +- **Funded by [optional]:** [More Information Needed] +- **Shared by [optional]:** [More Information Needed] +- **Model type:** [More Information Needed] +- **Language(s) (NLP):** [More Information Needed] +- **License:** [More Information Needed] +- **Finetuned from model [optional]:** [More Information Needed] + +### Model Sources [optional] + + + +- **Repository:** [More Information Needed] +- **Paper [optional]:** [More Information Needed] +- **Demo [optional]:** [More Information Needed] + +## Uses + + + +### Direct Use + + + +[More Information Needed] + +### Downstream Use [optional] + + + +[More Information Needed] + +### Out-of-Scope Use + + + +[More Information Needed] + +## Bias, Risks, and Limitations + + + +[More Information Needed] + +### Recommendations + + + +Users (both direct and downstream) should be made aware of the risks, biases and limitations of the model. More information needed for further recommendations. + +## How to Get Started with the Model + +Use the code below to get started with the model. + +[More Information Needed] + +## Training Details + +### Training Data + + + +[More Information Needed] + +### Training Procedure + + + +#### Preprocessing [optional] + +[More Information Needed] + + +#### Training Hyperparameters + +- **Training regime:** [More Information Needed] + +#### Speeds, Sizes, Times [optional] + + + +[More Information Needed] + +## Evaluation + + + +### Testing Data, Factors & Metrics + +#### Testing Data + + + +[More Information Needed] + +#### Factors + + + +[More Information Needed] + +#### Metrics + + + +[More Information Needed] + +### Results + +[More Information Needed] + +#### Summary + + + +## Model Examination [optional] + + + +[More Information Needed] + +## Environmental Impact + + + +Carbon emissions can be estimated using the [Machine Learning Impact calculator](https://mlco2.github.io/impact#compute) presented in [Lacoste et al. (2019)](https://arxiv.org/abs/1910.09700). + +- **Hardware Type:** [More Information Needed] +- **Hours used:** [More Information Needed] +- **Cloud Provider:** [More Information Needed] +- **Compute Region:** [More Information Needed] +- **Carbon Emitted:** [More Information Needed] + +## Technical Specifications [optional] + +### Model Architecture and Objective + +[More Information Needed] + +### Compute Infrastructure + +[More Information Needed] + +#### Hardware + +[More Information Needed] + +#### Software + +[More Information Needed] + +## Citation [optional] + + + +**BibTeX:** + +[More Information Needed] + +**APA:** + +[More Information Needed] + +## Glossary [optional] + + + +[More Information Needed] + +## More Information [optional] + +[More Information Needed] + +## Model Card Authors [optional] + +[More Information Needed] + +## Model Card Contact + +[More Information Needed] +### Framework versions + +- PEFT 0.11.1 \ No newline at end of file diff --git a/single_dataset/llama3_qa/VideoGameBunny_v1_1-Llama-3-8B-V-llama3_qa_dataset_2000_epochs_1_lora/adapter_config.json b/single_dataset/llama3_qa/VideoGameBunny_v1_1-Llama-3-8B-V-llama3_qa_dataset_2000_epochs_1_lora/adapter_config.json new file mode 100644 index 0000000000000000000000000000000000000000..4e91269c9eae3be89407a4883df0cbd727c017e3 --- /dev/null +++ b/single_dataset/llama3_qa/VideoGameBunny_v1_1-Llama-3-8B-V-llama3_qa_dataset_2000_epochs_1_lora/adapter_config.json @@ -0,0 +1,34 @@ +{ + "alpha_pattern": {}, + "auto_mapping": null, + "base_model_name_or_path": "./weights/Bunny-v1_1-Llama-3-8B-V", + "bias": "none", + "fan_in_fan_out": false, + "inference_mode": true, + "init_lora_weights": true, + "layer_replication": null, + "layers_pattern": null, + "layers_to_transform": null, + "loftq_config": {}, + "lora_alpha": 256, + "lora_dropout": 0.1, + "megatron_config": null, + "megatron_core": "megatron.core", + "modules_to_save": null, + "peft_type": "LORA", + "r": 128, + "rank_pattern": {}, + "revision": null, + "target_modules": [ + "down_proj", + "v_proj", + "up_proj", + "o_proj", + "gate_proj", + "k_proj", + "q_proj" + ], + "task_type": "CAUSAL_LM", + "use_dora": false, + "use_rslora": false +} \ No newline at end of file diff --git a/single_dataset/llama3_qa/VideoGameBunny_v1_1-Llama-3-8B-V-llama3_qa_dataset_2000_epochs_1_lora/adapter_model.safetensors b/single_dataset/llama3_qa/VideoGameBunny_v1_1-Llama-3-8B-V-llama3_qa_dataset_2000_epochs_1_lora/adapter_model.safetensors new file mode 100644 index 0000000000000000000000000000000000000000..85bb8abdb020ac999ec504b7bfe4db902f9d5c55 --- /dev/null +++ b/single_dataset/llama3_qa/VideoGameBunny_v1_1-Llama-3-8B-V-llama3_qa_dataset_2000_epochs_1_lora/adapter_model.safetensors @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:6bda2d6cde95036419c93a716829b0284b15f940254364ee7fd021c9b07db115 +size 671150064 diff --git a/single_dataset/llama3_qa/VideoGameBunny_v1_1-Llama-3-8B-V-llama3_qa_dataset_2000_epochs_1_lora/config.json b/single_dataset/llama3_qa/VideoGameBunny_v1_1-Llama-3-8B-V-llama3_qa_dataset_2000_epochs_1_lora/config.json new file mode 100644 index 0000000000000000000000000000000000000000..ee7ed86f4ff6f3381a6bb24981934be89aa376e7 --- /dev/null +++ b/single_dataset/llama3_qa/VideoGameBunny_v1_1-Llama-3-8B-V-llama3_qa_dataset_2000_epochs_1_lora/config.json @@ -0,0 +1,45 @@ +{ + "_name_or_path": "./weights/Bunny-v1_1-Llama-3-8B-V", + "architectures": [ + "BunnyLlamaForCausalLM" + ], + "attention_bias": false, + "attention_dropout": 0.0, + "auto_map": { + "AutoConfig": "configuration_bunny_llama.BunnyLlamaConfig", + "AutoModelForCausalLM": "modeling_bunny_llama.BunnyLlamaForCausalLM" + }, + "bos_token_id": 128000, + "continuous_training": false, + "eos_token_id": 128001, + "freeze_mm_mlp_adapter": false, + "hidden_act": "silu", + "hidden_size": 4096, + "image_aspect_ratio": "pad", + "initializer_range": 0.02, + "intermediate_size": 14336, + "max_position_embeddings": 8192, + "mm_hidden_size": 3456, + "mm_projector_lr": null, + "mm_projector_type": "mlp2x_gelu", + "mm_vision_tower": "./weights/siglip-so400m-patch14-384", + "model_type": "bunny-llama", + "num_attention_heads": 32, + "num_hidden_layers": 32, + "num_key_value_heads": 8, + "pretraining_tp": 1, + "rms_norm_eps": 1e-05, + "rope_scaling": null, + "rope_theta": 500000.0, + "tie_word_embeddings": false, + "tokenizer_model_max_length": 2048, + "tokenizer_padding_side": "right", + "torch_dtype": "float16", + "transformers_version": "4.41.2", + "tune_mm_mlp_adapter": false, + "unfreeze_vision_tower": true, + "use_cache": true, + "use_mm_proj": true, + "use_s2": true, + "vocab_size": 128256 +} diff --git a/single_dataset/llama3_qa/VideoGameBunny_v1_1-Llama-3-8B-V-llama3_qa_dataset_2000_epochs_1_lora/non_lora_trainables.bin b/single_dataset/llama3_qa/VideoGameBunny_v1_1-Llama-3-8B-V-llama3_qa_dataset_2000_epochs_1_lora/non_lora_trainables.bin new file mode 100644 index 0000000000000000000000000000000000000000..c9888b056674e28fe0152b1527b8d8da571fd4a8 --- /dev/null +++ b/single_dataset/llama3_qa/VideoGameBunny_v1_1-Llama-3-8B-V-llama3_qa_dataset_2000_epochs_1_lora/non_lora_trainables.bin @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:d195785765d811acd791e9e869eed8b7cf69e0452dfdef7c9103f061a654e123 +size 918507402 diff --git a/single_dataset/llama3_qa/VideoGameBunny_v1_1-Llama-3-8B-V-llama3_qa_dataset_2000_epochs_1_lora/trainer_state.json b/single_dataset/llama3_qa/VideoGameBunny_v1_1-Llama-3-8B-V-llama3_qa_dataset_2000_epochs_1_lora/trainer_state.json new file mode 100644 index 0000000000000000000000000000000000000000..2c3fbbc409ece9900a3cbf9952db36cebf3ee95a --- /dev/null +++ b/single_dataset/llama3_qa/VideoGameBunny_v1_1-Llama-3-8B-V-llama3_qa_dataset_2000_epochs_1_lora/trainer_state.json @@ -0,0 +1,917 @@ +{ + "best_metric": null, + "best_model_checkpoint": null, + "epoch": 1.0, + "eval_steps": 500, + "global_step": 125, + "is_hyper_param_search": false, + "is_local_process_zero": true, + "is_world_process_zero": true, + "log_history": [ + { + "epoch": 0.008, + "grad_norm": 2.4375962767612345, + "learning_rate": 5e-05, + "loss": 2.0055, + "step": 1 + }, + { + "epoch": 0.016, + "grad_norm": 1.8205678181133527, + "learning_rate": 0.0001, + "loss": 1.8155, + "step": 2 + }, + { + "epoch": 0.024, + "grad_norm": 1.1652809543443925, + "learning_rate": 0.00015000000000000001, + "loss": 1.4824, + "step": 3 + }, + { + "epoch": 0.032, + "grad_norm": 1.5201152697502844, + "learning_rate": 0.0002, + "loss": 1.5361, + "step": 4 + }, + { + "epoch": 0.04, + "grad_norm": 1.3090552478809887, + "learning_rate": 0.00019996629653035126, + "loss": 1.6664, + "step": 5 + }, + { + "epoch": 0.048, + "grad_norm": 1.031628287092597, + "learning_rate": 0.00019986520883988232, + "loss": 1.4614, + "step": 6 + }, + { + "epoch": 0.056, + "grad_norm": 1.0202753941947118, + "learning_rate": 0.00019969680506871137, + "loss": 1.5362, + "step": 7 + }, + { + "epoch": 0.064, + "grad_norm": 1.058916686800406, + "learning_rate": 0.00019946119873266613, + "loss": 1.4049, + "step": 8 + }, + { + "epoch": 0.072, + "grad_norm": 1.209947453785025, + "learning_rate": 0.00019915854864676664, + "loss": 1.4271, + "step": 9 + }, + { + "epoch": 0.08, + "grad_norm": 1.136029805718251, + "learning_rate": 0.00019878905881817252, + "loss": 1.397, + "step": 10 + }, + { + "epoch": 0.088, + "grad_norm": 1.1940378677759405, + "learning_rate": 0.00019835297830866826, + "loss": 1.4353, + "step": 11 + }, + { + "epoch": 0.096, + "grad_norm": 1.0763620564079805, + "learning_rate": 0.00019785060106677818, + "loss": 1.27, + "step": 12 + }, + { + "epoch": 0.104, + "grad_norm": 1.0186155436117859, + "learning_rate": 0.00019728226572962473, + "loss": 1.504, + "step": 13 + }, + { + "epoch": 0.112, + "grad_norm": 0.9058596122818576, + "learning_rate": 0.0001966483553946637, + "loss": 1.2254, + "step": 14 + }, + { + "epoch": 0.12, + "grad_norm": 0.8646082804367832, + "learning_rate": 0.00019594929736144976, + "loss": 1.2529, + "step": 15 + }, + { + "epoch": 0.128, + "grad_norm": 0.9007250526249097, + "learning_rate": 0.00019518556284360696, + "loss": 1.3052, + "step": 16 + }, + { + "epoch": 0.136, + "grad_norm": 1.0101800105447813, + "learning_rate": 0.0001943576666511982, + "loss": 1.41, + "step": 17 + }, + { + "epoch": 0.144, + "grad_norm": 0.9435744648080255, + "learning_rate": 0.0001934661668437073, + "loss": 1.3445, + "step": 18 + }, + { + "epoch": 0.152, + "grad_norm": 0.9424757697182853, + "learning_rate": 0.0001925116643538684, + "loss": 1.3374, + "step": 19 + }, + { + "epoch": 0.16, + "grad_norm": 0.9235794265909824, + "learning_rate": 0.00019149480258259533, + "loss": 1.35, + "step": 20 + }, + { + "epoch": 0.168, + "grad_norm": 0.9344657343675837, + "learning_rate": 0.00019041626696528503, + "loss": 1.3005, + "step": 21 + }, + { + "epoch": 0.176, + "grad_norm": 1.1400402352729915, + "learning_rate": 0.0001892767845097864, + "loss": 1.4263, + "step": 22 + }, + { + "epoch": 0.184, + "grad_norm": 1.0028077404329654, + "learning_rate": 0.00018807712330634642, + "loss": 1.3401, + "step": 23 + }, + { + "epoch": 0.192, + "grad_norm": 0.8953385872918737, + "learning_rate": 0.0001868180920098644, + "loss": 1.1645, + "step": 24 + }, + { + "epoch": 0.2, + "grad_norm": 0.975688053138917, + "learning_rate": 0.00018550053929480202, + "loss": 1.4462, + "step": 25 + }, + { + "epoch": 0.208, + "grad_norm": 0.9022176701016223, + "learning_rate": 0.00018412535328311814, + "loss": 1.3582, + "step": 26 + }, + { + "epoch": 0.216, + "grad_norm": 1.0273974913918964, + "learning_rate": 0.0001826934609456129, + "loss": 1.4179, + "step": 27 + }, + { + "epoch": 0.224, + "grad_norm": 1.0245918857785254, + "learning_rate": 0.00018120582747708502, + "loss": 1.3315, + "step": 28 + }, + { + "epoch": 0.232, + "grad_norm": 0.934195162405148, + "learning_rate": 0.0001796634556457236, + "loss": 1.3547, + "step": 29 + }, + { + "epoch": 0.24, + "grad_norm": 0.927115496292952, + "learning_rate": 0.0001780673851171728, + "loss": 1.3661, + "step": 30 + }, + { + "epoch": 0.248, + "grad_norm": 0.8748689090750555, + "learning_rate": 0.00017641869175372493, + "loss": 1.3008, + "step": 31 + }, + { + "epoch": 0.256, + "grad_norm": 0.962208141701793, + "learning_rate": 0.00017471848688911464, + "loss": 1.3561, + "step": 32 + }, + { + "epoch": 0.264, + "grad_norm": 1.0069280840139563, + "learning_rate": 0.000172967916579403, + "loss": 1.3655, + "step": 33 + }, + { + "epoch": 0.272, + "grad_norm": 0.9243569825094787, + "learning_rate": 0.00017116816083045602, + "loss": 1.3448, + "step": 34 + }, + { + "epoch": 0.28, + "grad_norm": 0.8951086932705498, + "learning_rate": 0.0001693204328025389, + "loss": 1.2588, + "step": 35 + }, + { + "epoch": 0.288, + "grad_norm": 0.8622664662409053, + "learning_rate": 0.00016742597799256182, + "loss": 1.3533, + "step": 36 + }, + { + "epoch": 0.296, + "grad_norm": 0.9293730578720981, + "learning_rate": 0.00016548607339452853, + "loss": 1.2414, + "step": 37 + }, + { + "epoch": 0.304, + "grad_norm": 1.0469127574163346, + "learning_rate": 0.00016350202663875386, + "loss": 1.5204, + "step": 38 + }, + { + "epoch": 0.312, + "grad_norm": 0.8820185423482838, + "learning_rate": 0.0001614751751104301, + "loss": 1.1172, + "step": 39 + }, + { + "epoch": 0.32, + "grad_norm": 0.9122615718472, + "learning_rate": 0.00015940688504813662, + "loss": 1.3977, + "step": 40 + }, + { + "epoch": 0.328, + "grad_norm": 0.8510033976335003, + "learning_rate": 0.00015729855062290022, + "loss": 1.3419, + "step": 41 + }, + { + "epoch": 0.336, + "grad_norm": 0.9821223857398642, + "learning_rate": 0.00015515159299842707, + "loss": 1.3981, + "step": 42 + }, + { + "epoch": 0.344, + "grad_norm": 0.9144643080831125, + "learning_rate": 0.00015296745937313987, + "loss": 1.4276, + "step": 43 + }, + { + "epoch": 0.352, + "grad_norm": 0.8941879417566212, + "learning_rate": 0.00015074762200466556, + "loss": 1.4579, + "step": 44 + }, + { + "epoch": 0.36, + "grad_norm": 0.9058529475179548, + "learning_rate": 0.00014849357721743168, + "loss": 1.3782, + "step": 45 + }, + { + "epoch": 0.368, + "grad_norm": 0.9272855454987454, + "learning_rate": 0.00014620684439403962, + "loss": 1.3791, + "step": 46 + }, + { + "epoch": 0.376, + "grad_norm": 0.8332099088043601, + "learning_rate": 0.0001438889649510956, + "loss": 1.3699, + "step": 47 + }, + { + "epoch": 0.384, + "grad_norm": 0.8364845255823447, + "learning_rate": 0.00014154150130018866, + "loss": 1.3083, + "step": 48 + }, + { + "epoch": 0.392, + "grad_norm": 0.9560879208789635, + "learning_rate": 0.00013916603579471705, + "loss": 1.3531, + "step": 49 + }, + { + "epoch": 0.4, + "grad_norm": 0.8415371480344829, + "learning_rate": 0.000136764169663272, + "loss": 1.3675, + "step": 50 + }, + { + "epoch": 0.408, + "grad_norm": 0.853424371015124, + "learning_rate": 0.00013433752193029886, + "loss": 1.3429, + "step": 51 + }, + { + "epoch": 0.416, + "grad_norm": 0.9625578075844118, + "learning_rate": 0.00013188772832476188, + "loss": 1.2927, + "step": 52 + }, + { + "epoch": 0.424, + "grad_norm": 0.8482472192606121, + "learning_rate": 0.00012941644017754964, + "loss": 1.2812, + "step": 53 + }, + { + "epoch": 0.432, + "grad_norm": 0.8476053067734847, + "learning_rate": 0.00012692532330836346, + "loss": 1.34, + "step": 54 + }, + { + "epoch": 0.44, + "grad_norm": 0.8527526959812046, + "learning_rate": 0.00012441605690283915, + "loss": 1.343, + "step": 55 + }, + { + "epoch": 0.448, + "grad_norm": 0.9141929930011206, + "learning_rate": 0.0001218903323806595, + "loss": 1.3156, + "step": 56 + }, + { + "epoch": 0.456, + "grad_norm": 0.9463779538926128, + "learning_rate": 0.00011934985225541998, + "loss": 1.3841, + "step": 57 + }, + { + "epoch": 0.464, + "grad_norm": 0.944185912368796, + "learning_rate": 0.00011679632898701649, + "loss": 1.3545, + "step": 58 + }, + { + "epoch": 0.472, + "grad_norm": 0.8369356187298539, + "learning_rate": 0.00011423148382732853, + "loss": 1.2654, + "step": 59 + }, + { + "epoch": 0.48, + "grad_norm": 0.9066908409453514, + "learning_rate": 0.00011165704565997593, + "loss": 1.3662, + "step": 60 + }, + { + "epoch": 0.488, + "grad_norm": 2.271347741753355, + "learning_rate": 0.00010907474983493144, + "loss": 1.2857, + "step": 61 + }, + { + "epoch": 0.496, + "grad_norm": 0.802300358511912, + "learning_rate": 0.0001064863369987743, + "loss": 1.249, + "step": 62 + }, + { + "epoch": 0.504, + "grad_norm": 0.8234655930884818, + "learning_rate": 0.00010389355192137377, + "loss": 1.229, + "step": 63 + }, + { + "epoch": 0.512, + "grad_norm": 1.0623250114905893, + "learning_rate": 0.0001012981423197931, + "loss": 1.3819, + "step": 64 + }, + { + "epoch": 0.52, + "grad_norm": 0.8416321789047925, + "learning_rate": 9.870185768020693e-05, + "loss": 1.3624, + "step": 65 + }, + { + "epoch": 0.528, + "grad_norm": 1.0433051505010305, + "learning_rate": 9.610644807862625e-05, + "loss": 1.438, + "step": 66 + }, + { + "epoch": 0.536, + "grad_norm": 0.8373819372038648, + "learning_rate": 9.35136630012257e-05, + "loss": 1.2292, + "step": 67 + }, + { + "epoch": 0.544, + "grad_norm": 0.8156628394687041, + "learning_rate": 9.092525016506858e-05, + "loss": 1.2318, + "step": 68 + }, + { + "epoch": 0.552, + "grad_norm": 1.0744275187211567, + "learning_rate": 8.83429543400241e-05, + "loss": 1.3872, + "step": 69 + }, + { + "epoch": 0.56, + "grad_norm": 0.8259842891973573, + "learning_rate": 8.57685161726715e-05, + "loss": 1.2973, + "step": 70 + }, + { + "epoch": 0.568, + "grad_norm": 0.8889081422789769, + "learning_rate": 8.320367101298351e-05, + "loss": 1.3242, + "step": 71 + }, + { + "epoch": 0.576, + "grad_norm": 0.8768836198798882, + "learning_rate": 8.065014774458003e-05, + "loss": 1.1688, + "step": 72 + }, + { + "epoch": 0.584, + "grad_norm": 0.8643228297558432, + "learning_rate": 7.810966761934053e-05, + "loss": 1.1996, + "step": 73 + }, + { + "epoch": 0.592, + "grad_norm": 0.870246487198974, + "learning_rate": 7.558394309716088e-05, + "loss": 1.405, + "step": 74 + }, + { + "epoch": 0.6, + "grad_norm": 0.9310147988449106, + "learning_rate": 7.307467669163655e-05, + "loss": 1.2846, + "step": 75 + }, + { + "epoch": 0.608, + "grad_norm": 0.8547840042469805, + "learning_rate": 7.058355982245037e-05, + "loss": 1.2263, + "step": 76 + }, + { + "epoch": 0.616, + "grad_norm": 0.7826078161467719, + "learning_rate": 6.811227167523815e-05, + "loss": 1.1951, + "step": 77 + }, + { + "epoch": 0.624, + "grad_norm": 0.9003174084433649, + "learning_rate": 6.566247806970119e-05, + "loss": 1.3574, + "step": 78 + }, + { + "epoch": 0.632, + "grad_norm": 0.7798062968098542, + "learning_rate": 6.323583033672799e-05, + "loss": 1.2386, + "step": 79 + }, + { + "epoch": 0.64, + "grad_norm": 0.9384176057411918, + "learning_rate": 6.083396420528298e-05, + "loss": 1.2428, + "step": 80 + }, + { + "epoch": 0.648, + "grad_norm": 0.8138110165553195, + "learning_rate": 5.845849869981137e-05, + "loss": 1.2728, + "step": 81 + }, + { + "epoch": 0.656, + "grad_norm": 0.7562590090488508, + "learning_rate": 5.611103504890444e-05, + "loss": 1.1451, + "step": 82 + }, + { + "epoch": 0.664, + "grad_norm": 0.7636838854961598, + "learning_rate": 5.379315560596038e-05, + "loss": 1.2535, + "step": 83 + }, + { + "epoch": 0.672, + "grad_norm": 0.7869922876906755, + "learning_rate": 5.1506422782568345e-05, + "loss": 1.2666, + "step": 84 + }, + { + "epoch": 0.68, + "grad_norm": 0.884478858985538, + "learning_rate": 4.9252377995334444e-05, + "loss": 1.2799, + "step": 85 + }, + { + "epoch": 0.688, + "grad_norm": 0.8109606603905511, + "learning_rate": 4.703254062686017e-05, + "loss": 1.2603, + "step": 86 + }, + { + "epoch": 0.696, + "grad_norm": 0.8313160829561579, + "learning_rate": 4.484840700157295e-05, + "loss": 1.3399, + "step": 87 + }, + { + "epoch": 0.704, + "grad_norm": 0.7924509618604803, + "learning_rate": 4.270144937709981e-05, + "loss": 1.2432, + "step": 88 + }, + { + "epoch": 0.712, + "grad_norm": 0.9108193579559817, + "learning_rate": 4.059311495186338e-05, + "loss": 1.2918, + "step": 89 + }, + { + "epoch": 0.72, + "grad_norm": 0.9276660976162258, + "learning_rate": 3.852482488956992e-05, + "loss": 1.3278, + "step": 90 + }, + { + "epoch": 0.728, + "grad_norm": 0.7361813611911072, + "learning_rate": 3.649797336124615e-05, + "loss": 1.3238, + "step": 91 + }, + { + "epoch": 0.736, + "grad_norm": 0.885325101404829, + "learning_rate": 3.45139266054715e-05, + "loss": 1.2137, + "step": 92 + }, + { + "epoch": 0.744, + "grad_norm": 0.8879103772088985, + "learning_rate": 3.257402200743821e-05, + "loss": 1.3551, + "step": 93 + }, + { + "epoch": 0.752, + "grad_norm": 0.9449620463795166, + "learning_rate": 3.0679567197461134e-05, + "loss": 1.3261, + "step": 94 + }, + { + "epoch": 0.76, + "grad_norm": 0.9964457474038876, + "learning_rate": 2.8831839169543996e-05, + "loss": 1.3729, + "step": 95 + }, + { + "epoch": 0.768, + "grad_norm": 0.9418918840580128, + "learning_rate": 2.7032083420597e-05, + "loss": 1.3541, + "step": 96 + }, + { + "epoch": 0.776, + "grad_norm": 0.9080986687610889, + "learning_rate": 2.528151311088537e-05, + "loss": 1.2559, + "step": 97 + }, + { + "epoch": 0.784, + "grad_norm": 0.851676767567751, + "learning_rate": 2.3581308246275103e-05, + "loss": 1.2089, + "step": 98 + }, + { + "epoch": 0.792, + "grad_norm": 0.7872410177082587, + "learning_rate": 2.1932614882827197e-05, + "loss": 1.145, + "step": 99 + }, + { + "epoch": 0.8, + "grad_norm": 0.9353780847877511, + "learning_rate": 2.03365443542764e-05, + "loss": 1.2498, + "step": 100 + }, + { + "epoch": 0.808, + "grad_norm": 0.8719766191624784, + "learning_rate": 1.879417252291502e-05, + "loss": 1.2702, + "step": 101 + }, + { + "epoch": 0.816, + "grad_norm": 0.7950396247078377, + "learning_rate": 1.730653905438714e-05, + "loss": 1.242, + "step": 102 + }, + { + "epoch": 0.824, + "grad_norm": 0.8416961370911875, + "learning_rate": 1.587464671688187e-05, + "loss": 1.3646, + "step": 103 + }, + { + "epoch": 0.832, + "grad_norm": 0.8572495478701838, + "learning_rate": 1.4499460705197998e-05, + "loss": 1.2794, + "step": 104 + }, + { + "epoch": 0.84, + "grad_norm": 0.8521127011891462, + "learning_rate": 1.3181907990135622e-05, + "loss": 1.2827, + "step": 105 + }, + { + "epoch": 0.848, + "grad_norm": 0.8252536924805987, + "learning_rate": 1.1922876693653585e-05, + "loss": 1.2652, + "step": 106 + }, + { + "epoch": 0.856, + "grad_norm": 0.9154015891631243, + "learning_rate": 1.0723215490213634e-05, + "loss": 1.2429, + "step": 107 + }, + { + "epoch": 0.864, + "grad_norm": 0.878794667923896, + "learning_rate": 9.583733034714981e-06, + "loss": 1.2176, + "step": 108 + }, + { + "epoch": 0.872, + "grad_norm": 0.875395034576249, + "learning_rate": 8.505197417404687e-06, + "loss": 1.3073, + "step": 109 + }, + { + "epoch": 0.88, + "grad_norm": 0.8745853976117788, + "learning_rate": 7.488335646131628e-06, + "loss": 1.2886, + "step": 110 + }, + { + "epoch": 0.888, + "grad_norm": 0.9830700358548075, + "learning_rate": 6.533833156292679e-06, + "loss": 1.28, + "step": 111 + }, + { + "epoch": 0.896, + "grad_norm": 0.8858764862186183, + "learning_rate": 5.6423333488018095e-06, + "loss": 1.2357, + "step": 112 + }, + { + "epoch": 0.904, + "grad_norm": 0.7998919971474687, + "learning_rate": 4.8144371563930476e-06, + "loss": 1.2607, + "step": 113 + }, + { + "epoch": 0.912, + "grad_norm": 0.7384683443866129, + "learning_rate": 4.050702638550275e-06, + "loss": 1.0985, + "step": 114 + }, + { + "epoch": 0.92, + "grad_norm": 0.811443115370641, + "learning_rate": 3.3516446053363015e-06, + "loss": 1.261, + "step": 115 + }, + { + "epoch": 0.928, + "grad_norm": 0.7461134302323397, + "learning_rate": 2.717734270375272e-06, + "loss": 1.2753, + "step": 116 + }, + { + "epoch": 0.936, + "grad_norm": 0.7927512730358989, + "learning_rate": 2.1493989332218468e-06, + "loss": 1.2605, + "step": 117 + }, + { + "epoch": 0.944, + "grad_norm": 0.9092973079007383, + "learning_rate": 1.6470216913317626e-06, + "loss": 1.2804, + "step": 118 + }, + { + "epoch": 0.952, + "grad_norm": 0.9920121022665794, + "learning_rate": 1.2109411818274852e-06, + "loss": 1.4275, + "step": 119 + }, + { + "epoch": 0.96, + "grad_norm": 0.7497931860541336, + "learning_rate": 8.41451353233369e-07, + "loss": 1.1458, + "step": 120 + }, + { + "epoch": 0.968, + "grad_norm": 0.8672561201546914, + "learning_rate": 5.388012673338661e-07, + "loss": 1.2127, + "step": 121 + }, + { + "epoch": 0.976, + "grad_norm": 0.8334654251902224, + "learning_rate": 3.0319493128866396e-07, + "loss": 1.2524, + "step": 122 + }, + { + "epoch": 0.984, + "grad_norm": 0.9220176752178163, + "learning_rate": 1.3479116011769767e-07, + "loss": 1.131, + "step": 123 + }, + { + "epoch": 0.992, + "grad_norm": 0.8476351152901493, + "learning_rate": 3.370346964876036e-08, + "loss": 1.2145, + "step": 124 + }, + { + "epoch": 1.0, + "grad_norm": 0.9123565175137601, + "learning_rate": 0.0, + "loss": 1.3582, + "step": 125 + }, + { + "epoch": 1.0, + "step": 125, + "total_flos": 40355963437056.0, + "train_loss": 1.325367919921875, + "train_runtime": 1253.4357, + "train_samples_per_second": 1.596, + "train_steps_per_second": 0.1 + } + ], + "logging_steps": 1.0, + "max_steps": 125, + "num_input_tokens_seen": 0, + "num_train_epochs": 1, + "save_steps": 500, + "stateful_callbacks": { + "TrainerControl": { + "args": { + "should_epoch_stop": false, + "should_evaluate": false, + "should_log": false, + "should_save": false, + "should_training_stop": false + }, + "attributes": {} + } + }, + "total_flos": 40355963437056.0, + "train_batch_size": 8, + "trial_name": null, + "trial_params": null +} diff --git a/single_dataset/llama3_qa/VideoGameBunny_v1_1-Llama-3-8B-V-llama3_qa_dataset_2000_epochs_2_lora/README.md b/single_dataset/llama3_qa/VideoGameBunny_v1_1-Llama-3-8B-V-llama3_qa_dataset_2000_epochs_2_lora/README.md new file mode 100644 index 0000000000000000000000000000000000000000..4f291d8d6fdb82fcff274f83bcc52b834347797c --- /dev/null +++ b/single_dataset/llama3_qa/VideoGameBunny_v1_1-Llama-3-8B-V-llama3_qa_dataset_2000_epochs_2_lora/README.md @@ -0,0 +1,202 @@ +--- +library_name: peft +base_model: ./weights/Bunny-v1_1-Llama-3-8B-V +--- + +# Model Card for Model ID + + + + + +## Model Details + +### Model Description + + + + + +- **Developed by:** [More Information Needed] +- **Funded by [optional]:** [More Information Needed] +- **Shared by [optional]:** [More Information Needed] +- **Model type:** [More Information Needed] +- **Language(s) (NLP):** [More Information Needed] +- **License:** [More Information Needed] +- **Finetuned from model [optional]:** [More Information Needed] + +### Model Sources [optional] + + + +- **Repository:** [More Information Needed] +- **Paper [optional]:** [More Information Needed] +- **Demo [optional]:** [More Information Needed] + +## Uses + + + +### Direct Use + + + +[More Information Needed] + +### Downstream Use [optional] + + + +[More Information Needed] + +### Out-of-Scope Use + + + +[More Information Needed] + +## Bias, Risks, and Limitations + + + +[More Information Needed] + +### Recommendations + + + +Users (both direct and downstream) should be made aware of the risks, biases and limitations of the model. More information needed for further recommendations. + +## How to Get Started with the Model + +Use the code below to get started with the model. + +[More Information Needed] + +## Training Details + +### Training Data + + + +[More Information Needed] + +### Training Procedure + + + +#### Preprocessing [optional] + +[More Information Needed] + + +#### Training Hyperparameters + +- **Training regime:** [More Information Needed] + +#### Speeds, Sizes, Times [optional] + + + +[More Information Needed] + +## Evaluation + + + +### Testing Data, Factors & Metrics + +#### Testing Data + + + +[More Information Needed] + +#### Factors + + + +[More Information Needed] + +#### Metrics + + + +[More Information Needed] + +### Results + +[More Information Needed] + +#### Summary + + + +## Model Examination [optional] + + + +[More Information Needed] + +## Environmental Impact + + + +Carbon emissions can be estimated using the [Machine Learning Impact calculator](https://mlco2.github.io/impact#compute) presented in [Lacoste et al. (2019)](https://arxiv.org/abs/1910.09700). + +- **Hardware Type:** [More Information Needed] +- **Hours used:** [More Information Needed] +- **Cloud Provider:** [More Information Needed] +- **Compute Region:** [More Information Needed] +- **Carbon Emitted:** [More Information Needed] + +## Technical Specifications [optional] + +### Model Architecture and Objective + +[More Information Needed] + +### Compute Infrastructure + +[More Information Needed] + +#### Hardware + +[More Information Needed] + +#### Software + +[More Information Needed] + +## Citation [optional] + + + +**BibTeX:** + +[More Information Needed] + +**APA:** + +[More Information Needed] + +## Glossary [optional] + + + +[More Information Needed] + +## More Information [optional] + +[More Information Needed] + +## Model Card Authors [optional] + +[More Information Needed] + +## Model Card Contact + +[More Information Needed] +### Framework versions + +- PEFT 0.11.1 \ No newline at end of file diff --git a/single_dataset/llama3_qa/VideoGameBunny_v1_1-Llama-3-8B-V-llama3_qa_dataset_2000_epochs_2_lora/adapter_config.json b/single_dataset/llama3_qa/VideoGameBunny_v1_1-Llama-3-8B-V-llama3_qa_dataset_2000_epochs_2_lora/adapter_config.json new file mode 100644 index 0000000000000000000000000000000000000000..9765a3e78557b6c79bb6f5bca0a034d0388a5ead --- /dev/null +++ b/single_dataset/llama3_qa/VideoGameBunny_v1_1-Llama-3-8B-V-llama3_qa_dataset_2000_epochs_2_lora/adapter_config.json @@ -0,0 +1,34 @@ +{ + "alpha_pattern": {}, + "auto_mapping": null, + "base_model_name_or_path": "./weights/Bunny-v1_1-Llama-3-8B-V", + "bias": "none", + "fan_in_fan_out": false, + "inference_mode": true, + "init_lora_weights": true, + "layer_replication": null, + "layers_pattern": null, + "layers_to_transform": null, + "loftq_config": {}, + "lora_alpha": 256, + "lora_dropout": 0.1, + "megatron_config": null, + "megatron_core": "megatron.core", + "modules_to_save": null, + "peft_type": "LORA", + "r": 128, + "rank_pattern": {}, + "revision": null, + "target_modules": [ + "gate_proj", + "k_proj", + "v_proj", + "q_proj", + "o_proj", + "down_proj", + "up_proj" + ], + "task_type": "CAUSAL_LM", + "use_dora": false, + "use_rslora": false +} \ No newline at end of file diff --git a/single_dataset/llama3_qa/VideoGameBunny_v1_1-Llama-3-8B-V-llama3_qa_dataset_2000_epochs_2_lora/adapter_model.safetensors b/single_dataset/llama3_qa/VideoGameBunny_v1_1-Llama-3-8B-V-llama3_qa_dataset_2000_epochs_2_lora/adapter_model.safetensors new file mode 100644 index 0000000000000000000000000000000000000000..076ae57dc89e00607e846bda943126160d5eac2d --- /dev/null +++ b/single_dataset/llama3_qa/VideoGameBunny_v1_1-Llama-3-8B-V-llama3_qa_dataset_2000_epochs_2_lora/adapter_model.safetensors @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:09e6207f9d9fb334a33ff935bf554ab20ceaa9634495a6f768138cc0247bc27b +size 671150064 diff --git a/single_dataset/llama3_qa/VideoGameBunny_v1_1-Llama-3-8B-V-llama3_qa_dataset_2000_epochs_2_lora/config.json b/single_dataset/llama3_qa/VideoGameBunny_v1_1-Llama-3-8B-V-llama3_qa_dataset_2000_epochs_2_lora/config.json new file mode 100644 index 0000000000000000000000000000000000000000..ee7ed86f4ff6f3381a6bb24981934be89aa376e7 --- /dev/null +++ b/single_dataset/llama3_qa/VideoGameBunny_v1_1-Llama-3-8B-V-llama3_qa_dataset_2000_epochs_2_lora/config.json @@ -0,0 +1,45 @@ +{ + "_name_or_path": "./weights/Bunny-v1_1-Llama-3-8B-V", + "architectures": [ + "BunnyLlamaForCausalLM" + ], + "attention_bias": false, + "attention_dropout": 0.0, + "auto_map": { + "AutoConfig": "configuration_bunny_llama.BunnyLlamaConfig", + "AutoModelForCausalLM": "modeling_bunny_llama.BunnyLlamaForCausalLM" + }, + "bos_token_id": 128000, + "continuous_training": false, + "eos_token_id": 128001, + "freeze_mm_mlp_adapter": false, + "hidden_act": "silu", + "hidden_size": 4096, + "image_aspect_ratio": "pad", + "initializer_range": 0.02, + "intermediate_size": 14336, + "max_position_embeddings": 8192, + "mm_hidden_size": 3456, + "mm_projector_lr": null, + "mm_projector_type": "mlp2x_gelu", + "mm_vision_tower": "./weights/siglip-so400m-patch14-384", + "model_type": "bunny-llama", + "num_attention_heads": 32, + "num_hidden_layers": 32, + "num_key_value_heads": 8, + "pretraining_tp": 1, + "rms_norm_eps": 1e-05, + "rope_scaling": null, + "rope_theta": 500000.0, + "tie_word_embeddings": false, + "tokenizer_model_max_length": 2048, + "tokenizer_padding_side": "right", + "torch_dtype": "float16", + "transformers_version": "4.41.2", + "tune_mm_mlp_adapter": false, + "unfreeze_vision_tower": true, + "use_cache": true, + "use_mm_proj": true, + "use_s2": true, + "vocab_size": 128256 +} diff --git a/single_dataset/llama3_qa/VideoGameBunny_v1_1-Llama-3-8B-V-llama3_qa_dataset_2000_epochs_2_lora/non_lora_trainables.bin b/single_dataset/llama3_qa/VideoGameBunny_v1_1-Llama-3-8B-V-llama3_qa_dataset_2000_epochs_2_lora/non_lora_trainables.bin new file mode 100644 index 0000000000000000000000000000000000000000..b64710ee6ba62807d72285bf354c2994b5d35269 --- /dev/null +++ b/single_dataset/llama3_qa/VideoGameBunny_v1_1-Llama-3-8B-V-llama3_qa_dataset_2000_epochs_2_lora/non_lora_trainables.bin @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:b00d344ddcecd1782f30de60de7522e44338bd9bd5fd68b665c5e9a298d3e42f +size 918507402 diff --git a/single_dataset/llama3_qa/VideoGameBunny_v1_1-Llama-3-8B-V-llama3_qa_dataset_2000_epochs_2_lora/trainer_state.json b/single_dataset/llama3_qa/VideoGameBunny_v1_1-Llama-3-8B-V-llama3_qa_dataset_2000_epochs_2_lora/trainer_state.json new file mode 100644 index 0000000000000000000000000000000000000000..1346602eead4ea9ebf5dd9db29201bdf05c576e2 --- /dev/null +++ b/single_dataset/llama3_qa/VideoGameBunny_v1_1-Llama-3-8B-V-llama3_qa_dataset_2000_epochs_2_lora/trainer_state.json @@ -0,0 +1,1792 @@ +{ + "best_metric": null, + "best_model_checkpoint": null, + "epoch": 2.0, + "eval_steps": 500, + "global_step": 250, + "is_hyper_param_search": false, + "is_local_process_zero": true, + "is_world_process_zero": true, + "log_history": [ + { + "epoch": 0.008, + "grad_norm": 2.454786897231832, + "learning_rate": 2.5e-05, + "loss": 2.0055, + "step": 1 + }, + { + "epoch": 0.016, + "grad_norm": 1.842436047635649, + "learning_rate": 5e-05, + "loss": 1.8155, + "step": 2 + }, + { + "epoch": 0.024, + "grad_norm": 1.4608343313778398, + "learning_rate": 7.500000000000001e-05, + "loss": 1.5465, + "step": 3 + }, + { + "epoch": 0.032, + "grad_norm": 1.04404253663071, + "learning_rate": 0.0001, + "loss": 1.4925, + "step": 4 + }, + { + "epoch": 0.04, + "grad_norm": 1.4833433883239422, + "learning_rate": 0.000125, + "loss": 1.6817, + "step": 5 + }, + { + "epoch": 0.048, + "grad_norm": 1.3463159602018167, + "learning_rate": 0.00015000000000000001, + "loss": 1.5435, + "step": 6 + }, + { + "epoch": 0.056, + "grad_norm": 1.2565394186018986, + "learning_rate": 0.000175, + "loss": 1.5802, + "step": 7 + }, + { + "epoch": 0.064, + "grad_norm": 1.0992906450167048, + "learning_rate": 0.0002, + "loss": 1.4388, + "step": 8 + }, + { + "epoch": 0.072, + "grad_norm": 0.9800827433283528, + "learning_rate": 0.0001999915737775817, + "loss": 1.4302, + "step": 9 + }, + { + "epoch": 0.08, + "grad_norm": 1.0377034629839987, + "learning_rate": 0.00019996629653035126, + "loss": 1.3866, + "step": 10 + }, + { + "epoch": 0.088, + "grad_norm": 1.1873953407603806, + "learning_rate": 0.00019992417251814282, + "loss": 1.4402, + "step": 11 + }, + { + "epoch": 0.096, + "grad_norm": 1.0901190708201218, + "learning_rate": 0.00019986520883988232, + "loss": 1.2677, + "step": 12 + }, + { + "epoch": 0.104, + "grad_norm": 1.0525552249283692, + "learning_rate": 0.0001997894154323911, + "loss": 1.4872, + "step": 13 + }, + { + "epoch": 0.112, + "grad_norm": 0.9983966962380565, + "learning_rate": 0.00019969680506871137, + "loss": 1.2229, + "step": 14 + }, + { + "epoch": 0.12, + "grad_norm": 0.9391956274259196, + "learning_rate": 0.0001995873933559535, + "loss": 1.2639, + "step": 15 + }, + { + "epoch": 0.128, + "grad_norm": 0.9298725207140135, + "learning_rate": 0.00019946119873266613, + "loss": 1.3107, + "step": 16 + }, + { + "epoch": 0.136, + "grad_norm": 1.0109608082604975, + "learning_rate": 0.0001993182424657285, + "loss": 1.4046, + "step": 17 + }, + { + "epoch": 0.144, + "grad_norm": 0.9783080720697636, + "learning_rate": 0.00019915854864676664, + "loss": 1.3579, + "step": 18 + }, + { + "epoch": 0.152, + "grad_norm": 0.9069991356656613, + "learning_rate": 0.0001989821441880933, + "loss": 1.3355, + "step": 19 + }, + { + "epoch": 0.16, + "grad_norm": 0.9895451465879731, + "learning_rate": 0.00019878905881817252, + "loss": 1.3439, + "step": 20 + }, + { + "epoch": 0.168, + "grad_norm": 0.986267073744202, + "learning_rate": 0.0001985793250766098, + "loss": 1.3042, + "step": 21 + }, + { + "epoch": 0.176, + "grad_norm": 1.1534014257751493, + "learning_rate": 0.00019835297830866826, + "loss": 1.4238, + "step": 22 + }, + { + "epoch": 0.184, + "grad_norm": 1.0255571791577869, + "learning_rate": 0.00019811005665931205, + "loss": 1.3415, + "step": 23 + }, + { + "epoch": 0.192, + "grad_norm": 0.9274064718692961, + "learning_rate": 0.00019785060106677818, + "loss": 1.1667, + "step": 24 + }, + { + "epoch": 0.2, + "grad_norm": 1.0859074770833808, + "learning_rate": 0.0001975746552556772, + "loss": 1.4558, + "step": 25 + }, + { + "epoch": 0.208, + "grad_norm": 0.8987327600865704, + "learning_rate": 0.00019728226572962473, + "loss": 1.3593, + "step": 26 + }, + { + "epoch": 0.216, + "grad_norm": 0.9315367888961246, + "learning_rate": 0.0001969734817634044, + "loss": 1.424, + "step": 27 + }, + { + "epoch": 0.224, + "grad_norm": 1.0397623290053568, + "learning_rate": 0.0001966483553946637, + "loss": 1.3356, + "step": 28 + }, + { + "epoch": 0.232, + "grad_norm": 0.9239779767358856, + "learning_rate": 0.00019630694141514464, + "loss": 1.3457, + "step": 29 + }, + { + "epoch": 0.24, + "grad_norm": 0.9256155259019234, + "learning_rate": 0.00019594929736144976, + "loss": 1.3682, + "step": 30 + }, + { + "epoch": 0.248, + "grad_norm": 0.91453564542336, + "learning_rate": 0.0001955754835053459, + "loss": 1.2999, + "step": 31 + }, + { + "epoch": 0.256, + "grad_norm": 0.9322616402712055, + "learning_rate": 0.00019518556284360696, + "loss": 1.3528, + "step": 32 + }, + { + "epoch": 0.264, + "grad_norm": 1.0110738671839277, + "learning_rate": 0.0001947796010873974, + "loss": 1.3682, + "step": 33 + }, + { + "epoch": 0.272, + "grad_norm": 0.9209930367156439, + "learning_rate": 0.0001943576666511982, + "loss": 1.3418, + "step": 34 + }, + { + "epoch": 0.28, + "grad_norm": 0.9131101646709927, + "learning_rate": 0.0001939198306412775, + "loss": 1.2602, + "step": 35 + }, + { + "epoch": 0.288, + "grad_norm": 0.8886169513541825, + "learning_rate": 0.0001934661668437073, + "loss": 1.359, + "step": 36 + }, + { + "epoch": 0.296, + "grad_norm": 0.9280189508217098, + "learning_rate": 0.0001929967517119289, + "loss": 1.2432, + "step": 37 + }, + { + "epoch": 0.304, + "grad_norm": 1.1198070932319397, + "learning_rate": 0.0001925116643538684, + "loss": 1.5276, + "step": 38 + }, + { + "epoch": 0.312, + "grad_norm": 0.9003335984159968, + "learning_rate": 0.0001920109865186052, + "loss": 1.1269, + "step": 39 + }, + { + "epoch": 0.32, + "grad_norm": 0.9797642104699517, + "learning_rate": 0.00019149480258259533, + "loss": 1.4041, + "step": 40 + }, + { + "epoch": 0.328, + "grad_norm": 0.8417683479387569, + "learning_rate": 0.00019096319953545185, + "loss": 1.3465, + "step": 41 + }, + { + "epoch": 0.336, + "grad_norm": 0.9818092313272447, + "learning_rate": 0.00019041626696528503, + "loss": 1.4078, + "step": 42 + }, + { + "epoch": 0.344, + "grad_norm": 0.9083765370663439, + "learning_rate": 0.00018985409704360456, + "loss": 1.4321, + "step": 43 + }, + { + "epoch": 0.352, + "grad_norm": 0.8941496427189838, + "learning_rate": 0.0001892767845097864, + "loss": 1.4587, + "step": 44 + }, + { + "epoch": 0.36, + "grad_norm": 0.8839595066828793, + "learning_rate": 0.00018868442665510678, + "loss": 1.3793, + "step": 45 + }, + { + "epoch": 0.368, + "grad_norm": 0.9549893377990281, + "learning_rate": 0.00018807712330634642, + "loss": 1.391, + "step": 46 + }, + { + "epoch": 0.376, + "grad_norm": 0.882414820292664, + "learning_rate": 0.00018745497680896722, + "loss": 1.3803, + "step": 47 + }, + { + "epoch": 0.384, + "grad_norm": 0.887759800655994, + "learning_rate": 0.0001868180920098644, + "loss": 1.3198, + "step": 48 + }, + { + "epoch": 0.392, + "grad_norm": 0.9492260628073359, + "learning_rate": 0.0001861665762396974, + "loss": 1.3614, + "step": 49 + }, + { + "epoch": 0.4, + "grad_norm": 0.8762951830541706, + "learning_rate": 0.00018550053929480202, + "loss": 1.3855, + "step": 50 + }, + { + "epoch": 0.408, + "grad_norm": 0.8915934670890578, + "learning_rate": 0.00018482009341868697, + "loss": 1.3612, + "step": 51 + }, + { + "epoch": 0.416, + "grad_norm": 1.0115455917430354, + "learning_rate": 0.00018412535328311814, + "loss": 1.3162, + "step": 52 + }, + { + "epoch": 0.424, + "grad_norm": 0.8741912749537363, + "learning_rate": 0.00018341643596879367, + "loss": 1.2894, + "step": 53 + }, + { + "epoch": 0.432, + "grad_norm": 0.8441946856123005, + "learning_rate": 0.0001826934609456129, + "loss": 1.3515, + "step": 54 + }, + { + "epoch": 0.44, + "grad_norm": 0.8809778516688372, + "learning_rate": 0.00018195655005254273, + "loss": 1.3595, + "step": 55 + }, + { + "epoch": 0.448, + "grad_norm": 0.9433398546492309, + "learning_rate": 0.00018120582747708502, + "loss": 1.3354, + "step": 56 + }, + { + "epoch": 0.456, + "grad_norm": 0.964971265725767, + "learning_rate": 0.00018044141973434758, + "loss": 1.394, + "step": 57 + }, + { + "epoch": 0.464, + "grad_norm": 1.0418275518352318, + "learning_rate": 0.0001796634556457236, + "loss": 1.3663, + "step": 58 + }, + { + "epoch": 0.472, + "grad_norm": 0.8516899579807444, + "learning_rate": 0.00017887206631718203, + "loss": 1.2768, + "step": 59 + }, + { + "epoch": 0.48, + "grad_norm": 0.8962604414615124, + "learning_rate": 0.0001780673851171728, + "loss": 1.3755, + "step": 60 + }, + { + "epoch": 0.488, + "grad_norm": 1.0298256962484944, + "learning_rate": 0.00017724954765415137, + "loss": 1.3205, + "step": 61 + }, + { + "epoch": 0.496, + "grad_norm": 0.8289660154098288, + "learning_rate": 0.00017641869175372493, + "loss": 1.2592, + "step": 62 + }, + { + "epoch": 0.504, + "grad_norm": 0.9298213657742931, + "learning_rate": 0.00017557495743542585, + "loss": 1.2407, + "step": 63 + }, + { + "epoch": 0.512, + "grad_norm": 1.106967597246562, + "learning_rate": 0.00017471848688911464, + "loss": 1.4217, + "step": 64 + }, + { + "epoch": 0.52, + "grad_norm": 0.8814103122645993, + "learning_rate": 0.00017384942445101772, + "loss": 1.3808, + "step": 65 + }, + { + "epoch": 0.528, + "grad_norm": 1.1443583040123078, + "learning_rate": 0.000172967916579403, + "loss": 1.453, + "step": 66 + }, + { + "epoch": 0.536, + "grad_norm": 0.8711492300529738, + "learning_rate": 0.00017207411182989832, + "loss": 1.2567, + "step": 67 + }, + { + "epoch": 0.544, + "grad_norm": 0.8436613350439512, + "learning_rate": 0.00017116816083045602, + "loss": 1.2643, + "step": 68 + }, + { + "epoch": 0.552, + "grad_norm": 1.0512295898273085, + "learning_rate": 0.00017025021625596853, + "loss": 1.4127, + "step": 69 + }, + { + "epoch": 0.56, + "grad_norm": 0.8602871887637837, + "learning_rate": 0.0001693204328025389, + "loss": 1.3148, + "step": 70 + }, + { + "epoch": 0.568, + "grad_norm": 0.9067935512993148, + "learning_rate": 0.0001683789671614107, + "loss": 1.35, + "step": 71 + }, + { + "epoch": 0.576, + "grad_norm": 0.8788883808272243, + "learning_rate": 0.00016742597799256182, + "loss": 1.1869, + "step": 72 + }, + { + "epoch": 0.584, + "grad_norm": 0.9460567866998142, + "learning_rate": 0.00016646162589796615, + "loss": 1.2199, + "step": 73 + }, + { + "epoch": 0.592, + "grad_norm": 1.0036077278420452, + "learning_rate": 0.00016548607339452853, + "loss": 1.4394, + "step": 74 + }, + { + "epoch": 0.6, + "grad_norm": 0.9452753464036726, + "learning_rate": 0.00016449948488669639, + "loss": 1.2931, + "step": 75 + }, + { + "epoch": 0.608, + "grad_norm": 0.8233442283123868, + "learning_rate": 0.00016350202663875386, + "loss": 1.2427, + "step": 76 + }, + { + "epoch": 0.616, + "grad_norm": 0.8615498620797277, + "learning_rate": 0.00016249386674680184, + "loss": 1.2248, + "step": 77 + }, + { + "epoch": 0.624, + "grad_norm": 1.0537862842321968, + "learning_rate": 0.0001614751751104301, + "loss": 1.4126, + "step": 78 + }, + { + "epoch": 0.632, + "grad_norm": 0.8605446318443883, + "learning_rate": 0.00016044612340408466, + "loss": 1.2592, + "step": 79 + }, + { + "epoch": 0.64, + "grad_norm": 0.8682553736624375, + "learning_rate": 0.00015940688504813662, + "loss": 1.2749, + "step": 80 + }, + { + "epoch": 0.648, + "grad_norm": 0.8611624544092797, + "learning_rate": 0.00015835763517965673, + "loss": 1.3008, + "step": 81 + }, + { + "epoch": 0.656, + "grad_norm": 0.7778043968372704, + "learning_rate": 0.00015729855062290022, + "loss": 1.159, + "step": 82 + }, + { + "epoch": 0.664, + "grad_norm": 0.8267947712581017, + "learning_rate": 0.0001562298098595078, + "loss": 1.2829, + "step": 83 + }, + { + "epoch": 0.672, + "grad_norm": 0.8619985794810339, + "learning_rate": 0.00015515159299842707, + "loss": 1.3103, + "step": 84 + }, + { + "epoch": 0.68, + "grad_norm": 0.9595046423703979, + "learning_rate": 0.00015406408174555976, + "loss": 1.3134, + "step": 85 + }, + { + "epoch": 0.688, + "grad_norm": 0.8074353076035666, + "learning_rate": 0.00015296745937313987, + "loss": 1.2772, + "step": 86 + }, + { + "epoch": 0.696, + "grad_norm": 0.8372427936508989, + "learning_rate": 0.00015186191068884775, + "loss": 1.3604, + "step": 87 + }, + { + "epoch": 0.704, + "grad_norm": 0.8332005936689508, + "learning_rate": 0.00015074762200466556, + "loss": 1.2626, + "step": 88 + }, + { + "epoch": 0.712, + "grad_norm": 0.9439628299152224, + "learning_rate": 0.00014962478110547918, + "loss": 1.3441, + "step": 89 + }, + { + "epoch": 0.72, + "grad_norm": 0.996012432825993, + "learning_rate": 0.00014849357721743168, + "loss": 1.3717, + "step": 90 + }, + { + "epoch": 0.728, + "grad_norm": 0.8258261527226268, + "learning_rate": 0.0001473542009760343, + "loss": 1.3518, + "step": 91 + }, + { + "epoch": 0.736, + "grad_norm": 0.8778655106027685, + "learning_rate": 0.00014620684439403962, + "loss": 1.239, + "step": 92 + }, + { + "epoch": 0.744, + "grad_norm": 0.9540956569028518, + "learning_rate": 0.0001450517008290827, + "loss": 1.3892, + "step": 93 + }, + { + "epoch": 0.752, + "grad_norm": 0.9759804751023198, + "learning_rate": 0.0001438889649510956, + "loss": 1.355, + "step": 94 + }, + { + "epoch": 0.76, + "grad_norm": 0.9465521978636429, + "learning_rate": 0.00014271883270950073, + "loss": 1.414, + "step": 95 + }, + { + "epoch": 0.768, + "grad_norm": 0.9737987698753232, + "learning_rate": 0.00014154150130018866, + "loss": 1.3891, + "step": 96 + }, + { + "epoch": 0.776, + "grad_norm": 0.9099939493752546, + "learning_rate": 0.00014035716913228568, + "loss": 1.2908, + "step": 97 + }, + { + "epoch": 0.784, + "grad_norm": 0.9059674667292016, + "learning_rate": 0.00013916603579471705, + "loss": 1.2388, + "step": 98 + }, + { + "epoch": 0.792, + "grad_norm": 0.7761495052755369, + "learning_rate": 0.0001379683020225714, + "loss": 1.1604, + "step": 99 + }, + { + "epoch": 0.8, + "grad_norm": 0.9480743996261369, + "learning_rate": 0.000136764169663272, + "loss": 1.2747, + "step": 100 + }, + { + "epoch": 0.808, + "grad_norm": 0.9348510892211489, + "learning_rate": 0.00013555384164256048, + "loss": 1.315, + "step": 101 + }, + { + "epoch": 0.816, + "grad_norm": 0.8759026106815115, + "learning_rate": 0.00013433752193029886, + "loss": 1.261, + "step": 102 + }, + { + "epoch": 0.824, + "grad_norm": 0.8988522453335569, + "learning_rate": 0.00013311541550609565, + "loss": 1.3952, + "step": 103 + }, + { + "epoch": 0.832, + "grad_norm": 1.0173472319077939, + "learning_rate": 0.00013188772832476188, + "loss": 1.3036, + "step": 104 + }, + { + "epoch": 0.84, + "grad_norm": 0.9741728528764557, + "learning_rate": 0.00013065466728160252, + "loss": 1.3275, + "step": 105 + }, + { + "epoch": 0.848, + "grad_norm": 0.8999294894193648, + "learning_rate": 0.00012941644017754964, + "loss": 1.2809, + "step": 106 + }, + { + "epoch": 0.856, + "grad_norm": 0.90520417051285, + "learning_rate": 0.00012817325568414297, + "loss": 1.2641, + "step": 107 + }, + { + "epoch": 0.864, + "grad_norm": 0.8890234119872065, + "learning_rate": 0.00012692532330836346, + "loss": 1.2489, + "step": 108 + }, + { + "epoch": 0.872, + "grad_norm": 0.8474743399902241, + "learning_rate": 0.00012567285335732633, + "loss": 1.3252, + "step": 109 + }, + { + "epoch": 0.88, + "grad_norm": 0.8658741276450808, + "learning_rate": 0.00012441605690283915, + "loss": 1.3124, + "step": 110 + }, + { + "epoch": 0.888, + "grad_norm": 0.8981624669190241, + "learning_rate": 0.00012315514574583113, + "loss": 1.293, + "step": 111 + }, + { + "epoch": 0.896, + "grad_norm": 0.9264292061687182, + "learning_rate": 0.0001218903323806595, + "loss": 1.2691, + "step": 112 + }, + { + "epoch": 0.904, + "grad_norm": 0.8632538887733768, + "learning_rate": 0.00012062182995929882, + "loss": 1.2964, + "step": 113 + }, + { + "epoch": 0.912, + "grad_norm": 0.7531324392533881, + "learning_rate": 0.00011934985225541998, + "loss": 1.1218, + "step": 114 + }, + { + "epoch": 0.92, + "grad_norm": 0.8372202446838725, + "learning_rate": 0.0001180746136283638, + "loss": 1.2861, + "step": 115 + }, + { + "epoch": 0.928, + "grad_norm": 0.7756688230347595, + "learning_rate": 0.00011679632898701649, + "loss": 1.3032, + "step": 116 + }, + { + "epoch": 0.936, + "grad_norm": 0.8215704713405931, + "learning_rate": 0.00011551521375359206, + "loss": 1.2896, + "step": 117 + }, + { + "epoch": 0.944, + "grad_norm": 0.8939563107843507, + "learning_rate": 0.00011423148382732853, + "loss": 1.2896, + "step": 118 + }, + { + "epoch": 0.952, + "grad_norm": 1.0694989586581007, + "learning_rate": 0.00011294535554810354, + "loss": 1.4404, + "step": 119 + }, + { + "epoch": 0.96, + "grad_norm": 0.7928451007447335, + "learning_rate": 0.00011165704565997593, + "loss": 1.1585, + "step": 120 + }, + { + "epoch": 0.968, + "grad_norm": 0.9138057796186851, + "learning_rate": 0.00011036677127465889, + "loss": 1.2114, + "step": 121 + }, + { + "epoch": 0.976, + "grad_norm": 0.8812247071590515, + "learning_rate": 0.00010907474983493144, + "loss": 1.2766, + "step": 122 + }, + { + "epoch": 0.984, + "grad_norm": 0.9140878350260794, + "learning_rate": 0.00010778119907799398, + "loss": 1.1352, + "step": 123 + }, + { + "epoch": 0.992, + "grad_norm": 0.8566159540621913, + "learning_rate": 0.0001064863369987743, + "loss": 1.2235, + "step": 124 + }, + { + "epoch": 1.0, + "grad_norm": 0.9116304056158802, + "learning_rate": 0.00010519038181318999, + "loss": 1.3621, + "step": 125 + }, + { + "epoch": 1.008, + "grad_norm": 0.7755679747113441, + "learning_rate": 0.00010389355192137377, + "loss": 0.8344, + "step": 126 + }, + { + "epoch": 1.016, + "grad_norm": 0.6979054522941274, + "learning_rate": 0.00010259606587086783, + "loss": 0.7669, + "step": 127 + }, + { + "epoch": 1.024, + "grad_norm": 0.7291268179512961, + "learning_rate": 0.0001012981423197931, + "loss": 0.7925, + "step": 128 + }, + { + "epoch": 1.032, + "grad_norm": 0.8154205675076477, + "learning_rate": 0.0001, + "loss": 0.8207, + "step": 129 + }, + { + "epoch": 1.04, + "grad_norm": 0.8030396568749524, + "learning_rate": 9.870185768020693e-05, + "loss": 0.8186, + "step": 130 + }, + { + "epoch": 1.048, + "grad_norm": 0.8770791344771406, + "learning_rate": 9.740393412913219e-05, + "loss": 0.8436, + "step": 131 + }, + { + "epoch": 1.056, + "grad_norm": 0.8439224955672564, + "learning_rate": 9.610644807862625e-05, + "loss": 0.691, + "step": 132 + }, + { + "epoch": 1.064, + "grad_norm": 1.0048193558934282, + "learning_rate": 9.480961818681004e-05, + "loss": 0.8403, + "step": 133 + }, + { + "epoch": 1.072, + "grad_norm": 1.0523542503386671, + "learning_rate": 9.35136630012257e-05, + "loss": 0.7973, + "step": 134 + }, + { + "epoch": 1.08, + "grad_norm": 0.9002095632718905, + "learning_rate": 9.221880092200601e-05, + "loss": 0.7315, + "step": 135 + }, + { + "epoch": 1.088, + "grad_norm": 1.0630429397417207, + "learning_rate": 9.092525016506858e-05, + "loss": 0.8238, + "step": 136 + }, + { + "epoch": 1.096, + "grad_norm": 0.9190093600470862, + "learning_rate": 8.963322872534114e-05, + "loss": 0.6638, + "step": 137 + }, + { + "epoch": 1.104, + "grad_norm": 1.079431501244743, + "learning_rate": 8.83429543400241e-05, + "loss": 0.7734, + "step": 138 + }, + { + "epoch": 1.112, + "grad_norm": 0.9806969575217238, + "learning_rate": 8.705464445189647e-05, + "loss": 0.7376, + "step": 139 + }, + { + "epoch": 1.12, + "grad_norm": 0.9299842546825281, + "learning_rate": 8.57685161726715e-05, + "loss": 0.7011, + "step": 140 + }, + { + "epoch": 1.1280000000000001, + "grad_norm": 0.86875813110234, + "learning_rate": 8.448478624640797e-05, + "loss": 0.7586, + "step": 141 + }, + { + "epoch": 1.1360000000000001, + "grad_norm": 0.9244830667073946, + "learning_rate": 8.320367101298351e-05, + "loss": 0.7808, + "step": 142 + }, + { + "epoch": 1.144, + "grad_norm": 0.8451369002160248, + "learning_rate": 8.192538637163621e-05, + "loss": 0.7508, + "step": 143 + }, + { + "epoch": 1.152, + "grad_norm": 0.8870725287895037, + "learning_rate": 8.065014774458003e-05, + "loss": 0.7372, + "step": 144 + }, + { + "epoch": 1.16, + "grad_norm": 0.9227181372026184, + "learning_rate": 7.93781700407012e-05, + "loss": 0.7934, + "step": 145 + }, + { + "epoch": 1.168, + "grad_norm": 1.011467321825371, + "learning_rate": 7.810966761934053e-05, + "loss": 0.8319, + "step": 146 + }, + { + "epoch": 1.176, + "grad_norm": 1.019105698928754, + "learning_rate": 7.684485425416888e-05, + "loss": 0.7234, + "step": 147 + }, + { + "epoch": 1.184, + "grad_norm": 0.8950069527342469, + "learning_rate": 7.558394309716088e-05, + "loss": 0.7635, + "step": 148 + }, + { + "epoch": 1.192, + "grad_norm": 0.9609590149702135, + "learning_rate": 7.432714664267373e-05, + "loss": 0.7848, + "step": 149 + }, + { + "epoch": 1.2, + "grad_norm": 1.000983387982248, + "learning_rate": 7.307467669163655e-05, + "loss": 0.7037, + "step": 150 + }, + { + "epoch": 1.208, + "grad_norm": 0.9044957343906033, + "learning_rate": 7.182674431585704e-05, + "loss": 0.6679, + "step": 151 + }, + { + "epoch": 1.216, + "grad_norm": 0.9191152811134384, + "learning_rate": 7.058355982245037e-05, + "loss": 0.7899, + "step": 152 + }, + { + "epoch": 1.224, + "grad_norm": 0.9035340124406496, + "learning_rate": 6.934533271839752e-05, + "loss": 0.7174, + "step": 153 + }, + { + "epoch": 1.232, + "grad_norm": 0.9056583346175192, + "learning_rate": 6.811227167523815e-05, + "loss": 0.7246, + "step": 154 + }, + { + "epoch": 1.24, + "grad_norm": 0.9420824209712398, + "learning_rate": 6.688458449390437e-05, + "loss": 0.7473, + "step": 155 + }, + { + "epoch": 1.248, + "grad_norm": 0.9877125280086316, + "learning_rate": 6.566247806970119e-05, + "loss": 0.6951, + "step": 156 + }, + { + "epoch": 1.256, + "grad_norm": 0.9362781914039232, + "learning_rate": 6.444615835743955e-05, + "loss": 0.7093, + "step": 157 + }, + { + "epoch": 1.264, + "grad_norm": 0.9371151683347106, + "learning_rate": 6.323583033672799e-05, + "loss": 0.8175, + "step": 158 + }, + { + "epoch": 1.272, + "grad_norm": 1.0140898027379972, + "learning_rate": 6.203169797742861e-05, + "loss": 0.6958, + "step": 159 + }, + { + "epoch": 1.28, + "grad_norm": 1.0998722652608337, + "learning_rate": 6.083396420528298e-05, + "loss": 0.7738, + "step": 160 + }, + { + "epoch": 1.288, + "grad_norm": 0.9450932338254235, + "learning_rate": 5.964283086771435e-05, + "loss": 0.7078, + "step": 161 + }, + { + "epoch": 1.296, + "grad_norm": 1.0699624736401447, + "learning_rate": 5.845849869981137e-05, + "loss": 0.8035, + "step": 162 + }, + { + "epoch": 1.304, + "grad_norm": 1.742699415489815, + "learning_rate": 5.728116729049928e-05, + "loss": 0.707, + "step": 163 + }, + { + "epoch": 1.312, + "grad_norm": 0.9281568756309962, + "learning_rate": 5.611103504890444e-05, + "loss": 0.7321, + "step": 164 + }, + { + "epoch": 1.32, + "grad_norm": 0.9896754789509472, + "learning_rate": 5.4948299170917325e-05, + "loss": 0.773, + "step": 165 + }, + { + "epoch": 1.328, + "grad_norm": 0.9613196439719053, + "learning_rate": 5.379315560596038e-05, + "loss": 0.739, + "step": 166 + }, + { + "epoch": 1.336, + "grad_norm": 0.8612910635418535, + "learning_rate": 5.26457990239657e-05, + "loss": 0.7113, + "step": 167 + }, + { + "epoch": 1.3439999999999999, + "grad_norm": 0.9934810738394599, + "learning_rate": 5.1506422782568345e-05, + "loss": 0.776, + "step": 168 + }, + { + "epoch": 1.3519999999999999, + "grad_norm": 0.8947682005770344, + "learning_rate": 5.0375218894520834e-05, + "loss": 0.7245, + "step": 169 + }, + { + "epoch": 1.3599999999999999, + "grad_norm": 0.8593670171654829, + "learning_rate": 4.9252377995334444e-05, + "loss": 0.734, + "step": 170 + }, + { + "epoch": 1.3679999999999999, + "grad_norm": 0.9551317826270233, + "learning_rate": 4.813808931115228e-05, + "loss": 0.7536, + "step": 171 + }, + { + "epoch": 1.376, + "grad_norm": 0.9646754007650834, + "learning_rate": 4.703254062686017e-05, + "loss": 0.7289, + "step": 172 + }, + { + "epoch": 1.384, + "grad_norm": 0.9443811214884448, + "learning_rate": 4.593591825444028e-05, + "loss": 0.739, + "step": 173 + }, + { + "epoch": 1.392, + "grad_norm": 0.8894525263479396, + "learning_rate": 4.484840700157295e-05, + "loss": 0.714, + "step": 174 + }, + { + "epoch": 1.4, + "grad_norm": 1.0258290832265347, + "learning_rate": 4.377019014049223e-05, + "loss": 0.7807, + "step": 175 + }, + { + "epoch": 1.408, + "grad_norm": 0.9773569061242865, + "learning_rate": 4.270144937709981e-05, + "loss": 0.6801, + "step": 176 + }, + { + "epoch": 1.416, + "grad_norm": 0.8195593757551065, + "learning_rate": 4.164236482034327e-05, + "loss": 0.6585, + "step": 177 + }, + { + "epoch": 1.424, + "grad_norm": 0.8856301350683881, + "learning_rate": 4.059311495186338e-05, + "loss": 0.6718, + "step": 178 + }, + { + "epoch": 1.432, + "grad_norm": 1.032547376478014, + "learning_rate": 3.9553876595915375e-05, + "loss": 0.763, + "step": 179 + }, + { + "epoch": 1.44, + "grad_norm": 0.9696986764115734, + "learning_rate": 3.852482488956992e-05, + "loss": 0.7687, + "step": 180 + }, + { + "epoch": 1.448, + "grad_norm": 0.9585486030234329, + "learning_rate": 3.750613325319817e-05, + "loss": 0.711, + "step": 181 + }, + { + "epoch": 1.456, + "grad_norm": 0.8487980548445815, + "learning_rate": 3.649797336124615e-05, + "loss": 0.7318, + "step": 182 + }, + { + "epoch": 1.464, + "grad_norm": 1.245397825528434, + "learning_rate": 3.550051511330361e-05, + "loss": 0.7734, + "step": 183 + }, + { + "epoch": 1.472, + "grad_norm": 0.8973712731694369, + "learning_rate": 3.45139266054715e-05, + "loss": 0.7002, + "step": 184 + }, + { + "epoch": 1.48, + "grad_norm": 0.9725393858342541, + "learning_rate": 3.3538374102033866e-05, + "loss": 0.7616, + "step": 185 + }, + { + "epoch": 1.488, + "grad_norm": 1.117394190148479, + "learning_rate": 3.257402200743821e-05, + "loss": 0.7912, + "step": 186 + }, + { + "epoch": 1.496, + "grad_norm": 0.8912340910228728, + "learning_rate": 3.1621032838589305e-05, + "loss": 0.6443, + "step": 187 + }, + { + "epoch": 1.504, + "grad_norm": 0.8866116563120968, + "learning_rate": 3.0679567197461134e-05, + "loss": 0.7704, + "step": 188 + }, + { + "epoch": 1.512, + "grad_norm": 0.7749023022858249, + "learning_rate": 2.974978374403147e-05, + "loss": 0.6558, + "step": 189 + }, + { + "epoch": 1.52, + "grad_norm": 0.9365999731244843, + "learning_rate": 2.8831839169543996e-05, + "loss": 0.7603, + "step": 190 + }, + { + "epoch": 1.528, + "grad_norm": 0.9766004674575304, + "learning_rate": 2.7925888170101665e-05, + "loss": 0.7267, + "step": 191 + }, + { + "epoch": 1.536, + "grad_norm": 1.0390916143424298, + "learning_rate": 2.7032083420597e-05, + "loss": 0.8023, + "step": 192 + }, + { + "epoch": 1.544, + "grad_norm": 0.9147338948135607, + "learning_rate": 2.6150575548982292e-05, + "loss": 0.6871, + "step": 193 + }, + { + "epoch": 1.552, + "grad_norm": 0.932755369920222, + "learning_rate": 2.528151311088537e-05, + "loss": 0.7526, + "step": 194 + }, + { + "epoch": 1.56, + "grad_norm": 0.935429666752721, + "learning_rate": 2.4425042564574184e-05, + "loss": 0.6711, + "step": 195 + }, + { + "epoch": 1.568, + "grad_norm": 0.8845809906688457, + "learning_rate": 2.3581308246275103e-05, + "loss": 0.6961, + "step": 196 + }, + { + "epoch": 1.576, + "grad_norm": 0.9995414517223054, + "learning_rate": 2.2750452345848682e-05, + "loss": 0.7372, + "step": 197 + }, + { + "epoch": 1.584, + "grad_norm": 0.9468268923677634, + "learning_rate": 2.1932614882827197e-05, + "loss": 0.7459, + "step": 198 + }, + { + "epoch": 1.592, + "grad_norm": 0.9145494700163316, + "learning_rate": 2.112793368281799e-05, + "loss": 0.7345, + "step": 199 + }, + { + "epoch": 1.6, + "grad_norm": 1.0705558425892678, + "learning_rate": 2.03365443542764e-05, + "loss": 0.7512, + "step": 200 + }, + { + "epoch": 1.608, + "grad_norm": 0.9633908526891983, + "learning_rate": 1.9558580265652448e-05, + "loss": 0.7458, + "step": 201 + }, + { + "epoch": 1.616, + "grad_norm": 0.9217792604616163, + "learning_rate": 1.879417252291502e-05, + "loss": 0.8034, + "step": 202 + }, + { + "epoch": 1.624, + "grad_norm": 0.9439148119334351, + "learning_rate": 1.804344994745727e-05, + "loss": 0.7709, + "step": 203 + }, + { + "epoch": 1.6320000000000001, + "grad_norm": 1.2065917749966415, + "learning_rate": 1.730653905438714e-05, + "loss": 0.7415, + "step": 204 + }, + { + "epoch": 1.6400000000000001, + "grad_norm": 0.9549382902209208, + "learning_rate": 1.6583564031206357e-05, + "loss": 0.6937, + "step": 205 + }, + { + "epoch": 1.6480000000000001, + "grad_norm": 0.9953482294115805, + "learning_rate": 1.587464671688187e-05, + "loss": 0.7345, + "step": 206 + }, + { + "epoch": 1.6560000000000001, + "grad_norm": 1.0391151845768114, + "learning_rate": 1.5179906581313064e-05, + "loss": 0.7517, + "step": 207 + }, + { + "epoch": 1.6640000000000001, + "grad_norm": 0.892304198205609, + "learning_rate": 1.4499460705197998e-05, + "loss": 0.6669, + "step": 208 + }, + { + "epoch": 1.6720000000000002, + "grad_norm": 1.0266783762809504, + "learning_rate": 1.3833423760302611e-05, + "loss": 0.7266, + "step": 209 + }, + { + "epoch": 1.6800000000000002, + "grad_norm": 1.1356445620964943, + "learning_rate": 1.3181907990135622e-05, + "loss": 0.7033, + "step": 210 + }, + { + "epoch": 1.688, + "grad_norm": 1.0125564012072685, + "learning_rate": 1.2545023191032801e-05, + "loss": 0.7136, + "step": 211 + }, + { + "epoch": 1.696, + "grad_norm": 0.9028529514701249, + "learning_rate": 1.1922876693653585e-05, + "loss": 0.7006, + "step": 212 + }, + { + "epoch": 1.704, + "grad_norm": 1.0456487739230185, + "learning_rate": 1.131557334489326e-05, + "loss": 0.7334, + "step": 213 + }, + { + "epoch": 1.712, + "grad_norm": 1.0317362890417232, + "learning_rate": 1.0723215490213634e-05, + "loss": 0.7419, + "step": 214 + }, + { + "epoch": 1.72, + "grad_norm": 0.9311035227081702, + "learning_rate": 1.0145902956395447e-05, + "loss": 0.7117, + "step": 215 + }, + { + "epoch": 1.728, + "grad_norm": 0.9331344553753933, + "learning_rate": 9.583733034714981e-06, + "loss": 0.6911, + "step": 216 + }, + { + "epoch": 1.736, + "grad_norm": 1.1060613237728212, + "learning_rate": 9.036800464548157e-06, + "loss": 0.7269, + "step": 217 + }, + { + "epoch": 1.744, + "grad_norm": 1.015541013094166, + "learning_rate": 8.505197417404687e-06, + "loss": 0.7157, + "step": 218 + }, + { + "epoch": 1.752, + "grad_norm": 0.8510540833090268, + "learning_rate": 7.989013481394814e-06, + "loss": 0.715, + "step": 219 + }, + { + "epoch": 1.76, + "grad_norm": 0.9268564574728297, + "learning_rate": 7.488335646131628e-06, + "loss": 0.6754, + "step": 220 + }, + { + "epoch": 1.768, + "grad_norm": 0.9207812232471209, + "learning_rate": 7.003248288071118e-06, + "loss": 0.6912, + "step": 221 + }, + { + "epoch": 1.776, + "grad_norm": 0.8179210209673273, + "learning_rate": 6.533833156292679e-06, + "loss": 0.5982, + "step": 222 + }, + { + "epoch": 1.784, + "grad_norm": 1.2179160408590126, + "learning_rate": 6.08016935872251e-06, + "loss": 0.749, + "step": 223 + }, + { + "epoch": 1.792, + "grad_norm": 1.0116883719900378, + "learning_rate": 5.6423333488018095e-06, + "loss": 0.7632, + "step": 224 + }, + { + "epoch": 1.8, + "grad_norm": 0.9667019863085939, + "learning_rate": 5.22039891260262e-06, + "loss": 0.7349, + "step": 225 + }, + { + "epoch": 1.808, + "grad_norm": 0.9824595257081432, + "learning_rate": 4.8144371563930476e-06, + "loss": 0.6991, + "step": 226 + }, + { + "epoch": 1.8159999999999998, + "grad_norm": 0.914323365177283, + "learning_rate": 4.424516494654118e-06, + "loss": 0.734, + "step": 227 + }, + { + "epoch": 1.8239999999999998, + "grad_norm": 0.9171539637252152, + "learning_rate": 4.050702638550275e-06, + "loss": 0.6559, + "step": 228 + }, + { + "epoch": 1.8319999999999999, + "grad_norm": 0.9296078028858084, + "learning_rate": 3.693058584855369e-06, + "loss": 0.6004, + "step": 229 + }, + { + "epoch": 1.8399999999999999, + "grad_norm": 0.9768565124078812, + "learning_rate": 3.3516446053363015e-06, + "loss": 0.6891, + "step": 230 + }, + { + "epoch": 1.8479999999999999, + "grad_norm": 1.095726657475443, + "learning_rate": 3.026518236595621e-06, + "loss": 0.7675, + "step": 231 + }, + { + "epoch": 1.8559999999999999, + "grad_norm": 1.0273439273800609, + "learning_rate": 2.717734270375272e-06, + "loss": 0.7454, + "step": 232 + }, + { + "epoch": 1.8639999999999999, + "grad_norm": 0.9590456904201695, + "learning_rate": 2.4253447443228106e-06, + "loss": 0.6743, + "step": 233 + }, + { + "epoch": 1.8719999999999999, + "grad_norm": 0.9745320151965339, + "learning_rate": 2.1493989332218468e-06, + "loss": 0.7886, + "step": 234 + }, + { + "epoch": 1.88, + "grad_norm": 1.0763904305244638, + "learning_rate": 1.8899433406879608e-06, + "loss": 0.7467, + "step": 235 + }, + { + "epoch": 1.888, + "grad_norm": 0.967581764456723, + "learning_rate": 1.6470216913317626e-06, + "loss": 0.6785, + "step": 236 + }, + { + "epoch": 1.896, + "grad_norm": 1.0982009578130143, + "learning_rate": 1.4206749233902084e-06, + "loss": 0.698, + "step": 237 + }, + { + "epoch": 1.904, + "grad_norm": 0.8776694361072739, + "learning_rate": 1.2109411818274852e-06, + "loss": 0.742, + "step": 238 + }, + { + "epoch": 1.912, + "grad_norm": 1.19156386832082, + "learning_rate": 1.0178558119067315e-06, + "loss": 0.712, + "step": 239 + }, + { + "epoch": 1.92, + "grad_norm": 1.102094357079355, + "learning_rate": 8.41451353233369e-07, + "loss": 0.7427, + "step": 240 + }, + { + "epoch": 1.928, + "grad_norm": 0.9264182869001772, + "learning_rate": 6.817575342714988e-07, + "loss": 0.6734, + "step": 241 + }, + { + "epoch": 1.936, + "grad_norm": 1.0486964114257968, + "learning_rate": 5.388012673338661e-07, + "loss": 0.7543, + "step": 242 + }, + { + "epoch": 1.944, + "grad_norm": 0.9750425672218928, + "learning_rate": 4.126066440464982e-07, + "loss": 0.7329, + "step": 243 + }, + { + "epoch": 1.952, + "grad_norm": 0.9780928365989245, + "learning_rate": 3.0319493128866396e-07, + "loss": 0.7039, + "step": 244 + }, + { + "epoch": 1.96, + "grad_norm": 1.061220289490227, + "learning_rate": 2.1058456760891798e-07, + "loss": 0.8341, + "step": 245 + }, + { + "epoch": 1.968, + "grad_norm": 0.9416367422713224, + "learning_rate": 1.3479116011769767e-07, + "loss": 0.7403, + "step": 246 + }, + { + "epoch": 1.976, + "grad_norm": 0.9265996957737294, + "learning_rate": 7.582748185719358e-08, + "loss": 0.6653, + "step": 247 + }, + { + "epoch": 1.984, + "grad_norm": 1.0104858211229228, + "learning_rate": 3.370346964876036e-08, + "loss": 0.7525, + "step": 248 + }, + { + "epoch": 1.992, + "grad_norm": 0.8765473318432947, + "learning_rate": 8.426222418311814e-09, + "loss": 0.6066, + "step": 249 + }, + { + "epoch": 2.0, + "grad_norm": 0.9455283227165583, + "learning_rate": 0.0, + "loss": 0.6425, + "step": 250 + }, + { + "epoch": 2.0, + "step": 250, + "total_flos": 81089103593472.0, + "train_loss": 1.037638253211975, + "train_runtime": 2502.8991, + "train_samples_per_second": 1.598, + "train_steps_per_second": 0.1 + } + ], + "logging_steps": 1.0, + "max_steps": 250, + "num_input_tokens_seen": 0, + "num_train_epochs": 2, + "save_steps": 500, + "stateful_callbacks": { + "TrainerControl": { + "args": { + "should_epoch_stop": false, + "should_evaluate": false, + "should_log": false, + "should_save": false, + "should_training_stop": false + }, + "attributes": {} + } + }, + "total_flos": 81089103593472.0, + "train_batch_size": 8, + "trial_name": null, + "trial_params": null +} diff --git a/single_dataset/llama3_qa/VideoGameBunny_v1_1-Llama-3-8B-V-llama3_qa_dataset_5000_epochs_1_lora/README.md b/single_dataset/llama3_qa/VideoGameBunny_v1_1-Llama-3-8B-V-llama3_qa_dataset_5000_epochs_1_lora/README.md new file mode 100644 index 0000000000000000000000000000000000000000..4f291d8d6fdb82fcff274f83bcc52b834347797c --- /dev/null +++ b/single_dataset/llama3_qa/VideoGameBunny_v1_1-Llama-3-8B-V-llama3_qa_dataset_5000_epochs_1_lora/README.md @@ -0,0 +1,202 @@ +--- +library_name: peft +base_model: ./weights/Bunny-v1_1-Llama-3-8B-V +--- + +# Model Card for Model ID + + + + + +## Model Details + +### Model Description + + + + + +- **Developed by:** [More Information Needed] +- **Funded by [optional]:** [More Information Needed] +- **Shared by [optional]:** [More Information Needed] +- **Model type:** [More Information Needed] +- **Language(s) (NLP):** [More Information Needed] +- **License:** [More Information Needed] +- **Finetuned from model [optional]:** [More Information Needed] + +### Model Sources [optional] + + + +- **Repository:** [More Information Needed] +- **Paper [optional]:** [More Information Needed] +- **Demo [optional]:** [More Information Needed] + +## Uses + + + +### Direct Use + + + +[More Information Needed] + +### Downstream Use [optional] + + + +[More Information Needed] + +### Out-of-Scope Use + + + +[More Information Needed] + +## Bias, Risks, and Limitations + + + +[More Information Needed] + +### Recommendations + + + +Users (both direct and downstream) should be made aware of the risks, biases and limitations of the model. More information needed for further recommendations. + +## How to Get Started with the Model + +Use the code below to get started with the model. + +[More Information Needed] + +## Training Details + +### Training Data + + + +[More Information Needed] + +### Training Procedure + + + +#### Preprocessing [optional] + +[More Information Needed] + + +#### Training Hyperparameters + +- **Training regime:** [More Information Needed] + +#### Speeds, Sizes, Times [optional] + + + +[More Information Needed] + +## Evaluation + + + +### Testing Data, Factors & Metrics + +#### Testing Data + + + +[More Information Needed] + +#### Factors + + + +[More Information Needed] + +#### Metrics + + + +[More Information Needed] + +### Results + +[More Information Needed] + +#### Summary + + + +## Model Examination [optional] + + + +[More Information Needed] + +## Environmental Impact + + + +Carbon emissions can be estimated using the [Machine Learning Impact calculator](https://mlco2.github.io/impact#compute) presented in [Lacoste et al. (2019)](https://arxiv.org/abs/1910.09700). + +- **Hardware Type:** [More Information Needed] +- **Hours used:** [More Information Needed] +- **Cloud Provider:** [More Information Needed] +- **Compute Region:** [More Information Needed] +- **Carbon Emitted:** [More Information Needed] + +## Technical Specifications [optional] + +### Model Architecture and Objective + +[More Information Needed] + +### Compute Infrastructure + +[More Information Needed] + +#### Hardware + +[More Information Needed] + +#### Software + +[More Information Needed] + +## Citation [optional] + + + +**BibTeX:** + +[More Information Needed] + +**APA:** + +[More Information Needed] + +## Glossary [optional] + + + +[More Information Needed] + +## More Information [optional] + +[More Information Needed] + +## Model Card Authors [optional] + +[More Information Needed] + +## Model Card Contact + +[More Information Needed] +### Framework versions + +- PEFT 0.11.1 \ No newline at end of file diff --git a/single_dataset/llama3_qa/VideoGameBunny_v1_1-Llama-3-8B-V-llama3_qa_dataset_5000_epochs_1_lora/adapter_config.json b/single_dataset/llama3_qa/VideoGameBunny_v1_1-Llama-3-8B-V-llama3_qa_dataset_5000_epochs_1_lora/adapter_config.json new file mode 100644 index 0000000000000000000000000000000000000000..a5995eae1b8c90e2ccac3c70c288f75cec71c83a --- /dev/null +++ b/single_dataset/llama3_qa/VideoGameBunny_v1_1-Llama-3-8B-V-llama3_qa_dataset_5000_epochs_1_lora/adapter_config.json @@ -0,0 +1,34 @@ +{ + "alpha_pattern": {}, + "auto_mapping": null, + "base_model_name_or_path": "./weights/Bunny-v1_1-Llama-3-8B-V", + "bias": "none", + "fan_in_fan_out": false, + "inference_mode": true, + "init_lora_weights": true, + "layer_replication": null, + "layers_pattern": null, + "layers_to_transform": null, + "loftq_config": {}, + "lora_alpha": 256, + "lora_dropout": 0.1, + "megatron_config": null, + "megatron_core": "megatron.core", + "modules_to_save": null, + "peft_type": "LORA", + "r": 128, + "rank_pattern": {}, + "revision": null, + "target_modules": [ + "k_proj", + "o_proj", + "up_proj", + "q_proj", + "gate_proj", + "down_proj", + "v_proj" + ], + "task_type": "CAUSAL_LM", + "use_dora": false, + "use_rslora": false +} \ No newline at end of file diff --git a/single_dataset/llama3_qa/VideoGameBunny_v1_1-Llama-3-8B-V-llama3_qa_dataset_5000_epochs_1_lora/adapter_model.safetensors b/single_dataset/llama3_qa/VideoGameBunny_v1_1-Llama-3-8B-V-llama3_qa_dataset_5000_epochs_1_lora/adapter_model.safetensors new file mode 100644 index 0000000000000000000000000000000000000000..71701bd992146f189da03e0329a3d7922e681da6 --- /dev/null +++ b/single_dataset/llama3_qa/VideoGameBunny_v1_1-Llama-3-8B-V-llama3_qa_dataset_5000_epochs_1_lora/adapter_model.safetensors @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:95784096674574f6950ba70e2eedb295d32f854fdd95a4a7454a03e1388f5b93 +size 671150064 diff --git a/single_dataset/llama3_qa/VideoGameBunny_v1_1-Llama-3-8B-V-llama3_qa_dataset_5000_epochs_1_lora/config.json b/single_dataset/llama3_qa/VideoGameBunny_v1_1-Llama-3-8B-V-llama3_qa_dataset_5000_epochs_1_lora/config.json new file mode 100644 index 0000000000000000000000000000000000000000..ee7ed86f4ff6f3381a6bb24981934be89aa376e7 --- /dev/null +++ b/single_dataset/llama3_qa/VideoGameBunny_v1_1-Llama-3-8B-V-llama3_qa_dataset_5000_epochs_1_lora/config.json @@ -0,0 +1,45 @@ +{ + "_name_or_path": "./weights/Bunny-v1_1-Llama-3-8B-V", + "architectures": [ + "BunnyLlamaForCausalLM" + ], + "attention_bias": false, + "attention_dropout": 0.0, + "auto_map": { + "AutoConfig": "configuration_bunny_llama.BunnyLlamaConfig", + "AutoModelForCausalLM": "modeling_bunny_llama.BunnyLlamaForCausalLM" + }, + "bos_token_id": 128000, + "continuous_training": false, + "eos_token_id": 128001, + "freeze_mm_mlp_adapter": false, + "hidden_act": "silu", + "hidden_size": 4096, + "image_aspect_ratio": "pad", + "initializer_range": 0.02, + "intermediate_size": 14336, + "max_position_embeddings": 8192, + "mm_hidden_size": 3456, + "mm_projector_lr": null, + "mm_projector_type": "mlp2x_gelu", + "mm_vision_tower": "./weights/siglip-so400m-patch14-384", + "model_type": "bunny-llama", + "num_attention_heads": 32, + "num_hidden_layers": 32, + "num_key_value_heads": 8, + "pretraining_tp": 1, + "rms_norm_eps": 1e-05, + "rope_scaling": null, + "rope_theta": 500000.0, + "tie_word_embeddings": false, + "tokenizer_model_max_length": 2048, + "tokenizer_padding_side": "right", + "torch_dtype": "float16", + "transformers_version": "4.41.2", + "tune_mm_mlp_adapter": false, + "unfreeze_vision_tower": true, + "use_cache": true, + "use_mm_proj": true, + "use_s2": true, + "vocab_size": 128256 +} diff --git a/single_dataset/llama3_qa/VideoGameBunny_v1_1-Llama-3-8B-V-llama3_qa_dataset_5000_epochs_1_lora/non_lora_trainables.bin b/single_dataset/llama3_qa/VideoGameBunny_v1_1-Llama-3-8B-V-llama3_qa_dataset_5000_epochs_1_lora/non_lora_trainables.bin new file mode 100644 index 0000000000000000000000000000000000000000..c9e59f4ecc1ec529c37a3d7b3171da8c029880ca --- /dev/null +++ b/single_dataset/llama3_qa/VideoGameBunny_v1_1-Llama-3-8B-V-llama3_qa_dataset_5000_epochs_1_lora/non_lora_trainables.bin @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:b29a9500e381d736d83fc8fb868393fcb4882fb12bf439cb83277f333ec14778 +size 918507402 diff --git a/single_dataset/llama3_qa/VideoGameBunny_v1_1-Llama-3-8B-V-llama3_qa_dataset_5000_epochs_1_lora/trainer_state.json b/single_dataset/llama3_qa/VideoGameBunny_v1_1-Llama-3-8B-V-llama3_qa_dataset_5000_epochs_1_lora/trainer_state.json new file mode 100644 index 0000000000000000000000000000000000000000..02b10f2cf163875b7056baa99142c333faf3ff2e --- /dev/null +++ b/single_dataset/llama3_qa/VideoGameBunny_v1_1-Llama-3-8B-V-llama3_qa_dataset_5000_epochs_1_lora/trainer_state.json @@ -0,0 +1,2226 @@ +{ + "best_metric": null, + "best_model_checkpoint": null, + "epoch": 0.9984, + "eval_steps": 500, + "global_step": 312, + "is_hyper_param_search": false, + "is_local_process_zero": true, + "is_world_process_zero": true, + "log_history": [ + { + "epoch": 0.0032, + "grad_norm": 1.9276346784275542, + "learning_rate": 2e-05, + "loss": 1.7768, + "step": 1 + }, + { + "epoch": 0.0064, + "grad_norm": 1.6481278982394791, + "learning_rate": 4e-05, + "loss": 1.538, + "step": 2 + }, + { + "epoch": 0.0096, + "grad_norm": 1.7717724000401631, + "learning_rate": 6e-05, + "loss": 1.6643, + "step": 3 + }, + { + "epoch": 0.0128, + "grad_norm": 1.4798087223498675, + "learning_rate": 8e-05, + "loss": 1.745, + "step": 4 + }, + { + "epoch": 0.016, + "grad_norm": 1.5040913189510572, + "learning_rate": 0.0001, + "loss": 1.6507, + "step": 5 + }, + { + "epoch": 0.0192, + "grad_norm": 1.4179433288448318, + "learning_rate": 0.00012, + "loss": 1.4839, + "step": 6 + }, + { + "epoch": 0.0224, + "grad_norm": 1.390878794328348, + "learning_rate": 0.00014, + "loss": 1.5861, + "step": 7 + }, + { + "epoch": 0.0256, + "grad_norm": 1.159005693005469, + "learning_rate": 0.00016, + "loss": 1.4928, + "step": 8 + }, + { + "epoch": 0.0288, + "grad_norm": 1.0655915255744721, + "learning_rate": 0.00018, + "loss": 1.4568, + "step": 9 + }, + { + "epoch": 0.032, + "grad_norm": 1.0850599508988026, + "learning_rate": 0.0002, + "loss": 1.5257, + "step": 10 + }, + { + "epoch": 0.0352, + "grad_norm": 1.0538700008730024, + "learning_rate": 0.00019999458931878073, + "loss": 1.3898, + "step": 11 + }, + { + "epoch": 0.0384, + "grad_norm": 1.1490879941978314, + "learning_rate": 0.0001999783578606323, + "loss": 1.4767, + "step": 12 + }, + { + "epoch": 0.0416, + "grad_norm": 1.002696197709384, + "learning_rate": 0.00019995130738201966, + "loss": 1.3906, + "step": 13 + }, + { + "epoch": 0.0448, + "grad_norm": 1.0994669012710434, + "learning_rate": 0.0001999134408101731, + "loss": 1.3314, + "step": 14 + }, + { + "epoch": 0.048, + "grad_norm": 1.1006062488575192, + "learning_rate": 0.00019986476224277165, + "loss": 1.4306, + "step": 15 + }, + { + "epoch": 0.0512, + "grad_norm": 1.1638333681456912, + "learning_rate": 0.00019980527694749952, + "loss": 1.4754, + "step": 16 + }, + { + "epoch": 0.0544, + "grad_norm": 1.0752931115953115, + "learning_rate": 0.00019973499136147606, + "loss": 1.5654, + "step": 17 + }, + { + "epoch": 0.0576, + "grad_norm": 1.4343590589869308, + "learning_rate": 0.0001996539130905593, + "loss": 1.4925, + "step": 18 + }, + { + "epoch": 0.0608, + "grad_norm": 1.024250588988728, + "learning_rate": 0.0001995620509085228, + "loss": 1.4888, + "step": 19 + }, + { + "epoch": 0.064, + "grad_norm": 0.9485578932937413, + "learning_rate": 0.00019945941475610623, + "loss": 1.2813, + "step": 20 + }, + { + "epoch": 0.0672, + "grad_norm": 1.0130437917922024, + "learning_rate": 0.0001993460157399396, + "loss": 1.2961, + "step": 21 + }, + { + "epoch": 0.0704, + "grad_norm": 1.1200060173705617, + "learning_rate": 0.0001992218661313415, + "loss": 1.3012, + "step": 22 + }, + { + "epoch": 0.0736, + "grad_norm": 0.939748034453486, + "learning_rate": 0.00019908697936499103, + "loss": 1.3703, + "step": 23 + }, + { + "epoch": 0.0768, + "grad_norm": 0.9919126812101607, + "learning_rate": 0.00019894137003747403, + "loss": 1.4193, + "step": 24 + }, + { + "epoch": 0.08, + "grad_norm": 0.9686773170539923, + "learning_rate": 0.00019878505390570362, + "loss": 1.4145, + "step": 25 + }, + { + "epoch": 0.0832, + "grad_norm": 1.1428306722186963, + "learning_rate": 0.00019861804788521493, + "loss": 1.5102, + "step": 26 + }, + { + "epoch": 0.0864, + "grad_norm": 0.9949347203047502, + "learning_rate": 0.00019844037004833473, + "loss": 1.1556, + "step": 27 + }, + { + "epoch": 0.0896, + "grad_norm": 0.8848229387853778, + "learning_rate": 0.00019825203962222572, + "loss": 1.3756, + "step": 28 + }, + { + "epoch": 0.0928, + "grad_norm": 0.9900335819187616, + "learning_rate": 0.0001980530769868059, + "loss": 1.4067, + "step": 29 + }, + { + "epoch": 0.096, + "grad_norm": 0.9801011347209362, + "learning_rate": 0.00019784350367254322, + "loss": 1.3651, + "step": 30 + }, + { + "epoch": 0.0992, + "grad_norm": 0.9365165117736447, + "learning_rate": 0.0001976233423581255, + "loss": 1.4403, + "step": 31 + }, + { + "epoch": 0.1024, + "grad_norm": 1.0123569730398672, + "learning_rate": 0.0001973926168680066, + "loss": 1.4026, + "step": 32 + }, + { + "epoch": 0.1056, + "grad_norm": 0.9736841721199668, + "learning_rate": 0.00019715135216982798, + "loss": 1.4121, + "step": 33 + }, + { + "epoch": 0.1088, + "grad_norm": 1.1071688313996157, + "learning_rate": 0.0001968995743717171, + "loss": 1.3487, + "step": 34 + }, + { + "epoch": 0.112, + "grad_norm": 1.1060493331445256, + "learning_rate": 0.00019663731071946206, + "loss": 1.4219, + "step": 35 + }, + { + "epoch": 0.1152, + "grad_norm": 1.026938645087035, + "learning_rate": 0.00019636458959356316, + "loss": 1.3932, + "step": 36 + }, + { + "epoch": 0.1184, + "grad_norm": 1.1557966387978893, + "learning_rate": 0.0001960814405061619, + "loss": 1.3466, + "step": 37 + }, + { + "epoch": 0.1216, + "grad_norm": 1.0078215594281974, + "learning_rate": 0.00019578789409784727, + "loss": 1.3699, + "step": 38 + }, + { + "epoch": 0.1248, + "grad_norm": 0.9811913896716529, + "learning_rate": 0.00019548398213434007, + "loss": 1.2409, + "step": 39 + }, + { + "epoch": 0.128, + "grad_norm": 0.8918197387654728, + "learning_rate": 0.00019516973750305532, + "loss": 1.3851, + "step": 40 + }, + { + "epoch": 0.1312, + "grad_norm": 1.068089022611143, + "learning_rate": 0.00019484519420954354, + "loss": 1.2598, + "step": 41 + }, + { + "epoch": 0.1344, + "grad_norm": 1.0478030296296357, + "learning_rate": 0.00019451038737381077, + "loss": 1.4167, + "step": 42 + }, + { + "epoch": 0.1376, + "grad_norm": 0.8100338238370158, + "learning_rate": 0.00019416535322651818, + "loss": 1.2201, + "step": 43 + }, + { + "epoch": 0.1408, + "grad_norm": 0.8517143366939263, + "learning_rate": 0.00019381012910506146, + "loss": 1.1609, + "step": 44 + }, + { + "epoch": 0.144, + "grad_norm": 0.9142506272664487, + "learning_rate": 0.00019344475344953012, + "loss": 1.3413, + "step": 45 + }, + { + "epoch": 0.1472, + "grad_norm": 1.0536733695691438, + "learning_rate": 0.00019306926579854821, + "loss": 1.3029, + "step": 46 + }, + { + "epoch": 0.1504, + "grad_norm": 0.9221962771066791, + "learning_rate": 0.00019268370678499533, + "loss": 1.3558, + "step": 47 + }, + { + "epoch": 0.1536, + "grad_norm": 0.910936924116227, + "learning_rate": 0.0001922881181316097, + "loss": 1.2835, + "step": 48 + }, + { + "epoch": 0.1568, + "grad_norm": 1.1056786632238242, + "learning_rate": 0.00019188254264647337, + "loss": 1.4781, + "step": 49 + }, + { + "epoch": 0.16, + "grad_norm": 1.0454075542982686, + "learning_rate": 0.0001914670242183795, + "loss": 1.4421, + "step": 50 + }, + { + "epoch": 0.1632, + "grad_norm": 0.9283596325199607, + "learning_rate": 0.0001910416078120832, + "loss": 1.2897, + "step": 51 + }, + { + "epoch": 0.1664, + "grad_norm": 0.9848959751841972, + "learning_rate": 0.0001906063394634356, + "loss": 1.3819, + "step": 52 + }, + { + "epoch": 0.1696, + "grad_norm": 0.9673118165984793, + "learning_rate": 0.00019016126627440237, + "loss": 1.4272, + "step": 53 + }, + { + "epoch": 0.1728, + "grad_norm": 0.9432928631761596, + "learning_rate": 0.00018970643640796642, + "loss": 1.3185, + "step": 54 + }, + { + "epoch": 0.176, + "grad_norm": 0.961549810649517, + "learning_rate": 0.000189241899082916, + "loss": 1.3047, + "step": 55 + }, + { + "epoch": 0.1792, + "grad_norm": 0.966460247332086, + "learning_rate": 0.00018876770456851877, + "loss": 1.3066, + "step": 56 + }, + { + "epoch": 0.1824, + "grad_norm": 0.9154876554983992, + "learning_rate": 0.0001882839041790818, + "loss": 1.2382, + "step": 57 + }, + { + "epoch": 0.1856, + "grad_norm": 0.9921361426235112, + "learning_rate": 0.00018779055026839868, + "loss": 1.3354, + "step": 58 + }, + { + "epoch": 0.1888, + "grad_norm": 0.8773066871627214, + "learning_rate": 0.00018728769622408423, + "loss": 1.2401, + "step": 59 + }, + { + "epoch": 0.192, + "grad_norm": 0.9892167716904691, + "learning_rate": 0.00018677539646179707, + "loss": 1.3274, + "step": 60 + }, + { + "epoch": 0.1952, + "grad_norm": 0.9692891455072072, + "learning_rate": 0.00018625370641935129, + "loss": 1.3451, + "step": 61 + }, + { + "epoch": 0.1984, + "grad_norm": 0.9936653236889128, + "learning_rate": 0.00018572268255071718, + "loss": 1.3666, + "step": 62 + }, + { + "epoch": 0.2016, + "grad_norm": 1.0220940618417156, + "learning_rate": 0.00018518238231991218, + "loss": 1.3666, + "step": 63 + }, + { + "epoch": 0.2048, + "grad_norm": 0.872041348906631, + "learning_rate": 0.00018463286419478255, + "loss": 1.1922, + "step": 64 + }, + { + "epoch": 0.208, + "grad_norm": 0.9828590750917678, + "learning_rate": 0.00018407418764067627, + "loss": 1.4001, + "step": 65 + }, + { + "epoch": 0.2112, + "grad_norm": 1.0032588614829971, + "learning_rate": 0.00018350641311400812, + "loss": 1.3512, + "step": 66 + }, + { + "epoch": 0.2144, + "grad_norm": 0.9312261542560399, + "learning_rate": 0.0001829296020557174, + "loss": 1.3406, + "step": 67 + }, + { + "epoch": 0.2176, + "grad_norm": 0.9844326224338974, + "learning_rate": 0.00018234381688461942, + "loss": 1.2899, + "step": 68 + }, + { + "epoch": 0.2208, + "grad_norm": 0.9865649903509482, + "learning_rate": 0.0001817491209906506, + "loss": 1.3333, + "step": 69 + }, + { + "epoch": 0.224, + "grad_norm": 0.83819063618543, + "learning_rate": 0.00018114557872800905, + "loss": 1.3818, + "step": 70 + }, + { + "epoch": 0.2272, + "grad_norm": 1.0551126917051847, + "learning_rate": 0.00018053325540819045, + "loss": 1.3504, + "step": 71 + }, + { + "epoch": 0.2304, + "grad_norm": 0.8632160203247755, + "learning_rate": 0.0001799122172929206, + "loss": 1.3005, + "step": 72 + }, + { + "epoch": 0.2336, + "grad_norm": 0.9440754764890599, + "learning_rate": 0.00017928253158698473, + "loss": 1.2767, + "step": 73 + }, + { + "epoch": 0.2368, + "grad_norm": 0.8735135525555031, + "learning_rate": 0.0001786442664309554, + "loss": 1.3807, + "step": 74 + }, + { + "epoch": 0.24, + "grad_norm": 0.856740028229761, + "learning_rate": 0.0001779974908938184, + "loss": 1.197, + "step": 75 + }, + { + "epoch": 0.2432, + "grad_norm": 0.9264103960369618, + "learning_rate": 0.0001773422749654988, + "loss": 1.3382, + "step": 76 + }, + { + "epoch": 0.2464, + "grad_norm": 0.9204087723578827, + "learning_rate": 0.00017667868954928694, + "loss": 1.2589, + "step": 77 + }, + { + "epoch": 0.2496, + "grad_norm": 0.8668385341278854, + "learning_rate": 0.00017600680645416583, + "loss": 1.3496, + "step": 78 + }, + { + "epoch": 0.2528, + "grad_norm": 1.0097296204009092, + "learning_rate": 0.00017532669838704035, + "loss": 1.4332, + "step": 79 + }, + { + "epoch": 0.256, + "grad_norm": 1.026107451056098, + "learning_rate": 0.00017463843894486937, + "loss": 1.4299, + "step": 80 + }, + { + "epoch": 0.2592, + "grad_norm": 0.9086670325992544, + "learning_rate": 0.0001739421026067017, + "loss": 1.2304, + "step": 81 + }, + { + "epoch": 0.2624, + "grad_norm": 0.9338245765912986, + "learning_rate": 0.00017323776472561627, + "loss": 1.2421, + "step": 82 + }, + { + "epoch": 0.2656, + "grad_norm": 1.095641194729298, + "learning_rate": 0.00017252550152056795, + "loss": 1.3781, + "step": 83 + }, + { + "epoch": 0.2688, + "grad_norm": 0.9084520911757816, + "learning_rate": 0.0001718053900681397, + "loss": 1.3417, + "step": 84 + }, + { + "epoch": 0.272, + "grad_norm": 0.9649982305360747, + "learning_rate": 0.00017107750829420176, + "loss": 1.4502, + "step": 85 + }, + { + "epoch": 0.2752, + "grad_norm": 0.866091858490719, + "learning_rate": 0.00017034193496547902, + "loss": 1.2619, + "step": 86 + }, + { + "epoch": 0.2784, + "grad_norm": 0.8513403052208766, + "learning_rate": 0.00016959874968102735, + "loss": 1.3397, + "step": 87 + }, + { + "epoch": 0.2816, + "grad_norm": 0.917755765312962, + "learning_rate": 0.00016884803286362, + "loss": 1.259, + "step": 88 + }, + { + "epoch": 0.2848, + "grad_norm": 0.8494853247000094, + "learning_rate": 0.00016808986575104465, + "loss": 1.3267, + "step": 89 + }, + { + "epoch": 0.288, + "grad_norm": 0.8654871261827237, + "learning_rate": 0.00016732433038731242, + "loss": 1.164, + "step": 90 + }, + { + "epoch": 0.2912, + "grad_norm": 0.9552195348413874, + "learning_rate": 0.0001665515096137797, + "loss": 1.3154, + "step": 91 + }, + { + "epoch": 0.2944, + "grad_norm": 1.016127615130936, + "learning_rate": 0.00016577148706018328, + "loss": 1.4602, + "step": 92 + }, + { + "epoch": 0.2976, + "grad_norm": 0.8884802039436105, + "learning_rate": 0.00016498434713559088, + "loss": 1.3322, + "step": 93 + }, + { + "epoch": 0.3008, + "grad_norm": 0.9107081196079571, + "learning_rate": 0.00016419017501926656, + "loss": 1.2707, + "step": 94 + }, + { + "epoch": 0.304, + "grad_norm": 0.9518776066420643, + "learning_rate": 0.0001633890566514535, + "loss": 1.3673, + "step": 95 + }, + { + "epoch": 0.3072, + "grad_norm": 1.0212478759273667, + "learning_rate": 0.00016258107872407375, + "loss": 1.1936, + "step": 96 + }, + { + "epoch": 0.3104, + "grad_norm": 0.9056282973849618, + "learning_rate": 0.0001617663286713474, + "loss": 1.3772, + "step": 97 + }, + { + "epoch": 0.3136, + "grad_norm": 0.8251877121032121, + "learning_rate": 0.00016094489466033043, + "loss": 1.1911, + "step": 98 + }, + { + "epoch": 0.3168, + "grad_norm": 0.9289746754002959, + "learning_rate": 0.00016011686558137448, + "loss": 1.3496, + "step": 99 + }, + { + "epoch": 0.32, + "grad_norm": 0.9128957367242889, + "learning_rate": 0.0001592823310385073, + "loss": 1.2479, + "step": 100 + }, + { + "epoch": 0.3232, + "grad_norm": 0.7965044140608732, + "learning_rate": 0.0001584413813397364, + "loss": 1.1455, + "step": 101 + }, + { + "epoch": 0.3264, + "grad_norm": 1.0545930599310702, + "learning_rate": 0.00015759410748727662, + "loss": 1.3209, + "step": 102 + }, + { + "epoch": 0.3296, + "grad_norm": 0.8852713683280387, + "learning_rate": 0.00015674060116770236, + "loss": 1.2693, + "step": 103 + }, + { + "epoch": 0.3328, + "grad_norm": 0.9174434467161392, + "learning_rate": 0.00015588095474202595, + "loss": 1.2562, + "step": 104 + }, + { + "epoch": 0.336, + "grad_norm": 0.8400842605770039, + "learning_rate": 0.00015501526123570277, + "loss": 1.156, + "step": 105 + }, + { + "epoch": 0.3392, + "grad_norm": 1.0611988942111712, + "learning_rate": 0.00015414361432856475, + "loss": 1.315, + "step": 106 + }, + { + "epoch": 0.3424, + "grad_norm": 0.9147798982224843, + "learning_rate": 0.0001532661083446829, + "loss": 1.3202, + "step": 107 + }, + { + "epoch": 0.3456, + "grad_norm": 1.0010578693236878, + "learning_rate": 0.00015238283824216015, + "loss": 1.3362, + "step": 108 + }, + { + "epoch": 0.3488, + "grad_norm": 0.9000872074490198, + "learning_rate": 0.00015149389960285558, + "loss": 1.3446, + "step": 109 + }, + { + "epoch": 0.352, + "grad_norm": 0.7920512263187361, + "learning_rate": 0.00015059938862204127, + "loss": 1.2624, + "step": 110 + }, + { + "epoch": 0.3552, + "grad_norm": 0.9461381629633617, + "learning_rate": 0.00014969940209799248, + "loss": 1.3432, + "step": 111 + }, + { + "epoch": 0.3584, + "grad_norm": 0.9492837163279607, + "learning_rate": 0.00014879403742151283, + "loss": 1.2902, + "step": 112 + }, + { + "epoch": 0.3616, + "grad_norm": 0.8488571384929292, + "learning_rate": 0.00014788339256539544, + "loss": 1.1759, + "step": 113 + }, + { + "epoch": 0.3648, + "grad_norm": 0.9186243276864146, + "learning_rate": 0.0001469675660738206, + "loss": 1.3183, + "step": 114 + }, + { + "epoch": 0.368, + "grad_norm": 0.9045365204837359, + "learning_rate": 0.00014604665705169237, + "loss": 1.2769, + "step": 115 + }, + { + "epoch": 0.3712, + "grad_norm": 0.8145440663745105, + "learning_rate": 0.00014512076515391375, + "loss": 1.1665, + "step": 116 + }, + { + "epoch": 0.3744, + "grad_norm": 1.0801018363645023, + "learning_rate": 0.00014418999057460276, + "loss": 1.2615, + "step": 117 + }, + { + "epoch": 0.3776, + "grad_norm": 0.8347270943328522, + "learning_rate": 0.0001432544340362501, + "loss": 1.1709, + "step": 118 + }, + { + "epoch": 0.3808, + "grad_norm": 0.8953758008715978, + "learning_rate": 0.00014231419677881966, + "loss": 1.3476, + "step": 119 + }, + { + "epoch": 0.384, + "grad_norm": 0.9339933247904735, + "learning_rate": 0.00014136938054879283, + "loss": 1.2882, + "step": 120 + }, + { + "epoch": 0.3872, + "grad_norm": 0.7712534114542834, + "learning_rate": 0.00014042008758815818, + "loss": 1.1018, + "step": 121 + }, + { + "epoch": 0.3904, + "grad_norm": 0.8307908587514617, + "learning_rate": 0.00013946642062334766, + "loss": 1.3319, + "step": 122 + }, + { + "epoch": 0.3936, + "grad_norm": 0.8672221825940479, + "learning_rate": 0.00013850848285411994, + "loss": 1.2412, + "step": 123 + }, + { + "epoch": 0.3968, + "grad_norm": 0.8485691861687462, + "learning_rate": 0.000137546377942393, + "loss": 1.1078, + "step": 124 + }, + { + "epoch": 0.4, + "grad_norm": 0.819638464973109, + "learning_rate": 0.00013658021000102636, + "loss": 1.2004, + "step": 125 + }, + { + "epoch": 0.4032, + "grad_norm": 0.9482780315861542, + "learning_rate": 0.00013561008358255468, + "loss": 1.4392, + "step": 126 + }, + { + "epoch": 0.4064, + "grad_norm": 0.8086620843754946, + "learning_rate": 0.00013463610366787392, + "loss": 1.1491, + "step": 127 + }, + { + "epoch": 0.4096, + "grad_norm": 0.8707780475245294, + "learning_rate": 0.00013365837565488064, + "loss": 1.2727, + "step": 128 + }, + { + "epoch": 0.4128, + "grad_norm": 0.9425633093531728, + "learning_rate": 0.0001326770053470668, + "loss": 1.2641, + "step": 129 + }, + { + "epoch": 0.416, + "grad_norm": 0.9528069327815744, + "learning_rate": 0.0001316920989420703, + "loss": 1.381, + "step": 130 + }, + { + "epoch": 0.4192, + "grad_norm": 1.032159872857148, + "learning_rate": 0.00013070376302018287, + "loss": 1.2908, + "step": 131 + }, + { + "epoch": 0.4224, + "grad_norm": 0.8591875555532374, + "learning_rate": 0.00012971210453281674, + "loss": 1.2944, + "step": 132 + }, + { + "epoch": 0.4256, + "grad_norm": 0.8669588989282161, + "learning_rate": 0.000128717230790931, + "loss": 1.3257, + "step": 133 + }, + { + "epoch": 0.4288, + "grad_norm": 0.9175575752707962, + "learning_rate": 0.00012771924945341906, + "loss": 1.1591, + "step": 134 + }, + { + "epoch": 0.432, + "grad_norm": 0.916532881566656, + "learning_rate": 0.00012671826851545851, + "loss": 1.3507, + "step": 135 + }, + { + "epoch": 0.4352, + "grad_norm": 0.8344006687821425, + "learning_rate": 0.0001257143962968246, + "loss": 1.238, + "step": 136 + }, + { + "epoch": 0.4384, + "grad_norm": 0.8908164072105582, + "learning_rate": 0.00012470774143016853, + "loss": 1.2133, + "step": 137 + }, + { + "epoch": 0.4416, + "grad_norm": 0.8258874851557738, + "learning_rate": 0.00012369841284926188, + "loss": 1.2407, + "step": 138 + }, + { + "epoch": 0.4448, + "grad_norm": 0.8168770012860906, + "learning_rate": 0.00012268651977720866, + "loss": 1.2627, + "step": 139 + }, + { + "epoch": 0.448, + "grad_norm": 0.9672601465907635, + "learning_rate": 0.00012167217171462566, + "loss": 1.4012, + "step": 140 + }, + { + "epoch": 0.4512, + "grad_norm": 1.0297331391395985, + "learning_rate": 0.0001206554784277931, + "loss": 1.2051, + "step": 141 + }, + { + "epoch": 0.4544, + "grad_norm": 0.8954821340132485, + "learning_rate": 0.00011963654993677645, + "loss": 1.1384, + "step": 142 + }, + { + "epoch": 0.4576, + "grad_norm": 0.9673654445988941, + "learning_rate": 0.00011861549650352069, + "loss": 1.3527, + "step": 143 + }, + { + "epoch": 0.4608, + "grad_norm": 0.9265597815605019, + "learning_rate": 0.00011759242861991855, + "loss": 1.2943, + "step": 144 + }, + { + "epoch": 0.464, + "grad_norm": 0.8175004558718472, + "learning_rate": 0.00011656745699585371, + "loss": 1.2177, + "step": 145 + }, + { + "epoch": 0.4672, + "grad_norm": 0.8333702019225635, + "learning_rate": 0.00011554069254722051, + "loss": 1.2051, + "step": 146 + }, + { + "epoch": 0.4704, + "grad_norm": 0.7569650720830139, + "learning_rate": 0.00011451224638392129, + "loss": 1.1324, + "step": 147 + }, + { + "epoch": 0.4736, + "grad_norm": 0.9357586879033147, + "learning_rate": 0.00011348222979784289, + "loss": 1.2851, + "step": 148 + }, + { + "epoch": 0.4768, + "grad_norm": 0.8527284834939162, + "learning_rate": 0.00011245075425081328, + "loss": 1.2396, + "step": 149 + }, + { + "epoch": 0.48, + "grad_norm": 0.8594577247326713, + "learning_rate": 0.00011141793136253986, + "loss": 1.3016, + "step": 150 + }, + { + "epoch": 0.4832, + "grad_norm": 0.8688050502279938, + "learning_rate": 0.0001103838728985307, + "loss": 1.2143, + "step": 151 + }, + { + "epoch": 0.4864, + "grad_norm": 0.933697227598257, + "learning_rate": 0.000109348690758, + "loss": 1.3447, + "step": 152 + }, + { + "epoch": 0.4896, + "grad_norm": 1.0161891719030802, + "learning_rate": 0.00010831249696175918, + "loss": 1.167, + "step": 153 + }, + { + "epoch": 0.4928, + "grad_norm": 0.926586483980676, + "learning_rate": 0.0001072754036400944, + "loss": 1.2326, + "step": 154 + }, + { + "epoch": 0.496, + "grad_norm": 0.7956334969811154, + "learning_rate": 0.00010623752302063283, + "loss": 1.1309, + "step": 155 + }, + { + "epoch": 0.4992, + "grad_norm": 0.8239994579933885, + "learning_rate": 0.00010519896741619803, + "loss": 1.1648, + "step": 156 + }, + { + "epoch": 0.5024, + "grad_norm": 0.8348427384230527, + "learning_rate": 0.00010415984921265609, + "loss": 1.357, + "step": 157 + }, + { + "epoch": 0.5056, + "grad_norm": 0.8258203560381552, + "learning_rate": 0.00010312028085675391, + "loss": 1.2152, + "step": 158 + }, + { + "epoch": 0.5088, + "grad_norm": 0.8482515431381136, + "learning_rate": 0.00010208037484395114, + "loss": 1.1935, + "step": 159 + }, + { + "epoch": 0.512, + "grad_norm": 0.8417744852317112, + "learning_rate": 0.00010104024370624644, + "loss": 1.3203, + "step": 160 + }, + { + "epoch": 0.5152, + "grad_norm": 0.8611627618982554, + "learning_rate": 0.0001, + "loss": 1.2677, + "step": 161 + }, + { + "epoch": 0.5184, + "grad_norm": 0.8406854312604937, + "learning_rate": 9.895975629375359e-05, + "loss": 1.1903, + "step": 162 + }, + { + "epoch": 0.5216, + "grad_norm": 0.8067394788759179, + "learning_rate": 9.791962515604887e-05, + "loss": 1.2452, + "step": 163 + }, + { + "epoch": 0.5248, + "grad_norm": 0.846955968603144, + "learning_rate": 9.687971914324607e-05, + "loss": 1.3268, + "step": 164 + }, + { + "epoch": 0.528, + "grad_norm": 0.8314645079471286, + "learning_rate": 9.584015078734395e-05, + "loss": 1.2503, + "step": 165 + }, + { + "epoch": 0.5312, + "grad_norm": 0.8366588389815677, + "learning_rate": 9.480103258380198e-05, + "loss": 1.244, + "step": 166 + }, + { + "epoch": 0.5344, + "grad_norm": 0.9291738661860993, + "learning_rate": 9.376247697936719e-05, + "loss": 1.3705, + "step": 167 + }, + { + "epoch": 0.5376, + "grad_norm": 0.9352632281589096, + "learning_rate": 9.272459635990562e-05, + "loss": 1.1825, + "step": 168 + }, + { + "epoch": 0.5408, + "grad_norm": 0.794306846381449, + "learning_rate": 9.168750303824084e-05, + "loss": 1.1809, + "step": 169 + }, + { + "epoch": 0.544, + "grad_norm": 0.891848905066318, + "learning_rate": 9.065130924199998e-05, + "loss": 1.2885, + "step": 170 + }, + { + "epoch": 0.5472, + "grad_norm": 0.9009410327369045, + "learning_rate": 8.961612710146934e-05, + "loss": 1.2426, + "step": 171 + }, + { + "epoch": 0.5504, + "grad_norm": 0.7983808800209792, + "learning_rate": 8.858206863746018e-05, + "loss": 1.1466, + "step": 172 + }, + { + "epoch": 0.5536, + "grad_norm": 0.8158403213845199, + "learning_rate": 8.754924574918675e-05, + "loss": 1.2988, + "step": 173 + }, + { + "epoch": 0.5568, + "grad_norm": 0.9028836450931733, + "learning_rate": 8.651777020215712e-05, + "loss": 1.1745, + "step": 174 + }, + { + "epoch": 0.56, + "grad_norm": 0.9192920894100809, + "learning_rate": 8.548775361607872e-05, + "loss": 1.2563, + "step": 175 + }, + { + "epoch": 0.5632, + "grad_norm": 0.9073659197361555, + "learning_rate": 8.445930745277953e-05, + "loss": 1.2116, + "step": 176 + }, + { + "epoch": 0.5664, + "grad_norm": 0.8601282806731785, + "learning_rate": 8.343254300414628e-05, + "loss": 1.2211, + "step": 177 + }, + { + "epoch": 0.5696, + "grad_norm": 0.7743572852622722, + "learning_rate": 8.240757138008149e-05, + "loss": 1.1189, + "step": 178 + }, + { + "epoch": 0.5728, + "grad_norm": 0.9731152954185913, + "learning_rate": 8.138450349647936e-05, + "loss": 1.2922, + "step": 179 + }, + { + "epoch": 0.576, + "grad_norm": 0.8269711251624151, + "learning_rate": 8.036345006322359e-05, + "loss": 1.2394, + "step": 180 + }, + { + "epoch": 0.5792, + "grad_norm": 0.8636114941268018, + "learning_rate": 7.934452157220694e-05, + "loss": 1.1756, + "step": 181 + }, + { + "epoch": 0.5824, + "grad_norm": 0.8001603451805631, + "learning_rate": 7.832782828537437e-05, + "loss": 1.3446, + "step": 182 + }, + { + "epoch": 0.5856, + "grad_norm": 0.8294495413728634, + "learning_rate": 7.731348022279134e-05, + "loss": 1.1428, + "step": 183 + }, + { + "epoch": 0.5888, + "grad_norm": 0.7970768519309458, + "learning_rate": 7.630158715073813e-05, + "loss": 1.1597, + "step": 184 + }, + { + "epoch": 0.592, + "grad_norm": 0.8343954735798168, + "learning_rate": 7.52922585698315e-05, + "loss": 1.1475, + "step": 185 + }, + { + "epoch": 0.5952, + "grad_norm": 1.1367065956930875, + "learning_rate": 7.428560370317542e-05, + "loss": 1.1926, + "step": 186 + }, + { + "epoch": 0.5984, + "grad_norm": 0.7774104715732815, + "learning_rate": 7.328173148454151e-05, + "loss": 1.1298, + "step": 187 + }, + { + "epoch": 0.6016, + "grad_norm": 0.8255964324237054, + "learning_rate": 7.228075054658096e-05, + "loss": 1.2593, + "step": 188 + }, + { + "epoch": 0.6048, + "grad_norm": 0.7843123105664823, + "learning_rate": 7.1282769209069e-05, + "loss": 1.2835, + "step": 189 + }, + { + "epoch": 0.608, + "grad_norm": 0.7981085568864953, + "learning_rate": 7.028789546718326e-05, + "loss": 1.2071, + "step": 190 + }, + { + "epoch": 0.6112, + "grad_norm": 0.721122994064396, + "learning_rate": 6.929623697981718e-05, + "loss": 1.183, + "step": 191 + }, + { + "epoch": 0.6144, + "grad_norm": 0.8989434699612031, + "learning_rate": 6.830790105792973e-05, + "loss": 1.1542, + "step": 192 + }, + { + "epoch": 0.6176, + "grad_norm": 0.8142294986053386, + "learning_rate": 6.732299465293322e-05, + "loss": 1.1944, + "step": 193 + }, + { + "epoch": 0.6208, + "grad_norm": 0.7911113853918703, + "learning_rate": 6.63416243451194e-05, + "loss": 1.1117, + "step": 194 + }, + { + "epoch": 0.624, + "grad_norm": 0.7173845638795395, + "learning_rate": 6.536389633212609e-05, + "loss": 1.1267, + "step": 195 + }, + { + "epoch": 0.6272, + "grad_norm": 0.7854681270504528, + "learning_rate": 6.43899164174453e-05, + "loss": 1.1885, + "step": 196 + }, + { + "epoch": 0.6304, + "grad_norm": 0.8331468548214547, + "learning_rate": 6.341978999897365e-05, + "loss": 1.2341, + "step": 197 + }, + { + "epoch": 0.6336, + "grad_norm": 0.816416495817863, + "learning_rate": 6.245362205760704e-05, + "loss": 1.0917, + "step": 198 + }, + { + "epoch": 0.6368, + "grad_norm": 0.8666692389516789, + "learning_rate": 6.149151714588009e-05, + "loss": 1.2418, + "step": 199 + }, + { + "epoch": 0.64, + "grad_norm": 0.778466139322238, + "learning_rate": 6.053357937665237e-05, + "loss": 1.1086, + "step": 200 + }, + { + "epoch": 0.6432, + "grad_norm": 0.764141246873744, + "learning_rate": 5.957991241184184e-05, + "loss": 1.0999, + "step": 201 + }, + { + "epoch": 0.6464, + "grad_norm": 0.7766672707410954, + "learning_rate": 5.863061945120719e-05, + "loss": 1.1619, + "step": 202 + }, + { + "epoch": 0.6496, + "grad_norm": 0.8356257804017168, + "learning_rate": 5.768580322118034e-05, + "loss": 1.1816, + "step": 203 + }, + { + "epoch": 0.6528, + "grad_norm": 0.8983453768366347, + "learning_rate": 5.6745565963749925e-05, + "loss": 1.2263, + "step": 204 + }, + { + "epoch": 0.656, + "grad_norm": 0.8247821633222613, + "learning_rate": 5.5810009425397294e-05, + "loss": 1.1575, + "step": 205 + }, + { + "epoch": 0.6592, + "grad_norm": 0.8287735532750757, + "learning_rate": 5.487923484608629e-05, + "loss": 1.1057, + "step": 206 + }, + { + "epoch": 0.6624, + "grad_norm": 0.8095241240636343, + "learning_rate": 5.395334294830765e-05, + "loss": 1.2601, + "step": 207 + }, + { + "epoch": 0.6656, + "grad_norm": 0.8479222640578135, + "learning_rate": 5.3032433926179395e-05, + "loss": 1.3422, + "step": 208 + }, + { + "epoch": 0.6688, + "grad_norm": 0.8010885490765423, + "learning_rate": 5.211660743460458e-05, + "loss": 1.1548, + "step": 209 + }, + { + "epoch": 0.672, + "grad_norm": 0.801934289730396, + "learning_rate": 5.1205962578487155e-05, + "loss": 1.1854, + "step": 210 + }, + { + "epoch": 0.6752, + "grad_norm": 0.8011647459338709, + "learning_rate": 5.030059790200756e-05, + "loss": 1.3251, + "step": 211 + }, + { + "epoch": 0.6784, + "grad_norm": 0.7679823492923111, + "learning_rate": 4.940061137795876e-05, + "loss": 1.2011, + "step": 212 + }, + { + "epoch": 0.6816, + "grad_norm": 1.2209222931014045, + "learning_rate": 4.850610039714444e-05, + "loss": 1.3057, + "step": 213 + }, + { + "epoch": 0.6848, + "grad_norm": 0.8172392109293101, + "learning_rate": 4.761716175783989e-05, + "loss": 1.1466, + "step": 214 + }, + { + "epoch": 0.688, + "grad_norm": 0.8084414078781809, + "learning_rate": 4.673389165531714e-05, + "loss": 1.2086, + "step": 215 + }, + { + "epoch": 0.6912, + "grad_norm": 0.7799909624044505, + "learning_rate": 4.585638567143529e-05, + "loss": 1.0911, + "step": 216 + }, + { + "epoch": 0.6944, + "grad_norm": 0.799724183341175, + "learning_rate": 4.498473876429726e-05, + "loss": 1.1077, + "step": 217 + }, + { + "epoch": 0.6976, + "grad_norm": 0.9004210665974285, + "learning_rate": 4.411904525797408e-05, + "loss": 1.188, + "step": 218 + }, + { + "epoch": 0.7008, + "grad_norm": 0.883984385574042, + "learning_rate": 4.325939883229766e-05, + "loss": 1.1851, + "step": 219 + }, + { + "epoch": 0.704, + "grad_norm": 0.8981544710149507, + "learning_rate": 4.240589251272342e-05, + "loss": 1.1885, + "step": 220 + }, + { + "epoch": 0.7072, + "grad_norm": 0.8067748995831314, + "learning_rate": 4.155861866026364e-05, + "loss": 1.2239, + "step": 221 + }, + { + "epoch": 0.7104, + "grad_norm": 0.7961366302755665, + "learning_rate": 4.071766896149273e-05, + "loss": 1.1601, + "step": 222 + }, + { + "epoch": 0.7136, + "grad_norm": 0.9390958958344794, + "learning_rate": 3.988313441862553e-05, + "loss": 1.188, + "step": 223 + }, + { + "epoch": 0.7168, + "grad_norm": 0.8775126473919165, + "learning_rate": 3.9055105339669595e-05, + "loss": 1.2275, + "step": 224 + }, + { + "epoch": 0.72, + "grad_norm": 0.7760005346742462, + "learning_rate": 3.823367132865265e-05, + "loss": 1.1591, + "step": 225 + }, + { + "epoch": 0.7232, + "grad_norm": 0.7834072172886265, + "learning_rate": 3.741892127592625e-05, + "loss": 1.1553, + "step": 226 + }, + { + "epoch": 0.7264, + "grad_norm": 0.8443666697676242, + "learning_rate": 3.6610943348546526e-05, + "loss": 1.1369, + "step": 227 + }, + { + "epoch": 0.7296, + "grad_norm": 0.8014359622570144, + "learning_rate": 3.580982498073344e-05, + "loss": 1.159, + "step": 228 + }, + { + "epoch": 0.7328, + "grad_norm": 0.7616605489390753, + "learning_rate": 3.501565286440914e-05, + "loss": 1.1311, + "step": 229 + }, + { + "epoch": 0.736, + "grad_norm": 0.8969179735835168, + "learning_rate": 3.422851293981676e-05, + "loss": 1.1737, + "step": 230 + }, + { + "epoch": 0.7392, + "grad_norm": 0.9018871934259612, + "learning_rate": 3.3448490386220355e-05, + "loss": 1.1813, + "step": 231 + }, + { + "epoch": 0.7424, + "grad_norm": 0.8382979101420402, + "learning_rate": 3.2675669612687565e-05, + "loss": 1.1886, + "step": 232 + }, + { + "epoch": 0.7456, + "grad_norm": 0.8018934631010715, + "learning_rate": 3.191013424895536e-05, + "loss": 1.2241, + "step": 233 + }, + { + "epoch": 0.7488, + "grad_norm": 0.7390291843770445, + "learning_rate": 3.115196713638e-05, + "loss": 0.9976, + "step": 234 + }, + { + "epoch": 0.752, + "grad_norm": 0.9266438365579518, + "learning_rate": 3.040125031897264e-05, + "loss": 1.2341, + "step": 235 + }, + { + "epoch": 0.7552, + "grad_norm": 0.861065268104146, + "learning_rate": 2.9658065034520978e-05, + "loss": 1.2627, + "step": 236 + }, + { + "epoch": 0.7584, + "grad_norm": 0.8204617046880526, + "learning_rate": 2.892249170579826e-05, + "loss": 1.178, + "step": 237 + }, + { + "epoch": 0.7616, + "grad_norm": 0.9265792236123463, + "learning_rate": 2.8194609931860316e-05, + "loss": 1.2555, + "step": 238 + }, + { + "epoch": 0.7648, + "grad_norm": 0.8613593275799136, + "learning_rate": 2.7474498479432087e-05, + "loss": 1.2695, + "step": 239 + }, + { + "epoch": 0.768, + "grad_norm": 1.1038814044988352, + "learning_rate": 2.6762235274383772e-05, + "loss": 1.1719, + "step": 240 + }, + { + "epoch": 0.7712, + "grad_norm": 0.905407690012056, + "learning_rate": 2.6057897393298324e-05, + "loss": 1.1587, + "step": 241 + }, + { + "epoch": 0.7744, + "grad_norm": 0.8002036080708511, + "learning_rate": 2.536156105513062e-05, + "loss": 1.0718, + "step": 242 + }, + { + "epoch": 0.7776, + "grad_norm": 0.7551531161165965, + "learning_rate": 2.4673301612959654e-05, + "loss": 1.0928, + "step": 243 + }, + { + "epoch": 0.7808, + "grad_norm": 0.725768166113705, + "learning_rate": 2.399319354583418e-05, + "loss": 1.2068, + "step": 244 + }, + { + "epoch": 0.784, + "grad_norm": 0.8109713482708016, + "learning_rate": 2.3321310450713062e-05, + "loss": 1.1437, + "step": 245 + }, + { + "epoch": 0.7872, + "grad_norm": 0.8117450836482087, + "learning_rate": 2.265772503450122e-05, + "loss": 1.2281, + "step": 246 + }, + { + "epoch": 0.7904, + "grad_norm": 0.8523708555547533, + "learning_rate": 2.2002509106181624e-05, + "loss": 1.1589, + "step": 247 + }, + { + "epoch": 0.7936, + "grad_norm": 0.8080157824825525, + "learning_rate": 2.1355733569044635e-05, + "loss": 1.271, + "step": 248 + }, + { + "epoch": 0.7968, + "grad_norm": 0.9926926564662885, + "learning_rate": 2.0717468413015283e-05, + "loss": 1.2264, + "step": 249 + }, + { + "epoch": 0.8, + "grad_norm": 0.8747788681716573, + "learning_rate": 2.008778270707944e-05, + "loss": 1.1876, + "step": 250 + }, + { + "epoch": 0.8032, + "grad_norm": 0.858557919178996, + "learning_rate": 1.946674459180955e-05, + "loss": 1.1884, + "step": 251 + }, + { + "epoch": 0.8064, + "grad_norm": 1.1027947452090465, + "learning_rate": 1.8854421271990964e-05, + "loss": 1.3269, + "step": 252 + }, + { + "epoch": 0.8096, + "grad_norm": 0.9616315049301722, + "learning_rate": 1.8250879009349398e-05, + "loss": 1.266, + "step": 253 + }, + { + "epoch": 0.8128, + "grad_norm": 0.8627891810031095, + "learning_rate": 1.7656183115380577e-05, + "loss": 1.1313, + "step": 254 + }, + { + "epoch": 0.816, + "grad_norm": 0.8603099014912703, + "learning_rate": 1.707039794428259e-05, + "loss": 1.1797, + "step": 255 + }, + { + "epoch": 0.8192, + "grad_norm": 0.6788146705314012, + "learning_rate": 1.649358688599191e-05, + "loss": 1.0931, + "step": 256 + }, + { + "epoch": 0.8224, + "grad_norm": 0.8343403001633107, + "learning_rate": 1.5925812359323745e-05, + "loss": 1.2136, + "step": 257 + }, + { + "epoch": 0.8256, + "grad_norm": 0.8155842206759587, + "learning_rate": 1.5367135805217458e-05, + "loss": 1.0883, + "step": 258 + }, + { + "epoch": 0.8288, + "grad_norm": 0.7596472178159187, + "learning_rate": 1.4817617680087825e-05, + "loss": 1.1554, + "step": 259 + }, + { + "epoch": 0.832, + "grad_norm": 0.8202439148773495, + "learning_rate": 1.4277317449282834e-05, + "loss": 1.1459, + "step": 260 + }, + { + "epoch": 0.8352, + "grad_norm": 0.8370741064646355, + "learning_rate": 1.3746293580648717e-05, + "loss": 1.2309, + "step": 261 + }, + { + "epoch": 0.8384, + "grad_norm": 0.7976979167753185, + "learning_rate": 1.3224603538202929e-05, + "loss": 1.1261, + "step": 262 + }, + { + "epoch": 0.8416, + "grad_norm": 0.8128289382009063, + "learning_rate": 1.2712303775915802e-05, + "loss": 1.2263, + "step": 263 + }, + { + "epoch": 0.8448, + "grad_norm": 1.0706639890399756, + "learning_rate": 1.220944973160133e-05, + "loss": 1.2743, + "step": 264 + }, + { + "epoch": 0.848, + "grad_norm": 0.8111350315322916, + "learning_rate": 1.1716095820918216e-05, + "loss": 1.091, + "step": 265 + }, + { + "epoch": 0.8512, + "grad_norm": 0.8645351791577299, + "learning_rate": 1.1232295431481222e-05, + "loss": 1.1923, + "step": 266 + }, + { + "epoch": 0.8544, + "grad_norm": 0.8831423664098689, + "learning_rate": 1.0758100917083991e-05, + "loss": 1.3053, + "step": 267 + }, + { + "epoch": 0.8576, + "grad_norm": 0.9087013463848541, + "learning_rate": 1.0293563592033595e-05, + "loss": 1.1431, + "step": 268 + }, + { + "epoch": 0.8608, + "grad_norm": 0.8453476640976382, + "learning_rate": 9.838733725597615e-06, + "loss": 1.1361, + "step": 269 + }, + { + "epoch": 0.864, + "grad_norm": 0.7991537010464119, + "learning_rate": 9.393660536564408e-06, + "loss": 1.2065, + "step": 270 + }, + { + "epoch": 0.8672, + "grad_norm": 0.8294822038333, + "learning_rate": 8.958392187916841e-06, + "loss": 1.1782, + "step": 271 + }, + { + "epoch": 0.8704, + "grad_norm": 0.903460681143856, + "learning_rate": 8.532975781620512e-06, + "loss": 1.2391, + "step": 272 + }, + { + "epoch": 0.8736, + "grad_norm": 0.8322544796923445, + "learning_rate": 8.117457353526625e-06, + "loss": 1.1208, + "step": 273 + }, + { + "epoch": 0.8768, + "grad_norm": 0.8384660345857029, + "learning_rate": 7.711881868390291e-06, + "loss": 1.2535, + "step": 274 + }, + { + "epoch": 0.88, + "grad_norm": 0.7931126532809545, + "learning_rate": 7.3162932150046885e-06, + "loss": 1.1714, + "step": 275 + }, + { + "epoch": 0.8832, + "grad_norm": 0.844869330986358, + "learning_rate": 6.930734201451816e-06, + "loss": 1.1538, + "step": 276 + }, + { + "epoch": 0.8864, + "grad_norm": 0.8709269057729458, + "learning_rate": 6.555246550469907e-06, + "loss": 1.2546, + "step": 277 + }, + { + "epoch": 0.8896, + "grad_norm": 0.7558598839450004, + "learning_rate": 6.189870894938587e-06, + "loss": 1.1193, + "step": 278 + }, + { + "epoch": 0.8928, + "grad_norm": 0.8500033430267443, + "learning_rate": 5.834646773481811e-06, + "loss": 1.152, + "step": 279 + }, + { + "epoch": 0.896, + "grad_norm": 0.7729090524617063, + "learning_rate": 5.489612626189245e-06, + "loss": 1.1098, + "step": 280 + }, + { + "epoch": 0.8992, + "grad_norm": 0.7643772648840976, + "learning_rate": 5.154805790456485e-06, + "loss": 1.0279, + "step": 281 + }, + { + "epoch": 0.9024, + "grad_norm": 0.8276269212663161, + "learning_rate": 4.830262496944693e-06, + "loss": 1.2287, + "step": 282 + }, + { + "epoch": 0.9056, + "grad_norm": 0.7807491891276336, + "learning_rate": 4.516017865659949e-06, + "loss": 1.0979, + "step": 283 + }, + { + "epoch": 0.9088, + "grad_norm": 0.7825104357246059, + "learning_rate": 4.21210590215273e-06, + "loss": 1.1803, + "step": 284 + }, + { + "epoch": 0.912, + "grad_norm": 0.7874570354765664, + "learning_rate": 3.918559493838114e-06, + "loss": 1.212, + "step": 285 + }, + { + "epoch": 0.9152, + "grad_norm": 0.8623185804959949, + "learning_rate": 3.6354104064368566e-06, + "loss": 1.0947, + "step": 286 + }, + { + "epoch": 0.9184, + "grad_norm": 0.8211354353553579, + "learning_rate": 3.3626892805379562e-06, + "loss": 1.2393, + "step": 287 + }, + { + "epoch": 0.9216, + "grad_norm": 0.7681659932492936, + "learning_rate": 3.100425628282899e-06, + "loss": 1.1523, + "step": 288 + }, + { + "epoch": 0.9248, + "grad_norm": 0.8577824703991259, + "learning_rate": 2.848647830172024e-06, + "loss": 1.2206, + "step": 289 + }, + { + "epoch": 0.928, + "grad_norm": 0.80270605462384, + "learning_rate": 2.607383131993424e-06, + "loss": 1.057, + "step": 290 + }, + { + "epoch": 0.9312, + "grad_norm": 0.7348124182641901, + "learning_rate": 2.3766576418745022e-06, + "loss": 0.9738, + "step": 291 + }, + { + "epoch": 0.9344, + "grad_norm": 0.9447233425970715, + "learning_rate": 2.1564963274568027e-06, + "loss": 1.1511, + "step": 292 + }, + { + "epoch": 0.9376, + "grad_norm": 0.8043668353555504, + "learning_rate": 1.9469230131940907e-06, + "loss": 1.1463, + "step": 293 + }, + { + "epoch": 0.9408, + "grad_norm": 0.8339332728481555, + "learning_rate": 1.7479603777742938e-06, + "loss": 1.213, + "step": 294 + }, + { + "epoch": 0.944, + "grad_norm": 0.8401094805427077, + "learning_rate": 1.559629951665298e-06, + "loss": 1.239, + "step": 295 + }, + { + "epoch": 0.9472, + "grad_norm": 0.8883832277628466, + "learning_rate": 1.3819521147851123e-06, + "loss": 1.2046, + "step": 296 + }, + { + "epoch": 0.9504, + "grad_norm": 0.9959065976233928, + "learning_rate": 1.2149460942964098e-06, + "loss": 1.1497, + "step": 297 + }, + { + "epoch": 0.9536, + "grad_norm": 0.7880542961669169, + "learning_rate": 1.05862996252597e-06, + "loss": 1.1489, + "step": 298 + }, + { + "epoch": 0.9568, + "grad_norm": 0.8092589785189201, + "learning_rate": 9.130206350089765e-07, + "loss": 1.1295, + "step": 299 + }, + { + "epoch": 0.96, + "grad_norm": 0.7317415886804474, + "learning_rate": 7.781338686584927e-07, + "loss": 1.1087, + "step": 300 + }, + { + "epoch": 0.9632, + "grad_norm": 0.9057811805320088, + "learning_rate": 6.539842600603918e-07, + "loss": 1.12, + "step": 301 + }, + { + "epoch": 0.9664, + "grad_norm": 0.7744919130570085, + "learning_rate": 5.405852438937764e-07, + "loss": 1.1901, + "step": 302 + }, + { + "epoch": 0.9696, + "grad_norm": 0.8641892028445843, + "learning_rate": 4.3794909147720773e-07, + "loss": 1.1875, + "step": 303 + }, + { + "epoch": 0.9728, + "grad_norm": 0.8278884975053862, + "learning_rate": 3.4608690944071263e-07, + "loss": 1.1478, + "step": 304 + }, + { + "epoch": 0.976, + "grad_norm": 0.8878776299342533, + "learning_rate": 2.6500863852395584e-07, + "loss": 1.2387, + "step": 305 + }, + { + "epoch": 0.9792, + "grad_norm": 0.8175876526171806, + "learning_rate": 1.947230525005006e-07, + "loss": 1.1862, + "step": 306 + }, + { + "epoch": 0.9824, + "grad_norm": 0.7199639827824592, + "learning_rate": 1.3523775722834587e-07, + "loss": 1.1295, + "step": 307 + }, + { + "epoch": 0.9856, + "grad_norm": 0.9136510681471197, + "learning_rate": 8.655918982689581e-08, + "loss": 1.2803, + "step": 308 + }, + { + "epoch": 0.9888, + "grad_norm": 0.767248727443108, + "learning_rate": 4.8692617980350406e-08, + "loss": 1.1746, + "step": 309 + }, + { + "epoch": 0.992, + "grad_norm": 0.8395636925652074, + "learning_rate": 2.164213936770576e-08, + "loss": 1.2465, + "step": 310 + }, + { + "epoch": 0.9952, + "grad_norm": 0.9605079921621236, + "learning_rate": 5.410681219286673e-09, + "loss": 1.3639, + "step": 311 + }, + { + "epoch": 0.9984, + "grad_norm": 0.7456669378605656, + "learning_rate": 0.0, + "loss": 1.1567, + "step": 312 + }, + { + "epoch": 0.9984, + "step": 312, + "total_flos": 102004528611328.0, + "train_loss": 1.2587099098242247, + "train_runtime": 3131.9625, + "train_samples_per_second": 1.596, + "train_steps_per_second": 0.1 + } + ], + "logging_steps": 1.0, + "max_steps": 312, + "num_input_tokens_seen": 0, + "num_train_epochs": 1, + "save_steps": 500, + "stateful_callbacks": { + "TrainerControl": { + "args": { + "should_epoch_stop": false, + "should_evaluate": false, + "should_log": false, + "should_save": false, + "should_training_stop": false + }, + "attributes": {} + } + }, + "total_flos": 102004528611328.0, + "train_batch_size": 8, + "trial_name": null, + "trial_params": null +} diff --git a/single_dataset/long_caption/VideoGameBunny_v1_1-Llama-3-8B-V-long_caption_dataset_10000_epochs_1_lora/README.md b/single_dataset/long_caption/VideoGameBunny_v1_1-Llama-3-8B-V-long_caption_dataset_10000_epochs_1_lora/README.md new file mode 100644 index 0000000000000000000000000000000000000000..4f291d8d6fdb82fcff274f83bcc52b834347797c --- /dev/null +++ b/single_dataset/long_caption/VideoGameBunny_v1_1-Llama-3-8B-V-long_caption_dataset_10000_epochs_1_lora/README.md @@ -0,0 +1,202 @@ +--- +library_name: peft +base_model: ./weights/Bunny-v1_1-Llama-3-8B-V +--- + +# Model Card for Model ID + + + + + +## Model Details + +### Model Description + + + + + +- **Developed by:** [More Information Needed] +- **Funded by [optional]:** [More Information Needed] +- **Shared by [optional]:** [More Information Needed] +- **Model type:** [More Information Needed] +- **Language(s) (NLP):** [More Information Needed] +- **License:** [More Information Needed] +- **Finetuned from model [optional]:** [More Information Needed] + +### Model Sources [optional] + + + +- **Repository:** [More Information Needed] +- **Paper [optional]:** [More Information Needed] +- **Demo [optional]:** [More Information Needed] + +## Uses + + + +### Direct Use + + + +[More Information Needed] + +### Downstream Use [optional] + + + +[More Information Needed] + +### Out-of-Scope Use + + + +[More Information Needed] + +## Bias, Risks, and Limitations + + + +[More Information Needed] + +### Recommendations + + + +Users (both direct and downstream) should be made aware of the risks, biases and limitations of the model. More information needed for further recommendations. + +## How to Get Started with the Model + +Use the code below to get started with the model. + +[More Information Needed] + +## Training Details + +### Training Data + + + +[More Information Needed] + +### Training Procedure + + + +#### Preprocessing [optional] + +[More Information Needed] + + +#### Training Hyperparameters + +- **Training regime:** [More Information Needed] + +#### Speeds, Sizes, Times [optional] + + + +[More Information Needed] + +## Evaluation + + + +### Testing Data, Factors & Metrics + +#### Testing Data + + + +[More Information Needed] + +#### Factors + + + +[More Information Needed] + +#### Metrics + + + +[More Information Needed] + +### Results + +[More Information Needed] + +#### Summary + + + +## Model Examination [optional] + + + +[More Information Needed] + +## Environmental Impact + + + +Carbon emissions can be estimated using the [Machine Learning Impact calculator](https://mlco2.github.io/impact#compute) presented in [Lacoste et al. (2019)](https://arxiv.org/abs/1910.09700). + +- **Hardware Type:** [More Information Needed] +- **Hours used:** [More Information Needed] +- **Cloud Provider:** [More Information Needed] +- **Compute Region:** [More Information Needed] +- **Carbon Emitted:** [More Information Needed] + +## Technical Specifications [optional] + +### Model Architecture and Objective + +[More Information Needed] + +### Compute Infrastructure + +[More Information Needed] + +#### Hardware + +[More Information Needed] + +#### Software + +[More Information Needed] + +## Citation [optional] + + + +**BibTeX:** + +[More Information Needed] + +**APA:** + +[More Information Needed] + +## Glossary [optional] + + + +[More Information Needed] + +## More Information [optional] + +[More Information Needed] + +## Model Card Authors [optional] + +[More Information Needed] + +## Model Card Contact + +[More Information Needed] +### Framework versions + +- PEFT 0.11.1 \ No newline at end of file diff --git a/single_dataset/long_caption/VideoGameBunny_v1_1-Llama-3-8B-V-long_caption_dataset_10000_epochs_1_lora/adapter_config.json b/single_dataset/long_caption/VideoGameBunny_v1_1-Llama-3-8B-V-long_caption_dataset_10000_epochs_1_lora/adapter_config.json new file mode 100644 index 0000000000000000000000000000000000000000..2fd299d79b29b6a152f1741aab425b54aadad65f --- /dev/null +++ b/single_dataset/long_caption/VideoGameBunny_v1_1-Llama-3-8B-V-long_caption_dataset_10000_epochs_1_lora/adapter_config.json @@ -0,0 +1,34 @@ +{ + "alpha_pattern": {}, + "auto_mapping": null, + "base_model_name_or_path": "./weights/Bunny-v1_1-Llama-3-8B-V", + "bias": "none", + "fan_in_fan_out": false, + "inference_mode": true, + "init_lora_weights": true, + "layer_replication": null, + "layers_pattern": null, + "layers_to_transform": null, + "loftq_config": {}, + "lora_alpha": 256, + "lora_dropout": 0.1, + "megatron_config": null, + "megatron_core": "megatron.core", + "modules_to_save": null, + "peft_type": "LORA", + "r": 128, + "rank_pattern": {}, + "revision": null, + "target_modules": [ + "down_proj", + "k_proj", + "up_proj", + "gate_proj", + "o_proj", + "v_proj", + "q_proj" + ], + "task_type": "CAUSAL_LM", + "use_dora": false, + "use_rslora": false +} \ No newline at end of file diff --git a/single_dataset/long_caption/VideoGameBunny_v1_1-Llama-3-8B-V-long_caption_dataset_10000_epochs_1_lora/adapter_model.safetensors b/single_dataset/long_caption/VideoGameBunny_v1_1-Llama-3-8B-V-long_caption_dataset_10000_epochs_1_lora/adapter_model.safetensors new file mode 100644 index 0000000000000000000000000000000000000000..0e3ace7fdce87934f95075283e224ccd6395530c --- /dev/null +++ b/single_dataset/long_caption/VideoGameBunny_v1_1-Llama-3-8B-V-long_caption_dataset_10000_epochs_1_lora/adapter_model.safetensors @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:f35b404e5f5d58ff8b8b5a7a22896100bbdc8a816d6323849bed80b5bcc2dc2c +size 671150064 diff --git a/single_dataset/long_caption/VideoGameBunny_v1_1-Llama-3-8B-V-long_caption_dataset_10000_epochs_1_lora/config.json b/single_dataset/long_caption/VideoGameBunny_v1_1-Llama-3-8B-V-long_caption_dataset_10000_epochs_1_lora/config.json new file mode 100644 index 0000000000000000000000000000000000000000..ee7ed86f4ff6f3381a6bb24981934be89aa376e7 --- /dev/null +++ b/single_dataset/long_caption/VideoGameBunny_v1_1-Llama-3-8B-V-long_caption_dataset_10000_epochs_1_lora/config.json @@ -0,0 +1,45 @@ +{ + "_name_or_path": "./weights/Bunny-v1_1-Llama-3-8B-V", + "architectures": [ + "BunnyLlamaForCausalLM" + ], + "attention_bias": false, + "attention_dropout": 0.0, + "auto_map": { + "AutoConfig": "configuration_bunny_llama.BunnyLlamaConfig", + "AutoModelForCausalLM": "modeling_bunny_llama.BunnyLlamaForCausalLM" + }, + "bos_token_id": 128000, + "continuous_training": false, + "eos_token_id": 128001, + "freeze_mm_mlp_adapter": false, + "hidden_act": "silu", + "hidden_size": 4096, + "image_aspect_ratio": "pad", + "initializer_range": 0.02, + "intermediate_size": 14336, + "max_position_embeddings": 8192, + "mm_hidden_size": 3456, + "mm_projector_lr": null, + "mm_projector_type": "mlp2x_gelu", + "mm_vision_tower": "./weights/siglip-so400m-patch14-384", + "model_type": "bunny-llama", + "num_attention_heads": 32, + "num_hidden_layers": 32, + "num_key_value_heads": 8, + "pretraining_tp": 1, + "rms_norm_eps": 1e-05, + "rope_scaling": null, + "rope_theta": 500000.0, + "tie_word_embeddings": false, + "tokenizer_model_max_length": 2048, + "tokenizer_padding_side": "right", + "torch_dtype": "float16", + "transformers_version": "4.41.2", + "tune_mm_mlp_adapter": false, + "unfreeze_vision_tower": true, + "use_cache": true, + "use_mm_proj": true, + "use_s2": true, + "vocab_size": 128256 +} diff --git a/single_dataset/long_caption/VideoGameBunny_v1_1-Llama-3-8B-V-long_caption_dataset_10000_epochs_1_lora/non_lora_trainables.bin b/single_dataset/long_caption/VideoGameBunny_v1_1-Llama-3-8B-V-long_caption_dataset_10000_epochs_1_lora/non_lora_trainables.bin new file mode 100644 index 0000000000000000000000000000000000000000..11f9e2fc695df84fc5f15eef7c6cf87b419cc570 --- /dev/null +++ b/single_dataset/long_caption/VideoGameBunny_v1_1-Llama-3-8B-V-long_caption_dataset_10000_epochs_1_lora/non_lora_trainables.bin @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:b0f4f8316bacd73d83b96d0e1cfb5b58cad4c4aadef49fe67a33aa091a5e9072 +size 918507402 diff --git a/single_dataset/long_caption/VideoGameBunny_v1_1-Llama-3-8B-V-long_caption_dataset_10000_epochs_1_lora/trainer_state.json b/single_dataset/long_caption/VideoGameBunny_v1_1-Llama-3-8B-V-long_caption_dataset_10000_epochs_1_lora/trainer_state.json new file mode 100644 index 0000000000000000000000000000000000000000..66206d707f1892425ae16f72e179c3368be5953a --- /dev/null +++ b/single_dataset/long_caption/VideoGameBunny_v1_1-Llama-3-8B-V-long_caption_dataset_10000_epochs_1_lora/trainer_state.json @@ -0,0 +1,4417 @@ +{ + "best_metric": null, + "best_model_checkpoint": null, + "epoch": 1.0, + "eval_steps": 500, + "global_step": 625, + "is_hyper_param_search": false, + "is_local_process_zero": true, + "is_world_process_zero": true, + "log_history": [ + { + "epoch": 0.0016, + "grad_norm": 0.7110440684465407, + "learning_rate": 1.0526315789473684e-05, + "loss": 0.9068, + "step": 1 + }, + { + "epoch": 0.0032, + "grad_norm": 0.7402343530118121, + "learning_rate": 2.105263157894737e-05, + "loss": 0.9466, + "step": 2 + }, + { + "epoch": 0.0048, + "grad_norm": 0.700186513981636, + "learning_rate": 3.157894736842105e-05, + "loss": 0.9259, + "step": 3 + }, + { + "epoch": 0.0064, + "grad_norm": 0.5505407762435491, + "learning_rate": 4.210526315789474e-05, + "loss": 0.8359, + "step": 4 + }, + { + "epoch": 0.008, + "grad_norm": 0.4862152844581287, + "learning_rate": 5.2631578947368424e-05, + "loss": 0.8108, + "step": 5 + }, + { + "epoch": 0.0096, + "grad_norm": 0.4309565303252267, + "learning_rate": 6.31578947368421e-05, + "loss": 0.7751, + "step": 6 + }, + { + "epoch": 0.0112, + "grad_norm": 0.6187148335727641, + "learning_rate": 7.368421052631579e-05, + "loss": 0.8066, + "step": 7 + }, + { + "epoch": 0.0128, + "grad_norm": 0.7302821056119174, + "learning_rate": 8.421052631578948e-05, + "loss": 0.821, + "step": 8 + }, + { + "epoch": 0.0144, + "grad_norm": 0.5060929149374445, + "learning_rate": 9.473684210526316e-05, + "loss": 0.7962, + "step": 9 + }, + { + "epoch": 0.016, + "grad_norm": 0.4458004825377806, + "learning_rate": 0.00010526315789473685, + "loss": 0.7821, + "step": 10 + }, + { + "epoch": 0.0176, + "grad_norm": 0.4089589669778015, + "learning_rate": 0.00011578947368421053, + "loss": 0.7828, + "step": 11 + }, + { + "epoch": 0.0192, + "grad_norm": 0.3601723384901577, + "learning_rate": 0.0001263157894736842, + "loss": 0.6884, + "step": 12 + }, + { + "epoch": 0.0208, + "grad_norm": 0.40858125080193214, + "learning_rate": 0.0001368421052631579, + "loss": 0.765, + "step": 13 + }, + { + "epoch": 0.0224, + "grad_norm": 0.34659150917180453, + "learning_rate": 0.00014736842105263158, + "loss": 0.6784, + "step": 14 + }, + { + "epoch": 0.024, + "grad_norm": 0.4154170252774343, + "learning_rate": 0.00015789473684210527, + "loss": 0.7487, + "step": 15 + }, + { + "epoch": 0.0256, + "grad_norm": 0.35305856643040606, + "learning_rate": 0.00016842105263157895, + "loss": 0.6666, + "step": 16 + }, + { + "epoch": 0.0272, + "grad_norm": 0.4051307765789964, + "learning_rate": 0.00017894736842105264, + "loss": 0.7008, + "step": 17 + }, + { + "epoch": 0.0288, + "grad_norm": 0.4704633484539053, + "learning_rate": 0.00018947368421052632, + "loss": 0.7038, + "step": 18 + }, + { + "epoch": 0.0304, + "grad_norm": 0.35216674124673664, + "learning_rate": 0.0002, + "loss": 0.699, + "step": 19 + }, + { + "epoch": 0.032, + "grad_norm": 0.3940341166792085, + "learning_rate": 0.00019999865623437013, + "loss": 0.7319, + "step": 20 + }, + { + "epoch": 0.0336, + "grad_norm": 0.32907304141610794, + "learning_rate": 0.00019999462497359466, + "loss": 0.6622, + "step": 21 + }, + { + "epoch": 0.0352, + "grad_norm": 0.36339116687554524, + "learning_rate": 0.00019998790632601496, + "loss": 0.7147, + "step": 22 + }, + { + "epoch": 0.0368, + "grad_norm": 0.3807439948921444, + "learning_rate": 0.0001999785004721968, + "loss": 0.7533, + "step": 23 + }, + { + "epoch": 0.0384, + "grad_norm": 0.3871403568340088, + "learning_rate": 0.00019996640766492543, + "loss": 0.6999, + "step": 24 + }, + { + "epoch": 0.04, + "grad_norm": 0.3839673604576073, + "learning_rate": 0.00019995162822919883, + "loss": 0.7195, + "step": 25 + }, + { + "epoch": 0.0416, + "grad_norm": 0.3441053998710712, + "learning_rate": 0.00019993416256221895, + "loss": 0.7251, + "step": 26 + }, + { + "epoch": 0.0432, + "grad_norm": 0.35243220597752656, + "learning_rate": 0.00019991401113338104, + "loss": 0.7196, + "step": 27 + }, + { + "epoch": 0.0448, + "grad_norm": 0.3636373360678986, + "learning_rate": 0.00019989117448426108, + "loss": 0.7013, + "step": 28 + }, + { + "epoch": 0.0464, + "grad_norm": 0.38690640735599346, + "learning_rate": 0.00019986565322860115, + "loss": 0.7257, + "step": 29 + }, + { + "epoch": 0.048, + "grad_norm": 0.34432537602176394, + "learning_rate": 0.00019983744805229296, + "loss": 0.6949, + "step": 30 + }, + { + "epoch": 0.0496, + "grad_norm": 0.33886239466871904, + "learning_rate": 0.00019980655971335945, + "loss": 0.7076, + "step": 31 + }, + { + "epoch": 0.0512, + "grad_norm": 0.39604847662347764, + "learning_rate": 0.00019977298904193437, + "loss": 0.709, + "step": 32 + }, + { + "epoch": 0.0528, + "grad_norm": 0.3554397918223366, + "learning_rate": 0.00019973673694024, + "loss": 0.6936, + "step": 33 + }, + { + "epoch": 0.0544, + "grad_norm": 0.3006245693098016, + "learning_rate": 0.00019969780438256293, + "loss": 0.6309, + "step": 34 + }, + { + "epoch": 0.056, + "grad_norm": 0.3460179345437749, + "learning_rate": 0.0001996561924152278, + "loss": 0.7033, + "step": 35 + }, + { + "epoch": 0.0576, + "grad_norm": 0.3580661160694404, + "learning_rate": 0.0001996119021565693, + "loss": 0.6624, + "step": 36 + }, + { + "epoch": 0.0592, + "grad_norm": 0.3675642471853245, + "learning_rate": 0.0001995649347969019, + "loss": 0.7166, + "step": 37 + }, + { + "epoch": 0.0608, + "grad_norm": 0.33195058427509394, + "learning_rate": 0.00019951529159848805, + "loss": 0.6981, + "step": 38 + }, + { + "epoch": 0.0624, + "grad_norm": 0.36505875925567893, + "learning_rate": 0.00019946297389550433, + "loss": 0.6965, + "step": 39 + }, + { + "epoch": 0.064, + "grad_norm": 0.33771080687308747, + "learning_rate": 0.00019940798309400526, + "loss": 0.681, + "step": 40 + }, + { + "epoch": 0.0656, + "grad_norm": 0.34732300147085565, + "learning_rate": 0.0001993503206718859, + "loss": 0.6926, + "step": 41 + }, + { + "epoch": 0.0672, + "grad_norm": 0.3892677134753642, + "learning_rate": 0.00019928998817884182, + "loss": 0.7061, + "step": 42 + }, + { + "epoch": 0.0688, + "grad_norm": 0.41174857195156683, + "learning_rate": 0.00019922698723632767, + "loss": 0.727, + "step": 43 + }, + { + "epoch": 0.0704, + "grad_norm": 0.3261432831176223, + "learning_rate": 0.00019916131953751342, + "loss": 0.6659, + "step": 44 + }, + { + "epoch": 0.072, + "grad_norm": 0.39023105588833745, + "learning_rate": 0.00019909298684723904, + "loss": 0.7317, + "step": 45 + }, + { + "epoch": 0.0736, + "grad_norm": 0.3470903187488154, + "learning_rate": 0.00019902199100196697, + "loss": 0.6618, + "step": 46 + }, + { + "epoch": 0.0752, + "grad_norm": 0.44533985824170724, + "learning_rate": 0.00019894833390973266, + "loss": 0.7189, + "step": 47 + }, + { + "epoch": 0.0768, + "grad_norm": 0.38820559490715234, + "learning_rate": 0.00019887201755009357, + "loss": 0.7184, + "step": 48 + }, + { + "epoch": 0.0784, + "grad_norm": 0.3309722521348095, + "learning_rate": 0.0001987930439740757, + "loss": 0.6738, + "step": 49 + }, + { + "epoch": 0.08, + "grad_norm": 0.3500223436629588, + "learning_rate": 0.00019871141530411853, + "loss": 0.6605, + "step": 50 + }, + { + "epoch": 0.0816, + "grad_norm": 0.3667381135063131, + "learning_rate": 0.0001986271337340182, + "loss": 0.7317, + "step": 51 + }, + { + "epoch": 0.0832, + "grad_norm": 0.3302936992504295, + "learning_rate": 0.00019854020152886814, + "loss": 0.6474, + "step": 52 + }, + { + "epoch": 0.0848, + "grad_norm": 0.3679651518546313, + "learning_rate": 0.0001984506210249986, + "loss": 0.6687, + "step": 53 + }, + { + "epoch": 0.0864, + "grad_norm": 0.33494114144170967, + "learning_rate": 0.00019835839462991361, + "loss": 0.6662, + "step": 54 + }, + { + "epoch": 0.088, + "grad_norm": 0.40296578279584266, + "learning_rate": 0.00019826352482222638, + "loss": 0.7072, + "step": 55 + }, + { + "epoch": 0.0896, + "grad_norm": 0.32730591928259145, + "learning_rate": 0.00019816601415159263, + "loss": 0.6951, + "step": 56 + }, + { + "epoch": 0.0912, + "grad_norm": 0.3062729660279465, + "learning_rate": 0.0001980658652386421, + "loss": 0.6254, + "step": 57 + }, + { + "epoch": 0.0928, + "grad_norm": 0.47647017324525437, + "learning_rate": 0.00019796308077490817, + "loss": 0.7032, + "step": 58 + }, + { + "epoch": 0.0944, + "grad_norm": 0.32627709157890566, + "learning_rate": 0.00019785766352275542, + "loss": 0.6167, + "step": 59 + }, + { + "epoch": 0.096, + "grad_norm": 0.3329798568108714, + "learning_rate": 0.00019774961631530545, + "loss": 0.6676, + "step": 60 + }, + { + "epoch": 0.0976, + "grad_norm": 0.3891963711757699, + "learning_rate": 0.00019763894205636072, + "loss": 0.6965, + "step": 61 + }, + { + "epoch": 0.0992, + "grad_norm": 0.33183293946745057, + "learning_rate": 0.00019752564372032657, + "loss": 0.6392, + "step": 62 + }, + { + "epoch": 0.1008, + "grad_norm": 0.33912037930238675, + "learning_rate": 0.00019740972435213115, + "loss": 0.6356, + "step": 63 + }, + { + "epoch": 0.1024, + "grad_norm": 0.32664447889845427, + "learning_rate": 0.00019729118706714375, + "loss": 0.6339, + "step": 64 + }, + { + "epoch": 0.104, + "grad_norm": 0.3300921726396346, + "learning_rate": 0.00019717003505109095, + "loss": 0.6419, + "step": 65 + }, + { + "epoch": 0.1056, + "grad_norm": 0.32753730385613744, + "learning_rate": 0.00019704627155997108, + "loss": 0.6538, + "step": 66 + }, + { + "epoch": 0.1072, + "grad_norm": 0.3595156771897832, + "learning_rate": 0.00019691989991996663, + "loss": 0.7021, + "step": 67 + }, + { + "epoch": 0.1088, + "grad_norm": 0.34599170888547853, + "learning_rate": 0.0001967909235273549, + "loss": 0.6761, + "step": 68 + }, + { + "epoch": 0.1104, + "grad_norm": 0.3072498490442669, + "learning_rate": 0.00019665934584841682, + "loss": 0.6617, + "step": 69 + }, + { + "epoch": 0.112, + "grad_norm": 0.29256884683023665, + "learning_rate": 0.00019652517041934356, + "loss": 0.6049, + "step": 70 + }, + { + "epoch": 0.1136, + "grad_norm": 0.35571382028549814, + "learning_rate": 0.00019638840084614182, + "loss": 0.6909, + "step": 71 + }, + { + "epoch": 0.1152, + "grad_norm": 0.3617050838266649, + "learning_rate": 0.00019624904080453655, + "loss": 0.7114, + "step": 72 + }, + { + "epoch": 0.1168, + "grad_norm": 0.3401169135033129, + "learning_rate": 0.00019610709403987246, + "loss": 0.6848, + "step": 73 + }, + { + "epoch": 0.1184, + "grad_norm": 0.35361065732811076, + "learning_rate": 0.00019596256436701324, + "loss": 0.6923, + "step": 74 + }, + { + "epoch": 0.12, + "grad_norm": 0.3583639301348245, + "learning_rate": 0.000195815455670239, + "loss": 0.6702, + "step": 75 + }, + { + "epoch": 0.1216, + "grad_norm": 0.3036815741783258, + "learning_rate": 0.00019566577190314197, + "loss": 0.6226, + "step": 76 + }, + { + "epoch": 0.1232, + "grad_norm": 0.34885191390743076, + "learning_rate": 0.0001955135170885202, + "loss": 0.7084, + "step": 77 + }, + { + "epoch": 0.1248, + "grad_norm": 0.3282951690750017, + "learning_rate": 0.00019535869531826937, + "loss": 0.6456, + "step": 78 + }, + { + "epoch": 0.1264, + "grad_norm": 0.3676583612320544, + "learning_rate": 0.00019520131075327298, + "loss": 0.6878, + "step": 79 + }, + { + "epoch": 0.128, + "grad_norm": 0.33870275217077395, + "learning_rate": 0.00019504136762329047, + "loss": 0.6961, + "step": 80 + }, + { + "epoch": 0.1296, + "grad_norm": 0.3239305669676888, + "learning_rate": 0.00019487887022684336, + "loss": 0.655, + "step": 81 + }, + { + "epoch": 0.1312, + "grad_norm": 0.3036751743254618, + "learning_rate": 0.00019471382293110003, + "loss": 0.6076, + "step": 82 + }, + { + "epoch": 0.1328, + "grad_norm": 0.3299563834929254, + "learning_rate": 0.00019454623017175812, + "loss": 0.63, + "step": 83 + }, + { + "epoch": 0.1344, + "grad_norm": 0.345960064508727, + "learning_rate": 0.00019437609645292546, + "loss": 0.6894, + "step": 84 + }, + { + "epoch": 0.136, + "grad_norm": 0.3188538305366772, + "learning_rate": 0.0001942034263469989, + "loss": 0.6605, + "step": 85 + }, + { + "epoch": 0.1376, + "grad_norm": 0.317225148263491, + "learning_rate": 0.00019402822449454153, + "loss": 0.6245, + "step": 86 + }, + { + "epoch": 0.1392, + "grad_norm": 0.3215959016705841, + "learning_rate": 0.00019385049560415794, + "loss": 0.6901, + "step": 87 + }, + { + "epoch": 0.1408, + "grad_norm": 0.39325835809764975, + "learning_rate": 0.00019367024445236754, + "loss": 0.6956, + "step": 88 + }, + { + "epoch": 0.1424, + "grad_norm": 0.3222902742014463, + "learning_rate": 0.00019348747588347637, + "loss": 0.6066, + "step": 89 + }, + { + "epoch": 0.144, + "grad_norm": 0.3348215618090499, + "learning_rate": 0.00019330219480944694, + "loss": 0.6664, + "step": 90 + }, + { + "epoch": 0.1456, + "grad_norm": 0.31538169493425555, + "learning_rate": 0.00019311440620976597, + "loss": 0.6534, + "step": 91 + }, + { + "epoch": 0.1472, + "grad_norm": 0.3129197934323035, + "learning_rate": 0.0001929241151313108, + "loss": 0.6362, + "step": 92 + }, + { + "epoch": 0.1488, + "grad_norm": 0.48046051152079117, + "learning_rate": 0.00019273132668821364, + "loss": 0.7376, + "step": 93 + }, + { + "epoch": 0.1504, + "grad_norm": 0.3750084153380919, + "learning_rate": 0.00019253604606172417, + "loss": 0.6733, + "step": 94 + }, + { + "epoch": 0.152, + "grad_norm": 0.28943781471618396, + "learning_rate": 0.00019233827850007027, + "loss": 0.6143, + "step": 95 + }, + { + "epoch": 0.1536, + "grad_norm": 0.3697511920947507, + "learning_rate": 0.00019213802931831696, + "loss": 0.6227, + "step": 96 + }, + { + "epoch": 0.1552, + "grad_norm": 0.3220730779992305, + "learning_rate": 0.00019193530389822363, + "loss": 0.6422, + "step": 97 + }, + { + "epoch": 0.1568, + "grad_norm": 0.32426781868870586, + "learning_rate": 0.00019173010768809933, + "loss": 0.6473, + "step": 98 + }, + { + "epoch": 0.1584, + "grad_norm": 0.36372082103869735, + "learning_rate": 0.0001915224462026563, + "loss": 0.6677, + "step": 99 + }, + { + "epoch": 0.16, + "grad_norm": 0.32810076407887667, + "learning_rate": 0.00019131232502286188, + "loss": 0.6241, + "step": 100 + }, + { + "epoch": 0.1616, + "grad_norm": 0.32386807063212686, + "learning_rate": 0.0001910997497957885, + "loss": 0.6301, + "step": 101 + }, + { + "epoch": 0.1632, + "grad_norm": 0.3220906865944551, + "learning_rate": 0.00019088472623446183, + "loss": 0.6106, + "step": 102 + }, + { + "epoch": 0.1648, + "grad_norm": 0.40958173832989664, + "learning_rate": 0.00019066726011770726, + "loss": 0.6824, + "step": 103 + }, + { + "epoch": 0.1664, + "grad_norm": 0.3416481259773205, + "learning_rate": 0.0001904473572899947, + "loss": 0.6465, + "step": 104 + }, + { + "epoch": 0.168, + "grad_norm": 0.3501750365308928, + "learning_rate": 0.00019022502366128135, + "loss": 0.6715, + "step": 105 + }, + { + "epoch": 0.1696, + "grad_norm": 0.3933598149181734, + "learning_rate": 0.00019000026520685302, + "loss": 0.7267, + "step": 106 + }, + { + "epoch": 0.1712, + "grad_norm": 0.3477897871913843, + "learning_rate": 0.0001897730879671634, + "loss": 0.6765, + "step": 107 + }, + { + "epoch": 0.1728, + "grad_norm": 0.34931633673439555, + "learning_rate": 0.00018954349804767184, + "loss": 0.6476, + "step": 108 + }, + { + "epoch": 0.1744, + "grad_norm": 0.3615756469744839, + "learning_rate": 0.00018931150161867916, + "loss": 0.651, + "step": 109 + }, + { + "epoch": 0.176, + "grad_norm": 0.3588830035989954, + "learning_rate": 0.00018907710491516199, + "loss": 0.7245, + "step": 110 + }, + { + "epoch": 0.1776, + "grad_norm": 0.32400355120523916, + "learning_rate": 0.0001888403142366049, + "loss": 0.6511, + "step": 111 + }, + { + "epoch": 0.1792, + "grad_norm": 0.3139049955415279, + "learning_rate": 0.00018860113594683148, + "loss": 0.6874, + "step": 112 + }, + { + "epoch": 0.1808, + "grad_norm": 0.32974412650898166, + "learning_rate": 0.00018835957647383303, + "loss": 0.6542, + "step": 113 + }, + { + "epoch": 0.1824, + "grad_norm": 0.32521135211297236, + "learning_rate": 0.00018811564230959588, + "loss": 0.606, + "step": 114 + }, + { + "epoch": 0.184, + "grad_norm": 0.3274458228827635, + "learning_rate": 0.00018786934000992688, + "loss": 0.6197, + "step": 115 + }, + { + "epoch": 0.1856, + "grad_norm": 0.38249267054912756, + "learning_rate": 0.00018762067619427746, + "loss": 0.6922, + "step": 116 + }, + { + "epoch": 0.1872, + "grad_norm": 0.3993895495653672, + "learning_rate": 0.00018736965754556528, + "loss": 0.7049, + "step": 117 + }, + { + "epoch": 0.1888, + "grad_norm": 0.35465045760647435, + "learning_rate": 0.00018711629080999504, + "loss": 0.6708, + "step": 118 + }, + { + "epoch": 0.1904, + "grad_norm": 0.3366336875124197, + "learning_rate": 0.00018686058279687698, + "loss": 0.6431, + "step": 119 + }, + { + "epoch": 0.192, + "grad_norm": 0.3490649974959498, + "learning_rate": 0.00018660254037844388, + "loss": 0.6577, + "step": 120 + }, + { + "epoch": 0.1936, + "grad_norm": 0.3753710847840562, + "learning_rate": 0.00018634217048966637, + "loss": 0.675, + "step": 121 + }, + { + "epoch": 0.1952, + "grad_norm": 0.3236435175590241, + "learning_rate": 0.0001860794801280666, + "loss": 0.6544, + "step": 122 + }, + { + "epoch": 0.1968, + "grad_norm": 0.3344044866776899, + "learning_rate": 0.0001858144763535302, + "loss": 0.6665, + "step": 123 + }, + { + "epoch": 0.1984, + "grad_norm": 0.40250024802517503, + "learning_rate": 0.0001855471662881164, + "loss": 0.6715, + "step": 124 + }, + { + "epoch": 0.2, + "grad_norm": 0.3210471159114599, + "learning_rate": 0.00018527755711586678, + "loss": 0.6152, + "step": 125 + }, + { + "epoch": 0.2016, + "grad_norm": 0.30939342647725143, + "learning_rate": 0.00018500565608261214, + "loss": 0.6417, + "step": 126 + }, + { + "epoch": 0.2032, + "grad_norm": 0.34480787064227664, + "learning_rate": 0.00018473147049577774, + "loss": 0.6839, + "step": 127 + }, + { + "epoch": 0.2048, + "grad_norm": 0.34282744371958, + "learning_rate": 0.00018445500772418697, + "loss": 0.6705, + "step": 128 + }, + { + "epoch": 0.2064, + "grad_norm": 0.31002911403922345, + "learning_rate": 0.00018417627519786315, + "loss": 0.6157, + "step": 129 + }, + { + "epoch": 0.208, + "grad_norm": 0.327253325750313, + "learning_rate": 0.00018389528040783012, + "loss": 0.6641, + "step": 130 + }, + { + "epoch": 0.2096, + "grad_norm": 0.3239163540448087, + "learning_rate": 0.00018361203090591071, + "loss": 0.6544, + "step": 131 + }, + { + "epoch": 0.2112, + "grad_norm": 0.3352972454556464, + "learning_rate": 0.00018332653430452376, + "loss": 0.6324, + "step": 132 + }, + { + "epoch": 0.2128, + "grad_norm": 0.3578753303826149, + "learning_rate": 0.00018303879827647975, + "loss": 0.6436, + "step": 133 + }, + { + "epoch": 0.2144, + "grad_norm": 0.38389784430305535, + "learning_rate": 0.00018274883055477436, + "loss": 0.6577, + "step": 134 + }, + { + "epoch": 0.216, + "grad_norm": 0.33577156813804415, + "learning_rate": 0.00018245663893238075, + "loss": 0.6351, + "step": 135 + }, + { + "epoch": 0.2176, + "grad_norm": 0.31939866459158905, + "learning_rate": 0.00018216223126204007, + "loss": 0.6207, + "step": 136 + }, + { + "epoch": 0.2192, + "grad_norm": 0.3411423086425535, + "learning_rate": 0.00018186561545605054, + "loss": 0.6639, + "step": 137 + }, + { + "epoch": 0.2208, + "grad_norm": 0.34919337262794314, + "learning_rate": 0.00018156679948605467, + "loss": 0.6558, + "step": 138 + }, + { + "epoch": 0.2224, + "grad_norm": 0.3562855429858843, + "learning_rate": 0.00018126579138282503, + "loss": 0.6677, + "step": 139 + }, + { + "epoch": 0.224, + "grad_norm": 0.36959206915419046, + "learning_rate": 0.0001809625992360485, + "loss": 0.6552, + "step": 140 + }, + { + "epoch": 0.2256, + "grad_norm": 0.36018173281575633, + "learning_rate": 0.00018065723119410884, + "loss": 0.6277, + "step": 141 + }, + { + "epoch": 0.2272, + "grad_norm": 0.3312018888377899, + "learning_rate": 0.00018034969546386757, + "loss": 0.6228, + "step": 142 + }, + { + "epoch": 0.2288, + "grad_norm": 0.34710619478939314, + "learning_rate": 0.0001800400003104436, + "loss": 0.6397, + "step": 143 + }, + { + "epoch": 0.2304, + "grad_norm": 0.3616780762901335, + "learning_rate": 0.00017972815405699103, + "loss": 0.6647, + "step": 144 + }, + { + "epoch": 0.232, + "grad_norm": 0.3632272942971575, + "learning_rate": 0.00017941416508447536, + "loss": 0.6823, + "step": 145 + }, + { + "epoch": 0.2336, + "grad_norm": 0.3438490233014574, + "learning_rate": 0.0001790980418314484, + "loss": 0.6479, + "step": 146 + }, + { + "epoch": 0.2352, + "grad_norm": 0.30793762863839114, + "learning_rate": 0.00017877979279382135, + "loss": 0.6031, + "step": 147 + }, + { + "epoch": 0.2368, + "grad_norm": 0.32997593549516097, + "learning_rate": 0.0001784594265246366, + "loss": 0.6577, + "step": 148 + }, + { + "epoch": 0.2384, + "grad_norm": 0.3292567598218092, + "learning_rate": 0.0001781369516338378, + "loss": 0.6773, + "step": 149 + }, + { + "epoch": 0.24, + "grad_norm": 0.3489678086055255, + "learning_rate": 0.00017781237678803847, + "loss": 0.7063, + "step": 150 + }, + { + "epoch": 0.2416, + "grad_norm": 0.324031447810286, + "learning_rate": 0.000177485710710289, + "loss": 0.6527, + "step": 151 + }, + { + "epoch": 0.2432, + "grad_norm": 0.3569167593361834, + "learning_rate": 0.00017715696217984235, + "loss": 0.6576, + "step": 152 + }, + { + "epoch": 0.2448, + "grad_norm": 0.3112375499035634, + "learning_rate": 0.00017682614003191807, + "loss": 0.6022, + "step": 153 + }, + { + "epoch": 0.2464, + "grad_norm": 0.3480171639311505, + "learning_rate": 0.00017649325315746478, + "loss": 0.6312, + "step": 154 + }, + { + "epoch": 0.248, + "grad_norm": 0.33785670131306333, + "learning_rate": 0.0001761583105029213, + "loss": 0.5784, + "step": 155 + }, + { + "epoch": 0.2496, + "grad_norm": 0.32973378317808705, + "learning_rate": 0.00017582132106997616, + "loss": 0.6269, + "step": 156 + }, + { + "epoch": 0.2512, + "grad_norm": 0.321638262841785, + "learning_rate": 0.00017548229391532572, + "loss": 0.5982, + "step": 157 + }, + { + "epoch": 0.2528, + "grad_norm": 0.33916451990137586, + "learning_rate": 0.00017514123815043074, + "loss": 0.653, + "step": 158 + }, + { + "epoch": 0.2544, + "grad_norm": 0.346726007252727, + "learning_rate": 0.00017479816294127152, + "loss": 0.6409, + "step": 159 + }, + { + "epoch": 0.256, + "grad_norm": 0.3499513321280793, + "learning_rate": 0.0001744530775081015, + "loss": 0.6522, + "step": 160 + }, + { + "epoch": 0.2576, + "grad_norm": 0.3166028999401929, + "learning_rate": 0.0001741059911251997, + "loss": 0.6542, + "step": 161 + }, + { + "epoch": 0.2592, + "grad_norm": 0.33194231087455445, + "learning_rate": 0.000173756913120621, + "loss": 0.618, + "step": 162 + }, + { + "epoch": 0.2608, + "grad_norm": 0.3071551680466884, + "learning_rate": 0.00017340585287594604, + "loss": 0.5776, + "step": 163 + }, + { + "epoch": 0.2624, + "grad_norm": 0.3176933297860166, + "learning_rate": 0.0001730528198260285, + "loss": 0.5975, + "step": 164 + }, + { + "epoch": 0.264, + "grad_norm": 0.3236083324367126, + "learning_rate": 0.00017269782345874203, + "loss": 0.6001, + "step": 165 + }, + { + "epoch": 0.2656, + "grad_norm": 0.3242228487517961, + "learning_rate": 0.00017234087331472497, + "loss": 0.5985, + "step": 166 + }, + { + "epoch": 0.2672, + "grad_norm": 0.3433505723805961, + "learning_rate": 0.00017198197898712404, + "loss": 0.6402, + "step": 167 + }, + { + "epoch": 0.2688, + "grad_norm": 0.3142632180901461, + "learning_rate": 0.00017162115012133643, + "loss": 0.619, + "step": 168 + }, + { + "epoch": 0.2704, + "grad_norm": 0.3650080087368739, + "learning_rate": 0.00017125839641475072, + "loss": 0.6645, + "step": 169 + }, + { + "epoch": 0.272, + "grad_norm": 0.33213013539789776, + "learning_rate": 0.00017089372761648616, + "loss": 0.6311, + "step": 170 + }, + { + "epoch": 0.2736, + "grad_norm": 0.32299500479841137, + "learning_rate": 0.00017052715352713075, + "loss": 0.6439, + "step": 171 + }, + { + "epoch": 0.2752, + "grad_norm": 0.3325685962656323, + "learning_rate": 0.00017015868399847768, + "loss": 0.6388, + "step": 172 + }, + { + "epoch": 0.2768, + "grad_norm": 0.29601692313967864, + "learning_rate": 0.00016978832893326074, + "loss": 0.5875, + "step": 173 + }, + { + "epoch": 0.2784, + "grad_norm": 0.31069581062561197, + "learning_rate": 0.00016941609828488807, + "loss": 0.6235, + "step": 174 + }, + { + "epoch": 0.28, + "grad_norm": 0.3335052906646829, + "learning_rate": 0.0001690420020571747, + "loss": 0.6386, + "step": 175 + }, + { + "epoch": 0.2816, + "grad_norm": 0.315143057504453, + "learning_rate": 0.0001686660503040737, + "loss": 0.6057, + "step": 176 + }, + { + "epoch": 0.2832, + "grad_norm": 0.34013732175970035, + "learning_rate": 0.00016828825312940592, + "loss": 0.6102, + "step": 177 + }, + { + "epoch": 0.2848, + "grad_norm": 0.3392534389907391, + "learning_rate": 0.0001679086206865886, + "loss": 0.6176, + "step": 178 + }, + { + "epoch": 0.2864, + "grad_norm": 0.346885901966819, + "learning_rate": 0.00016752716317836229, + "loss": 0.6773, + "step": 179 + }, + { + "epoch": 0.288, + "grad_norm": 0.3110542617842148, + "learning_rate": 0.0001671438908565167, + "loss": 0.6189, + "step": 180 + }, + { + "epoch": 0.2896, + "grad_norm": 0.33175430427217634, + "learning_rate": 0.00016675881402161536, + "loss": 0.6374, + "step": 181 + }, + { + "epoch": 0.2912, + "grad_norm": 0.32724307311049633, + "learning_rate": 0.0001663719430227186, + "loss": 0.6631, + "step": 182 + }, + { + "epoch": 0.2928, + "grad_norm": 0.33837839138182874, + "learning_rate": 0.00016598328825710533, + "loss": 0.632, + "step": 183 + }, + { + "epoch": 0.2944, + "grad_norm": 0.3346809848936033, + "learning_rate": 0.000165592860169994, + "loss": 0.5922, + "step": 184 + }, + { + "epoch": 0.296, + "grad_norm": 0.32538956568562816, + "learning_rate": 0.00016520066925426144, + "loss": 0.6399, + "step": 185 + }, + { + "epoch": 0.2976, + "grad_norm": 0.30362553981973994, + "learning_rate": 0.0001648067260501611, + "loss": 0.6159, + "step": 186 + }, + { + "epoch": 0.2992, + "grad_norm": 0.3379322372857405, + "learning_rate": 0.0001644110411450398, + "loss": 0.6059, + "step": 187 + }, + { + "epoch": 0.3008, + "grad_norm": 0.29726833260616514, + "learning_rate": 0.00016401362517305296, + "loss": 0.6127, + "step": 188 + }, + { + "epoch": 0.3024, + "grad_norm": 0.30324116876677487, + "learning_rate": 0.00016361448881487914, + "loss": 0.625, + "step": 189 + }, + { + "epoch": 0.304, + "grad_norm": 0.3116739250863844, + "learning_rate": 0.00016321364279743266, + "loss": 0.6415, + "step": 190 + }, + { + "epoch": 0.3056, + "grad_norm": 0.31621045811587917, + "learning_rate": 0.0001628110978935756, + "loss": 0.6105, + "step": 191 + }, + { + "epoch": 0.3072, + "grad_norm": 0.35486237890466066, + "learning_rate": 0.00016240686492182804, + "loss": 0.666, + "step": 192 + }, + { + "epoch": 0.3088, + "grad_norm": 0.33064598256285993, + "learning_rate": 0.00016200095474607753, + "loss": 0.6874, + "step": 193 + }, + { + "epoch": 0.3104, + "grad_norm": 0.3180002664113174, + "learning_rate": 0.00016159337827528685, + "loss": 0.6281, + "step": 194 + }, + { + "epoch": 0.312, + "grad_norm": 0.3305608635057252, + "learning_rate": 0.0001611841464632011, + "loss": 0.6225, + "step": 195 + }, + { + "epoch": 0.3136, + "grad_norm": 0.3347652151951757, + "learning_rate": 0.0001607732703080532, + "loss": 0.6242, + "step": 196 + }, + { + "epoch": 0.3152, + "grad_norm": 0.27991874031675124, + "learning_rate": 0.00016036076085226814, + "loss": 0.5405, + "step": 197 + }, + { + "epoch": 0.3168, + "grad_norm": 0.3162354099409061, + "learning_rate": 0.0001599466291821666, + "loss": 0.6009, + "step": 198 + }, + { + "epoch": 0.3184, + "grad_norm": 0.32486415305433153, + "learning_rate": 0.0001595308864276666, + "loss": 0.6002, + "step": 199 + }, + { + "epoch": 0.32, + "grad_norm": 0.3646449871084355, + "learning_rate": 0.0001591135437619847, + "loss": 0.6499, + "step": 200 + }, + { + "epoch": 0.3216, + "grad_norm": 0.31688261087946556, + "learning_rate": 0.0001586946124013354, + "loss": 0.6017, + "step": 201 + }, + { + "epoch": 0.3232, + "grad_norm": 0.33798344241209227, + "learning_rate": 0.0001582741036046301, + "loss": 0.6342, + "step": 202 + }, + { + "epoch": 0.3248, + "grad_norm": 0.2998562294365411, + "learning_rate": 0.00015785202867317407, + "loss": 0.6216, + "step": 203 + }, + { + "epoch": 0.3264, + "grad_norm": 0.33028086875759405, + "learning_rate": 0.00015742839895036305, + "loss": 0.634, + "step": 204 + }, + { + "epoch": 0.328, + "grad_norm": 0.3474490101498229, + "learning_rate": 0.00015700322582137827, + "loss": 0.6173, + "step": 205 + }, + { + "epoch": 0.3296, + "grad_norm": 0.32937156718869864, + "learning_rate": 0.0001565765207128805, + "loss": 0.6257, + "step": 206 + }, + { + "epoch": 0.3312, + "grad_norm": 0.3219749951301766, + "learning_rate": 0.0001561482950927029, + "loss": 0.662, + "step": 207 + }, + { + "epoch": 0.3328, + "grad_norm": 0.3561197924950446, + "learning_rate": 0.00015571856046954285, + "loss": 0.6569, + "step": 208 + }, + { + "epoch": 0.3344, + "grad_norm": 0.3244256712146143, + "learning_rate": 0.00015528732839265272, + "loss": 0.6594, + "step": 209 + }, + { + "epoch": 0.336, + "grad_norm": 0.3381412265786872, + "learning_rate": 0.0001548546104515294, + "loss": 0.6464, + "step": 210 + }, + { + "epoch": 0.3376, + "grad_norm": 0.3238669811135605, + "learning_rate": 0.00015442041827560274, + "loss": 0.6027, + "step": 211 + }, + { + "epoch": 0.3392, + "grad_norm": 0.3280219085314321, + "learning_rate": 0.00015398476353392323, + "loss": 0.6288, + "step": 212 + }, + { + "epoch": 0.3408, + "grad_norm": 0.3685876179481929, + "learning_rate": 0.00015354765793484834, + "loss": 0.6255, + "step": 213 + }, + { + "epoch": 0.3424, + "grad_norm": 0.29270701118818704, + "learning_rate": 0.00015310911322572753, + "loss": 0.5863, + "step": 214 + }, + { + "epoch": 0.344, + "grad_norm": 0.3474896849101967, + "learning_rate": 0.000152669141192587, + "loss": 0.6566, + "step": 215 + }, + { + "epoch": 0.3456, + "grad_norm": 0.32082996987901913, + "learning_rate": 0.00015222775365981273, + "loss": 0.6429, + "step": 216 + }, + { + "epoch": 0.3472, + "grad_norm": 0.30825421233708616, + "learning_rate": 0.00015178496248983254, + "loss": 0.5864, + "step": 217 + }, + { + "epoch": 0.3488, + "grad_norm": 0.36395065272004196, + "learning_rate": 0.00015134077958279765, + "loss": 0.6453, + "step": 218 + }, + { + "epoch": 0.3504, + "grad_norm": 0.34726778931857044, + "learning_rate": 0.00015089521687626243, + "loss": 0.6858, + "step": 219 + }, + { + "epoch": 0.352, + "grad_norm": 0.33674366113332843, + "learning_rate": 0.000150448286344864, + "loss": 0.6295, + "step": 220 + }, + { + "epoch": 0.3536, + "grad_norm": 0.3674013055955959, + "learning_rate": 0.00015000000000000001, + "loss": 0.6069, + "step": 221 + }, + { + "epoch": 0.3552, + "grad_norm": 0.31996724678426863, + "learning_rate": 0.00014955036988950618, + "loss": 0.5908, + "step": 222 + }, + { + "epoch": 0.3568, + "grad_norm": 0.31666566972160265, + "learning_rate": 0.00014909940809733222, + "loss": 0.623, + "step": 223 + }, + { + "epoch": 0.3584, + "grad_norm": 0.3677352283800213, + "learning_rate": 0.00014864712674321734, + "loss": 0.6453, + "step": 224 + }, + { + "epoch": 0.36, + "grad_norm": 0.3574559425690372, + "learning_rate": 0.00014819353798236427, + "loss": 0.6501, + "step": 225 + }, + { + "epoch": 0.3616, + "grad_norm": 0.4988636076953785, + "learning_rate": 0.00014773865400511272, + "loss": 0.6514, + "step": 226 + }, + { + "epoch": 0.3632, + "grad_norm": 0.31060166601964173, + "learning_rate": 0.00014728248703661182, + "loss": 0.6134, + "step": 227 + }, + { + "epoch": 0.3648, + "grad_norm": 0.3472702683654676, + "learning_rate": 0.00014682504933649144, + "loss": 0.5882, + "step": 228 + }, + { + "epoch": 0.3664, + "grad_norm": 0.3463745142979321, + "learning_rate": 0.00014636635319853275, + "loss": 0.6492, + "step": 229 + }, + { + "epoch": 0.368, + "grad_norm": 0.3479775333527672, + "learning_rate": 0.00014590641095033787, + "loss": 0.6677, + "step": 230 + }, + { + "epoch": 0.3696, + "grad_norm": 0.33334390558230875, + "learning_rate": 0.00014544523495299842, + "loss": 0.6231, + "step": 231 + }, + { + "epoch": 0.3712, + "grad_norm": 0.30961419862981415, + "learning_rate": 0.0001449828376007636, + "loss": 0.5667, + "step": 232 + }, + { + "epoch": 0.3728, + "grad_norm": 0.3094987062712464, + "learning_rate": 0.0001445192313207067, + "loss": 0.5594, + "step": 233 + }, + { + "epoch": 0.3744, + "grad_norm": 0.3159953848404255, + "learning_rate": 0.0001440544285723915, + "loss": 0.5843, + "step": 234 + }, + { + "epoch": 0.376, + "grad_norm": 0.3665073287523895, + "learning_rate": 0.00014358844184753712, + "loss": 0.6432, + "step": 235 + }, + { + "epoch": 0.3776, + "grad_norm": 0.3463574230115457, + "learning_rate": 0.00014312128366968243, + "loss": 0.6306, + "step": 236 + }, + { + "epoch": 0.3792, + "grad_norm": 0.31587888241197065, + "learning_rate": 0.00014265296659384956, + "loss": 0.5616, + "step": 237 + }, + { + "epoch": 0.3808, + "grad_norm": 0.35804068720403287, + "learning_rate": 0.00014218350320620624, + "loss": 0.6201, + "step": 238 + }, + { + "epoch": 0.3824, + "grad_norm": 0.30834708658834076, + "learning_rate": 0.0001417129061237278, + "loss": 0.6296, + "step": 239 + }, + { + "epoch": 0.384, + "grad_norm": 0.32627377791243184, + "learning_rate": 0.00014124118799385796, + "loss": 0.6205, + "step": 240 + }, + { + "epoch": 0.3856, + "grad_norm": 0.31645951528801003, + "learning_rate": 0.00014076836149416887, + "loss": 0.6248, + "step": 241 + }, + { + "epoch": 0.3872, + "grad_norm": 0.3104451415173383, + "learning_rate": 0.0001402944393320206, + "loss": 0.6095, + "step": 242 + }, + { + "epoch": 0.3888, + "grad_norm": 0.2890888494258903, + "learning_rate": 0.00013981943424421932, + "loss": 0.5883, + "step": 243 + }, + { + "epoch": 0.3904, + "grad_norm": 0.29338240459411014, + "learning_rate": 0.00013934335899667527, + "loss": 0.6043, + "step": 244 + }, + { + "epoch": 0.392, + "grad_norm": 0.32262457740542627, + "learning_rate": 0.00013886622638405952, + "loss": 0.6419, + "step": 245 + }, + { + "epoch": 0.3936, + "grad_norm": 0.33009135754906616, + "learning_rate": 0.00013838804922946027, + "loss": 0.6093, + "step": 246 + }, + { + "epoch": 0.3952, + "grad_norm": 0.3158757926032022, + "learning_rate": 0.00013790884038403795, + "loss": 0.605, + "step": 247 + }, + { + "epoch": 0.3968, + "grad_norm": 0.31245531255722064, + "learning_rate": 0.00013742861272668012, + "loss": 0.5791, + "step": 248 + }, + { + "epoch": 0.3984, + "grad_norm": 0.34480793865812576, + "learning_rate": 0.00013694737916365517, + "loss": 0.64, + "step": 249 + }, + { + "epoch": 0.4, + "grad_norm": 0.34835203084995453, + "learning_rate": 0.00013646515262826552, + "loss": 0.6129, + "step": 250 + }, + { + "epoch": 0.4016, + "grad_norm": 0.3433649816275233, + "learning_rate": 0.0001359819460805001, + "loss": 0.6378, + "step": 251 + }, + { + "epoch": 0.4032, + "grad_norm": 0.34142040864867906, + "learning_rate": 0.0001354977725066859, + "loss": 0.6127, + "step": 252 + }, + { + "epoch": 0.4048, + "grad_norm": 0.33077044742523903, + "learning_rate": 0.00013501264491913906, + "loss": 0.5955, + "step": 253 + }, + { + "epoch": 0.4064, + "grad_norm": 0.3319211688043142, + "learning_rate": 0.0001345265763558152, + "loss": 0.6164, + "step": 254 + }, + { + "epoch": 0.408, + "grad_norm": 0.3215426812914164, + "learning_rate": 0.00013403957987995882, + "loss": 0.67, + "step": 255 + }, + { + "epoch": 0.4096, + "grad_norm": 0.3272276807128324, + "learning_rate": 0.0001335516685797525, + "loss": 0.6543, + "step": 256 + }, + { + "epoch": 0.4112, + "grad_norm": 0.3388778535567206, + "learning_rate": 0.00013306285556796495, + "loss": 0.6384, + "step": 257 + }, + { + "epoch": 0.4128, + "grad_norm": 0.3268286368522702, + "learning_rate": 0.00013257315398159864, + "loss": 0.6395, + "step": 258 + }, + { + "epoch": 0.4144, + "grad_norm": 0.3279167955970497, + "learning_rate": 0.00013208257698153677, + "loss": 0.5976, + "step": 259 + }, + { + "epoch": 0.416, + "grad_norm": 0.34207456240072015, + "learning_rate": 0.00013159113775218964, + "loss": 0.6101, + "step": 260 + }, + { + "epoch": 0.4176, + "grad_norm": 0.31432358611256983, + "learning_rate": 0.00013109884950114007, + "loss": 0.5871, + "step": 261 + }, + { + "epoch": 0.4192, + "grad_norm": 0.3097995350253823, + "learning_rate": 0.00013060572545878875, + "loss": 0.5971, + "step": 262 + }, + { + "epoch": 0.4208, + "grad_norm": 0.31957824226435905, + "learning_rate": 0.00013011177887799845, + "loss": 0.6141, + "step": 263 + }, + { + "epoch": 0.4224, + "grad_norm": 0.33556461316683844, + "learning_rate": 0.00012961702303373795, + "loss": 0.6184, + "step": 264 + }, + { + "epoch": 0.424, + "grad_norm": 0.3257640201823845, + "learning_rate": 0.00012912147122272523, + "loss": 0.6043, + "step": 265 + }, + { + "epoch": 0.4256, + "grad_norm": 0.32743782814789024, + "learning_rate": 0.00012862513676307008, + "loss": 0.595, + "step": 266 + }, + { + "epoch": 0.4272, + "grad_norm": 0.31806595157011563, + "learning_rate": 0.00012812803299391628, + "loss": 0.6115, + "step": 267 + }, + { + "epoch": 0.4288, + "grad_norm": 0.30977554761088855, + "learning_rate": 0.00012763017327508305, + "loss": 0.6193, + "step": 268 + }, + { + "epoch": 0.4304, + "grad_norm": 0.3342560970887094, + "learning_rate": 0.0001271315709867059, + "loss": 0.6232, + "step": 269 + }, + { + "epoch": 0.432, + "grad_norm": 0.33069944231914344, + "learning_rate": 0.00012663223952887723, + "loss": 0.6129, + "step": 270 + }, + { + "epoch": 0.4336, + "grad_norm": 0.31480071435918866, + "learning_rate": 0.00012613219232128608, + "loss": 0.5862, + "step": 271 + }, + { + "epoch": 0.4352, + "grad_norm": 0.32243920235963874, + "learning_rate": 0.00012563144280285741, + "loss": 0.6247, + "step": 272 + }, + { + "epoch": 0.4368, + "grad_norm": 0.33812522280669727, + "learning_rate": 0.00012513000443139112, + "loss": 0.6527, + "step": 273 + }, + { + "epoch": 0.4384, + "grad_norm": 0.33089269374465646, + "learning_rate": 0.00012462789068320017, + "loss": 0.6002, + "step": 274 + }, + { + "epoch": 0.44, + "grad_norm": 0.3646236367929071, + "learning_rate": 0.00012412511505274844, + "loss": 0.6411, + "step": 275 + }, + { + "epoch": 0.4416, + "grad_norm": 0.2799400640797182, + "learning_rate": 0.00012362169105228826, + "loss": 0.5564, + "step": 276 + }, + { + "epoch": 0.4432, + "grad_norm": 0.32703269570958576, + "learning_rate": 0.000123117632211497, + "loss": 0.6225, + "step": 277 + }, + { + "epoch": 0.4448, + "grad_norm": 0.3629491452820718, + "learning_rate": 0.00012261295207711346, + "loss": 0.6047, + "step": 278 + }, + { + "epoch": 0.4464, + "grad_norm": 0.3451267184380675, + "learning_rate": 0.0001221076642125742, + "loss": 0.6548, + "step": 279 + }, + { + "epoch": 0.448, + "grad_norm": 0.31381151848276595, + "learning_rate": 0.00012160178219764837, + "loss": 0.5983, + "step": 280 + }, + { + "epoch": 0.4496, + "grad_norm": 0.34476408803594877, + "learning_rate": 0.00012109531962807332, + "loss": 0.6573, + "step": 281 + }, + { + "epoch": 0.4512, + "grad_norm": 0.32193052407965933, + "learning_rate": 0.00012058829011518896, + "loss": 0.5542, + "step": 282 + }, + { + "epoch": 0.4528, + "grad_norm": 0.3226898058737763, + "learning_rate": 0.00012008070728557186, + "loss": 0.6068, + "step": 283 + }, + { + "epoch": 0.4544, + "grad_norm": 0.31976219084762086, + "learning_rate": 0.00011957258478066931, + "loss": 0.6125, + "step": 284 + }, + { + "epoch": 0.456, + "grad_norm": 0.3252185719704812, + "learning_rate": 0.00011906393625643244, + "loss": 0.6061, + "step": 285 + }, + { + "epoch": 0.4576, + "grad_norm": 0.3326884379691767, + "learning_rate": 0.00011855477538294935, + "loss": 0.6147, + "step": 286 + }, + { + "epoch": 0.4592, + "grad_norm": 0.33492432797209265, + "learning_rate": 0.00011804511584407763, + "loss": 0.6123, + "step": 287 + }, + { + "epoch": 0.4608, + "grad_norm": 0.3164586148445968, + "learning_rate": 0.00011753497133707679, + "loss": 0.6251, + "step": 288 + }, + { + "epoch": 0.4624, + "grad_norm": 0.3090687482048777, + "learning_rate": 0.00011702435557223987, + "loss": 0.6153, + "step": 289 + }, + { + "epoch": 0.464, + "grad_norm": 0.3128302069224832, + "learning_rate": 0.00011651328227252517, + "loss": 0.6185, + "step": 290 + }, + { + "epoch": 0.4656, + "grad_norm": 0.29351845518854947, + "learning_rate": 0.00011600176517318741, + "loss": 0.6018, + "step": 291 + }, + { + "epoch": 0.4672, + "grad_norm": 0.3410055854158688, + "learning_rate": 0.00011548981802140848, + "loss": 0.5789, + "step": 292 + }, + { + "epoch": 0.4688, + "grad_norm": 0.3141087954157077, + "learning_rate": 0.00011497745457592816, + "loss": 0.6005, + "step": 293 + }, + { + "epoch": 0.4704, + "grad_norm": 0.3390354929842119, + "learning_rate": 0.00011446468860667421, + "loss": 0.6353, + "step": 294 + }, + { + "epoch": 0.472, + "grad_norm": 0.296516570958744, + "learning_rate": 0.00011395153389439233, + "loss": 0.5377, + "step": 295 + }, + { + "epoch": 0.4736, + "grad_norm": 0.33066364468327153, + "learning_rate": 0.00011343800423027582, + "loss": 0.5861, + "step": 296 + }, + { + "epoch": 0.4752, + "grad_norm": 0.33208494496920954, + "learning_rate": 0.0001129241134155949, + "loss": 0.6163, + "step": 297 + }, + { + "epoch": 0.4768, + "grad_norm": 0.32052779611504956, + "learning_rate": 0.00011240987526132594, + "loss": 0.6017, + "step": 298 + }, + { + "epoch": 0.4784, + "grad_norm": 0.38249530580741814, + "learning_rate": 0.00011189530358778005, + "loss": 0.6612, + "step": 299 + }, + { + "epoch": 0.48, + "grad_norm": 0.3066619083863857, + "learning_rate": 0.00011138041222423177, + "loss": 0.5741, + "step": 300 + }, + { + "epoch": 0.4816, + "grad_norm": 0.3051923913503972, + "learning_rate": 0.00011086521500854745, + "loss": 0.6014, + "step": 301 + }, + { + "epoch": 0.4832, + "grad_norm": 0.30702785701491847, + "learning_rate": 0.00011034972578681338, + "loss": 0.5953, + "step": 302 + }, + { + "epoch": 0.4848, + "grad_norm": 0.2979662489245293, + "learning_rate": 0.00010983395841296348, + "loss": 0.5848, + "step": 303 + }, + { + "epoch": 0.4864, + "grad_norm": 0.3161998126919067, + "learning_rate": 0.00010931792674840718, + "loss": 0.5741, + "step": 304 + }, + { + "epoch": 0.488, + "grad_norm": 0.3253871022364271, + "learning_rate": 0.00010880164466165674, + "loss": 0.5985, + "step": 305 + }, + { + "epoch": 0.4896, + "grad_norm": 0.34784480437776627, + "learning_rate": 0.00010828512602795462, + "loss": 0.604, + "step": 306 + }, + { + "epoch": 0.4912, + "grad_norm": 0.3183414818409375, + "learning_rate": 0.00010776838472890065, + "loss": 0.5785, + "step": 307 + }, + { + "epoch": 0.4928, + "grad_norm": 0.3597552732010806, + "learning_rate": 0.00010725143465207867, + "loss": 0.6179, + "step": 308 + }, + { + "epoch": 0.4944, + "grad_norm": 0.32689548044171374, + "learning_rate": 0.00010673428969068364, + "loss": 0.5677, + "step": 309 + }, + { + "epoch": 0.496, + "grad_norm": 0.3537374631617811, + "learning_rate": 0.00010621696374314807, + "loss": 0.6252, + "step": 310 + }, + { + "epoch": 0.4976, + "grad_norm": 0.3069063861327447, + "learning_rate": 0.00010569947071276847, + "loss": 0.5801, + "step": 311 + }, + { + "epoch": 0.4992, + "grad_norm": 0.3655647459141965, + "learning_rate": 0.00010518182450733186, + "loss": 0.6507, + "step": 312 + }, + { + "epoch": 0.5008, + "grad_norm": 0.28855866544087244, + "learning_rate": 0.00010466403903874176, + "loss": 0.5789, + "step": 313 + }, + { + "epoch": 0.5024, + "grad_norm": 0.3310666913511004, + "learning_rate": 0.00010414612822264455, + "loss": 0.5927, + "step": 314 + }, + { + "epoch": 0.504, + "grad_norm": 0.3128799657254568, + "learning_rate": 0.00010362810597805526, + "loss": 0.5774, + "step": 315 + }, + { + "epoch": 0.5056, + "grad_norm": 0.45241050341416117, + "learning_rate": 0.0001031099862269837, + "loss": 0.6626, + "step": 316 + }, + { + "epoch": 0.5072, + "grad_norm": 0.3121988566030735, + "learning_rate": 0.00010259178289406011, + "loss": 0.6474, + "step": 317 + }, + { + "epoch": 0.5088, + "grad_norm": 0.2987472152843115, + "learning_rate": 0.00010207350990616107, + "loss": 0.587, + "step": 318 + }, + { + "epoch": 0.5104, + "grad_norm": 0.3330186838552484, + "learning_rate": 0.0001015551811920351, + "loss": 0.5733, + "step": 319 + }, + { + "epoch": 0.512, + "grad_norm": 0.38804357512944315, + "learning_rate": 0.00010103681068192845, + "loss": 0.5818, + "step": 320 + }, + { + "epoch": 0.5136, + "grad_norm": 0.29517399138629286, + "learning_rate": 0.00010051841230721065, + "loss": 0.5785, + "step": 321 + }, + { + "epoch": 0.5152, + "grad_norm": 0.30226149985906536, + "learning_rate": 0.0001, + "loss": 0.6248, + "step": 322 + }, + { + "epoch": 0.5168, + "grad_norm": 0.28609710190348236, + "learning_rate": 9.948158769278939e-05, + "loss": 0.5425, + "step": 323 + }, + { + "epoch": 0.5184, + "grad_norm": 0.29415325411634424, + "learning_rate": 9.896318931807155e-05, + "loss": 0.5756, + "step": 324 + }, + { + "epoch": 0.52, + "grad_norm": 0.33285141117958783, + "learning_rate": 9.844481880796491e-05, + "loss": 0.6037, + "step": 325 + }, + { + "epoch": 0.5216, + "grad_norm": 0.30157945611754017, + "learning_rate": 9.792649009383899e-05, + "loss": 0.5466, + "step": 326 + }, + { + "epoch": 0.5232, + "grad_norm": 0.34447034812304733, + "learning_rate": 9.740821710593989e-05, + "loss": 0.6158, + "step": 327 + }, + { + "epoch": 0.5248, + "grad_norm": 0.3294249930520986, + "learning_rate": 9.689001377301633e-05, + "loss": 0.6006, + "step": 328 + }, + { + "epoch": 0.5264, + "grad_norm": 0.33164246733032615, + "learning_rate": 9.637189402194476e-05, + "loss": 0.5825, + "step": 329 + }, + { + "epoch": 0.528, + "grad_norm": 0.3107641846660284, + "learning_rate": 9.585387177735547e-05, + "loss": 0.5848, + "step": 330 + }, + { + "epoch": 0.5296, + "grad_norm": 0.3145409558213135, + "learning_rate": 9.533596096125825e-05, + "loss": 0.6177, + "step": 331 + }, + { + "epoch": 0.5312, + "grad_norm": 0.31753153253665634, + "learning_rate": 9.481817549266817e-05, + "loss": 0.6091, + "step": 332 + }, + { + "epoch": 0.5328, + "grad_norm": 0.3157578704685954, + "learning_rate": 9.430052928723153e-05, + "loss": 0.6084, + "step": 333 + }, + { + "epoch": 0.5344, + "grad_norm": 0.29476395762907864, + "learning_rate": 9.378303625685195e-05, + "loss": 0.5841, + "step": 334 + }, + { + "epoch": 0.536, + "grad_norm": 0.3209582290404505, + "learning_rate": 9.326571030931637e-05, + "loss": 0.6408, + "step": 335 + }, + { + "epoch": 0.5376, + "grad_norm": 0.40444408574813595, + "learning_rate": 9.274856534792138e-05, + "loss": 0.5806, + "step": 336 + }, + { + "epoch": 0.5392, + "grad_norm": 0.3319189186745888, + "learning_rate": 9.223161527109937e-05, + "loss": 0.5628, + "step": 337 + }, + { + "epoch": 0.5408, + "grad_norm": 0.37186268404405437, + "learning_rate": 9.171487397204539e-05, + "loss": 0.6173, + "step": 338 + }, + { + "epoch": 0.5424, + "grad_norm": 0.32202567918702646, + "learning_rate": 9.119835533834331e-05, + "loss": 0.6355, + "step": 339 + }, + { + "epoch": 0.544, + "grad_norm": 0.3566642646810941, + "learning_rate": 9.068207325159284e-05, + "loss": 0.6211, + "step": 340 + }, + { + "epoch": 0.5456, + "grad_norm": 0.33734408956869044, + "learning_rate": 9.016604158703654e-05, + "loss": 0.603, + "step": 341 + }, + { + "epoch": 0.5472, + "grad_norm": 0.33271896050771005, + "learning_rate": 8.965027421318665e-05, + "loss": 0.5862, + "step": 342 + }, + { + "epoch": 0.5488, + "grad_norm": 0.3140618334759957, + "learning_rate": 8.913478499145254e-05, + "loss": 0.5936, + "step": 343 + }, + { + "epoch": 0.5504, + "grad_norm": 0.30530598221384914, + "learning_rate": 8.861958777576827e-05, + "loss": 0.5852, + "step": 344 + }, + { + "epoch": 0.552, + "grad_norm": 0.3720165481328377, + "learning_rate": 8.810469641222001e-05, + "loss": 0.621, + "step": 345 + }, + { + "epoch": 0.5536, + "grad_norm": 0.4127542577896669, + "learning_rate": 8.759012473867407e-05, + "loss": 0.6037, + "step": 346 + }, + { + "epoch": 0.5552, + "grad_norm": 0.29375796385920805, + "learning_rate": 8.707588658440511e-05, + "loss": 0.603, + "step": 347 + }, + { + "epoch": 0.5568, + "grad_norm": 0.3154118064740421, + "learning_rate": 8.656199576972423e-05, + "loss": 0.5921, + "step": 348 + }, + { + "epoch": 0.5584, + "grad_norm": 0.3372373705593303, + "learning_rate": 8.604846610560771e-05, + "loss": 0.6117, + "step": 349 + }, + { + "epoch": 0.56, + "grad_norm": 0.3133153360665321, + "learning_rate": 8.553531139332582e-05, + "loss": 0.5629, + "step": 350 + }, + { + "epoch": 0.5616, + "grad_norm": 0.3123915646510453, + "learning_rate": 8.502254542407186e-05, + "loss": 0.6136, + "step": 351 + }, + { + "epoch": 0.5632, + "grad_norm": 0.3091600893466998, + "learning_rate": 8.451018197859153e-05, + "loss": 0.5723, + "step": 352 + }, + { + "epoch": 0.5648, + "grad_norm": 0.34631992093275005, + "learning_rate": 8.399823482681262e-05, + "loss": 0.5251, + "step": 353 + }, + { + "epoch": 0.5664, + "grad_norm": 0.2980341545388974, + "learning_rate": 8.348671772747487e-05, + "loss": 0.5809, + "step": 354 + }, + { + "epoch": 0.568, + "grad_norm": 0.34817708486534094, + "learning_rate": 8.297564442776014e-05, + "loss": 0.568, + "step": 355 + }, + { + "epoch": 0.5696, + "grad_norm": 0.35450939715310636, + "learning_rate": 8.246502866292324e-05, + "loss": 0.5828, + "step": 356 + }, + { + "epoch": 0.5712, + "grad_norm": 0.3203799738608221, + "learning_rate": 8.195488415592238e-05, + "loss": 0.5698, + "step": 357 + }, + { + "epoch": 0.5728, + "grad_norm": 0.33422873597279046, + "learning_rate": 8.144522461705067e-05, + "loss": 0.5796, + "step": 358 + }, + { + "epoch": 0.5744, + "grad_norm": 0.3338188435622777, + "learning_rate": 8.093606374356759e-05, + "loss": 0.6019, + "step": 359 + }, + { + "epoch": 0.576, + "grad_norm": 0.328280335637955, + "learning_rate": 8.042741521933071e-05, + "loss": 0.5858, + "step": 360 + }, + { + "epoch": 0.5776, + "grad_norm": 0.33567994116448024, + "learning_rate": 7.991929271442817e-05, + "loss": 0.6026, + "step": 361 + }, + { + "epoch": 0.5792, + "grad_norm": 0.31031837812977875, + "learning_rate": 7.941170988481108e-05, + "loss": 0.6125, + "step": 362 + }, + { + "epoch": 0.5808, + "grad_norm": 0.32926091956212744, + "learning_rate": 7.89046803719267e-05, + "loss": 0.6006, + "step": 363 + }, + { + "epoch": 0.5824, + "grad_norm": 0.36993353060267264, + "learning_rate": 7.839821780235168e-05, + "loss": 0.5733, + "step": 364 + }, + { + "epoch": 0.584, + "grad_norm": 0.31499830986752403, + "learning_rate": 7.789233578742582e-05, + "loss": 0.6259, + "step": 365 + }, + { + "epoch": 0.5856, + "grad_norm": 0.32188347085701724, + "learning_rate": 7.738704792288655e-05, + "loss": 0.5917, + "step": 366 + }, + { + "epoch": 0.5872, + "grad_norm": 0.3061945600647162, + "learning_rate": 7.688236778850306e-05, + "loss": 0.614, + "step": 367 + }, + { + "epoch": 0.5888, + "grad_norm": 0.3073702794282459, + "learning_rate": 7.637830894771175e-05, + "loss": 0.5617, + "step": 368 + }, + { + "epoch": 0.5904, + "grad_norm": 0.30515399425767387, + "learning_rate": 7.587488494725157e-05, + "loss": 0.5912, + "step": 369 + }, + { + "epoch": 0.592, + "grad_norm": 0.3355739467311519, + "learning_rate": 7.537210931679987e-05, + "loss": 0.6067, + "step": 370 + }, + { + "epoch": 0.5936, + "grad_norm": 0.3468971034802618, + "learning_rate": 7.48699955686089e-05, + "loss": 0.6384, + "step": 371 + }, + { + "epoch": 0.5952, + "grad_norm": 0.3116793267107946, + "learning_rate": 7.43685571971426e-05, + "loss": 0.5852, + "step": 372 + }, + { + "epoch": 0.5968, + "grad_norm": 0.3047471597740809, + "learning_rate": 7.386780767871397e-05, + "loss": 0.5636, + "step": 373 + }, + { + "epoch": 0.5984, + "grad_norm": 0.29591949315461463, + "learning_rate": 7.336776047112276e-05, + "loss": 0.5955, + "step": 374 + }, + { + "epoch": 0.6, + "grad_norm": 0.3241793514631215, + "learning_rate": 7.286842901329412e-05, + "loss": 0.6125, + "step": 375 + }, + { + "epoch": 0.6016, + "grad_norm": 0.2969675059082848, + "learning_rate": 7.236982672491698e-05, + "loss": 0.5837, + "step": 376 + }, + { + "epoch": 0.6032, + "grad_norm": 0.2895976804540707, + "learning_rate": 7.187196700608373e-05, + "loss": 0.5228, + "step": 377 + }, + { + "epoch": 0.6048, + "grad_norm": 0.3330341571035036, + "learning_rate": 7.137486323692995e-05, + "loss": 0.5934, + "step": 378 + }, + { + "epoch": 0.6064, + "grad_norm": 0.37117430157855713, + "learning_rate": 7.087852877727481e-05, + "loss": 0.573, + "step": 379 + }, + { + "epoch": 0.608, + "grad_norm": 0.33656138401884717, + "learning_rate": 7.038297696626206e-05, + "loss": 0.5979, + "step": 380 + }, + { + "epoch": 0.6096, + "grad_norm": 0.32473469764325097, + "learning_rate": 6.988822112200156e-05, + "loss": 0.5832, + "step": 381 + }, + { + "epoch": 0.6112, + "grad_norm": 0.31633341000198745, + "learning_rate": 6.939427454121128e-05, + "loss": 0.561, + "step": 382 + }, + { + "epoch": 0.6128, + "grad_norm": 0.3543881916244574, + "learning_rate": 6.890115049885994e-05, + "loss": 0.5922, + "step": 383 + }, + { + "epoch": 0.6144, + "grad_norm": 0.32944401599102163, + "learning_rate": 6.84088622478104e-05, + "loss": 0.5781, + "step": 384 + }, + { + "epoch": 0.616, + "grad_norm": 0.29379043631370816, + "learning_rate": 6.791742301846326e-05, + "loss": 0.5585, + "step": 385 + }, + { + "epoch": 0.6176, + "grad_norm": 0.30087163699355657, + "learning_rate": 6.742684601840141e-05, + "loss": 0.577, + "step": 386 + }, + { + "epoch": 0.6192, + "grad_norm": 0.3305440990682953, + "learning_rate": 6.693714443203507e-05, + "loss": 0.5972, + "step": 387 + }, + { + "epoch": 0.6208, + "grad_norm": 0.32808249115703125, + "learning_rate": 6.644833142024751e-05, + "loss": 0.5646, + "step": 388 + }, + { + "epoch": 0.6224, + "grad_norm": 0.3579019126827201, + "learning_rate": 6.59604201200412e-05, + "loss": 0.6178, + "step": 389 + }, + { + "epoch": 0.624, + "grad_norm": 0.31229957334655506, + "learning_rate": 6.547342364418481e-05, + "loss": 0.563, + "step": 390 + }, + { + "epoch": 0.6256, + "grad_norm": 0.30876924398557587, + "learning_rate": 6.498735508086093e-05, + "loss": 0.6047, + "step": 391 + }, + { + "epoch": 0.6272, + "grad_norm": 0.334833827162927, + "learning_rate": 6.450222749331414e-05, + "loss": 0.6016, + "step": 392 + }, + { + "epoch": 0.6288, + "grad_norm": 0.3154101243775418, + "learning_rate": 6.40180539194999e-05, + "loss": 0.5769, + "step": 393 + }, + { + "epoch": 0.6304, + "grad_norm": 0.3156947224159009, + "learning_rate": 6.35348473717345e-05, + "loss": 0.5603, + "step": 394 + }, + { + "epoch": 0.632, + "grad_norm": 0.30358467735576816, + "learning_rate": 6.305262083634488e-05, + "loss": 0.5619, + "step": 395 + }, + { + "epoch": 0.6336, + "grad_norm": 0.3747209131021904, + "learning_rate": 6.25713872733199e-05, + "loss": 0.6279, + "step": 396 + }, + { + "epoch": 0.6352, + "grad_norm": 0.34430479121130414, + "learning_rate": 6.209115961596208e-05, + "loss": 0.5865, + "step": 397 + }, + { + "epoch": 0.6368, + "grad_norm": 0.32400213742109385, + "learning_rate": 6.161195077053976e-05, + "loss": 0.5477, + "step": 398 + }, + { + "epoch": 0.6384, + "grad_norm": 0.385300934700712, + "learning_rate": 6.113377361594049e-05, + "loss": 0.5844, + "step": 399 + }, + { + "epoch": 0.64, + "grad_norm": 0.2966689332023562, + "learning_rate": 6.065664100332478e-05, + "loss": 0.5705, + "step": 400 + }, + { + "epoch": 0.6416, + "grad_norm": 0.2931038318652647, + "learning_rate": 6.018056575578075e-05, + "loss": 0.5782, + "step": 401 + }, + { + "epoch": 0.6432, + "grad_norm": 0.3351747643450468, + "learning_rate": 5.970556066797941e-05, + "loss": 0.6056, + "step": 402 + }, + { + "epoch": 0.6448, + "grad_norm": 0.30684605499457235, + "learning_rate": 5.923163850583113e-05, + "loss": 0.5488, + "step": 403 + }, + { + "epoch": 0.6464, + "grad_norm": 0.3279054154311602, + "learning_rate": 5.875881200614207e-05, + "loss": 0.5839, + "step": 404 + }, + { + "epoch": 0.648, + "grad_norm": 0.30962367293197873, + "learning_rate": 5.828709387627218e-05, + "loss": 0.6076, + "step": 405 + }, + { + "epoch": 0.6496, + "grad_norm": 0.31679931254330107, + "learning_rate": 5.781649679379378e-05, + "loss": 0.5871, + "step": 406 + }, + { + "epoch": 0.6512, + "grad_norm": 0.32141908822706244, + "learning_rate": 5.73470334061505e-05, + "loss": 0.5469, + "step": 407 + }, + { + "epoch": 0.6528, + "grad_norm": 0.3527971138131436, + "learning_rate": 5.687871633031754e-05, + "loss": 0.603, + "step": 408 + }, + { + "epoch": 0.6544, + "grad_norm": 0.27369281915172705, + "learning_rate": 5.6411558152462894e-05, + "loss": 0.5474, + "step": 409 + }, + { + "epoch": 0.656, + "grad_norm": 0.2905310605819737, + "learning_rate": 5.5945571427608526e-05, + "loss": 0.5533, + "step": 410 + }, + { + "epoch": 0.6576, + "grad_norm": 0.296175952659622, + "learning_rate": 5.54807686792933e-05, + "loss": 0.5147, + "step": 411 + }, + { + "epoch": 0.6592, + "grad_norm": 0.2951710811283539, + "learning_rate": 5.501716239923642e-05, + "loss": 0.5362, + "step": 412 + }, + { + "epoch": 0.6608, + "grad_norm": 0.33059665402204946, + "learning_rate": 5.4554765047001613e-05, + "loss": 0.5844, + "step": 413 + }, + { + "epoch": 0.6624, + "grad_norm": 0.30176317940139036, + "learning_rate": 5.4093589049662175e-05, + "loss": 0.5507, + "step": 414 + }, + { + "epoch": 0.664, + "grad_norm": 0.30774773664578586, + "learning_rate": 5.363364680146725e-05, + "loss": 0.5282, + "step": 415 + }, + { + "epoch": 0.6656, + "grad_norm": 0.5117447996571544, + "learning_rate": 5.31749506635086e-05, + "loss": 0.5492, + "step": 416 + }, + { + "epoch": 0.6672, + "grad_norm": 0.3421289435965022, + "learning_rate": 5.271751296338823e-05, + "loss": 0.5875, + "step": 417 + }, + { + "epoch": 0.6688, + "grad_norm": 0.354463766375705, + "learning_rate": 5.226134599488728e-05, + "loss": 0.6217, + "step": 418 + }, + { + "epoch": 0.6704, + "grad_norm": 0.3154260452857168, + "learning_rate": 5.180646201763577e-05, + "loss": 0.5499, + "step": 419 + }, + { + "epoch": 0.672, + "grad_norm": 0.2969536159605783, + "learning_rate": 5.135287325678271e-05, + "loss": 0.577, + "step": 420 + }, + { + "epoch": 0.6736, + "grad_norm": 0.31177957234143505, + "learning_rate": 5.090059190266779e-05, + "loss": 0.5569, + "step": 421 + }, + { + "epoch": 0.6752, + "grad_norm": 0.3019463727112216, + "learning_rate": 5.0449630110493836e-05, + "loss": 0.5822, + "step": 422 + }, + { + "epoch": 0.6768, + "grad_norm": 0.3058338570000804, + "learning_rate": 5.000000000000002e-05, + "loss": 0.606, + "step": 423 + }, + { + "epoch": 0.6784, + "grad_norm": 0.3230837429064014, + "learning_rate": 4.955171365513603e-05, + "loss": 0.5651, + "step": 424 + }, + { + "epoch": 0.68, + "grad_norm": 0.2678609967121915, + "learning_rate": 4.9104783123737566e-05, + "loss": 0.527, + "step": 425 + }, + { + "epoch": 0.6816, + "grad_norm": 0.31984197931050856, + "learning_rate": 4.865922041720239e-05, + "loss": 0.5277, + "step": 426 + }, + { + "epoch": 0.6832, + "grad_norm": 0.36787971775645245, + "learning_rate": 4.821503751016746e-05, + "loss": 0.5831, + "step": 427 + }, + { + "epoch": 0.6848, + "grad_norm": 0.31761183735193627, + "learning_rate": 4.777224634018732e-05, + "loss": 0.6092, + "step": 428 + }, + { + "epoch": 0.6864, + "grad_norm": 0.3209030010567008, + "learning_rate": 4.733085880741301e-05, + "loss": 0.5839, + "step": 429 + }, + { + "epoch": 0.688, + "grad_norm": 0.3048133128165885, + "learning_rate": 4.689088677427249e-05, + "loss": 0.5736, + "step": 430 + }, + { + "epoch": 0.6896, + "grad_norm": 0.28120456740912425, + "learning_rate": 4.645234206515171e-05, + "loss": 0.5471, + "step": 431 + }, + { + "epoch": 0.6912, + "grad_norm": 0.37214820992626707, + "learning_rate": 4.6015236466076747e-05, + "loss": 0.6119, + "step": 432 + }, + { + "epoch": 0.6928, + "grad_norm": 0.3408766459891202, + "learning_rate": 4.5579581724397255e-05, + "loss": 0.5843, + "step": 433 + }, + { + "epoch": 0.6944, + "grad_norm": 0.27993522238408386, + "learning_rate": 4.514538954847064e-05, + "loss": 0.5368, + "step": 434 + }, + { + "epoch": 0.696, + "grad_norm": 0.33430364727703993, + "learning_rate": 4.471267160734731e-05, + "loss": 0.5607, + "step": 435 + }, + { + "epoch": 0.6976, + "grad_norm": 0.31370924363308805, + "learning_rate": 4.428143953045717e-05, + "loss": 0.5796, + "step": 436 + }, + { + "epoch": 0.6992, + "grad_norm": 0.3158247100165067, + "learning_rate": 4.385170490729712e-05, + "loss": 0.554, + "step": 437 + }, + { + "epoch": 0.7008, + "grad_norm": 0.3293365360340102, + "learning_rate": 4.342347928711953e-05, + "loss": 0.616, + "step": 438 + }, + { + "epoch": 0.7024, + "grad_norm": 0.3102964238553234, + "learning_rate": 4.2996774178621736e-05, + "loss": 0.5566, + "step": 439 + }, + { + "epoch": 0.704, + "grad_norm": 0.3320311237718539, + "learning_rate": 4.257160104963696e-05, + "loss": 0.5976, + "step": 440 + }, + { + "epoch": 0.7056, + "grad_norm": 0.39732095318780175, + "learning_rate": 4.2147971326825966e-05, + "loss": 0.5625, + "step": 441 + }, + { + "epoch": 0.7072, + "grad_norm": 0.31155382051607555, + "learning_rate": 4.172589639536991e-05, + "loss": 0.5411, + "step": 442 + }, + { + "epoch": 0.7088, + "grad_norm": 0.3323104297517485, + "learning_rate": 4.130538759866457e-05, + "loss": 0.5865, + "step": 443 + }, + { + "epoch": 0.7104, + "grad_norm": 0.3712237621744867, + "learning_rate": 4.088645623801534e-05, + "loss": 0.6318, + "step": 444 + }, + { + "epoch": 0.712, + "grad_norm": 0.3229042644926411, + "learning_rate": 4.046911357233343e-05, + "loss": 0.5743, + "step": 445 + }, + { + "epoch": 0.7136, + "grad_norm": 0.29907881399082403, + "learning_rate": 4.00533708178334e-05, + "loss": 0.5664, + "step": 446 + }, + { + "epoch": 0.7152, + "grad_norm": 0.4441456508733791, + "learning_rate": 3.963923914773187e-05, + "loss": 0.5882, + "step": 447 + }, + { + "epoch": 0.7168, + "grad_norm": 0.3519351419650644, + "learning_rate": 3.922672969194686e-05, + "loss": 0.5916, + "step": 448 + }, + { + "epoch": 0.7184, + "grad_norm": 0.3046891795610562, + "learning_rate": 3.8815853536798904e-05, + "loss": 0.5939, + "step": 449 + }, + { + "epoch": 0.72, + "grad_norm": 0.28455026675791334, + "learning_rate": 3.840662172471315e-05, + "loss": 0.5188, + "step": 450 + }, + { + "epoch": 0.7216, + "grad_norm": 0.3325770886862265, + "learning_rate": 3.79990452539225e-05, + "loss": 0.5869, + "step": 451 + }, + { + "epoch": 0.7232, + "grad_norm": 0.2973809949258795, + "learning_rate": 3.759313507817196e-05, + "loss": 0.5421, + "step": 452 + }, + { + "epoch": 0.7248, + "grad_norm": 0.3221347374942181, + "learning_rate": 3.7188902106424416e-05, + "loss": 0.5864, + "step": 453 + }, + { + "epoch": 0.7264, + "grad_norm": 0.34687591239754206, + "learning_rate": 3.678635720256737e-05, + "loss": 0.5944, + "step": 454 + }, + { + "epoch": 0.728, + "grad_norm": 0.3352608379297791, + "learning_rate": 3.638551118512089e-05, + "loss": 0.5771, + "step": 455 + }, + { + "epoch": 0.7296, + "grad_norm": 0.3305224911439026, + "learning_rate": 3.5986374826947066e-05, + "loss": 0.5905, + "step": 456 + }, + { + "epoch": 0.7312, + "grad_norm": 0.2965071411303001, + "learning_rate": 3.558895885496023e-05, + "loss": 0.4929, + "step": 457 + }, + { + "epoch": 0.7328, + "grad_norm": 0.3527451252706239, + "learning_rate": 3.519327394983888e-05, + "loss": 0.6096, + "step": 458 + }, + { + "epoch": 0.7344, + "grad_norm": 0.30724814051450583, + "learning_rate": 3.479933074573858e-05, + "loss": 0.5701, + "step": 459 + }, + { + "epoch": 0.736, + "grad_norm": 0.3469545333516701, + "learning_rate": 3.440713983000601e-05, + "loss": 0.5809, + "step": 460 + }, + { + "epoch": 0.7376, + "grad_norm": 0.3432172543820323, + "learning_rate": 3.401671174289469e-05, + "loss": 0.5815, + "step": 461 + }, + { + "epoch": 0.7392, + "grad_norm": 0.27208246581981216, + "learning_rate": 3.362805697728145e-05, + "loss": 0.5307, + "step": 462 + }, + { + "epoch": 0.7408, + "grad_norm": 0.4100518601876637, + "learning_rate": 3.324118597838464e-05, + "loss": 0.5817, + "step": 463 + }, + { + "epoch": 0.7424, + "grad_norm": 0.34061629245083513, + "learning_rate": 3.285610914348332e-05, + "loss": 0.6113, + "step": 464 + }, + { + "epoch": 0.744, + "grad_norm": 0.30675429430291684, + "learning_rate": 3.2472836821637744e-05, + "loss": 0.5424, + "step": 465 + }, + { + "epoch": 0.7456, + "grad_norm": 0.31742149090771027, + "learning_rate": 3.209137931341143e-05, + "loss": 0.5611, + "step": 466 + }, + { + "epoch": 0.7472, + "grad_norm": 0.324121004647133, + "learning_rate": 3.1711746870594086e-05, + "loss": 0.6087, + "step": 467 + }, + { + "epoch": 0.7488, + "grad_norm": 0.3289709513645346, + "learning_rate": 3.1333949695926324e-05, + "loss": 0.5866, + "step": 468 + }, + { + "epoch": 0.7504, + "grad_norm": 0.3039839724902563, + "learning_rate": 3.0957997942825336e-05, + "loss": 0.6005, + "step": 469 + }, + { + "epoch": 0.752, + "grad_norm": 0.3227359743275398, + "learning_rate": 3.058390171511196e-05, + "loss": 0.5755, + "step": 470 + }, + { + "epoch": 0.7536, + "grad_norm": 0.3312219578229909, + "learning_rate": 3.021167106673928e-05, + "loss": 0.5951, + "step": 471 + }, + { + "epoch": 0.7552, + "grad_norm": 0.2607892687043467, + "learning_rate": 2.9841316001522347e-05, + "loss": 0.5239, + "step": 472 + }, + { + "epoch": 0.7568, + "grad_norm": 0.3211451583419221, + "learning_rate": 2.9472846472869298e-05, + "loss": 0.5996, + "step": 473 + }, + { + "epoch": 0.7584, + "grad_norm": 0.3261838860983297, + "learning_rate": 2.9106272383513835e-05, + "loss": 0.5394, + "step": 474 + }, + { + "epoch": 0.76, + "grad_norm": 0.31184360385586457, + "learning_rate": 2.874160358524931e-05, + "loss": 0.5579, + "step": 475 + }, + { + "epoch": 0.7616, + "grad_norm": 0.352870753034346, + "learning_rate": 2.8378849878663628e-05, + "loss": 0.6094, + "step": 476 + }, + { + "epoch": 0.7632, + "grad_norm": 0.2783628041575873, + "learning_rate": 2.8018021012875994e-05, + "loss": 0.5212, + "step": 477 + }, + { + "epoch": 0.7648, + "grad_norm": 0.321237438184481, + "learning_rate": 2.7659126685275027e-05, + "loss": 0.5614, + "step": 478 + }, + { + "epoch": 0.7664, + "grad_norm": 0.3019928769523239, + "learning_rate": 2.7302176541257986e-05, + "loss": 0.5561, + "step": 479 + }, + { + "epoch": 0.768, + "grad_norm": 0.29832262198894516, + "learning_rate": 2.6947180173971508e-05, + "loss": 0.5377, + "step": 480 + }, + { + "epoch": 0.7696, + "grad_norm": 0.323634424531092, + "learning_rate": 2.659414712405398e-05, + "loss": 0.618, + "step": 481 + }, + { + "epoch": 0.7712, + "grad_norm": 0.3282204388290083, + "learning_rate": 2.6243086879379e-05, + "loss": 0.5931, + "step": 482 + }, + { + "epoch": 0.7728, + "grad_norm": 0.35625417801874015, + "learning_rate": 2.5894008874800325e-05, + "loss": 0.589, + "step": 483 + }, + { + "epoch": 0.7744, + "grad_norm": 0.2844237915466193, + "learning_rate": 2.5546922491898495e-05, + "loss": 0.5143, + "step": 484 + }, + { + "epoch": 0.776, + "grad_norm": 0.29732264384556584, + "learning_rate": 2.5201837058728505e-05, + "loss": 0.5476, + "step": 485 + }, + { + "epoch": 0.7776, + "grad_norm": 0.2856189609161265, + "learning_rate": 2.485876184956928e-05, + "loss": 0.516, + "step": 486 + }, + { + "epoch": 0.7792, + "grad_norm": 0.33400754346314043, + "learning_rate": 2.451770608467432e-05, + "loss": 0.5725, + "step": 487 + }, + { + "epoch": 0.7808, + "grad_norm": 0.329844208780943, + "learning_rate": 2.417867893002387e-05, + "loss": 0.5457, + "step": 488 + }, + { + "epoch": 0.7824, + "grad_norm": 0.29412208186368505, + "learning_rate": 2.3841689497078746e-05, + "loss": 0.5305, + "step": 489 + }, + { + "epoch": 0.784, + "grad_norm": 0.3067167595539764, + "learning_rate": 2.3506746842535242e-05, + "loss": 0.5952, + "step": 490 + }, + { + "epoch": 0.7856, + "grad_norm": 0.31507436481266066, + "learning_rate": 2.3173859968081944e-05, + "loss": 0.5721, + "step": 491 + }, + { + "epoch": 0.7872, + "grad_norm": 0.374802888789554, + "learning_rate": 2.2843037820157675e-05, + "loss": 0.5098, + "step": 492 + }, + { + "epoch": 0.7888, + "grad_norm": 0.32859006997932044, + "learning_rate": 2.251428928971102e-05, + "loss": 0.5806, + "step": 493 + }, + { + "epoch": 0.7904, + "grad_norm": 0.32874810609039673, + "learning_rate": 2.2187623211961562e-05, + "loss": 0.5841, + "step": 494 + }, + { + "epoch": 0.792, + "grad_norm": 0.28836419309288286, + "learning_rate": 2.1863048366162208e-05, + "loss": 0.5541, + "step": 495 + }, + { + "epoch": 0.7936, + "grad_norm": 0.3099871606982036, + "learning_rate": 2.1540573475363402e-05, + "loss": 0.5606, + "step": 496 + }, + { + "epoch": 0.7952, + "grad_norm": 0.3221853052094363, + "learning_rate": 2.1220207206178688e-05, + "loss": 0.5389, + "step": 497 + }, + { + "epoch": 0.7968, + "grad_norm": 0.2894838671575457, + "learning_rate": 2.0901958168551638e-05, + "loss": 0.5381, + "step": 498 + }, + { + "epoch": 0.7984, + "grad_norm": 0.30988958158731844, + "learning_rate": 2.058583491552465e-05, + "loss": 0.586, + "step": 499 + }, + { + "epoch": 0.8, + "grad_norm": 0.31132930659044444, + "learning_rate": 2.027184594300898e-05, + "loss": 0.5506, + "step": 500 + }, + { + "epoch": 0.8016, + "grad_norm": 0.3263325087242562, + "learning_rate": 1.995999968955641e-05, + "loss": 0.5206, + "step": 501 + }, + { + "epoch": 0.8032, + "grad_norm": 0.35348868141276724, + "learning_rate": 1.9650304536132426e-05, + "loss": 0.5538, + "step": 502 + }, + { + "epoch": 0.8048, + "grad_norm": 0.33519734489221437, + "learning_rate": 1.9342768805891178e-05, + "loss": 0.5787, + "step": 503 + }, + { + "epoch": 0.8064, + "grad_norm": 0.35515766666587856, + "learning_rate": 1.903740076395151e-05, + "loss": 0.5603, + "step": 504 + }, + { + "epoch": 0.808, + "grad_norm": 0.3538496256081159, + "learning_rate": 1.8734208617174988e-05, + "loss": 0.5965, + "step": 505 + }, + { + "epoch": 0.8096, + "grad_norm": 0.34314938114488563, + "learning_rate": 1.8433200513945337e-05, + "loss": 0.5603, + "step": 506 + }, + { + "epoch": 0.8112, + "grad_norm": 0.2995789633564441, + "learning_rate": 1.8134384543949478e-05, + "loss": 0.5263, + "step": 507 + }, + { + "epoch": 0.8128, + "grad_norm": 0.36551548091125113, + "learning_rate": 1.783776873795994e-05, + "loss": 0.5799, + "step": 508 + }, + { + "epoch": 0.8144, + "grad_norm": 0.33608376035602755, + "learning_rate": 1.754336106761927e-05, + "loss": 0.5498, + "step": 509 + }, + { + "epoch": 0.816, + "grad_norm": 0.311043421155776, + "learning_rate": 1.7251169445225657e-05, + "loss": 0.5645, + "step": 510 + }, + { + "epoch": 0.8176, + "grad_norm": 0.3156023943290121, + "learning_rate": 1.696120172352025e-05, + "loss": 0.5488, + "step": 511 + }, + { + "epoch": 0.8192, + "grad_norm": 0.33536823334656135, + "learning_rate": 1.6673465695476232e-05, + "loss": 0.5563, + "step": 512 + }, + { + "epoch": 0.8208, + "grad_norm": 0.33526801042681004, + "learning_rate": 1.6387969094089316e-05, + "loss": 0.484, + "step": 513 + }, + { + "epoch": 0.8224, + "grad_norm": 0.3313356108133414, + "learning_rate": 1.6104719592169902e-05, + "loss": 0.5671, + "step": 514 + }, + { + "epoch": 0.824, + "grad_norm": 0.3032308382473139, + "learning_rate": 1.5823724802136865e-05, + "loss": 0.5313, + "step": 515 + }, + { + "epoch": 0.8256, + "grad_norm": 0.35514370450316773, + "learning_rate": 1.5544992275813053e-05, + "loss": 0.6026, + "step": 516 + }, + { + "epoch": 0.8272, + "grad_norm": 0.32561672086420923, + "learning_rate": 1.526852950422226e-05, + "loss": 0.5632, + "step": 517 + }, + { + "epoch": 0.8288, + "grad_norm": 0.32605764722031383, + "learning_rate": 1.4994343917387854e-05, + "loss": 0.5493, + "step": 518 + }, + { + "epoch": 0.8304, + "grad_norm": 0.3300675802793695, + "learning_rate": 1.4722442884133214e-05, + "loss": 0.5661, + "step": 519 + }, + { + "epoch": 0.832, + "grad_norm": 0.3149622536766178, + "learning_rate": 1.4452833711883628e-05, + "loss": 0.547, + "step": 520 + }, + { + "epoch": 0.8336, + "grad_norm": 0.36686698274862983, + "learning_rate": 1.4185523646469822e-05, + "loss": 0.5503, + "step": 521 + }, + { + "epoch": 0.8352, + "grad_norm": 0.38360774512659207, + "learning_rate": 1.3920519871933424e-05, + "loss": 0.527, + "step": 522 + }, + { + "epoch": 0.8368, + "grad_norm": 0.33270680502856625, + "learning_rate": 1.3657829510333654e-05, + "loss": 0.5638, + "step": 523 + }, + { + "epoch": 0.8384, + "grad_norm": 0.31349166170125337, + "learning_rate": 1.339745962155613e-05, + "loss": 0.5654, + "step": 524 + }, + { + "epoch": 0.84, + "grad_norm": 0.3337236613807481, + "learning_rate": 1.3139417203123027e-05, + "loss": 0.5681, + "step": 525 + }, + { + "epoch": 0.8416, + "grad_norm": 0.3284719148971575, + "learning_rate": 1.2883709190004955e-05, + "loss": 0.5755, + "step": 526 + }, + { + "epoch": 0.8432, + "grad_norm": 0.3166628892383498, + "learning_rate": 1.263034245443473e-05, + "loss": 0.6076, + "step": 527 + }, + { + "epoch": 0.8448, + "grad_norm": 0.2923848352653842, + "learning_rate": 1.2379323805722576e-05, + "loss": 0.5732, + "step": 528 + }, + { + "epoch": 0.8464, + "grad_norm": 0.3009077815189411, + "learning_rate": 1.2130659990073146e-05, + "loss": 0.5436, + "step": 529 + }, + { + "epoch": 0.848, + "grad_norm": 0.3130027772871504, + "learning_rate": 1.1884357690404158e-05, + "loss": 0.5625, + "step": 530 + }, + { + "epoch": 0.8496, + "grad_norm": 0.3361962977258804, + "learning_rate": 1.1640423526166988e-05, + "loss": 0.5807, + "step": 531 + }, + { + "epoch": 0.8512, + "grad_norm": 0.29608833926553046, + "learning_rate": 1.1398864053168534e-05, + "loss": 0.5269, + "step": 532 + }, + { + "epoch": 0.8528, + "grad_norm": 0.3270524860119114, + "learning_rate": 1.1159685763395111e-05, + "loss": 0.5816, + "step": 533 + }, + { + "epoch": 0.8544, + "grad_norm": 0.3430437346896367, + "learning_rate": 1.0922895084838037e-05, + "loss": 0.5731, + "step": 534 + }, + { + "epoch": 0.856, + "grad_norm": 0.32535876490467913, + "learning_rate": 1.0688498381320855e-05, + "loss": 0.4918, + "step": 535 + }, + { + "epoch": 0.8576, + "grad_norm": 0.3231292747667162, + "learning_rate": 1.045650195232819e-05, + "loss": 0.5872, + "step": 536 + }, + { + "epoch": 0.8592, + "grad_norm": 0.3251794730113907, + "learning_rate": 1.0226912032836611e-05, + "loss": 0.6055, + "step": 537 + }, + { + "epoch": 0.8608, + "grad_norm": 0.31024411607401303, + "learning_rate": 9.999734793146998e-06, + "loss": 0.536, + "step": 538 + }, + { + "epoch": 0.8624, + "grad_norm": 0.32455636870036475, + "learning_rate": 9.774976338718677e-06, + "loss": 0.5415, + "step": 539 + }, + { + "epoch": 0.864, + "grad_norm": 0.3150003047627516, + "learning_rate": 9.552642710005299e-06, + "loss": 0.5166, + "step": 540 + }, + { + "epoch": 0.8656, + "grad_norm": 0.29348215525540744, + "learning_rate": 9.332739882292752e-06, + "loss": 0.4773, + "step": 541 + }, + { + "epoch": 0.8672, + "grad_norm": 0.292859851123708, + "learning_rate": 9.115273765538202e-06, + "loss": 0.5636, + "step": 542 + }, + { + "epoch": 0.8688, + "grad_norm": 0.3300194120158872, + "learning_rate": 8.900250204211514e-06, + "loss": 0.5522, + "step": 543 + }, + { + "epoch": 0.8704, + "grad_norm": 0.2963361079298774, + "learning_rate": 8.687674977138116e-06, + "loss": 0.5493, + "step": 544 + }, + { + "epoch": 0.872, + "grad_norm": 0.29879896580398585, + "learning_rate": 8.47755379734373e-06, + "loss": 0.5591, + "step": 545 + }, + { + "epoch": 0.8736, + "grad_norm": 0.4116034082913684, + "learning_rate": 8.269892311900696e-06, + "loss": 0.5883, + "step": 546 + }, + { + "epoch": 0.8752, + "grad_norm": 0.3232773360036941, + "learning_rate": 8.064696101776358e-06, + "loss": 0.5737, + "step": 547 + }, + { + "epoch": 0.8768, + "grad_norm": 0.3134894851722256, + "learning_rate": 7.861970681683051e-06, + "loss": 0.5848, + "step": 548 + }, + { + "epoch": 0.8784, + "grad_norm": 0.3528480304212816, + "learning_rate": 7.661721499929753e-06, + "loss": 0.6011, + "step": 549 + }, + { + "epoch": 0.88, + "grad_norm": 0.3310343277309279, + "learning_rate": 7.463953938275858e-06, + "loss": 0.5872, + "step": 550 + }, + { + "epoch": 0.8816, + "grad_norm": 0.29087492472336945, + "learning_rate": 7.2686733117863784e-06, + "loss": 0.5401, + "step": 551 + }, + { + "epoch": 0.8832, + "grad_norm": 0.30926114646884223, + "learning_rate": 7.07588486868922e-06, + "loss": 0.5854, + "step": 552 + }, + { + "epoch": 0.8848, + "grad_norm": 0.3219895811423112, + "learning_rate": 6.8855937902340576e-06, + "loss": 0.5649, + "step": 553 + }, + { + "epoch": 0.8864, + "grad_norm": 0.32051340587736893, + "learning_rate": 6.6978051905530855e-06, + "loss": 0.5897, + "step": 554 + }, + { + "epoch": 0.888, + "grad_norm": 0.29727631379917907, + "learning_rate": 6.512524116523633e-06, + "loss": 0.5261, + "step": 555 + }, + { + "epoch": 0.8896, + "grad_norm": 0.3034336648188552, + "learning_rate": 6.329755547632499e-06, + "loss": 0.5318, + "step": 556 + }, + { + "epoch": 0.8912, + "grad_norm": 0.3075346806380729, + "learning_rate": 6.149504395842087e-06, + "loss": 0.5652, + "step": 557 + }, + { + "epoch": 0.8928, + "grad_norm": 0.2809666320569592, + "learning_rate": 5.971775505458444e-06, + "loss": 0.4978, + "step": 558 + }, + { + "epoch": 0.8944, + "grad_norm": 0.32906249814027355, + "learning_rate": 5.7965736530010916e-06, + "loss": 0.5675, + "step": 559 + }, + { + "epoch": 0.896, + "grad_norm": 0.3433895713502097, + "learning_rate": 5.623903547074549e-06, + "loss": 0.5659, + "step": 560 + }, + { + "epoch": 0.8976, + "grad_norm": 0.299004282903824, + "learning_rate": 5.453769828241872e-06, + "loss": 0.5388, + "step": 561 + }, + { + "epoch": 0.8992, + "grad_norm": 0.31081135142207966, + "learning_rate": 5.286177068899989e-06, + "loss": 0.5737, + "step": 562 + }, + { + "epoch": 0.9008, + "grad_norm": 0.3321533336299761, + "learning_rate": 5.121129773156663e-06, + "loss": 0.5229, + "step": 563 + }, + { + "epoch": 0.9024, + "grad_norm": 0.32310942065808324, + "learning_rate": 4.95863237670956e-06, + "loss": 0.6103, + "step": 564 + }, + { + "epoch": 0.904, + "grad_norm": 0.35428031176295227, + "learning_rate": 4.798689246727006e-06, + "loss": 0.5895, + "step": 565 + }, + { + "epoch": 0.9056, + "grad_norm": 0.3226230204094095, + "learning_rate": 4.641304681730641e-06, + "loss": 0.5796, + "step": 566 + }, + { + "epoch": 0.9072, + "grad_norm": 0.31844821904730863, + "learning_rate": 4.486482911479839e-06, + "loss": 0.5565, + "step": 567 + }, + { + "epoch": 0.9088, + "grad_norm": 0.31127211469742905, + "learning_rate": 4.3342280968580285e-06, + "loss": 0.5742, + "step": 568 + }, + { + "epoch": 0.9104, + "grad_norm": 0.3301616178055294, + "learning_rate": 4.184544329761009e-06, + "loss": 0.5633, + "step": 569 + }, + { + "epoch": 0.912, + "grad_norm": 0.31088406778226274, + "learning_rate": 4.037435632986786e-06, + "loss": 0.5741, + "step": 570 + }, + { + "epoch": 0.9136, + "grad_norm": 0.30910524243719273, + "learning_rate": 3.892905960127546e-06, + "loss": 0.5422, + "step": 571 + }, + { + "epoch": 0.9152, + "grad_norm": 0.32891907673013904, + "learning_rate": 3.750959195463466e-06, + "loss": 0.5939, + "step": 572 + }, + { + "epoch": 0.9168, + "grad_norm": 0.31765925114121074, + "learning_rate": 3.611599153858214e-06, + "loss": 0.5525, + "step": 573 + }, + { + "epoch": 0.9184, + "grad_norm": 0.3265729180708036, + "learning_rate": 3.4748295806564356e-06, + "loss": 0.5591, + "step": 574 + }, + { + "epoch": 0.92, + "grad_norm": 0.304115871463315, + "learning_rate": 3.3406541515832003e-06, + "loss": 0.569, + "step": 575 + }, + { + "epoch": 0.9216, + "grad_norm": 0.3108161445075911, + "learning_rate": 3.209076472645112e-06, + "loss": 0.5302, + "step": 576 + }, + { + "epoch": 0.9232, + "grad_norm": 0.3117508517796498, + "learning_rate": 3.0801000800333877e-06, + "loss": 0.5924, + "step": 577 + }, + { + "epoch": 0.9248, + "grad_norm": 0.3173885393960302, + "learning_rate": 2.9537284400289355e-06, + "loss": 0.5513, + "step": 578 + }, + { + "epoch": 0.9264, + "grad_norm": 0.3065343745357514, + "learning_rate": 2.8299649489090475e-06, + "loss": 0.5488, + "step": 579 + }, + { + "epoch": 0.928, + "grad_norm": 0.3465215111780123, + "learning_rate": 2.708812932856253e-06, + "loss": 0.5718, + "step": 580 + }, + { + "epoch": 0.9296, + "grad_norm": 0.3139184869222331, + "learning_rate": 2.590275647868867e-06, + "loss": 0.5634, + "step": 581 + }, + { + "epoch": 0.9312, + "grad_norm": 0.3347273007314033, + "learning_rate": 2.4743562796734622e-06, + "loss": 0.5388, + "step": 582 + }, + { + "epoch": 0.9328, + "grad_norm": 0.3255954139922336, + "learning_rate": 2.3610579436393e-06, + "loss": 0.5913, + "step": 583 + }, + { + "epoch": 0.9344, + "grad_norm": 0.31531159165403394, + "learning_rate": 2.250383684694579e-06, + "loss": 0.531, + "step": 584 + }, + { + "epoch": 0.936, + "grad_norm": 0.30838344199204837, + "learning_rate": 2.1423364772445887e-06, + "loss": 0.5312, + "step": 585 + }, + { + "epoch": 0.9376, + "grad_norm": 0.31304073052384795, + "learning_rate": 2.036919225091827e-06, + "loss": 0.5543, + "step": 586 + }, + { + "epoch": 0.9392, + "grad_norm": 0.28806534227726394, + "learning_rate": 1.9341347613579087e-06, + "loss": 0.5099, + "step": 587 + }, + { + "epoch": 0.9408, + "grad_norm": 0.30803254142561787, + "learning_rate": 1.8339858484073935e-06, + "loss": 0.5429, + "step": 588 + }, + { + "epoch": 0.9424, + "grad_norm": 0.3233273532446496, + "learning_rate": 1.7364751777736332e-06, + "loss": 0.5475, + "step": 589 + }, + { + "epoch": 0.944, + "grad_norm": 0.314011051336649, + "learning_rate": 1.6416053700863964e-06, + "loss": 0.5825, + "step": 590 + }, + { + "epoch": 0.9456, + "grad_norm": 0.31288304219875024, + "learning_rate": 1.5493789750014031e-06, + "loss": 0.5351, + "step": 591 + }, + { + "epoch": 0.9472, + "grad_norm": 0.4138905691158364, + "learning_rate": 1.459798471131868e-06, + "loss": 0.5927, + "step": 592 + }, + { + "epoch": 0.9488, + "grad_norm": 0.33771234928093735, + "learning_rate": 1.3728662659818204e-06, + "loss": 0.5824, + "step": 593 + }, + { + "epoch": 0.9504, + "grad_norm": 0.29905255464308417, + "learning_rate": 1.2885846958814673e-06, + "loss": 0.5495, + "step": 594 + }, + { + "epoch": 0.952, + "grad_norm": 0.40814635147033207, + "learning_rate": 1.2069560259243328e-06, + "loss": 0.5305, + "step": 595 + }, + { + "epoch": 0.9536, + "grad_norm": 0.4552099236409849, + "learning_rate": 1.1279824499064396e-06, + "loss": 0.6151, + "step": 596 + }, + { + "epoch": 0.9552, + "grad_norm": 0.29847599170770306, + "learning_rate": 1.0516660902673448e-06, + "loss": 0.5433, + "step": 597 + }, + { + "epoch": 0.9568, + "grad_norm": 0.3107431078356281, + "learning_rate": 9.780089980330642e-07, + "loss": 0.5635, + "step": 598 + }, + { + "epoch": 0.9584, + "grad_norm": 0.2993825608025531, + "learning_rate": 9.070131527609604e-07, + "loss": 0.5608, + "step": 599 + }, + { + "epoch": 0.96, + "grad_norm": 0.3220569129147483, + "learning_rate": 8.386804624865851e-07, + "loss": 0.556, + "step": 600 + }, + { + "epoch": 0.9616, + "grad_norm": 0.2944064570193302, + "learning_rate": 7.730127636723539e-07, + "loss": 0.5398, + "step": 601 + }, + { + "epoch": 0.9632, + "grad_norm": 0.3095022936191454, + "learning_rate": 7.100118211581852e-07, + "loss": 0.5168, + "step": 602 + }, + { + "epoch": 0.9648, + "grad_norm": 0.3721121088717959, + "learning_rate": 6.496793281141056e-07, + "loss": 0.5385, + "step": 603 + }, + { + "epoch": 0.9664, + "grad_norm": 0.287888409393853, + "learning_rate": 5.920169059947411e-07, + "loss": 0.5547, + "step": 604 + }, + { + "epoch": 0.968, + "grad_norm": 0.27515729289166646, + "learning_rate": 5.370261044956971e-07, + "loss": 0.4845, + "step": 605 + }, + { + "epoch": 0.9696, + "grad_norm": 0.35344587392267507, + "learning_rate": 4.847084015119574e-07, + "loss": 0.5638, + "step": 606 + }, + { + "epoch": 0.9712, + "grad_norm": 0.3601029556024769, + "learning_rate": 4.3506520309813947e-07, + "loss": 0.5279, + "step": 607 + }, + { + "epoch": 0.9728, + "grad_norm": 0.30954690722928574, + "learning_rate": 3.8809784343072366e-07, + "loss": 0.5541, + "step": 608 + }, + { + "epoch": 0.9744, + "grad_norm": 0.29250672200275046, + "learning_rate": 3.4380758477219333e-07, + "loss": 0.5425, + "step": 609 + }, + { + "epoch": 0.976, + "grad_norm": 0.3679308518468009, + "learning_rate": 3.0219561743707326e-07, + "loss": 0.5775, + "step": 610 + }, + { + "epoch": 0.9776, + "grad_norm": 0.2986869169097338, + "learning_rate": 2.6326305976001055e-07, + "loss": 0.5344, + "step": 611 + }, + { + "epoch": 0.9792, + "grad_norm": 0.4446774451930178, + "learning_rate": 2.2701095806565432e-07, + "loss": 0.5608, + "step": 612 + }, + { + "epoch": 0.9808, + "grad_norm": 0.31519624442850597, + "learning_rate": 1.9344028664056713e-07, + "loss": 0.5073, + "step": 613 + }, + { + "epoch": 0.9824, + "grad_norm": 0.3073412869268294, + "learning_rate": 1.6255194770704586e-07, + "loss": 0.5422, + "step": 614 + }, + { + "epoch": 0.984, + "grad_norm": 0.3498546428328075, + "learning_rate": 1.3434677139885222e-07, + "loss": 0.5048, + "step": 615 + }, + { + "epoch": 0.9856, + "grad_norm": 0.3227086733836615, + "learning_rate": 1.0882551573891953e-07, + "loss": 0.5576, + "step": 616 + }, + { + "epoch": 0.9872, + "grad_norm": 0.3131763041284998, + "learning_rate": 8.598886661895788e-08, + "loss": 0.5746, + "step": 617 + }, + { + "epoch": 0.9888, + "grad_norm": 0.30994026071777964, + "learning_rate": 6.583743778106887e-08, + "loss": 0.5458, + "step": 618 + }, + { + "epoch": 0.9904, + "grad_norm": 0.304032587550163, + "learning_rate": 4.837177080119215e-08, + "loss": 0.5185, + "step": 619 + }, + { + "epoch": 0.992, + "grad_norm": 0.32560298395148213, + "learning_rate": 3.359233507459481e-08, + "loss": 0.5953, + "step": 620 + }, + { + "epoch": 0.9936, + "grad_norm": 0.4316946416052306, + "learning_rate": 2.1499527803214846e-08, + "loss": 0.5302, + "step": 621 + }, + { + "epoch": 0.9952, + "grad_norm": 1.148032759175294, + "learning_rate": 1.209367398504746e-08, + "loss": 0.5889, + "step": 622 + }, + { + "epoch": 0.9968, + "grad_norm": 0.3273908378003577, + "learning_rate": 5.375026405352035e-09, + "loss": 0.5348, + "step": 623 + }, + { + "epoch": 0.9984, + "grad_norm": 0.29324728887148505, + "learning_rate": 1.3437656298687097e-09, + "loss": 0.5362, + "step": 624 + }, + { + "epoch": 1.0, + "grad_norm": 0.33246322889785346, + "learning_rate": 0.0, + "loss": 0.5623, + "step": 625 + }, + { + "epoch": 1.0, + "step": 625, + "total_flos": 617553363992576.0, + "train_loss": 0.6094743517398834, + "train_runtime": 9982.4112, + "train_samples_per_second": 1.002, + "train_steps_per_second": 0.063 + } + ], + "logging_steps": 1.0, + "max_steps": 625, + "num_input_tokens_seen": 0, + "num_train_epochs": 1, + "save_steps": 500, + "stateful_callbacks": { + "TrainerControl": { + "args": { + "should_epoch_stop": false, + "should_evaluate": false, + "should_log": false, + "should_save": true, + "should_training_stop": false + }, + "attributes": {} + } + }, + "total_flos": 617553363992576.0, + "train_batch_size": 8, + "trial_name": null, + "trial_params": null +} diff --git a/single_dataset/long_caption/VideoGameBunny_v1_1-Llama-3-8B-V-long_caption_dataset_20000_epochs_1_lora/README.md b/single_dataset/long_caption/VideoGameBunny_v1_1-Llama-3-8B-V-long_caption_dataset_20000_epochs_1_lora/README.md new file mode 100644 index 0000000000000000000000000000000000000000..4f291d8d6fdb82fcff274f83bcc52b834347797c --- /dev/null +++ b/single_dataset/long_caption/VideoGameBunny_v1_1-Llama-3-8B-V-long_caption_dataset_20000_epochs_1_lora/README.md @@ -0,0 +1,202 @@ +--- +library_name: peft +base_model: ./weights/Bunny-v1_1-Llama-3-8B-V +--- + +# Model Card for Model ID + + + + + +## Model Details + +### Model Description + + + + + +- **Developed by:** [More Information Needed] +- **Funded by [optional]:** [More Information Needed] +- **Shared by [optional]:** [More Information Needed] +- **Model type:** [More Information Needed] +- **Language(s) (NLP):** [More Information Needed] +- **License:** [More Information Needed] +- **Finetuned from model [optional]:** [More Information Needed] + +### Model Sources [optional] + + + +- **Repository:** [More Information Needed] +- **Paper [optional]:** [More Information Needed] +- **Demo [optional]:** [More Information Needed] + +## Uses + + + +### Direct Use + + + +[More Information Needed] + +### Downstream Use [optional] + + + +[More Information Needed] + +### Out-of-Scope Use + + + +[More Information Needed] + +## Bias, Risks, and Limitations + + + +[More Information Needed] + +### Recommendations + + + +Users (both direct and downstream) should be made aware of the risks, biases and limitations of the model. More information needed for further recommendations. + +## How to Get Started with the Model + +Use the code below to get started with the model. + +[More Information Needed] + +## Training Details + +### Training Data + + + +[More Information Needed] + +### Training Procedure + + + +#### Preprocessing [optional] + +[More Information Needed] + + +#### Training Hyperparameters + +- **Training regime:** [More Information Needed] + +#### Speeds, Sizes, Times [optional] + + + +[More Information Needed] + +## Evaluation + + + +### Testing Data, Factors & Metrics + +#### Testing Data + + + +[More Information Needed] + +#### Factors + + + +[More Information Needed] + +#### Metrics + + + +[More Information Needed] + +### Results + +[More Information Needed] + +#### Summary + + + +## Model Examination [optional] + + + +[More Information Needed] + +## Environmental Impact + + + +Carbon emissions can be estimated using the [Machine Learning Impact calculator](https://mlco2.github.io/impact#compute) presented in [Lacoste et al. (2019)](https://arxiv.org/abs/1910.09700). + +- **Hardware Type:** [More Information Needed] +- **Hours used:** [More Information Needed] +- **Cloud Provider:** [More Information Needed] +- **Compute Region:** [More Information Needed] +- **Carbon Emitted:** [More Information Needed] + +## Technical Specifications [optional] + +### Model Architecture and Objective + +[More Information Needed] + +### Compute Infrastructure + +[More Information Needed] + +#### Hardware + +[More Information Needed] + +#### Software + +[More Information Needed] + +## Citation [optional] + + + +**BibTeX:** + +[More Information Needed] + +**APA:** + +[More Information Needed] + +## Glossary [optional] + + + +[More Information Needed] + +## More Information [optional] + +[More Information Needed] + +## Model Card Authors [optional] + +[More Information Needed] + +## Model Card Contact + +[More Information Needed] +### Framework versions + +- PEFT 0.11.1 \ No newline at end of file diff --git a/single_dataset/long_caption/VideoGameBunny_v1_1-Llama-3-8B-V-long_caption_dataset_20000_epochs_1_lora/adapter_config.json b/single_dataset/long_caption/VideoGameBunny_v1_1-Llama-3-8B-V-long_caption_dataset_20000_epochs_1_lora/adapter_config.json new file mode 100644 index 0000000000000000000000000000000000000000..0bd037a10dfc12a3cd60f9e2e9e7b84b48b58a11 --- /dev/null +++ b/single_dataset/long_caption/VideoGameBunny_v1_1-Llama-3-8B-V-long_caption_dataset_20000_epochs_1_lora/adapter_config.json @@ -0,0 +1,34 @@ +{ + "alpha_pattern": {}, + "auto_mapping": null, + "base_model_name_or_path": "./weights/Bunny-v1_1-Llama-3-8B-V", + "bias": "none", + "fan_in_fan_out": false, + "inference_mode": true, + "init_lora_weights": true, + "layer_replication": null, + "layers_pattern": null, + "layers_to_transform": null, + "loftq_config": {}, + "lora_alpha": 256, + "lora_dropout": 0.1, + "megatron_config": null, + "megatron_core": "megatron.core", + "modules_to_save": null, + "peft_type": "LORA", + "r": 128, + "rank_pattern": {}, + "revision": null, + "target_modules": [ + "k_proj", + "v_proj", + "q_proj", + "up_proj", + "o_proj", + "down_proj", + "gate_proj" + ], + "task_type": "CAUSAL_LM", + "use_dora": false, + "use_rslora": false +} \ No newline at end of file diff --git a/single_dataset/long_caption/VideoGameBunny_v1_1-Llama-3-8B-V-long_caption_dataset_20000_epochs_1_lora/adapter_model.safetensors b/single_dataset/long_caption/VideoGameBunny_v1_1-Llama-3-8B-V-long_caption_dataset_20000_epochs_1_lora/adapter_model.safetensors new file mode 100644 index 0000000000000000000000000000000000000000..95ed70cf0151c04d096f86686598432852f2757a --- /dev/null +++ b/single_dataset/long_caption/VideoGameBunny_v1_1-Llama-3-8B-V-long_caption_dataset_20000_epochs_1_lora/adapter_model.safetensors @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:4d18065781693bb13fdd4485b21e7f60e2b9088ff799e15113a2288f346cce37 +size 671150064 diff --git a/single_dataset/long_caption/VideoGameBunny_v1_1-Llama-3-8B-V-long_caption_dataset_20000_epochs_1_lora/config.json b/single_dataset/long_caption/VideoGameBunny_v1_1-Llama-3-8B-V-long_caption_dataset_20000_epochs_1_lora/config.json new file mode 100644 index 0000000000000000000000000000000000000000..ee7ed86f4ff6f3381a6bb24981934be89aa376e7 --- /dev/null +++ b/single_dataset/long_caption/VideoGameBunny_v1_1-Llama-3-8B-V-long_caption_dataset_20000_epochs_1_lora/config.json @@ -0,0 +1,45 @@ +{ + "_name_or_path": "./weights/Bunny-v1_1-Llama-3-8B-V", + "architectures": [ + "BunnyLlamaForCausalLM" + ], + "attention_bias": false, + "attention_dropout": 0.0, + "auto_map": { + "AutoConfig": "configuration_bunny_llama.BunnyLlamaConfig", + "AutoModelForCausalLM": "modeling_bunny_llama.BunnyLlamaForCausalLM" + }, + "bos_token_id": 128000, + "continuous_training": false, + "eos_token_id": 128001, + "freeze_mm_mlp_adapter": false, + "hidden_act": "silu", + "hidden_size": 4096, + "image_aspect_ratio": "pad", + "initializer_range": 0.02, + "intermediate_size": 14336, + "max_position_embeddings": 8192, + "mm_hidden_size": 3456, + "mm_projector_lr": null, + "mm_projector_type": "mlp2x_gelu", + "mm_vision_tower": "./weights/siglip-so400m-patch14-384", + "model_type": "bunny-llama", + "num_attention_heads": 32, + "num_hidden_layers": 32, + "num_key_value_heads": 8, + "pretraining_tp": 1, + "rms_norm_eps": 1e-05, + "rope_scaling": null, + "rope_theta": 500000.0, + "tie_word_embeddings": false, + "tokenizer_model_max_length": 2048, + "tokenizer_padding_side": "right", + "torch_dtype": "float16", + "transformers_version": "4.41.2", + "tune_mm_mlp_adapter": false, + "unfreeze_vision_tower": true, + "use_cache": true, + "use_mm_proj": true, + "use_s2": true, + "vocab_size": 128256 +} diff --git a/single_dataset/long_caption/VideoGameBunny_v1_1-Llama-3-8B-V-long_caption_dataset_20000_epochs_1_lora/non_lora_trainables.bin b/single_dataset/long_caption/VideoGameBunny_v1_1-Llama-3-8B-V-long_caption_dataset_20000_epochs_1_lora/non_lora_trainables.bin new file mode 100644 index 0000000000000000000000000000000000000000..230329cf72d3a62bdb02956b5934a079996be505 --- /dev/null +++ b/single_dataset/long_caption/VideoGameBunny_v1_1-Llama-3-8B-V-long_caption_dataset_20000_epochs_1_lora/non_lora_trainables.bin @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:e580902e5072fe04b7f4500f34f52d53625f8d78936978957c8fcbdc12bfe68f +size 918507402 diff --git a/single_dataset/long_caption/VideoGameBunny_v1_1-Llama-3-8B-V-long_caption_dataset_20000_epochs_1_lora/trainer_state.json b/single_dataset/long_caption/VideoGameBunny_v1_1-Llama-3-8B-V-long_caption_dataset_20000_epochs_1_lora/trainer_state.json new file mode 100644 index 0000000000000000000000000000000000000000..5745ffc710cf2db3994d48173e15137dedaef571 --- /dev/null +++ b/single_dataset/long_caption/VideoGameBunny_v1_1-Llama-3-8B-V-long_caption_dataset_20000_epochs_1_lora/trainer_state.json @@ -0,0 +1,8792 @@ +{ + "best_metric": null, + "best_model_checkpoint": null, + "epoch": 1.0, + "eval_steps": 500, + "global_step": 1250, + "is_hyper_param_search": false, + "is_local_process_zero": true, + "is_world_process_zero": true, + "log_history": [ + { + "epoch": 0.0008, + "grad_norm": 0.7333467964438457, + "learning_rate": 5.263157894736842e-06, + "loss": 0.8541, + "step": 1 + }, + { + "epoch": 0.0016, + "grad_norm": 0.7046812923627103, + "learning_rate": 1.0526315789473684e-05, + "loss": 0.8938, + "step": 2 + }, + { + "epoch": 0.0024, + "grad_norm": 0.8023384874584811, + "learning_rate": 1.5789473684210526e-05, + "loss": 0.9444, + "step": 3 + }, + { + "epoch": 0.0032, + "grad_norm": 0.6592371105137336, + "learning_rate": 2.105263157894737e-05, + "loss": 0.879, + "step": 4 + }, + { + "epoch": 0.004, + "grad_norm": 0.5489801752269747, + "learning_rate": 2.6315789473684212e-05, + "loss": 0.8878, + "step": 5 + }, + { + "epoch": 0.0048, + "grad_norm": 0.49330057949697514, + "learning_rate": 3.157894736842105e-05, + "loss": 0.8441, + "step": 6 + }, + { + "epoch": 0.0056, + "grad_norm": 0.4514470086044233, + "learning_rate": 3.6842105263157895e-05, + "loss": 0.8222, + "step": 7 + }, + { + "epoch": 0.0064, + "grad_norm": 0.46072800657223917, + "learning_rate": 4.210526315789474e-05, + "loss": 0.8425, + "step": 8 + }, + { + "epoch": 0.0072, + "grad_norm": 0.5343209805429193, + "learning_rate": 4.736842105263158e-05, + "loss": 0.809, + "step": 9 + }, + { + "epoch": 0.008, + "grad_norm": 0.5681041941951939, + "learning_rate": 5.2631578947368424e-05, + "loss": 0.8089, + "step": 10 + }, + { + "epoch": 0.0088, + "grad_norm": 0.5984329597457433, + "learning_rate": 5.789473684210527e-05, + "loss": 0.8328, + "step": 11 + }, + { + "epoch": 0.0096, + "grad_norm": 0.5019398392065417, + "learning_rate": 6.31578947368421e-05, + "loss": 0.7657, + "step": 12 + }, + { + "epoch": 0.0104, + "grad_norm": 0.4568875591342637, + "learning_rate": 6.842105263157895e-05, + "loss": 0.813, + "step": 13 + }, + { + "epoch": 0.0112, + "grad_norm": 0.4638574433579869, + "learning_rate": 7.368421052631579e-05, + "loss": 0.7875, + "step": 14 + }, + { + "epoch": 0.012, + "grad_norm": 0.45583896124706985, + "learning_rate": 7.894736842105263e-05, + "loss": 0.733, + "step": 15 + }, + { + "epoch": 0.0128, + "grad_norm": 0.44012367968398347, + "learning_rate": 8.421052631578948e-05, + "loss": 0.774, + "step": 16 + }, + { + "epoch": 0.0136, + "grad_norm": 0.6381321163246892, + "learning_rate": 8.947368421052632e-05, + "loss": 0.7551, + "step": 17 + }, + { + "epoch": 0.0144, + "grad_norm": 0.38086721843816257, + "learning_rate": 9.473684210526316e-05, + "loss": 0.7197, + "step": 18 + }, + { + "epoch": 0.0152, + "grad_norm": 0.41242103873065866, + "learning_rate": 0.0001, + "loss": 0.7259, + "step": 19 + }, + { + "epoch": 0.016, + "grad_norm": 0.4031452967079881, + "learning_rate": 0.00010526315789473685, + "loss": 0.7345, + "step": 20 + }, + { + "epoch": 0.0168, + "grad_norm": 0.3796337899011198, + "learning_rate": 0.0001105263157894737, + "loss": 0.7308, + "step": 21 + }, + { + "epoch": 0.0176, + "grad_norm": 0.3589672408280759, + "learning_rate": 0.00011578947368421053, + "loss": 0.7252, + "step": 22 + }, + { + "epoch": 0.0184, + "grad_norm": 0.36779673876973484, + "learning_rate": 0.00012105263157894738, + "loss": 0.7553, + "step": 23 + }, + { + "epoch": 0.0192, + "grad_norm": 0.3771478046107944, + "learning_rate": 0.0001263157894736842, + "loss": 0.728, + "step": 24 + }, + { + "epoch": 0.02, + "grad_norm": 0.4059869425404555, + "learning_rate": 0.00013157894736842108, + "loss": 0.6706, + "step": 25 + }, + { + "epoch": 0.0208, + "grad_norm": 0.41816849965572295, + "learning_rate": 0.0001368421052631579, + "loss": 0.7328, + "step": 26 + }, + { + "epoch": 0.0216, + "grad_norm": 0.4083488253667742, + "learning_rate": 0.00014210526315789474, + "loss": 0.7246, + "step": 27 + }, + { + "epoch": 0.0224, + "grad_norm": 0.3806308894022481, + "learning_rate": 0.00014736842105263158, + "loss": 0.7368, + "step": 28 + }, + { + "epoch": 0.0232, + "grad_norm": 0.3686860299373134, + "learning_rate": 0.00015263157894736845, + "loss": 0.6777, + "step": 29 + }, + { + "epoch": 0.024, + "grad_norm": 0.3911767574920794, + "learning_rate": 0.00015789473684210527, + "loss": 0.7319, + "step": 30 + }, + { + "epoch": 0.0248, + "grad_norm": 0.33941469797174856, + "learning_rate": 0.0001631578947368421, + "loss": 0.6488, + "step": 31 + }, + { + "epoch": 0.0256, + "grad_norm": 0.33953284273489737, + "learning_rate": 0.00016842105263157895, + "loss": 0.6693, + "step": 32 + }, + { + "epoch": 0.0264, + "grad_norm": 0.32790137430310623, + "learning_rate": 0.0001736842105263158, + "loss": 0.6598, + "step": 33 + }, + { + "epoch": 0.0272, + "grad_norm": 0.366265210647376, + "learning_rate": 0.00017894736842105264, + "loss": 0.7223, + "step": 34 + }, + { + "epoch": 0.028, + "grad_norm": 0.3710081737672853, + "learning_rate": 0.00018421052631578948, + "loss": 0.6841, + "step": 35 + }, + { + "epoch": 0.0288, + "grad_norm": 0.3662173246501666, + "learning_rate": 0.00018947368421052632, + "loss": 0.6701, + "step": 36 + }, + { + "epoch": 0.0296, + "grad_norm": 0.38472592055372573, + "learning_rate": 0.00019473684210526317, + "loss": 0.7235, + "step": 37 + }, + { + "epoch": 0.0304, + "grad_norm": 0.33667608978973407, + "learning_rate": 0.0002, + "loss": 0.685, + "step": 38 + }, + { + "epoch": 0.0312, + "grad_norm": 0.3245830016683275, + "learning_rate": 0.00019999966405802826, + "loss": 0.6846, + "step": 39 + }, + { + "epoch": 0.032, + "grad_norm": 0.3622644699856565, + "learning_rate": 0.00019999865623437013, + "loss": 0.6687, + "step": 40 + }, + { + "epoch": 0.0328, + "grad_norm": 0.38426542618145665, + "learning_rate": 0.00019999697653579705, + "loss": 0.7106, + "step": 41 + }, + { + "epoch": 0.0336, + "grad_norm": 0.35786200143014124, + "learning_rate": 0.00019999462497359466, + "loss": 0.7038, + "step": 42 + }, + { + "epoch": 0.0344, + "grad_norm": 0.4612420884227193, + "learning_rate": 0.0001999916015635627, + "loss": 0.7225, + "step": 43 + }, + { + "epoch": 0.0352, + "grad_norm": 0.4006770487146382, + "learning_rate": 0.00019998790632601496, + "loss": 0.7333, + "step": 44 + }, + { + "epoch": 0.036, + "grad_norm": 0.3733574872750548, + "learning_rate": 0.00019998353928577919, + "loss": 0.7105, + "step": 45 + }, + { + "epoch": 0.0368, + "grad_norm": 0.3448576640940017, + "learning_rate": 0.0001999785004721968, + "loss": 0.6873, + "step": 46 + }, + { + "epoch": 0.0376, + "grad_norm": 0.3543916842716417, + "learning_rate": 0.0001999727899191228, + "loss": 0.7174, + "step": 47 + }, + { + "epoch": 0.0384, + "grad_norm": 0.43857513510849166, + "learning_rate": 0.00019996640766492543, + "loss": 0.706, + "step": 48 + }, + { + "epoch": 0.0392, + "grad_norm": 0.3789729781625536, + "learning_rate": 0.00019995935375248606, + "loss": 0.657, + "step": 49 + }, + { + "epoch": 0.04, + "grad_norm": 0.3577421848104737, + "learning_rate": 0.00019995162822919883, + "loss": 0.6551, + "step": 50 + }, + { + "epoch": 0.0408, + "grad_norm": 0.3729045151309279, + "learning_rate": 0.00019994323114697022, + "loss": 0.6635, + "step": 51 + }, + { + "epoch": 0.0416, + "grad_norm": 0.36050892352274594, + "learning_rate": 0.00019993416256221895, + "loss": 0.6734, + "step": 52 + }, + { + "epoch": 0.0424, + "grad_norm": 0.33619674572344266, + "learning_rate": 0.0001999244225358753, + "loss": 0.6697, + "step": 53 + }, + { + "epoch": 0.0432, + "grad_norm": 0.3511748484935949, + "learning_rate": 0.00019991401113338104, + "loss": 0.6932, + "step": 54 + }, + { + "epoch": 0.044, + "grad_norm": 0.3338714829275817, + "learning_rate": 0.00019990292842468868, + "loss": 0.679, + "step": 55 + }, + { + "epoch": 0.0448, + "grad_norm": 0.3546683605903518, + "learning_rate": 0.00019989117448426108, + "loss": 0.6951, + "step": 56 + }, + { + "epoch": 0.0456, + "grad_norm": 0.34590425068052655, + "learning_rate": 0.0001998787493910712, + "loss": 0.6999, + "step": 57 + }, + { + "epoch": 0.0464, + "grad_norm": 0.34937606227550067, + "learning_rate": 0.00019986565322860115, + "loss": 0.6687, + "step": 58 + }, + { + "epoch": 0.0472, + "grad_norm": 0.3401995704397724, + "learning_rate": 0.000199851886084842, + "loss": 0.6405, + "step": 59 + }, + { + "epoch": 0.048, + "grad_norm": 0.38521477320962205, + "learning_rate": 0.00019983744805229296, + "loss": 0.6971, + "step": 60 + }, + { + "epoch": 0.0488, + "grad_norm": 0.36610187767563834, + "learning_rate": 0.00019982233922796085, + "loss": 0.7103, + "step": 61 + }, + { + "epoch": 0.0496, + "grad_norm": 0.36747178830589144, + "learning_rate": 0.00019980655971335945, + "loss": 0.6695, + "step": 62 + }, + { + "epoch": 0.0504, + "grad_norm": 0.36260599210373223, + "learning_rate": 0.00019979010961450878, + "loss": 0.7015, + "step": 63 + }, + { + "epoch": 0.0512, + "grad_norm": 0.34234793606854774, + "learning_rate": 0.00019977298904193437, + "loss": 0.6397, + "step": 64 + }, + { + "epoch": 0.052, + "grad_norm": 0.3502511952025378, + "learning_rate": 0.00019975519811066663, + "loss": 0.67, + "step": 65 + }, + { + "epoch": 0.0528, + "grad_norm": 0.3654449065132795, + "learning_rate": 0.00019973673694024, + "loss": 0.719, + "step": 66 + }, + { + "epoch": 0.0536, + "grad_norm": 0.3789789626981662, + "learning_rate": 0.0001997176056546921, + "loss": 0.7189, + "step": 67 + }, + { + "epoch": 0.0544, + "grad_norm": 0.3315091880490501, + "learning_rate": 0.00019969780438256293, + "loss": 0.6778, + "step": 68 + }, + { + "epoch": 0.0552, + "grad_norm": 0.3786473692725525, + "learning_rate": 0.0001996773332568941, + "loss": 0.6965, + "step": 69 + }, + { + "epoch": 0.056, + "grad_norm": 0.36763485908472676, + "learning_rate": 0.0001996561924152278, + "loss": 0.6963, + "step": 70 + }, + { + "epoch": 0.0568, + "grad_norm": 0.35592546585794316, + "learning_rate": 0.00019963438199960599, + "loss": 0.6586, + "step": 71 + }, + { + "epoch": 0.0576, + "grad_norm": 0.34593530293633556, + "learning_rate": 0.0001996119021565693, + "loss": 0.71, + "step": 72 + }, + { + "epoch": 0.0584, + "grad_norm": 0.31652977117815895, + "learning_rate": 0.00019958875303715615, + "loss": 0.6175, + "step": 73 + }, + { + "epoch": 0.0592, + "grad_norm": 0.3279341370437573, + "learning_rate": 0.0001995649347969019, + "loss": 0.6442, + "step": 74 + }, + { + "epoch": 0.06, + "grad_norm": 0.30589567213978114, + "learning_rate": 0.0001995404475958373, + "loss": 0.6053, + "step": 75 + }, + { + "epoch": 0.0608, + "grad_norm": 0.37411511157136124, + "learning_rate": 0.00019951529159848805, + "loss": 0.6604, + "step": 76 + }, + { + "epoch": 0.0616, + "grad_norm": 0.33254333104980854, + "learning_rate": 0.0001994894669738732, + "loss": 0.6504, + "step": 77 + }, + { + "epoch": 0.0624, + "grad_norm": 0.34114993306366115, + "learning_rate": 0.00019946297389550433, + "loss": 0.6446, + "step": 78 + }, + { + "epoch": 0.0632, + "grad_norm": 0.3660399567176618, + "learning_rate": 0.0001994358125413841, + "loss": 0.654, + "step": 79 + }, + { + "epoch": 0.064, + "grad_norm": 0.3401691894450481, + "learning_rate": 0.00019940798309400526, + "loss": 0.6689, + "step": 80 + }, + { + "epoch": 0.0648, + "grad_norm": 0.382049598789673, + "learning_rate": 0.0001993794857403495, + "loss": 0.7018, + "step": 81 + }, + { + "epoch": 0.0656, + "grad_norm": 0.37980106479306164, + "learning_rate": 0.0001993503206718859, + "loss": 0.6632, + "step": 82 + }, + { + "epoch": 0.0664, + "grad_norm": 0.4024825236670562, + "learning_rate": 0.0001993204880845699, + "loss": 0.6724, + "step": 83 + }, + { + "epoch": 0.0672, + "grad_norm": 0.35981220589930224, + "learning_rate": 0.00019928998817884182, + "loss": 0.6421, + "step": 84 + }, + { + "epoch": 0.068, + "grad_norm": 0.3290097261427852, + "learning_rate": 0.00019925882115962568, + "loss": 0.661, + "step": 85 + }, + { + "epoch": 0.0688, + "grad_norm": 0.3512867136385803, + "learning_rate": 0.00019922698723632767, + "loss": 0.7121, + "step": 86 + }, + { + "epoch": 0.0696, + "grad_norm": 0.3223343141825949, + "learning_rate": 0.00019919448662283478, + "loss": 0.6741, + "step": 87 + }, + { + "epoch": 0.0704, + "grad_norm": 0.33715632997441913, + "learning_rate": 0.00019916131953751342, + "loss": 0.6758, + "step": 88 + }, + { + "epoch": 0.0712, + "grad_norm": 0.33980240202859846, + "learning_rate": 0.00019912748620320794, + "loss": 0.6961, + "step": 89 + }, + { + "epoch": 0.072, + "grad_norm": 0.33840677725516743, + "learning_rate": 0.00019909298684723904, + "loss": 0.6802, + "step": 90 + }, + { + "epoch": 0.0728, + "grad_norm": 0.34861377177855213, + "learning_rate": 0.00019905782170140238, + "loss": 0.6518, + "step": 91 + }, + { + "epoch": 0.0736, + "grad_norm": 0.34992870540252374, + "learning_rate": 0.00019902199100196697, + "loss": 0.6477, + "step": 92 + }, + { + "epoch": 0.0744, + "grad_norm": 0.38417763225511253, + "learning_rate": 0.00019898549498967343, + "loss": 0.6532, + "step": 93 + }, + { + "epoch": 0.0752, + "grad_norm": 0.36598538222084, + "learning_rate": 0.00019894833390973266, + "loss": 0.6674, + "step": 94 + }, + { + "epoch": 0.076, + "grad_norm": 0.3142884383546262, + "learning_rate": 0.000198910508011824, + "loss": 0.6046, + "step": 95 + }, + { + "epoch": 0.0768, + "grad_norm": 0.3538134463621248, + "learning_rate": 0.00019887201755009357, + "loss": 0.6937, + "step": 96 + }, + { + "epoch": 0.0776, + "grad_norm": 0.31998281722492944, + "learning_rate": 0.00019883286278315262, + "loss": 0.6391, + "step": 97 + }, + { + "epoch": 0.0784, + "grad_norm": 0.36592875585738904, + "learning_rate": 0.0001987930439740757, + "loss": 0.6818, + "step": 98 + }, + { + "epoch": 0.0792, + "grad_norm": 0.362517266127021, + "learning_rate": 0.00019875256139039902, + "loss": 0.6487, + "step": 99 + }, + { + "epoch": 0.08, + "grad_norm": 0.32690240419691086, + "learning_rate": 0.00019871141530411853, + "loss": 0.644, + "step": 100 + }, + { + "epoch": 0.0808, + "grad_norm": 0.31337521433189724, + "learning_rate": 0.00019866960599168826, + "loss": 0.6405, + "step": 101 + }, + { + "epoch": 0.0816, + "grad_norm": 0.3536448102446704, + "learning_rate": 0.0001986271337340182, + "loss": 0.6443, + "step": 102 + }, + { + "epoch": 0.0824, + "grad_norm": 0.3838886237343212, + "learning_rate": 0.0001985839988164726, + "loss": 0.6911, + "step": 103 + }, + { + "epoch": 0.0832, + "grad_norm": 0.34521287318071664, + "learning_rate": 0.00019854020152886814, + "loss": 0.6639, + "step": 104 + }, + { + "epoch": 0.084, + "grad_norm": 0.3885827888518703, + "learning_rate": 0.00019849574216547171, + "loss": 0.6821, + "step": 105 + }, + { + "epoch": 0.0848, + "grad_norm": 0.3774208870941129, + "learning_rate": 0.0001984506210249986, + "loss": 0.6477, + "step": 106 + }, + { + "epoch": 0.0856, + "grad_norm": 0.3487294577299662, + "learning_rate": 0.00019840483841061058, + "loss": 0.6402, + "step": 107 + }, + { + "epoch": 0.0864, + "grad_norm": 0.3675667985641538, + "learning_rate": 0.00019835839462991361, + "loss": 0.6411, + "step": 108 + }, + { + "epoch": 0.0872, + "grad_norm": 0.3858224565215601, + "learning_rate": 0.00019831128999495606, + "loss": 0.6396, + "step": 109 + }, + { + "epoch": 0.088, + "grad_norm": 0.3312217529929406, + "learning_rate": 0.00019826352482222638, + "loss": 0.6141, + "step": 110 + }, + { + "epoch": 0.0888, + "grad_norm": 0.334488409574564, + "learning_rate": 0.0001982150994326511, + "loss": 0.6182, + "step": 111 + }, + { + "epoch": 0.0896, + "grad_norm": 0.32561586510457413, + "learning_rate": 0.00019816601415159263, + "loss": 0.6181, + "step": 112 + }, + { + "epoch": 0.0904, + "grad_norm": 0.35314824041217946, + "learning_rate": 0.0001981162693088471, + "loss": 0.6561, + "step": 113 + }, + { + "epoch": 0.0912, + "grad_norm": 0.40543110997691756, + "learning_rate": 0.0001980658652386421, + "loss": 0.6565, + "step": 114 + }, + { + "epoch": 0.092, + "grad_norm": 0.34572707660144664, + "learning_rate": 0.0001980148022796345, + "loss": 0.635, + "step": 115 + }, + { + "epoch": 0.0928, + "grad_norm": 0.41880695361859205, + "learning_rate": 0.00019796308077490817, + "loss": 0.8094, + "step": 116 + }, + { + "epoch": 0.0936, + "grad_norm": 0.4314435242035476, + "learning_rate": 0.00019791070107197153, + "loss": 0.6529, + "step": 117 + }, + { + "epoch": 0.0944, + "grad_norm": 0.3458276721677694, + "learning_rate": 0.00019785766352275542, + "loss": 0.6085, + "step": 118 + }, + { + "epoch": 0.0952, + "grad_norm": 0.345832220010251, + "learning_rate": 0.0001978039684836106, + "loss": 0.6492, + "step": 119 + }, + { + "epoch": 0.096, + "grad_norm": 0.3832504293765825, + "learning_rate": 0.00019774961631530545, + "loss": 0.6623, + "step": 120 + }, + { + "epoch": 0.0968, + "grad_norm": 0.36633565364216164, + "learning_rate": 0.0001976946073830234, + "loss": 0.6423, + "step": 121 + }, + { + "epoch": 0.0976, + "grad_norm": 0.3829393918649457, + "learning_rate": 0.00019763894205636072, + "loss": 0.6464, + "step": 122 + }, + { + "epoch": 0.0984, + "grad_norm": 0.34672910506272275, + "learning_rate": 0.00019758262070932375, + "loss": 0.6458, + "step": 123 + }, + { + "epoch": 0.0992, + "grad_norm": 0.3852087511837911, + "learning_rate": 0.00019752564372032657, + "loss": 0.6585, + "step": 124 + }, + { + "epoch": 0.1, + "grad_norm": 0.33913584509396805, + "learning_rate": 0.00019746801147218842, + "loss": 0.6459, + "step": 125 + }, + { + "epoch": 0.1008, + "grad_norm": 0.3478104764584854, + "learning_rate": 0.00019740972435213115, + "loss": 0.6529, + "step": 126 + }, + { + "epoch": 0.1016, + "grad_norm": 0.38015962327569774, + "learning_rate": 0.00019735078275177654, + "loss": 0.656, + "step": 127 + }, + { + "epoch": 0.1024, + "grad_norm": 0.31846132254752074, + "learning_rate": 0.00019729118706714375, + "loss": 0.6191, + "step": 128 + }, + { + "epoch": 0.1032, + "grad_norm": 0.3090484661720633, + "learning_rate": 0.00019723093769864663, + "loss": 0.6191, + "step": 129 + }, + { + "epoch": 0.104, + "grad_norm": 0.3309332991113378, + "learning_rate": 0.00019717003505109095, + "loss": 0.6376, + "step": 130 + }, + { + "epoch": 0.1048, + "grad_norm": 0.34607157007406497, + "learning_rate": 0.0001971084795336719, + "loss": 0.656, + "step": 131 + }, + { + "epoch": 0.1056, + "grad_norm": 0.34154025516290404, + "learning_rate": 0.00019704627155997108, + "loss": 0.6613, + "step": 132 + }, + { + "epoch": 0.1064, + "grad_norm": 0.34621999136149534, + "learning_rate": 0.00019698341154795389, + "loss": 0.663, + "step": 133 + }, + { + "epoch": 0.1072, + "grad_norm": 0.31938959567418773, + "learning_rate": 0.00019691989991996663, + "loss": 0.6392, + "step": 134 + }, + { + "epoch": 0.108, + "grad_norm": 0.354949769989008, + "learning_rate": 0.00019685573710273376, + "loss": 0.6416, + "step": 135 + }, + { + "epoch": 0.1088, + "grad_norm": 0.38189814518037746, + "learning_rate": 0.0001967909235273549, + "loss": 0.6921, + "step": 136 + }, + { + "epoch": 0.1096, + "grad_norm": 0.3659263446214391, + "learning_rate": 0.00019672545962930215, + "loss": 0.6499, + "step": 137 + }, + { + "epoch": 0.1104, + "grad_norm": 0.3328330282062205, + "learning_rate": 0.00019665934584841682, + "loss": 0.6315, + "step": 138 + }, + { + "epoch": 0.1112, + "grad_norm": 0.32116564042899076, + "learning_rate": 0.00019659258262890683, + "loss": 0.6258, + "step": 139 + }, + { + "epoch": 0.112, + "grad_norm": 0.37785042372798666, + "learning_rate": 0.00019652517041934356, + "loss": 0.6576, + "step": 140 + }, + { + "epoch": 0.1128, + "grad_norm": 0.3094529863079494, + "learning_rate": 0.00019645710967265882, + "loss": 0.613, + "step": 141 + }, + { + "epoch": 0.1136, + "grad_norm": 0.3137324032253774, + "learning_rate": 0.00019638840084614182, + "loss": 0.6249, + "step": 142 + }, + { + "epoch": 0.1144, + "grad_norm": 0.38180488121497574, + "learning_rate": 0.00019631904440143612, + "loss": 0.6766, + "step": 143 + }, + { + "epoch": 0.1152, + "grad_norm": 0.3644983311374808, + "learning_rate": 0.00019624904080453655, + "loss": 0.6301, + "step": 144 + }, + { + "epoch": 0.116, + "grad_norm": 0.3317008198946184, + "learning_rate": 0.00019617839052578603, + "loss": 0.6577, + "step": 145 + }, + { + "epoch": 0.1168, + "grad_norm": 0.3694225450245007, + "learning_rate": 0.00019610709403987246, + "loss": 0.6587, + "step": 146 + }, + { + "epoch": 0.1176, + "grad_norm": 0.3828605568034377, + "learning_rate": 0.0001960351518258255, + "loss": 0.6622, + "step": 147 + }, + { + "epoch": 0.1184, + "grad_norm": 0.36181988330545833, + "learning_rate": 0.00019596256436701324, + "loss": 0.6683, + "step": 148 + }, + { + "epoch": 0.1192, + "grad_norm": 0.35047126739354484, + "learning_rate": 0.00019588933215113926, + "loss": 0.6871, + "step": 149 + }, + { + "epoch": 0.12, + "grad_norm": 0.3308112712054644, + "learning_rate": 0.000195815455670239, + "loss": 0.6832, + "step": 150 + }, + { + "epoch": 0.1208, + "grad_norm": 0.33134319006255447, + "learning_rate": 0.00019574093542067673, + "loss": 0.6696, + "step": 151 + }, + { + "epoch": 0.1216, + "grad_norm": 0.34666324653052366, + "learning_rate": 0.00019566577190314197, + "loss": 0.6725, + "step": 152 + }, + { + "epoch": 0.1224, + "grad_norm": 0.36207621794732525, + "learning_rate": 0.0001955899656226464, + "loss": 0.6732, + "step": 153 + }, + { + "epoch": 0.1232, + "grad_norm": 0.33889345106741003, + "learning_rate": 0.0001955135170885202, + "loss": 0.651, + "step": 154 + }, + { + "epoch": 0.124, + "grad_norm": 0.35159449636801615, + "learning_rate": 0.0001954364268144088, + "loss": 0.6571, + "step": 155 + }, + { + "epoch": 0.1248, + "grad_norm": 0.35584438905062943, + "learning_rate": 0.00019535869531826937, + "loss": 0.6717, + "step": 156 + }, + { + "epoch": 0.1256, + "grad_norm": 0.35199238498112145, + "learning_rate": 0.00019528032312236736, + "loss": 0.6468, + "step": 157 + }, + { + "epoch": 0.1264, + "grad_norm": 0.31843385563990295, + "learning_rate": 0.00019520131075327298, + "loss": 0.5959, + "step": 158 + }, + { + "epoch": 0.1272, + "grad_norm": 0.3740596945424377, + "learning_rate": 0.00019512165874185767, + "loss": 0.6538, + "step": 159 + }, + { + "epoch": 0.128, + "grad_norm": 0.3550232628004775, + "learning_rate": 0.00019504136762329047, + "loss": 0.6348, + "step": 160 + }, + { + "epoch": 0.1288, + "grad_norm": 0.3486879260513687, + "learning_rate": 0.0001949604379370345, + "loss": 0.629, + "step": 161 + }, + { + "epoch": 0.1296, + "grad_norm": 0.38571634904297875, + "learning_rate": 0.00019487887022684336, + "loss": 0.6244, + "step": 162 + }, + { + "epoch": 0.1304, + "grad_norm": 0.35061083378189084, + "learning_rate": 0.00019479666504075736, + "loss": 0.6643, + "step": 163 + }, + { + "epoch": 0.1312, + "grad_norm": 0.33223226481959944, + "learning_rate": 0.00019471382293110003, + "loss": 0.6149, + "step": 164 + }, + { + "epoch": 0.132, + "grad_norm": 0.32043302030456344, + "learning_rate": 0.0001946303444544741, + "loss": 0.6205, + "step": 165 + }, + { + "epoch": 0.1328, + "grad_norm": 0.32250710950305245, + "learning_rate": 0.00019454623017175812, + "loss": 0.631, + "step": 166 + }, + { + "epoch": 0.1336, + "grad_norm": 0.3171309934126043, + "learning_rate": 0.00019446148064810242, + "loss": 0.6219, + "step": 167 + }, + { + "epoch": 0.1344, + "grad_norm": 0.34830888209551053, + "learning_rate": 0.00019437609645292546, + "loss": 0.6399, + "step": 168 + }, + { + "epoch": 0.1352, + "grad_norm": 0.34760275654822936, + "learning_rate": 0.00019429007815990993, + "loss": 0.6128, + "step": 169 + }, + { + "epoch": 0.136, + "grad_norm": 0.33888876826964004, + "learning_rate": 0.0001942034263469989, + "loss": 0.6224, + "step": 170 + }, + { + "epoch": 0.1368, + "grad_norm": 0.35209782663951905, + "learning_rate": 0.00019411614159639204, + "loss": 0.628, + "step": 171 + }, + { + "epoch": 0.1376, + "grad_norm": 0.33820840008987435, + "learning_rate": 0.00019402822449454153, + "loss": 0.6359, + "step": 172 + }, + { + "epoch": 0.1384, + "grad_norm": 0.3664340083977849, + "learning_rate": 0.00019393967563214833, + "loss": 0.6673, + "step": 173 + }, + { + "epoch": 0.1392, + "grad_norm": 0.34094112806828475, + "learning_rate": 0.00019385049560415794, + "loss": 0.6589, + "step": 174 + }, + { + "epoch": 0.14, + "grad_norm": 0.33821570267165935, + "learning_rate": 0.00019376068500975667, + "loss": 0.6574, + "step": 175 + }, + { + "epoch": 0.1408, + "grad_norm": 0.3186428146376564, + "learning_rate": 0.00019367024445236754, + "loss": 0.5942, + "step": 176 + }, + { + "epoch": 0.1416, + "grad_norm": 0.3451631464189648, + "learning_rate": 0.000193579174539646, + "loss": 0.6211, + "step": 177 + }, + { + "epoch": 0.1424, + "grad_norm": 0.35343049372918955, + "learning_rate": 0.00019348747588347637, + "loss": 0.6467, + "step": 178 + }, + { + "epoch": 0.1432, + "grad_norm": 0.36376785759597074, + "learning_rate": 0.00019339514909996706, + "loss": 0.6669, + "step": 179 + }, + { + "epoch": 0.144, + "grad_norm": 0.35064007261406416, + "learning_rate": 0.00019330219480944694, + "loss": 0.6569, + "step": 180 + }, + { + "epoch": 0.1448, + "grad_norm": 0.30431346771152046, + "learning_rate": 0.00019320861363646095, + "loss": 0.5859, + "step": 181 + }, + { + "epoch": 0.1456, + "grad_norm": 0.3311512501444353, + "learning_rate": 0.00019311440620976597, + "loss": 0.6157, + "step": 182 + }, + { + "epoch": 0.1464, + "grad_norm": 0.34243549624437286, + "learning_rate": 0.00019301957316232658, + "loss": 0.6625, + "step": 183 + }, + { + "epoch": 0.1472, + "grad_norm": 0.3410091522230089, + "learning_rate": 0.0001929241151313108, + "loss": 0.6946, + "step": 184 + }, + { + "epoch": 0.148, + "grad_norm": 0.3421852451349417, + "learning_rate": 0.0001928280327580858, + "loss": 0.634, + "step": 185 + }, + { + "epoch": 0.1488, + "grad_norm": 0.32865521453219587, + "learning_rate": 0.00019273132668821364, + "loss": 0.6529, + "step": 186 + }, + { + "epoch": 0.1496, + "grad_norm": 0.34062067626916304, + "learning_rate": 0.00019263399757144683, + "loss": 0.6695, + "step": 187 + }, + { + "epoch": 0.1504, + "grad_norm": 0.33536717311969466, + "learning_rate": 0.00019253604606172417, + "loss": 0.6457, + "step": 188 + }, + { + "epoch": 0.1512, + "grad_norm": 0.33990576364220726, + "learning_rate": 0.000192437472817166, + "loss": 0.6762, + "step": 189 + }, + { + "epoch": 0.152, + "grad_norm": 0.32843308367784046, + "learning_rate": 0.00019233827850007027, + "loss": 0.6631, + "step": 190 + }, + { + "epoch": 0.1528, + "grad_norm": 0.33223961737400787, + "learning_rate": 0.00019223846377690754, + "loss": 0.6338, + "step": 191 + }, + { + "epoch": 0.1536, + "grad_norm": 0.35501297943771504, + "learning_rate": 0.00019213802931831696, + "loss": 0.6236, + "step": 192 + }, + { + "epoch": 0.1544, + "grad_norm": 0.3430432408972357, + "learning_rate": 0.00019203697579910154, + "loss": 0.6365, + "step": 193 + }, + { + "epoch": 0.1552, + "grad_norm": 0.38476315719148757, + "learning_rate": 0.00019193530389822363, + "loss": 0.7256, + "step": 194 + }, + { + "epoch": 0.156, + "grad_norm": 0.3346137083977566, + "learning_rate": 0.00019183301429880043, + "loss": 0.6229, + "step": 195 + }, + { + "epoch": 0.1568, + "grad_norm": 0.3082807963901158, + "learning_rate": 0.00019173010768809933, + "loss": 0.6043, + "step": 196 + }, + { + "epoch": 0.1576, + "grad_norm": 0.3136974323360699, + "learning_rate": 0.00019162658475753327, + "loss": 0.6522, + "step": 197 + }, + { + "epoch": 0.1584, + "grad_norm": 0.3572327998101803, + "learning_rate": 0.0001915224462026563, + "loss": 0.6599, + "step": 198 + }, + { + "epoch": 0.1592, + "grad_norm": 0.3290643907110654, + "learning_rate": 0.00019141769272315858, + "loss": 0.6367, + "step": 199 + }, + { + "epoch": 0.16, + "grad_norm": 0.3668339281339755, + "learning_rate": 0.00019131232502286188, + "loss": 0.6643, + "step": 200 + }, + { + "epoch": 0.1608, + "grad_norm": 0.3216040123384671, + "learning_rate": 0.00019120634380971496, + "loss": 0.5818, + "step": 201 + }, + { + "epoch": 0.1616, + "grad_norm": 0.3381108626702672, + "learning_rate": 0.0001910997497957885, + "loss": 0.628, + "step": 202 + }, + { + "epoch": 0.1624, + "grad_norm": 0.3309441524962154, + "learning_rate": 0.0001909925436972706, + "loss": 0.5479, + "step": 203 + }, + { + "epoch": 0.1632, + "grad_norm": 0.362723924528029, + "learning_rate": 0.00019088472623446183, + "loss": 0.6453, + "step": 204 + }, + { + "epoch": 0.164, + "grad_norm": 0.3627063433665466, + "learning_rate": 0.00019077629813177036, + "loss": 0.6117, + "step": 205 + }, + { + "epoch": 0.1648, + "grad_norm": 0.3305218390152876, + "learning_rate": 0.00019066726011770726, + "loss": 0.6173, + "step": 206 + }, + { + "epoch": 0.1656, + "grad_norm": 0.33151598385948866, + "learning_rate": 0.00019055761292488142, + "loss": 0.617, + "step": 207 + }, + { + "epoch": 0.1664, + "grad_norm": 0.34691784822633, + "learning_rate": 0.0001904473572899947, + "loss": 0.5999, + "step": 208 + }, + { + "epoch": 0.1672, + "grad_norm": 0.2968593137797099, + "learning_rate": 0.00019033649395383702, + "loss": 0.5524, + "step": 209 + }, + { + "epoch": 0.168, + "grad_norm": 0.3033544821112245, + "learning_rate": 0.00019022502366128135, + "loss": 0.5948, + "step": 210 + }, + { + "epoch": 0.1688, + "grad_norm": 0.31610195297938865, + "learning_rate": 0.00019011294716127867, + "loss": 0.6322, + "step": 211 + }, + { + "epoch": 0.1696, + "grad_norm": 0.34840628740571494, + "learning_rate": 0.00019000026520685302, + "loss": 0.625, + "step": 212 + }, + { + "epoch": 0.1704, + "grad_norm": 0.3528847028604991, + "learning_rate": 0.0001898869785550963, + "loss": 0.6272, + "step": 213 + }, + { + "epoch": 0.1712, + "grad_norm": 0.34476033038334497, + "learning_rate": 0.0001897730879671634, + "loss": 0.6307, + "step": 214 + }, + { + "epoch": 0.172, + "grad_norm": 0.3090337292429833, + "learning_rate": 0.00018965859420826684, + "loss": 0.5959, + "step": 215 + }, + { + "epoch": 0.1728, + "grad_norm": 0.31740805110292, + "learning_rate": 0.00018954349804767184, + "loss": 0.5934, + "step": 216 + }, + { + "epoch": 0.1736, + "grad_norm": 0.3424042297072991, + "learning_rate": 0.00018942780025869098, + "loss": 0.6204, + "step": 217 + }, + { + "epoch": 0.1744, + "grad_norm": 0.3616309533366346, + "learning_rate": 0.00018931150161867916, + "loss": 0.6341, + "step": 218 + }, + { + "epoch": 0.1752, + "grad_norm": 0.32840944549521617, + "learning_rate": 0.00018919460290902826, + "loss": 0.6287, + "step": 219 + }, + { + "epoch": 0.176, + "grad_norm": 0.33634508751845704, + "learning_rate": 0.00018907710491516199, + "loss": 0.6501, + "step": 220 + }, + { + "epoch": 0.1768, + "grad_norm": 0.345376073506382, + "learning_rate": 0.0001889590084265304, + "loss": 0.675, + "step": 221 + }, + { + "epoch": 0.1776, + "grad_norm": 0.321779898847676, + "learning_rate": 0.0001888403142366049, + "loss": 0.6224, + "step": 222 + }, + { + "epoch": 0.1784, + "grad_norm": 0.3317479094471431, + "learning_rate": 0.0001887210231428727, + "loss": 0.5951, + "step": 223 + }, + { + "epoch": 0.1792, + "grad_norm": 0.32745902841744945, + "learning_rate": 0.00018860113594683148, + "loss": 0.6338, + "step": 224 + }, + { + "epoch": 0.18, + "grad_norm": 0.3345831834056288, + "learning_rate": 0.0001884806534539841, + "loss": 0.5968, + "step": 225 + }, + { + "epoch": 0.1808, + "grad_norm": 0.40885289247034085, + "learning_rate": 0.00018835957647383303, + "loss": 0.6411, + "step": 226 + }, + { + "epoch": 0.1816, + "grad_norm": 0.30078215781686657, + "learning_rate": 0.0001882379058198751, + "loss": 0.5971, + "step": 227 + }, + { + "epoch": 0.1824, + "grad_norm": 0.32460199442321497, + "learning_rate": 0.00018811564230959588, + "loss": 0.6376, + "step": 228 + }, + { + "epoch": 0.1832, + "grad_norm": 0.3533886940677027, + "learning_rate": 0.00018799278676446423, + "loss": 0.6903, + "step": 229 + }, + { + "epoch": 0.184, + "grad_norm": 0.3374240648532365, + "learning_rate": 0.00018786934000992688, + "loss": 0.6594, + "step": 230 + }, + { + "epoch": 0.1848, + "grad_norm": 0.3386338355930694, + "learning_rate": 0.00018774530287540278, + "loss": 0.641, + "step": 231 + }, + { + "epoch": 0.1856, + "grad_norm": 0.3649927062956018, + "learning_rate": 0.00018762067619427746, + "loss": 0.6367, + "step": 232 + }, + { + "epoch": 0.1864, + "grad_norm": 0.32354185562358234, + "learning_rate": 0.00018749546080389757, + "loss": 0.6185, + "step": 233 + }, + { + "epoch": 0.1872, + "grad_norm": 0.34540969342540034, + "learning_rate": 0.00018736965754556528, + "loss": 0.6427, + "step": 234 + }, + { + "epoch": 0.188, + "grad_norm": 0.3367172613058143, + "learning_rate": 0.00018724326726453244, + "loss": 0.6488, + "step": 235 + }, + { + "epoch": 0.1888, + "grad_norm": 0.3248751260302771, + "learning_rate": 0.00018711629080999504, + "loss": 0.5906, + "step": 236 + }, + { + "epoch": 0.1896, + "grad_norm": 0.3343029055249699, + "learning_rate": 0.00018698872903508755, + "loss": 0.6313, + "step": 237 + }, + { + "epoch": 0.1904, + "grad_norm": 0.36348786188614796, + "learning_rate": 0.00018686058279687698, + "loss": 0.6579, + "step": 238 + }, + { + "epoch": 0.1912, + "grad_norm": 0.3381220559557464, + "learning_rate": 0.0001867318529563574, + "loss": 0.581, + "step": 239 + }, + { + "epoch": 0.192, + "grad_norm": 0.3726975314977943, + "learning_rate": 0.00018660254037844388, + "loss": 0.6073, + "step": 240 + }, + { + "epoch": 0.1928, + "grad_norm": 0.32277931313111685, + "learning_rate": 0.00018647264593196688, + "loss": 0.6089, + "step": 241 + }, + { + "epoch": 0.1936, + "grad_norm": 0.31838728825263063, + "learning_rate": 0.00018634217048966637, + "loss": 0.5844, + "step": 242 + }, + { + "epoch": 0.1944, + "grad_norm": 0.3378246630597595, + "learning_rate": 0.00018621111492818585, + "loss": 0.6187, + "step": 243 + }, + { + "epoch": 0.1952, + "grad_norm": 0.28655347766899164, + "learning_rate": 0.0001860794801280666, + "loss": 0.5831, + "step": 244 + }, + { + "epoch": 0.196, + "grad_norm": 0.305738884397888, + "learning_rate": 0.00018594726697374175, + "loss": 0.5803, + "step": 245 + }, + { + "epoch": 0.1968, + "grad_norm": 0.33699359311357896, + "learning_rate": 0.0001858144763535302, + "loss": 0.5935, + "step": 246 + }, + { + "epoch": 0.1976, + "grad_norm": 0.31150735826778087, + "learning_rate": 0.0001856811091596308, + "loss": 0.641, + "step": 247 + }, + { + "epoch": 0.1984, + "grad_norm": 0.3054786148140765, + "learning_rate": 0.0001855471662881164, + "loss": 0.6593, + "step": 248 + }, + { + "epoch": 0.1992, + "grad_norm": 0.34948532216662775, + "learning_rate": 0.00018541264863892754, + "loss": 0.5906, + "step": 249 + }, + { + "epoch": 0.2, + "grad_norm": 0.33864084582756415, + "learning_rate": 0.00018527755711586678, + "loss": 0.6236, + "step": 250 + }, + { + "epoch": 0.2008, + "grad_norm": 0.34634624154896837, + "learning_rate": 0.00018514189262659235, + "loss": 0.6826, + "step": 251 + }, + { + "epoch": 0.2016, + "grad_norm": 0.35260995832319136, + "learning_rate": 0.00018500565608261214, + "loss": 0.5873, + "step": 252 + }, + { + "epoch": 0.2024, + "grad_norm": 0.33682247133338794, + "learning_rate": 0.00018486884839927768, + "loss": 0.6607, + "step": 253 + }, + { + "epoch": 0.2032, + "grad_norm": 0.346085289606286, + "learning_rate": 0.00018473147049577774, + "loss": 0.6567, + "step": 254 + }, + { + "epoch": 0.204, + "grad_norm": 0.3393048241910544, + "learning_rate": 0.0001845935232951325, + "loss": 0.6619, + "step": 255 + }, + { + "epoch": 0.2048, + "grad_norm": 0.3003583628662106, + "learning_rate": 0.00018445500772418697, + "loss": 0.6177, + "step": 256 + }, + { + "epoch": 0.2056, + "grad_norm": 0.34016554903106605, + "learning_rate": 0.00018431592471360503, + "loss": 0.6691, + "step": 257 + }, + { + "epoch": 0.2064, + "grad_norm": 0.3093553378225518, + "learning_rate": 0.00018417627519786315, + "loss": 0.614, + "step": 258 + }, + { + "epoch": 0.2072, + "grad_norm": 0.33172592867243444, + "learning_rate": 0.000184036060115244, + "loss": 0.6436, + "step": 259 + }, + { + "epoch": 0.208, + "grad_norm": 0.29544828097432757, + "learning_rate": 0.00018389528040783012, + "loss": 0.5766, + "step": 260 + }, + { + "epoch": 0.2088, + "grad_norm": 0.333956977312725, + "learning_rate": 0.00018375393702149787, + "loss": 0.6268, + "step": 261 + }, + { + "epoch": 0.2096, + "grad_norm": 0.35148063395313595, + "learning_rate": 0.00018361203090591071, + "loss": 0.6259, + "step": 262 + }, + { + "epoch": 0.2104, + "grad_norm": 0.3651031312943574, + "learning_rate": 0.00018346956301451304, + "loss": 0.6612, + "step": 263 + }, + { + "epoch": 0.2112, + "grad_norm": 0.326094381969768, + "learning_rate": 0.00018332653430452376, + "loss": 0.6068, + "step": 264 + }, + { + "epoch": 0.212, + "grad_norm": 0.3674698984073789, + "learning_rate": 0.00018318294573692985, + "loss": 0.6951, + "step": 265 + }, + { + "epoch": 0.2128, + "grad_norm": 0.3188621148135757, + "learning_rate": 0.00018303879827647975, + "loss": 0.6065, + "step": 266 + }, + { + "epoch": 0.2136, + "grad_norm": 0.3710220035442832, + "learning_rate": 0.0001828940928916772, + "loss": 0.6886, + "step": 267 + }, + { + "epoch": 0.2144, + "grad_norm": 0.3532434378333304, + "learning_rate": 0.00018274883055477436, + "loss": 0.6434, + "step": 268 + }, + { + "epoch": 0.2152, + "grad_norm": 0.35527501155029584, + "learning_rate": 0.00018260301224176558, + "loss": 0.6517, + "step": 269 + }, + { + "epoch": 0.216, + "grad_norm": 0.3916507293055691, + "learning_rate": 0.00018245663893238075, + "loss": 0.6503, + "step": 270 + }, + { + "epoch": 0.2168, + "grad_norm": 0.32044950584072485, + "learning_rate": 0.00018230971161007853, + "loss": 0.6202, + "step": 271 + }, + { + "epoch": 0.2176, + "grad_norm": 0.32282933301493805, + "learning_rate": 0.00018216223126204007, + "loss": 0.6444, + "step": 272 + }, + { + "epoch": 0.2184, + "grad_norm": 0.3133558079994367, + "learning_rate": 0.00018201419887916214, + "loss": 0.5897, + "step": 273 + }, + { + "epoch": 0.2192, + "grad_norm": 0.35815970108664874, + "learning_rate": 0.00018186561545605054, + "loss": 0.6477, + "step": 274 + }, + { + "epoch": 0.22, + "grad_norm": 0.310639374075745, + "learning_rate": 0.00018171648199101346, + "loss": 0.5964, + "step": 275 + }, + { + "epoch": 0.2208, + "grad_norm": 0.32845584266465505, + "learning_rate": 0.00018156679948605467, + "loss": 0.5722, + "step": 276 + }, + { + "epoch": 0.2216, + "grad_norm": 0.31126980443075597, + "learning_rate": 0.00018141656894686689, + "loss": 0.5861, + "step": 277 + }, + { + "epoch": 0.2224, + "grad_norm": 0.36275063956995357, + "learning_rate": 0.00018126579138282503, + "loss": 0.6681, + "step": 278 + }, + { + "epoch": 0.2232, + "grad_norm": 0.32341296377966344, + "learning_rate": 0.00018111446780697929, + "loss": 0.601, + "step": 279 + }, + { + "epoch": 0.224, + "grad_norm": 0.33371756721528906, + "learning_rate": 0.0001809625992360485, + "loss": 0.6371, + "step": 280 + }, + { + "epoch": 0.2248, + "grad_norm": 0.3210595730247353, + "learning_rate": 0.00018081018669041324, + "loss": 0.604, + "step": 281 + }, + { + "epoch": 0.2256, + "grad_norm": 0.33978554404755684, + "learning_rate": 0.00018065723119410884, + "loss": 0.6232, + "step": 282 + }, + { + "epoch": 0.2264, + "grad_norm": 0.3386691752916447, + "learning_rate": 0.00018050373377481878, + "loss": 0.6353, + "step": 283 + }, + { + "epoch": 0.2272, + "grad_norm": 0.36082000765336303, + "learning_rate": 0.00018034969546386757, + "loss": 0.6172, + "step": 284 + }, + { + "epoch": 0.228, + "grad_norm": 0.28688120900525155, + "learning_rate": 0.0001801951172962139, + "loss": 0.5644, + "step": 285 + }, + { + "epoch": 0.2288, + "grad_norm": 0.3542563143605929, + "learning_rate": 0.0001800400003104436, + "loss": 0.6407, + "step": 286 + }, + { + "epoch": 0.2296, + "grad_norm": 0.33798332974615464, + "learning_rate": 0.0001798843455487629, + "loss": 0.5897, + "step": 287 + }, + { + "epoch": 0.2304, + "grad_norm": 0.35372407086058777, + "learning_rate": 0.00017972815405699103, + "loss": 0.6274, + "step": 288 + }, + { + "epoch": 0.2312, + "grad_norm": 0.31344756354870335, + "learning_rate": 0.00017957142688455362, + "loss": 0.5996, + "step": 289 + }, + { + "epoch": 0.232, + "grad_norm": 0.336677637693999, + "learning_rate": 0.00017941416508447536, + "loss": 0.6402, + "step": 290 + }, + { + "epoch": 0.2328, + "grad_norm": 0.3372768178475663, + "learning_rate": 0.00017925636971337304, + "loss": 0.6203, + "step": 291 + }, + { + "epoch": 0.2336, + "grad_norm": 0.2942773640929049, + "learning_rate": 0.0001790980418314484, + "loss": 0.6023, + "step": 292 + }, + { + "epoch": 0.2344, + "grad_norm": 0.3522102150676642, + "learning_rate": 0.00017893918250248104, + "loss": 0.6351, + "step": 293 + }, + { + "epoch": 0.2352, + "grad_norm": 0.31741912184406945, + "learning_rate": 0.00017877979279382135, + "loss": 0.6009, + "step": 294 + }, + { + "epoch": 0.236, + "grad_norm": 0.31924641317688096, + "learning_rate": 0.00017861987377638312, + "loss": 0.6108, + "step": 295 + }, + { + "epoch": 0.2368, + "grad_norm": 0.37053405946908835, + "learning_rate": 0.0001784594265246366, + "loss": 0.6431, + "step": 296 + }, + { + "epoch": 0.2376, + "grad_norm": 0.31961598906952976, + "learning_rate": 0.0001782984521166011, + "loss": 0.6316, + "step": 297 + }, + { + "epoch": 0.2384, + "grad_norm": 0.3634402137789276, + "learning_rate": 0.0001781369516338378, + "loss": 0.651, + "step": 298 + }, + { + "epoch": 0.2392, + "grad_norm": 0.31948674209526967, + "learning_rate": 0.00017797492616144256, + "loss": 0.6035, + "step": 299 + }, + { + "epoch": 0.24, + "grad_norm": 0.299548802883558, + "learning_rate": 0.00017781237678803847, + "loss": 0.5926, + "step": 300 + }, + { + "epoch": 0.2408, + "grad_norm": 0.32402448050086785, + "learning_rate": 0.00017764930460576866, + "loss": 0.6156, + "step": 301 + }, + { + "epoch": 0.2416, + "grad_norm": 0.33880235096169364, + "learning_rate": 0.000177485710710289, + "loss": 0.6279, + "step": 302 + }, + { + "epoch": 0.2424, + "grad_norm": 0.39083307094174247, + "learning_rate": 0.00017732159620076053, + "loss": 0.629, + "step": 303 + }, + { + "epoch": 0.2432, + "grad_norm": 0.3219566382994085, + "learning_rate": 0.00017715696217984235, + "loss": 0.6479, + "step": 304 + }, + { + "epoch": 0.244, + "grad_norm": 0.2994626226151748, + "learning_rate": 0.00017699180975368396, + "loss": 0.5806, + "step": 305 + }, + { + "epoch": 0.2448, + "grad_norm": 0.3232838605331977, + "learning_rate": 0.00017682614003191807, + "loss": 0.61, + "step": 306 + }, + { + "epoch": 0.2456, + "grad_norm": 0.3497376990079617, + "learning_rate": 0.00017665995412765285, + "loss": 0.5986, + "step": 307 + }, + { + "epoch": 0.2464, + "grad_norm": 0.3507562759459691, + "learning_rate": 0.00017649325315746478, + "loss": 0.6309, + "step": 308 + }, + { + "epoch": 0.2472, + "grad_norm": 0.37058372545630297, + "learning_rate": 0.00017632603824139085, + "loss": 0.6328, + "step": 309 + }, + { + "epoch": 0.248, + "grad_norm": 0.33061497130522244, + "learning_rate": 0.0001761583105029213, + "loss": 0.6104, + "step": 310 + }, + { + "epoch": 0.2488, + "grad_norm": 0.3186501826235579, + "learning_rate": 0.0001759900710689918, + "loss": 0.5812, + "step": 311 + }, + { + "epoch": 0.2496, + "grad_norm": 0.41864475687139424, + "learning_rate": 0.00017582132106997616, + "loss": 0.6127, + "step": 312 + }, + { + "epoch": 0.2504, + "grad_norm": 0.3205208087991299, + "learning_rate": 0.00017565206163967846, + "loss": 0.5832, + "step": 313 + }, + { + "epoch": 0.2512, + "grad_norm": 0.32498438069860025, + "learning_rate": 0.00017548229391532572, + "loss": 0.5907, + "step": 314 + }, + { + "epoch": 0.252, + "grad_norm": 0.37112126992550315, + "learning_rate": 0.00017531201903755994, + "loss": 0.6245, + "step": 315 + }, + { + "epoch": 0.2528, + "grad_norm": 0.3046377834490436, + "learning_rate": 0.00017514123815043074, + "loss": 0.5908, + "step": 316 + }, + { + "epoch": 0.2536, + "grad_norm": 0.3521974098484267, + "learning_rate": 0.00017496995240138744, + "loss": 0.6266, + "step": 317 + }, + { + "epoch": 0.2544, + "grad_norm": 0.3318392213699102, + "learning_rate": 0.00017479816294127152, + "loss": 0.6101, + "step": 318 + }, + { + "epoch": 0.2552, + "grad_norm": 0.3328213775760183, + "learning_rate": 0.00017462587092430875, + "loss": 0.6411, + "step": 319 + }, + { + "epoch": 0.256, + "grad_norm": 0.3419657527360827, + "learning_rate": 0.0001744530775081015, + "loss": 0.6207, + "step": 320 + }, + { + "epoch": 0.2568, + "grad_norm": 0.3499038698126123, + "learning_rate": 0.00017427978385362112, + "loss": 0.6674, + "step": 321 + }, + { + "epoch": 0.2576, + "grad_norm": 0.3586849462117495, + "learning_rate": 0.0001741059911251997, + "loss": 0.6361, + "step": 322 + }, + { + "epoch": 0.2584, + "grad_norm": 0.34369836364013817, + "learning_rate": 0.0001739317004905227, + "loss": 0.6314, + "step": 323 + }, + { + "epoch": 0.2592, + "grad_norm": 0.3417683310024924, + "learning_rate": 0.000173756913120621, + "loss": 0.6613, + "step": 324 + }, + { + "epoch": 0.26, + "grad_norm": 0.3409954866063188, + "learning_rate": 0.00017358163018986282, + "loss": 0.6489, + "step": 325 + }, + { + "epoch": 0.2608, + "grad_norm": 0.31603895890078487, + "learning_rate": 0.00017340585287594604, + "loss": 0.5854, + "step": 326 + }, + { + "epoch": 0.2616, + "grad_norm": 0.3333999839718448, + "learning_rate": 0.00017322958235989016, + "loss": 0.6252, + "step": 327 + }, + { + "epoch": 0.2624, + "grad_norm": 0.3069820322086464, + "learning_rate": 0.0001730528198260285, + "loss": 0.5925, + "step": 328 + }, + { + "epoch": 0.2632, + "grad_norm": 0.34072642945605036, + "learning_rate": 0.00017287556646200018, + "loss": 0.6001, + "step": 329 + }, + { + "epoch": 0.264, + "grad_norm": 0.35920669108348435, + "learning_rate": 0.00017269782345874203, + "loss": 0.6175, + "step": 330 + }, + { + "epoch": 0.2648, + "grad_norm": 0.3783316154216495, + "learning_rate": 0.00017251959201048083, + "loss": 0.6116, + "step": 331 + }, + { + "epoch": 0.2656, + "grad_norm": 0.3371072543398276, + "learning_rate": 0.00017234087331472497, + "loss": 0.6039, + "step": 332 + }, + { + "epoch": 0.2664, + "grad_norm": 0.3395864137013351, + "learning_rate": 0.00017216166857225674, + "loss": 0.6455, + "step": 333 + }, + { + "epoch": 0.2672, + "grad_norm": 0.33436473615672774, + "learning_rate": 0.00017198197898712404, + "loss": 0.6157, + "step": 334 + }, + { + "epoch": 0.268, + "grad_norm": 0.34399483461043867, + "learning_rate": 0.00017180180576663228, + "loss": 0.6388, + "step": 335 + }, + { + "epoch": 0.2688, + "grad_norm": 0.31793307107047525, + "learning_rate": 0.00017162115012133643, + "loss": 0.6074, + "step": 336 + }, + { + "epoch": 0.2696, + "grad_norm": 0.33446208035550756, + "learning_rate": 0.00017144001326503273, + "loss": 0.6139, + "step": 337 + }, + { + "epoch": 0.2704, + "grad_norm": 0.3571652352841207, + "learning_rate": 0.00017125839641475072, + "loss": 0.608, + "step": 338 + }, + { + "epoch": 0.2712, + "grad_norm": 0.3320381930882504, + "learning_rate": 0.00017107630079074478, + "loss": 0.5973, + "step": 339 + }, + { + "epoch": 0.272, + "grad_norm": 0.3510144786574423, + "learning_rate": 0.00017089372761648616, + "loss": 0.5891, + "step": 340 + }, + { + "epoch": 0.2728, + "grad_norm": 0.3274044045131555, + "learning_rate": 0.00017071067811865476, + "loss": 0.6348, + "step": 341 + }, + { + "epoch": 0.2736, + "grad_norm": 0.3269897438226837, + "learning_rate": 0.00017052715352713075, + "loss": 0.6086, + "step": 342 + }, + { + "epoch": 0.2744, + "grad_norm": 0.32574199716607716, + "learning_rate": 0.00017034315507498635, + "loss": 0.624, + "step": 343 + }, + { + "epoch": 0.2752, + "grad_norm": 0.33312146479263194, + "learning_rate": 0.00017015868399847768, + "loss": 0.6109, + "step": 344 + }, + { + "epoch": 0.276, + "grad_norm": 0.3467304661728986, + "learning_rate": 0.00016997374153703625, + "loss": 0.6533, + "step": 345 + }, + { + "epoch": 0.2768, + "grad_norm": 0.2937669247519489, + "learning_rate": 0.00016978832893326074, + "loss": 0.5568, + "step": 346 + }, + { + "epoch": 0.2776, + "grad_norm": 0.32697302232779996, + "learning_rate": 0.00016960244743290868, + "loss": 0.637, + "step": 347 + }, + { + "epoch": 0.2784, + "grad_norm": 0.33008922674291, + "learning_rate": 0.00016941609828488807, + "loss": 0.5947, + "step": 348 + }, + { + "epoch": 0.2792, + "grad_norm": 0.35932702083702056, + "learning_rate": 0.00016922928274124886, + "loss": 0.6224, + "step": 349 + }, + { + "epoch": 0.28, + "grad_norm": 0.3488947331373348, + "learning_rate": 0.0001690420020571747, + "loss": 0.6285, + "step": 350 + }, + { + "epoch": 0.2808, + "grad_norm": 0.33888121269893695, + "learning_rate": 0.00016885425749097444, + "loss": 0.5889, + "step": 351 + }, + { + "epoch": 0.2816, + "grad_norm": 0.40010242072052316, + "learning_rate": 0.0001686660503040737, + "loss": 0.6248, + "step": 352 + }, + { + "epoch": 0.2824, + "grad_norm": 0.31657595290120183, + "learning_rate": 0.00016847738176100632, + "loss": 0.611, + "step": 353 + }, + { + "epoch": 0.2832, + "grad_norm": 0.3319297476858135, + "learning_rate": 0.00016828825312940592, + "loss": 0.6418, + "step": 354 + }, + { + "epoch": 0.284, + "grad_norm": 0.34036386362008975, + "learning_rate": 0.0001680986656799975, + "loss": 0.6112, + "step": 355 + }, + { + "epoch": 0.2848, + "grad_norm": 0.3523951602730713, + "learning_rate": 0.0001679086206865886, + "loss": 0.657, + "step": 356 + }, + { + "epoch": 0.2856, + "grad_norm": 0.31335321558367357, + "learning_rate": 0.00016771811942606108, + "loss": 0.6226, + "step": 357 + }, + { + "epoch": 0.2864, + "grad_norm": 0.3013914565914275, + "learning_rate": 0.00016752716317836229, + "loss": 0.6288, + "step": 358 + }, + { + "epoch": 0.2872, + "grad_norm": 0.34300227799897226, + "learning_rate": 0.00016733575322649657, + "loss": 0.6555, + "step": 359 + }, + { + "epoch": 0.288, + "grad_norm": 0.30634783522717546, + "learning_rate": 0.0001671438908565167, + "loss": 0.5639, + "step": 360 + }, + { + "epoch": 0.2888, + "grad_norm": 0.3425070849108771, + "learning_rate": 0.00016695157735751513, + "loss": 0.617, + "step": 361 + }, + { + "epoch": 0.2896, + "grad_norm": 0.30179565706233397, + "learning_rate": 0.00016675881402161536, + "loss": 0.5843, + "step": 362 + }, + { + "epoch": 0.2904, + "grad_norm": 0.3296112433632852, + "learning_rate": 0.0001665656021439633, + "loss": 0.6033, + "step": 363 + }, + { + "epoch": 0.2912, + "grad_norm": 0.2945134672184559, + "learning_rate": 0.0001663719430227186, + "loss": 0.555, + "step": 364 + }, + { + "epoch": 0.292, + "grad_norm": 0.33776637698122036, + "learning_rate": 0.00016617783795904565, + "loss": 0.6066, + "step": 365 + }, + { + "epoch": 0.2928, + "grad_norm": 0.3423568940979264, + "learning_rate": 0.00016598328825710533, + "loss": 0.6326, + "step": 366 + }, + { + "epoch": 0.2936, + "grad_norm": 0.32648811706587916, + "learning_rate": 0.00016578829522404583, + "loss": 0.6105, + "step": 367 + }, + { + "epoch": 0.2944, + "grad_norm": 0.32912624202267404, + "learning_rate": 0.000165592860169994, + "loss": 0.6173, + "step": 368 + }, + { + "epoch": 0.2952, + "grad_norm": 0.293587610047466, + "learning_rate": 0.00016539698440804661, + "loss": 0.5404, + "step": 369 + }, + { + "epoch": 0.296, + "grad_norm": 0.33688460148914934, + "learning_rate": 0.00016520066925426144, + "loss": 0.6275, + "step": 370 + }, + { + "epoch": 0.2968, + "grad_norm": 0.3323749410062641, + "learning_rate": 0.0001650039160276485, + "loss": 0.6079, + "step": 371 + }, + { + "epoch": 0.2976, + "grad_norm": 0.3267444427300877, + "learning_rate": 0.0001648067260501611, + "loss": 0.6123, + "step": 372 + }, + { + "epoch": 0.2984, + "grad_norm": 0.3436790069140749, + "learning_rate": 0.0001646091006466871, + "loss": 0.628, + "step": 373 + }, + { + "epoch": 0.2992, + "grad_norm": 0.32119138424362625, + "learning_rate": 0.0001644110411450398, + "loss": 0.6157, + "step": 374 + }, + { + "epoch": 0.3, + "grad_norm": 0.5151733736289191, + "learning_rate": 0.00016421254887594917, + "loss": 0.6439, + "step": 375 + }, + { + "epoch": 0.3008, + "grad_norm": 0.33998083278904195, + "learning_rate": 0.00016401362517305296, + "loss": 0.5653, + "step": 376 + }, + { + "epoch": 0.3016, + "grad_norm": 0.33423052743474296, + "learning_rate": 0.00016381427137288754, + "loss": 0.6382, + "step": 377 + }, + { + "epoch": 0.3024, + "grad_norm": 0.37977710280386257, + "learning_rate": 0.00016361448881487914, + "loss": 0.6551, + "step": 378 + }, + { + "epoch": 0.3032, + "grad_norm": 0.32753090231737964, + "learning_rate": 0.0001634142788413346, + "loss": 0.6022, + "step": 379 + }, + { + "epoch": 0.304, + "grad_norm": 0.36698761359491083, + "learning_rate": 0.00016321364279743266, + "loss": 0.6158, + "step": 380 + }, + { + "epoch": 0.3048, + "grad_norm": 0.3141226357753015, + "learning_rate": 0.00016301258203121462, + "loss": 0.6357, + "step": 381 + }, + { + "epoch": 0.3056, + "grad_norm": 0.32060289780088846, + "learning_rate": 0.0001628110978935756, + "loss": 0.5887, + "step": 382 + }, + { + "epoch": 0.3064, + "grad_norm": 0.3286994567711767, + "learning_rate": 0.00016260919173825508, + "loss": 0.6043, + "step": 383 + }, + { + "epoch": 0.3072, + "grad_norm": 0.35706097499663475, + "learning_rate": 0.00016240686492182804, + "loss": 0.619, + "step": 384 + }, + { + "epoch": 0.308, + "grad_norm": 0.28066924635940443, + "learning_rate": 0.00016220411880369601, + "loss": 0.5691, + "step": 385 + }, + { + "epoch": 0.3088, + "grad_norm": 0.32754973215699557, + "learning_rate": 0.00016200095474607753, + "loss": 0.5872, + "step": 386 + }, + { + "epoch": 0.3096, + "grad_norm": 0.3092661920020084, + "learning_rate": 0.00016179737411399926, + "loss": 0.6005, + "step": 387 + }, + { + "epoch": 0.3104, + "grad_norm": 0.33436383411458787, + "learning_rate": 0.00016159337827528685, + "loss": 0.61, + "step": 388 + }, + { + "epoch": 0.3112, + "grad_norm": 0.34876358910283783, + "learning_rate": 0.00016138896860055555, + "loss": 0.6556, + "step": 389 + }, + { + "epoch": 0.312, + "grad_norm": 0.2999228716581912, + "learning_rate": 0.0001611841464632011, + "loss": 0.5667, + "step": 390 + }, + { + "epoch": 0.3128, + "grad_norm": 0.3759584585259726, + "learning_rate": 0.00016097891323939062, + "loss": 0.6834, + "step": 391 + }, + { + "epoch": 0.3136, + "grad_norm": 0.29330724303471173, + "learning_rate": 0.0001607732703080532, + "loss": 0.5379, + "step": 392 + }, + { + "epoch": 0.3144, + "grad_norm": 0.31247952102174653, + "learning_rate": 0.00016056721905087056, + "loss": 0.5804, + "step": 393 + }, + { + "epoch": 0.3152, + "grad_norm": 0.3714867722083103, + "learning_rate": 0.00016036076085226814, + "loss": 0.5928, + "step": 394 + }, + { + "epoch": 0.316, + "grad_norm": 0.3331624506824157, + "learning_rate": 0.00016015389709940538, + "loss": 0.6082, + "step": 395 + }, + { + "epoch": 0.3168, + "grad_norm": 0.31057364449667263, + "learning_rate": 0.0001599466291821666, + "loss": 0.6172, + "step": 396 + }, + { + "epoch": 0.3176, + "grad_norm": 0.30513730993747623, + "learning_rate": 0.0001597389584931517, + "loss": 0.5898, + "step": 397 + }, + { + "epoch": 0.3184, + "grad_norm": 0.3333233683204336, + "learning_rate": 0.0001595308864276666, + "loss": 0.593, + "step": 398 + }, + { + "epoch": 0.3192, + "grad_norm": 0.3718388692445804, + "learning_rate": 0.0001593224143837142, + "loss": 0.6643, + "step": 399 + }, + { + "epoch": 0.32, + "grad_norm": 0.3045469786263307, + "learning_rate": 0.0001591135437619847, + "loss": 0.5594, + "step": 400 + }, + { + "epoch": 0.3208, + "grad_norm": 0.3400954993676332, + "learning_rate": 0.00015890427596584617, + "loss": 0.6452, + "step": 401 + }, + { + "epoch": 0.3216, + "grad_norm": 0.33068944804413763, + "learning_rate": 0.0001586946124013354, + "loss": 0.6184, + "step": 402 + }, + { + "epoch": 0.3224, + "grad_norm": 0.3263005697408856, + "learning_rate": 0.00015848455447714822, + "loss": 0.6137, + "step": 403 + }, + { + "epoch": 0.3232, + "grad_norm": 0.3799361907548381, + "learning_rate": 0.0001582741036046301, + "loss": 0.6117, + "step": 404 + }, + { + "epoch": 0.324, + "grad_norm": 0.2961328400166121, + "learning_rate": 0.00015806326119776663, + "loss": 0.5831, + "step": 405 + }, + { + "epoch": 0.3248, + "grad_norm": 0.29802987478257925, + "learning_rate": 0.00015785202867317407, + "loss": 0.5767, + "step": 406 + }, + { + "epoch": 0.3256, + "grad_norm": 0.3588158832570962, + "learning_rate": 0.00015764040745008988, + "loss": 0.623, + "step": 407 + }, + { + "epoch": 0.3264, + "grad_norm": 0.32042047425999975, + "learning_rate": 0.00015742839895036305, + "loss": 0.6383, + "step": 408 + }, + { + "epoch": 0.3272, + "grad_norm": 0.32407789664361586, + "learning_rate": 0.00015721600459844468, + "loss": 0.6056, + "step": 409 + }, + { + "epoch": 0.328, + "grad_norm": 0.3731581141563738, + "learning_rate": 0.00015700322582137827, + "loss": 0.6311, + "step": 410 + }, + { + "epoch": 0.3288, + "grad_norm": 0.33308689757663057, + "learning_rate": 0.00015679006404879033, + "loss": 0.5581, + "step": 411 + }, + { + "epoch": 0.3296, + "grad_norm": 0.345094080476923, + "learning_rate": 0.0001565765207128805, + "loss": 0.5997, + "step": 412 + }, + { + "epoch": 0.3304, + "grad_norm": 0.3257450772192306, + "learning_rate": 0.00015636259724841222, + "loss": 0.6082, + "step": 413 + }, + { + "epoch": 0.3312, + "grad_norm": 0.32214136459012144, + "learning_rate": 0.0001561482950927029, + "loss": 0.6195, + "step": 414 + }, + { + "epoch": 0.332, + "grad_norm": 0.3141660128143383, + "learning_rate": 0.00015593361568561428, + "loss": 0.6027, + "step": 415 + }, + { + "epoch": 0.3328, + "grad_norm": 0.33291376077434987, + "learning_rate": 0.00015571856046954285, + "loss": 0.5713, + "step": 416 + }, + { + "epoch": 0.3336, + "grad_norm": 0.3456442402866317, + "learning_rate": 0.0001555031308894101, + "loss": 0.5992, + "step": 417 + }, + { + "epoch": 0.3344, + "grad_norm": 0.32137711599982044, + "learning_rate": 0.00015528732839265272, + "loss": 0.5749, + "step": 418 + }, + { + "epoch": 0.3352, + "grad_norm": 0.3488399941382417, + "learning_rate": 0.0001550711544292131, + "loss": 0.6133, + "step": 419 + }, + { + "epoch": 0.336, + "grad_norm": 0.3172833467405452, + "learning_rate": 0.0001548546104515294, + "loss": 0.5964, + "step": 420 + }, + { + "epoch": 0.3368, + "grad_norm": 0.3107087680682212, + "learning_rate": 0.00015463769791452574, + "loss": 0.6368, + "step": 421 + }, + { + "epoch": 0.3376, + "grad_norm": 0.32346772310238436, + "learning_rate": 0.00015442041827560274, + "loss": 0.5702, + "step": 422 + }, + { + "epoch": 0.3384, + "grad_norm": 0.293740028197297, + "learning_rate": 0.00015420277299462736, + "loss": 0.583, + "step": 423 + }, + { + "epoch": 0.3392, + "grad_norm": 0.3131215213340511, + "learning_rate": 0.00015398476353392323, + "loss": 0.6202, + "step": 424 + }, + { + "epoch": 0.34, + "grad_norm": 0.30852267611758893, + "learning_rate": 0.00015376639135826107, + "loss": 0.6273, + "step": 425 + }, + { + "epoch": 0.3408, + "grad_norm": 0.31073136007732777, + "learning_rate": 0.00015354765793484834, + "loss": 0.564, + "step": 426 + }, + { + "epoch": 0.3416, + "grad_norm": 0.2890617954917825, + "learning_rate": 0.00015332856473331978, + "loss": 0.5852, + "step": 427 + }, + { + "epoch": 0.3424, + "grad_norm": 0.33603207991140516, + "learning_rate": 0.00015310911322572753, + "loss": 0.5863, + "step": 428 + }, + { + "epoch": 0.3432, + "grad_norm": 0.32670973873126913, + "learning_rate": 0.00015288930488653094, + "loss": 0.5973, + "step": 429 + }, + { + "epoch": 0.344, + "grad_norm": 0.3196958708549082, + "learning_rate": 0.000152669141192587, + "loss": 0.5903, + "step": 430 + }, + { + "epoch": 0.3448, + "grad_norm": 0.32502039443517705, + "learning_rate": 0.0001524486236231402, + "loss": 0.5515, + "step": 431 + }, + { + "epoch": 0.3456, + "grad_norm": 0.3214718738126143, + "learning_rate": 0.00015222775365981273, + "loss": 0.6478, + "step": 432 + }, + { + "epoch": 0.3464, + "grad_norm": 0.332217429288651, + "learning_rate": 0.00015200653278659432, + "loss": 0.6234, + "step": 433 + }, + { + "epoch": 0.3472, + "grad_norm": 0.31099973831050426, + "learning_rate": 0.00015178496248983254, + "loss": 0.6197, + "step": 434 + }, + { + "epoch": 0.348, + "grad_norm": 0.2962571050160548, + "learning_rate": 0.00015156304425822267, + "loss": 0.594, + "step": 435 + }, + { + "epoch": 0.3488, + "grad_norm": 0.3153302209931717, + "learning_rate": 0.00015134077958279765, + "loss": 0.5973, + "step": 436 + }, + { + "epoch": 0.3496, + "grad_norm": 0.3311960609985164, + "learning_rate": 0.00015111816995691809, + "loss": 0.5776, + "step": 437 + }, + { + "epoch": 0.3504, + "grad_norm": 0.2979953357678523, + "learning_rate": 0.00015089521687626243, + "loss": 0.596, + "step": 438 + }, + { + "epoch": 0.3512, + "grad_norm": 0.33082862308923683, + "learning_rate": 0.00015067192183881658, + "loss": 0.6185, + "step": 439 + }, + { + "epoch": 0.352, + "grad_norm": 0.30598496662422275, + "learning_rate": 0.000150448286344864, + "loss": 0.6153, + "step": 440 + }, + { + "epoch": 0.3528, + "grad_norm": 0.3280300082629786, + "learning_rate": 0.00015022431189697568, + "loss": 0.5917, + "step": 441 + }, + { + "epoch": 0.3536, + "grad_norm": 0.3251028559551198, + "learning_rate": 0.00015000000000000001, + "loss": 0.5825, + "step": 442 + }, + { + "epoch": 0.3544, + "grad_norm": 0.3194549680822979, + "learning_rate": 0.0001497753521610526, + "loss": 0.6216, + "step": 443 + }, + { + "epoch": 0.3552, + "grad_norm": 0.29653194129442245, + "learning_rate": 0.00014955036988950618, + "loss": 0.5765, + "step": 444 + }, + { + "epoch": 0.356, + "grad_norm": 0.34791285322763277, + "learning_rate": 0.00014932505469698052, + "loss": 0.62, + "step": 445 + }, + { + "epoch": 0.3568, + "grad_norm": 0.3202293426291375, + "learning_rate": 0.00014909940809733222, + "loss": 0.6216, + "step": 446 + }, + { + "epoch": 0.3576, + "grad_norm": 0.3174535575241201, + "learning_rate": 0.0001488734316066446, + "loss": 0.6061, + "step": 447 + }, + { + "epoch": 0.3584, + "grad_norm": 0.3002687444840116, + "learning_rate": 0.00014864712674321734, + "loss": 0.5822, + "step": 448 + }, + { + "epoch": 0.3592, + "grad_norm": 0.5331348084291688, + "learning_rate": 0.0001484204950275565, + "loss": 0.6379, + "step": 449 + }, + { + "epoch": 0.36, + "grad_norm": 0.30851128418293494, + "learning_rate": 0.00014819353798236427, + "loss": 0.5986, + "step": 450 + }, + { + "epoch": 0.3608, + "grad_norm": 0.31958460369884306, + "learning_rate": 0.00014796625713252848, + "loss": 0.6179, + "step": 451 + }, + { + "epoch": 0.3616, + "grad_norm": 0.34091527263286986, + "learning_rate": 0.00014773865400511272, + "loss": 0.6166, + "step": 452 + }, + { + "epoch": 0.3624, + "grad_norm": 0.2990571469218516, + "learning_rate": 0.00014751073012934587, + "loss": 0.5966, + "step": 453 + }, + { + "epoch": 0.3632, + "grad_norm": 0.3436075957977801, + "learning_rate": 0.00014728248703661182, + "loss": 0.6326, + "step": 454 + }, + { + "epoch": 0.364, + "grad_norm": 0.35137030920003226, + "learning_rate": 0.0001470539262604393, + "loss": 0.5645, + "step": 455 + }, + { + "epoch": 0.3648, + "grad_norm": 0.33924485422613193, + "learning_rate": 0.00014682504933649144, + "loss": 0.6184, + "step": 456 + }, + { + "epoch": 0.3656, + "grad_norm": 0.3134749139771607, + "learning_rate": 0.00014659585780255556, + "loss": 0.5709, + "step": 457 + }, + { + "epoch": 0.3664, + "grad_norm": 0.3084857998743944, + "learning_rate": 0.00014636635319853275, + "loss": 0.5896, + "step": 458 + }, + { + "epoch": 0.3672, + "grad_norm": 0.34836899461435644, + "learning_rate": 0.0001461365370664276, + "loss": 0.6443, + "step": 459 + }, + { + "epoch": 0.368, + "grad_norm": 0.3293458971943998, + "learning_rate": 0.00014590641095033787, + "loss": 0.5857, + "step": 460 + }, + { + "epoch": 0.3688, + "grad_norm": 0.2931982589027787, + "learning_rate": 0.00014567597639644387, + "loss": 0.5243, + "step": 461 + }, + { + "epoch": 0.3696, + "grad_norm": 0.346647579881588, + "learning_rate": 0.00014544523495299842, + "loss": 0.6187, + "step": 462 + }, + { + "epoch": 0.3704, + "grad_norm": 0.3260771339014491, + "learning_rate": 0.00014521418817031628, + "loss": 0.6186, + "step": 463 + }, + { + "epoch": 0.3712, + "grad_norm": 0.33632282615149567, + "learning_rate": 0.0001449828376007636, + "loss": 0.6216, + "step": 464 + }, + { + "epoch": 0.372, + "grad_norm": 0.3210763835251143, + "learning_rate": 0.00014475118479874774, + "loss": 0.6262, + "step": 465 + }, + { + "epoch": 0.3728, + "grad_norm": 0.3129848743701589, + "learning_rate": 0.0001445192313207067, + "loss": 0.583, + "step": 466 + }, + { + "epoch": 0.3736, + "grad_norm": 0.35452368332305, + "learning_rate": 0.0001442869787250987, + "loss": 0.6261, + "step": 467 + }, + { + "epoch": 0.3744, + "grad_norm": 0.33979598399894534, + "learning_rate": 0.0001440544285723915, + "loss": 0.6157, + "step": 468 + }, + { + "epoch": 0.3752, + "grad_norm": 0.35015314225851113, + "learning_rate": 0.00014382158242505234, + "loss": 0.6385, + "step": 469 + }, + { + "epoch": 0.376, + "grad_norm": 0.32658054454872115, + "learning_rate": 0.00014358844184753712, + "loss": 0.6152, + "step": 470 + }, + { + "epoch": 0.3768, + "grad_norm": 0.315602878773404, + "learning_rate": 0.00014335500840627986, + "loss": 0.6062, + "step": 471 + }, + { + "epoch": 0.3776, + "grad_norm": 0.33170259327411067, + "learning_rate": 0.00014312128366968243, + "loss": 0.5832, + "step": 472 + }, + { + "epoch": 0.3784, + "grad_norm": 0.3328168923869451, + "learning_rate": 0.0001428872692081038, + "loss": 0.5952, + "step": 473 + }, + { + "epoch": 0.3792, + "grad_norm": 0.2978949712686637, + "learning_rate": 0.00014265296659384956, + "loss": 0.5474, + "step": 474 + }, + { + "epoch": 0.38, + "grad_norm": 0.33412115879805643, + "learning_rate": 0.00014241837740116132, + "loss": 0.5956, + "step": 475 + }, + { + "epoch": 0.3808, + "grad_norm": 0.3172963885067165, + "learning_rate": 0.00014218350320620624, + "loss": 0.5803, + "step": 476 + }, + { + "epoch": 0.3816, + "grad_norm": 0.34619898505454627, + "learning_rate": 0.00014194834558706632, + "loss": 0.6414, + "step": 477 + }, + { + "epoch": 0.3824, + "grad_norm": 0.34914551735458776, + "learning_rate": 0.0001417129061237278, + "loss": 0.5898, + "step": 478 + }, + { + "epoch": 0.3832, + "grad_norm": 0.3406617490116019, + "learning_rate": 0.0001414771863980707, + "loss": 0.6064, + "step": 479 + }, + { + "epoch": 0.384, + "grad_norm": 0.3759582987232799, + "learning_rate": 0.00014124118799385796, + "loss": 0.5926, + "step": 480 + }, + { + "epoch": 0.3848, + "grad_norm": 0.3550949299611754, + "learning_rate": 0.00014100491249672498, + "loss": 0.6484, + "step": 481 + }, + { + "epoch": 0.3856, + "grad_norm": 0.3284855272098851, + "learning_rate": 0.00014076836149416887, + "loss": 0.6287, + "step": 482 + }, + { + "epoch": 0.3864, + "grad_norm": 0.30199493201393424, + "learning_rate": 0.0001405315365755379, + "loss": 0.5935, + "step": 483 + }, + { + "epoch": 0.3872, + "grad_norm": 0.3098467180783862, + "learning_rate": 0.0001402944393320206, + "loss": 0.5566, + "step": 484 + }, + { + "epoch": 0.388, + "grad_norm": 0.31183291738971114, + "learning_rate": 0.00014005707135663527, + "loss": 0.584, + "step": 485 + }, + { + "epoch": 0.3888, + "grad_norm": 0.31929677431537473, + "learning_rate": 0.00013981943424421932, + "loss": 0.6131, + "step": 486 + }, + { + "epoch": 0.3896, + "grad_norm": 0.41978014421811577, + "learning_rate": 0.00013958152959141825, + "loss": 0.6011, + "step": 487 + }, + { + "epoch": 0.3904, + "grad_norm": 0.34342195007224435, + "learning_rate": 0.00013934335899667527, + "loss": 0.5592, + "step": 488 + }, + { + "epoch": 0.3912, + "grad_norm": 0.30196768181200767, + "learning_rate": 0.00013910492406022033, + "loss": 0.5342, + "step": 489 + }, + { + "epoch": 0.392, + "grad_norm": 0.32016809859995643, + "learning_rate": 0.00013886622638405952, + "loss": 0.5513, + "step": 490 + }, + { + "epoch": 0.3928, + "grad_norm": 0.3030961366563467, + "learning_rate": 0.0001386272675719642, + "loss": 0.5416, + "step": 491 + }, + { + "epoch": 0.3936, + "grad_norm": 0.3640166896343213, + "learning_rate": 0.00013838804922946027, + "loss": 0.5939, + "step": 492 + }, + { + "epoch": 0.3944, + "grad_norm": 0.33311721998350496, + "learning_rate": 0.00013814857296381728, + "loss": 0.5941, + "step": 493 + }, + { + "epoch": 0.3952, + "grad_norm": 0.3118572958395179, + "learning_rate": 0.00013790884038403795, + "loss": 0.5873, + "step": 494 + }, + { + "epoch": 0.396, + "grad_norm": 0.3524618854868946, + "learning_rate": 0.00013766885310084688, + "loss": 0.6004, + "step": 495 + }, + { + "epoch": 0.3968, + "grad_norm": 0.30261631398153654, + "learning_rate": 0.00013742861272668012, + "loss": 0.599, + "step": 496 + }, + { + "epoch": 0.3976, + "grad_norm": 0.32222661211801934, + "learning_rate": 0.00013718812087567414, + "loss": 0.6179, + "step": 497 + }, + { + "epoch": 0.3984, + "grad_norm": 0.32887621360130437, + "learning_rate": 0.00013694737916365517, + "loss": 0.59, + "step": 498 + }, + { + "epoch": 0.3992, + "grad_norm": 0.3081053135399744, + "learning_rate": 0.000136706389208128, + "loss": 0.581, + "step": 499 + }, + { + "epoch": 0.4, + "grad_norm": 0.3104914902426349, + "learning_rate": 0.00013646515262826552, + "loss": 0.5823, + "step": 500 + }, + { + "epoch": 0.4008, + "grad_norm": 0.3240593761286958, + "learning_rate": 0.00013622367104489756, + "loss": 0.5629, + "step": 501 + }, + { + "epoch": 0.4016, + "grad_norm": 0.2971645909812542, + "learning_rate": 0.0001359819460805001, + "loss": 0.5673, + "step": 502 + }, + { + "epoch": 0.4024, + "grad_norm": 0.30765629167391056, + "learning_rate": 0.0001357399793591844, + "loss": 0.5686, + "step": 503 + }, + { + "epoch": 0.4032, + "grad_norm": 0.34662720007108755, + "learning_rate": 0.0001354977725066859, + "loss": 0.6206, + "step": 504 + }, + { + "epoch": 0.404, + "grad_norm": 0.32151305481811043, + "learning_rate": 0.00013525532715035366, + "loss": 0.6041, + "step": 505 + }, + { + "epoch": 0.4048, + "grad_norm": 0.3026717132828533, + "learning_rate": 0.00013501264491913906, + "loss": 0.581, + "step": 506 + }, + { + "epoch": 0.4056, + "grad_norm": 0.3115727768344922, + "learning_rate": 0.00013476972744358507, + "loss": 0.6127, + "step": 507 + }, + { + "epoch": 0.4064, + "grad_norm": 0.32325633413163507, + "learning_rate": 0.0001345265763558152, + "loss": 0.5831, + "step": 508 + }, + { + "epoch": 0.4072, + "grad_norm": 0.3406702237157173, + "learning_rate": 0.00013428319328952253, + "loss": 0.6062, + "step": 509 + }, + { + "epoch": 0.408, + "grad_norm": 0.3092593810807342, + "learning_rate": 0.00013403957987995882, + "loss": 0.5874, + "step": 510 + }, + { + "epoch": 0.4088, + "grad_norm": 0.33249730642091496, + "learning_rate": 0.0001337957377639235, + "loss": 0.6137, + "step": 511 + }, + { + "epoch": 0.4096, + "grad_norm": 0.32429643339268305, + "learning_rate": 0.0001335516685797525, + "loss": 0.5782, + "step": 512 + }, + { + "epoch": 0.4104, + "grad_norm": 0.2856644756348792, + "learning_rate": 0.0001333073739673076, + "loss": 0.5785, + "step": 513 + }, + { + "epoch": 0.4112, + "grad_norm": 0.3174505397082634, + "learning_rate": 0.00013306285556796495, + "loss": 0.564, + "step": 514 + }, + { + "epoch": 0.412, + "grad_norm": 0.33154785899386857, + "learning_rate": 0.0001328181150246045, + "loss": 0.6121, + "step": 515 + }, + { + "epoch": 0.4128, + "grad_norm": 0.31824223026592074, + "learning_rate": 0.00013257315398159864, + "loss": 0.6125, + "step": 516 + }, + { + "epoch": 0.4136, + "grad_norm": 0.3425796531184699, + "learning_rate": 0.00013232797408480127, + "loss": 0.6277, + "step": 517 + }, + { + "epoch": 0.4144, + "grad_norm": 0.30980473656593255, + "learning_rate": 0.00013208257698153677, + "loss": 0.582, + "step": 518 + }, + { + "epoch": 0.4152, + "grad_norm": 0.6076175814566195, + "learning_rate": 0.00013183696432058888, + "loss": 0.5652, + "step": 519 + }, + { + "epoch": 0.416, + "grad_norm": 0.2872005508008041, + "learning_rate": 0.00013159113775218964, + "loss": 0.5663, + "step": 520 + }, + { + "epoch": 0.4168, + "grad_norm": 0.2941820265987994, + "learning_rate": 0.00013134509892800822, + "loss": 0.5632, + "step": 521 + }, + { + "epoch": 0.4176, + "grad_norm": 0.3736048758448455, + "learning_rate": 0.00013109884950114007, + "loss": 0.5783, + "step": 522 + }, + { + "epoch": 0.4184, + "grad_norm": 0.3139318455160298, + "learning_rate": 0.00013085239112609547, + "loss": 0.587, + "step": 523 + }, + { + "epoch": 0.4192, + "grad_norm": 0.32914317410037475, + "learning_rate": 0.00013060572545878875, + "loss": 0.5746, + "step": 524 + }, + { + "epoch": 0.42, + "grad_norm": 0.29522944680111907, + "learning_rate": 0.00013035885415652685, + "loss": 0.5543, + "step": 525 + }, + { + "epoch": 0.4208, + "grad_norm": 0.3337111411709632, + "learning_rate": 0.00013011177887799845, + "loss": 0.5964, + "step": 526 + }, + { + "epoch": 0.4216, + "grad_norm": 0.3343565592312885, + "learning_rate": 0.00012986450128326266, + "loss": 0.6095, + "step": 527 + }, + { + "epoch": 0.4224, + "grad_norm": 0.3396141654510037, + "learning_rate": 0.00012961702303373795, + "loss": 0.5707, + "step": 528 + }, + { + "epoch": 0.4232, + "grad_norm": 0.30097027159385814, + "learning_rate": 0.00012936934579219094, + "loss": 0.5541, + "step": 529 + }, + { + "epoch": 0.424, + "grad_norm": 0.35273747266556094, + "learning_rate": 0.00012912147122272523, + "loss": 0.6511, + "step": 530 + }, + { + "epoch": 0.4248, + "grad_norm": 0.3148814749827936, + "learning_rate": 0.00012887340099077024, + "loss": 0.5839, + "step": 531 + }, + { + "epoch": 0.4256, + "grad_norm": 0.3307776757603587, + "learning_rate": 0.00012862513676307008, + "loss": 0.559, + "step": 532 + }, + { + "epoch": 0.4264, + "grad_norm": 0.3584458844664954, + "learning_rate": 0.0001283766802076722, + "loss": 0.6508, + "step": 533 + }, + { + "epoch": 0.4272, + "grad_norm": 0.3438687476549892, + "learning_rate": 0.00012812803299391628, + "loss": 0.5832, + "step": 534 + }, + { + "epoch": 0.428, + "grad_norm": 0.34643015237385527, + "learning_rate": 0.00012787919679242306, + "loss": 0.5951, + "step": 535 + }, + { + "epoch": 0.4288, + "grad_norm": 0.3004074425788124, + "learning_rate": 0.00012763017327508305, + "loss": 0.557, + "step": 536 + }, + { + "epoch": 0.4296, + "grad_norm": 0.3098596835581066, + "learning_rate": 0.00012738096411504522, + "loss": 0.6101, + "step": 537 + }, + { + "epoch": 0.4304, + "grad_norm": 0.3154602565169677, + "learning_rate": 0.0001271315709867059, + "loss": 0.594, + "step": 538 + }, + { + "epoch": 0.4312, + "grad_norm": 0.31535172683979323, + "learning_rate": 0.00012688199556569753, + "loss": 0.5907, + "step": 539 + }, + { + "epoch": 0.432, + "grad_norm": 0.3683800650639519, + "learning_rate": 0.00012663223952887723, + "loss": 0.6062, + "step": 540 + }, + { + "epoch": 0.4328, + "grad_norm": 0.35901655003556643, + "learning_rate": 0.0001263823045543158, + "loss": 0.5883, + "step": 541 + }, + { + "epoch": 0.4336, + "grad_norm": 0.32146165807925703, + "learning_rate": 0.00012613219232128608, + "loss": 0.6008, + "step": 542 + }, + { + "epoch": 0.4344, + "grad_norm": 0.33168383613708075, + "learning_rate": 0.00012588190451025207, + "loss": 0.6119, + "step": 543 + }, + { + "epoch": 0.4352, + "grad_norm": 0.35702815659174447, + "learning_rate": 0.00012563144280285741, + "loss": 0.6541, + "step": 544 + }, + { + "epoch": 0.436, + "grad_norm": 0.2899388905134802, + "learning_rate": 0.00012538080888191408, + "loss": 0.5852, + "step": 545 + }, + { + "epoch": 0.4368, + "grad_norm": 0.29446886988340504, + "learning_rate": 0.00012513000443139112, + "loss": 0.5675, + "step": 546 + }, + { + "epoch": 0.4376, + "grad_norm": 0.34053821314045435, + "learning_rate": 0.00012487903113640337, + "loss": 0.5872, + "step": 547 + }, + { + "epoch": 0.4384, + "grad_norm": 0.3350005651489098, + "learning_rate": 0.00012462789068320017, + "loss": 0.6383, + "step": 548 + }, + { + "epoch": 0.4392, + "grad_norm": 0.3370383042632032, + "learning_rate": 0.00012437658475915377, + "loss": 0.5947, + "step": 549 + }, + { + "epoch": 0.44, + "grad_norm": 0.3310474361028596, + "learning_rate": 0.00012412511505274844, + "loss": 0.5602, + "step": 550 + }, + { + "epoch": 0.4408, + "grad_norm": 0.3361021628125497, + "learning_rate": 0.00012387348325356874, + "loss": 0.5578, + "step": 551 + }, + { + "epoch": 0.4416, + "grad_norm": 0.31765051956438506, + "learning_rate": 0.00012362169105228826, + "loss": 0.5777, + "step": 552 + }, + { + "epoch": 0.4424, + "grad_norm": 0.31371062106564285, + "learning_rate": 0.00012336974014065844, + "loss": 0.5438, + "step": 553 + }, + { + "epoch": 0.4432, + "grad_norm": 0.33037423863396975, + "learning_rate": 0.000123117632211497, + "loss": 0.5586, + "step": 554 + }, + { + "epoch": 0.444, + "grad_norm": 0.33639075074149816, + "learning_rate": 0.00012286536895867654, + "loss": 0.5955, + "step": 555 + }, + { + "epoch": 0.4448, + "grad_norm": 0.31110248151937986, + "learning_rate": 0.00012261295207711346, + "loss": 0.5572, + "step": 556 + }, + { + "epoch": 0.4456, + "grad_norm": 0.3248594213682404, + "learning_rate": 0.00012236038326275626, + "loss": 0.5598, + "step": 557 + }, + { + "epoch": 0.4464, + "grad_norm": 0.3123099001416393, + "learning_rate": 0.0001221076642125742, + "loss": 0.5751, + "step": 558 + }, + { + "epoch": 0.4472, + "grad_norm": 0.33489861794349407, + "learning_rate": 0.00012185479662454595, + "loss": 0.5663, + "step": 559 + }, + { + "epoch": 0.448, + "grad_norm": 0.3283720748648807, + "learning_rate": 0.00012160178219764837, + "loss": 0.6106, + "step": 560 + }, + { + "epoch": 0.4488, + "grad_norm": 0.3275800483403531, + "learning_rate": 0.00012134862263184467, + "loss": 0.6041, + "step": 561 + }, + { + "epoch": 0.4496, + "grad_norm": 0.3049636997648269, + "learning_rate": 0.00012109531962807332, + "loss": 0.5841, + "step": 562 + }, + { + "epoch": 0.4504, + "grad_norm": 0.33112859099247005, + "learning_rate": 0.00012084187488823657, + "loss": 0.5874, + "step": 563 + }, + { + "epoch": 0.4512, + "grad_norm": 0.38284859464913196, + "learning_rate": 0.00012058829011518896, + "loss": 0.5724, + "step": 564 + }, + { + "epoch": 0.452, + "grad_norm": 0.3311207733229893, + "learning_rate": 0.00012033456701272576, + "loss": 0.5726, + "step": 565 + }, + { + "epoch": 0.4528, + "grad_norm": 0.32156034085457397, + "learning_rate": 0.00012008070728557186, + "loss": 0.577, + "step": 566 + }, + { + "epoch": 0.4536, + "grad_norm": 0.33971653595522966, + "learning_rate": 0.00011982671263936995, + "loss": 0.5699, + "step": 567 + }, + { + "epoch": 0.4544, + "grad_norm": 0.3623093689499001, + "learning_rate": 0.00011957258478066931, + "loss": 0.6208, + "step": 568 + }, + { + "epoch": 0.4552, + "grad_norm": 0.3290860318979573, + "learning_rate": 0.00011931832541691418, + "loss": 0.5553, + "step": 569 + }, + { + "epoch": 0.456, + "grad_norm": 0.3298586814615897, + "learning_rate": 0.00011906393625643244, + "loss": 0.5602, + "step": 570 + }, + { + "epoch": 0.4568, + "grad_norm": 0.3036153939407574, + "learning_rate": 0.00011880941900842397, + "loss": 0.5632, + "step": 571 + }, + { + "epoch": 0.4576, + "grad_norm": 0.3239852509324012, + "learning_rate": 0.00011855477538294935, + "loss": 0.5865, + "step": 572 + }, + { + "epoch": 0.4584, + "grad_norm": 0.3293432127865494, + "learning_rate": 0.00011830000709091815, + "loss": 0.571, + "step": 573 + }, + { + "epoch": 0.4592, + "grad_norm": 0.31413022714131084, + "learning_rate": 0.00011804511584407763, + "loss": 0.5731, + "step": 574 + }, + { + "epoch": 0.46, + "grad_norm": 0.3233028919849246, + "learning_rate": 0.0001177901033550012, + "loss": 0.581, + "step": 575 + }, + { + "epoch": 0.4608, + "grad_norm": 0.29842695942222947, + "learning_rate": 0.00011753497133707679, + "loss": 0.5572, + "step": 576 + }, + { + "epoch": 0.4616, + "grad_norm": 0.30617107236458035, + "learning_rate": 0.00011727972150449544, + "loss": 0.5748, + "step": 577 + }, + { + "epoch": 0.4624, + "grad_norm": 0.30634148267818595, + "learning_rate": 0.00011702435557223987, + "loss": 0.5486, + "step": 578 + }, + { + "epoch": 0.4632, + "grad_norm": 0.33002320870772317, + "learning_rate": 0.00011676887525607271, + "loss": 0.5959, + "step": 579 + }, + { + "epoch": 0.464, + "grad_norm": 0.33549835176562415, + "learning_rate": 0.00011651328227252517, + "loss": 0.617, + "step": 580 + }, + { + "epoch": 0.4648, + "grad_norm": 0.39908094186117465, + "learning_rate": 0.00011625757833888551, + "loss": 0.5972, + "step": 581 + }, + { + "epoch": 0.4656, + "grad_norm": 0.29611461200931244, + "learning_rate": 0.00011600176517318741, + "loss": 0.5694, + "step": 582 + }, + { + "epoch": 0.4664, + "grad_norm": 0.32488354902119576, + "learning_rate": 0.0001157458444941984, + "loss": 0.6111, + "step": 583 + }, + { + "epoch": 0.4672, + "grad_norm": 0.3289872766327485, + "learning_rate": 0.00011548981802140848, + "loss": 0.6114, + "step": 584 + }, + { + "epoch": 0.468, + "grad_norm": 0.37679220427159815, + "learning_rate": 0.00011523368747501839, + "loss": 0.6396, + "step": 585 + }, + { + "epoch": 0.4688, + "grad_norm": 0.3170293818278164, + "learning_rate": 0.00011497745457592816, + "loss": 0.5848, + "step": 586 + }, + { + "epoch": 0.4696, + "grad_norm": 0.2904013307442176, + "learning_rate": 0.00011472112104572547, + "loss": 0.567, + "step": 587 + }, + { + "epoch": 0.4704, + "grad_norm": 0.3315032774391374, + "learning_rate": 0.00011446468860667421, + "loss": 0.5951, + "step": 588 + }, + { + "epoch": 0.4712, + "grad_norm": 0.3464804958950567, + "learning_rate": 0.0001142081589817027, + "loss": 0.6333, + "step": 589 + }, + { + "epoch": 0.472, + "grad_norm": 0.2956379310063911, + "learning_rate": 0.00011395153389439233, + "loss": 0.5845, + "step": 590 + }, + { + "epoch": 0.4728, + "grad_norm": 0.34014894088179815, + "learning_rate": 0.00011369481506896582, + "loss": 0.6087, + "step": 591 + }, + { + "epoch": 0.4736, + "grad_norm": 0.31311931547255073, + "learning_rate": 0.00011343800423027582, + "loss": 0.594, + "step": 592 + }, + { + "epoch": 0.4744, + "grad_norm": 0.30219795846098496, + "learning_rate": 0.00011318110310379301, + "loss": 0.559, + "step": 593 + }, + { + "epoch": 0.4752, + "grad_norm": 0.2945554836924428, + "learning_rate": 0.0001129241134155949, + "loss": 0.5372, + "step": 594 + }, + { + "epoch": 0.476, + "grad_norm": 0.306847991852595, + "learning_rate": 0.00011266703689235394, + "loss": 0.5731, + "step": 595 + }, + { + "epoch": 0.4768, + "grad_norm": 0.2947953663893335, + "learning_rate": 0.00011240987526132594, + "loss": 0.5374, + "step": 596 + }, + { + "epoch": 0.4776, + "grad_norm": 0.3096735962232439, + "learning_rate": 0.00011215263025033869, + "loss": 0.5859, + "step": 597 + }, + { + "epoch": 0.4784, + "grad_norm": 0.3230566622403895, + "learning_rate": 0.00011189530358778005, + "loss": 0.6206, + "step": 598 + }, + { + "epoch": 0.4792, + "grad_norm": 0.329975913532662, + "learning_rate": 0.00011163789700258655, + "loss": 0.5987, + "step": 599 + }, + { + "epoch": 0.48, + "grad_norm": 0.33390179362794614, + "learning_rate": 0.00011138041222423177, + "loss": 0.5591, + "step": 600 + }, + { + "epoch": 0.4808, + "grad_norm": 0.3263508029175923, + "learning_rate": 0.00011112285098271451, + "loss": 0.5715, + "step": 601 + }, + { + "epoch": 0.4816, + "grad_norm": 0.3015015220251556, + "learning_rate": 0.00011086521500854745, + "loss": 0.5694, + "step": 602 + }, + { + "epoch": 0.4824, + "grad_norm": 0.3091036280998343, + "learning_rate": 0.00011060750603274535, + "loss": 0.5597, + "step": 603 + }, + { + "epoch": 0.4832, + "grad_norm": 0.33630594584449497, + "learning_rate": 0.00011034972578681338, + "loss": 0.6166, + "step": 604 + }, + { + "epoch": 0.484, + "grad_norm": 0.33234679395755995, + "learning_rate": 0.00011009187600273566, + "loss": 0.5755, + "step": 605 + }, + { + "epoch": 0.4848, + "grad_norm": 0.2754987158410254, + "learning_rate": 0.00010983395841296348, + "loss": 0.532, + "step": 606 + }, + { + "epoch": 0.4856, + "grad_norm": 0.3210960951919637, + "learning_rate": 0.00010957597475040373, + "loss": 0.6078, + "step": 607 + }, + { + "epoch": 0.4864, + "grad_norm": 0.32177088310010665, + "learning_rate": 0.00010931792674840718, + "loss": 0.6197, + "step": 608 + }, + { + "epoch": 0.4872, + "grad_norm": 0.31415194137155206, + "learning_rate": 0.00010905981614075693, + "loss": 0.5661, + "step": 609 + }, + { + "epoch": 0.488, + "grad_norm": 0.35398700420880674, + "learning_rate": 0.00010880164466165674, + "loss": 0.6036, + "step": 610 + }, + { + "epoch": 0.4888, + "grad_norm": 0.3123690874169846, + "learning_rate": 0.00010854341404571928, + "loss": 0.5686, + "step": 611 + }, + { + "epoch": 0.4896, + "grad_norm": 0.31659120941358476, + "learning_rate": 0.00010828512602795462, + "loss": 0.5393, + "step": 612 + }, + { + "epoch": 0.4904, + "grad_norm": 0.34287302428913957, + "learning_rate": 0.00010802678234375851, + "loss": 0.5907, + "step": 613 + }, + { + "epoch": 0.4912, + "grad_norm": 0.3194427108434081, + "learning_rate": 0.00010776838472890065, + "loss": 0.5663, + "step": 614 + }, + { + "epoch": 0.492, + "grad_norm": 0.28416716013936555, + "learning_rate": 0.0001075099349195131, + "loss": 0.5482, + "step": 615 + }, + { + "epoch": 0.4928, + "grad_norm": 0.29934393305943974, + "learning_rate": 0.00010725143465207867, + "loss": 0.5566, + "step": 616 + }, + { + "epoch": 0.4936, + "grad_norm": 0.31210441356618257, + "learning_rate": 0.00010699288566341914, + "loss": 0.5833, + "step": 617 + }, + { + "epoch": 0.4944, + "grad_norm": 0.4175710823270915, + "learning_rate": 0.00010673428969068364, + "loss": 0.57, + "step": 618 + }, + { + "epoch": 0.4952, + "grad_norm": 0.3005431361289694, + "learning_rate": 0.000106475648471337, + "loss": 0.5497, + "step": 619 + }, + { + "epoch": 0.496, + "grad_norm": 0.30603805485437957, + "learning_rate": 0.00010621696374314807, + "loss": 0.5969, + "step": 620 + }, + { + "epoch": 0.4968, + "grad_norm": 0.30839245587096714, + "learning_rate": 0.00010595823724417795, + "loss": 0.5743, + "step": 621 + }, + { + "epoch": 0.4976, + "grad_norm": 0.32811301561537237, + "learning_rate": 0.00010569947071276847, + "loss": 0.56, + "step": 622 + }, + { + "epoch": 0.4984, + "grad_norm": 0.36282994378736644, + "learning_rate": 0.00010544066588753044, + "loss": 0.587, + "step": 623 + }, + { + "epoch": 0.4992, + "grad_norm": 0.3073779704203103, + "learning_rate": 0.00010518182450733186, + "loss": 0.5662, + "step": 624 + }, + { + "epoch": 0.5, + "grad_norm": 0.3080445780693838, + "learning_rate": 0.00010492294831128641, + "loss": 0.5717, + "step": 625 + }, + { + "epoch": 0.5008, + "grad_norm": 0.3329595020216389, + "learning_rate": 0.00010466403903874176, + "loss": 0.6073, + "step": 626 + }, + { + "epoch": 0.5016, + "grad_norm": 0.3389768344465552, + "learning_rate": 0.00010440509842926767, + "loss": 0.6038, + "step": 627 + }, + { + "epoch": 0.5024, + "grad_norm": 0.2920049809736086, + "learning_rate": 0.00010414612822264455, + "loss": 0.5606, + "step": 628 + }, + { + "epoch": 0.5032, + "grad_norm": 0.3329890595277704, + "learning_rate": 0.00010388713015885161, + "loss": 0.5881, + "step": 629 + }, + { + "epoch": 0.504, + "grad_norm": 0.3078511666243944, + "learning_rate": 0.00010362810597805526, + "loss": 0.5684, + "step": 630 + }, + { + "epoch": 0.5048, + "grad_norm": 0.2969232568921349, + "learning_rate": 0.00010336905742059742, + "loss": 0.5688, + "step": 631 + }, + { + "epoch": 0.5056, + "grad_norm": 0.2975446861211374, + "learning_rate": 0.0001031099862269837, + "loss": 0.5866, + "step": 632 + }, + { + "epoch": 0.5064, + "grad_norm": 0.3085150297080228, + "learning_rate": 0.0001028508941378719, + "loss": 0.5824, + "step": 633 + }, + { + "epoch": 0.5072, + "grad_norm": 0.31897289948701685, + "learning_rate": 0.00010259178289406011, + "loss": 0.5892, + "step": 634 + }, + { + "epoch": 0.508, + "grad_norm": 0.2761811680872987, + "learning_rate": 0.00010233265423647523, + "loss": 0.5269, + "step": 635 + }, + { + "epoch": 0.5088, + "grad_norm": 0.3342011446978759, + "learning_rate": 0.00010207350990616107, + "loss": 0.5786, + "step": 636 + }, + { + "epoch": 0.5096, + "grad_norm": 0.32936047495463505, + "learning_rate": 0.00010181435164426676, + "loss": 0.581, + "step": 637 + }, + { + "epoch": 0.5104, + "grad_norm": 0.3310160783493966, + "learning_rate": 0.0001015551811920351, + "loss": 0.59, + "step": 638 + }, + { + "epoch": 0.5112, + "grad_norm": 0.3086808688168038, + "learning_rate": 0.00010129600029079072, + "loss": 0.5142, + "step": 639 + }, + { + "epoch": 0.512, + "grad_norm": 0.32294814756946233, + "learning_rate": 0.00010103681068192845, + "loss": 0.5676, + "step": 640 + }, + { + "epoch": 0.5128, + "grad_norm": 0.335730313097155, + "learning_rate": 0.00010077761410690172, + "loss": 0.5929, + "step": 641 + }, + { + "epoch": 0.5136, + "grad_norm": 0.2891092682415137, + "learning_rate": 0.00010051841230721065, + "loss": 0.5862, + "step": 642 + }, + { + "epoch": 0.5144, + "grad_norm": 0.31391514362095485, + "learning_rate": 0.00010025920702439051, + "loss": 0.5763, + "step": 643 + }, + { + "epoch": 0.5152, + "grad_norm": 0.34128688111125355, + "learning_rate": 0.0001, + "loss": 0.5983, + "step": 644 + }, + { + "epoch": 0.516, + "grad_norm": 0.2970950083492731, + "learning_rate": 9.97407929756095e-05, + "loss": 0.5636, + "step": 645 + }, + { + "epoch": 0.5168, + "grad_norm": 0.3322422790261592, + "learning_rate": 9.948158769278939e-05, + "loss": 0.6336, + "step": 646 + }, + { + "epoch": 0.5176, + "grad_norm": 0.3462889113767833, + "learning_rate": 9.92223858930983e-05, + "loss": 0.6058, + "step": 647 + }, + { + "epoch": 0.5184, + "grad_norm": 0.3091873131073345, + "learning_rate": 9.896318931807155e-05, + "loss": 0.565, + "step": 648 + }, + { + "epoch": 0.5192, + "grad_norm": 0.3259586772383051, + "learning_rate": 9.870399970920932e-05, + "loss": 0.5435, + "step": 649 + }, + { + "epoch": 0.52, + "grad_norm": 0.30355190065994, + "learning_rate": 9.844481880796491e-05, + "loss": 0.5578, + "step": 650 + }, + { + "epoch": 0.5208, + "grad_norm": 0.3525175334175896, + "learning_rate": 9.818564835573323e-05, + "loss": 0.6411, + "step": 651 + }, + { + "epoch": 0.5216, + "grad_norm": 0.3144319852680261, + "learning_rate": 9.792649009383899e-05, + "loss": 0.5432, + "step": 652 + }, + { + "epoch": 0.5224, + "grad_norm": 0.3606268812428096, + "learning_rate": 9.766734576352478e-05, + "loss": 0.592, + "step": 653 + }, + { + "epoch": 0.5232, + "grad_norm": 0.3285048416784418, + "learning_rate": 9.740821710593989e-05, + "loss": 0.5878, + "step": 654 + }, + { + "epoch": 0.524, + "grad_norm": 0.3126871249195601, + "learning_rate": 9.714910586212816e-05, + "loss": 0.552, + "step": 655 + }, + { + "epoch": 0.5248, + "grad_norm": 0.398048211066614, + "learning_rate": 9.689001377301633e-05, + "loss": 0.6313, + "step": 656 + }, + { + "epoch": 0.5256, + "grad_norm": 0.29852580156421366, + "learning_rate": 9.663094257940258e-05, + "loss": 0.5876, + "step": 657 + }, + { + "epoch": 0.5264, + "grad_norm": 0.29668308123383585, + "learning_rate": 9.637189402194476e-05, + "loss": 0.5798, + "step": 658 + }, + { + "epoch": 0.5272, + "grad_norm": 0.38923416320875054, + "learning_rate": 9.611286984114841e-05, + "loss": 0.6116, + "step": 659 + }, + { + "epoch": 0.528, + "grad_norm": 0.3151158278851179, + "learning_rate": 9.585387177735547e-05, + "loss": 0.5882, + "step": 660 + }, + { + "epoch": 0.5288, + "grad_norm": 0.2915554377465551, + "learning_rate": 9.559490157073236e-05, + "loss": 0.5781, + "step": 661 + }, + { + "epoch": 0.5296, + "grad_norm": 0.3283600604801278, + "learning_rate": 9.533596096125825e-05, + "loss": 0.5915, + "step": 662 + }, + { + "epoch": 0.5304, + "grad_norm": 0.2892731582576615, + "learning_rate": 9.507705168871358e-05, + "loss": 0.5378, + "step": 663 + }, + { + "epoch": 0.5312, + "grad_norm": 0.32313775663040456, + "learning_rate": 9.481817549266817e-05, + "loss": 0.5856, + "step": 664 + }, + { + "epoch": 0.532, + "grad_norm": 0.3143239975542234, + "learning_rate": 9.455933411246958e-05, + "loss": 0.5943, + "step": 665 + }, + { + "epoch": 0.5328, + "grad_norm": 0.28093040153755033, + "learning_rate": 9.430052928723153e-05, + "loss": 0.5538, + "step": 666 + }, + { + "epoch": 0.5336, + "grad_norm": 0.29836527707274535, + "learning_rate": 9.404176275582208e-05, + "loss": 0.5606, + "step": 667 + }, + { + "epoch": 0.5344, + "grad_norm": 0.3034497970344787, + "learning_rate": 9.378303625685195e-05, + "loss": 0.5709, + "step": 668 + }, + { + "epoch": 0.5352, + "grad_norm": 0.34855905245294183, + "learning_rate": 9.352435152866298e-05, + "loss": 0.6134, + "step": 669 + }, + { + "epoch": 0.536, + "grad_norm": 0.28627133463186333, + "learning_rate": 9.326571030931637e-05, + "loss": 0.5459, + "step": 670 + }, + { + "epoch": 0.5368, + "grad_norm": 0.2912756420621914, + "learning_rate": 9.300711433658087e-05, + "loss": 0.5624, + "step": 671 + }, + { + "epoch": 0.5376, + "grad_norm": 0.30179944173660356, + "learning_rate": 9.274856534792138e-05, + "loss": 0.5765, + "step": 672 + }, + { + "epoch": 0.5384, + "grad_norm": 0.29890460764801857, + "learning_rate": 9.249006508048694e-05, + "loss": 0.5537, + "step": 673 + }, + { + "epoch": 0.5392, + "grad_norm": 0.3466282966197182, + "learning_rate": 9.223161527109937e-05, + "loss": 0.6011, + "step": 674 + }, + { + "epoch": 0.54, + "grad_norm": 0.29223313989202737, + "learning_rate": 9.197321765624152e-05, + "loss": 0.5451, + "step": 675 + }, + { + "epoch": 0.5408, + "grad_norm": 0.3025698887084507, + "learning_rate": 9.171487397204539e-05, + "loss": 0.5513, + "step": 676 + }, + { + "epoch": 0.5416, + "grad_norm": 0.31970077904393274, + "learning_rate": 9.145658595428074e-05, + "loss": 0.5726, + "step": 677 + }, + { + "epoch": 0.5424, + "grad_norm": 0.30219260523572566, + "learning_rate": 9.119835533834331e-05, + "loss": 0.5543, + "step": 678 + }, + { + "epoch": 0.5432, + "grad_norm": 0.3418708829034411, + "learning_rate": 9.09401838592431e-05, + "loss": 0.5941, + "step": 679 + }, + { + "epoch": 0.544, + "grad_norm": 0.3184434479584035, + "learning_rate": 9.068207325159284e-05, + "loss": 0.5602, + "step": 680 + }, + { + "epoch": 0.5448, + "grad_norm": 0.2803254943591195, + "learning_rate": 9.04240252495963e-05, + "loss": 0.5235, + "step": 681 + }, + { + "epoch": 0.5456, + "grad_norm": 0.3113324526358792, + "learning_rate": 9.016604158703654e-05, + "loss": 0.5876, + "step": 682 + }, + { + "epoch": 0.5464, + "grad_norm": 0.32516855337457196, + "learning_rate": 8.990812399726435e-05, + "loss": 0.6085, + "step": 683 + }, + { + "epoch": 0.5472, + "grad_norm": 0.30435796694491285, + "learning_rate": 8.965027421318665e-05, + "loss": 0.5414, + "step": 684 + }, + { + "epoch": 0.548, + "grad_norm": 0.3706669941280122, + "learning_rate": 8.939249396725467e-05, + "loss": 0.6582, + "step": 685 + }, + { + "epoch": 0.5488, + "grad_norm": 0.2909623001085036, + "learning_rate": 8.913478499145254e-05, + "loss": 0.5378, + "step": 686 + }, + { + "epoch": 0.5496, + "grad_norm": 0.31528375783476587, + "learning_rate": 8.887714901728551e-05, + "loss": 0.5713, + "step": 687 + }, + { + "epoch": 0.5504, + "grad_norm": 0.3650121672123169, + "learning_rate": 8.861958777576827e-05, + "loss": 0.6142, + "step": 688 + }, + { + "epoch": 0.5512, + "grad_norm": 0.29119371050396164, + "learning_rate": 8.836210299741346e-05, + "loss": 0.5619, + "step": 689 + }, + { + "epoch": 0.552, + "grad_norm": 0.3203911171044392, + "learning_rate": 8.810469641222001e-05, + "loss": 0.5877, + "step": 690 + }, + { + "epoch": 0.5528, + "grad_norm": 0.32271298283608196, + "learning_rate": 8.784736974966135e-05, + "loss": 0.5283, + "step": 691 + }, + { + "epoch": 0.5536, + "grad_norm": 0.28878418893810864, + "learning_rate": 8.759012473867407e-05, + "loss": 0.5751, + "step": 692 + }, + { + "epoch": 0.5544, + "grad_norm": 0.30789216817595116, + "learning_rate": 8.733296310764611e-05, + "loss": 0.6049, + "step": 693 + }, + { + "epoch": 0.5552, + "grad_norm": 0.31190512927915737, + "learning_rate": 8.707588658440511e-05, + "loss": 0.5887, + "step": 694 + }, + { + "epoch": 0.556, + "grad_norm": 0.29370360358539743, + "learning_rate": 8.6818896896207e-05, + "loss": 0.5527, + "step": 695 + }, + { + "epoch": 0.5568, + "grad_norm": 0.32282691953164466, + "learning_rate": 8.656199576972423e-05, + "loss": 0.5787, + "step": 696 + }, + { + "epoch": 0.5576, + "grad_norm": 0.29607470204491804, + "learning_rate": 8.63051849310342e-05, + "loss": 0.5365, + "step": 697 + }, + { + "epoch": 0.5584, + "grad_norm": 0.28276141290643, + "learning_rate": 8.604846610560771e-05, + "loss": 0.5233, + "step": 698 + }, + { + "epoch": 0.5592, + "grad_norm": 0.3085936472841523, + "learning_rate": 8.579184101829734e-05, + "loss": 0.5557, + "step": 699 + }, + { + "epoch": 0.56, + "grad_norm": 0.30258328345313, + "learning_rate": 8.553531139332582e-05, + "loss": 0.515, + "step": 700 + }, + { + "epoch": 0.5608, + "grad_norm": 0.3030974741918099, + "learning_rate": 8.527887895427454e-05, + "loss": 0.529, + "step": 701 + }, + { + "epoch": 0.5616, + "grad_norm": 0.3032703363829799, + "learning_rate": 8.502254542407186e-05, + "loss": 0.5196, + "step": 702 + }, + { + "epoch": 0.5624, + "grad_norm": 0.31717487898699986, + "learning_rate": 8.476631252498162e-05, + "loss": 0.5342, + "step": 703 + }, + { + "epoch": 0.5632, + "grad_norm": 0.3516402124247312, + "learning_rate": 8.451018197859153e-05, + "loss": 0.5776, + "step": 704 + }, + { + "epoch": 0.564, + "grad_norm": 0.2981138602796516, + "learning_rate": 8.425415550580162e-05, + "loss": 0.5745, + "step": 705 + }, + { + "epoch": 0.5648, + "grad_norm": 0.3158015784657397, + "learning_rate": 8.399823482681262e-05, + "loss": 0.5261, + "step": 706 + }, + { + "epoch": 0.5656, + "grad_norm": 0.2882184873804512, + "learning_rate": 8.374242166111448e-05, + "loss": 0.577, + "step": 707 + }, + { + "epoch": 0.5664, + "grad_norm": 0.2738229806252894, + "learning_rate": 8.348671772747487e-05, + "loss": 0.5051, + "step": 708 + }, + { + "epoch": 0.5672, + "grad_norm": 0.32973780330976316, + "learning_rate": 8.323112474392731e-05, + "loss": 0.5888, + "step": 709 + }, + { + "epoch": 0.568, + "grad_norm": 0.29928667533722103, + "learning_rate": 8.297564442776014e-05, + "loss": 0.5406, + "step": 710 + }, + { + "epoch": 0.5688, + "grad_norm": 0.3285204465141545, + "learning_rate": 8.272027849550457e-05, + "loss": 0.597, + "step": 711 + }, + { + "epoch": 0.5696, + "grad_norm": 0.33658694854786897, + "learning_rate": 8.246502866292324e-05, + "loss": 0.5394, + "step": 712 + }, + { + "epoch": 0.5704, + "grad_norm": 0.30003941393678973, + "learning_rate": 8.220989664499878e-05, + "loss": 0.5418, + "step": 713 + }, + { + "epoch": 0.5712, + "grad_norm": 0.28418221533922805, + "learning_rate": 8.195488415592238e-05, + "loss": 0.5221, + "step": 714 + }, + { + "epoch": 0.572, + "grad_norm": 0.28466767665546777, + "learning_rate": 8.169999290908188e-05, + "loss": 0.5402, + "step": 715 + }, + { + "epoch": 0.5728, + "grad_norm": 0.3109103099273694, + "learning_rate": 8.144522461705067e-05, + "loss": 0.5595, + "step": 716 + }, + { + "epoch": 0.5736, + "grad_norm": 0.3008493866712516, + "learning_rate": 8.119058099157604e-05, + "loss": 0.5387, + "step": 717 + }, + { + "epoch": 0.5744, + "grad_norm": 0.30239359653537784, + "learning_rate": 8.093606374356759e-05, + "loss": 0.5383, + "step": 718 + }, + { + "epoch": 0.5752, + "grad_norm": 0.30888540228832956, + "learning_rate": 8.068167458308582e-05, + "loss": 0.5662, + "step": 719 + }, + { + "epoch": 0.576, + "grad_norm": 0.3041544703549886, + "learning_rate": 8.042741521933071e-05, + "loss": 0.5488, + "step": 720 + }, + { + "epoch": 0.5768, + "grad_norm": 0.3095261506372009, + "learning_rate": 8.017328736063006e-05, + "loss": 0.5889, + "step": 721 + }, + { + "epoch": 0.5776, + "grad_norm": 0.32286086032346434, + "learning_rate": 7.991929271442817e-05, + "loss": 0.6072, + "step": 722 + }, + { + "epoch": 0.5784, + "grad_norm": 0.32136638464728656, + "learning_rate": 7.966543298727425e-05, + "loss": 0.5359, + "step": 723 + }, + { + "epoch": 0.5792, + "grad_norm": 0.28516728612063824, + "learning_rate": 7.941170988481108e-05, + "loss": 0.5104, + "step": 724 + }, + { + "epoch": 0.58, + "grad_norm": 0.3978012159473873, + "learning_rate": 7.915812511176347e-05, + "loss": 0.5992, + "step": 725 + }, + { + "epoch": 0.5808, + "grad_norm": 0.36043649309480175, + "learning_rate": 7.89046803719267e-05, + "loss": 0.6012, + "step": 726 + }, + { + "epoch": 0.5816, + "grad_norm": 0.29103040749135345, + "learning_rate": 7.865137736815535e-05, + "loss": 0.5761, + "step": 727 + }, + { + "epoch": 0.5824, + "grad_norm": 0.3360190376562365, + "learning_rate": 7.839821780235168e-05, + "loss": 0.5908, + "step": 728 + }, + { + "epoch": 0.5832, + "grad_norm": 0.3301162761711765, + "learning_rate": 7.814520337545406e-05, + "loss": 0.5547, + "step": 729 + }, + { + "epoch": 0.584, + "grad_norm": 0.301639490569475, + "learning_rate": 7.789233578742582e-05, + "loss": 0.5488, + "step": 730 + }, + { + "epoch": 0.5848, + "grad_norm": 0.3087064983202081, + "learning_rate": 7.763961673724379e-05, + "loss": 0.5632, + "step": 731 + }, + { + "epoch": 0.5856, + "grad_norm": 0.3050631522442395, + "learning_rate": 7.738704792288655e-05, + "loss": 0.5799, + "step": 732 + }, + { + "epoch": 0.5864, + "grad_norm": 0.3292292803230435, + "learning_rate": 7.713463104132345e-05, + "loss": 0.5417, + "step": 733 + }, + { + "epoch": 0.5872, + "grad_norm": 0.30277773825080534, + "learning_rate": 7.688236778850306e-05, + "loss": 0.5659, + "step": 734 + }, + { + "epoch": 0.588, + "grad_norm": 0.31459609815156425, + "learning_rate": 7.663025985934158e-05, + "loss": 0.5278, + "step": 735 + }, + { + "epoch": 0.5888, + "grad_norm": 0.31405067993214364, + "learning_rate": 7.637830894771175e-05, + "loss": 0.5936, + "step": 736 + }, + { + "epoch": 0.5896, + "grad_norm": 0.3261864155110813, + "learning_rate": 7.61265167464313e-05, + "loss": 0.5373, + "step": 737 + }, + { + "epoch": 0.5904, + "grad_norm": 0.3256369222279084, + "learning_rate": 7.587488494725157e-05, + "loss": 0.5587, + "step": 738 + }, + { + "epoch": 0.5912, + "grad_norm": 0.32732422612088785, + "learning_rate": 7.562341524084623e-05, + "loss": 0.5412, + "step": 739 + }, + { + "epoch": 0.592, + "grad_norm": 0.2909075591219546, + "learning_rate": 7.537210931679987e-05, + "loss": 0.5396, + "step": 740 + }, + { + "epoch": 0.5928, + "grad_norm": 0.3102436144255524, + "learning_rate": 7.512096886359664e-05, + "loss": 0.5573, + "step": 741 + }, + { + "epoch": 0.5936, + "grad_norm": 0.3089849518644942, + "learning_rate": 7.48699955686089e-05, + "loss": 0.5607, + "step": 742 + }, + { + "epoch": 0.5944, + "grad_norm": 0.3421498615774315, + "learning_rate": 7.461919111808595e-05, + "loss": 0.5528, + "step": 743 + }, + { + "epoch": 0.5952, + "grad_norm": 0.3265352951673599, + "learning_rate": 7.43685571971426e-05, + "loss": 0.5569, + "step": 744 + }, + { + "epoch": 0.596, + "grad_norm": 0.32080085841420525, + "learning_rate": 7.411809548974792e-05, + "loss": 0.5858, + "step": 745 + }, + { + "epoch": 0.5968, + "grad_norm": 0.2728739185226048, + "learning_rate": 7.386780767871397e-05, + "loss": 0.5312, + "step": 746 + }, + { + "epoch": 0.5976, + "grad_norm": 0.31824075120690043, + "learning_rate": 7.361769544568425e-05, + "loss": 0.5732, + "step": 747 + }, + { + "epoch": 0.5984, + "grad_norm": 0.34042444345005146, + "learning_rate": 7.336776047112276e-05, + "loss": 0.5925, + "step": 748 + }, + { + "epoch": 0.5992, + "grad_norm": 0.310324892221723, + "learning_rate": 7.311800443430251e-05, + "loss": 0.5497, + "step": 749 + }, + { + "epoch": 0.6, + "grad_norm": 0.32304417812133285, + "learning_rate": 7.286842901329412e-05, + "loss": 0.6018, + "step": 750 + }, + { + "epoch": 0.6008, + "grad_norm": 0.2948690578149887, + "learning_rate": 7.26190358849548e-05, + "loss": 0.5487, + "step": 751 + }, + { + "epoch": 0.6016, + "grad_norm": 0.27983536162198325, + "learning_rate": 7.236982672491698e-05, + "loss": 0.5124, + "step": 752 + }, + { + "epoch": 0.6024, + "grad_norm": 0.3548489303896564, + "learning_rate": 7.212080320757695e-05, + "loss": 0.6156, + "step": 753 + }, + { + "epoch": 0.6032, + "grad_norm": 0.35373252999175986, + "learning_rate": 7.187196700608373e-05, + "loss": 0.6013, + "step": 754 + }, + { + "epoch": 0.604, + "grad_norm": 0.31108290438872843, + "learning_rate": 7.162331979232783e-05, + "loss": 0.5537, + "step": 755 + }, + { + "epoch": 0.6048, + "grad_norm": 0.30654349530803887, + "learning_rate": 7.137486323692995e-05, + "loss": 0.5407, + "step": 756 + }, + { + "epoch": 0.6056, + "grad_norm": 0.27628561537490304, + "learning_rate": 7.112659900922976e-05, + "loss": 0.4662, + "step": 757 + }, + { + "epoch": 0.6064, + "grad_norm": 0.2848952041422797, + "learning_rate": 7.087852877727481e-05, + "loss": 0.5305, + "step": 758 + }, + { + "epoch": 0.6072, + "grad_norm": 0.2929566568970297, + "learning_rate": 7.06306542078091e-05, + "loss": 0.5315, + "step": 759 + }, + { + "epoch": 0.608, + "grad_norm": 0.29284105716996106, + "learning_rate": 7.038297696626206e-05, + "loss": 0.535, + "step": 760 + }, + { + "epoch": 0.6088, + "grad_norm": 0.2859049360458969, + "learning_rate": 7.013549871673736e-05, + "loss": 0.5387, + "step": 761 + }, + { + "epoch": 0.6096, + "grad_norm": 0.3355319801411293, + "learning_rate": 6.988822112200156e-05, + "loss": 0.5671, + "step": 762 + }, + { + "epoch": 0.6104, + "grad_norm": 0.3203929390909087, + "learning_rate": 6.964114584347316e-05, + "loss": 0.5538, + "step": 763 + }, + { + "epoch": 0.6112, + "grad_norm": 0.3185197867849292, + "learning_rate": 6.939427454121128e-05, + "loss": 0.579, + "step": 764 + }, + { + "epoch": 0.612, + "grad_norm": 0.3280413731803322, + "learning_rate": 6.914760887390452e-05, + "loss": 0.5432, + "step": 765 + }, + { + "epoch": 0.6128, + "grad_norm": 0.356417217277443, + "learning_rate": 6.890115049885994e-05, + "loss": 0.5501, + "step": 766 + }, + { + "epoch": 0.6136, + "grad_norm": 0.34291843170723035, + "learning_rate": 6.865490107199181e-05, + "loss": 0.5891, + "step": 767 + }, + { + "epoch": 0.6144, + "grad_norm": 0.33266859183140374, + "learning_rate": 6.84088622478104e-05, + "loss": 0.6053, + "step": 768 + }, + { + "epoch": 0.6152, + "grad_norm": 0.3428756852638353, + "learning_rate": 6.816303567941112e-05, + "loss": 0.5835, + "step": 769 + }, + { + "epoch": 0.616, + "grad_norm": 0.32259740512068086, + "learning_rate": 6.791742301846326e-05, + "loss": 0.5447, + "step": 770 + }, + { + "epoch": 0.6168, + "grad_norm": 0.29520311140254396, + "learning_rate": 6.767202591519875e-05, + "loss": 0.545, + "step": 771 + }, + { + "epoch": 0.6176, + "grad_norm": 0.31599076092953177, + "learning_rate": 6.742684601840141e-05, + "loss": 0.5625, + "step": 772 + }, + { + "epoch": 0.6184, + "grad_norm": 0.30610042652307184, + "learning_rate": 6.718188497539554e-05, + "loss": 0.5232, + "step": 773 + }, + { + "epoch": 0.6192, + "grad_norm": 0.2875556938182346, + "learning_rate": 6.693714443203507e-05, + "loss": 0.5118, + "step": 774 + }, + { + "epoch": 0.62, + "grad_norm": 0.31626200677832067, + "learning_rate": 6.669262603269246e-05, + "loss": 0.6238, + "step": 775 + }, + { + "epoch": 0.6208, + "grad_norm": 0.2899591130536518, + "learning_rate": 6.644833142024751e-05, + "loss": 0.5142, + "step": 776 + }, + { + "epoch": 0.6216, + "grad_norm": 0.34510295520184053, + "learning_rate": 6.620426223607654e-05, + "loss": 0.6037, + "step": 777 + }, + { + "epoch": 0.6224, + "grad_norm": 0.3326034789489528, + "learning_rate": 6.59604201200412e-05, + "loss": 0.5752, + "step": 778 + }, + { + "epoch": 0.6232, + "grad_norm": 0.32708452711369174, + "learning_rate": 6.571680671047749e-05, + "loss": 0.559, + "step": 779 + }, + { + "epoch": 0.624, + "grad_norm": 0.2752132481519014, + "learning_rate": 6.547342364418481e-05, + "loss": 0.503, + "step": 780 + }, + { + "epoch": 0.6248, + "grad_norm": 0.2898907790607678, + "learning_rate": 6.523027255641493e-05, + "loss": 0.5365, + "step": 781 + }, + { + "epoch": 0.6256, + "grad_norm": 0.3047104154762272, + "learning_rate": 6.498735508086093e-05, + "loss": 0.5491, + "step": 782 + }, + { + "epoch": 0.6264, + "grad_norm": 0.2985657914414307, + "learning_rate": 6.474467284964634e-05, + "loss": 0.5256, + "step": 783 + }, + { + "epoch": 0.6272, + "grad_norm": 0.29121424575479926, + "learning_rate": 6.450222749331414e-05, + "loss": 0.512, + "step": 784 + }, + { + "epoch": 0.628, + "grad_norm": 0.32415455341460486, + "learning_rate": 6.426002064081565e-05, + "loss": 0.5653, + "step": 785 + }, + { + "epoch": 0.6288, + "grad_norm": 0.2966756230431178, + "learning_rate": 6.40180539194999e-05, + "loss": 0.5506, + "step": 786 + }, + { + "epoch": 0.6296, + "grad_norm": 0.3072390598848972, + "learning_rate": 6.377632895510248e-05, + "loss": 0.5494, + "step": 787 + }, + { + "epoch": 0.6304, + "grad_norm": 0.335230602942832, + "learning_rate": 6.35348473717345e-05, + "loss": 0.5828, + "step": 788 + }, + { + "epoch": 0.6312, + "grad_norm": 0.31009836683608927, + "learning_rate": 6.329361079187199e-05, + "loss": 0.556, + "step": 789 + }, + { + "epoch": 0.632, + "grad_norm": 0.30452383537861544, + "learning_rate": 6.305262083634488e-05, + "loss": 0.5497, + "step": 790 + }, + { + "epoch": 0.6328, + "grad_norm": 0.38462804061730854, + "learning_rate": 6.281187912432587e-05, + "loss": 0.5529, + "step": 791 + }, + { + "epoch": 0.6336, + "grad_norm": 0.31250693282575825, + "learning_rate": 6.25713872733199e-05, + "loss": 0.5535, + "step": 792 + }, + { + "epoch": 0.6344, + "grad_norm": 0.29974803127058314, + "learning_rate": 6.233114689915316e-05, + "loss": 0.5369, + "step": 793 + }, + { + "epoch": 0.6352, + "grad_norm": 0.2867417866680452, + "learning_rate": 6.209115961596208e-05, + "loss": 0.5438, + "step": 794 + }, + { + "epoch": 0.636, + "grad_norm": 0.2997163483151464, + "learning_rate": 6.18514270361827e-05, + "loss": 0.5556, + "step": 795 + }, + { + "epoch": 0.6368, + "grad_norm": 0.34043546647378375, + "learning_rate": 6.161195077053976e-05, + "loss": 0.5719, + "step": 796 + }, + { + "epoch": 0.6376, + "grad_norm": 0.29164856643978054, + "learning_rate": 6.13727324280358e-05, + "loss": 0.5612, + "step": 797 + }, + { + "epoch": 0.6384, + "grad_norm": 0.3012834777462016, + "learning_rate": 6.113377361594049e-05, + "loss": 0.511, + "step": 798 + }, + { + "epoch": 0.6392, + "grad_norm": 0.30801758600288326, + "learning_rate": 6.08950759397797e-05, + "loss": 0.5636, + "step": 799 + }, + { + "epoch": 0.64, + "grad_norm": 0.33889818246198006, + "learning_rate": 6.065664100332478e-05, + "loss": 0.609, + "step": 800 + }, + { + "epoch": 0.6408, + "grad_norm": 0.31475874645443735, + "learning_rate": 6.0418470408581774e-05, + "loss": 0.5595, + "step": 801 + }, + { + "epoch": 0.6416, + "grad_norm": 0.34755245829472137, + "learning_rate": 6.018056575578075e-05, + "loss": 0.5808, + "step": 802 + }, + { + "epoch": 0.6424, + "grad_norm": 0.27507511346149044, + "learning_rate": 5.9942928643364724e-05, + "loss": 0.5395, + "step": 803 + }, + { + "epoch": 0.6432, + "grad_norm": 0.3233144309369947, + "learning_rate": 5.970556066797941e-05, + "loss": 0.5376, + "step": 804 + }, + { + "epoch": 0.644, + "grad_norm": 0.29949070972137315, + "learning_rate": 5.946846342446214e-05, + "loss": 0.5369, + "step": 805 + }, + { + "epoch": 0.6448, + "grad_norm": 0.29285700624899347, + "learning_rate": 5.923163850583113e-05, + "loss": 0.5022, + "step": 806 + }, + { + "epoch": 0.6456, + "grad_norm": 0.32795453858716095, + "learning_rate": 5.899508750327501e-05, + "loss": 0.5413, + "step": 807 + }, + { + "epoch": 0.6464, + "grad_norm": 0.30826104843825375, + "learning_rate": 5.875881200614207e-05, + "loss": 0.568, + "step": 808 + }, + { + "epoch": 0.6472, + "grad_norm": 0.3143327052024473, + "learning_rate": 5.8522813601929324e-05, + "loss": 0.5331, + "step": 809 + }, + { + "epoch": 0.648, + "grad_norm": 0.31702164905563585, + "learning_rate": 5.828709387627218e-05, + "loss": 0.5746, + "step": 810 + }, + { + "epoch": 0.6488, + "grad_norm": 0.3121974900153664, + "learning_rate": 5.80516544129337e-05, + "loss": 0.5598, + "step": 811 + }, + { + "epoch": 0.6496, + "grad_norm": 0.3012287251823861, + "learning_rate": 5.781649679379378e-05, + "loss": 0.5156, + "step": 812 + }, + { + "epoch": 0.6504, + "grad_norm": 0.30979212824941293, + "learning_rate": 5.758162259883867e-05, + "loss": 0.5661, + "step": 813 + }, + { + "epoch": 0.6512, + "grad_norm": 0.29784433613118455, + "learning_rate": 5.73470334061505e-05, + "loss": 0.5169, + "step": 814 + }, + { + "epoch": 0.652, + "grad_norm": 0.30376763893493497, + "learning_rate": 5.7112730791896207e-05, + "loss": 0.5516, + "step": 815 + }, + { + "epoch": 0.6528, + "grad_norm": 0.3898317785979762, + "learning_rate": 5.687871633031754e-05, + "loss": 0.5805, + "step": 816 + }, + { + "epoch": 0.6536, + "grad_norm": 0.3094852084266824, + "learning_rate": 5.664499159372017e-05, + "loss": 0.5859, + "step": 817 + }, + { + "epoch": 0.6544, + "grad_norm": 0.320339966420363, + "learning_rate": 5.6411558152462894e-05, + "loss": 0.5822, + "step": 818 + }, + { + "epoch": 0.6552, + "grad_norm": 0.32006842269251284, + "learning_rate": 5.617841757494762e-05, + "loss": 0.5352, + "step": 819 + }, + { + "epoch": 0.656, + "grad_norm": 0.37750231534770534, + "learning_rate": 5.5945571427608526e-05, + "loss": 0.6046, + "step": 820 + }, + { + "epoch": 0.6568, + "grad_norm": 0.36655609158769825, + "learning_rate": 5.5713021274901335e-05, + "loss": 0.5919, + "step": 821 + }, + { + "epoch": 0.6576, + "grad_norm": 0.28623817014447206, + "learning_rate": 5.54807686792933e-05, + "loss": 0.5285, + "step": 822 + }, + { + "epoch": 0.6584, + "grad_norm": 0.33349397751189286, + "learning_rate": 5.524881520125229e-05, + "loss": 0.6317, + "step": 823 + }, + { + "epoch": 0.6592, + "grad_norm": 0.335647348296861, + "learning_rate": 5.501716239923642e-05, + "loss": 0.5885, + "step": 824 + }, + { + "epoch": 0.66, + "grad_norm": 0.3232602598959376, + "learning_rate": 5.4785811829683764e-05, + "loss": 0.5834, + "step": 825 + }, + { + "epoch": 0.6608, + "grad_norm": 0.3361254671620294, + "learning_rate": 5.4554765047001613e-05, + "loss": 0.5939, + "step": 826 + }, + { + "epoch": 0.6616, + "grad_norm": 0.31945038832981176, + "learning_rate": 5.432402360355615e-05, + "loss": 0.534, + "step": 827 + }, + { + "epoch": 0.6624, + "grad_norm": 0.31226624481325327, + "learning_rate": 5.4093589049662175e-05, + "loss": 0.5422, + "step": 828 + }, + { + "epoch": 0.6632, + "grad_norm": 0.3077602752436613, + "learning_rate": 5.386346293357242e-05, + "loss": 0.574, + "step": 829 + }, + { + "epoch": 0.664, + "grad_norm": 0.3336995198562612, + "learning_rate": 5.363364680146725e-05, + "loss": 0.5377, + "step": 830 + }, + { + "epoch": 0.6648, + "grad_norm": 0.35153220115023254, + "learning_rate": 5.3404142197444506e-05, + "loss": 0.5564, + "step": 831 + }, + { + "epoch": 0.6656, + "grad_norm": 0.32485718185413065, + "learning_rate": 5.31749506635086e-05, + "loss": 0.5886, + "step": 832 + }, + { + "epoch": 0.6664, + "grad_norm": 0.30590842283104547, + "learning_rate": 5.2946073739560706e-05, + "loss": 0.5554, + "step": 833 + }, + { + "epoch": 0.6672, + "grad_norm": 0.32672208522717094, + "learning_rate": 5.271751296338823e-05, + "loss": 0.5595, + "step": 834 + }, + { + "epoch": 0.668, + "grad_norm": 0.3192783343397264, + "learning_rate": 5.248926987065417e-05, + "loss": 0.5418, + "step": 835 + }, + { + "epoch": 0.6688, + "grad_norm": 0.30602064229834347, + "learning_rate": 5.226134599488728e-05, + "loss": 0.5657, + "step": 836 + }, + { + "epoch": 0.6696, + "grad_norm": 0.29687986531333965, + "learning_rate": 5.203374286747158e-05, + "loss": 0.5329, + "step": 837 + }, + { + "epoch": 0.6704, + "grad_norm": 0.34079652730441223, + "learning_rate": 5.180646201763577e-05, + "loss": 0.6219, + "step": 838 + }, + { + "epoch": 0.6712, + "grad_norm": 0.31774442893558724, + "learning_rate": 5.15795049724435e-05, + "loss": 0.5756, + "step": 839 + }, + { + "epoch": 0.672, + "grad_norm": 0.29105110444104043, + "learning_rate": 5.135287325678271e-05, + "loss": 0.5301, + "step": 840 + }, + { + "epoch": 0.6728, + "grad_norm": 0.28591438949126685, + "learning_rate": 5.112656839335543e-05, + "loss": 0.4973, + "step": 841 + }, + { + "epoch": 0.6736, + "grad_norm": 0.336860252500957, + "learning_rate": 5.090059190266779e-05, + "loss": 0.5571, + "step": 842 + }, + { + "epoch": 0.6744, + "grad_norm": 0.28934036088163895, + "learning_rate": 5.0674945303019526e-05, + "loss": 0.5367, + "step": 843 + }, + { + "epoch": 0.6752, + "grad_norm": 0.29943661800798266, + "learning_rate": 5.0449630110493836e-05, + "loss": 0.5513, + "step": 844 + }, + { + "epoch": 0.676, + "grad_norm": 0.3392111301843805, + "learning_rate": 5.022464783894744e-05, + "loss": 0.5423, + "step": 845 + }, + { + "epoch": 0.6768, + "grad_norm": 0.3107349966613786, + "learning_rate": 5.000000000000002e-05, + "loss": 0.5184, + "step": 846 + }, + { + "epoch": 0.6776, + "grad_norm": 0.2979788532115799, + "learning_rate": 4.977568810302432e-05, + "loss": 0.5369, + "step": 847 + }, + { + "epoch": 0.6784, + "grad_norm": 0.29924739594845984, + "learning_rate": 4.955171365513603e-05, + "loss": 0.5513, + "step": 848 + }, + { + "epoch": 0.6792, + "grad_norm": 0.33469631034634045, + "learning_rate": 4.9328078161183464e-05, + "loss": 0.5512, + "step": 849 + }, + { + "epoch": 0.68, + "grad_norm": 0.29595906303526687, + "learning_rate": 4.9104783123737566e-05, + "loss": 0.5319, + "step": 850 + }, + { + "epoch": 0.6808, + "grad_norm": 0.30291502645029644, + "learning_rate": 4.88818300430819e-05, + "loss": 0.5502, + "step": 851 + }, + { + "epoch": 0.6816, + "grad_norm": 0.35328192530331826, + "learning_rate": 4.865922041720239e-05, + "loss": 0.5762, + "step": 852 + }, + { + "epoch": 0.6824, + "grad_norm": 0.28540925455822486, + "learning_rate": 4.843695574177737e-05, + "loss": 0.4979, + "step": 853 + }, + { + "epoch": 0.6832, + "grad_norm": 0.304605777689829, + "learning_rate": 4.821503751016746e-05, + "loss": 0.5337, + "step": 854 + }, + { + "epoch": 0.684, + "grad_norm": 0.2729018856144321, + "learning_rate": 4.7993467213405706e-05, + "loss": 0.5108, + "step": 855 + }, + { + "epoch": 0.6848, + "grad_norm": 0.30020507460061946, + "learning_rate": 4.777224634018732e-05, + "loss": 0.5207, + "step": 856 + }, + { + "epoch": 0.6856, + "grad_norm": 0.34238623815047864, + "learning_rate": 4.755137637685979e-05, + "loss": 0.5534, + "step": 857 + }, + { + "epoch": 0.6864, + "grad_norm": 0.35003527619217056, + "learning_rate": 4.733085880741301e-05, + "loss": 0.5984, + "step": 858 + }, + { + "epoch": 0.6872, + "grad_norm": 0.32561025592189263, + "learning_rate": 4.7110695113469085e-05, + "loss": 0.5739, + "step": 859 + }, + { + "epoch": 0.688, + "grad_norm": 0.33034226924047594, + "learning_rate": 4.689088677427249e-05, + "loss": 0.5347, + "step": 860 + }, + { + "epoch": 0.6888, + "grad_norm": 0.2792723221355915, + "learning_rate": 4.6671435266680216e-05, + "loss": 0.5258, + "step": 861 + }, + { + "epoch": 0.6896, + "grad_norm": 0.3044656271243137, + "learning_rate": 4.645234206515171e-05, + "loss": 0.517, + "step": 862 + }, + { + "epoch": 0.6904, + "grad_norm": 0.3360880809613938, + "learning_rate": 4.623360864173893e-05, + "loss": 0.5362, + "step": 863 + }, + { + "epoch": 0.6912, + "grad_norm": 0.3200015230925606, + "learning_rate": 4.6015236466076747e-05, + "loss": 0.5639, + "step": 864 + }, + { + "epoch": 0.692, + "grad_norm": 0.40887674834776866, + "learning_rate": 4.579722700537268e-05, + "loss": 0.5625, + "step": 865 + }, + { + "epoch": 0.6928, + "grad_norm": 0.3145436452910505, + "learning_rate": 4.5579581724397255e-05, + "loss": 0.5451, + "step": 866 + }, + { + "epoch": 0.6936, + "grad_norm": 0.31880844342506026, + "learning_rate": 4.5362302085474254e-05, + "loss": 0.5465, + "step": 867 + }, + { + "epoch": 0.6944, + "grad_norm": 0.34586690000736525, + "learning_rate": 4.514538954847064e-05, + "loss": 0.5597, + "step": 868 + }, + { + "epoch": 0.6952, + "grad_norm": 0.3121607435832115, + "learning_rate": 4.492884557078688e-05, + "loss": 0.5423, + "step": 869 + }, + { + "epoch": 0.696, + "grad_norm": 0.29565589950502735, + "learning_rate": 4.471267160734731e-05, + "loss": 0.5499, + "step": 870 + }, + { + "epoch": 0.6968, + "grad_norm": 0.3133910880222989, + "learning_rate": 4.449686911058992e-05, + "loss": 0.5773, + "step": 871 + }, + { + "epoch": 0.6976, + "grad_norm": 0.30250808339080254, + "learning_rate": 4.428143953045717e-05, + "loss": 0.5252, + "step": 872 + }, + { + "epoch": 0.6984, + "grad_norm": 0.31168795673266836, + "learning_rate": 4.406638431438576e-05, + "loss": 0.5758, + "step": 873 + }, + { + "epoch": 0.6992, + "grad_norm": 0.30860647715999096, + "learning_rate": 4.385170490729712e-05, + "loss": 0.5537, + "step": 874 + }, + { + "epoch": 0.7, + "grad_norm": 0.30206562614258453, + "learning_rate": 4.36374027515878e-05, + "loss": 0.5735, + "step": 875 + }, + { + "epoch": 0.7008, + "grad_norm": 0.29150087234273553, + "learning_rate": 4.342347928711953e-05, + "loss": 0.5506, + "step": 876 + }, + { + "epoch": 0.7016, + "grad_norm": 0.30932251242575176, + "learning_rate": 4.320993595120969e-05, + "loss": 0.5691, + "step": 877 + }, + { + "epoch": 0.7024, + "grad_norm": 0.34688608229870943, + "learning_rate": 4.2996774178621736e-05, + "loss": 0.5896, + "step": 878 + }, + { + "epoch": 0.7032, + "grad_norm": 0.299346299652687, + "learning_rate": 4.278399540155536e-05, + "loss": 0.5422, + "step": 879 + }, + { + "epoch": 0.704, + "grad_norm": 0.3134175521993939, + "learning_rate": 4.257160104963696e-05, + "loss": 0.5478, + "step": 880 + }, + { + "epoch": 0.7048, + "grad_norm": 0.25714295747613014, + "learning_rate": 4.2359592549910145e-05, + "loss": 0.4925, + "step": 881 + }, + { + "epoch": 0.7056, + "grad_norm": 0.3028365511780001, + "learning_rate": 4.2147971326825966e-05, + "loss": 0.536, + "step": 882 + }, + { + "epoch": 0.7064, + "grad_norm": 0.32235312108259556, + "learning_rate": 4.193673880223339e-05, + "loss": 0.5728, + "step": 883 + }, + { + "epoch": 0.7072, + "grad_norm": 0.32081281191842137, + "learning_rate": 4.172589639536991e-05, + "loss": 0.5634, + "step": 884 + }, + { + "epoch": 0.708, + "grad_norm": 0.3827895095206315, + "learning_rate": 4.1515445522851784e-05, + "loss": 0.6354, + "step": 885 + }, + { + "epoch": 0.7088, + "grad_norm": 0.30863741089331764, + "learning_rate": 4.130538759866457e-05, + "loss": 0.5537, + "step": 886 + }, + { + "epoch": 0.7096, + "grad_norm": 0.3256260881947753, + "learning_rate": 4.109572403415386e-05, + "loss": 0.5867, + "step": 887 + }, + { + "epoch": 0.7104, + "grad_norm": 0.3050446437744646, + "learning_rate": 4.088645623801534e-05, + "loss": 0.5361, + "step": 888 + }, + { + "epoch": 0.7112, + "grad_norm": 0.3274713332029523, + "learning_rate": 4.0677585616285774e-05, + "loss": 0.5497, + "step": 889 + }, + { + "epoch": 0.712, + "grad_norm": 0.3189105663709932, + "learning_rate": 4.046911357233343e-05, + "loss": 0.5626, + "step": 890 + }, + { + "epoch": 0.7128, + "grad_norm": 0.2798343479189164, + "learning_rate": 4.026104150684835e-05, + "loss": 0.516, + "step": 891 + }, + { + "epoch": 0.7136, + "grad_norm": 0.2954774076825385, + "learning_rate": 4.00533708178334e-05, + "loss": 0.512, + "step": 892 + }, + { + "epoch": 0.7144, + "grad_norm": 0.3147840267355935, + "learning_rate": 3.984610290059467e-05, + "loss": 0.5566, + "step": 893 + }, + { + "epoch": 0.7152, + "grad_norm": 0.3032627309661228, + "learning_rate": 3.963923914773187e-05, + "loss": 0.5088, + "step": 894 + }, + { + "epoch": 0.716, + "grad_norm": 0.31170713223938146, + "learning_rate": 3.943278094912946e-05, + "loss": 0.5535, + "step": 895 + }, + { + "epoch": 0.7168, + "grad_norm": 0.32461281893145943, + "learning_rate": 3.922672969194686e-05, + "loss": 0.5957, + "step": 896 + }, + { + "epoch": 0.7176, + "grad_norm": 0.2847997799830815, + "learning_rate": 3.902108676060937e-05, + "loss": 0.4989, + "step": 897 + }, + { + "epoch": 0.7184, + "grad_norm": 0.33089303204648524, + "learning_rate": 3.8815853536798904e-05, + "loss": 0.5405, + "step": 898 + }, + { + "epoch": 0.7192, + "grad_norm": 0.29905218250378424, + "learning_rate": 3.861103139944449e-05, + "loss": 0.556, + "step": 899 + }, + { + "epoch": 0.72, + "grad_norm": 0.31422611021653635, + "learning_rate": 3.840662172471315e-05, + "loss": 0.5443, + "step": 900 + }, + { + "epoch": 0.7208, + "grad_norm": 0.31144595157979055, + "learning_rate": 3.820262588600074e-05, + "loss": 0.519, + "step": 901 + }, + { + "epoch": 0.7216, + "grad_norm": 0.3123499837096723, + "learning_rate": 3.79990452539225e-05, + "loss": 0.5426, + "step": 902 + }, + { + "epoch": 0.7224, + "grad_norm": 0.3459777573293554, + "learning_rate": 3.7795881196303995e-05, + "loss": 0.5925, + "step": 903 + }, + { + "epoch": 0.7232, + "grad_norm": 0.3175452941693634, + "learning_rate": 3.759313507817196e-05, + "loss": 0.5195, + "step": 904 + }, + { + "epoch": 0.724, + "grad_norm": 0.30567617639153344, + "learning_rate": 3.739080826174498e-05, + "loss": 0.5279, + "step": 905 + }, + { + "epoch": 0.7248, + "grad_norm": 0.35116963657254624, + "learning_rate": 3.7188902106424416e-05, + "loss": 0.5305, + "step": 906 + }, + { + "epoch": 0.7256, + "grad_norm": 0.28375567723560385, + "learning_rate": 3.6987417968785366e-05, + "loss": 0.5184, + "step": 907 + }, + { + "epoch": 0.7264, + "grad_norm": 0.2869409192199368, + "learning_rate": 3.678635720256737e-05, + "loss": 0.5551, + "step": 908 + }, + { + "epoch": 0.7272, + "grad_norm": 0.29937788488621553, + "learning_rate": 3.658572115866541e-05, + "loss": 0.4832, + "step": 909 + }, + { + "epoch": 0.728, + "grad_norm": 0.3262491820167027, + "learning_rate": 3.638551118512089e-05, + "loss": 0.5378, + "step": 910 + }, + { + "epoch": 0.7288, + "grad_norm": 0.30112379649038384, + "learning_rate": 3.618572862711247e-05, + "loss": 0.5688, + "step": 911 + }, + { + "epoch": 0.7296, + "grad_norm": 0.29810789933135196, + "learning_rate": 3.5986374826947066e-05, + "loss": 0.5481, + "step": 912 + }, + { + "epoch": 0.7304, + "grad_norm": 0.31552908175115096, + "learning_rate": 3.578745112405083e-05, + "loss": 0.5245, + "step": 913 + }, + { + "epoch": 0.7312, + "grad_norm": 0.29393333704746166, + "learning_rate": 3.558895885496023e-05, + "loss": 0.488, + "step": 914 + }, + { + "epoch": 0.732, + "grad_norm": 0.31391098423307, + "learning_rate": 3.539089935331294e-05, + "loss": 0.5536, + "step": 915 + }, + { + "epoch": 0.7328, + "grad_norm": 0.3346782470622131, + "learning_rate": 3.519327394983888e-05, + "loss": 0.5329, + "step": 916 + }, + { + "epoch": 0.7336, + "grad_norm": 0.3672665593378921, + "learning_rate": 3.4996083972351515e-05, + "loss": 0.5678, + "step": 917 + }, + { + "epoch": 0.7344, + "grad_norm": 0.2944453587769173, + "learning_rate": 3.479933074573858e-05, + "loss": 0.5462, + "step": 918 + }, + { + "epoch": 0.7352, + "grad_norm": 0.3370399550819899, + "learning_rate": 3.4603015591953395e-05, + "loss": 0.5628, + "step": 919 + }, + { + "epoch": 0.736, + "grad_norm": 0.3288714841640314, + "learning_rate": 3.440713983000601e-05, + "loss": 0.5981, + "step": 920 + }, + { + "epoch": 0.7368, + "grad_norm": 0.29689548182034403, + "learning_rate": 3.421170477595419e-05, + "loss": 0.5266, + "step": 921 + }, + { + "epoch": 0.7376, + "grad_norm": 0.3080060985178883, + "learning_rate": 3.401671174289469e-05, + "loss": 0.4876, + "step": 922 + }, + { + "epoch": 0.7384, + "grad_norm": 0.29239362748621933, + "learning_rate": 3.3822162040954354e-05, + "loss": 0.5039, + "step": 923 + }, + { + "epoch": 0.7392, + "grad_norm": 0.34189268844884885, + "learning_rate": 3.362805697728145e-05, + "loss": 0.5757, + "step": 924 + }, + { + "epoch": 0.74, + "grad_norm": 0.30140273049519617, + "learning_rate": 3.34343978560367e-05, + "loss": 0.5666, + "step": 925 + }, + { + "epoch": 0.7408, + "grad_norm": 0.312712966702646, + "learning_rate": 3.324118597838464e-05, + "loss": 0.5343, + "step": 926 + }, + { + "epoch": 0.7416, + "grad_norm": 0.32469996843838494, + "learning_rate": 3.3048422642484886e-05, + "loss": 0.5954, + "step": 927 + }, + { + "epoch": 0.7424, + "grad_norm": 0.31158861396211135, + "learning_rate": 3.285610914348332e-05, + "loss": 0.5284, + "step": 928 + }, + { + "epoch": 0.7432, + "grad_norm": 0.3399520386722396, + "learning_rate": 3.266424677350346e-05, + "loss": 0.5834, + "step": 929 + }, + { + "epoch": 0.744, + "grad_norm": 0.31447096265182917, + "learning_rate": 3.2472836821637744e-05, + "loss": 0.5595, + "step": 930 + }, + { + "epoch": 0.7448, + "grad_norm": 0.3295635813346338, + "learning_rate": 3.228188057393895e-05, + "loss": 0.5963, + "step": 931 + }, + { + "epoch": 0.7456, + "grad_norm": 0.31492787825694496, + "learning_rate": 3.209137931341143e-05, + "loss": 0.5387, + "step": 932 + }, + { + "epoch": 0.7464, + "grad_norm": 0.34873012265855585, + "learning_rate": 3.190133432000252e-05, + "loss": 0.5469, + "step": 933 + }, + { + "epoch": 0.7472, + "grad_norm": 0.2882059407031955, + "learning_rate": 3.1711746870594086e-05, + "loss": 0.526, + "step": 934 + }, + { + "epoch": 0.748, + "grad_norm": 0.357003593863277, + "learning_rate": 3.1522618238993725e-05, + "loss": 0.5742, + "step": 935 + }, + { + "epoch": 0.7488, + "grad_norm": 0.3059292031158361, + "learning_rate": 3.1333949695926324e-05, + "loss": 0.5542, + "step": 936 + }, + { + "epoch": 0.7496, + "grad_norm": 0.34228930243021916, + "learning_rate": 3.114574250902558e-05, + "loss": 0.5418, + "step": 937 + }, + { + "epoch": 0.7504, + "grad_norm": 0.31358476760363263, + "learning_rate": 3.0957997942825336e-05, + "loss": 0.5515, + "step": 938 + }, + { + "epoch": 0.7512, + "grad_norm": 0.2991049253730696, + "learning_rate": 3.077071725875116e-05, + "loss": 0.5499, + "step": 939 + }, + { + "epoch": 0.752, + "grad_norm": 0.3246964150268922, + "learning_rate": 3.058390171511196e-05, + "loss": 0.5379, + "step": 940 + }, + { + "epoch": 0.7528, + "grad_norm": 0.2800061952399545, + "learning_rate": 3.0397552567091337e-05, + "loss": 0.492, + "step": 941 + }, + { + "epoch": 0.7536, + "grad_norm": 0.2953120642075083, + "learning_rate": 3.021167106673928e-05, + "loss": 0.5296, + "step": 942 + }, + { + "epoch": 0.7544, + "grad_norm": 0.3245840047249902, + "learning_rate": 3.0026258462963787e-05, + "loss": 0.5415, + "step": 943 + }, + { + "epoch": 0.7552, + "grad_norm": 0.3082445800103548, + "learning_rate": 2.9841316001522347e-05, + "loss": 0.501, + "step": 944 + }, + { + "epoch": 0.756, + "grad_norm": 0.32292629991709976, + "learning_rate": 2.9656844925013637e-05, + "loss": 0.4635, + "step": 945 + }, + { + "epoch": 0.7568, + "grad_norm": 0.299094795236697, + "learning_rate": 2.9472846472869298e-05, + "loss": 0.4686, + "step": 946 + }, + { + "epoch": 0.7576, + "grad_norm": 0.3508599466147304, + "learning_rate": 2.9289321881345254e-05, + "loss": 0.5786, + "step": 947 + }, + { + "epoch": 0.7584, + "grad_norm": 0.3171071081622717, + "learning_rate": 2.9106272383513835e-05, + "loss": 0.5668, + "step": 948 + }, + { + "epoch": 0.7592, + "grad_norm": 0.28065758877521735, + "learning_rate": 2.8923699209255284e-05, + "loss": 0.5089, + "step": 949 + }, + { + "epoch": 0.76, + "grad_norm": 0.32489832926900575, + "learning_rate": 2.874160358524931e-05, + "loss": 0.5173, + "step": 950 + }, + { + "epoch": 0.7608, + "grad_norm": 0.3095537494019464, + "learning_rate": 2.8559986734967282e-05, + "loss": 0.546, + "step": 951 + }, + { + "epoch": 0.7616, + "grad_norm": 0.32034370572961807, + "learning_rate": 2.8378849878663628e-05, + "loss": 0.5171, + "step": 952 + }, + { + "epoch": 0.7624, + "grad_norm": 0.3187215992671766, + "learning_rate": 2.819819423336775e-05, + "loss": 0.5504, + "step": 953 + }, + { + "epoch": 0.7632, + "grad_norm": 0.3146826672456126, + "learning_rate": 2.8018021012875994e-05, + "loss": 0.4789, + "step": 954 + }, + { + "epoch": 0.764, + "grad_norm": 0.3190013611747521, + "learning_rate": 2.7838331427743282e-05, + "loss": 0.5321, + "step": 955 + }, + { + "epoch": 0.7648, + "grad_norm": 0.326748839285513, + "learning_rate": 2.7659126685275027e-05, + "loss": 0.5294, + "step": 956 + }, + { + "epoch": 0.7656, + "grad_norm": 0.3081416312905957, + "learning_rate": 2.7480407989519198e-05, + "loss": 0.5232, + "step": 957 + }, + { + "epoch": 0.7664, + "grad_norm": 0.29513374549201843, + "learning_rate": 2.7302176541257986e-05, + "loss": 0.5401, + "step": 958 + }, + { + "epoch": 0.7672, + "grad_norm": 0.2773408705908564, + "learning_rate": 2.712443353799984e-05, + "loss": 0.4806, + "step": 959 + }, + { + "epoch": 0.768, + "grad_norm": 0.2798973656042734, + "learning_rate": 2.6947180173971508e-05, + "loss": 0.4714, + "step": 960 + }, + { + "epoch": 0.7688, + "grad_norm": 0.30574862160174193, + "learning_rate": 2.677041764010988e-05, + "loss": 0.4972, + "step": 961 + }, + { + "epoch": 0.7696, + "grad_norm": 0.32748214795140573, + "learning_rate": 2.659414712405398e-05, + "loss": 0.5661, + "step": 962 + }, + { + "epoch": 0.7704, + "grad_norm": 0.3252553366736222, + "learning_rate": 2.6418369810137188e-05, + "loss": 0.5583, + "step": 963 + }, + { + "epoch": 0.7712, + "grad_norm": 0.35670821112923623, + "learning_rate": 2.6243086879379e-05, + "loss": 0.5849, + "step": 964 + }, + { + "epoch": 0.772, + "grad_norm": 0.2871837048418551, + "learning_rate": 2.6068299509477266e-05, + "loss": 0.5328, + "step": 965 + }, + { + "epoch": 0.7728, + "grad_norm": 0.31323911862742787, + "learning_rate": 2.5894008874800325e-05, + "loss": 0.535, + "step": 966 + }, + { + "epoch": 0.7736, + "grad_norm": 0.32004837470051534, + "learning_rate": 2.5720216146378917e-05, + "loss": 0.54, + "step": 967 + }, + { + "epoch": 0.7744, + "grad_norm": 0.30508189378285194, + "learning_rate": 2.5546922491898495e-05, + "loss": 0.5241, + "step": 968 + }, + { + "epoch": 0.7752, + "grad_norm": 0.28600891039631177, + "learning_rate": 2.5374129075691265e-05, + "loss": 0.5407, + "step": 969 + }, + { + "epoch": 0.776, + "grad_norm": 0.2685474416983108, + "learning_rate": 2.5201837058728505e-05, + "loss": 0.5263, + "step": 970 + }, + { + "epoch": 0.7768, + "grad_norm": 0.3288666824268659, + "learning_rate": 2.503004759861258e-05, + "loss": 0.5633, + "step": 971 + }, + { + "epoch": 0.7776, + "grad_norm": 0.3279722764357726, + "learning_rate": 2.485876184956928e-05, + "loss": 0.5567, + "step": 972 + }, + { + "epoch": 0.7784, + "grad_norm": 0.30724088779643627, + "learning_rate": 2.4687980962440072e-05, + "loss": 0.526, + "step": 973 + }, + { + "epoch": 0.7792, + "grad_norm": 0.3075719636900623, + "learning_rate": 2.451770608467432e-05, + "loss": 0.5779, + "step": 974 + }, + { + "epoch": 0.78, + "grad_norm": 0.3264675500929435, + "learning_rate": 2.4347938360321566e-05, + "loss": 0.5627, + "step": 975 + }, + { + "epoch": 0.7808, + "grad_norm": 0.3487082024475743, + "learning_rate": 2.417867893002387e-05, + "loss": 0.5495, + "step": 976 + }, + { + "epoch": 0.7816, + "grad_norm": 0.30440794115951253, + "learning_rate": 2.400992893100822e-05, + "loss": 0.5452, + "step": 977 + }, + { + "epoch": 0.7824, + "grad_norm": 0.32424742144960367, + "learning_rate": 2.3841689497078746e-05, + "loss": 0.5304, + "step": 978 + }, + { + "epoch": 0.7832, + "grad_norm": 0.38630918402153647, + "learning_rate": 2.3673961758609152e-05, + "loss": 0.5589, + "step": 979 + }, + { + "epoch": 0.784, + "grad_norm": 0.2915176550129016, + "learning_rate": 2.3506746842535242e-05, + "loss": 0.5208, + "step": 980 + }, + { + "epoch": 0.7848, + "grad_norm": 0.3292020277917887, + "learning_rate": 2.334004587234717e-05, + "loss": 0.5611, + "step": 981 + }, + { + "epoch": 0.7856, + "grad_norm": 0.32495013533889283, + "learning_rate": 2.3173859968081944e-05, + "loss": 0.5448, + "step": 982 + }, + { + "epoch": 0.7864, + "grad_norm": 0.3282110627791997, + "learning_rate": 2.300819024631603e-05, + "loss": 0.5604, + "step": 983 + }, + { + "epoch": 0.7872, + "grad_norm": 0.28993296253098433, + "learning_rate": 2.2843037820157675e-05, + "loss": 0.4881, + "step": 984 + }, + { + "epoch": 0.788, + "grad_norm": 0.33432515559265674, + "learning_rate": 2.26784037992395e-05, + "loss": 0.5635, + "step": 985 + }, + { + "epoch": 0.7888, + "grad_norm": 0.31056272504625193, + "learning_rate": 2.251428928971102e-05, + "loss": 0.5052, + "step": 986 + }, + { + "epoch": 0.7896, + "grad_norm": 0.2784522543426241, + "learning_rate": 2.2350695394231345e-05, + "loss": 0.5279, + "step": 987 + }, + { + "epoch": 0.7904, + "grad_norm": 0.28274294573486125, + "learning_rate": 2.2187623211961562e-05, + "loss": 0.5471, + "step": 988 + }, + { + "epoch": 0.7912, + "grad_norm": 0.3666468314245622, + "learning_rate": 2.2025073838557454e-05, + "loss": 0.5479, + "step": 989 + }, + { + "epoch": 0.792, + "grad_norm": 0.3077175023277414, + "learning_rate": 2.1863048366162208e-05, + "loss": 0.5127, + "step": 990 + }, + { + "epoch": 0.7928, + "grad_norm": 0.3251709821723278, + "learning_rate": 2.1701547883398922e-05, + "loss": 0.583, + "step": 991 + }, + { + "epoch": 0.7936, + "grad_norm": 0.30886321887992296, + "learning_rate": 2.1540573475363402e-05, + "loss": 0.5752, + "step": 992 + }, + { + "epoch": 0.7944, + "grad_norm": 0.27366018678935805, + "learning_rate": 2.138012622361689e-05, + "loss": 0.5128, + "step": 993 + }, + { + "epoch": 0.7952, + "grad_norm": 0.3000169994037466, + "learning_rate": 2.1220207206178688e-05, + "loss": 0.4773, + "step": 994 + }, + { + "epoch": 0.796, + "grad_norm": 0.3300365338724269, + "learning_rate": 2.106081749751897e-05, + "loss": 0.5475, + "step": 995 + }, + { + "epoch": 0.7968, + "grad_norm": 0.2875891880642037, + "learning_rate": 2.0901958168551638e-05, + "loss": 0.5044, + "step": 996 + }, + { + "epoch": 0.7976, + "grad_norm": 0.3142456816575401, + "learning_rate": 2.0743630286627002e-05, + "loss": 0.5518, + "step": 997 + }, + { + "epoch": 0.7984, + "grad_norm": 0.3506197385399373, + "learning_rate": 2.058583491552465e-05, + "loss": 0.5615, + "step": 998 + }, + { + "epoch": 0.7992, + "grad_norm": 0.32899475302997194, + "learning_rate": 2.0428573115446392e-05, + "loss": 0.5667, + "step": 999 + }, + { + "epoch": 0.8, + "grad_norm": 0.3056242008956224, + "learning_rate": 2.027184594300898e-05, + "loss": 0.5245, + "step": 1000 + }, + { + "epoch": 0.8008, + "grad_norm": 0.28563965971737626, + "learning_rate": 2.011565445123711e-05, + "loss": 0.5152, + "step": 1001 + }, + { + "epoch": 0.8016, + "grad_norm": 0.2870864372747718, + "learning_rate": 1.995999968955641e-05, + "loss": 0.5015, + "step": 1002 + }, + { + "epoch": 0.8024, + "grad_norm": 0.3373663391946109, + "learning_rate": 1.980488270378612e-05, + "loss": 0.5638, + "step": 1003 + }, + { + "epoch": 0.8032, + "grad_norm": 0.3496232984165067, + "learning_rate": 1.9650304536132426e-05, + "loss": 0.5595, + "step": 1004 + }, + { + "epoch": 0.804, + "grad_norm": 0.28995611594609594, + "learning_rate": 1.9496266225181248e-05, + "loss": 0.4965, + "step": 1005 + }, + { + "epoch": 0.8048, + "grad_norm": 0.4967736974804734, + "learning_rate": 1.9342768805891178e-05, + "loss": 0.5359, + "step": 1006 + }, + { + "epoch": 0.8056, + "grad_norm": 0.3453416340275592, + "learning_rate": 1.918981330958678e-05, + "loss": 0.5437, + "step": 1007 + }, + { + "epoch": 0.8064, + "grad_norm": 0.3123927141691169, + "learning_rate": 1.903740076395151e-05, + "loss": 0.554, + "step": 1008 + }, + { + "epoch": 0.8072, + "grad_norm": 0.3543812860374282, + "learning_rate": 1.8885532193020704e-05, + "loss": 0.5634, + "step": 1009 + }, + { + "epoch": 0.808, + "grad_norm": 0.30432269982327737, + "learning_rate": 1.8734208617174988e-05, + "loss": 0.5288, + "step": 1010 + }, + { + "epoch": 0.8088, + "grad_norm": 0.3003660393916413, + "learning_rate": 1.8583431053133127e-05, + "loss": 0.5476, + "step": 1011 + }, + { + "epoch": 0.8096, + "grad_norm": 0.3591173722638707, + "learning_rate": 1.8433200513945337e-05, + "loss": 0.5912, + "step": 1012 + }, + { + "epoch": 0.8104, + "grad_norm": 0.3762179489067089, + "learning_rate": 1.8283518008986567e-05, + "loss": 0.5701, + "step": 1013 + }, + { + "epoch": 0.8112, + "grad_norm": 0.3156922850378513, + "learning_rate": 1.8134384543949478e-05, + "loss": 0.5274, + "step": 1014 + }, + { + "epoch": 0.812, + "grad_norm": 0.31274555421611044, + "learning_rate": 1.7985801120837865e-05, + "loss": 0.5137, + "step": 1015 + }, + { + "epoch": 0.8128, + "grad_norm": 0.30169419740639275, + "learning_rate": 1.783776873795994e-05, + "loss": 0.514, + "step": 1016 + }, + { + "epoch": 0.8136, + "grad_norm": 0.2782830158634876, + "learning_rate": 1.7690288389921493e-05, + "loss": 0.489, + "step": 1017 + }, + { + "epoch": 0.8144, + "grad_norm": 0.3304509830037468, + "learning_rate": 1.754336106761927e-05, + "loss": 0.555, + "step": 1018 + }, + { + "epoch": 0.8152, + "grad_norm": 0.296451342339957, + "learning_rate": 1.739698775823442e-05, + "loss": 0.5328, + "step": 1019 + }, + { + "epoch": 0.816, + "grad_norm": 0.31534851649038614, + "learning_rate": 1.7251169445225657e-05, + "loss": 0.5541, + "step": 1020 + }, + { + "epoch": 0.8168, + "grad_norm": 0.3084359553582928, + "learning_rate": 1.7105907108322816e-05, + "loss": 0.5328, + "step": 1021 + }, + { + "epoch": 0.8176, + "grad_norm": 0.341562199601936, + "learning_rate": 1.696120172352025e-05, + "loss": 0.5575, + "step": 1022 + }, + { + "epoch": 0.8184, + "grad_norm": 0.35015201530763834, + "learning_rate": 1.6817054263070174e-05, + "loss": 0.6025, + "step": 1023 + }, + { + "epoch": 0.8192, + "grad_norm": 0.2993997403965686, + "learning_rate": 1.6673465695476232e-05, + "loss": 0.5233, + "step": 1024 + }, + { + "epoch": 0.82, + "grad_norm": 0.29982110762334596, + "learning_rate": 1.6530436985486996e-05, + "loss": 0.51, + "step": 1025 + }, + { + "epoch": 0.8208, + "grad_norm": 0.3019064768245265, + "learning_rate": 1.6387969094089316e-05, + "loss": 0.542, + "step": 1026 + }, + { + "epoch": 0.8216, + "grad_norm": 0.35085498091047945, + "learning_rate": 1.6246062978502164e-05, + "loss": 0.5788, + "step": 1027 + }, + { + "epoch": 0.8224, + "grad_norm": 0.29740523699437377, + "learning_rate": 1.6104719592169902e-05, + "loss": 0.4931, + "step": 1028 + }, + { + "epoch": 0.8232, + "grad_norm": 0.2876930635912273, + "learning_rate": 1.5963939884756042e-05, + "loss": 0.4932, + "step": 1029 + }, + { + "epoch": 0.824, + "grad_norm": 0.31294293634047604, + "learning_rate": 1.5823724802136865e-05, + "loss": 0.5826, + "step": 1030 + }, + { + "epoch": 0.8248, + "grad_norm": 0.3068928309390789, + "learning_rate": 1.5684075286394985e-05, + "loss": 0.5346, + "step": 1031 + }, + { + "epoch": 0.8256, + "grad_norm": 0.3100798430096716, + "learning_rate": 1.5544992275813053e-05, + "loss": 0.5196, + "step": 1032 + }, + { + "epoch": 0.8264, + "grad_norm": 0.32708364181786515, + "learning_rate": 1.5406476704867524e-05, + "loss": 0.5521, + "step": 1033 + }, + { + "epoch": 0.8272, + "grad_norm": 0.3207712088485768, + "learning_rate": 1.526852950422226e-05, + "loss": 0.524, + "step": 1034 + }, + { + "epoch": 0.828, + "grad_norm": 0.3660178711580096, + "learning_rate": 1.5131151600722337e-05, + "loss": 0.5655, + "step": 1035 + }, + { + "epoch": 0.8288, + "grad_norm": 0.3143621876194328, + "learning_rate": 1.4994343917387854e-05, + "loss": 0.5449, + "step": 1036 + }, + { + "epoch": 0.8296, + "grad_norm": 0.31830508951267983, + "learning_rate": 1.485810737340767e-05, + "loss": 0.5385, + "step": 1037 + }, + { + "epoch": 0.8304, + "grad_norm": 0.27040696747271814, + "learning_rate": 1.4722442884133214e-05, + "loss": 0.4921, + "step": 1038 + }, + { + "epoch": 0.8312, + "grad_norm": 0.31288584178094314, + "learning_rate": 1.4587351361072454e-05, + "loss": 0.502, + "step": 1039 + }, + { + "epoch": 0.832, + "grad_norm": 0.28642818157187094, + "learning_rate": 1.4452833711883628e-05, + "loss": 0.5245, + "step": 1040 + }, + { + "epoch": 0.8328, + "grad_norm": 0.3491942090381071, + "learning_rate": 1.4318890840369182e-05, + "loss": 0.5499, + "step": 1041 + }, + { + "epoch": 0.8336, + "grad_norm": 0.29823801708556746, + "learning_rate": 1.4185523646469822e-05, + "loss": 0.5234, + "step": 1042 + }, + { + "epoch": 0.8344, + "grad_norm": 0.30719005009070094, + "learning_rate": 1.4052733026258281e-05, + "loss": 0.5749, + "step": 1043 + }, + { + "epoch": 0.8352, + "grad_norm": 0.3062795801855541, + "learning_rate": 1.3920519871933424e-05, + "loss": 0.5825, + "step": 1044 + }, + { + "epoch": 0.836, + "grad_norm": 0.27635298150584786, + "learning_rate": 1.3788885071814172e-05, + "loss": 0.5003, + "step": 1045 + }, + { + "epoch": 0.8368, + "grad_norm": 0.3355786568989501, + "learning_rate": 1.3657829510333654e-05, + "loss": 0.5482, + "step": 1046 + }, + { + "epoch": 0.8376, + "grad_norm": 0.2932756874456343, + "learning_rate": 1.3527354068033139e-05, + "loss": 0.5011, + "step": 1047 + }, + { + "epoch": 0.8384, + "grad_norm": 0.3269210423789285, + "learning_rate": 1.339745962155613e-05, + "loss": 0.5726, + "step": 1048 + }, + { + "epoch": 0.8392, + "grad_norm": 0.29664305615872355, + "learning_rate": 1.326814704364262e-05, + "loss": 0.535, + "step": 1049 + }, + { + "epoch": 0.84, + "grad_norm": 0.3279135895836481, + "learning_rate": 1.3139417203123027e-05, + "loss": 0.5337, + "step": 1050 + }, + { + "epoch": 0.8408, + "grad_norm": 0.3023263703012834, + "learning_rate": 1.3011270964912459e-05, + "loss": 0.5241, + "step": 1051 + }, + { + "epoch": 0.8416, + "grad_norm": 0.31453396197595923, + "learning_rate": 1.2883709190004955e-05, + "loss": 0.5706, + "step": 1052 + }, + { + "epoch": 0.8424, + "grad_norm": 0.32532182414773664, + "learning_rate": 1.275673273546758e-05, + "loss": 0.5738, + "step": 1053 + }, + { + "epoch": 0.8432, + "grad_norm": 0.35771452622909367, + "learning_rate": 1.263034245443473e-05, + "loss": 0.5646, + "step": 1054 + }, + { + "epoch": 0.844, + "grad_norm": 0.3191261928779677, + "learning_rate": 1.2504539196102439e-05, + "loss": 0.5559, + "step": 1055 + }, + { + "epoch": 0.8448, + "grad_norm": 0.3157418636170192, + "learning_rate": 1.2379323805722576e-05, + "loss": 0.5253, + "step": 1056 + }, + { + "epoch": 0.8456, + "grad_norm": 0.36085371540844025, + "learning_rate": 1.2254697124597237e-05, + "loss": 0.5481, + "step": 1057 + }, + { + "epoch": 0.8464, + "grad_norm": 0.3003756422947911, + "learning_rate": 1.2130659990073146e-05, + "loss": 0.5362, + "step": 1058 + }, + { + "epoch": 0.8472, + "grad_norm": 0.2817775041106162, + "learning_rate": 1.2007213235535786e-05, + "loss": 0.4897, + "step": 1059 + }, + { + "epoch": 0.848, + "grad_norm": 0.28590694646675563, + "learning_rate": 1.1884357690404158e-05, + "loss": 0.4885, + "step": 1060 + }, + { + "epoch": 0.8488, + "grad_norm": 0.30886813713061007, + "learning_rate": 1.176209418012495e-05, + "loss": 0.5272, + "step": 1061 + }, + { + "epoch": 0.8496, + "grad_norm": 0.28847217084933324, + "learning_rate": 1.1640423526166988e-05, + "loss": 0.5103, + "step": 1062 + }, + { + "epoch": 0.8504, + "grad_norm": 0.558607761663773, + "learning_rate": 1.1519346546015907e-05, + "loss": 0.5394, + "step": 1063 + }, + { + "epoch": 0.8512, + "grad_norm": 0.29232474305564354, + "learning_rate": 1.1398864053168534e-05, + "loss": 0.5071, + "step": 1064 + }, + { + "epoch": 0.852, + "grad_norm": 0.40299383279271805, + "learning_rate": 1.1278976857127311e-05, + "loss": 0.5773, + "step": 1065 + }, + { + "epoch": 0.8528, + "grad_norm": 0.299439992576117, + "learning_rate": 1.1159685763395111e-05, + "loss": 0.5126, + "step": 1066 + }, + { + "epoch": 0.8536, + "grad_norm": 0.30741883936624265, + "learning_rate": 1.1040991573469629e-05, + "loss": 0.5131, + "step": 1067 + }, + { + "epoch": 0.8544, + "grad_norm": 0.3823429238720938, + "learning_rate": 1.0922895084838037e-05, + "loss": 0.5459, + "step": 1068 + }, + { + "epoch": 0.8552, + "grad_norm": 0.35231467984482157, + "learning_rate": 1.0805397090971737e-05, + "loss": 0.5927, + "step": 1069 + }, + { + "epoch": 0.856, + "grad_norm": 0.31776257432998906, + "learning_rate": 1.0688498381320855e-05, + "loss": 0.5432, + "step": 1070 + }, + { + "epoch": 0.8568, + "grad_norm": 0.3102367246725002, + "learning_rate": 1.057219974130903e-05, + "loss": 0.5321, + "step": 1071 + }, + { + "epoch": 0.8576, + "grad_norm": 0.2952381694304628, + "learning_rate": 1.045650195232819e-05, + "loss": 0.5259, + "step": 1072 + }, + { + "epoch": 0.8584, + "grad_norm": 0.3203347190740959, + "learning_rate": 1.0341405791733183e-05, + "loss": 0.5375, + "step": 1073 + }, + { + "epoch": 0.8592, + "grad_norm": 0.33453341696425476, + "learning_rate": 1.0226912032836611e-05, + "loss": 0.5545, + "step": 1074 + }, + { + "epoch": 0.86, + "grad_norm": 0.2886438992098675, + "learning_rate": 1.0113021444903726e-05, + "loss": 0.5063, + "step": 1075 + }, + { + "epoch": 0.8608, + "grad_norm": 0.2957922466846594, + "learning_rate": 9.999734793146998e-06, + "loss": 0.5138, + "step": 1076 + }, + { + "epoch": 0.8616, + "grad_norm": 0.32379520985731924, + "learning_rate": 9.887052838721322e-06, + "loss": 0.527, + "step": 1077 + }, + { + "epoch": 0.8624, + "grad_norm": 0.34378734225924573, + "learning_rate": 9.774976338718677e-06, + "loss": 0.5904, + "step": 1078 + }, + { + "epoch": 0.8632, + "grad_norm": 0.31208605637507303, + "learning_rate": 9.663506046162985e-06, + "loss": 0.5311, + "step": 1079 + }, + { + "epoch": 0.864, + "grad_norm": 0.34496199350844126, + "learning_rate": 9.552642710005299e-06, + "loss": 0.5527, + "step": 1080 + }, + { + "epoch": 0.8648, + "grad_norm": 0.3110318604047679, + "learning_rate": 9.44238707511862e-06, + "loss": 0.5323, + "step": 1081 + }, + { + "epoch": 0.8656, + "grad_norm": 0.3069235571145225, + "learning_rate": 9.332739882292752e-06, + "loss": 0.5284, + "step": 1082 + }, + { + "epoch": 0.8664, + "grad_norm": 0.294701697184567, + "learning_rate": 9.22370186822965e-06, + "loss": 0.4935, + "step": 1083 + }, + { + "epoch": 0.8672, + "grad_norm": 0.34120922974979584, + "learning_rate": 9.115273765538202e-06, + "loss": 0.5729, + "step": 1084 + }, + { + "epoch": 0.868, + "grad_norm": 0.3382073855820592, + "learning_rate": 9.0074563027294e-06, + "loss": 0.5626, + "step": 1085 + }, + { + "epoch": 0.8688, + "grad_norm": 0.3207381770313093, + "learning_rate": 8.900250204211514e-06, + "loss": 0.543, + "step": 1086 + }, + { + "epoch": 0.8696, + "grad_norm": 0.33639489656827687, + "learning_rate": 8.79365619028507e-06, + "loss": 0.5768, + "step": 1087 + }, + { + "epoch": 0.8704, + "grad_norm": 0.3164902799063137, + "learning_rate": 8.687674977138116e-06, + "loss": 0.5852, + "step": 1088 + }, + { + "epoch": 0.8712, + "grad_norm": 0.3269986614052039, + "learning_rate": 8.582307276841462e-06, + "loss": 0.5584, + "step": 1089 + }, + { + "epoch": 0.872, + "grad_norm": 0.3401491702230297, + "learning_rate": 8.47755379734373e-06, + "loss": 0.5875, + "step": 1090 + }, + { + "epoch": 0.8728, + "grad_norm": 0.301546032734985, + "learning_rate": 8.37341524246672e-06, + "loss": 0.5155, + "step": 1091 + }, + { + "epoch": 0.8736, + "grad_norm": 0.29486548220063846, + "learning_rate": 8.269892311900696e-06, + "loss": 0.5409, + "step": 1092 + }, + { + "epoch": 0.8744, + "grad_norm": 0.28913316993391175, + "learning_rate": 8.166985701199582e-06, + "loss": 0.5164, + "step": 1093 + }, + { + "epoch": 0.8752, + "grad_norm": 0.2951028071209438, + "learning_rate": 8.064696101776358e-06, + "loss": 0.524, + "step": 1094 + }, + { + "epoch": 0.876, + "grad_norm": 0.32268092430664863, + "learning_rate": 7.963024200898462e-06, + "loss": 0.5112, + "step": 1095 + }, + { + "epoch": 0.8768, + "grad_norm": 0.29083850843653986, + "learning_rate": 7.861970681683051e-06, + "loss": 0.5187, + "step": 1096 + }, + { + "epoch": 0.8776, + "grad_norm": 0.33878533738514793, + "learning_rate": 7.761536223092458e-06, + "loss": 0.5579, + "step": 1097 + }, + { + "epoch": 0.8784, + "grad_norm": 0.3583612786949319, + "learning_rate": 7.661721499929753e-06, + "loss": 0.6055, + "step": 1098 + }, + { + "epoch": 0.8792, + "grad_norm": 0.3157374969540624, + "learning_rate": 7.562527182833978e-06, + "loss": 0.5193, + "step": 1099 + }, + { + "epoch": 0.88, + "grad_norm": 0.3323887403847907, + "learning_rate": 7.463953938275858e-06, + "loss": 0.5483, + "step": 1100 + }, + { + "epoch": 0.8808, + "grad_norm": 0.3271313369536848, + "learning_rate": 7.366002428553153e-06, + "loss": 0.5291, + "step": 1101 + }, + { + "epoch": 0.8816, + "grad_norm": 0.33490858014078323, + "learning_rate": 7.2686733117863784e-06, + "loss": 0.4985, + "step": 1102 + }, + { + "epoch": 0.8824, + "grad_norm": 0.398077948712715, + "learning_rate": 7.171967241914224e-06, + "loss": 0.5612, + "step": 1103 + }, + { + "epoch": 0.8832, + "grad_norm": 0.31927806478811827, + "learning_rate": 7.07588486868922e-06, + "loss": 0.5282, + "step": 1104 + }, + { + "epoch": 0.884, + "grad_norm": 0.31620075198738284, + "learning_rate": 6.980426837673437e-06, + "loss": 0.5373, + "step": 1105 + }, + { + "epoch": 0.8848, + "grad_norm": 0.3140137639251144, + "learning_rate": 6.8855937902340576e-06, + "loss": 0.5141, + "step": 1106 + }, + { + "epoch": 0.8856, + "grad_norm": 0.30376308756664244, + "learning_rate": 6.791386363539065e-06, + "loss": 0.533, + "step": 1107 + }, + { + "epoch": 0.8864, + "grad_norm": 0.3050295266759193, + "learning_rate": 6.6978051905530855e-06, + "loss": 0.4898, + "step": 1108 + }, + { + "epoch": 0.8872, + "grad_norm": 0.29951904118841577, + "learning_rate": 6.604850900032955e-06, + "loss": 0.5268, + "step": 1109 + }, + { + "epoch": 0.888, + "grad_norm": 0.34540743633115883, + "learning_rate": 6.512524116523633e-06, + "loss": 0.567, + "step": 1110 + }, + { + "epoch": 0.8888, + "grad_norm": 0.30231064759206816, + "learning_rate": 6.420825460353974e-06, + "loss": 0.519, + "step": 1111 + }, + { + "epoch": 0.8896, + "grad_norm": 0.316037695232489, + "learning_rate": 6.329755547632499e-06, + "loss": 0.4763, + "step": 1112 + }, + { + "epoch": 0.8904, + "grad_norm": 0.31721374711969536, + "learning_rate": 6.239314990243339e-06, + "loss": 0.5215, + "step": 1113 + }, + { + "epoch": 0.8912, + "grad_norm": 0.3333723365925436, + "learning_rate": 6.149504395842087e-06, + "loss": 0.5361, + "step": 1114 + }, + { + "epoch": 0.892, + "grad_norm": 0.2988022996120898, + "learning_rate": 6.0603243678516995e-06, + "loss": 0.5202, + "step": 1115 + }, + { + "epoch": 0.8928, + "grad_norm": 0.313165965383161, + "learning_rate": 5.971775505458444e-06, + "loss": 0.5052, + "step": 1116 + }, + { + "epoch": 0.8936, + "grad_norm": 0.3133261195918503, + "learning_rate": 5.883858403607967e-06, + "loss": 0.5021, + "step": 1117 + }, + { + "epoch": 0.8944, + "grad_norm": 0.3310204849166293, + "learning_rate": 5.7965736530010916e-06, + "loss": 0.5449, + "step": 1118 + }, + { + "epoch": 0.8952, + "grad_norm": 0.28642784716383, + "learning_rate": 5.7099218400900716e-06, + "loss": 0.4939, + "step": 1119 + }, + { + "epoch": 0.896, + "grad_norm": 0.3231255023681699, + "learning_rate": 5.623903547074549e-06, + "loss": 0.5071, + "step": 1120 + }, + { + "epoch": 0.8968, + "grad_norm": 0.3330966662426535, + "learning_rate": 5.538519351897575e-06, + "loss": 0.5205, + "step": 1121 + }, + { + "epoch": 0.8976, + "grad_norm": 0.3013409817811988, + "learning_rate": 5.453769828241872e-06, + "loss": 0.5063, + "step": 1122 + }, + { + "epoch": 0.8984, + "grad_norm": 0.2906630284974183, + "learning_rate": 5.369655545525909e-06, + "loss": 0.5013, + "step": 1123 + }, + { + "epoch": 0.8992, + "grad_norm": 0.31316666023740375, + "learning_rate": 5.286177068899989e-06, + "loss": 0.5215, + "step": 1124 + }, + { + "epoch": 0.9, + "grad_norm": 0.3180187491635761, + "learning_rate": 5.2033349592426335e-06, + "loss": 0.5432, + "step": 1125 + }, + { + "epoch": 0.9008, + "grad_norm": 0.28744382137160857, + "learning_rate": 5.121129773156663e-06, + "loss": 0.503, + "step": 1126 + }, + { + "epoch": 0.9016, + "grad_norm": 0.30423697755815965, + "learning_rate": 5.039562062965508e-06, + "loss": 0.5339, + "step": 1127 + }, + { + "epoch": 0.9024, + "grad_norm": 0.28959749985331457, + "learning_rate": 4.95863237670956e-06, + "loss": 0.5177, + "step": 1128 + }, + { + "epoch": 0.9032, + "grad_norm": 0.30651468374097873, + "learning_rate": 4.87834125814235e-06, + "loss": 0.5003, + "step": 1129 + }, + { + "epoch": 0.904, + "grad_norm": 0.3399118692018144, + "learning_rate": 4.798689246727006e-06, + "loss": 0.505, + "step": 1130 + }, + { + "epoch": 0.9048, + "grad_norm": 0.30229537171643533, + "learning_rate": 4.719676877632639e-06, + "loss": 0.5135, + "step": 1131 + }, + { + "epoch": 0.9056, + "grad_norm": 0.2980808766300904, + "learning_rate": 4.641304681730641e-06, + "loss": 0.5157, + "step": 1132 + }, + { + "epoch": 0.9064, + "grad_norm": 0.32109007310226906, + "learning_rate": 4.563573185591219e-06, + "loss": 0.5441, + "step": 1133 + }, + { + "epoch": 0.9072, + "grad_norm": 0.31518895500804905, + "learning_rate": 4.486482911479839e-06, + "loss": 0.5398, + "step": 1134 + }, + { + "epoch": 0.908, + "grad_norm": 0.30746141207362454, + "learning_rate": 4.4100343773536225e-06, + "loss": 0.5138, + "step": 1135 + }, + { + "epoch": 0.9088, + "grad_norm": 0.29844474026185125, + "learning_rate": 4.3342280968580285e-06, + "loss": 0.5127, + "step": 1136 + }, + { + "epoch": 0.9096, + "grad_norm": 0.31372953790829117, + "learning_rate": 4.259064579323302e-06, + "loss": 0.5267, + "step": 1137 + }, + { + "epoch": 0.9104, + "grad_norm": 0.31513266463882467, + "learning_rate": 4.184544329761009e-06, + "loss": 0.5441, + "step": 1138 + }, + { + "epoch": 0.9112, + "grad_norm": 0.3373577735158178, + "learning_rate": 4.1106678488607495e-06, + "loss": 0.5549, + "step": 1139 + }, + { + "epoch": 0.912, + "grad_norm": 0.33313158820733985, + "learning_rate": 4.037435632986786e-06, + "loss": 0.5365, + "step": 1140 + }, + { + "epoch": 0.9128, + "grad_norm": 0.3244540875818248, + "learning_rate": 3.964848174174541e-06, + "loss": 0.514, + "step": 1141 + }, + { + "epoch": 0.9136, + "grad_norm": 0.33794058251637915, + "learning_rate": 3.892905960127546e-06, + "loss": 0.5078, + "step": 1142 + }, + { + "epoch": 0.9144, + "grad_norm": 0.2923761625384951, + "learning_rate": 3.821609474213983e-06, + "loss": 0.5028, + "step": 1143 + }, + { + "epoch": 0.9152, + "grad_norm": 0.3083872375206693, + "learning_rate": 3.750959195463466e-06, + "loss": 0.5173, + "step": 1144 + }, + { + "epoch": 0.916, + "grad_norm": 0.37042508296412585, + "learning_rate": 3.6809555985639068e-06, + "loss": 0.5302, + "step": 1145 + }, + { + "epoch": 0.9168, + "grad_norm": 0.30268944873115844, + "learning_rate": 3.611599153858214e-06, + "loss": 0.5077, + "step": 1146 + }, + { + "epoch": 0.9176, + "grad_norm": 0.3299826370842139, + "learning_rate": 3.5428903273411863e-06, + "loss": 0.5566, + "step": 1147 + }, + { + "epoch": 0.9184, + "grad_norm": 0.32169935120948345, + "learning_rate": 3.4748295806564356e-06, + "loss": 0.5135, + "step": 1148 + }, + { + "epoch": 0.9192, + "grad_norm": 0.3154373318031867, + "learning_rate": 3.40741737109318e-06, + "loss": 0.5457, + "step": 1149 + }, + { + "epoch": 0.92, + "grad_norm": 0.2928616342677835, + "learning_rate": 3.3406541515832003e-06, + "loss": 0.4758, + "step": 1150 + }, + { + "epoch": 0.9208, + "grad_norm": 0.3255590348687943, + "learning_rate": 3.2745403706978872e-06, + "loss": 0.4986, + "step": 1151 + }, + { + "epoch": 0.9216, + "grad_norm": 0.36347369876321284, + "learning_rate": 3.209076472645112e-06, + "loss": 0.5508, + "step": 1152 + }, + { + "epoch": 0.9224, + "grad_norm": 0.31922620403351204, + "learning_rate": 3.1442628972662704e-06, + "loss": 0.5499, + "step": 1153 + }, + { + "epoch": 0.9232, + "grad_norm": 0.2806988390920151, + "learning_rate": 3.0801000800333877e-06, + "loss": 0.4693, + "step": 1154 + }, + { + "epoch": 0.924, + "grad_norm": 0.31126016115643457, + "learning_rate": 3.0165884520461316e-06, + "loss": 0.5503, + "step": 1155 + }, + { + "epoch": 0.9248, + "grad_norm": 0.28994395860031663, + "learning_rate": 2.9537284400289355e-06, + "loss": 0.525, + "step": 1156 + }, + { + "epoch": 0.9256, + "grad_norm": 0.3154581960565633, + "learning_rate": 2.8915204663281013e-06, + "loss": 0.5432, + "step": 1157 + }, + { + "epoch": 0.9264, + "grad_norm": 0.3176066044180408, + "learning_rate": 2.8299649489090475e-06, + "loss": 0.5437, + "step": 1158 + }, + { + "epoch": 0.9272, + "grad_norm": 0.3326477947403509, + "learning_rate": 2.7690623013533976e-06, + "loss": 0.4988, + "step": 1159 + }, + { + "epoch": 0.928, + "grad_norm": 0.45512219273453286, + "learning_rate": 2.708812932856253e-06, + "loss": 0.568, + "step": 1160 + }, + { + "epoch": 0.9288, + "grad_norm": 0.3278966468396552, + "learning_rate": 2.649217248223468e-06, + "loss": 0.5401, + "step": 1161 + }, + { + "epoch": 0.9296, + "grad_norm": 0.323752718264929, + "learning_rate": 2.590275647868867e-06, + "loss": 0.5268, + "step": 1162 + }, + { + "epoch": 0.9304, + "grad_norm": 0.31816355397904306, + "learning_rate": 2.5319885278115906e-06, + "loss": 0.5268, + "step": 1163 + }, + { + "epoch": 0.9312, + "grad_norm": 0.31568343150732775, + "learning_rate": 2.4743562796734622e-06, + "loss": 0.5798, + "step": 1164 + }, + { + "epoch": 0.932, + "grad_norm": 0.2918941726574028, + "learning_rate": 2.4173792906762804e-06, + "loss": 0.5028, + "step": 1165 + }, + { + "epoch": 0.9328, + "grad_norm": 0.3364645819786627, + "learning_rate": 2.3610579436393e-06, + "loss": 0.6063, + "step": 1166 + }, + { + "epoch": 0.9336, + "grad_norm": 0.3252749199764253, + "learning_rate": 2.3053926169765984e-06, + "loss": 0.5074, + "step": 1167 + }, + { + "epoch": 0.9344, + "grad_norm": 0.32307432124495544, + "learning_rate": 2.250383684694579e-06, + "loss": 0.537, + "step": 1168 + }, + { + "epoch": 0.9352, + "grad_norm": 0.31387578239447933, + "learning_rate": 2.1960315163894075e-06, + "loss": 0.5442, + "step": 1169 + }, + { + "epoch": 0.936, + "grad_norm": 0.304637618589797, + "learning_rate": 2.1423364772445887e-06, + "loss": 0.5397, + "step": 1170 + }, + { + "epoch": 0.9368, + "grad_norm": 0.41468661451644506, + "learning_rate": 2.0892989280284823e-06, + "loss": 0.4844, + "step": 1171 + }, + { + "epoch": 0.9376, + "grad_norm": 0.302638682703656, + "learning_rate": 2.036919225091827e-06, + "loss": 0.4872, + "step": 1172 + }, + { + "epoch": 0.9384, + "grad_norm": 0.28655926896016676, + "learning_rate": 1.9851977203654835e-06, + "loss": 0.475, + "step": 1173 + }, + { + "epoch": 0.9392, + "grad_norm": 0.30358370328840534, + "learning_rate": 1.9341347613579087e-06, + "loss": 0.5041, + "step": 1174 + }, + { + "epoch": 0.94, + "grad_norm": 0.28478468702495974, + "learning_rate": 1.8837306911529184e-06, + "loss": 0.5192, + "step": 1175 + }, + { + "epoch": 0.9408, + "grad_norm": 0.32519038445347415, + "learning_rate": 1.8339858484073935e-06, + "loss": 0.5586, + "step": 1176 + }, + { + "epoch": 0.9416, + "grad_norm": 0.2993081248315083, + "learning_rate": 1.7849005673489127e-06, + "loss": 0.5037, + "step": 1177 + }, + { + "epoch": 0.9424, + "grad_norm": 0.2958894909202451, + "learning_rate": 1.7364751777736332e-06, + "loss": 0.4983, + "step": 1178 + }, + { + "epoch": 0.9432, + "grad_norm": 0.3142134015167779, + "learning_rate": 1.6887100050439587e-06, + "loss": 0.5111, + "step": 1179 + }, + { + "epoch": 0.944, + "grad_norm": 0.31999967793391854, + "learning_rate": 1.6416053700863964e-06, + "loss": 0.5191, + "step": 1180 + }, + { + "epoch": 0.9448, + "grad_norm": 0.31633486555340273, + "learning_rate": 1.595161589389449e-06, + "loss": 0.5351, + "step": 1181 + }, + { + "epoch": 0.9456, + "grad_norm": 0.30675019805018006, + "learning_rate": 1.5493789750014031e-06, + "loss": 0.5149, + "step": 1182 + }, + { + "epoch": 0.9464, + "grad_norm": 0.3035224544365652, + "learning_rate": 1.5042578345283108e-06, + "loss": 0.513, + "step": 1183 + }, + { + "epoch": 0.9472, + "grad_norm": 0.3205777006087568, + "learning_rate": 1.459798471131868e-06, + "loss": 0.537, + "step": 1184 + }, + { + "epoch": 0.948, + "grad_norm": 0.3453851337910027, + "learning_rate": 1.4160011835273934e-06, + "loss": 0.5138, + "step": 1185 + }, + { + "epoch": 0.9488, + "grad_norm": 0.3110439277309376, + "learning_rate": 1.3728662659818204e-06, + "loss": 0.538, + "step": 1186 + }, + { + "epoch": 0.9496, + "grad_norm": 0.2900825779303158, + "learning_rate": 1.3303940083117527e-06, + "loss": 0.5188, + "step": 1187 + }, + { + "epoch": 0.9504, + "grad_norm": 0.3131984751420794, + "learning_rate": 1.2885846958814673e-06, + "loss": 0.5088, + "step": 1188 + }, + { + "epoch": 0.9512, + "grad_norm": 0.3466175063672184, + "learning_rate": 1.2474386096010039e-06, + "loss": 0.5194, + "step": 1189 + }, + { + "epoch": 0.952, + "grad_norm": 0.32478119070806216, + "learning_rate": 1.2069560259243328e-06, + "loss": 0.5155, + "step": 1190 + }, + { + "epoch": 0.9528, + "grad_norm": 0.2999668283208606, + "learning_rate": 1.1671372168474138e-06, + "loss": 0.5018, + "step": 1191 + }, + { + "epoch": 0.9536, + "grad_norm": 0.3134414290580692, + "learning_rate": 1.1279824499064396e-06, + "loss": 0.569, + "step": 1192 + }, + { + "epoch": 0.9544, + "grad_norm": 0.2953094923511177, + "learning_rate": 1.089491988176017e-06, + "loss": 0.4756, + "step": 1193 + }, + { + "epoch": 0.9552, + "grad_norm": 0.34344512875367766, + "learning_rate": 1.0516660902673448e-06, + "loss": 0.5455, + "step": 1194 + }, + { + "epoch": 0.956, + "grad_norm": 0.30558827977054226, + "learning_rate": 1.014505010326583e-06, + "loss": 0.5149, + "step": 1195 + }, + { + "epoch": 0.9568, + "grad_norm": 0.31275905263153886, + "learning_rate": 9.780089980330642e-07, + "loss": 0.5286, + "step": 1196 + }, + { + "epoch": 0.9576, + "grad_norm": 0.28499175183291087, + "learning_rate": 9.421782985976068e-07, + "loss": 0.4876, + "step": 1197 + }, + { + "epoch": 0.9584, + "grad_norm": 0.4808999903189011, + "learning_rate": 9.070131527609604e-07, + "loss": 0.538, + "step": 1198 + }, + { + "epoch": 0.9592, + "grad_norm": 0.31226313414768764, + "learning_rate": 8.725137967920738e-07, + "loss": 0.5459, + "step": 1199 + }, + { + "epoch": 0.96, + "grad_norm": 0.29523397120763456, + "learning_rate": 8.386804624865851e-07, + "loss": 0.486, + "step": 1200 + }, + { + "epoch": 0.9608, + "grad_norm": 0.3520154899027078, + "learning_rate": 8.055133771652345e-07, + "loss": 0.5364, + "step": 1201 + }, + { + "epoch": 0.9616, + "grad_norm": 0.37354671545114215, + "learning_rate": 7.730127636723539e-07, + "loss": 0.5482, + "step": 1202 + }, + { + "epoch": 0.9624, + "grad_norm": 0.30755601609273053, + "learning_rate": 7.411788403743237e-07, + "loss": 0.5028, + "step": 1203 + }, + { + "epoch": 0.9632, + "grad_norm": 0.2939498097443535, + "learning_rate": 7.100118211581852e-07, + "loss": 0.5072, + "step": 1204 + }, + { + "epoch": 0.964, + "grad_norm": 0.2746461418963455, + "learning_rate": 6.7951191543012e-07, + "loss": 0.4635, + "step": 1205 + }, + { + "epoch": 0.9648, + "grad_norm": 0.34560038360849, + "learning_rate": 6.496793281141056e-07, + "loss": 0.5528, + "step": 1206 + }, + { + "epoch": 0.9656, + "grad_norm": 0.9249883599597586, + "learning_rate": 6.205142596505176e-07, + "loss": 0.5623, + "step": 1207 + }, + { + "epoch": 0.9664, + "grad_norm": 0.3660112464881137, + "learning_rate": 5.920169059947411e-07, + "loss": 0.5154, + "step": 1208 + }, + { + "epoch": 0.9672, + "grad_norm": 0.3238829544216669, + "learning_rate": 5.64187458615939e-07, + "loss": 0.5607, + "step": 1209 + }, + { + "epoch": 0.968, + "grad_norm": 0.31085999125447755, + "learning_rate": 5.370261044956971e-07, + "loss": 0.5063, + "step": 1210 + }, + { + "epoch": 0.9688, + "grad_norm": 0.30117998298835985, + "learning_rate": 5.105330261267916e-07, + "loss": 0.5451, + "step": 1211 + }, + { + "epoch": 0.9696, + "grad_norm": 0.31234562961090084, + "learning_rate": 4.847084015119574e-07, + "loss": 0.5213, + "step": 1212 + }, + { + "epoch": 0.9704, + "grad_norm": 0.3358034693770683, + "learning_rate": 4.5955240416271084e-07, + "loss": 0.5301, + "step": 1213 + }, + { + "epoch": 0.9712, + "grad_norm": 0.3173787926469782, + "learning_rate": 4.3506520309813947e-07, + "loss": 0.5645, + "step": 1214 + }, + { + "epoch": 0.972, + "grad_norm": 0.4146473527673543, + "learning_rate": 4.112469628438365e-07, + "loss": 0.5677, + "step": 1215 + }, + { + "epoch": 0.9728, + "grad_norm": 0.2866437503923634, + "learning_rate": 3.8809784343072366e-07, + "loss": 0.4946, + "step": 1216 + }, + { + "epoch": 0.9736, + "grad_norm": 0.2943197456246717, + "learning_rate": 3.6561800039403016e-07, + "loss": 0.5112, + "step": 1217 + }, + { + "epoch": 0.9744, + "grad_norm": 0.30535385104586243, + "learning_rate": 3.4380758477219333e-07, + "loss": 0.5516, + "step": 1218 + }, + { + "epoch": 0.9752, + "grad_norm": 0.3205778331850039, + "learning_rate": 3.2266674310589273e-07, + "loss": 0.54, + "step": 1219 + }, + { + "epoch": 0.976, + "grad_norm": 0.36322728698820905, + "learning_rate": 3.0219561743707326e-07, + "loss": 0.5663, + "step": 1220 + }, + { + "epoch": 0.9768, + "grad_norm": 0.3339515307274971, + "learning_rate": 2.8239434530792365e-07, + "loss": 0.5173, + "step": 1221 + }, + { + "epoch": 0.9776, + "grad_norm": 0.25378772151745665, + "learning_rate": 2.6326305976001055e-07, + "loss": 0.4441, + "step": 1222 + }, + { + "epoch": 0.9784, + "grad_norm": 0.30785812626660675, + "learning_rate": 2.448018893333681e-07, + "loss": 0.5446, + "step": 1223 + }, + { + "epoch": 0.9792, + "grad_norm": 0.3283730943219699, + "learning_rate": 2.2701095806565432e-07, + "loss": 0.5399, + "step": 1224 + }, + { + "epoch": 0.98, + "grad_norm": 0.3035437942206902, + "learning_rate": 2.098903854912515e-07, + "loss": 0.5246, + "step": 1225 + }, + { + "epoch": 0.9808, + "grad_norm": 0.34024560448906616, + "learning_rate": 1.9344028664056713e-07, + "loss": 0.5227, + "step": 1226 + }, + { + "epoch": 0.9816, + "grad_norm": 0.3331008664020577, + "learning_rate": 1.7766077203915655e-07, + "loss": 0.4932, + "step": 1227 + }, + { + "epoch": 0.9824, + "grad_norm": 0.2938886631788762, + "learning_rate": 1.6255194770704586e-07, + "loss": 0.5289, + "step": 1228 + }, + { + "epoch": 0.9832, + "grad_norm": 0.30285455598285027, + "learning_rate": 1.481139151579991e-07, + "loss": 0.5553, + "step": 1229 + }, + { + "epoch": 0.984, + "grad_norm": 0.32815425365861317, + "learning_rate": 1.3434677139885222e-07, + "loss": 0.5341, + "step": 1230 + }, + { + "epoch": 0.9848, + "grad_norm": 0.29776320315837895, + "learning_rate": 1.2125060892881346e-07, + "loss": 0.5015, + "step": 1231 + }, + { + "epoch": 0.9856, + "grad_norm": 0.29765736304272533, + "learning_rate": 1.0882551573891953e-07, + "loss": 0.5149, + "step": 1232 + }, + { + "epoch": 0.9864, + "grad_norm": 0.36027512949995577, + "learning_rate": 9.707157531134713e-08, + "loss": 0.5111, + "step": 1233 + }, + { + "epoch": 0.9872, + "grad_norm": 0.30288619177488824, + "learning_rate": 8.598886661895788e-08, + "loss": 0.5116, + "step": 1234 + }, + { + "epoch": 0.988, + "grad_norm": 0.3103546766773175, + "learning_rate": 7.557746412468758e-08, + "loss": 0.5231, + "step": 1235 + }, + { + "epoch": 0.9888, + "grad_norm": 0.29366551229568166, + "learning_rate": 6.583743778106887e-08, + "loss": 0.4787, + "step": 1236 + }, + { + "epoch": 0.9896, + "grad_norm": 0.3165439339426244, + "learning_rate": 5.6768853029787184e-08, + "loss": 0.5203, + "step": 1237 + }, + { + "epoch": 0.9904, + "grad_norm": 0.3233906599138733, + "learning_rate": 4.837177080119215e-08, + "loss": 0.5466, + "step": 1238 + }, + { + "epoch": 0.9912, + "grad_norm": 0.31714449024996777, + "learning_rate": 4.064624751394242e-08, + "loss": 0.517, + "step": 1239 + }, + { + "epoch": 0.992, + "grad_norm": 0.3023578455209064, + "learning_rate": 3.359233507459481e-08, + "loss": 0.5351, + "step": 1240 + }, + { + "epoch": 0.9928, + "grad_norm": 0.2993120067816697, + "learning_rate": 2.7210080877237976e-08, + "loss": 0.5218, + "step": 1241 + }, + { + "epoch": 0.9936, + "grad_norm": 0.3121313135121532, + "learning_rate": 2.1499527803214846e-08, + "loss": 0.5376, + "step": 1242 + }, + { + "epoch": 0.9944, + "grad_norm": 0.29916412187483066, + "learning_rate": 1.646071422083395e-08, + "loss": 0.4909, + "step": 1243 + }, + { + "epoch": 0.9952, + "grad_norm": 0.30620573314273947, + "learning_rate": 1.209367398504746e-08, + "loss": 0.5024, + "step": 1244 + }, + { + "epoch": 0.996, + "grad_norm": 0.3254168132789596, + "learning_rate": 8.398436437317969e-09, + "loss": 0.5706, + "step": 1245 + }, + { + "epoch": 0.9968, + "grad_norm": 0.29745520661849084, + "learning_rate": 5.375026405352035e-09, + "loss": 0.4772, + "step": 1246 + }, + { + "epoch": 0.9976, + "grad_norm": 0.30232853467614984, + "learning_rate": 3.023464202944748e-09, + "loss": 0.5229, + "step": 1247 + }, + { + "epoch": 0.9984, + "grad_norm": 0.30328558964827984, + "learning_rate": 1.3437656298687097e-09, + "loss": 0.5235, + "step": 1248 + }, + { + "epoch": 0.9992, + "grad_norm": 0.3091051560807406, + "learning_rate": 3.3594197175190745e-10, + "loss": 0.5267, + "step": 1249 + }, + { + "epoch": 1.0, + "grad_norm": 0.3204451227014326, + "learning_rate": 0.0, + "loss": 0.5408, + "step": 1250 + }, + { + "epoch": 1.0, + "step": 1250, + "total_flos": 1262987898191872.0, + "train_loss": 0.5855974884986878, + "train_runtime": 20106.5862, + "train_samples_per_second": 0.995, + "train_steps_per_second": 0.062 + } + ], + "logging_steps": 1.0, + "max_steps": 1250, + "num_input_tokens_seen": 0, + "num_train_epochs": 1, + "save_steps": 500, + "stateful_callbacks": { + "TrainerControl": { + "args": { + "should_epoch_stop": false, + "should_evaluate": false, + "should_log": false, + "should_save": true, + "should_training_stop": false + }, + "attributes": {} + } + }, + "total_flos": 1262987898191872.0, + "train_batch_size": 8, + "trial_name": null, + "trial_params": null +} diff --git a/single_dataset/long_caption/VideoGameBunny_v1_1-Llama-3-8B-V-long_caption_dataset_2000_epochs_1_lora/README.md b/single_dataset/long_caption/VideoGameBunny_v1_1-Llama-3-8B-V-long_caption_dataset_2000_epochs_1_lora/README.md new file mode 100644 index 0000000000000000000000000000000000000000..4f291d8d6fdb82fcff274f83bcc52b834347797c --- /dev/null +++ b/single_dataset/long_caption/VideoGameBunny_v1_1-Llama-3-8B-V-long_caption_dataset_2000_epochs_1_lora/README.md @@ -0,0 +1,202 @@ +--- +library_name: peft +base_model: ./weights/Bunny-v1_1-Llama-3-8B-V +--- + +# Model Card for Model ID + + + + + +## Model Details + +### Model Description + + + + + +- **Developed by:** [More Information Needed] +- **Funded by [optional]:** [More Information Needed] +- **Shared by [optional]:** [More Information Needed] +- **Model type:** [More Information Needed] +- **Language(s) (NLP):** [More Information Needed] +- **License:** [More Information Needed] +- **Finetuned from model [optional]:** [More Information Needed] + +### Model Sources [optional] + + + +- **Repository:** [More Information Needed] +- **Paper [optional]:** [More Information Needed] +- **Demo [optional]:** [More Information Needed] + +## Uses + + + +### Direct Use + + + +[More Information Needed] + +### Downstream Use [optional] + + + +[More Information Needed] + +### Out-of-Scope Use + + + +[More Information Needed] + +## Bias, Risks, and Limitations + + + +[More Information Needed] + +### Recommendations + + + +Users (both direct and downstream) should be made aware of the risks, biases and limitations of the model. More information needed for further recommendations. + +## How to Get Started with the Model + +Use the code below to get started with the model. + +[More Information Needed] + +## Training Details + +### Training Data + + + +[More Information Needed] + +### Training Procedure + + + +#### Preprocessing [optional] + +[More Information Needed] + + +#### Training Hyperparameters + +- **Training regime:** [More Information Needed] + +#### Speeds, Sizes, Times [optional] + + + +[More Information Needed] + +## Evaluation + + + +### Testing Data, Factors & Metrics + +#### Testing Data + + + +[More Information Needed] + +#### Factors + + + +[More Information Needed] + +#### Metrics + + + +[More Information Needed] + +### Results + +[More Information Needed] + +#### Summary + + + +## Model Examination [optional] + + + +[More Information Needed] + +## Environmental Impact + + + +Carbon emissions can be estimated using the [Machine Learning Impact calculator](https://mlco2.github.io/impact#compute) presented in [Lacoste et al. (2019)](https://arxiv.org/abs/1910.09700). + +- **Hardware Type:** [More Information Needed] +- **Hours used:** [More Information Needed] +- **Cloud Provider:** [More Information Needed] +- **Compute Region:** [More Information Needed] +- **Carbon Emitted:** [More Information Needed] + +## Technical Specifications [optional] + +### Model Architecture and Objective + +[More Information Needed] + +### Compute Infrastructure + +[More Information Needed] + +#### Hardware + +[More Information Needed] + +#### Software + +[More Information Needed] + +## Citation [optional] + + + +**BibTeX:** + +[More Information Needed] + +**APA:** + +[More Information Needed] + +## Glossary [optional] + + + +[More Information Needed] + +## More Information [optional] + +[More Information Needed] + +## Model Card Authors [optional] + +[More Information Needed] + +## Model Card Contact + +[More Information Needed] +### Framework versions + +- PEFT 0.11.1 \ No newline at end of file diff --git a/single_dataset/long_caption/VideoGameBunny_v1_1-Llama-3-8B-V-long_caption_dataset_2000_epochs_1_lora/adapter_config.json b/single_dataset/long_caption/VideoGameBunny_v1_1-Llama-3-8B-V-long_caption_dataset_2000_epochs_1_lora/adapter_config.json new file mode 100644 index 0000000000000000000000000000000000000000..d7644c362e723a4b41037abb906be799d6d33bd2 --- /dev/null +++ b/single_dataset/long_caption/VideoGameBunny_v1_1-Llama-3-8B-V-long_caption_dataset_2000_epochs_1_lora/adapter_config.json @@ -0,0 +1,34 @@ +{ + "alpha_pattern": {}, + "auto_mapping": null, + "base_model_name_or_path": "./weights/Bunny-v1_1-Llama-3-8B-V", + "bias": "none", + "fan_in_fan_out": false, + "inference_mode": true, + "init_lora_weights": true, + "layer_replication": null, + "layers_pattern": null, + "layers_to_transform": null, + "loftq_config": {}, + "lora_alpha": 256, + "lora_dropout": 0.1, + "megatron_config": null, + "megatron_core": "megatron.core", + "modules_to_save": null, + "peft_type": "LORA", + "r": 128, + "rank_pattern": {}, + "revision": null, + "target_modules": [ + "up_proj", + "gate_proj", + "q_proj", + "v_proj", + "o_proj", + "k_proj", + "down_proj" + ], + "task_type": "CAUSAL_LM", + "use_dora": false, + "use_rslora": false +} \ No newline at end of file diff --git a/single_dataset/long_caption/VideoGameBunny_v1_1-Llama-3-8B-V-long_caption_dataset_2000_epochs_1_lora/adapter_model.safetensors b/single_dataset/long_caption/VideoGameBunny_v1_1-Llama-3-8B-V-long_caption_dataset_2000_epochs_1_lora/adapter_model.safetensors new file mode 100644 index 0000000000000000000000000000000000000000..651aeba4abfdb6710a1b953b7d6c469fc10534eb --- /dev/null +++ b/single_dataset/long_caption/VideoGameBunny_v1_1-Llama-3-8B-V-long_caption_dataset_2000_epochs_1_lora/adapter_model.safetensors @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:483c779bb16880a64bc2d1efce90d92f99570391c506bc6e5ea3da406b2d2179 +size 671150064 diff --git a/single_dataset/long_caption/VideoGameBunny_v1_1-Llama-3-8B-V-long_caption_dataset_2000_epochs_1_lora/config.json b/single_dataset/long_caption/VideoGameBunny_v1_1-Llama-3-8B-V-long_caption_dataset_2000_epochs_1_lora/config.json new file mode 100644 index 0000000000000000000000000000000000000000..ee7ed86f4ff6f3381a6bb24981934be89aa376e7 --- /dev/null +++ b/single_dataset/long_caption/VideoGameBunny_v1_1-Llama-3-8B-V-long_caption_dataset_2000_epochs_1_lora/config.json @@ -0,0 +1,45 @@ +{ + "_name_or_path": "./weights/Bunny-v1_1-Llama-3-8B-V", + "architectures": [ + "BunnyLlamaForCausalLM" + ], + "attention_bias": false, + "attention_dropout": 0.0, + "auto_map": { + "AutoConfig": "configuration_bunny_llama.BunnyLlamaConfig", + "AutoModelForCausalLM": "modeling_bunny_llama.BunnyLlamaForCausalLM" + }, + "bos_token_id": 128000, + "continuous_training": false, + "eos_token_id": 128001, + "freeze_mm_mlp_adapter": false, + "hidden_act": "silu", + "hidden_size": 4096, + "image_aspect_ratio": "pad", + "initializer_range": 0.02, + "intermediate_size": 14336, + "max_position_embeddings": 8192, + "mm_hidden_size": 3456, + "mm_projector_lr": null, + "mm_projector_type": "mlp2x_gelu", + "mm_vision_tower": "./weights/siglip-so400m-patch14-384", + "model_type": "bunny-llama", + "num_attention_heads": 32, + "num_hidden_layers": 32, + "num_key_value_heads": 8, + "pretraining_tp": 1, + "rms_norm_eps": 1e-05, + "rope_scaling": null, + "rope_theta": 500000.0, + "tie_word_embeddings": false, + "tokenizer_model_max_length": 2048, + "tokenizer_padding_side": "right", + "torch_dtype": "float16", + "transformers_version": "4.41.2", + "tune_mm_mlp_adapter": false, + "unfreeze_vision_tower": true, + "use_cache": true, + "use_mm_proj": true, + "use_s2": true, + "vocab_size": 128256 +} diff --git a/single_dataset/long_caption/VideoGameBunny_v1_1-Llama-3-8B-V-long_caption_dataset_2000_epochs_1_lora/non_lora_trainables.bin b/single_dataset/long_caption/VideoGameBunny_v1_1-Llama-3-8B-V-long_caption_dataset_2000_epochs_1_lora/non_lora_trainables.bin new file mode 100644 index 0000000000000000000000000000000000000000..0bf2a2688f8a97641d0462fdc0dddb272604c27b --- /dev/null +++ b/single_dataset/long_caption/VideoGameBunny_v1_1-Llama-3-8B-V-long_caption_dataset_2000_epochs_1_lora/non_lora_trainables.bin @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:da8e960957e2137b6feb2287acfa4440b3e6fd8f7293c07327837d10a9bc134b +size 918507402 diff --git a/single_dataset/long_caption/VideoGameBunny_v1_1-Llama-3-8B-V-long_caption_dataset_2000_epochs_1_lora/trainer_state.json b/single_dataset/long_caption/VideoGameBunny_v1_1-Llama-3-8B-V-long_caption_dataset_2000_epochs_1_lora/trainer_state.json new file mode 100644 index 0000000000000000000000000000000000000000..e22847176eb6f46cc078bbbf5646872348c4f46d --- /dev/null +++ b/single_dataset/long_caption/VideoGameBunny_v1_1-Llama-3-8B-V-long_caption_dataset_2000_epochs_1_lora/trainer_state.json @@ -0,0 +1,917 @@ +{ + "best_metric": null, + "best_model_checkpoint": null, + "epoch": 1.0, + "eval_steps": 500, + "global_step": 125, + "is_hyper_param_search": false, + "is_local_process_zero": true, + "is_world_process_zero": true, + "log_history": [ + { + "epoch": 0.008, + "grad_norm": 0.7897449163773217, + "learning_rate": 5e-05, + "loss": 1.0158, + "step": 1 + }, + { + "epoch": 0.016, + "grad_norm": 0.7286029077519581, + "learning_rate": 0.0001, + "loss": 0.9785, + "step": 2 + }, + { + "epoch": 0.024, + "grad_norm": 0.5007841633835526, + "learning_rate": 0.00015000000000000001, + "loss": 0.8868, + "step": 3 + }, + { + "epoch": 0.032, + "grad_norm": 0.551066986410025, + "learning_rate": 0.0002, + "loss": 0.869, + "step": 4 + }, + { + "epoch": 0.04, + "grad_norm": 0.7067240936501759, + "learning_rate": 0.00019996629653035126, + "loss": 0.8414, + "step": 5 + }, + { + "epoch": 0.048, + "grad_norm": 0.44664188050548564, + "learning_rate": 0.00019986520883988232, + "loss": 0.7815, + "step": 6 + }, + { + "epoch": 0.056, + "grad_norm": 0.4193441176930537, + "learning_rate": 0.00019969680506871137, + "loss": 0.804, + "step": 7 + }, + { + "epoch": 0.064, + "grad_norm": 0.41306296762126027, + "learning_rate": 0.00019946119873266613, + "loss": 0.7387, + "step": 8 + }, + { + "epoch": 0.072, + "grad_norm": 0.435261450186937, + "learning_rate": 0.00019915854864676664, + "loss": 0.7559, + "step": 9 + }, + { + "epoch": 0.08, + "grad_norm": 0.41019140182818303, + "learning_rate": 0.00019878905881817252, + "loss": 0.7582, + "step": 10 + }, + { + "epoch": 0.088, + "grad_norm": 0.43892155428418417, + "learning_rate": 0.00019835297830866826, + "loss": 0.7391, + "step": 11 + }, + { + "epoch": 0.096, + "grad_norm": 0.45186235095684296, + "learning_rate": 0.00019785060106677818, + "loss": 0.7633, + "step": 12 + }, + { + "epoch": 0.104, + "grad_norm": 0.44124204480242757, + "learning_rate": 0.00019728226572962473, + "loss": 0.7436, + "step": 13 + }, + { + "epoch": 0.112, + "grad_norm": 0.3664602718616742, + "learning_rate": 0.0001966483553946637, + "loss": 0.7012, + "step": 14 + }, + { + "epoch": 0.12, + "grad_norm": 0.36386121096582563, + "learning_rate": 0.00019594929736144976, + "loss": 0.7551, + "step": 15 + }, + { + "epoch": 0.128, + "grad_norm": 0.35587801041616296, + "learning_rate": 0.00019518556284360696, + "loss": 0.7146, + "step": 16 + }, + { + "epoch": 0.136, + "grad_norm": 0.34427088443737364, + "learning_rate": 0.0001943576666511982, + "loss": 0.6675, + "step": 17 + }, + { + "epoch": 0.144, + "grad_norm": 0.35523064882833827, + "learning_rate": 0.0001934661668437073, + "loss": 0.7573, + "step": 18 + }, + { + "epoch": 0.152, + "grad_norm": 0.36215020830520817, + "learning_rate": 0.0001925116643538684, + "loss": 0.7106, + "step": 19 + }, + { + "epoch": 0.16, + "grad_norm": 0.3397367701549374, + "learning_rate": 0.00019149480258259533, + "loss": 0.6803, + "step": 20 + }, + { + "epoch": 0.168, + "grad_norm": 0.41634440568087716, + "learning_rate": 0.00019041626696528503, + "loss": 0.6785, + "step": 21 + }, + { + "epoch": 0.176, + "grad_norm": 0.38124851642853896, + "learning_rate": 0.0001892767845097864, + "loss": 0.7187, + "step": 22 + }, + { + "epoch": 0.184, + "grad_norm": 0.38564705024675255, + "learning_rate": 0.00018807712330634642, + "loss": 0.7416, + "step": 23 + }, + { + "epoch": 0.192, + "grad_norm": 0.3848627816158859, + "learning_rate": 0.0001868180920098644, + "loss": 0.7437, + "step": 24 + }, + { + "epoch": 0.2, + "grad_norm": 0.39332044646176956, + "learning_rate": 0.00018550053929480202, + "loss": 0.7776, + "step": 25 + }, + { + "epoch": 0.208, + "grad_norm": 0.3884802424868953, + "learning_rate": 0.00018412535328311814, + "loss": 0.7332, + "step": 26 + }, + { + "epoch": 0.216, + "grad_norm": 0.3428409307296526, + "learning_rate": 0.0001826934609456129, + "loss": 0.7081, + "step": 27 + }, + { + "epoch": 0.224, + "grad_norm": 0.3303519063768321, + "learning_rate": 0.00018120582747708502, + "loss": 0.6989, + "step": 28 + }, + { + "epoch": 0.232, + "grad_norm": 0.30598505514605384, + "learning_rate": 0.0001796634556457236, + "loss": 0.6628, + "step": 29 + }, + { + "epoch": 0.24, + "grad_norm": 0.4709929717685942, + "learning_rate": 0.0001780673851171728, + "loss": 0.7294, + "step": 30 + }, + { + "epoch": 0.248, + "grad_norm": 0.3607051665952329, + "learning_rate": 0.00017641869175372493, + "loss": 0.7177, + "step": 31 + }, + { + "epoch": 0.256, + "grad_norm": 0.3396445814711014, + "learning_rate": 0.00017471848688911464, + "loss": 0.6752, + "step": 32 + }, + { + "epoch": 0.264, + "grad_norm": 0.3843901133072089, + "learning_rate": 0.000172967916579403, + "loss": 0.7306, + "step": 33 + }, + { + "epoch": 0.272, + "grad_norm": 0.3209838157061621, + "learning_rate": 0.00017116816083045602, + "loss": 0.6754, + "step": 34 + }, + { + "epoch": 0.28, + "grad_norm": 0.3157296469039439, + "learning_rate": 0.0001693204328025389, + "loss": 0.7055, + "step": 35 + }, + { + "epoch": 0.288, + "grad_norm": 0.3636809922566001, + "learning_rate": 0.00016742597799256182, + "loss": 0.755, + "step": 36 + }, + { + "epoch": 0.296, + "grad_norm": 0.33887530853673087, + "learning_rate": 0.00016548607339452853, + "loss": 0.6715, + "step": 37 + }, + { + "epoch": 0.304, + "grad_norm": 0.3139532292851415, + "learning_rate": 0.00016350202663875386, + "loss": 0.6737, + "step": 38 + }, + { + "epoch": 0.312, + "grad_norm": 0.3887953669372525, + "learning_rate": 0.0001614751751104301, + "loss": 0.7348, + "step": 39 + }, + { + "epoch": 0.32, + "grad_norm": 0.34671023775518583, + "learning_rate": 0.00015940688504813662, + "loss": 0.6969, + "step": 40 + }, + { + "epoch": 0.328, + "grad_norm": 0.4055661542804412, + "learning_rate": 0.00015729855062290022, + "loss": 0.72, + "step": 41 + }, + { + "epoch": 0.336, + "grad_norm": 0.3281789122262739, + "learning_rate": 0.00015515159299842707, + "loss": 0.6991, + "step": 42 + }, + { + "epoch": 0.344, + "grad_norm": 0.33527063756600683, + "learning_rate": 0.00015296745937313987, + "loss": 0.6772, + "step": 43 + }, + { + "epoch": 0.352, + "grad_norm": 0.34628576129023203, + "learning_rate": 0.00015074762200466556, + "loss": 0.6718, + "step": 44 + }, + { + "epoch": 0.36, + "grad_norm": 0.32625551196869307, + "learning_rate": 0.00014849357721743168, + "loss": 0.6714, + "step": 45 + }, + { + "epoch": 0.368, + "grad_norm": 0.33248202305747837, + "learning_rate": 0.00014620684439403962, + "loss": 0.6676, + "step": 46 + }, + { + "epoch": 0.376, + "grad_norm": 0.3203522050028239, + "learning_rate": 0.0001438889649510956, + "loss": 0.6786, + "step": 47 + }, + { + "epoch": 0.384, + "grad_norm": 0.38431418470170736, + "learning_rate": 0.00014154150130018866, + "loss": 0.6839, + "step": 48 + }, + { + "epoch": 0.392, + "grad_norm": 0.3543522461020586, + "learning_rate": 0.00013916603579471705, + "loss": 0.6793, + "step": 49 + }, + { + "epoch": 0.4, + "grad_norm": 0.3938030668231684, + "learning_rate": 0.000136764169663272, + "loss": 0.7094, + "step": 50 + }, + { + "epoch": 0.408, + "grad_norm": 0.3434735143890009, + "learning_rate": 0.00013433752193029886, + "loss": 0.6976, + "step": 51 + }, + { + "epoch": 0.416, + "grad_norm": 0.33074261951215367, + "learning_rate": 0.00013188772832476188, + "loss": 0.6408, + "step": 52 + }, + { + "epoch": 0.424, + "grad_norm": 0.325639624418813, + "learning_rate": 0.00012941644017754964, + "loss": 0.6598, + "step": 53 + }, + { + "epoch": 0.432, + "grad_norm": 0.32277191745270095, + "learning_rate": 0.00012692532330836346, + "loss": 0.6417, + "step": 54 + }, + { + "epoch": 0.44, + "grad_norm": 0.3469996270480694, + "learning_rate": 0.00012441605690283915, + "loss": 0.6502, + "step": 55 + }, + { + "epoch": 0.448, + "grad_norm": 0.3222945210391576, + "learning_rate": 0.0001218903323806595, + "loss": 0.6656, + "step": 56 + }, + { + "epoch": 0.456, + "grad_norm": 0.358944803881362, + "learning_rate": 0.00011934985225541998, + "loss": 0.6622, + "step": 57 + }, + { + "epoch": 0.464, + "grad_norm": 0.3726202507097444, + "learning_rate": 0.00011679632898701649, + "loss": 0.6961, + "step": 58 + }, + { + "epoch": 0.472, + "grad_norm": 0.39039592382246363, + "learning_rate": 0.00011423148382732853, + "loss": 0.6898, + "step": 59 + }, + { + "epoch": 0.48, + "grad_norm": 0.3155957271966707, + "learning_rate": 0.00011165704565997593, + "loss": 0.6199, + "step": 60 + }, + { + "epoch": 0.488, + "grad_norm": 0.3593109426693744, + "learning_rate": 0.00010907474983493144, + "loss": 0.6475, + "step": 61 + }, + { + "epoch": 0.496, + "grad_norm": 0.3369009994062507, + "learning_rate": 0.0001064863369987743, + "loss": 0.6385, + "step": 62 + }, + { + "epoch": 0.504, + "grad_norm": 0.34306269167543985, + "learning_rate": 0.00010389355192137377, + "loss": 0.6595, + "step": 63 + }, + { + "epoch": 0.512, + "grad_norm": 0.308051310722022, + "learning_rate": 0.0001012981423197931, + "loss": 0.6155, + "step": 64 + }, + { + "epoch": 0.52, + "grad_norm": 0.3352389931983909, + "learning_rate": 9.870185768020693e-05, + "loss": 0.6081, + "step": 65 + }, + { + "epoch": 0.528, + "grad_norm": 0.3543975811840973, + "learning_rate": 9.610644807862625e-05, + "loss": 0.6623, + "step": 66 + }, + { + "epoch": 0.536, + "grad_norm": 0.3002191119803479, + "learning_rate": 9.35136630012257e-05, + "loss": 0.6203, + "step": 67 + }, + { + "epoch": 0.544, + "grad_norm": 0.3164322589890033, + "learning_rate": 9.092525016506858e-05, + "loss": 0.653, + "step": 68 + }, + { + "epoch": 0.552, + "grad_norm": 0.31243198237928543, + "learning_rate": 8.83429543400241e-05, + "loss": 0.6454, + "step": 69 + }, + { + "epoch": 0.56, + "grad_norm": 0.32215726035020803, + "learning_rate": 8.57685161726715e-05, + "loss": 0.6572, + "step": 70 + }, + { + "epoch": 0.568, + "grad_norm": 0.3013215539057925, + "learning_rate": 8.320367101298351e-05, + "loss": 0.6145, + "step": 71 + }, + { + "epoch": 0.576, + "grad_norm": 0.3315404842154109, + "learning_rate": 8.065014774458003e-05, + "loss": 0.656, + "step": 72 + }, + { + "epoch": 0.584, + "grad_norm": 0.3258222002707797, + "learning_rate": 7.810966761934053e-05, + "loss": 0.6498, + "step": 73 + }, + { + "epoch": 0.592, + "grad_norm": 0.31360991756541984, + "learning_rate": 7.558394309716088e-05, + "loss": 0.6379, + "step": 74 + }, + { + "epoch": 0.6, + "grad_norm": 0.3177067712810427, + "learning_rate": 7.307467669163655e-05, + "loss": 0.6955, + "step": 75 + }, + { + "epoch": 0.608, + "grad_norm": 0.3317732798005526, + "learning_rate": 7.058355982245037e-05, + "loss": 0.6479, + "step": 76 + }, + { + "epoch": 0.616, + "grad_norm": 0.28782041118815993, + "learning_rate": 6.811227167523815e-05, + "loss": 0.5791, + "step": 77 + }, + { + "epoch": 0.624, + "grad_norm": 0.3181330408066894, + "learning_rate": 6.566247806970119e-05, + "loss": 0.6304, + "step": 78 + }, + { + "epoch": 0.632, + "grad_norm": 0.3105640994734434, + "learning_rate": 6.323583033672799e-05, + "loss": 0.6146, + "step": 79 + }, + { + "epoch": 0.64, + "grad_norm": 0.31041728231206744, + "learning_rate": 6.083396420528298e-05, + "loss": 0.6782, + "step": 80 + }, + { + "epoch": 0.648, + "grad_norm": 0.28998618193622805, + "learning_rate": 5.845849869981137e-05, + "loss": 0.6205, + "step": 81 + }, + { + "epoch": 0.656, + "grad_norm": 0.3015857735322906, + "learning_rate": 5.611103504890444e-05, + "loss": 0.6126, + "step": 82 + }, + { + "epoch": 0.664, + "grad_norm": 0.32714747306023156, + "learning_rate": 5.379315560596038e-05, + "loss": 0.6886, + "step": 83 + }, + { + "epoch": 0.672, + "grad_norm": 0.3259955110435763, + "learning_rate": 5.1506422782568345e-05, + "loss": 0.6559, + "step": 84 + }, + { + "epoch": 0.68, + "grad_norm": 0.35745803891368194, + "learning_rate": 4.9252377995334444e-05, + "loss": 0.6654, + "step": 85 + }, + { + "epoch": 0.688, + "grad_norm": 0.28442451383341705, + "learning_rate": 4.703254062686017e-05, + "loss": 0.6287, + "step": 86 + }, + { + "epoch": 0.696, + "grad_norm": 0.31498875825303785, + "learning_rate": 4.484840700157295e-05, + "loss": 0.6631, + "step": 87 + }, + { + "epoch": 0.704, + "grad_norm": 0.35372019891399126, + "learning_rate": 4.270144937709981e-05, + "loss": 0.6475, + "step": 88 + }, + { + "epoch": 0.712, + "grad_norm": 0.3331501046613709, + "learning_rate": 4.059311495186338e-05, + "loss": 0.6494, + "step": 89 + }, + { + "epoch": 0.72, + "grad_norm": 0.28067458269386936, + "learning_rate": 3.852482488956992e-05, + "loss": 0.5944, + "step": 90 + }, + { + "epoch": 0.728, + "grad_norm": 0.37713100766680624, + "learning_rate": 3.649797336124615e-05, + "loss": 0.6791, + "step": 91 + }, + { + "epoch": 0.736, + "grad_norm": 0.31509292608707157, + "learning_rate": 3.45139266054715e-05, + "loss": 0.6719, + "step": 92 + }, + { + "epoch": 0.744, + "grad_norm": 0.3152963433402979, + "learning_rate": 3.257402200743821e-05, + "loss": 0.6616, + "step": 93 + }, + { + "epoch": 0.752, + "grad_norm": 0.31293005737263796, + "learning_rate": 3.0679567197461134e-05, + "loss": 0.6147, + "step": 94 + }, + { + "epoch": 0.76, + "grad_norm": 0.3092685519717373, + "learning_rate": 2.8831839169543996e-05, + "loss": 0.6758, + "step": 95 + }, + { + "epoch": 0.768, + "grad_norm": 0.3533787649261356, + "learning_rate": 2.7032083420597e-05, + "loss": 0.6105, + "step": 96 + }, + { + "epoch": 0.776, + "grad_norm": 0.34853704155291676, + "learning_rate": 2.528151311088537e-05, + "loss": 0.6801, + "step": 97 + }, + { + "epoch": 0.784, + "grad_norm": 0.3219342935342763, + "learning_rate": 2.3581308246275103e-05, + "loss": 0.6127, + "step": 98 + }, + { + "epoch": 0.792, + "grad_norm": 0.3050310948436688, + "learning_rate": 2.1932614882827197e-05, + "loss": 0.6367, + "step": 99 + }, + { + "epoch": 0.8, + "grad_norm": 0.31232129631117883, + "learning_rate": 2.03365443542764e-05, + "loss": 0.6458, + "step": 100 + }, + { + "epoch": 0.808, + "grad_norm": 0.3140181650190237, + "learning_rate": 1.879417252291502e-05, + "loss": 0.6416, + "step": 101 + }, + { + "epoch": 0.816, + "grad_norm": 0.3201000886253217, + "learning_rate": 1.730653905438714e-05, + "loss": 0.6114, + "step": 102 + }, + { + "epoch": 0.824, + "grad_norm": 0.3156734020633334, + "learning_rate": 1.587464671688187e-05, + "loss": 0.5782, + "step": 103 + }, + { + "epoch": 0.832, + "grad_norm": 0.2933312644780593, + "learning_rate": 1.4499460705197998e-05, + "loss": 0.6489, + "step": 104 + }, + { + "epoch": 0.84, + "grad_norm": 0.300575715048929, + "learning_rate": 1.3181907990135622e-05, + "loss": 0.6312, + "step": 105 + }, + { + "epoch": 0.848, + "grad_norm": 0.3283711441741722, + "learning_rate": 1.1922876693653585e-05, + "loss": 0.6499, + "step": 106 + }, + { + "epoch": 0.856, + "grad_norm": 0.34622150730617574, + "learning_rate": 1.0723215490213634e-05, + "loss": 0.6292, + "step": 107 + }, + { + "epoch": 0.864, + "grad_norm": 0.3358034751151104, + "learning_rate": 9.583733034714981e-06, + "loss": 0.6374, + "step": 108 + }, + { + "epoch": 0.872, + "grad_norm": 0.291595756315806, + "learning_rate": 8.505197417404687e-06, + "loss": 0.6187, + "step": 109 + }, + { + "epoch": 0.88, + "grad_norm": 0.3385592583782257, + "learning_rate": 7.488335646131628e-06, + "loss": 0.6704, + "step": 110 + }, + { + "epoch": 0.888, + "grad_norm": 0.3211302101059581, + "learning_rate": 6.533833156292679e-06, + "loss": 0.635, + "step": 111 + }, + { + "epoch": 0.896, + "grad_norm": 0.32649921097581136, + "learning_rate": 5.6423333488018095e-06, + "loss": 0.6635, + "step": 112 + }, + { + "epoch": 0.904, + "grad_norm": 0.3264709794087455, + "learning_rate": 4.8144371563930476e-06, + "loss": 0.647, + "step": 113 + }, + { + "epoch": 0.912, + "grad_norm": 0.3300373343975954, + "learning_rate": 4.050702638550275e-06, + "loss": 0.631, + "step": 114 + }, + { + "epoch": 0.92, + "grad_norm": 0.32177072851179134, + "learning_rate": 3.3516446053363015e-06, + "loss": 0.629, + "step": 115 + }, + { + "epoch": 0.928, + "grad_norm": 0.3158053643511603, + "learning_rate": 2.717734270375272e-06, + "loss": 0.6011, + "step": 116 + }, + { + "epoch": 0.936, + "grad_norm": 0.2967406337957916, + "learning_rate": 2.1493989332218468e-06, + "loss": 0.6299, + "step": 117 + }, + { + "epoch": 0.944, + "grad_norm": 0.34898701539844956, + "learning_rate": 1.6470216913317626e-06, + "loss": 0.6639, + "step": 118 + }, + { + "epoch": 0.952, + "grad_norm": 0.2935252997159363, + "learning_rate": 1.2109411818274852e-06, + "loss": 0.6434, + "step": 119 + }, + { + "epoch": 0.96, + "grad_norm": 0.3056888738443196, + "learning_rate": 8.41451353233369e-07, + "loss": 0.6242, + "step": 120 + }, + { + "epoch": 0.968, + "grad_norm": 0.28229812547022365, + "learning_rate": 5.388012673338661e-07, + "loss": 0.5963, + "step": 121 + }, + { + "epoch": 0.976, + "grad_norm": 0.3043090639132224, + "learning_rate": 3.0319493128866396e-07, + "loss": 0.6056, + "step": 122 + }, + { + "epoch": 0.984, + "grad_norm": 0.3209412637695787, + "learning_rate": 1.3479116011769767e-07, + "loss": 0.6676, + "step": 123 + }, + { + "epoch": 0.992, + "grad_norm": 0.3293937488127893, + "learning_rate": 3.370346964876036e-08, + "loss": 0.684, + "step": 124 + }, + { + "epoch": 1.0, + "grad_norm": 0.3203293853257442, + "learning_rate": 0.0, + "loss": 0.6528, + "step": 125 + }, + { + "epoch": 1.0, + "step": 125, + "total_flos": 122223981690880.0, + "train_loss": 0.6799252457618713, + "train_runtime": 2005.2448, + "train_samples_per_second": 0.997, + "train_steps_per_second": 0.062 + } + ], + "logging_steps": 1.0, + "max_steps": 125, + "num_input_tokens_seen": 0, + "num_train_epochs": 1, + "save_steps": 500, + "stateful_callbacks": { + "TrainerControl": { + "args": { + "should_epoch_stop": false, + "should_evaluate": false, + "should_log": false, + "should_save": false, + "should_training_stop": false + }, + "attributes": {} + } + }, + "total_flos": 122223981690880.0, + "train_batch_size": 8, + "trial_name": null, + "trial_params": null +} diff --git a/single_dataset/long_caption/VideoGameBunny_v1_1-Llama-3-8B-V-long_caption_dataset_2000_epochs_2_lora/README.md b/single_dataset/long_caption/VideoGameBunny_v1_1-Llama-3-8B-V-long_caption_dataset_2000_epochs_2_lora/README.md new file mode 100644 index 0000000000000000000000000000000000000000..4f291d8d6fdb82fcff274f83bcc52b834347797c --- /dev/null +++ b/single_dataset/long_caption/VideoGameBunny_v1_1-Llama-3-8B-V-long_caption_dataset_2000_epochs_2_lora/README.md @@ -0,0 +1,202 @@ +--- +library_name: peft +base_model: ./weights/Bunny-v1_1-Llama-3-8B-V +--- + +# Model Card for Model ID + + + + + +## Model Details + +### Model Description + + + + + +- **Developed by:** [More Information Needed] +- **Funded by [optional]:** [More Information Needed] +- **Shared by [optional]:** [More Information Needed] +- **Model type:** [More Information Needed] +- **Language(s) (NLP):** [More Information Needed] +- **License:** [More Information Needed] +- **Finetuned from model [optional]:** [More Information Needed] + +### Model Sources [optional] + + + +- **Repository:** [More Information Needed] +- **Paper [optional]:** [More Information Needed] +- **Demo [optional]:** [More Information Needed] + +## Uses + + + +### Direct Use + + + +[More Information Needed] + +### Downstream Use [optional] + + + +[More Information Needed] + +### Out-of-Scope Use + + + +[More Information Needed] + +## Bias, Risks, and Limitations + + + +[More Information Needed] + +### Recommendations + + + +Users (both direct and downstream) should be made aware of the risks, biases and limitations of the model. More information needed for further recommendations. + +## How to Get Started with the Model + +Use the code below to get started with the model. + +[More Information Needed] + +## Training Details + +### Training Data + + + +[More Information Needed] + +### Training Procedure + + + +#### Preprocessing [optional] + +[More Information Needed] + + +#### Training Hyperparameters + +- **Training regime:** [More Information Needed] + +#### Speeds, Sizes, Times [optional] + + + +[More Information Needed] + +## Evaluation + + + +### Testing Data, Factors & Metrics + +#### Testing Data + + + +[More Information Needed] + +#### Factors + + + +[More Information Needed] + +#### Metrics + + + +[More Information Needed] + +### Results + +[More Information Needed] + +#### Summary + + + +## Model Examination [optional] + + + +[More Information Needed] + +## Environmental Impact + + + +Carbon emissions can be estimated using the [Machine Learning Impact calculator](https://mlco2.github.io/impact#compute) presented in [Lacoste et al. (2019)](https://arxiv.org/abs/1910.09700). + +- **Hardware Type:** [More Information Needed] +- **Hours used:** [More Information Needed] +- **Cloud Provider:** [More Information Needed] +- **Compute Region:** [More Information Needed] +- **Carbon Emitted:** [More Information Needed] + +## Technical Specifications [optional] + +### Model Architecture and Objective + +[More Information Needed] + +### Compute Infrastructure + +[More Information Needed] + +#### Hardware + +[More Information Needed] + +#### Software + +[More Information Needed] + +## Citation [optional] + + + +**BibTeX:** + +[More Information Needed] + +**APA:** + +[More Information Needed] + +## Glossary [optional] + + + +[More Information Needed] + +## More Information [optional] + +[More Information Needed] + +## Model Card Authors [optional] + +[More Information Needed] + +## Model Card Contact + +[More Information Needed] +### Framework versions + +- PEFT 0.11.1 \ No newline at end of file diff --git a/single_dataset/long_caption/VideoGameBunny_v1_1-Llama-3-8B-V-long_caption_dataset_2000_epochs_2_lora/adapter_config.json b/single_dataset/long_caption/VideoGameBunny_v1_1-Llama-3-8B-V-long_caption_dataset_2000_epochs_2_lora/adapter_config.json new file mode 100644 index 0000000000000000000000000000000000000000..b099c2177afe74fbee058fa2f26dff7569a57566 --- /dev/null +++ b/single_dataset/long_caption/VideoGameBunny_v1_1-Llama-3-8B-V-long_caption_dataset_2000_epochs_2_lora/adapter_config.json @@ -0,0 +1,34 @@ +{ + "alpha_pattern": {}, + "auto_mapping": null, + "base_model_name_or_path": "./weights/Bunny-v1_1-Llama-3-8B-V", + "bias": "none", + "fan_in_fan_out": false, + "inference_mode": true, + "init_lora_weights": true, + "layer_replication": null, + "layers_pattern": null, + "layers_to_transform": null, + "loftq_config": {}, + "lora_alpha": 256, + "lora_dropout": 0.1, + "megatron_config": null, + "megatron_core": "megatron.core", + "modules_to_save": null, + "peft_type": "LORA", + "r": 128, + "rank_pattern": {}, + "revision": null, + "target_modules": [ + "o_proj", + "k_proj", + "v_proj", + "down_proj", + "gate_proj", + "q_proj", + "up_proj" + ], + "task_type": "CAUSAL_LM", + "use_dora": false, + "use_rslora": false +} \ No newline at end of file diff --git a/single_dataset/long_caption/VideoGameBunny_v1_1-Llama-3-8B-V-long_caption_dataset_2000_epochs_2_lora/adapter_model.safetensors b/single_dataset/long_caption/VideoGameBunny_v1_1-Llama-3-8B-V-long_caption_dataset_2000_epochs_2_lora/adapter_model.safetensors new file mode 100644 index 0000000000000000000000000000000000000000..b0a3dd656d69a633da22888c7ebd9e1470f349c2 --- /dev/null +++ b/single_dataset/long_caption/VideoGameBunny_v1_1-Llama-3-8B-V-long_caption_dataset_2000_epochs_2_lora/adapter_model.safetensors @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:651813e02a6e841f95aead8d7be4dc00f7054c6c7d118943b8eb39c0e338b841 +size 671150064 diff --git a/single_dataset/long_caption/VideoGameBunny_v1_1-Llama-3-8B-V-long_caption_dataset_2000_epochs_2_lora/config.json b/single_dataset/long_caption/VideoGameBunny_v1_1-Llama-3-8B-V-long_caption_dataset_2000_epochs_2_lora/config.json new file mode 100644 index 0000000000000000000000000000000000000000..ee7ed86f4ff6f3381a6bb24981934be89aa376e7 --- /dev/null +++ b/single_dataset/long_caption/VideoGameBunny_v1_1-Llama-3-8B-V-long_caption_dataset_2000_epochs_2_lora/config.json @@ -0,0 +1,45 @@ +{ + "_name_or_path": "./weights/Bunny-v1_1-Llama-3-8B-V", + "architectures": [ + "BunnyLlamaForCausalLM" + ], + "attention_bias": false, + "attention_dropout": 0.0, + "auto_map": { + "AutoConfig": "configuration_bunny_llama.BunnyLlamaConfig", + "AutoModelForCausalLM": "modeling_bunny_llama.BunnyLlamaForCausalLM" + }, + "bos_token_id": 128000, + "continuous_training": false, + "eos_token_id": 128001, + "freeze_mm_mlp_adapter": false, + "hidden_act": "silu", + "hidden_size": 4096, + "image_aspect_ratio": "pad", + "initializer_range": 0.02, + "intermediate_size": 14336, + "max_position_embeddings": 8192, + "mm_hidden_size": 3456, + "mm_projector_lr": null, + "mm_projector_type": "mlp2x_gelu", + "mm_vision_tower": "./weights/siglip-so400m-patch14-384", + "model_type": "bunny-llama", + "num_attention_heads": 32, + "num_hidden_layers": 32, + "num_key_value_heads": 8, + "pretraining_tp": 1, + "rms_norm_eps": 1e-05, + "rope_scaling": null, + "rope_theta": 500000.0, + "tie_word_embeddings": false, + "tokenizer_model_max_length": 2048, + "tokenizer_padding_side": "right", + "torch_dtype": "float16", + "transformers_version": "4.41.2", + "tune_mm_mlp_adapter": false, + "unfreeze_vision_tower": true, + "use_cache": true, + "use_mm_proj": true, + "use_s2": true, + "vocab_size": 128256 +} diff --git a/single_dataset/long_caption/VideoGameBunny_v1_1-Llama-3-8B-V-long_caption_dataset_2000_epochs_2_lora/non_lora_trainables.bin b/single_dataset/long_caption/VideoGameBunny_v1_1-Llama-3-8B-V-long_caption_dataset_2000_epochs_2_lora/non_lora_trainables.bin new file mode 100644 index 0000000000000000000000000000000000000000..f22ace60b438e6f004b60dcc70a9360993270bbc --- /dev/null +++ b/single_dataset/long_caption/VideoGameBunny_v1_1-Llama-3-8B-V-long_caption_dataset_2000_epochs_2_lora/non_lora_trainables.bin @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:fba9a24d2cb3e8de37aa61b86e709f5084e1df61cfdeac00092e390771e9b32a +size 918507402 diff --git a/single_dataset/long_caption/VideoGameBunny_v1_1-Llama-3-8B-V-long_caption_dataset_2000_epochs_2_lora/trainer_state.json b/single_dataset/long_caption/VideoGameBunny_v1_1-Llama-3-8B-V-long_caption_dataset_2000_epochs_2_lora/trainer_state.json new file mode 100644 index 0000000000000000000000000000000000000000..0d39418ae70f790557bb5b5429f93d25b9da979d --- /dev/null +++ b/single_dataset/long_caption/VideoGameBunny_v1_1-Llama-3-8B-V-long_caption_dataset_2000_epochs_2_lora/trainer_state.json @@ -0,0 +1,1792 @@ +{ + "best_metric": null, + "best_model_checkpoint": null, + "epoch": 2.0, + "eval_steps": 500, + "global_step": 250, + "is_hyper_param_search": false, + "is_local_process_zero": true, + "is_world_process_zero": true, + "log_history": [ + { + "epoch": 0.008, + "grad_norm": 0.8103855719925606, + "learning_rate": 2.5e-05, + "loss": 1.0158, + "step": 1 + }, + { + "epoch": 0.016, + "grad_norm": 0.749041337221085, + "learning_rate": 5e-05, + "loss": 0.9785, + "step": 2 + }, + { + "epoch": 0.024, + "grad_norm": 0.6098257778772503, + "learning_rate": 7.500000000000001e-05, + "loss": 0.9223, + "step": 3 + }, + { + "epoch": 0.032, + "grad_norm": 0.45221154229482324, + "learning_rate": 0.0001, + "loss": 0.8843, + "step": 4 + }, + { + "epoch": 0.04, + "grad_norm": 0.6036953220555253, + "learning_rate": 0.000125, + "loss": 0.8286, + "step": 5 + }, + { + "epoch": 0.048, + "grad_norm": 0.620523806065544, + "learning_rate": 0.00015000000000000001, + "loss": 0.815, + "step": 6 + }, + { + "epoch": 0.056, + "grad_norm": 0.4957034078191343, + "learning_rate": 0.000175, + "loss": 0.8283, + "step": 7 + }, + { + "epoch": 0.064, + "grad_norm": 0.42028357018114476, + "learning_rate": 0.0002, + "loss": 0.7462, + "step": 8 + }, + { + "epoch": 0.072, + "grad_norm": 0.4560597515326352, + "learning_rate": 0.0001999915737775817, + "loss": 0.7616, + "step": 9 + }, + { + "epoch": 0.08, + "grad_norm": 0.41835878738687804, + "learning_rate": 0.00019996629653035126, + "loss": 0.767, + "step": 10 + }, + { + "epoch": 0.088, + "grad_norm": 0.4519080647788084, + "learning_rate": 0.00019992417251814282, + "loss": 0.7513, + "step": 11 + }, + { + "epoch": 0.096, + "grad_norm": 0.4374793073145124, + "learning_rate": 0.00019986520883988232, + "loss": 0.766, + "step": 12 + }, + { + "epoch": 0.104, + "grad_norm": 0.4322774659638822, + "learning_rate": 0.0001997894154323911, + "loss": 0.7468, + "step": 13 + }, + { + "epoch": 0.112, + "grad_norm": 0.38086545856215315, + "learning_rate": 0.00019969680506871137, + "loss": 0.706, + "step": 14 + }, + { + "epoch": 0.12, + "grad_norm": 0.39822175757488365, + "learning_rate": 0.0001995873933559535, + "loss": 0.7625, + "step": 15 + }, + { + "epoch": 0.128, + "grad_norm": 0.3725703988716922, + "learning_rate": 0.00019946119873266613, + "loss": 0.7136, + "step": 16 + }, + { + "epoch": 0.136, + "grad_norm": 0.3478374330159542, + "learning_rate": 0.0001993182424657285, + "loss": 0.6671, + "step": 17 + }, + { + "epoch": 0.144, + "grad_norm": 0.36446041603835655, + "learning_rate": 0.00019915854864676664, + "loss": 0.7601, + "step": 18 + }, + { + "epoch": 0.152, + "grad_norm": 0.36694244434340323, + "learning_rate": 0.0001989821441880933, + "loss": 0.7126, + "step": 19 + }, + { + "epoch": 0.16, + "grad_norm": 0.34083750152431197, + "learning_rate": 0.00019878905881817252, + "loss": 0.6833, + "step": 20 + }, + { + "epoch": 0.168, + "grad_norm": 0.36394234532749736, + "learning_rate": 0.0001985793250766098, + "loss": 0.6821, + "step": 21 + }, + { + "epoch": 0.176, + "grad_norm": 0.3709212631288505, + "learning_rate": 0.00019835297830866826, + "loss": 0.7241, + "step": 22 + }, + { + "epoch": 0.184, + "grad_norm": 0.36975396353372153, + "learning_rate": 0.00019811005665931205, + "loss": 0.7421, + "step": 23 + }, + { + "epoch": 0.192, + "grad_norm": 0.3740507599475808, + "learning_rate": 0.00019785060106677818, + "loss": 0.7452, + "step": 24 + }, + { + "epoch": 0.2, + "grad_norm": 0.4306403705259776, + "learning_rate": 0.0001975746552556772, + "loss": 0.7769, + "step": 25 + }, + { + "epoch": 0.208, + "grad_norm": 0.362436854577378, + "learning_rate": 0.00019728226572962473, + "loss": 0.7345, + "step": 26 + }, + { + "epoch": 0.216, + "grad_norm": 0.34354980264780216, + "learning_rate": 0.0001969734817634044, + "loss": 0.7058, + "step": 27 + }, + { + "epoch": 0.224, + "grad_norm": 0.34800648439170206, + "learning_rate": 0.0001966483553946637, + "loss": 0.6982, + "step": 28 + }, + { + "epoch": 0.232, + "grad_norm": 0.3053541603133724, + "learning_rate": 0.00019630694141514464, + "loss": 0.663, + "step": 29 + }, + { + "epoch": 0.24, + "grad_norm": 0.3575768848594637, + "learning_rate": 0.00019594929736144976, + "loss": 0.7314, + "step": 30 + }, + { + "epoch": 0.248, + "grad_norm": 0.36309312304789304, + "learning_rate": 0.0001955754835053459, + "loss": 0.7163, + "step": 31 + }, + { + "epoch": 0.256, + "grad_norm": 0.3265601653767092, + "learning_rate": 0.00019518556284360696, + "loss": 0.6771, + "step": 32 + }, + { + "epoch": 0.264, + "grad_norm": 0.3856001462608568, + "learning_rate": 0.0001947796010873974, + "loss": 0.7349, + "step": 33 + }, + { + "epoch": 0.272, + "grad_norm": 0.3291353426839004, + "learning_rate": 0.0001943576666511982, + "loss": 0.6753, + "step": 34 + }, + { + "epoch": 0.28, + "grad_norm": 0.32706839904535945, + "learning_rate": 0.0001939198306412775, + "loss": 0.7078, + "step": 35 + }, + { + "epoch": 0.288, + "grad_norm": 0.35810709497865256, + "learning_rate": 0.0001934661668437073, + "loss": 0.7593, + "step": 36 + }, + { + "epoch": 0.296, + "grad_norm": 0.35027802088238474, + "learning_rate": 0.0001929967517119289, + "loss": 0.6751, + "step": 37 + }, + { + "epoch": 0.304, + "grad_norm": 0.3319865050168186, + "learning_rate": 0.0001925116643538684, + "loss": 0.6748, + "step": 38 + }, + { + "epoch": 0.312, + "grad_norm": 0.3531139938876366, + "learning_rate": 0.0001920109865186052, + "loss": 0.7372, + "step": 39 + }, + { + "epoch": 0.32, + "grad_norm": 0.35271327408605185, + "learning_rate": 0.00019149480258259533, + "loss": 0.6992, + "step": 40 + }, + { + "epoch": 0.328, + "grad_norm": 0.41785713317021467, + "learning_rate": 0.00019096319953545185, + "loss": 0.7252, + "step": 41 + }, + { + "epoch": 0.336, + "grad_norm": 0.328699243494675, + "learning_rate": 0.00019041626696528503, + "loss": 0.7003, + "step": 42 + }, + { + "epoch": 0.344, + "grad_norm": 0.3381449706301949, + "learning_rate": 0.00018985409704360456, + "loss": 0.6796, + "step": 43 + }, + { + "epoch": 0.352, + "grad_norm": 0.37297787530769627, + "learning_rate": 0.0001892767845097864, + "loss": 0.6778, + "step": 44 + }, + { + "epoch": 0.36, + "grad_norm": 0.379159875115435, + "learning_rate": 0.00018868442665510678, + "loss": 0.6713, + "step": 45 + }, + { + "epoch": 0.368, + "grad_norm": 0.3377589122295175, + "learning_rate": 0.00018807712330634642, + "loss": 0.6695, + "step": 46 + }, + { + "epoch": 0.376, + "grad_norm": 0.31996790180194656, + "learning_rate": 0.00018745497680896722, + "loss": 0.6832, + "step": 47 + }, + { + "epoch": 0.384, + "grad_norm": 0.3540049565362026, + "learning_rate": 0.0001868180920098644, + "loss": 0.6877, + "step": 48 + }, + { + "epoch": 0.392, + "grad_norm": 0.33989354564300944, + "learning_rate": 0.0001861665762396974, + "loss": 0.6877, + "step": 49 + }, + { + "epoch": 0.4, + "grad_norm": 0.3824615978075444, + "learning_rate": 0.00018550053929480202, + "loss": 0.7122, + "step": 50 + }, + { + "epoch": 0.408, + "grad_norm": 0.37033212364615214, + "learning_rate": 0.00018482009341868697, + "loss": 0.7023, + "step": 51 + }, + { + "epoch": 0.416, + "grad_norm": 0.3542820245473082, + "learning_rate": 0.00018412535328311814, + "loss": 0.6467, + "step": 52 + }, + { + "epoch": 0.424, + "grad_norm": 0.3389377943078783, + "learning_rate": 0.00018341643596879367, + "loss": 0.6637, + "step": 53 + }, + { + "epoch": 0.432, + "grad_norm": 0.3417210956425829, + "learning_rate": 0.0001826934609456129, + "loss": 0.6436, + "step": 54 + }, + { + "epoch": 0.44, + "grad_norm": 0.3381593766262924, + "learning_rate": 0.00018195655005254273, + "loss": 0.6526, + "step": 55 + }, + { + "epoch": 0.448, + "grad_norm": 0.33860874870666346, + "learning_rate": 0.00018120582747708502, + "loss": 0.6693, + "step": 56 + }, + { + "epoch": 0.456, + "grad_norm": 0.3400023167966099, + "learning_rate": 0.00018044141973434758, + "loss": 0.6654, + "step": 57 + }, + { + "epoch": 0.464, + "grad_norm": 0.37388798138190976, + "learning_rate": 0.0001796634556457236, + "loss": 0.7037, + "step": 58 + }, + { + "epoch": 0.472, + "grad_norm": 0.3502639926183974, + "learning_rate": 0.00017887206631718203, + "loss": 0.6953, + "step": 59 + }, + { + "epoch": 0.48, + "grad_norm": 0.3292362366547009, + "learning_rate": 0.0001780673851171728, + "loss": 0.6236, + "step": 60 + }, + { + "epoch": 0.488, + "grad_norm": 0.36883404655464086, + "learning_rate": 0.00017724954765415137, + "loss": 0.6536, + "step": 61 + }, + { + "epoch": 0.496, + "grad_norm": 0.3510875420258689, + "learning_rate": 0.00017641869175372493, + "loss": 0.6439, + "step": 62 + }, + { + "epoch": 0.504, + "grad_norm": 0.3634644898935124, + "learning_rate": 0.00017557495743542585, + "loss": 0.6664, + "step": 63 + }, + { + "epoch": 0.512, + "grad_norm": 0.31941526418984295, + "learning_rate": 0.00017471848688911464, + "loss": 0.6234, + "step": 64 + }, + { + "epoch": 0.52, + "grad_norm": 0.3347914844533306, + "learning_rate": 0.00017384942445101772, + "loss": 0.6135, + "step": 65 + }, + { + "epoch": 0.528, + "grad_norm": 0.343437021079617, + "learning_rate": 0.000172967916579403, + "loss": 0.6676, + "step": 66 + }, + { + "epoch": 0.536, + "grad_norm": 0.31238707340906624, + "learning_rate": 0.00017207411182989832, + "loss": 0.6299, + "step": 67 + }, + { + "epoch": 0.544, + "grad_norm": 0.3231974475690498, + "learning_rate": 0.00017116816083045602, + "loss": 0.6653, + "step": 68 + }, + { + "epoch": 0.552, + "grad_norm": 0.31339344001858105, + "learning_rate": 0.00017025021625596853, + "loss": 0.6518, + "step": 69 + }, + { + "epoch": 0.56, + "grad_norm": 0.32723978959602745, + "learning_rate": 0.0001693204328025389, + "loss": 0.6671, + "step": 70 + }, + { + "epoch": 0.568, + "grad_norm": 0.3091424191482883, + "learning_rate": 0.0001683789671614107, + "loss": 0.619, + "step": 71 + }, + { + "epoch": 0.576, + "grad_norm": 0.3437405961553586, + "learning_rate": 0.00016742597799256182, + "loss": 0.6604, + "step": 72 + }, + { + "epoch": 0.584, + "grad_norm": 0.3379880167585152, + "learning_rate": 0.00016646162589796615, + "loss": 0.6602, + "step": 73 + }, + { + "epoch": 0.592, + "grad_norm": 0.33743544509510187, + "learning_rate": 0.00016548607339452853, + "loss": 0.649, + "step": 74 + }, + { + "epoch": 0.6, + "grad_norm": 0.34455164058166915, + "learning_rate": 0.00016449948488669639, + "loss": 0.7012, + "step": 75 + }, + { + "epoch": 0.608, + "grad_norm": 0.325758750535915, + "learning_rate": 0.00016350202663875386, + "loss": 0.6624, + "step": 76 + }, + { + "epoch": 0.616, + "grad_norm": 0.29045691337733437, + "learning_rate": 0.00016249386674680184, + "loss": 0.5921, + "step": 77 + }, + { + "epoch": 0.624, + "grad_norm": 0.32785744667852906, + "learning_rate": 0.0001614751751104301, + "loss": 0.6415, + "step": 78 + }, + { + "epoch": 0.632, + "grad_norm": 0.29899909920728357, + "learning_rate": 0.00016044612340408466, + "loss": 0.6226, + "step": 79 + }, + { + "epoch": 0.64, + "grad_norm": 0.32325512818097923, + "learning_rate": 0.00015940688504813662, + "loss": 0.6867, + "step": 80 + }, + { + "epoch": 0.648, + "grad_norm": 0.31602836762331643, + "learning_rate": 0.00015835763517965673, + "loss": 0.6342, + "step": 81 + }, + { + "epoch": 0.656, + "grad_norm": 0.34250938844506945, + "learning_rate": 0.00015729855062290022, + "loss": 0.626, + "step": 82 + }, + { + "epoch": 0.664, + "grad_norm": 0.3424939486401721, + "learning_rate": 0.0001562298098595078, + "loss": 0.6957, + "step": 83 + }, + { + "epoch": 0.672, + "grad_norm": 0.334417096892308, + "learning_rate": 0.00015515159299842707, + "loss": 0.6675, + "step": 84 + }, + { + "epoch": 0.68, + "grad_norm": 0.46076813754051577, + "learning_rate": 0.00015406408174555976, + "loss": 0.6749, + "step": 85 + }, + { + "epoch": 0.688, + "grad_norm": 0.29210933308142606, + "learning_rate": 0.00015296745937313987, + "loss": 0.6409, + "step": 86 + }, + { + "epoch": 0.696, + "grad_norm": 0.3528923975825312, + "learning_rate": 0.00015186191068884775, + "loss": 0.6765, + "step": 87 + }, + { + "epoch": 0.704, + "grad_norm": 0.324799963637437, + "learning_rate": 0.00015074762200466556, + "loss": 0.6609, + "step": 88 + }, + { + "epoch": 0.712, + "grad_norm": 0.3088789930704027, + "learning_rate": 0.00014962478110547918, + "loss": 0.66, + "step": 89 + }, + { + "epoch": 0.72, + "grad_norm": 0.2854688168520043, + "learning_rate": 0.00014849357721743168, + "loss": 0.6035, + "step": 90 + }, + { + "epoch": 0.728, + "grad_norm": 0.42689653734470706, + "learning_rate": 0.0001473542009760343, + "loss": 0.6938, + "step": 91 + }, + { + "epoch": 0.736, + "grad_norm": 0.35651456548401894, + "learning_rate": 0.00014620684439403962, + "loss": 0.6838, + "step": 92 + }, + { + "epoch": 0.744, + "grad_norm": 0.33635785287045716, + "learning_rate": 0.0001450517008290827, + "loss": 0.6671, + "step": 93 + }, + { + "epoch": 0.752, + "grad_norm": 0.336852572915818, + "learning_rate": 0.0001438889649510956, + "loss": 0.6317, + "step": 94 + }, + { + "epoch": 0.76, + "grad_norm": 0.33576829692825366, + "learning_rate": 0.00014271883270950073, + "loss": 0.6805, + "step": 95 + }, + { + "epoch": 0.768, + "grad_norm": 0.3329231732565777, + "learning_rate": 0.00014154150130018866, + "loss": 0.6249, + "step": 96 + }, + { + "epoch": 0.776, + "grad_norm": 0.3598961646096611, + "learning_rate": 0.00014035716913228568, + "loss": 0.6907, + "step": 97 + }, + { + "epoch": 0.784, + "grad_norm": 0.3115561422500585, + "learning_rate": 0.00013916603579471705, + "loss": 0.6239, + "step": 98 + }, + { + "epoch": 0.792, + "grad_norm": 0.3459018767378786, + "learning_rate": 0.0001379683020225714, + "loss": 0.6466, + "step": 99 + }, + { + "epoch": 0.8, + "grad_norm": 0.32457665510352385, + "learning_rate": 0.000136764169663272, + "loss": 0.6533, + "step": 100 + }, + { + "epoch": 0.808, + "grad_norm": 0.33508111254965955, + "learning_rate": 0.00013555384164256048, + "loss": 0.6509, + "step": 101 + }, + { + "epoch": 0.816, + "grad_norm": 0.3269052404706792, + "learning_rate": 0.00013433752193029886, + "loss": 0.6048, + "step": 102 + }, + { + "epoch": 0.824, + "grad_norm": 0.30770998047308523, + "learning_rate": 0.00013311541550609565, + "loss": 0.5852, + "step": 103 + }, + { + "epoch": 0.832, + "grad_norm": 0.31424672359407035, + "learning_rate": 0.00013188772832476188, + "loss": 0.6484, + "step": 104 + }, + { + "epoch": 0.84, + "grad_norm": 0.43876915582562553, + "learning_rate": 0.00013065466728160252, + "loss": 0.6495, + "step": 105 + }, + { + "epoch": 0.848, + "grad_norm": 0.34563580586225817, + "learning_rate": 0.00012941644017754964, + "loss": 0.6577, + "step": 106 + }, + { + "epoch": 0.856, + "grad_norm": 0.3572647064435535, + "learning_rate": 0.00012817325568414297, + "loss": 0.6246, + "step": 107 + }, + { + "epoch": 0.864, + "grad_norm": 0.3522582386523308, + "learning_rate": 0.00012692532330836346, + "loss": 0.635, + "step": 108 + }, + { + "epoch": 0.872, + "grad_norm": 0.2967086734627664, + "learning_rate": 0.00012567285335732633, + "loss": 0.6269, + "step": 109 + }, + { + "epoch": 0.88, + "grad_norm": 0.34166098841794695, + "learning_rate": 0.00012441605690283915, + "loss": 0.6769, + "step": 110 + }, + { + "epoch": 0.888, + "grad_norm": 0.35539661438628445, + "learning_rate": 0.00012315514574583113, + "loss": 0.6341, + "step": 111 + }, + { + "epoch": 0.896, + "grad_norm": 0.33161189448182593, + "learning_rate": 0.0001218903323806595, + "loss": 0.6723, + "step": 112 + }, + { + "epoch": 0.904, + "grad_norm": 0.338951247552023, + "learning_rate": 0.00012062182995929882, + "loss": 0.6582, + "step": 113 + }, + { + "epoch": 0.912, + "grad_norm": 0.3116039910821697, + "learning_rate": 0.00011934985225541998, + "loss": 0.6347, + "step": 114 + }, + { + "epoch": 0.92, + "grad_norm": 0.3222821987139117, + "learning_rate": 0.0001180746136283638, + "loss": 0.6295, + "step": 115 + }, + { + "epoch": 0.928, + "grad_norm": 0.3180175846318794, + "learning_rate": 0.00011679632898701649, + "loss": 0.6105, + "step": 116 + }, + { + "epoch": 0.936, + "grad_norm": 0.30937980648503693, + "learning_rate": 0.00011551521375359206, + "loss": 0.6388, + "step": 117 + }, + { + "epoch": 0.944, + "grad_norm": 0.3642307008353836, + "learning_rate": 0.00011423148382732853, + "loss": 0.6686, + "step": 118 + }, + { + "epoch": 0.952, + "grad_norm": 0.322639383880906, + "learning_rate": 0.00011294535554810354, + "loss": 0.6501, + "step": 119 + }, + { + "epoch": 0.96, + "grad_norm": 0.3162171243639988, + "learning_rate": 0.00011165704565997593, + "loss": 0.6274, + "step": 120 + }, + { + "epoch": 0.968, + "grad_norm": 0.28758115580861665, + "learning_rate": 0.00011036677127465889, + "loss": 0.6027, + "step": 121 + }, + { + "epoch": 0.976, + "grad_norm": 0.3073842474256611, + "learning_rate": 0.00010907474983493144, + "loss": 0.6139, + "step": 122 + }, + { + "epoch": 0.984, + "grad_norm": 0.32310515303670384, + "learning_rate": 0.00010778119907799398, + "loss": 0.6554, + "step": 123 + }, + { + "epoch": 0.992, + "grad_norm": 0.32596935361275065, + "learning_rate": 0.0001064863369987743, + "loss": 0.6884, + "step": 124 + }, + { + "epoch": 1.0, + "grad_norm": 0.3312656057889271, + "learning_rate": 0.00010519038181318999, + "loss": 0.6653, + "step": 125 + }, + { + "epoch": 1.008, + "grad_norm": 0.30999248898809556, + "learning_rate": 0.00010389355192137377, + "loss": 0.5014, + "step": 126 + }, + { + "epoch": 1.016, + "grad_norm": 0.2763160568966665, + "learning_rate": 0.00010259606587086783, + "loss": 0.4397, + "step": 127 + }, + { + "epoch": 1.024, + "grad_norm": 0.27788323256482617, + "learning_rate": 0.0001012981423197931, + "loss": 0.4393, + "step": 128 + }, + { + "epoch": 1.032, + "grad_norm": 0.3150059690847303, + "learning_rate": 0.0001, + "loss": 0.4681, + "step": 129 + }, + { + "epoch": 1.04, + "grad_norm": 0.31481102781455156, + "learning_rate": 9.870185768020693e-05, + "loss": 0.4353, + "step": 130 + }, + { + "epoch": 1.048, + "grad_norm": 0.4709701646556055, + "learning_rate": 9.740393412913219e-05, + "loss": 0.4882, + "step": 131 + }, + { + "epoch": 1.056, + "grad_norm": 0.4116226720028693, + "learning_rate": 9.610644807862625e-05, + "loss": 0.4502, + "step": 132 + }, + { + "epoch": 1.064, + "grad_norm": 0.3862376767849978, + "learning_rate": 9.480961818681004e-05, + "loss": 0.4818, + "step": 133 + }, + { + "epoch": 1.072, + "grad_norm": 0.3930610489851894, + "learning_rate": 9.35136630012257e-05, + "loss": 0.4577, + "step": 134 + }, + { + "epoch": 1.08, + "grad_norm": 0.3576815014862209, + "learning_rate": 9.221880092200601e-05, + "loss": 0.4423, + "step": 135 + }, + { + "epoch": 1.088, + "grad_norm": 0.3513091515896804, + "learning_rate": 9.092525016506858e-05, + "loss": 0.4169, + "step": 136 + }, + { + "epoch": 1.096, + "grad_norm": 0.351105537272752, + "learning_rate": 8.963322872534114e-05, + "loss": 0.4212, + "step": 137 + }, + { + "epoch": 1.104, + "grad_norm": 0.36779752955190265, + "learning_rate": 8.83429543400241e-05, + "loss": 0.4522, + "step": 138 + }, + { + "epoch": 1.112, + "grad_norm": 0.3904869721114291, + "learning_rate": 8.705464445189647e-05, + "loss": 0.491, + "step": 139 + }, + { + "epoch": 1.12, + "grad_norm": 0.330258132098933, + "learning_rate": 8.57685161726715e-05, + "loss": 0.4438, + "step": 140 + }, + { + "epoch": 1.1280000000000001, + "grad_norm": 0.37964928396907405, + "learning_rate": 8.448478624640797e-05, + "loss": 0.4462, + "step": 141 + }, + { + "epoch": 1.1360000000000001, + "grad_norm": 0.3397159120181284, + "learning_rate": 8.320367101298351e-05, + "loss": 0.4206, + "step": 142 + }, + { + "epoch": 1.144, + "grad_norm": 0.34881844600691564, + "learning_rate": 8.192538637163621e-05, + "loss": 0.4341, + "step": 143 + }, + { + "epoch": 1.152, + "grad_norm": 0.36104875179096, + "learning_rate": 8.065014774458003e-05, + "loss": 0.4276, + "step": 144 + }, + { + "epoch": 1.16, + "grad_norm": 0.3300081995494602, + "learning_rate": 7.93781700407012e-05, + "loss": 0.4197, + "step": 145 + }, + { + "epoch": 1.168, + "grad_norm": 0.3773761135278097, + "learning_rate": 7.810966761934053e-05, + "loss": 0.4228, + "step": 146 + }, + { + "epoch": 1.176, + "grad_norm": 0.3677726783865881, + "learning_rate": 7.684485425416888e-05, + "loss": 0.4308, + "step": 147 + }, + { + "epoch": 1.184, + "grad_norm": 0.4039348666723891, + "learning_rate": 7.558394309716088e-05, + "loss": 0.4643, + "step": 148 + }, + { + "epoch": 1.192, + "grad_norm": 0.3977290252315367, + "learning_rate": 7.432714664267373e-05, + "loss": 0.4652, + "step": 149 + }, + { + "epoch": 1.2, + "grad_norm": 0.39042245840535644, + "learning_rate": 7.307467669163655e-05, + "loss": 0.4417, + "step": 150 + }, + { + "epoch": 1.208, + "grad_norm": 0.3546136856499018, + "learning_rate": 7.182674431585704e-05, + "loss": 0.4112, + "step": 151 + }, + { + "epoch": 1.216, + "grad_norm": 0.35726383327350786, + "learning_rate": 7.058355982245037e-05, + "loss": 0.417, + "step": 152 + }, + { + "epoch": 1.224, + "grad_norm": 0.35997235150118934, + "learning_rate": 6.934533271839752e-05, + "loss": 0.4269, + "step": 153 + }, + { + "epoch": 1.232, + "grad_norm": 0.38548264483214323, + "learning_rate": 6.811227167523815e-05, + "loss": 0.467, + "step": 154 + }, + { + "epoch": 1.24, + "grad_norm": 0.3663689248918521, + "learning_rate": 6.688458449390437e-05, + "loss": 0.4469, + "step": 155 + }, + { + "epoch": 1.248, + "grad_norm": 0.34663568156293084, + "learning_rate": 6.566247806970119e-05, + "loss": 0.4111, + "step": 156 + }, + { + "epoch": 1.256, + "grad_norm": 0.36455028785765153, + "learning_rate": 6.444615835743955e-05, + "loss": 0.4388, + "step": 157 + }, + { + "epoch": 1.264, + "grad_norm": 0.35674055617897765, + "learning_rate": 6.323583033672799e-05, + "loss": 0.4301, + "step": 158 + }, + { + "epoch": 1.272, + "grad_norm": 0.3158129223428509, + "learning_rate": 6.203169797742861e-05, + "loss": 0.3943, + "step": 159 + }, + { + "epoch": 1.28, + "grad_norm": 0.37730456344078195, + "learning_rate": 6.083396420528298e-05, + "loss": 0.4303, + "step": 160 + }, + { + "epoch": 1.288, + "grad_norm": 0.3600002465418534, + "learning_rate": 5.964283086771435e-05, + "loss": 0.4036, + "step": 161 + }, + { + "epoch": 1.296, + "grad_norm": 0.411983597945873, + "learning_rate": 5.845849869981137e-05, + "loss": 0.4323, + "step": 162 + }, + { + "epoch": 1.304, + "grad_norm": 0.5646047576197016, + "learning_rate": 5.728116729049928e-05, + "loss": 0.41, + "step": 163 + }, + { + "epoch": 1.312, + "grad_norm": 0.3799713515892477, + "learning_rate": 5.611103504890444e-05, + "loss": 0.435, + "step": 164 + }, + { + "epoch": 1.32, + "grad_norm": 0.3701436657920335, + "learning_rate": 5.4948299170917325e-05, + "loss": 0.4382, + "step": 165 + }, + { + "epoch": 1.328, + "grad_norm": 0.43034822774042925, + "learning_rate": 5.379315560596038e-05, + "loss": 0.4385, + "step": 166 + }, + { + "epoch": 1.336, + "grad_norm": 0.3396215340957992, + "learning_rate": 5.26457990239657e-05, + "loss": 0.4132, + "step": 167 + }, + { + "epoch": 1.3439999999999999, + "grad_norm": 0.3372682640636405, + "learning_rate": 5.1506422782568345e-05, + "loss": 0.4014, + "step": 168 + }, + { + "epoch": 1.3519999999999999, + "grad_norm": 0.3420013343555165, + "learning_rate": 5.0375218894520834e-05, + "loss": 0.4308, + "step": 169 + }, + { + "epoch": 1.3599999999999999, + "grad_norm": 0.36313664255414346, + "learning_rate": 4.9252377995334444e-05, + "loss": 0.4241, + "step": 170 + }, + { + "epoch": 1.3679999999999999, + "grad_norm": 0.38860897284885565, + "learning_rate": 4.813808931115228e-05, + "loss": 0.4001, + "step": 171 + }, + { + "epoch": 1.376, + "grad_norm": 0.3996408209451103, + "learning_rate": 4.703254062686017e-05, + "loss": 0.45, + "step": 172 + }, + { + "epoch": 1.384, + "grad_norm": 0.35352369486178004, + "learning_rate": 4.593591825444028e-05, + "loss": 0.4104, + "step": 173 + }, + { + "epoch": 1.392, + "grad_norm": 0.34130167364663144, + "learning_rate": 4.484840700157295e-05, + "loss": 0.3857, + "step": 174 + }, + { + "epoch": 1.4, + "grad_norm": 0.39244887671048206, + "learning_rate": 4.377019014049223e-05, + "loss": 0.4486, + "step": 175 + }, + { + "epoch": 1.408, + "grad_norm": 0.4167715151074163, + "learning_rate": 4.270144937709981e-05, + "loss": 0.4404, + "step": 176 + }, + { + "epoch": 1.416, + "grad_norm": 0.3735006311923496, + "learning_rate": 4.164236482034327e-05, + "loss": 0.4534, + "step": 177 + }, + { + "epoch": 1.424, + "grad_norm": 0.38617905049661294, + "learning_rate": 4.059311495186338e-05, + "loss": 0.3994, + "step": 178 + }, + { + "epoch": 1.432, + "grad_norm": 0.3261675968534702, + "learning_rate": 3.9553876595915375e-05, + "loss": 0.4156, + "step": 179 + }, + { + "epoch": 1.44, + "grad_norm": 0.35742950700768167, + "learning_rate": 3.852482488956992e-05, + "loss": 0.4105, + "step": 180 + }, + { + "epoch": 1.448, + "grad_norm": 0.36928757386227556, + "learning_rate": 3.750613325319817e-05, + "loss": 0.4323, + "step": 181 + }, + { + "epoch": 1.456, + "grad_norm": 0.3868031899079498, + "learning_rate": 3.649797336124615e-05, + "loss": 0.4234, + "step": 182 + }, + { + "epoch": 1.464, + "grad_norm": 0.32848087629552325, + "learning_rate": 3.550051511330361e-05, + "loss": 0.3961, + "step": 183 + }, + { + "epoch": 1.472, + "grad_norm": 0.3427179300809076, + "learning_rate": 3.45139266054715e-05, + "loss": 0.4109, + "step": 184 + }, + { + "epoch": 1.48, + "grad_norm": 0.3668791005619767, + "learning_rate": 3.3538374102033866e-05, + "loss": 0.4141, + "step": 185 + }, + { + "epoch": 1.488, + "grad_norm": 0.39299921470783666, + "learning_rate": 3.257402200743821e-05, + "loss": 0.4462, + "step": 186 + }, + { + "epoch": 1.496, + "grad_norm": 0.3869815292768112, + "learning_rate": 3.1621032838589305e-05, + "loss": 0.4061, + "step": 187 + }, + { + "epoch": 1.504, + "grad_norm": 0.4097815530872006, + "learning_rate": 3.0679567197461134e-05, + "loss": 0.4704, + "step": 188 + }, + { + "epoch": 1.512, + "grad_norm": 0.37871448435448984, + "learning_rate": 2.974978374403147e-05, + "loss": 0.429, + "step": 189 + }, + { + "epoch": 1.52, + "grad_norm": 0.3511999133515402, + "learning_rate": 2.8831839169543996e-05, + "loss": 0.4109, + "step": 190 + }, + { + "epoch": 1.528, + "grad_norm": 0.3731155678723334, + "learning_rate": 2.7925888170101665e-05, + "loss": 0.4061, + "step": 191 + }, + { + "epoch": 1.536, + "grad_norm": 0.3331471399925095, + "learning_rate": 2.7032083420597e-05, + "loss": 0.406, + "step": 192 + }, + { + "epoch": 1.544, + "grad_norm": 0.36824739532374523, + "learning_rate": 2.6150575548982292e-05, + "loss": 0.4413, + "step": 193 + }, + { + "epoch": 1.552, + "grad_norm": 0.37028915853531374, + "learning_rate": 2.528151311088537e-05, + "loss": 0.4289, + "step": 194 + }, + { + "epoch": 1.56, + "grad_norm": 0.3377837537975486, + "learning_rate": 2.4425042564574184e-05, + "loss": 0.3847, + "step": 195 + }, + { + "epoch": 1.568, + "grad_norm": 0.35978467243099815, + "learning_rate": 2.3581308246275103e-05, + "loss": 0.4319, + "step": 196 + }, + { + "epoch": 1.576, + "grad_norm": 0.35418080181674005, + "learning_rate": 2.2750452345848682e-05, + "loss": 0.4199, + "step": 197 + }, + { + "epoch": 1.584, + "grad_norm": 0.37268287174991027, + "learning_rate": 2.1932614882827197e-05, + "loss": 0.4087, + "step": 198 + }, + { + "epoch": 1.592, + "grad_norm": 0.3409473333562335, + "learning_rate": 2.112793368281799e-05, + "loss": 0.4342, + "step": 199 + }, + { + "epoch": 1.6, + "grad_norm": 0.355860200634431, + "learning_rate": 2.03365443542764e-05, + "loss": 0.37, + "step": 200 + }, + { + "epoch": 1.608, + "grad_norm": 0.35012224620268156, + "learning_rate": 1.9558580265652448e-05, + "loss": 0.394, + "step": 201 + }, + { + "epoch": 1.616, + "grad_norm": 0.3695753625239139, + "learning_rate": 1.879417252291502e-05, + "loss": 0.4148, + "step": 202 + }, + { + "epoch": 1.624, + "grad_norm": 0.4258711912122203, + "learning_rate": 1.804344994745727e-05, + "loss": 0.4375, + "step": 203 + }, + { + "epoch": 1.6320000000000001, + "grad_norm": 0.3803097291019688, + "learning_rate": 1.730653905438714e-05, + "loss": 0.4166, + "step": 204 + }, + { + "epoch": 1.6400000000000001, + "grad_norm": 0.37465233653565705, + "learning_rate": 1.6583564031206357e-05, + "loss": 0.4016, + "step": 205 + }, + { + "epoch": 1.6480000000000001, + "grad_norm": 0.3672479996101215, + "learning_rate": 1.587464671688187e-05, + "loss": 0.406, + "step": 206 + }, + { + "epoch": 1.6560000000000001, + "grad_norm": 0.3774177953897066, + "learning_rate": 1.5179906581313064e-05, + "loss": 0.4288, + "step": 207 + }, + { + "epoch": 1.6640000000000001, + "grad_norm": 0.3710652365358924, + "learning_rate": 1.4499460705197998e-05, + "loss": 0.4011, + "step": 208 + }, + { + "epoch": 1.6720000000000002, + "grad_norm": 0.3471124217622006, + "learning_rate": 1.3833423760302611e-05, + "loss": 0.3787, + "step": 209 + }, + { + "epoch": 1.6800000000000002, + "grad_norm": 0.37562050193480484, + "learning_rate": 1.3181907990135622e-05, + "loss": 0.456, + "step": 210 + }, + { + "epoch": 1.688, + "grad_norm": 0.39588222129140765, + "learning_rate": 1.2545023191032801e-05, + "loss": 0.3887, + "step": 211 + }, + { + "epoch": 1.696, + "grad_norm": 0.3602840989644041, + "learning_rate": 1.1922876693653585e-05, + "loss": 0.3935, + "step": 212 + }, + { + "epoch": 1.704, + "grad_norm": 0.34855973676200813, + "learning_rate": 1.131557334489326e-05, + "loss": 0.3749, + "step": 213 + }, + { + "epoch": 1.712, + "grad_norm": 0.3479812256611428, + "learning_rate": 1.0723215490213634e-05, + "loss": 0.3951, + "step": 214 + }, + { + "epoch": 1.72, + "grad_norm": 0.3818486969634882, + "learning_rate": 1.0145902956395447e-05, + "loss": 0.4067, + "step": 215 + }, + { + "epoch": 1.728, + "grad_norm": 0.35208306834551606, + "learning_rate": 9.583733034714981e-06, + "loss": 0.3844, + "step": 216 + }, + { + "epoch": 1.736, + "grad_norm": 0.3877100998448152, + "learning_rate": 9.036800464548157e-06, + "loss": 0.4431, + "step": 217 + }, + { + "epoch": 1.744, + "grad_norm": 0.38366231473311935, + "learning_rate": 8.505197417404687e-06, + "loss": 0.3757, + "step": 218 + }, + { + "epoch": 1.752, + "grad_norm": 0.38165201326373444, + "learning_rate": 7.989013481394814e-06, + "loss": 0.4083, + "step": 219 + }, + { + "epoch": 1.76, + "grad_norm": 0.36942326110852475, + "learning_rate": 7.488335646131628e-06, + "loss": 0.4043, + "step": 220 + }, + { + "epoch": 1.768, + "grad_norm": 0.3547002213884238, + "learning_rate": 7.003248288071118e-06, + "loss": 0.3822, + "step": 221 + }, + { + "epoch": 1.776, + "grad_norm": 0.3336558366840081, + "learning_rate": 6.533833156292679e-06, + "loss": 0.3822, + "step": 222 + }, + { + "epoch": 1.784, + "grad_norm": 0.37406602326195987, + "learning_rate": 6.08016935872251e-06, + "loss": 0.4182, + "step": 223 + }, + { + "epoch": 1.792, + "grad_norm": 0.3713633267545657, + "learning_rate": 5.6423333488018095e-06, + "loss": 0.4094, + "step": 224 + }, + { + "epoch": 1.8, + "grad_norm": 0.38273162844988756, + "learning_rate": 5.22039891260262e-06, + "loss": 0.4072, + "step": 225 + }, + { + "epoch": 1.808, + "grad_norm": 0.3509985921798932, + "learning_rate": 4.8144371563930476e-06, + "loss": 0.3954, + "step": 226 + }, + { + "epoch": 1.8159999999999998, + "grad_norm": 0.37305289489211013, + "learning_rate": 4.424516494654118e-06, + "loss": 0.4254, + "step": 227 + }, + { + "epoch": 1.8239999999999998, + "grad_norm": 0.3965276510176484, + "learning_rate": 4.050702638550275e-06, + "loss": 0.4322, + "step": 228 + }, + { + "epoch": 1.8319999999999999, + "grad_norm": 0.3685211551685983, + "learning_rate": 3.693058584855369e-06, + "loss": 0.4307, + "step": 229 + }, + { + "epoch": 1.8399999999999999, + "grad_norm": 0.3686387036160562, + "learning_rate": 3.3516446053363015e-06, + "loss": 0.4582, + "step": 230 + }, + { + "epoch": 1.8479999999999999, + "grad_norm": 0.40006221636184197, + "learning_rate": 3.026518236595621e-06, + "loss": 0.4319, + "step": 231 + }, + { + "epoch": 1.8559999999999999, + "grad_norm": 0.3686101936012674, + "learning_rate": 2.717734270375272e-06, + "loss": 0.4401, + "step": 232 + }, + { + "epoch": 1.8639999999999999, + "grad_norm": 0.3639816237696081, + "learning_rate": 2.4253447443228106e-06, + "loss": 0.4228, + "step": 233 + }, + { + "epoch": 1.8719999999999999, + "grad_norm": 0.3914836318682469, + "learning_rate": 2.1493989332218468e-06, + "loss": 0.4232, + "step": 234 + }, + { + "epoch": 1.88, + "grad_norm": 0.3475812805328899, + "learning_rate": 1.8899433406879608e-06, + "loss": 0.4048, + "step": 235 + }, + { + "epoch": 1.888, + "grad_norm": 0.3819179857607728, + "learning_rate": 1.6470216913317626e-06, + "loss": 0.4349, + "step": 236 + }, + { + "epoch": 1.896, + "grad_norm": 0.34980986059743474, + "learning_rate": 1.4206749233902084e-06, + "loss": 0.3982, + "step": 237 + }, + { + "epoch": 1.904, + "grad_norm": 0.3618752000752406, + "learning_rate": 1.2109411818274852e-06, + "loss": 0.3869, + "step": 238 + }, + { + "epoch": 1.912, + "grad_norm": 0.3617386662356333, + "learning_rate": 1.0178558119067315e-06, + "loss": 0.4229, + "step": 239 + }, + { + "epoch": 1.92, + "grad_norm": 0.3440710649386622, + "learning_rate": 8.41451353233369e-07, + "loss": 0.3715, + "step": 240 + }, + { + "epoch": 1.928, + "grad_norm": 0.41173420560281065, + "learning_rate": 6.817575342714988e-07, + "loss": 0.4432, + "step": 241 + }, + { + "epoch": 1.936, + "grad_norm": 0.3707711016435056, + "learning_rate": 5.388012673338661e-07, + "loss": 0.3993, + "step": 242 + }, + { + "epoch": 1.944, + "grad_norm": 0.41577428666946314, + "learning_rate": 4.126066440464982e-07, + "loss": 0.3685, + "step": 243 + }, + { + "epoch": 1.952, + "grad_norm": 0.3814321087081568, + "learning_rate": 3.0319493128866396e-07, + "loss": 0.4361, + "step": 244 + }, + { + "epoch": 1.96, + "grad_norm": 0.3721575866482138, + "learning_rate": 2.1058456760891798e-07, + "loss": 0.4032, + "step": 245 + }, + { + "epoch": 1.968, + "grad_norm": 0.41596386646666555, + "learning_rate": 1.3479116011769767e-07, + "loss": 0.4572, + "step": 246 + }, + { + "epoch": 1.976, + "grad_norm": 0.40832138728799905, + "learning_rate": 7.582748185719358e-08, + "loss": 0.417, + "step": 247 + }, + { + "epoch": 1.984, + "grad_norm": 0.3473444140339669, + "learning_rate": 3.370346964876036e-08, + "loss": 0.3723, + "step": 248 + }, + { + "epoch": 1.992, + "grad_norm": 0.3546075160485246, + "learning_rate": 8.426222418311814e-09, + "loss": 0.3919, + "step": 249 + }, + { + "epoch": 2.0, + "grad_norm": 0.3512853828343401, + "learning_rate": 0.0, + "loss": 0.3783, + "step": 250 + }, + { + "epoch": 2.0, + "step": 250, + "total_flos": 246886161580032.0, + "train_loss": 0.5540215849876404, + "train_runtime": 3986.2101, + "train_samples_per_second": 1.003, + "train_steps_per_second": 0.063 + } + ], + "logging_steps": 1.0, + "max_steps": 250, + "num_input_tokens_seen": 0, + "num_train_epochs": 2, + "save_steps": 500, + "stateful_callbacks": { + "TrainerControl": { + "args": { + "should_epoch_stop": false, + "should_evaluate": false, + "should_log": false, + "should_save": false, + "should_training_stop": false + }, + "attributes": {} + } + }, + "total_flos": 246886161580032.0, + "train_batch_size": 8, + "trial_name": null, + "trial_params": null +} diff --git a/single_dataset/long_caption/VideoGameBunny_v1_1-Llama-3-8B-V-long_caption_dataset_5000_epochs_1_lora/README.md b/single_dataset/long_caption/VideoGameBunny_v1_1-Llama-3-8B-V-long_caption_dataset_5000_epochs_1_lora/README.md new file mode 100644 index 0000000000000000000000000000000000000000..4f291d8d6fdb82fcff274f83bcc52b834347797c --- /dev/null +++ b/single_dataset/long_caption/VideoGameBunny_v1_1-Llama-3-8B-V-long_caption_dataset_5000_epochs_1_lora/README.md @@ -0,0 +1,202 @@ +--- +library_name: peft +base_model: ./weights/Bunny-v1_1-Llama-3-8B-V +--- + +# Model Card for Model ID + + + + + +## Model Details + +### Model Description + + + + + +- **Developed by:** [More Information Needed] +- **Funded by [optional]:** [More Information Needed] +- **Shared by [optional]:** [More Information Needed] +- **Model type:** [More Information Needed] +- **Language(s) (NLP):** [More Information Needed] +- **License:** [More Information Needed] +- **Finetuned from model [optional]:** [More Information Needed] + +### Model Sources [optional] + + + +- **Repository:** [More Information Needed] +- **Paper [optional]:** [More Information Needed] +- **Demo [optional]:** [More Information Needed] + +## Uses + + + +### Direct Use + + + +[More Information Needed] + +### Downstream Use [optional] + + + +[More Information Needed] + +### Out-of-Scope Use + + + +[More Information Needed] + +## Bias, Risks, and Limitations + + + +[More Information Needed] + +### Recommendations + + + +Users (both direct and downstream) should be made aware of the risks, biases and limitations of the model. More information needed for further recommendations. + +## How to Get Started with the Model + +Use the code below to get started with the model. + +[More Information Needed] + +## Training Details + +### Training Data + + + +[More Information Needed] + +### Training Procedure + + + +#### Preprocessing [optional] + +[More Information Needed] + + +#### Training Hyperparameters + +- **Training regime:** [More Information Needed] + +#### Speeds, Sizes, Times [optional] + + + +[More Information Needed] + +## Evaluation + + + +### Testing Data, Factors & Metrics + +#### Testing Data + + + +[More Information Needed] + +#### Factors + + + +[More Information Needed] + +#### Metrics + + + +[More Information Needed] + +### Results + +[More Information Needed] + +#### Summary + + + +## Model Examination [optional] + + + +[More Information Needed] + +## Environmental Impact + + + +Carbon emissions can be estimated using the [Machine Learning Impact calculator](https://mlco2.github.io/impact#compute) presented in [Lacoste et al. (2019)](https://arxiv.org/abs/1910.09700). + +- **Hardware Type:** [More Information Needed] +- **Hours used:** [More Information Needed] +- **Cloud Provider:** [More Information Needed] +- **Compute Region:** [More Information Needed] +- **Carbon Emitted:** [More Information Needed] + +## Technical Specifications [optional] + +### Model Architecture and Objective + +[More Information Needed] + +### Compute Infrastructure + +[More Information Needed] + +#### Hardware + +[More Information Needed] + +#### Software + +[More Information Needed] + +## Citation [optional] + + + +**BibTeX:** + +[More Information Needed] + +**APA:** + +[More Information Needed] + +## Glossary [optional] + + + +[More Information Needed] + +## More Information [optional] + +[More Information Needed] + +## Model Card Authors [optional] + +[More Information Needed] + +## Model Card Contact + +[More Information Needed] +### Framework versions + +- PEFT 0.11.1 \ No newline at end of file diff --git a/single_dataset/long_caption/VideoGameBunny_v1_1-Llama-3-8B-V-long_caption_dataset_5000_epochs_1_lora/adapter_config.json b/single_dataset/long_caption/VideoGameBunny_v1_1-Llama-3-8B-V-long_caption_dataset_5000_epochs_1_lora/adapter_config.json new file mode 100644 index 0000000000000000000000000000000000000000..8027b4d955825d8c902a7977d7bb207d2c735951 --- /dev/null +++ b/single_dataset/long_caption/VideoGameBunny_v1_1-Llama-3-8B-V-long_caption_dataset_5000_epochs_1_lora/adapter_config.json @@ -0,0 +1,34 @@ +{ + "alpha_pattern": {}, + "auto_mapping": null, + "base_model_name_or_path": "./weights/Bunny-v1_1-Llama-3-8B-V", + "bias": "none", + "fan_in_fan_out": false, + "inference_mode": true, + "init_lora_weights": true, + "layer_replication": null, + "layers_pattern": null, + "layers_to_transform": null, + "loftq_config": {}, + "lora_alpha": 256, + "lora_dropout": 0.1, + "megatron_config": null, + "megatron_core": "megatron.core", + "modules_to_save": null, + "peft_type": "LORA", + "r": 128, + "rank_pattern": {}, + "revision": null, + "target_modules": [ + "q_proj", + "v_proj", + "down_proj", + "o_proj", + "k_proj", + "up_proj", + "gate_proj" + ], + "task_type": "CAUSAL_LM", + "use_dora": false, + "use_rslora": false +} \ No newline at end of file diff --git a/single_dataset/long_caption/VideoGameBunny_v1_1-Llama-3-8B-V-long_caption_dataset_5000_epochs_1_lora/adapter_model.safetensors b/single_dataset/long_caption/VideoGameBunny_v1_1-Llama-3-8B-V-long_caption_dataset_5000_epochs_1_lora/adapter_model.safetensors new file mode 100644 index 0000000000000000000000000000000000000000..e959511bb990b4f2a368e96c598aabae0ba51f9f --- /dev/null +++ b/single_dataset/long_caption/VideoGameBunny_v1_1-Llama-3-8B-V-long_caption_dataset_5000_epochs_1_lora/adapter_model.safetensors @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:7ee94bcd70cdfc6d418cd4ede64ab52c3f5744b251a02d5bba5942a3744d7807 +size 671150064 diff --git a/single_dataset/long_caption/VideoGameBunny_v1_1-Llama-3-8B-V-long_caption_dataset_5000_epochs_1_lora/config.json b/single_dataset/long_caption/VideoGameBunny_v1_1-Llama-3-8B-V-long_caption_dataset_5000_epochs_1_lora/config.json new file mode 100644 index 0000000000000000000000000000000000000000..ee7ed86f4ff6f3381a6bb24981934be89aa376e7 --- /dev/null +++ b/single_dataset/long_caption/VideoGameBunny_v1_1-Llama-3-8B-V-long_caption_dataset_5000_epochs_1_lora/config.json @@ -0,0 +1,45 @@ +{ + "_name_or_path": "./weights/Bunny-v1_1-Llama-3-8B-V", + "architectures": [ + "BunnyLlamaForCausalLM" + ], + "attention_bias": false, + "attention_dropout": 0.0, + "auto_map": { + "AutoConfig": "configuration_bunny_llama.BunnyLlamaConfig", + "AutoModelForCausalLM": "modeling_bunny_llama.BunnyLlamaForCausalLM" + }, + "bos_token_id": 128000, + "continuous_training": false, + "eos_token_id": 128001, + "freeze_mm_mlp_adapter": false, + "hidden_act": "silu", + "hidden_size": 4096, + "image_aspect_ratio": "pad", + "initializer_range": 0.02, + "intermediate_size": 14336, + "max_position_embeddings": 8192, + "mm_hidden_size": 3456, + "mm_projector_lr": null, + "mm_projector_type": "mlp2x_gelu", + "mm_vision_tower": "./weights/siglip-so400m-patch14-384", + "model_type": "bunny-llama", + "num_attention_heads": 32, + "num_hidden_layers": 32, + "num_key_value_heads": 8, + "pretraining_tp": 1, + "rms_norm_eps": 1e-05, + "rope_scaling": null, + "rope_theta": 500000.0, + "tie_word_embeddings": false, + "tokenizer_model_max_length": 2048, + "tokenizer_padding_side": "right", + "torch_dtype": "float16", + "transformers_version": "4.41.2", + "tune_mm_mlp_adapter": false, + "unfreeze_vision_tower": true, + "use_cache": true, + "use_mm_proj": true, + "use_s2": true, + "vocab_size": 128256 +} diff --git a/single_dataset/long_caption/VideoGameBunny_v1_1-Llama-3-8B-V-long_caption_dataset_5000_epochs_1_lora/non_lora_trainables.bin b/single_dataset/long_caption/VideoGameBunny_v1_1-Llama-3-8B-V-long_caption_dataset_5000_epochs_1_lora/non_lora_trainables.bin new file mode 100644 index 0000000000000000000000000000000000000000..8815adc96801343827d2a39db8ac806ca8fe7324 --- /dev/null +++ b/single_dataset/long_caption/VideoGameBunny_v1_1-Llama-3-8B-V-long_caption_dataset_5000_epochs_1_lora/non_lora_trainables.bin @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:5b74c7e1c3ceb8e3b785c8f79cfd5cea3179108ee16ae6e7b5f134c9efe2e39a +size 918507402 diff --git a/single_dataset/long_caption/VideoGameBunny_v1_1-Llama-3-8B-V-long_caption_dataset_5000_epochs_1_lora/trainer_state.json b/single_dataset/long_caption/VideoGameBunny_v1_1-Llama-3-8B-V-long_caption_dataset_5000_epochs_1_lora/trainer_state.json new file mode 100644 index 0000000000000000000000000000000000000000..8fa1f947e2214f33195d12572f14edef07c45f3f --- /dev/null +++ b/single_dataset/long_caption/VideoGameBunny_v1_1-Llama-3-8B-V-long_caption_dataset_5000_epochs_1_lora/trainer_state.json @@ -0,0 +1,2226 @@ +{ + "best_metric": null, + "best_model_checkpoint": null, + "epoch": 0.9984, + "eval_steps": 500, + "global_step": 312, + "is_hyper_param_search": false, + "is_local_process_zero": true, + "is_world_process_zero": true, + "log_history": [ + { + "epoch": 0.0032, + "grad_norm": 0.7457730890858434, + "learning_rate": 2e-05, + "loss": 0.9392, + "step": 1 + }, + { + "epoch": 0.0064, + "grad_norm": 0.7630371133565212, + "learning_rate": 4e-05, + "loss": 0.9595, + "step": 2 + }, + { + "epoch": 0.0096, + "grad_norm": 0.671741960683176, + "learning_rate": 6e-05, + "loss": 0.9466, + "step": 3 + }, + { + "epoch": 0.0128, + "grad_norm": 0.45841791491457373, + "learning_rate": 8e-05, + "loss": 0.8459, + "step": 4 + }, + { + "epoch": 0.016, + "grad_norm": 0.5244239392744238, + "learning_rate": 0.0001, + "loss": 0.8724, + "step": 5 + }, + { + "epoch": 0.0192, + "grad_norm": 0.6516328157503206, + "learning_rate": 0.00012, + "loss": 0.819, + "step": 6 + }, + { + "epoch": 0.0224, + "grad_norm": 0.49530456242105575, + "learning_rate": 0.00014, + "loss": 0.824, + "step": 7 + }, + { + "epoch": 0.0256, + "grad_norm": 0.4596397773308755, + "learning_rate": 0.00016, + "loss": 0.7665, + "step": 8 + }, + { + "epoch": 0.0288, + "grad_norm": 0.44608821672777943, + "learning_rate": 0.00018, + "loss": 0.796, + "step": 9 + }, + { + "epoch": 0.032, + "grad_norm": 0.40026645073169514, + "learning_rate": 0.0002, + "loss": 0.7114, + "step": 10 + }, + { + "epoch": 0.0352, + "grad_norm": 0.42152892282804516, + "learning_rate": 0.00019999458931878073, + "loss": 0.7637, + "step": 11 + }, + { + "epoch": 0.0384, + "grad_norm": 0.6645956201488676, + "learning_rate": 0.0001999783578606323, + "loss": 0.7394, + "step": 12 + }, + { + "epoch": 0.0416, + "grad_norm": 0.4559407361234458, + "learning_rate": 0.00019995130738201966, + "loss": 0.7769, + "step": 13 + }, + { + "epoch": 0.0448, + "grad_norm": 0.4572611529813379, + "learning_rate": 0.0001999134408101731, + "loss": 0.7066, + "step": 14 + }, + { + "epoch": 0.048, + "grad_norm": 0.4500067808913182, + "learning_rate": 0.00019986476224277165, + "loss": 0.7371, + "step": 15 + }, + { + "epoch": 0.0512, + "grad_norm": 0.4197230755489818, + "learning_rate": 0.00019980527694749952, + "loss": 0.7665, + "step": 16 + }, + { + "epoch": 0.0544, + "grad_norm": 0.36391045871614297, + "learning_rate": 0.00019973499136147606, + "loss": 0.7045, + "step": 17 + }, + { + "epoch": 0.0576, + "grad_norm": 0.3872776544353665, + "learning_rate": 0.0001996539130905593, + "loss": 0.7429, + "step": 18 + }, + { + "epoch": 0.0608, + "grad_norm": 0.40240763245757283, + "learning_rate": 0.0001995620509085228, + "loss": 0.7356, + "step": 19 + }, + { + "epoch": 0.064, + "grad_norm": 0.3589800680715318, + "learning_rate": 0.00019945941475610623, + "loss": 0.7669, + "step": 20 + }, + { + "epoch": 0.0672, + "grad_norm": 0.3529214687594287, + "learning_rate": 0.0001993460157399396, + "loss": 0.7162, + "step": 21 + }, + { + "epoch": 0.0704, + "grad_norm": 0.3731504462501941, + "learning_rate": 0.0001992218661313415, + "loss": 0.7032, + "step": 22 + }, + { + "epoch": 0.0736, + "grad_norm": 0.357510448570862, + "learning_rate": 0.00019908697936499103, + "loss": 0.6901, + "step": 23 + }, + { + "epoch": 0.0768, + "grad_norm": 0.4240364640458765, + "learning_rate": 0.00019894137003747403, + "loss": 0.704, + "step": 24 + }, + { + "epoch": 0.08, + "grad_norm": 0.3439785502742619, + "learning_rate": 0.00019878505390570362, + "loss": 0.6762, + "step": 25 + }, + { + "epoch": 0.0832, + "grad_norm": 0.3313605555697624, + "learning_rate": 0.00019861804788521493, + "loss": 0.668, + "step": 26 + }, + { + "epoch": 0.0864, + "grad_norm": 0.35837445292114534, + "learning_rate": 0.00019844037004833473, + "loss": 0.6628, + "step": 27 + }, + { + "epoch": 0.0896, + "grad_norm": 0.38609941012926274, + "learning_rate": 0.00019825203962222572, + "loss": 0.7309, + "step": 28 + }, + { + "epoch": 0.0928, + "grad_norm": 0.35471157176289014, + "learning_rate": 0.0001980530769868059, + "loss": 0.7492, + "step": 29 + }, + { + "epoch": 0.096, + "grad_norm": 0.34328303627248624, + "learning_rate": 0.00019784350367254322, + "loss": 0.7155, + "step": 30 + }, + { + "epoch": 0.0992, + "grad_norm": 0.36673916279013236, + "learning_rate": 0.0001976233423581255, + "loss": 0.7241, + "step": 31 + }, + { + "epoch": 0.1024, + "grad_norm": 0.3234177882633011, + "learning_rate": 0.0001973926168680066, + "loss": 0.6861, + "step": 32 + }, + { + "epoch": 0.1056, + "grad_norm": 0.32299347366268083, + "learning_rate": 0.00019715135216982798, + "loss": 0.6615, + "step": 33 + }, + { + "epoch": 0.1088, + "grad_norm": 0.3635950424319359, + "learning_rate": 0.0001968995743717171, + "loss": 0.7291, + "step": 34 + }, + { + "epoch": 0.112, + "grad_norm": 0.313827921433598, + "learning_rate": 0.00019663731071946206, + "loss": 0.6951, + "step": 35 + }, + { + "epoch": 0.1152, + "grad_norm": 0.3968710602244457, + "learning_rate": 0.00019636458959356316, + "loss": 0.6907, + "step": 36 + }, + { + "epoch": 0.1184, + "grad_norm": 0.3433259010957558, + "learning_rate": 0.0001960814405061619, + "loss": 0.7033, + "step": 37 + }, + { + "epoch": 0.1216, + "grad_norm": 0.5783899904182404, + "learning_rate": 0.00019578789409784727, + "loss": 0.6653, + "step": 38 + }, + { + "epoch": 0.1248, + "grad_norm": 0.3676724843428395, + "learning_rate": 0.00019548398213434007, + "loss": 0.7422, + "step": 39 + }, + { + "epoch": 0.128, + "grad_norm": 0.3243352625036398, + "learning_rate": 0.00019516973750305532, + "loss": 0.6482, + "step": 40 + }, + { + "epoch": 0.1312, + "grad_norm": 0.34547858090045774, + "learning_rate": 0.00019484519420954354, + "loss": 0.6816, + "step": 41 + }, + { + "epoch": 0.1344, + "grad_norm": 0.32765881962059995, + "learning_rate": 0.00019451038737381077, + "loss": 0.6874, + "step": 42 + }, + { + "epoch": 0.1376, + "grad_norm": 0.38403027942722356, + "learning_rate": 0.00019416535322651818, + "loss": 0.7531, + "step": 43 + }, + { + "epoch": 0.1408, + "grad_norm": 0.36817048699658556, + "learning_rate": 0.00019381012910506146, + "loss": 0.7186, + "step": 44 + }, + { + "epoch": 0.144, + "grad_norm": 0.3423144655043922, + "learning_rate": 0.00019344475344953012, + "loss": 0.6902, + "step": 45 + }, + { + "epoch": 0.1472, + "grad_norm": 0.32637477572324414, + "learning_rate": 0.00019306926579854821, + "loss": 0.6774, + "step": 46 + }, + { + "epoch": 0.1504, + "grad_norm": 0.36502486947017465, + "learning_rate": 0.00019268370678499533, + "loss": 0.6889, + "step": 47 + }, + { + "epoch": 0.1536, + "grad_norm": 0.40991300125547164, + "learning_rate": 0.0001922881181316097, + "loss": 0.6986, + "step": 48 + }, + { + "epoch": 0.1568, + "grad_norm": 0.35479733232421357, + "learning_rate": 0.00019188254264647337, + "loss": 0.6632, + "step": 49 + }, + { + "epoch": 0.16, + "grad_norm": 0.3672867016153713, + "learning_rate": 0.0001914670242183795, + "loss": 0.6963, + "step": 50 + }, + { + "epoch": 0.1632, + "grad_norm": 0.3002688404746075, + "learning_rate": 0.0001910416078120832, + "loss": 0.6864, + "step": 51 + }, + { + "epoch": 0.1664, + "grad_norm": 0.3324674434885357, + "learning_rate": 0.0001906063394634356, + "loss": 0.6438, + "step": 52 + }, + { + "epoch": 0.1696, + "grad_norm": 0.3601713479724341, + "learning_rate": 0.00019016126627440237, + "loss": 0.6184, + "step": 53 + }, + { + "epoch": 0.1728, + "grad_norm": 0.3249708764798294, + "learning_rate": 0.00018970643640796642, + "loss": 0.6917, + "step": 54 + }, + { + "epoch": 0.176, + "grad_norm": 0.3452207394377197, + "learning_rate": 0.000189241899082916, + "loss": 0.6914, + "step": 55 + }, + { + "epoch": 0.1792, + "grad_norm": 0.3402007827003844, + "learning_rate": 0.00018876770456851877, + "loss": 0.69, + "step": 56 + }, + { + "epoch": 0.1824, + "grad_norm": 0.39866643674994495, + "learning_rate": 0.0001882839041790818, + "loss": 0.6917, + "step": 57 + }, + { + "epoch": 0.1856, + "grad_norm": 0.33719844735024124, + "learning_rate": 0.00018779055026839868, + "loss": 0.6461, + "step": 58 + }, + { + "epoch": 0.1888, + "grad_norm": 0.4103903760834288, + "learning_rate": 0.00018728769622408423, + "loss": 0.7571, + "step": 59 + }, + { + "epoch": 0.192, + "grad_norm": 0.3247408010466196, + "learning_rate": 0.00018677539646179707, + "loss": 0.7127, + "step": 60 + }, + { + "epoch": 0.1952, + "grad_norm": 0.3019357718139876, + "learning_rate": 0.00018625370641935129, + "loss": 0.63, + "step": 61 + }, + { + "epoch": 0.1984, + "grad_norm": 0.32110595479702053, + "learning_rate": 0.00018572268255071718, + "loss": 0.6407, + "step": 62 + }, + { + "epoch": 0.2016, + "grad_norm": 0.3459032253481727, + "learning_rate": 0.00018518238231991218, + "loss": 0.7009, + "step": 63 + }, + { + "epoch": 0.2048, + "grad_norm": 0.3643552745116129, + "learning_rate": 0.00018463286419478255, + "loss": 0.6431, + "step": 64 + }, + { + "epoch": 0.208, + "grad_norm": 0.3287998055114684, + "learning_rate": 0.00018407418764067627, + "loss": 0.7143, + "step": 65 + }, + { + "epoch": 0.2112, + "grad_norm": 0.3286988331002986, + "learning_rate": 0.00018350641311400812, + "loss": 0.6417, + "step": 66 + }, + { + "epoch": 0.2144, + "grad_norm": 0.32519808551208057, + "learning_rate": 0.0001829296020557174, + "loss": 0.6687, + "step": 67 + }, + { + "epoch": 0.2176, + "grad_norm": 0.37437821580599384, + "learning_rate": 0.00018234381688461942, + "loss": 0.698, + "step": 68 + }, + { + "epoch": 0.2208, + "grad_norm": 0.3135688689683587, + "learning_rate": 0.0001817491209906506, + "loss": 0.6652, + "step": 69 + }, + { + "epoch": 0.224, + "grad_norm": 0.3312906601598693, + "learning_rate": 0.00018114557872800905, + "loss": 0.6685, + "step": 70 + }, + { + "epoch": 0.2272, + "grad_norm": 0.3039247867103697, + "learning_rate": 0.00018053325540819045, + "loss": 0.6245, + "step": 71 + }, + { + "epoch": 0.2304, + "grad_norm": 0.3086328804027952, + "learning_rate": 0.0001799122172929206, + "loss": 0.6405, + "step": 72 + }, + { + "epoch": 0.2336, + "grad_norm": 0.3610255480279461, + "learning_rate": 0.00017928253158698473, + "loss": 0.6895, + "step": 73 + }, + { + "epoch": 0.2368, + "grad_norm": 0.34434512108300025, + "learning_rate": 0.0001786442664309554, + "loss": 0.6932, + "step": 74 + }, + { + "epoch": 0.24, + "grad_norm": 0.3810006950863619, + "learning_rate": 0.0001779974908938184, + "loss": 0.7155, + "step": 75 + }, + { + "epoch": 0.2432, + "grad_norm": 0.3722738863269969, + "learning_rate": 0.0001773422749654988, + "loss": 0.6964, + "step": 76 + }, + { + "epoch": 0.2464, + "grad_norm": 0.34270037865742503, + "learning_rate": 0.00017667868954928694, + "loss": 0.686, + "step": 77 + }, + { + "epoch": 0.2496, + "grad_norm": 0.30772890236835826, + "learning_rate": 0.00017600680645416583, + "loss": 0.6469, + "step": 78 + }, + { + "epoch": 0.2528, + "grad_norm": 0.31029540900645003, + "learning_rate": 0.00017532669838704035, + "loss": 0.6407, + "step": 79 + }, + { + "epoch": 0.256, + "grad_norm": 0.32844082133524866, + "learning_rate": 0.00017463843894486937, + "loss": 0.6526, + "step": 80 + }, + { + "epoch": 0.2592, + "grad_norm": 0.33040883292034673, + "learning_rate": 0.0001739421026067017, + "loss": 0.6682, + "step": 81 + }, + { + "epoch": 0.2624, + "grad_norm": 0.37490784330706134, + "learning_rate": 0.00017323776472561627, + "loss": 0.6697, + "step": 82 + }, + { + "epoch": 0.2656, + "grad_norm": 0.3062844132482711, + "learning_rate": 0.00017252550152056795, + "loss": 0.6464, + "step": 83 + }, + { + "epoch": 0.2688, + "grad_norm": 0.35671527882571014, + "learning_rate": 0.0001718053900681397, + "loss": 0.6666, + "step": 84 + }, + { + "epoch": 0.272, + "grad_norm": 0.35176591059029183, + "learning_rate": 0.00017107750829420176, + "loss": 0.6331, + "step": 85 + }, + { + "epoch": 0.2752, + "grad_norm": 0.3379988728130438, + "learning_rate": 0.00017034193496547902, + "loss": 0.6719, + "step": 86 + }, + { + "epoch": 0.2784, + "grad_norm": 0.36243107510700484, + "learning_rate": 0.00016959874968102735, + "loss": 0.6481, + "step": 87 + }, + { + "epoch": 0.2816, + "grad_norm": 0.3203238702127848, + "learning_rate": 0.00016884803286362, + "loss": 0.6424, + "step": 88 + }, + { + "epoch": 0.2848, + "grad_norm": 0.3368827289630954, + "learning_rate": 0.00016808986575104465, + "loss": 0.6767, + "step": 89 + }, + { + "epoch": 0.288, + "grad_norm": 0.3188974603799366, + "learning_rate": 0.00016732433038731242, + "loss": 0.6543, + "step": 90 + }, + { + "epoch": 0.2912, + "grad_norm": 0.33675315618562107, + "learning_rate": 0.0001665515096137797, + "loss": 0.651, + "step": 91 + }, + { + "epoch": 0.2944, + "grad_norm": 0.33898214421058065, + "learning_rate": 0.00016577148706018328, + "loss": 0.6441, + "step": 92 + }, + { + "epoch": 0.2976, + "grad_norm": 0.2983027225473012, + "learning_rate": 0.00016498434713559088, + "loss": 0.6124, + "step": 93 + }, + { + "epoch": 0.3008, + "grad_norm": 0.302608615368401, + "learning_rate": 0.00016419017501926656, + "loss": 0.6199, + "step": 94 + }, + { + "epoch": 0.304, + "grad_norm": 0.28961037507312426, + "learning_rate": 0.0001633890566514535, + "loss": 0.6032, + "step": 95 + }, + { + "epoch": 0.3072, + "grad_norm": 0.31889345457891505, + "learning_rate": 0.00016258107872407375, + "loss": 0.6335, + "step": 96 + }, + { + "epoch": 0.3104, + "grad_norm": 0.3349103755034773, + "learning_rate": 0.0001617663286713474, + "loss": 0.6384, + "step": 97 + }, + { + "epoch": 0.3136, + "grad_norm": 0.3116776065640942, + "learning_rate": 0.00016094489466033043, + "loss": 0.6404, + "step": 98 + }, + { + "epoch": 0.3168, + "grad_norm": 0.3340743011867128, + "learning_rate": 0.00016011686558137448, + "loss": 0.6499, + "step": 99 + }, + { + "epoch": 0.32, + "grad_norm": 0.35774989469652474, + "learning_rate": 0.0001592823310385073, + "loss": 0.6728, + "step": 100 + }, + { + "epoch": 0.3232, + "grad_norm": 0.29207981025345064, + "learning_rate": 0.0001584413813397364, + "loss": 0.6294, + "step": 101 + }, + { + "epoch": 0.3264, + "grad_norm": 0.34093300396933807, + "learning_rate": 0.00015759410748727662, + "loss": 0.6515, + "step": 102 + }, + { + "epoch": 0.3296, + "grad_norm": 0.4550580716289749, + "learning_rate": 0.00015674060116770236, + "loss": 0.6658, + "step": 103 + }, + { + "epoch": 0.3328, + "grad_norm": 0.3549625285796341, + "learning_rate": 0.00015588095474202595, + "loss": 0.6705, + "step": 104 + }, + { + "epoch": 0.336, + "grad_norm": 0.41015881382795516, + "learning_rate": 0.00015501526123570277, + "loss": 0.65, + "step": 105 + }, + { + "epoch": 0.3392, + "grad_norm": 0.33220099050742774, + "learning_rate": 0.00015414361432856475, + "loss": 0.6247, + "step": 106 + }, + { + "epoch": 0.3424, + "grad_norm": 0.33820658657375424, + "learning_rate": 0.0001532661083446829, + "loss": 0.6728, + "step": 107 + }, + { + "epoch": 0.3456, + "grad_norm": 0.34870862445670225, + "learning_rate": 0.00015238283824216015, + "loss": 0.6613, + "step": 108 + }, + { + "epoch": 0.3488, + "grad_norm": 0.2994763222445106, + "learning_rate": 0.00015149389960285558, + "loss": 0.6296, + "step": 109 + }, + { + "epoch": 0.352, + "grad_norm": 0.3258903531906652, + "learning_rate": 0.00015059938862204127, + "loss": 0.6723, + "step": 110 + }, + { + "epoch": 0.3552, + "grad_norm": 0.3216665038573486, + "learning_rate": 0.00014969940209799248, + "loss": 0.6383, + "step": 111 + }, + { + "epoch": 0.3584, + "grad_norm": 0.3707207885410005, + "learning_rate": 0.00014879403742151283, + "loss": 0.67, + "step": 112 + }, + { + "epoch": 0.3616, + "grad_norm": 0.31713373063325917, + "learning_rate": 0.00014788339256539544, + "loss": 0.6414, + "step": 113 + }, + { + "epoch": 0.3648, + "grad_norm": 0.3279430359127387, + "learning_rate": 0.0001469675660738206, + "loss": 0.6174, + "step": 114 + }, + { + "epoch": 0.368, + "grad_norm": 0.3328376067264693, + "learning_rate": 0.00014604665705169237, + "loss": 0.6611, + "step": 115 + }, + { + "epoch": 0.3712, + "grad_norm": 0.30883182781096974, + "learning_rate": 0.00014512076515391375, + "loss": 0.6343, + "step": 116 + }, + { + "epoch": 0.3744, + "grad_norm": 0.3414957274784356, + "learning_rate": 0.00014418999057460276, + "loss": 0.6418, + "step": 117 + }, + { + "epoch": 0.3776, + "grad_norm": 0.34237503688138976, + "learning_rate": 0.0001432544340362501, + "loss": 0.6705, + "step": 118 + }, + { + "epoch": 0.3808, + "grad_norm": 0.3563435268997631, + "learning_rate": 0.00014231419677881966, + "loss": 0.67, + "step": 119 + }, + { + "epoch": 0.384, + "grad_norm": 0.29555256017828213, + "learning_rate": 0.00014136938054879283, + "loss": 0.6188, + "step": 120 + }, + { + "epoch": 0.3872, + "grad_norm": 0.3894091065376875, + "learning_rate": 0.00014042008758815818, + "loss": 0.5784, + "step": 121 + }, + { + "epoch": 0.3904, + "grad_norm": 0.30171407654719007, + "learning_rate": 0.00013946642062334766, + "loss": 0.658, + "step": 122 + }, + { + "epoch": 0.3936, + "grad_norm": 0.3434709265832977, + "learning_rate": 0.00013850848285411994, + "loss": 0.6915, + "step": 123 + }, + { + "epoch": 0.3968, + "grad_norm": 0.32041483095191065, + "learning_rate": 0.000137546377942393, + "loss": 0.6158, + "step": 124 + }, + { + "epoch": 0.4, + "grad_norm": 0.33393110124438835, + "learning_rate": 0.00013658021000102636, + "loss": 0.6575, + "step": 125 + }, + { + "epoch": 0.4032, + "grad_norm": 0.3387835672635283, + "learning_rate": 0.00013561008358255468, + "loss": 0.6555, + "step": 126 + }, + { + "epoch": 0.4064, + "grad_norm": 0.33957636048101425, + "learning_rate": 0.00013463610366787392, + "loss": 0.6804, + "step": 127 + }, + { + "epoch": 0.4096, + "grad_norm": 0.34764023958223117, + "learning_rate": 0.00013365837565488064, + "loss": 0.6463, + "step": 128 + }, + { + "epoch": 0.4128, + "grad_norm": 0.30499431339882926, + "learning_rate": 0.0001326770053470668, + "loss": 0.6012, + "step": 129 + }, + { + "epoch": 0.416, + "grad_norm": 0.40182928175919536, + "learning_rate": 0.0001316920989420703, + "loss": 0.6684, + "step": 130 + }, + { + "epoch": 0.4192, + "grad_norm": 0.3582372625393394, + "learning_rate": 0.00013070376302018287, + "loss": 0.6767, + "step": 131 + }, + { + "epoch": 0.4224, + "grad_norm": 0.354828481051296, + "learning_rate": 0.00012971210453281674, + "loss": 0.6673, + "step": 132 + }, + { + "epoch": 0.4256, + "grad_norm": 0.3205163623498333, + "learning_rate": 0.000128717230790931, + "loss": 0.6508, + "step": 133 + }, + { + "epoch": 0.4288, + "grad_norm": 0.32106540344304196, + "learning_rate": 0.00012771924945341906, + "loss": 0.6447, + "step": 134 + }, + { + "epoch": 0.432, + "grad_norm": 0.32479318644166927, + "learning_rate": 0.00012671826851545851, + "loss": 0.6456, + "step": 135 + }, + { + "epoch": 0.4352, + "grad_norm": 0.3501963258782712, + "learning_rate": 0.0001257143962968246, + "loss": 0.6437, + "step": 136 + }, + { + "epoch": 0.4384, + "grad_norm": 0.3097916234964394, + "learning_rate": 0.00012470774143016853, + "loss": 0.6224, + "step": 137 + }, + { + "epoch": 0.4416, + "grad_norm": 0.33887867132062244, + "learning_rate": 0.00012369841284926188, + "loss": 0.6574, + "step": 138 + }, + { + "epoch": 0.4448, + "grad_norm": 0.28865597440102597, + "learning_rate": 0.00012268651977720866, + "loss": 0.5969, + "step": 139 + }, + { + "epoch": 0.448, + "grad_norm": 0.29899423235735395, + "learning_rate": 0.00012167217171462566, + "loss": 0.594, + "step": 140 + }, + { + "epoch": 0.4512, + "grad_norm": 0.30879258648168484, + "learning_rate": 0.0001206554784277931, + "loss": 0.6337, + "step": 141 + }, + { + "epoch": 0.4544, + "grad_norm": 0.3388183296668431, + "learning_rate": 0.00011963654993677645, + "loss": 0.673, + "step": 142 + }, + { + "epoch": 0.4576, + "grad_norm": 0.3537905016162661, + "learning_rate": 0.00011861549650352069, + "loss": 0.6195, + "step": 143 + }, + { + "epoch": 0.4608, + "grad_norm": 0.33907350883788884, + "learning_rate": 0.00011759242861991855, + "loss": 0.6509, + "step": 144 + }, + { + "epoch": 0.464, + "grad_norm": 0.33481880505361394, + "learning_rate": 0.00011656745699585371, + "loss": 0.6373, + "step": 145 + }, + { + "epoch": 0.4672, + "grad_norm": 0.3485772842962881, + "learning_rate": 0.00011554069254722051, + "loss": 0.6452, + "step": 146 + }, + { + "epoch": 0.4704, + "grad_norm": 0.3093219837930065, + "learning_rate": 0.00011451224638392129, + "loss": 0.6055, + "step": 147 + }, + { + "epoch": 0.4736, + "grad_norm": 0.33305496212146257, + "learning_rate": 0.00011348222979784289, + "loss": 0.5984, + "step": 148 + }, + { + "epoch": 0.4768, + "grad_norm": 0.33472414511634047, + "learning_rate": 0.00011245075425081328, + "loss": 0.6757, + "step": 149 + }, + { + "epoch": 0.48, + "grad_norm": 0.32435244822651405, + "learning_rate": 0.00011141793136253986, + "loss": 0.6251, + "step": 150 + }, + { + "epoch": 0.4832, + "grad_norm": 0.31796653746835785, + "learning_rate": 0.0001103838728985307, + "loss": 0.5902, + "step": 151 + }, + { + "epoch": 0.4864, + "grad_norm": 0.33268287537089214, + "learning_rate": 0.000109348690758, + "loss": 0.6279, + "step": 152 + }, + { + "epoch": 0.4896, + "grad_norm": 0.3271900702908842, + "learning_rate": 0.00010831249696175918, + "loss": 0.6561, + "step": 153 + }, + { + "epoch": 0.4928, + "grad_norm": 0.33126008778280047, + "learning_rate": 0.0001072754036400944, + "loss": 0.6536, + "step": 154 + }, + { + "epoch": 0.496, + "grad_norm": 0.34781823010454777, + "learning_rate": 0.00010623752302063283, + "loss": 0.6579, + "step": 155 + }, + { + "epoch": 0.4992, + "grad_norm": 0.3073647432254352, + "learning_rate": 0.00010519896741619803, + "loss": 0.616, + "step": 156 + }, + { + "epoch": 0.5024, + "grad_norm": 0.31755966518040124, + "learning_rate": 0.00010415984921265609, + "loss": 0.6735, + "step": 157 + }, + { + "epoch": 0.5056, + "grad_norm": 0.3381564369674285, + "learning_rate": 0.00010312028085675391, + "loss": 0.6511, + "step": 158 + }, + { + "epoch": 0.5088, + "grad_norm": 0.2972906789467651, + "learning_rate": 0.00010208037484395114, + "loss": 0.626, + "step": 159 + }, + { + "epoch": 0.512, + "grad_norm": 0.32531081197126965, + "learning_rate": 0.00010104024370624644, + "loss": 0.6333, + "step": 160 + }, + { + "epoch": 0.5152, + "grad_norm": 0.29158833917517063, + "learning_rate": 0.0001, + "loss": 0.6049, + "step": 161 + }, + { + "epoch": 0.5184, + "grad_norm": 0.30238077259284174, + "learning_rate": 9.895975629375359e-05, + "loss": 0.6035, + "step": 162 + }, + { + "epoch": 0.5216, + "grad_norm": 0.31722530427474355, + "learning_rate": 9.791962515604887e-05, + "loss": 0.6041, + "step": 163 + }, + { + "epoch": 0.5248, + "grad_norm": 0.30151547809589097, + "learning_rate": 9.687971914324607e-05, + "loss": 0.5839, + "step": 164 + }, + { + "epoch": 0.528, + "grad_norm": 0.34665183430334884, + "learning_rate": 9.584015078734395e-05, + "loss": 0.5952, + "step": 165 + }, + { + "epoch": 0.5312, + "grad_norm": 0.30772019611290397, + "learning_rate": 9.480103258380198e-05, + "loss": 0.6552, + "step": 166 + }, + { + "epoch": 0.5344, + "grad_norm": 0.28516168246199697, + "learning_rate": 9.376247697936719e-05, + "loss": 0.586, + "step": 167 + }, + { + "epoch": 0.5376, + "grad_norm": 0.3091269679334812, + "learning_rate": 9.272459635990562e-05, + "loss": 0.6509, + "step": 168 + }, + { + "epoch": 0.5408, + "grad_norm": 0.31298172317771933, + "learning_rate": 9.168750303824084e-05, + "loss": 0.6399, + "step": 169 + }, + { + "epoch": 0.544, + "grad_norm": 0.3125082847550607, + "learning_rate": 9.065130924199998e-05, + "loss": 0.5864, + "step": 170 + }, + { + "epoch": 0.5472, + "grad_norm": 0.33615452322346095, + "learning_rate": 8.961612710146934e-05, + "loss": 0.6538, + "step": 171 + }, + { + "epoch": 0.5504, + "grad_norm": 0.280194545648247, + "learning_rate": 8.858206863746018e-05, + "loss": 0.6142, + "step": 172 + }, + { + "epoch": 0.5536, + "grad_norm": 0.30664071129075987, + "learning_rate": 8.754924574918675e-05, + "loss": 0.6081, + "step": 173 + }, + { + "epoch": 0.5568, + "grad_norm": 0.3302810403450827, + "learning_rate": 8.651777020215712e-05, + "loss": 0.5998, + "step": 174 + }, + { + "epoch": 0.56, + "grad_norm": 0.33210002602685823, + "learning_rate": 8.548775361607872e-05, + "loss": 0.6207, + "step": 175 + }, + { + "epoch": 0.5632, + "grad_norm": 0.30444517344086625, + "learning_rate": 8.445930745277953e-05, + "loss": 0.6556, + "step": 176 + }, + { + "epoch": 0.5664, + "grad_norm": 0.3039785181090473, + "learning_rate": 8.343254300414628e-05, + "loss": 0.6177, + "step": 177 + }, + { + "epoch": 0.5696, + "grad_norm": 0.3066941689270814, + "learning_rate": 8.240757138008149e-05, + "loss": 0.603, + "step": 178 + }, + { + "epoch": 0.5728, + "grad_norm": 0.3528154065914179, + "learning_rate": 8.138450349647936e-05, + "loss": 0.6599, + "step": 179 + }, + { + "epoch": 0.576, + "grad_norm": 0.4495795663737578, + "learning_rate": 8.036345006322359e-05, + "loss": 0.6187, + "step": 180 + }, + { + "epoch": 0.5792, + "grad_norm": 0.3184430356828446, + "learning_rate": 7.934452157220694e-05, + "loss": 0.6517, + "step": 181 + }, + { + "epoch": 0.5824, + "grad_norm": 0.31099668809147824, + "learning_rate": 7.832782828537437e-05, + "loss": 0.6026, + "step": 182 + }, + { + "epoch": 0.5856, + "grad_norm": 0.2833253101198598, + "learning_rate": 7.731348022279134e-05, + "loss": 0.6062, + "step": 183 + }, + { + "epoch": 0.5888, + "grad_norm": 0.31858986421365953, + "learning_rate": 7.630158715073813e-05, + "loss": 0.6014, + "step": 184 + }, + { + "epoch": 0.592, + "grad_norm": 0.3186441028771271, + "learning_rate": 7.52922585698315e-05, + "loss": 0.6247, + "step": 185 + }, + { + "epoch": 0.5952, + "grad_norm": 0.3152306416432087, + "learning_rate": 7.428560370317542e-05, + "loss": 0.5964, + "step": 186 + }, + { + "epoch": 0.5984, + "grad_norm": 0.3123285565579661, + "learning_rate": 7.328173148454151e-05, + "loss": 0.6082, + "step": 187 + }, + { + "epoch": 0.6016, + "grad_norm": 0.3142125866867691, + "learning_rate": 7.228075054658096e-05, + "loss": 0.6241, + "step": 188 + }, + { + "epoch": 0.6048, + "grad_norm": 0.30715777727174004, + "learning_rate": 7.1282769209069e-05, + "loss": 0.6207, + "step": 189 + }, + { + "epoch": 0.608, + "grad_norm": 0.31222220068605205, + "learning_rate": 7.028789546718326e-05, + "loss": 0.6354, + "step": 190 + }, + { + "epoch": 0.6112, + "grad_norm": 0.29608382663769417, + "learning_rate": 6.929623697981718e-05, + "loss": 0.622, + "step": 191 + }, + { + "epoch": 0.6144, + "grad_norm": 0.3099543613136313, + "learning_rate": 6.830790105792973e-05, + "loss": 0.6128, + "step": 192 + }, + { + "epoch": 0.6176, + "grad_norm": 0.3193556644485293, + "learning_rate": 6.732299465293322e-05, + "loss": 0.6693, + "step": 193 + }, + { + "epoch": 0.6208, + "grad_norm": 0.34238637106239805, + "learning_rate": 6.63416243451194e-05, + "loss": 0.6133, + "step": 194 + }, + { + "epoch": 0.624, + "grad_norm": 0.2660721685995888, + "learning_rate": 6.536389633212609e-05, + "loss": 0.5928, + "step": 195 + }, + { + "epoch": 0.6272, + "grad_norm": 0.312403615349147, + "learning_rate": 6.43899164174453e-05, + "loss": 0.645, + "step": 196 + }, + { + "epoch": 0.6304, + "grad_norm": 0.32919867469057845, + "learning_rate": 6.341978999897365e-05, + "loss": 0.6037, + "step": 197 + }, + { + "epoch": 0.6336, + "grad_norm": 0.29608072859945217, + "learning_rate": 6.245362205760704e-05, + "loss": 0.6108, + "step": 198 + }, + { + "epoch": 0.6368, + "grad_norm": 0.29644956796385985, + "learning_rate": 6.149151714588009e-05, + "loss": 0.598, + "step": 199 + }, + { + "epoch": 0.64, + "grad_norm": 0.3287260629333863, + "learning_rate": 6.053357937665237e-05, + "loss": 0.6136, + "step": 200 + }, + { + "epoch": 0.6432, + "grad_norm": 0.29751135110841737, + "learning_rate": 5.957991241184184e-05, + "loss": 0.6009, + "step": 201 + }, + { + "epoch": 0.6464, + "grad_norm": 0.3025139827613611, + "learning_rate": 5.863061945120719e-05, + "loss": 0.5997, + "step": 202 + }, + { + "epoch": 0.6496, + "grad_norm": 0.28291034044323077, + "learning_rate": 5.768580322118034e-05, + "loss": 0.552, + "step": 203 + }, + { + "epoch": 0.6528, + "grad_norm": 0.29253931579179776, + "learning_rate": 5.6745565963749925e-05, + "loss": 0.5657, + "step": 204 + }, + { + "epoch": 0.656, + "grad_norm": 0.30854967094455543, + "learning_rate": 5.5810009425397294e-05, + "loss": 0.6203, + "step": 205 + }, + { + "epoch": 0.6592, + "grad_norm": 0.2928674567367288, + "learning_rate": 5.487923484608629e-05, + "loss": 0.5918, + "step": 206 + }, + { + "epoch": 0.6624, + "grad_norm": 0.3022165498099548, + "learning_rate": 5.395334294830765e-05, + "loss": 0.5768, + "step": 207 + }, + { + "epoch": 0.6656, + "grad_norm": 0.3373168884415293, + "learning_rate": 5.3032433926179395e-05, + "loss": 0.5954, + "step": 208 + }, + { + "epoch": 0.6688, + "grad_norm": 0.31858218039290526, + "learning_rate": 5.211660743460458e-05, + "loss": 0.6315, + "step": 209 + }, + { + "epoch": 0.672, + "grad_norm": 0.31784021073230706, + "learning_rate": 5.1205962578487155e-05, + "loss": 0.5988, + "step": 210 + }, + { + "epoch": 0.6752, + "grad_norm": 0.31569488085577824, + "learning_rate": 5.030059790200756e-05, + "loss": 0.6117, + "step": 211 + }, + { + "epoch": 0.6784, + "grad_norm": 0.3391114574160496, + "learning_rate": 4.940061137795876e-05, + "loss": 0.6308, + "step": 212 + }, + { + "epoch": 0.6816, + "grad_norm": 0.3099686225231022, + "learning_rate": 4.850610039714444e-05, + "loss": 0.5874, + "step": 213 + }, + { + "epoch": 0.6848, + "grad_norm": 0.3272152195311452, + "learning_rate": 4.761716175783989e-05, + "loss": 0.6295, + "step": 214 + }, + { + "epoch": 0.688, + "grad_norm": 0.3006309655603955, + "learning_rate": 4.673389165531714e-05, + "loss": 0.6095, + "step": 215 + }, + { + "epoch": 0.6912, + "grad_norm": 0.2917718896861704, + "learning_rate": 4.585638567143529e-05, + "loss": 0.5584, + "step": 216 + }, + { + "epoch": 0.6944, + "grad_norm": 0.3256058776538367, + "learning_rate": 4.498473876429726e-05, + "loss": 0.6446, + "step": 217 + }, + { + "epoch": 0.6976, + "grad_norm": 0.30540029018850307, + "learning_rate": 4.411904525797408e-05, + "loss": 0.5903, + "step": 218 + }, + { + "epoch": 0.7008, + "grad_norm": 0.2768986114277422, + "learning_rate": 4.325939883229766e-05, + "loss": 0.5508, + "step": 219 + }, + { + "epoch": 0.704, + "grad_norm": 0.319691457038105, + "learning_rate": 4.240589251272342e-05, + "loss": 0.5967, + "step": 220 + }, + { + "epoch": 0.7072, + "grad_norm": 0.3245235055980944, + "learning_rate": 4.155861866026364e-05, + "loss": 0.6145, + "step": 221 + }, + { + "epoch": 0.7104, + "grad_norm": 0.2971007844582699, + "learning_rate": 4.071766896149273e-05, + "loss": 0.6083, + "step": 222 + }, + { + "epoch": 0.7136, + "grad_norm": 0.3132403382765072, + "learning_rate": 3.988313441862553e-05, + "loss": 0.6197, + "step": 223 + }, + { + "epoch": 0.7168, + "grad_norm": 0.3029269929031576, + "learning_rate": 3.9055105339669595e-05, + "loss": 0.5976, + "step": 224 + }, + { + "epoch": 0.72, + "grad_norm": 0.2943639707173013, + "learning_rate": 3.823367132865265e-05, + "loss": 0.5643, + "step": 225 + }, + { + "epoch": 0.7232, + "grad_norm": 0.3259079582732509, + "learning_rate": 3.741892127592625e-05, + "loss": 0.6379, + "step": 226 + }, + { + "epoch": 0.7264, + "grad_norm": 0.3912281847418963, + "learning_rate": 3.6610943348546526e-05, + "loss": 0.6369, + "step": 227 + }, + { + "epoch": 0.7296, + "grad_norm": 0.3392790073664649, + "learning_rate": 3.580982498073344e-05, + "loss": 0.6046, + "step": 228 + }, + { + "epoch": 0.7328, + "grad_norm": 0.3093047016302224, + "learning_rate": 3.501565286440914e-05, + "loss": 0.6305, + "step": 229 + }, + { + "epoch": 0.736, + "grad_norm": 0.31740563814051154, + "learning_rate": 3.422851293981676e-05, + "loss": 0.5927, + "step": 230 + }, + { + "epoch": 0.7392, + "grad_norm": 0.35757022744841516, + "learning_rate": 3.3448490386220355e-05, + "loss": 0.6589, + "step": 231 + }, + { + "epoch": 0.7424, + "grad_norm": 0.2771972396943578, + "learning_rate": 3.2675669612687565e-05, + "loss": 0.556, + "step": 232 + }, + { + "epoch": 0.7456, + "grad_norm": 0.3112545964276882, + "learning_rate": 3.191013424895536e-05, + "loss": 0.5963, + "step": 233 + }, + { + "epoch": 0.7488, + "grad_norm": 0.28881758258048246, + "learning_rate": 3.115196713638e-05, + "loss": 0.5699, + "step": 234 + }, + { + "epoch": 0.752, + "grad_norm": 0.29705034758563464, + "learning_rate": 3.040125031897264e-05, + "loss": 0.6018, + "step": 235 + }, + { + "epoch": 0.7552, + "grad_norm": 0.30799071774314674, + "learning_rate": 2.9658065034520978e-05, + "loss": 0.61, + "step": 236 + }, + { + "epoch": 0.7584, + "grad_norm": 0.327747513366901, + "learning_rate": 2.892249170579826e-05, + "loss": 0.6307, + "step": 237 + }, + { + "epoch": 0.7616, + "grad_norm": 0.2896705972194219, + "learning_rate": 2.8194609931860316e-05, + "loss": 0.5946, + "step": 238 + }, + { + "epoch": 0.7648, + "grad_norm": 0.3010433149593574, + "learning_rate": 2.7474498479432087e-05, + "loss": 0.61, + "step": 239 + }, + { + "epoch": 0.768, + "grad_norm": 0.3215726833693975, + "learning_rate": 2.6762235274383772e-05, + "loss": 0.5684, + "step": 240 + }, + { + "epoch": 0.7712, + "grad_norm": 0.28698128890577407, + "learning_rate": 2.6057897393298324e-05, + "loss": 0.5704, + "step": 241 + }, + { + "epoch": 0.7744, + "grad_norm": 0.2874906284664439, + "learning_rate": 2.536156105513062e-05, + "loss": 0.5546, + "step": 242 + }, + { + "epoch": 0.7776, + "grad_norm": 0.3274733948655497, + "learning_rate": 2.4673301612959654e-05, + "loss": 0.6283, + "step": 243 + }, + { + "epoch": 0.7808, + "grad_norm": 0.3407257862140524, + "learning_rate": 2.399319354583418e-05, + "loss": 0.6226, + "step": 244 + }, + { + "epoch": 0.784, + "grad_norm": 0.30940111584956204, + "learning_rate": 2.3321310450713062e-05, + "loss": 0.6182, + "step": 245 + }, + { + "epoch": 0.7872, + "grad_norm": 0.318109829131004, + "learning_rate": 2.265772503450122e-05, + "loss": 0.6321, + "step": 246 + }, + { + "epoch": 0.7904, + "grad_norm": 0.29914292634306344, + "learning_rate": 2.2002509106181624e-05, + "loss": 0.599, + "step": 247 + }, + { + "epoch": 0.7936, + "grad_norm": 0.2797871835987113, + "learning_rate": 2.1355733569044635e-05, + "loss": 0.5546, + "step": 248 + }, + { + "epoch": 0.7968, + "grad_norm": 0.30229086592693627, + "learning_rate": 2.0717468413015283e-05, + "loss": 0.6192, + "step": 249 + }, + { + "epoch": 0.8, + "grad_norm": 0.2954873259014947, + "learning_rate": 2.008778270707944e-05, + "loss": 0.5913, + "step": 250 + }, + { + "epoch": 0.8032, + "grad_norm": 0.2950203262186019, + "learning_rate": 1.946674459180955e-05, + "loss": 0.561, + "step": 251 + }, + { + "epoch": 0.8064, + "grad_norm": 0.28779718073226096, + "learning_rate": 1.8854421271990964e-05, + "loss": 0.5939, + "step": 252 + }, + { + "epoch": 0.8096, + "grad_norm": 0.314887711005164, + "learning_rate": 1.8250879009349398e-05, + "loss": 0.5933, + "step": 253 + }, + { + "epoch": 0.8128, + "grad_norm": 0.2986903753941433, + "learning_rate": 1.7656183115380577e-05, + "loss": 0.5932, + "step": 254 + }, + { + "epoch": 0.816, + "grad_norm": 0.2826250372389659, + "learning_rate": 1.707039794428259e-05, + "loss": 0.5509, + "step": 255 + }, + { + "epoch": 0.8192, + "grad_norm": 0.33381206504142535, + "learning_rate": 1.649358688599191e-05, + "loss": 0.6154, + "step": 256 + }, + { + "epoch": 0.8224, + "grad_norm": 0.30596472002594716, + "learning_rate": 1.5925812359323745e-05, + "loss": 0.594, + "step": 257 + }, + { + "epoch": 0.8256, + "grad_norm": 0.35185807954331105, + "learning_rate": 1.5367135805217458e-05, + "loss": 0.5958, + "step": 258 + }, + { + "epoch": 0.8288, + "grad_norm": 0.28230006882220116, + "learning_rate": 1.4817617680087825e-05, + "loss": 0.5486, + "step": 259 + }, + { + "epoch": 0.832, + "grad_norm": 0.3130668066124635, + "learning_rate": 1.4277317449282834e-05, + "loss": 0.6108, + "step": 260 + }, + { + "epoch": 0.8352, + "grad_norm": 0.3294167678586574, + "learning_rate": 1.3746293580648717e-05, + "loss": 0.6684, + "step": 261 + }, + { + "epoch": 0.8384, + "grad_norm": 0.32078244804037015, + "learning_rate": 1.3224603538202929e-05, + "loss": 0.6503, + "step": 262 + }, + { + "epoch": 0.8416, + "grad_norm": 0.47405184916696186, + "learning_rate": 1.2712303775915802e-05, + "loss": 0.5995, + "step": 263 + }, + { + "epoch": 0.8448, + "grad_norm": 0.3070564349588061, + "learning_rate": 1.220944973160133e-05, + "loss": 0.5683, + "step": 264 + }, + { + "epoch": 0.848, + "grad_norm": 0.2946455471506841, + "learning_rate": 1.1716095820918216e-05, + "loss": 0.6075, + "step": 265 + }, + { + "epoch": 0.8512, + "grad_norm": 0.31174149044192556, + "learning_rate": 1.1232295431481222e-05, + "loss": 0.6102, + "step": 266 + }, + { + "epoch": 0.8544, + "grad_norm": 0.2894262410662935, + "learning_rate": 1.0758100917083991e-05, + "loss": 0.5679, + "step": 267 + }, + { + "epoch": 0.8576, + "grad_norm": 0.3072867937660247, + "learning_rate": 1.0293563592033595e-05, + "loss": 0.5944, + "step": 268 + }, + { + "epoch": 0.8608, + "grad_norm": 0.3141327748974818, + "learning_rate": 9.838733725597615e-06, + "loss": 0.6146, + "step": 269 + }, + { + "epoch": 0.864, + "grad_norm": 0.3297375563349942, + "learning_rate": 9.393660536564408e-06, + "loss": 0.6093, + "step": 270 + }, + { + "epoch": 0.8672, + "grad_norm": 0.3025268519097175, + "learning_rate": 8.958392187916841e-06, + "loss": 0.5715, + "step": 271 + }, + { + "epoch": 0.8704, + "grad_norm": 0.2763400100984386, + "learning_rate": 8.532975781620512e-06, + "loss": 0.5531, + "step": 272 + }, + { + "epoch": 0.8736, + "grad_norm": 0.3057061440220331, + "learning_rate": 8.117457353526625e-06, + "loss": 0.565, + "step": 273 + }, + { + "epoch": 0.8768, + "grad_norm": 0.30568364567525963, + "learning_rate": 7.711881868390291e-06, + "loss": 0.5713, + "step": 274 + }, + { + "epoch": 0.88, + "grad_norm": 0.3331966345949505, + "learning_rate": 7.3162932150046885e-06, + "loss": 0.6677, + "step": 275 + }, + { + "epoch": 0.8832, + "grad_norm": 0.2908370654322784, + "learning_rate": 6.930734201451816e-06, + "loss": 0.5812, + "step": 276 + }, + { + "epoch": 0.8864, + "grad_norm": 0.26727331401737486, + "learning_rate": 6.555246550469907e-06, + "loss": 0.5618, + "step": 277 + }, + { + "epoch": 0.8896, + "grad_norm": 0.312133529139538, + "learning_rate": 6.189870894938587e-06, + "loss": 0.6026, + "step": 278 + }, + { + "epoch": 0.8928, + "grad_norm": 0.2922244076597637, + "learning_rate": 5.834646773481811e-06, + "loss": 0.5905, + "step": 279 + }, + { + "epoch": 0.896, + "grad_norm": 0.30187540342600905, + "learning_rate": 5.489612626189245e-06, + "loss": 0.6277, + "step": 280 + }, + { + "epoch": 0.8992, + "grad_norm": 0.34788501107378356, + "learning_rate": 5.154805790456485e-06, + "loss": 0.6326, + "step": 281 + }, + { + "epoch": 0.9024, + "grad_norm": 0.27072389174734296, + "learning_rate": 4.830262496944693e-06, + "loss": 0.5552, + "step": 282 + }, + { + "epoch": 0.9056, + "grad_norm": 0.2931131966977575, + "learning_rate": 4.516017865659949e-06, + "loss": 0.5835, + "step": 283 + }, + { + "epoch": 0.9088, + "grad_norm": 0.3082429395866023, + "learning_rate": 4.21210590215273e-06, + "loss": 0.5704, + "step": 284 + }, + { + "epoch": 0.912, + "grad_norm": 0.30589246711582313, + "learning_rate": 3.918559493838114e-06, + "loss": 0.6061, + "step": 285 + }, + { + "epoch": 0.9152, + "grad_norm": 0.28004706080113423, + "learning_rate": 3.6354104064368566e-06, + "loss": 0.5771, + "step": 286 + }, + { + "epoch": 0.9184, + "grad_norm": 0.3048928905270255, + "learning_rate": 3.3626892805379562e-06, + "loss": 0.5453, + "step": 287 + }, + { + "epoch": 0.9216, + "grad_norm": 0.2903529261392363, + "learning_rate": 3.100425628282899e-06, + "loss": 0.5858, + "step": 288 + }, + { + "epoch": 0.9248, + "grad_norm": 0.29670524308182067, + "learning_rate": 2.848647830172024e-06, + "loss": 0.5713, + "step": 289 + }, + { + "epoch": 0.928, + "grad_norm": 0.3038955745133722, + "learning_rate": 2.607383131993424e-06, + "loss": 0.6174, + "step": 290 + }, + { + "epoch": 0.9312, + "grad_norm": 0.3201029301410521, + "learning_rate": 2.3766576418745022e-06, + "loss": 0.5997, + "step": 291 + }, + { + "epoch": 0.9344, + "grad_norm": 0.30209605384437654, + "learning_rate": 2.1564963274568027e-06, + "loss": 0.5846, + "step": 292 + }, + { + "epoch": 0.9376, + "grad_norm": 0.2843990942016293, + "learning_rate": 1.9469230131940907e-06, + "loss": 0.5617, + "step": 293 + }, + { + "epoch": 0.9408, + "grad_norm": 0.3117408742395904, + "learning_rate": 1.7479603777742938e-06, + "loss": 0.6211, + "step": 294 + }, + { + "epoch": 0.944, + "grad_norm": 0.29773734562986487, + "learning_rate": 1.559629951665298e-06, + "loss": 0.5651, + "step": 295 + }, + { + "epoch": 0.9472, + "grad_norm": 0.281978621308173, + "learning_rate": 1.3819521147851123e-06, + "loss": 0.5525, + "step": 296 + }, + { + "epoch": 0.9504, + "grad_norm": 0.32845611494771243, + "learning_rate": 1.2149460942964098e-06, + "loss": 0.5862, + "step": 297 + }, + { + "epoch": 0.9536, + "grad_norm": 0.3506177862675044, + "learning_rate": 1.05862996252597e-06, + "loss": 0.585, + "step": 298 + }, + { + "epoch": 0.9568, + "grad_norm": 0.3297169416792786, + "learning_rate": 9.130206350089765e-07, + "loss": 0.63, + "step": 299 + }, + { + "epoch": 0.96, + "grad_norm": 0.28035403388693836, + "learning_rate": 7.781338686584927e-07, + "loss": 0.5548, + "step": 300 + }, + { + "epoch": 0.9632, + "grad_norm": 0.3001871927203313, + "learning_rate": 6.539842600603918e-07, + "loss": 0.6112, + "step": 301 + }, + { + "epoch": 0.9664, + "grad_norm": 0.2795530026041373, + "learning_rate": 5.405852438937764e-07, + "loss": 0.5666, + "step": 302 + }, + { + "epoch": 0.9696, + "grad_norm": 0.3139555068178617, + "learning_rate": 4.3794909147720773e-07, + "loss": 0.5986, + "step": 303 + }, + { + "epoch": 0.9728, + "grad_norm": 0.31776585164466525, + "learning_rate": 3.4608690944071263e-07, + "loss": 0.5706, + "step": 304 + }, + { + "epoch": 0.976, + "grad_norm": 0.2936353664174258, + "learning_rate": 2.6500863852395584e-07, + "loss": 0.5616, + "step": 305 + }, + { + "epoch": 0.9792, + "grad_norm": 0.3198153539489569, + "learning_rate": 1.947230525005006e-07, + "loss": 0.6109, + "step": 306 + }, + { + "epoch": 0.9824, + "grad_norm": 0.3199566602187319, + "learning_rate": 1.3523775722834587e-07, + "loss": 0.6099, + "step": 307 + }, + { + "epoch": 0.9856, + "grad_norm": 0.31573660141129345, + "learning_rate": 8.655918982689581e-08, + "loss": 0.6208, + "step": 308 + }, + { + "epoch": 0.9888, + "grad_norm": 0.30735316287976444, + "learning_rate": 4.8692617980350406e-08, + "loss": 0.6242, + "step": 309 + }, + { + "epoch": 0.992, + "grad_norm": 0.33578478937957057, + "learning_rate": 2.164213936770576e-08, + "loss": 0.59, + "step": 310 + }, + { + "epoch": 0.9952, + "grad_norm": 0.32324798409195554, + "learning_rate": 5.410681219286673e-09, + "loss": 0.6159, + "step": 311 + }, + { + "epoch": 0.9984, + "grad_norm": 0.31751165611922544, + "learning_rate": 0.0, + "loss": 0.592, + "step": 312 + }, + { + "epoch": 0.9984, + "step": 312, + "total_flos": 305815172284416.0, + "train_loss": 0.6410520928792465, + "train_runtime": 4972.4596, + "train_samples_per_second": 1.006, + "train_steps_per_second": 0.063 + } + ], + "logging_steps": 1.0, + "max_steps": 312, + "num_input_tokens_seen": 0, + "num_train_epochs": 1, + "save_steps": 500, + "stateful_callbacks": { + "TrainerControl": { + "args": { + "should_epoch_stop": false, + "should_evaluate": false, + "should_log": false, + "should_save": false, + "should_training_stop": false + }, + "attributes": {} + } + }, + "total_flos": 305815172284416.0, + "train_batch_size": 8, + "trial_name": null, + "trial_params": null +} diff --git a/single_dataset/short_caption/VideoGameBunny_v1_1-Llama-3-8B-V-short_caption_dataset_10000_epochs_1_lora/README.md b/single_dataset/short_caption/VideoGameBunny_v1_1-Llama-3-8B-V-short_caption_dataset_10000_epochs_1_lora/README.md new file mode 100644 index 0000000000000000000000000000000000000000..4f291d8d6fdb82fcff274f83bcc52b834347797c --- /dev/null +++ b/single_dataset/short_caption/VideoGameBunny_v1_1-Llama-3-8B-V-short_caption_dataset_10000_epochs_1_lora/README.md @@ -0,0 +1,202 @@ +--- +library_name: peft +base_model: ./weights/Bunny-v1_1-Llama-3-8B-V +--- + +# Model Card for Model ID + + + + + +## Model Details + +### Model Description + + + + + +- **Developed by:** [More Information Needed] +- **Funded by [optional]:** [More Information Needed] +- **Shared by [optional]:** [More Information Needed] +- **Model type:** [More Information Needed] +- **Language(s) (NLP):** [More Information Needed] +- **License:** [More Information Needed] +- **Finetuned from model [optional]:** [More Information Needed] + +### Model Sources [optional] + + + +- **Repository:** [More Information Needed] +- **Paper [optional]:** [More Information Needed] +- **Demo [optional]:** [More Information Needed] + +## Uses + + + +### Direct Use + + + +[More Information Needed] + +### Downstream Use [optional] + + + +[More Information Needed] + +### Out-of-Scope Use + + + +[More Information Needed] + +## Bias, Risks, and Limitations + + + +[More Information Needed] + +### Recommendations + + + +Users (both direct and downstream) should be made aware of the risks, biases and limitations of the model. More information needed for further recommendations. + +## How to Get Started with the Model + +Use the code below to get started with the model. + +[More Information Needed] + +## Training Details + +### Training Data + + + +[More Information Needed] + +### Training Procedure + + + +#### Preprocessing [optional] + +[More Information Needed] + + +#### Training Hyperparameters + +- **Training regime:** [More Information Needed] + +#### Speeds, Sizes, Times [optional] + + + +[More Information Needed] + +## Evaluation + + + +### Testing Data, Factors & Metrics + +#### Testing Data + + + +[More Information Needed] + +#### Factors + + + +[More Information Needed] + +#### Metrics + + + +[More Information Needed] + +### Results + +[More Information Needed] + +#### Summary + + + +## Model Examination [optional] + + + +[More Information Needed] + +## Environmental Impact + + + +Carbon emissions can be estimated using the [Machine Learning Impact calculator](https://mlco2.github.io/impact#compute) presented in [Lacoste et al. (2019)](https://arxiv.org/abs/1910.09700). + +- **Hardware Type:** [More Information Needed] +- **Hours used:** [More Information Needed] +- **Cloud Provider:** [More Information Needed] +- **Compute Region:** [More Information Needed] +- **Carbon Emitted:** [More Information Needed] + +## Technical Specifications [optional] + +### Model Architecture and Objective + +[More Information Needed] + +### Compute Infrastructure + +[More Information Needed] + +#### Hardware + +[More Information Needed] + +#### Software + +[More Information Needed] + +## Citation [optional] + + + +**BibTeX:** + +[More Information Needed] + +**APA:** + +[More Information Needed] + +## Glossary [optional] + + + +[More Information Needed] + +## More Information [optional] + +[More Information Needed] + +## Model Card Authors [optional] + +[More Information Needed] + +## Model Card Contact + +[More Information Needed] +### Framework versions + +- PEFT 0.11.1 \ No newline at end of file diff --git a/single_dataset/short_caption/VideoGameBunny_v1_1-Llama-3-8B-V-short_caption_dataset_10000_epochs_1_lora/adapter_config.json b/single_dataset/short_caption/VideoGameBunny_v1_1-Llama-3-8B-V-short_caption_dataset_10000_epochs_1_lora/adapter_config.json new file mode 100644 index 0000000000000000000000000000000000000000..571f183351ceab6dc287cddd61c1ec2dd10b069b --- /dev/null +++ b/single_dataset/short_caption/VideoGameBunny_v1_1-Llama-3-8B-V-short_caption_dataset_10000_epochs_1_lora/adapter_config.json @@ -0,0 +1,34 @@ +{ + "alpha_pattern": {}, + "auto_mapping": null, + "base_model_name_or_path": "./weights/Bunny-v1_1-Llama-3-8B-V", + "bias": "none", + "fan_in_fan_out": false, + "inference_mode": true, + "init_lora_weights": true, + "layer_replication": null, + "layers_pattern": null, + "layers_to_transform": null, + "loftq_config": {}, + "lora_alpha": 256, + "lora_dropout": 0.1, + "megatron_config": null, + "megatron_core": "megatron.core", + "modules_to_save": null, + "peft_type": "LORA", + "r": 128, + "rank_pattern": {}, + "revision": null, + "target_modules": [ + "up_proj", + "k_proj", + "v_proj", + "gate_proj", + "q_proj", + "o_proj", + "down_proj" + ], + "task_type": "CAUSAL_LM", + "use_dora": false, + "use_rslora": false +} \ No newline at end of file diff --git a/single_dataset/short_caption/VideoGameBunny_v1_1-Llama-3-8B-V-short_caption_dataset_10000_epochs_1_lora/adapter_model.safetensors b/single_dataset/short_caption/VideoGameBunny_v1_1-Llama-3-8B-V-short_caption_dataset_10000_epochs_1_lora/adapter_model.safetensors new file mode 100644 index 0000000000000000000000000000000000000000..4c7dd911b2b71f3a127404b0be8cbb3561fe15c1 --- /dev/null +++ b/single_dataset/short_caption/VideoGameBunny_v1_1-Llama-3-8B-V-short_caption_dataset_10000_epochs_1_lora/adapter_model.safetensors @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:70e6a23753094a97b50ff3abbd77362a6b1f9b843a8a5fc00991c7d5ad8a9f22 +size 671150064 diff --git a/single_dataset/short_caption/VideoGameBunny_v1_1-Llama-3-8B-V-short_caption_dataset_10000_epochs_1_lora/config.json b/single_dataset/short_caption/VideoGameBunny_v1_1-Llama-3-8B-V-short_caption_dataset_10000_epochs_1_lora/config.json new file mode 100644 index 0000000000000000000000000000000000000000..ee7ed86f4ff6f3381a6bb24981934be89aa376e7 --- /dev/null +++ b/single_dataset/short_caption/VideoGameBunny_v1_1-Llama-3-8B-V-short_caption_dataset_10000_epochs_1_lora/config.json @@ -0,0 +1,45 @@ +{ + "_name_or_path": "./weights/Bunny-v1_1-Llama-3-8B-V", + "architectures": [ + "BunnyLlamaForCausalLM" + ], + "attention_bias": false, + "attention_dropout": 0.0, + "auto_map": { + "AutoConfig": "configuration_bunny_llama.BunnyLlamaConfig", + "AutoModelForCausalLM": "modeling_bunny_llama.BunnyLlamaForCausalLM" + }, + "bos_token_id": 128000, + "continuous_training": false, + "eos_token_id": 128001, + "freeze_mm_mlp_adapter": false, + "hidden_act": "silu", + "hidden_size": 4096, + "image_aspect_ratio": "pad", + "initializer_range": 0.02, + "intermediate_size": 14336, + "max_position_embeddings": 8192, + "mm_hidden_size": 3456, + "mm_projector_lr": null, + "mm_projector_type": "mlp2x_gelu", + "mm_vision_tower": "./weights/siglip-so400m-patch14-384", + "model_type": "bunny-llama", + "num_attention_heads": 32, + "num_hidden_layers": 32, + "num_key_value_heads": 8, + "pretraining_tp": 1, + "rms_norm_eps": 1e-05, + "rope_scaling": null, + "rope_theta": 500000.0, + "tie_word_embeddings": false, + "tokenizer_model_max_length": 2048, + "tokenizer_padding_side": "right", + "torch_dtype": "float16", + "transformers_version": "4.41.2", + "tune_mm_mlp_adapter": false, + "unfreeze_vision_tower": true, + "use_cache": true, + "use_mm_proj": true, + "use_s2": true, + "vocab_size": 128256 +} diff --git a/single_dataset/short_caption/VideoGameBunny_v1_1-Llama-3-8B-V-short_caption_dataset_10000_epochs_1_lora/non_lora_trainables.bin b/single_dataset/short_caption/VideoGameBunny_v1_1-Llama-3-8B-V-short_caption_dataset_10000_epochs_1_lora/non_lora_trainables.bin new file mode 100644 index 0000000000000000000000000000000000000000..2d55c3fbf822ac17eba2e044eb8bf18d5af6ebae --- /dev/null +++ b/single_dataset/short_caption/VideoGameBunny_v1_1-Llama-3-8B-V-short_caption_dataset_10000_epochs_1_lora/non_lora_trainables.bin @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:2fba640a818e1406f5718e38d454b65bc39aa09bf28dcfbaaf649cd9805a6a21 +size 918507402 diff --git a/single_dataset/short_caption/VideoGameBunny_v1_1-Llama-3-8B-V-short_caption_dataset_10000_epochs_1_lora/trainer_state.json b/single_dataset/short_caption/VideoGameBunny_v1_1-Llama-3-8B-V-short_caption_dataset_10000_epochs_1_lora/trainer_state.json new file mode 100644 index 0000000000000000000000000000000000000000..0f9883d89f621211dd0c11158f5f59624fd1a88d --- /dev/null +++ b/single_dataset/short_caption/VideoGameBunny_v1_1-Llama-3-8B-V-short_caption_dataset_10000_epochs_1_lora/trainer_state.json @@ -0,0 +1,4417 @@ +{ + "best_metric": null, + "best_model_checkpoint": null, + "epoch": 1.0, + "eval_steps": 500, + "global_step": 625, + "is_hyper_param_search": false, + "is_local_process_zero": true, + "is_world_process_zero": true, + "log_history": [ + { + "epoch": 0.0016, + "grad_norm": 4.079327622118584, + "learning_rate": 1.0526315789473684e-05, + "loss": 1.4765, + "step": 1 + }, + { + "epoch": 0.0032, + "grad_norm": 4.40113376763699, + "learning_rate": 2.105263157894737e-05, + "loss": 1.4812, + "step": 2 + }, + { + "epoch": 0.0048, + "grad_norm": 2.852285481000278, + "learning_rate": 3.157894736842105e-05, + "loss": 1.4284, + "step": 3 + }, + { + "epoch": 0.0064, + "grad_norm": 2.4225314639300977, + "learning_rate": 4.210526315789474e-05, + "loss": 1.3549, + "step": 4 + }, + { + "epoch": 0.008, + "grad_norm": 1.9486859923452315, + "learning_rate": 5.2631578947368424e-05, + "loss": 1.1165, + "step": 5 + }, + { + "epoch": 0.0096, + "grad_norm": 1.9208121876027364, + "learning_rate": 6.31578947368421e-05, + "loss": 1.0992, + "step": 6 + }, + { + "epoch": 0.0112, + "grad_norm": 1.732766800590471, + "learning_rate": 7.368421052631579e-05, + "loss": 0.9592, + "step": 7 + }, + { + "epoch": 0.0128, + "grad_norm": 1.5637155696936844, + "learning_rate": 8.421052631578948e-05, + "loss": 0.8463, + "step": 8 + }, + { + "epoch": 0.0144, + "grad_norm": 2.0586132784503013, + "learning_rate": 9.473684210526316e-05, + "loss": 0.9338, + "step": 9 + }, + { + "epoch": 0.016, + "grad_norm": 1.9050656462455207, + "learning_rate": 0.00010526315789473685, + "loss": 0.9428, + "step": 10 + }, + { + "epoch": 0.0176, + "grad_norm": 1.5521805714320187, + "learning_rate": 0.00011578947368421053, + "loss": 0.8948, + "step": 11 + }, + { + "epoch": 0.0192, + "grad_norm": 1.3914837471605024, + "learning_rate": 0.0001263157894736842, + "loss": 0.8751, + "step": 12 + }, + { + "epoch": 0.0208, + "grad_norm": 1.3037317191929243, + "learning_rate": 0.0001368421052631579, + "loss": 0.8701, + "step": 13 + }, + { + "epoch": 0.0224, + "grad_norm": 1.1895932626241843, + "learning_rate": 0.00014736842105263158, + "loss": 0.9284, + "step": 14 + }, + { + "epoch": 0.024, + "grad_norm": 1.1682465876717918, + "learning_rate": 0.00015789473684210527, + "loss": 0.7463, + "step": 15 + }, + { + "epoch": 0.0256, + "grad_norm": 1.3458482300267895, + "learning_rate": 0.00016842105263157895, + "loss": 0.8145, + "step": 16 + }, + { + "epoch": 0.0272, + "grad_norm": 1.3343115842350872, + "learning_rate": 0.00017894736842105264, + "loss": 0.78, + "step": 17 + }, + { + "epoch": 0.0288, + "grad_norm": 1.345994899831727, + "learning_rate": 0.00018947368421052632, + "loss": 0.8096, + "step": 18 + }, + { + "epoch": 0.0304, + "grad_norm": 1.5179124182931218, + "learning_rate": 0.0002, + "loss": 0.9103, + "step": 19 + }, + { + "epoch": 0.032, + "grad_norm": 1.453884829604924, + "learning_rate": 0.00019999865623437013, + "loss": 0.8481, + "step": 20 + }, + { + "epoch": 0.0336, + "grad_norm": 1.3244483538701937, + "learning_rate": 0.00019999462497359466, + "loss": 0.9012, + "step": 21 + }, + { + "epoch": 0.0352, + "grad_norm": 1.4273501446403416, + "learning_rate": 0.00019998790632601496, + "loss": 0.8483, + "step": 22 + }, + { + "epoch": 0.0368, + "grad_norm": 1.351681032600888, + "learning_rate": 0.0001999785004721968, + "loss": 0.8476, + "step": 23 + }, + { + "epoch": 0.0384, + "grad_norm": 1.2002303746648013, + "learning_rate": 0.00019996640766492543, + "loss": 0.7745, + "step": 24 + }, + { + "epoch": 0.04, + "grad_norm": 1.224676997564932, + "learning_rate": 0.00019995162822919883, + "loss": 0.8161, + "step": 25 + }, + { + "epoch": 0.0416, + "grad_norm": 1.2521722157868296, + "learning_rate": 0.00019993416256221895, + "loss": 0.8237, + "step": 26 + }, + { + "epoch": 0.0432, + "grad_norm": 1.3839045461488768, + "learning_rate": 0.00019991401113338104, + "loss": 0.796, + "step": 27 + }, + { + "epoch": 0.0448, + "grad_norm": 1.4393329250022675, + "learning_rate": 0.00019989117448426108, + "loss": 0.9021, + "step": 28 + }, + { + "epoch": 0.0464, + "grad_norm": 1.1708534336814247, + "learning_rate": 0.00019986565322860115, + "loss": 0.8184, + "step": 29 + }, + { + "epoch": 0.048, + "grad_norm": 1.3971888030003934, + "learning_rate": 0.00019983744805229296, + "loss": 0.83, + "step": 30 + }, + { + "epoch": 0.0496, + "grad_norm": 1.3950041409572966, + "learning_rate": 0.00019980655971335945, + "loss": 0.791, + "step": 31 + }, + { + "epoch": 0.0512, + "grad_norm": 1.3172061836573148, + "learning_rate": 0.00019977298904193437, + "loss": 0.7919, + "step": 32 + }, + { + "epoch": 0.0528, + "grad_norm": 1.1674279258163187, + "learning_rate": 0.00019973673694024, + "loss": 0.7204, + "step": 33 + }, + { + "epoch": 0.0544, + "grad_norm": 1.216402015685836, + "learning_rate": 0.00019969780438256293, + "loss": 0.839, + "step": 34 + }, + { + "epoch": 0.056, + "grad_norm": 1.1738455727715729, + "learning_rate": 0.0001996561924152278, + "loss": 0.9203, + "step": 35 + }, + { + "epoch": 0.0576, + "grad_norm": 1.3324423007220763, + "learning_rate": 0.0001996119021565693, + "loss": 0.8302, + "step": 36 + }, + { + "epoch": 0.0592, + "grad_norm": 1.1692674041237634, + "learning_rate": 0.0001995649347969019, + "loss": 0.787, + "step": 37 + }, + { + "epoch": 0.0608, + "grad_norm": 1.129891896166518, + "learning_rate": 0.00019951529159848805, + "loss": 0.8299, + "step": 38 + }, + { + "epoch": 0.0624, + "grad_norm": 1.4645497511737668, + "learning_rate": 0.00019946297389550433, + "loss": 0.9184, + "step": 39 + }, + { + "epoch": 0.064, + "grad_norm": 1.1889881200623615, + "learning_rate": 0.00019940798309400526, + "loss": 0.8212, + "step": 40 + }, + { + "epoch": 0.0656, + "grad_norm": 1.3882019110988386, + "learning_rate": 0.0001993503206718859, + "loss": 0.8463, + "step": 41 + }, + { + "epoch": 0.0672, + "grad_norm": 1.2617605007479555, + "learning_rate": 0.00019928998817884182, + "loss": 0.7893, + "step": 42 + }, + { + "epoch": 0.0688, + "grad_norm": 1.3199721270879063, + "learning_rate": 0.00019922698723632767, + "loss": 0.771, + "step": 43 + }, + { + "epoch": 0.0704, + "grad_norm": 1.3256258709641475, + "learning_rate": 0.00019916131953751342, + "loss": 0.7393, + "step": 44 + }, + { + "epoch": 0.072, + "grad_norm": 1.1326132922389427, + "learning_rate": 0.00019909298684723904, + "loss": 0.7181, + "step": 45 + }, + { + "epoch": 0.0736, + "grad_norm": 1.1341954775455474, + "learning_rate": 0.00019902199100196697, + "loss": 0.8069, + "step": 46 + }, + { + "epoch": 0.0752, + "grad_norm": 1.109294934202468, + "learning_rate": 0.00019894833390973266, + "loss": 0.7307, + "step": 47 + }, + { + "epoch": 0.0768, + "grad_norm": 1.3263077512939443, + "learning_rate": 0.00019887201755009357, + "loss": 0.733, + "step": 48 + }, + { + "epoch": 0.0784, + "grad_norm": 1.0214273654864277, + "learning_rate": 0.0001987930439740757, + "loss": 0.7177, + "step": 49 + }, + { + "epoch": 0.08, + "grad_norm": 1.2125792061110614, + "learning_rate": 0.00019871141530411853, + "loss": 0.8271, + "step": 50 + }, + { + "epoch": 0.0816, + "grad_norm": 1.2990697925289099, + "learning_rate": 0.0001986271337340182, + "loss": 0.819, + "step": 51 + }, + { + "epoch": 0.0832, + "grad_norm": 1.1626490409509023, + "learning_rate": 0.00019854020152886814, + "loss": 0.7431, + "step": 52 + }, + { + "epoch": 0.0848, + "grad_norm": 1.3252770453069025, + "learning_rate": 0.0001984506210249986, + "loss": 0.9627, + "step": 53 + }, + { + "epoch": 0.0864, + "grad_norm": 1.2275945378216162, + "learning_rate": 0.00019835839462991361, + "loss": 0.7013, + "step": 54 + }, + { + "epoch": 0.088, + "grad_norm": 1.1206286767819427, + "learning_rate": 0.00019826352482222638, + "loss": 0.7889, + "step": 55 + }, + { + "epoch": 0.0896, + "grad_norm": 1.2147862865201278, + "learning_rate": 0.00019816601415159263, + "loss": 0.879, + "step": 56 + }, + { + "epoch": 0.0912, + "grad_norm": 1.0970231076045274, + "learning_rate": 0.0001980658652386421, + "loss": 0.7385, + "step": 57 + }, + { + "epoch": 0.0928, + "grad_norm": 1.3233091017326513, + "learning_rate": 0.00019796308077490817, + "loss": 0.8113, + "step": 58 + }, + { + "epoch": 0.0944, + "grad_norm": 1.1792782814591045, + "learning_rate": 0.00019785766352275542, + "loss": 0.832, + "step": 59 + }, + { + "epoch": 0.096, + "grad_norm": 1.0095028078066783, + "learning_rate": 0.00019774961631530545, + "loss": 0.6964, + "step": 60 + }, + { + "epoch": 0.0976, + "grad_norm": 1.6843718249061146, + "learning_rate": 0.00019763894205636072, + "loss": 0.7526, + "step": 61 + }, + { + "epoch": 0.0992, + "grad_norm": 1.1186319562327094, + "learning_rate": 0.00019752564372032657, + "loss": 0.734, + "step": 62 + }, + { + "epoch": 0.1008, + "grad_norm": 1.1719208207564302, + "learning_rate": 0.00019740972435213115, + "loss": 0.868, + "step": 63 + }, + { + "epoch": 0.1024, + "grad_norm": 1.0609711989245456, + "learning_rate": 0.00019729118706714375, + "loss": 0.7287, + "step": 64 + }, + { + "epoch": 0.104, + "grad_norm": 0.9692517342409362, + "learning_rate": 0.00019717003505109095, + "loss": 0.6865, + "step": 65 + }, + { + "epoch": 0.1056, + "grad_norm": 1.153771253083348, + "learning_rate": 0.00019704627155997108, + "loss": 0.8092, + "step": 66 + }, + { + "epoch": 0.1072, + "grad_norm": 1.2623889816531393, + "learning_rate": 0.00019691989991996663, + "loss": 0.6917, + "step": 67 + }, + { + "epoch": 0.1088, + "grad_norm": 1.1208482751559274, + "learning_rate": 0.0001967909235273549, + "loss": 0.8146, + "step": 68 + }, + { + "epoch": 0.1104, + "grad_norm": 1.2217174035955245, + "learning_rate": 0.00019665934584841682, + "loss": 0.7433, + "step": 69 + }, + { + "epoch": 0.112, + "grad_norm": 1.2026904931491873, + "learning_rate": 0.00019652517041934356, + "loss": 0.9273, + "step": 70 + }, + { + "epoch": 0.1136, + "grad_norm": 1.112998793185748, + "learning_rate": 0.00019638840084614182, + "loss": 0.7562, + "step": 71 + }, + { + "epoch": 0.1152, + "grad_norm": 1.0998903851247233, + "learning_rate": 0.00019624904080453655, + "loss": 0.7692, + "step": 72 + }, + { + "epoch": 0.1168, + "grad_norm": 1.0972310916444612, + "learning_rate": 0.00019610709403987246, + "loss": 0.685, + "step": 73 + }, + { + "epoch": 0.1184, + "grad_norm": 1.1538532053917443, + "learning_rate": 0.00019596256436701324, + "loss": 0.8985, + "step": 74 + }, + { + "epoch": 0.12, + "grad_norm": 1.2150127231448657, + "learning_rate": 0.000195815455670239, + "loss": 0.788, + "step": 75 + }, + { + "epoch": 0.1216, + "grad_norm": 1.2428997345801873, + "learning_rate": 0.00019566577190314197, + "loss": 0.8087, + "step": 76 + }, + { + "epoch": 0.1232, + "grad_norm": 1.180723999696628, + "learning_rate": 0.0001955135170885202, + "loss": 0.7937, + "step": 77 + }, + { + "epoch": 0.1248, + "grad_norm": 1.1698683675027783, + "learning_rate": 0.00019535869531826937, + "loss": 0.8433, + "step": 78 + }, + { + "epoch": 0.1264, + "grad_norm": 1.196657806303993, + "learning_rate": 0.00019520131075327298, + "loss": 0.7706, + "step": 79 + }, + { + "epoch": 0.128, + "grad_norm": 1.2298160597232919, + "learning_rate": 0.00019504136762329047, + "loss": 0.7931, + "step": 80 + }, + { + "epoch": 0.1296, + "grad_norm": 1.1391045969000217, + "learning_rate": 0.00019487887022684336, + "loss": 0.7492, + "step": 81 + }, + { + "epoch": 0.1312, + "grad_norm": 1.1205475244780054, + "learning_rate": 0.00019471382293110003, + "loss": 0.7671, + "step": 82 + }, + { + "epoch": 0.1328, + "grad_norm": 1.18514036810532, + "learning_rate": 0.00019454623017175812, + "loss": 0.8046, + "step": 83 + }, + { + "epoch": 0.1344, + "grad_norm": 1.1780736050925318, + "learning_rate": 0.00019437609645292546, + "loss": 0.9289, + "step": 84 + }, + { + "epoch": 0.136, + "grad_norm": 1.110286792602596, + "learning_rate": 0.0001942034263469989, + "loss": 0.808, + "step": 85 + }, + { + "epoch": 0.1376, + "grad_norm": 1.1733343791671424, + "learning_rate": 0.00019402822449454153, + "loss": 0.9261, + "step": 86 + }, + { + "epoch": 0.1392, + "grad_norm": 1.0814328442374197, + "learning_rate": 0.00019385049560415794, + "loss": 0.8203, + "step": 87 + }, + { + "epoch": 0.1408, + "grad_norm": 1.1385768725890104, + "learning_rate": 0.00019367024445236754, + "loss": 0.7974, + "step": 88 + }, + { + "epoch": 0.1424, + "grad_norm": 1.0216275712576461, + "learning_rate": 0.00019348747588347637, + "loss": 0.6849, + "step": 89 + }, + { + "epoch": 0.144, + "grad_norm": 1.327797248643074, + "learning_rate": 0.00019330219480944694, + "loss": 0.8543, + "step": 90 + }, + { + "epoch": 0.1456, + "grad_norm": 1.183051703626731, + "learning_rate": 0.00019311440620976597, + "loss": 0.8368, + "step": 91 + }, + { + "epoch": 0.1472, + "grad_norm": 1.1337857151472512, + "learning_rate": 0.0001929241151313108, + "loss": 0.6706, + "step": 92 + }, + { + "epoch": 0.1488, + "grad_norm": 1.135855789118863, + "learning_rate": 0.00019273132668821364, + "loss": 0.7744, + "step": 93 + }, + { + "epoch": 0.1504, + "grad_norm": 1.3569335916233816, + "learning_rate": 0.00019253604606172417, + "loss": 0.8503, + "step": 94 + }, + { + "epoch": 0.152, + "grad_norm": 1.1437156491959917, + "learning_rate": 0.00019233827850007027, + "loss": 0.7745, + "step": 95 + }, + { + "epoch": 0.1536, + "grad_norm": 1.113672148884586, + "learning_rate": 0.00019213802931831696, + "loss": 0.727, + "step": 96 + }, + { + "epoch": 0.1552, + "grad_norm": 1.0071472374514407, + "learning_rate": 0.00019193530389822363, + "loss": 0.7332, + "step": 97 + }, + { + "epoch": 0.1568, + "grad_norm": 1.1103465364736207, + "learning_rate": 0.00019173010768809933, + "loss": 0.8267, + "step": 98 + }, + { + "epoch": 0.1584, + "grad_norm": 1.0693141101920882, + "learning_rate": 0.0001915224462026563, + "loss": 0.779, + "step": 99 + }, + { + "epoch": 0.16, + "grad_norm": 0.9573192627909511, + "learning_rate": 0.00019131232502286188, + "loss": 0.6754, + "step": 100 + }, + { + "epoch": 0.1616, + "grad_norm": 1.056695179414592, + "learning_rate": 0.0001910997497957885, + "loss": 0.7361, + "step": 101 + }, + { + "epoch": 0.1632, + "grad_norm": 1.1824410547847057, + "learning_rate": 0.00019088472623446183, + "loss": 0.7389, + "step": 102 + }, + { + "epoch": 0.1648, + "grad_norm": 1.240303615114854, + "learning_rate": 0.00019066726011770726, + "loss": 0.8205, + "step": 103 + }, + { + "epoch": 0.1664, + "grad_norm": 1.0851520282652303, + "learning_rate": 0.0001904473572899947, + "loss": 0.7433, + "step": 104 + }, + { + "epoch": 0.168, + "grad_norm": 1.2824427242383798, + "learning_rate": 0.00019022502366128135, + "loss": 0.8288, + "step": 105 + }, + { + "epoch": 0.1696, + "grad_norm": 1.2027009179273351, + "learning_rate": 0.00019000026520685302, + "loss": 0.8518, + "step": 106 + }, + { + "epoch": 0.1712, + "grad_norm": 1.0018538317795969, + "learning_rate": 0.0001897730879671634, + "loss": 0.7416, + "step": 107 + }, + { + "epoch": 0.1728, + "grad_norm": 1.0773970729250346, + "learning_rate": 0.00018954349804767184, + "loss": 0.7264, + "step": 108 + }, + { + "epoch": 0.1744, + "grad_norm": 1.0385112139274224, + "learning_rate": 0.00018931150161867916, + "loss": 0.7199, + "step": 109 + }, + { + "epoch": 0.176, + "grad_norm": 1.1202961144635684, + "learning_rate": 0.00018907710491516199, + "loss": 0.7759, + "step": 110 + }, + { + "epoch": 0.1776, + "grad_norm": 1.10002835715013, + "learning_rate": 0.0001888403142366049, + "loss": 0.7672, + "step": 111 + }, + { + "epoch": 0.1792, + "grad_norm": 1.1523630454468816, + "learning_rate": 0.00018860113594683148, + "loss": 0.782, + "step": 112 + }, + { + "epoch": 0.1808, + "grad_norm": 1.2910603290325495, + "learning_rate": 0.00018835957647383303, + "loss": 0.8601, + "step": 113 + }, + { + "epoch": 0.1824, + "grad_norm": 1.0313609737226423, + "learning_rate": 0.00018811564230959588, + "loss": 0.7411, + "step": 114 + }, + { + "epoch": 0.184, + "grad_norm": 1.2011834969161035, + "learning_rate": 0.00018786934000992688, + "loss": 0.8854, + "step": 115 + }, + { + "epoch": 0.1856, + "grad_norm": 1.0578982681367388, + "learning_rate": 0.00018762067619427746, + "loss": 0.7166, + "step": 116 + }, + { + "epoch": 0.1872, + "grad_norm": 1.126190003093847, + "learning_rate": 0.00018736965754556528, + "loss": 0.728, + "step": 117 + }, + { + "epoch": 0.1888, + "grad_norm": 1.1364729312140345, + "learning_rate": 0.00018711629080999504, + "loss": 0.7879, + "step": 118 + }, + { + "epoch": 0.1904, + "grad_norm": 0.9556525935880364, + "learning_rate": 0.00018686058279687698, + "loss": 0.7246, + "step": 119 + }, + { + "epoch": 0.192, + "grad_norm": 1.0169082348426133, + "learning_rate": 0.00018660254037844388, + "loss": 0.692, + "step": 120 + }, + { + "epoch": 0.1936, + "grad_norm": 1.1399661009178772, + "learning_rate": 0.00018634217048966637, + "loss": 0.8633, + "step": 121 + }, + { + "epoch": 0.1952, + "grad_norm": 1.025597618650162, + "learning_rate": 0.0001860794801280666, + "loss": 0.7531, + "step": 122 + }, + { + "epoch": 0.1968, + "grad_norm": 1.1140897475974443, + "learning_rate": 0.0001858144763535302, + "loss": 0.7009, + "step": 123 + }, + { + "epoch": 0.1984, + "grad_norm": 0.989635599878665, + "learning_rate": 0.0001855471662881164, + "loss": 0.7085, + "step": 124 + }, + { + "epoch": 0.2, + "grad_norm": 0.9861571045739278, + "learning_rate": 0.00018527755711586678, + "loss": 0.7402, + "step": 125 + }, + { + "epoch": 0.2016, + "grad_norm": 1.1280775918681303, + "learning_rate": 0.00018500565608261214, + "loss": 0.7753, + "step": 126 + }, + { + "epoch": 0.2032, + "grad_norm": 0.970219215323538, + "learning_rate": 0.00018473147049577774, + "loss": 0.6952, + "step": 127 + }, + { + "epoch": 0.2048, + "grad_norm": 1.0634205518795208, + "learning_rate": 0.00018445500772418697, + "loss": 0.9528, + "step": 128 + }, + { + "epoch": 0.2064, + "grad_norm": 0.9079606051649208, + "learning_rate": 0.00018417627519786315, + "loss": 0.6912, + "step": 129 + }, + { + "epoch": 0.208, + "grad_norm": 1.0830255428134676, + "learning_rate": 0.00018389528040783012, + "loss": 0.7586, + "step": 130 + }, + { + "epoch": 0.2096, + "grad_norm": 0.9682713423800534, + "learning_rate": 0.00018361203090591071, + "loss": 0.7374, + "step": 131 + }, + { + "epoch": 0.2112, + "grad_norm": 0.9778161706645351, + "learning_rate": 0.00018332653430452376, + "loss": 0.675, + "step": 132 + }, + { + "epoch": 0.2128, + "grad_norm": 1.1138474318789258, + "learning_rate": 0.00018303879827647975, + "loss": 0.8055, + "step": 133 + }, + { + "epoch": 0.2144, + "grad_norm": 0.9632096169407617, + "learning_rate": 0.00018274883055477436, + "loss": 0.7298, + "step": 134 + }, + { + "epoch": 0.216, + "grad_norm": 1.09864313311594, + "learning_rate": 0.00018245663893238075, + "loss": 0.7825, + "step": 135 + }, + { + "epoch": 0.2176, + "grad_norm": 1.0390210658058932, + "learning_rate": 0.00018216223126204007, + "loss": 0.6806, + "step": 136 + }, + { + "epoch": 0.2192, + "grad_norm": 1.2086212594832617, + "learning_rate": 0.00018186561545605054, + "loss": 0.7926, + "step": 137 + }, + { + "epoch": 0.2208, + "grad_norm": 1.0230209515691793, + "learning_rate": 0.00018156679948605467, + "loss": 0.6909, + "step": 138 + }, + { + "epoch": 0.2224, + "grad_norm": 1.1026368694000623, + "learning_rate": 0.00018126579138282503, + "loss": 0.7129, + "step": 139 + }, + { + "epoch": 0.224, + "grad_norm": 1.2764382720926033, + "learning_rate": 0.0001809625992360485, + "loss": 0.7805, + "step": 140 + }, + { + "epoch": 0.2256, + "grad_norm": 1.005584536935183, + "learning_rate": 0.00018065723119410884, + "loss": 0.6905, + "step": 141 + }, + { + "epoch": 0.2272, + "grad_norm": 1.1944269380445902, + "learning_rate": 0.00018034969546386757, + "loss": 0.9269, + "step": 142 + }, + { + "epoch": 0.2288, + "grad_norm": 1.0683837257803421, + "learning_rate": 0.0001800400003104436, + "loss": 0.7768, + "step": 143 + }, + { + "epoch": 0.2304, + "grad_norm": 1.445206274329109, + "learning_rate": 0.00017972815405699103, + "loss": 0.7828, + "step": 144 + }, + { + "epoch": 0.232, + "grad_norm": 0.9845662569670789, + "learning_rate": 0.00017941416508447536, + "loss": 0.7021, + "step": 145 + }, + { + "epoch": 0.2336, + "grad_norm": 0.9426691715777189, + "learning_rate": 0.0001790980418314484, + "loss": 0.7166, + "step": 146 + }, + { + "epoch": 0.2352, + "grad_norm": 1.095566507890165, + "learning_rate": 0.00017877979279382135, + "loss": 0.7469, + "step": 147 + }, + { + "epoch": 0.2368, + "grad_norm": 1.0000912230334456, + "learning_rate": 0.0001784594265246366, + "loss": 0.7413, + "step": 148 + }, + { + "epoch": 0.2384, + "grad_norm": 0.9985260964561642, + "learning_rate": 0.0001781369516338378, + "loss": 0.7222, + "step": 149 + }, + { + "epoch": 0.24, + "grad_norm": 1.0365770710824433, + "learning_rate": 0.00017781237678803847, + "loss": 0.6543, + "step": 150 + }, + { + "epoch": 0.2416, + "grad_norm": 1.0560049830637563, + "learning_rate": 0.000177485710710289, + "loss": 0.6799, + "step": 151 + }, + { + "epoch": 0.2432, + "grad_norm": 1.2285475773918355, + "learning_rate": 0.00017715696217984235, + "loss": 0.6613, + "step": 152 + }, + { + "epoch": 0.2448, + "grad_norm": 1.1948610951408163, + "learning_rate": 0.00017682614003191807, + "loss": 0.7119, + "step": 153 + }, + { + "epoch": 0.2464, + "grad_norm": 1.1392864200733446, + "learning_rate": 0.00017649325315746478, + "loss": 0.7611, + "step": 154 + }, + { + "epoch": 0.248, + "grad_norm": 1.026286407883845, + "learning_rate": 0.0001761583105029213, + "loss": 0.78, + "step": 155 + }, + { + "epoch": 0.2496, + "grad_norm": 1.0817341923761168, + "learning_rate": 0.00017582132106997616, + "loss": 0.803, + "step": 156 + }, + { + "epoch": 0.2512, + "grad_norm": 1.098411059588056, + "learning_rate": 0.00017548229391532572, + "loss": 0.8301, + "step": 157 + }, + { + "epoch": 0.2528, + "grad_norm": 1.0307802416356842, + "learning_rate": 0.00017514123815043074, + "loss": 0.7403, + "step": 158 + }, + { + "epoch": 0.2544, + "grad_norm": 0.9227971563943753, + "learning_rate": 0.00017479816294127152, + "loss": 0.6352, + "step": 159 + }, + { + "epoch": 0.256, + "grad_norm": 0.9404002808142968, + "learning_rate": 0.0001744530775081015, + "loss": 0.707, + "step": 160 + }, + { + "epoch": 0.2576, + "grad_norm": 0.9868828966467482, + "learning_rate": 0.0001741059911251997, + "loss": 0.7391, + "step": 161 + }, + { + "epoch": 0.2592, + "grad_norm": 1.208640381494263, + "learning_rate": 0.000173756913120621, + "loss": 0.8004, + "step": 162 + }, + { + "epoch": 0.2608, + "grad_norm": 1.3273546340028264, + "learning_rate": 0.00017340585287594604, + "loss": 0.7896, + "step": 163 + }, + { + "epoch": 0.2624, + "grad_norm": 1.1911031665705774, + "learning_rate": 0.0001730528198260285, + "loss": 0.8223, + "step": 164 + }, + { + "epoch": 0.264, + "grad_norm": 1.098678240240061, + "learning_rate": 0.00017269782345874203, + "loss": 0.6788, + "step": 165 + }, + { + "epoch": 0.2656, + "grad_norm": 1.1857382862897203, + "learning_rate": 0.00017234087331472497, + "loss": 0.7293, + "step": 166 + }, + { + "epoch": 0.2672, + "grad_norm": 1.007216283110517, + "learning_rate": 0.00017198197898712404, + "loss": 0.7151, + "step": 167 + }, + { + "epoch": 0.2688, + "grad_norm": 1.264950302161379, + "learning_rate": 0.00017162115012133643, + "loss": 0.772, + "step": 168 + }, + { + "epoch": 0.2704, + "grad_norm": 0.982465076866397, + "learning_rate": 0.00017125839641475072, + "loss": 0.6562, + "step": 169 + }, + { + "epoch": 0.272, + "grad_norm": 1.0399829085179315, + "learning_rate": 0.00017089372761648616, + "loss": 0.7398, + "step": 170 + }, + { + "epoch": 0.2736, + "grad_norm": 0.8972089218935757, + "learning_rate": 0.00017052715352713075, + "loss": 0.6605, + "step": 171 + }, + { + "epoch": 0.2752, + "grad_norm": 1.0072069220509277, + "learning_rate": 0.00017015868399847768, + "loss": 0.7291, + "step": 172 + }, + { + "epoch": 0.2768, + "grad_norm": 1.1137214709250107, + "learning_rate": 0.00016978832893326074, + "loss": 0.7609, + "step": 173 + }, + { + "epoch": 0.2784, + "grad_norm": 1.0323488374422616, + "learning_rate": 0.00016941609828488807, + "loss": 0.7059, + "step": 174 + }, + { + "epoch": 0.28, + "grad_norm": 1.1455282929819348, + "learning_rate": 0.0001690420020571747, + "loss": 0.7978, + "step": 175 + }, + { + "epoch": 0.2816, + "grad_norm": 1.2518720150891187, + "learning_rate": 0.0001686660503040737, + "loss": 0.7756, + "step": 176 + }, + { + "epoch": 0.2832, + "grad_norm": 1.139310258657853, + "learning_rate": 0.00016828825312940592, + "loss": 0.8073, + "step": 177 + }, + { + "epoch": 0.2848, + "grad_norm": 1.0180182174487173, + "learning_rate": 0.0001679086206865886, + "loss": 0.6822, + "step": 178 + }, + { + "epoch": 0.2864, + "grad_norm": 1.0863077091069504, + "learning_rate": 0.00016752716317836229, + "loss": 0.7754, + "step": 179 + }, + { + "epoch": 0.288, + "grad_norm": 0.9798952036583372, + "learning_rate": 0.0001671438908565167, + "loss": 0.7036, + "step": 180 + }, + { + "epoch": 0.2896, + "grad_norm": 1.0702131643504458, + "learning_rate": 0.00016675881402161536, + "loss": 0.7164, + "step": 181 + }, + { + "epoch": 0.2912, + "grad_norm": 1.1092023425287232, + "learning_rate": 0.0001663719430227186, + "loss": 0.7876, + "step": 182 + }, + { + "epoch": 0.2928, + "grad_norm": 0.9681477457769564, + "learning_rate": 0.00016598328825710533, + "loss": 0.7024, + "step": 183 + }, + { + "epoch": 0.2944, + "grad_norm": 1.0777622448498192, + "learning_rate": 0.000165592860169994, + "loss": 0.7934, + "step": 184 + }, + { + "epoch": 0.296, + "grad_norm": 1.0378556048281868, + "learning_rate": 0.00016520066925426144, + "loss": 0.6783, + "step": 185 + }, + { + "epoch": 0.2976, + "grad_norm": 1.078799504586605, + "learning_rate": 0.0001648067260501611, + "loss": 0.7475, + "step": 186 + }, + { + "epoch": 0.2992, + "grad_norm": 0.9880842745377312, + "learning_rate": 0.0001644110411450398, + "loss": 0.6907, + "step": 187 + }, + { + "epoch": 0.3008, + "grad_norm": 1.3298912447392872, + "learning_rate": 0.00016401362517305296, + "loss": 0.7766, + "step": 188 + }, + { + "epoch": 0.3024, + "grad_norm": 1.0478723955052052, + "learning_rate": 0.00016361448881487914, + "loss": 0.6743, + "step": 189 + }, + { + "epoch": 0.304, + "grad_norm": 1.082513016613253, + "learning_rate": 0.00016321364279743266, + "loss": 0.7341, + "step": 190 + }, + { + "epoch": 0.3056, + "grad_norm": 1.1597793053417285, + "learning_rate": 0.0001628110978935756, + "loss": 0.7885, + "step": 191 + }, + { + "epoch": 0.3072, + "grad_norm": 1.0514479318888261, + "learning_rate": 0.00016240686492182804, + "loss": 0.7331, + "step": 192 + }, + { + "epoch": 0.3088, + "grad_norm": 1.0601751982528507, + "learning_rate": 0.00016200095474607753, + "loss": 0.6553, + "step": 193 + }, + { + "epoch": 0.3104, + "grad_norm": 1.0319484697425518, + "learning_rate": 0.00016159337827528685, + "loss": 0.7605, + "step": 194 + }, + { + "epoch": 0.312, + "grad_norm": 1.012216598945127, + "learning_rate": 0.0001611841464632011, + "loss": 0.6985, + "step": 195 + }, + { + "epoch": 0.3136, + "grad_norm": 1.0507711581883221, + "learning_rate": 0.0001607732703080532, + "loss": 0.7649, + "step": 196 + }, + { + "epoch": 0.3152, + "grad_norm": 1.1102733037788175, + "learning_rate": 0.00016036076085226814, + "loss": 0.8312, + "step": 197 + }, + { + "epoch": 0.3168, + "grad_norm": 1.1723596161929746, + "learning_rate": 0.0001599466291821666, + "loss": 0.712, + "step": 198 + }, + { + "epoch": 0.3184, + "grad_norm": 1.0108375187081908, + "learning_rate": 0.0001595308864276666, + "loss": 0.7188, + "step": 199 + }, + { + "epoch": 0.32, + "grad_norm": 0.9392203267720511, + "learning_rate": 0.0001591135437619847, + "loss": 0.6394, + "step": 200 + }, + { + "epoch": 0.3216, + "grad_norm": 1.150752101244234, + "learning_rate": 0.0001586946124013354, + "loss": 0.7492, + "step": 201 + }, + { + "epoch": 0.3232, + "grad_norm": 0.9587303154259936, + "learning_rate": 0.0001582741036046301, + "loss": 0.6729, + "step": 202 + }, + { + "epoch": 0.3248, + "grad_norm": 0.9855902546008898, + "learning_rate": 0.00015785202867317407, + "loss": 0.633, + "step": 203 + }, + { + "epoch": 0.3264, + "grad_norm": 1.060546855777705, + "learning_rate": 0.00015742839895036305, + "loss": 0.6774, + "step": 204 + }, + { + "epoch": 0.328, + "grad_norm": 1.0355711807123962, + "learning_rate": 0.00015700322582137827, + "loss": 0.7186, + "step": 205 + }, + { + "epoch": 0.3296, + "grad_norm": 1.1326202360080857, + "learning_rate": 0.0001565765207128805, + "loss": 0.7048, + "step": 206 + }, + { + "epoch": 0.3312, + "grad_norm": 1.119484491703591, + "learning_rate": 0.0001561482950927029, + "loss": 0.7221, + "step": 207 + }, + { + "epoch": 0.3328, + "grad_norm": 1.0831780159710254, + "learning_rate": 0.00015571856046954285, + "loss": 0.6418, + "step": 208 + }, + { + "epoch": 0.3344, + "grad_norm": 1.0552261943119159, + "learning_rate": 0.00015528732839265272, + "loss": 0.7194, + "step": 209 + }, + { + "epoch": 0.336, + "grad_norm": 1.0329416858025888, + "learning_rate": 0.0001548546104515294, + "loss": 0.6308, + "step": 210 + }, + { + "epoch": 0.3376, + "grad_norm": 1.0549094744987852, + "learning_rate": 0.00015442041827560274, + "loss": 0.7516, + "step": 211 + }, + { + "epoch": 0.3392, + "grad_norm": 1.206286897863987, + "learning_rate": 0.00015398476353392323, + "loss": 0.6335, + "step": 212 + }, + { + "epoch": 0.3408, + "grad_norm": 0.9668575563358071, + "learning_rate": 0.00015354765793484834, + "loss": 0.6736, + "step": 213 + }, + { + "epoch": 0.3424, + "grad_norm": 1.1902121758328896, + "learning_rate": 0.00015310911322572753, + "loss": 0.7796, + "step": 214 + }, + { + "epoch": 0.344, + "grad_norm": 1.035141759747216, + "learning_rate": 0.000152669141192587, + "loss": 0.7214, + "step": 215 + }, + { + "epoch": 0.3456, + "grad_norm": 1.0424215694596601, + "learning_rate": 0.00015222775365981273, + "loss": 0.7808, + "step": 216 + }, + { + "epoch": 0.3472, + "grad_norm": 0.9479099900527113, + "learning_rate": 0.00015178496248983254, + "loss": 0.627, + "step": 217 + }, + { + "epoch": 0.3488, + "grad_norm": 1.1020536345657894, + "learning_rate": 0.00015134077958279765, + "loss": 0.7508, + "step": 218 + }, + { + "epoch": 0.3504, + "grad_norm": 0.9970714751839678, + "learning_rate": 0.00015089521687626243, + "loss": 0.779, + "step": 219 + }, + { + "epoch": 0.352, + "grad_norm": 1.2921192337162992, + "learning_rate": 0.000150448286344864, + "loss": 0.7891, + "step": 220 + }, + { + "epoch": 0.3536, + "grad_norm": 1.0558980245055478, + "learning_rate": 0.00015000000000000001, + "loss": 0.8122, + "step": 221 + }, + { + "epoch": 0.3552, + "grad_norm": 1.0805873429862258, + "learning_rate": 0.00014955036988950618, + "loss": 0.8215, + "step": 222 + }, + { + "epoch": 0.3568, + "grad_norm": 0.9248577857440794, + "learning_rate": 0.00014909940809733222, + "loss": 0.592, + "step": 223 + }, + { + "epoch": 0.3584, + "grad_norm": 1.123236762677146, + "learning_rate": 0.00014864712674321734, + "loss": 0.7329, + "step": 224 + }, + { + "epoch": 0.36, + "grad_norm": 0.895888853884425, + "learning_rate": 0.00014819353798236427, + "loss": 0.604, + "step": 225 + }, + { + "epoch": 0.3616, + "grad_norm": 1.0126898243531504, + "learning_rate": 0.00014773865400511272, + "loss": 0.7117, + "step": 226 + }, + { + "epoch": 0.3632, + "grad_norm": 1.090178683291668, + "learning_rate": 0.00014728248703661182, + "loss": 0.7499, + "step": 227 + }, + { + "epoch": 0.3648, + "grad_norm": 1.0905080067559638, + "learning_rate": 0.00014682504933649144, + "loss": 0.7259, + "step": 228 + }, + { + "epoch": 0.3664, + "grad_norm": 1.2249133092092843, + "learning_rate": 0.00014636635319853275, + "loss": 0.8601, + "step": 229 + }, + { + "epoch": 0.368, + "grad_norm": 1.2319777842616844, + "learning_rate": 0.00014590641095033787, + "loss": 0.5961, + "step": 230 + }, + { + "epoch": 0.3696, + "grad_norm": 1.0604549084816592, + "learning_rate": 0.00014544523495299842, + "loss": 0.6648, + "step": 231 + }, + { + "epoch": 0.3712, + "grad_norm": 1.042442851312364, + "learning_rate": 0.0001449828376007636, + "loss": 0.6882, + "step": 232 + }, + { + "epoch": 0.3728, + "grad_norm": 0.975840517205056, + "learning_rate": 0.0001445192313207067, + "loss": 0.6608, + "step": 233 + }, + { + "epoch": 0.3744, + "grad_norm": 1.102072511804385, + "learning_rate": 0.0001440544285723915, + "loss": 0.7783, + "step": 234 + }, + { + "epoch": 0.376, + "grad_norm": 1.0868007224800307, + "learning_rate": 0.00014358844184753712, + "loss": 0.7766, + "step": 235 + }, + { + "epoch": 0.3776, + "grad_norm": 1.1404920509896797, + "learning_rate": 0.00014312128366968243, + "loss": 0.7486, + "step": 236 + }, + { + "epoch": 0.3792, + "grad_norm": 1.14229321612636, + "learning_rate": 0.00014265296659384956, + "loss": 0.7688, + "step": 237 + }, + { + "epoch": 0.3808, + "grad_norm": 0.8648821946642871, + "learning_rate": 0.00014218350320620624, + "loss": 0.6948, + "step": 238 + }, + { + "epoch": 0.3824, + "grad_norm": 0.9551010631354665, + "learning_rate": 0.0001417129061237278, + "loss": 0.7006, + "step": 239 + }, + { + "epoch": 0.384, + "grad_norm": 0.9608900542742783, + "learning_rate": 0.00014124118799385796, + "loss": 0.7046, + "step": 240 + }, + { + "epoch": 0.3856, + "grad_norm": 0.9590920777826683, + "learning_rate": 0.00014076836149416887, + "loss": 0.6704, + "step": 241 + }, + { + "epoch": 0.3872, + "grad_norm": 1.0230550565367582, + "learning_rate": 0.0001402944393320206, + "loss": 0.735, + "step": 242 + }, + { + "epoch": 0.3888, + "grad_norm": 1.0291519546724859, + "learning_rate": 0.00013981943424421932, + "loss": 0.8103, + "step": 243 + }, + { + "epoch": 0.3904, + "grad_norm": 0.9339980403141241, + "learning_rate": 0.00013934335899667527, + "loss": 0.6413, + "step": 244 + }, + { + "epoch": 0.392, + "grad_norm": 1.0511443805713403, + "learning_rate": 0.00013886622638405952, + "loss": 0.7237, + "step": 245 + }, + { + "epoch": 0.3936, + "grad_norm": 1.0922518145677038, + "learning_rate": 0.00013838804922946027, + "loss": 0.7535, + "step": 246 + }, + { + "epoch": 0.3952, + "grad_norm": 1.2553331397373684, + "learning_rate": 0.00013790884038403795, + "loss": 0.7583, + "step": 247 + }, + { + "epoch": 0.3968, + "grad_norm": 0.9743102635915039, + "learning_rate": 0.00013742861272668012, + "loss": 0.6881, + "step": 248 + }, + { + "epoch": 0.3984, + "grad_norm": 1.1036394719223697, + "learning_rate": 0.00013694737916365517, + "loss": 0.8104, + "step": 249 + }, + { + "epoch": 0.4, + "grad_norm": 0.9788220277837504, + "learning_rate": 0.00013646515262826552, + "loss": 0.7212, + "step": 250 + }, + { + "epoch": 0.4016, + "grad_norm": 0.960277760082036, + "learning_rate": 0.0001359819460805001, + "loss": 0.6453, + "step": 251 + }, + { + "epoch": 0.4032, + "grad_norm": 0.9458327687010233, + "learning_rate": 0.0001354977725066859, + "loss": 0.668, + "step": 252 + }, + { + "epoch": 0.4048, + "grad_norm": 0.9747996311203249, + "learning_rate": 0.00013501264491913906, + "loss": 0.6374, + "step": 253 + }, + { + "epoch": 0.4064, + "grad_norm": 0.9456870504677144, + "learning_rate": 0.0001345265763558152, + "loss": 0.6517, + "step": 254 + }, + { + "epoch": 0.408, + "grad_norm": 0.8625374012779374, + "learning_rate": 0.00013403957987995882, + "loss": 0.5754, + "step": 255 + }, + { + "epoch": 0.4096, + "grad_norm": 0.9080357709864995, + "learning_rate": 0.0001335516685797525, + "loss": 0.6524, + "step": 256 + }, + { + "epoch": 0.4112, + "grad_norm": 1.075255370157194, + "learning_rate": 0.00013306285556796495, + "loss": 0.8184, + "step": 257 + }, + { + "epoch": 0.4128, + "grad_norm": 0.9120963289083652, + "learning_rate": 0.00013257315398159864, + "loss": 0.6736, + "step": 258 + }, + { + "epoch": 0.4144, + "grad_norm": 0.8977532308272703, + "learning_rate": 0.00013208257698153677, + "loss": 0.5864, + "step": 259 + }, + { + "epoch": 0.416, + "grad_norm": 1.056234611336399, + "learning_rate": 0.00013159113775218964, + "loss": 0.7144, + "step": 260 + }, + { + "epoch": 0.4176, + "grad_norm": 0.9401494515757224, + "learning_rate": 0.00013109884950114007, + "loss": 0.6107, + "step": 261 + }, + { + "epoch": 0.4192, + "grad_norm": 0.9707218164731339, + "learning_rate": 0.00013060572545878875, + "loss": 0.6492, + "step": 262 + }, + { + "epoch": 0.4208, + "grad_norm": 1.3642147649660619, + "learning_rate": 0.00013011177887799845, + "loss": 0.8226, + "step": 263 + }, + { + "epoch": 0.4224, + "grad_norm": 0.9492156617664067, + "learning_rate": 0.00012961702303373795, + "loss": 0.6348, + "step": 264 + }, + { + "epoch": 0.424, + "grad_norm": 0.9818255136201256, + "learning_rate": 0.00012912147122272523, + "loss": 0.6432, + "step": 265 + }, + { + "epoch": 0.4256, + "grad_norm": 0.9075058520846708, + "learning_rate": 0.00012862513676307008, + "loss": 0.6809, + "step": 266 + }, + { + "epoch": 0.4272, + "grad_norm": 0.9161334178248897, + "learning_rate": 0.00012812803299391628, + "loss": 0.639, + "step": 267 + }, + { + "epoch": 0.4288, + "grad_norm": 1.0330443042304416, + "learning_rate": 0.00012763017327508305, + "loss": 0.6901, + "step": 268 + }, + { + "epoch": 0.4304, + "grad_norm": 1.0947898284412185, + "learning_rate": 0.0001271315709867059, + "loss": 0.7464, + "step": 269 + }, + { + "epoch": 0.432, + "grad_norm": 1.004213113420765, + "learning_rate": 0.00012663223952887723, + "loss": 0.7511, + "step": 270 + }, + { + "epoch": 0.4336, + "grad_norm": 0.9347148068533467, + "learning_rate": 0.00012613219232128608, + "loss": 0.6544, + "step": 271 + }, + { + "epoch": 0.4352, + "grad_norm": 0.9978120044481448, + "learning_rate": 0.00012563144280285741, + "loss": 0.6873, + "step": 272 + }, + { + "epoch": 0.4368, + "grad_norm": 0.9809643859106156, + "learning_rate": 0.00012513000443139112, + "loss": 0.6544, + "step": 273 + }, + { + "epoch": 0.4384, + "grad_norm": 1.0029170228266115, + "learning_rate": 0.00012462789068320017, + "loss": 0.6255, + "step": 274 + }, + { + "epoch": 0.44, + "grad_norm": 0.9417083512697166, + "learning_rate": 0.00012412511505274844, + "loss": 0.6818, + "step": 275 + }, + { + "epoch": 0.4416, + "grad_norm": 1.111134559079807, + "learning_rate": 0.00012362169105228826, + "loss": 0.785, + "step": 276 + }, + { + "epoch": 0.4432, + "grad_norm": 0.869568952356725, + "learning_rate": 0.000123117632211497, + "loss": 0.6143, + "step": 277 + }, + { + "epoch": 0.4448, + "grad_norm": 1.1588538526446959, + "learning_rate": 0.00012261295207711346, + "loss": 0.7077, + "step": 278 + }, + { + "epoch": 0.4464, + "grad_norm": 0.8564976410122681, + "learning_rate": 0.0001221076642125742, + "loss": 0.5817, + "step": 279 + }, + { + "epoch": 0.448, + "grad_norm": 0.9835681341640148, + "learning_rate": 0.00012160178219764837, + "loss": 0.675, + "step": 280 + }, + { + "epoch": 0.4496, + "grad_norm": 0.9661752945573999, + "learning_rate": 0.00012109531962807332, + "loss": 0.6143, + "step": 281 + }, + { + "epoch": 0.4512, + "grad_norm": 0.9771623452698963, + "learning_rate": 0.00012058829011518896, + "loss": 0.6958, + "step": 282 + }, + { + "epoch": 0.4528, + "grad_norm": 1.044800176172783, + "learning_rate": 0.00012008070728557186, + "loss": 0.8356, + "step": 283 + }, + { + "epoch": 0.4544, + "grad_norm": 0.9774178207640642, + "learning_rate": 0.00011957258478066931, + "loss": 0.6511, + "step": 284 + }, + { + "epoch": 0.456, + "grad_norm": 0.8994286932155998, + "learning_rate": 0.00011906393625643244, + "loss": 0.6797, + "step": 285 + }, + { + "epoch": 0.4576, + "grad_norm": 1.0613102619576853, + "learning_rate": 0.00011855477538294935, + "loss": 0.7288, + "step": 286 + }, + { + "epoch": 0.4592, + "grad_norm": 0.9167212286755693, + "learning_rate": 0.00011804511584407763, + "loss": 0.6564, + "step": 287 + }, + { + "epoch": 0.4608, + "grad_norm": 0.925114318133125, + "learning_rate": 0.00011753497133707679, + "loss": 0.6213, + "step": 288 + }, + { + "epoch": 0.4624, + "grad_norm": 0.8982728232717752, + "learning_rate": 0.00011702435557223987, + "loss": 0.6834, + "step": 289 + }, + { + "epoch": 0.464, + "grad_norm": 0.9549955696620802, + "learning_rate": 0.00011651328227252517, + "loss": 0.5918, + "step": 290 + }, + { + "epoch": 0.4656, + "grad_norm": 1.1297928094792418, + "learning_rate": 0.00011600176517318741, + "loss": 0.7396, + "step": 291 + }, + { + "epoch": 0.4672, + "grad_norm": 0.8772073437397565, + "learning_rate": 0.00011548981802140848, + "loss": 0.6218, + "step": 292 + }, + { + "epoch": 0.4688, + "grad_norm": 0.9165917991301511, + "learning_rate": 0.00011497745457592816, + "loss": 0.5938, + "step": 293 + }, + { + "epoch": 0.4704, + "grad_norm": 0.9603735838232083, + "learning_rate": 0.00011446468860667421, + "loss": 0.6543, + "step": 294 + }, + { + "epoch": 0.472, + "grad_norm": 0.9506735479947094, + "learning_rate": 0.00011395153389439233, + "loss": 0.6704, + "step": 295 + }, + { + "epoch": 0.4736, + "grad_norm": 1.1405296151018007, + "learning_rate": 0.00011343800423027582, + "loss": 0.7492, + "step": 296 + }, + { + "epoch": 0.4752, + "grad_norm": 0.948725655617116, + "learning_rate": 0.0001129241134155949, + "loss": 0.5824, + "step": 297 + }, + { + "epoch": 0.4768, + "grad_norm": 0.8840195525822759, + "learning_rate": 0.00011240987526132594, + "loss": 0.5742, + "step": 298 + }, + { + "epoch": 0.4784, + "grad_norm": 1.0050823635292419, + "learning_rate": 0.00011189530358778005, + "loss": 0.7589, + "step": 299 + }, + { + "epoch": 0.48, + "grad_norm": 0.9518152908863455, + "learning_rate": 0.00011138041222423177, + "loss": 0.6223, + "step": 300 + }, + { + "epoch": 0.4816, + "grad_norm": 0.9935413533356781, + "learning_rate": 0.00011086521500854745, + "loss": 0.6868, + "step": 301 + }, + { + "epoch": 0.4832, + "grad_norm": 0.9391071497672916, + "learning_rate": 0.00011034972578681338, + "loss": 0.67, + "step": 302 + }, + { + "epoch": 0.4848, + "grad_norm": 1.0111144448690341, + "learning_rate": 0.00010983395841296348, + "loss": 0.6036, + "step": 303 + }, + { + "epoch": 0.4864, + "grad_norm": 1.0528200546723232, + "learning_rate": 0.00010931792674840718, + "loss": 0.6703, + "step": 304 + }, + { + "epoch": 0.488, + "grad_norm": 0.9604242335429634, + "learning_rate": 0.00010880164466165674, + "loss": 0.6515, + "step": 305 + }, + { + "epoch": 0.4896, + "grad_norm": 0.9714302901929514, + "learning_rate": 0.00010828512602795462, + "loss": 0.7044, + "step": 306 + }, + { + "epoch": 0.4912, + "grad_norm": 0.891198807596005, + "learning_rate": 0.00010776838472890065, + "loss": 0.5525, + "step": 307 + }, + { + "epoch": 0.4928, + "grad_norm": 1.0040614684000009, + "learning_rate": 0.00010725143465207867, + "loss": 0.6712, + "step": 308 + }, + { + "epoch": 0.4944, + "grad_norm": 0.9502194665760952, + "learning_rate": 0.00010673428969068364, + "loss": 0.7171, + "step": 309 + }, + { + "epoch": 0.496, + "grad_norm": 0.9344284914557824, + "learning_rate": 0.00010621696374314807, + "loss": 0.5947, + "step": 310 + }, + { + "epoch": 0.4976, + "grad_norm": 1.5216884120567316, + "learning_rate": 0.00010569947071276847, + "loss": 0.7081, + "step": 311 + }, + { + "epoch": 0.4992, + "grad_norm": 1.0529099246063125, + "learning_rate": 0.00010518182450733186, + "loss": 0.7295, + "step": 312 + }, + { + "epoch": 0.5008, + "grad_norm": 0.9624795340361507, + "learning_rate": 0.00010466403903874176, + "loss": 0.6547, + "step": 313 + }, + { + "epoch": 0.5024, + "grad_norm": 1.0277158131759265, + "learning_rate": 0.00010414612822264455, + "loss": 0.6217, + "step": 314 + }, + { + "epoch": 0.504, + "grad_norm": 1.170824964889325, + "learning_rate": 0.00010362810597805526, + "loss": 0.7425, + "step": 315 + }, + { + "epoch": 0.5056, + "grad_norm": 0.9519262978283264, + "learning_rate": 0.0001031099862269837, + "loss": 0.7318, + "step": 316 + }, + { + "epoch": 0.5072, + "grad_norm": 0.9481035069089333, + "learning_rate": 0.00010259178289406011, + "loss": 0.6413, + "step": 317 + }, + { + "epoch": 0.5088, + "grad_norm": 1.2202753892971472, + "learning_rate": 0.00010207350990616107, + "loss": 0.5965, + "step": 318 + }, + { + "epoch": 0.5104, + "grad_norm": 0.8646568279176724, + "learning_rate": 0.0001015551811920351, + "loss": 0.6388, + "step": 319 + }, + { + "epoch": 0.512, + "grad_norm": 0.9370755125251587, + "learning_rate": 0.00010103681068192845, + "loss": 0.7069, + "step": 320 + }, + { + "epoch": 0.5136, + "grad_norm": 0.9558369774418428, + "learning_rate": 0.00010051841230721065, + "loss": 0.6096, + "step": 321 + }, + { + "epoch": 0.5152, + "grad_norm": 0.8798063213833051, + "learning_rate": 0.0001, + "loss": 0.5768, + "step": 322 + }, + { + "epoch": 0.5168, + "grad_norm": 1.0451492218020761, + "learning_rate": 9.948158769278939e-05, + "loss": 0.7366, + "step": 323 + }, + { + "epoch": 0.5184, + "grad_norm": 0.924152447792308, + "learning_rate": 9.896318931807155e-05, + "loss": 0.664, + "step": 324 + }, + { + "epoch": 0.52, + "grad_norm": 1.0744081909607641, + "learning_rate": 9.844481880796491e-05, + "loss": 0.7296, + "step": 325 + }, + { + "epoch": 0.5216, + "grad_norm": 0.955668756734273, + "learning_rate": 9.792649009383899e-05, + "loss": 0.6078, + "step": 326 + }, + { + "epoch": 0.5232, + "grad_norm": 1.3144636543595971, + "learning_rate": 9.740821710593989e-05, + "loss": 0.73, + "step": 327 + }, + { + "epoch": 0.5248, + "grad_norm": 1.0668093428299052, + "learning_rate": 9.689001377301633e-05, + "loss": 0.7291, + "step": 328 + }, + { + "epoch": 0.5264, + "grad_norm": 0.9512383602185149, + "learning_rate": 9.637189402194476e-05, + "loss": 0.6279, + "step": 329 + }, + { + "epoch": 0.528, + "grad_norm": 1.2922466129710943, + "learning_rate": 9.585387177735547e-05, + "loss": 0.7305, + "step": 330 + }, + { + "epoch": 0.5296, + "grad_norm": 1.0772051024912837, + "learning_rate": 9.533596096125825e-05, + "loss": 0.6187, + "step": 331 + }, + { + "epoch": 0.5312, + "grad_norm": 0.958020772759464, + "learning_rate": 9.481817549266817e-05, + "loss": 0.6412, + "step": 332 + }, + { + "epoch": 0.5328, + "grad_norm": 0.9069568866284937, + "learning_rate": 9.430052928723153e-05, + "loss": 0.5917, + "step": 333 + }, + { + "epoch": 0.5344, + "grad_norm": 0.9297065653874017, + "learning_rate": 9.378303625685195e-05, + "loss": 0.5462, + "step": 334 + }, + { + "epoch": 0.536, + "grad_norm": 0.9650420091823859, + "learning_rate": 9.326571030931637e-05, + "loss": 0.6332, + "step": 335 + }, + { + "epoch": 0.5376, + "grad_norm": 1.1380448305985547, + "learning_rate": 9.274856534792138e-05, + "loss": 0.6879, + "step": 336 + }, + { + "epoch": 0.5392, + "grad_norm": 0.924710104212599, + "learning_rate": 9.223161527109937e-05, + "loss": 0.6348, + "step": 337 + }, + { + "epoch": 0.5408, + "grad_norm": 1.0015805912872244, + "learning_rate": 9.171487397204539e-05, + "loss": 0.7113, + "step": 338 + }, + { + "epoch": 0.5424, + "grad_norm": 0.9384437671302087, + "learning_rate": 9.119835533834331e-05, + "loss": 0.6188, + "step": 339 + }, + { + "epoch": 0.544, + "grad_norm": 1.0604948892936632, + "learning_rate": 9.068207325159284e-05, + "loss": 0.7284, + "step": 340 + }, + { + "epoch": 0.5456, + "grad_norm": 0.9831510324855539, + "learning_rate": 9.016604158703654e-05, + "loss": 0.6808, + "step": 341 + }, + { + "epoch": 0.5472, + "grad_norm": 1.0069678330531393, + "learning_rate": 8.965027421318665e-05, + "loss": 0.5993, + "step": 342 + }, + { + "epoch": 0.5488, + "grad_norm": 0.9680924106629697, + "learning_rate": 8.913478499145254e-05, + "loss": 0.6227, + "step": 343 + }, + { + "epoch": 0.5504, + "grad_norm": 0.888854482469667, + "learning_rate": 8.861958777576827e-05, + "loss": 0.6456, + "step": 344 + }, + { + "epoch": 0.552, + "grad_norm": 0.9281177668939593, + "learning_rate": 8.810469641222001e-05, + "loss": 0.6306, + "step": 345 + }, + { + "epoch": 0.5536, + "grad_norm": 0.9751738708275449, + "learning_rate": 8.759012473867407e-05, + "loss": 0.6838, + "step": 346 + }, + { + "epoch": 0.5552, + "grad_norm": 0.9582206003918131, + "learning_rate": 8.707588658440511e-05, + "loss": 0.6507, + "step": 347 + }, + { + "epoch": 0.5568, + "grad_norm": 0.9001648649106497, + "learning_rate": 8.656199576972423e-05, + "loss": 0.6308, + "step": 348 + }, + { + "epoch": 0.5584, + "grad_norm": 0.9092857558878517, + "learning_rate": 8.604846610560771e-05, + "loss": 0.6, + "step": 349 + }, + { + "epoch": 0.56, + "grad_norm": 0.9636102854062633, + "learning_rate": 8.553531139332582e-05, + "loss": 0.7061, + "step": 350 + }, + { + "epoch": 0.5616, + "grad_norm": 0.9504775655843016, + "learning_rate": 8.502254542407186e-05, + "loss": 0.7104, + "step": 351 + }, + { + "epoch": 0.5632, + "grad_norm": 0.9057432352639376, + "learning_rate": 8.451018197859153e-05, + "loss": 0.5754, + "step": 352 + }, + { + "epoch": 0.5648, + "grad_norm": 1.0007313803850506, + "learning_rate": 8.399823482681262e-05, + "loss": 0.6785, + "step": 353 + }, + { + "epoch": 0.5664, + "grad_norm": 0.9177411475104688, + "learning_rate": 8.348671772747487e-05, + "loss": 0.5778, + "step": 354 + }, + { + "epoch": 0.568, + "grad_norm": 1.0005491489675047, + "learning_rate": 8.297564442776014e-05, + "loss": 0.665, + "step": 355 + }, + { + "epoch": 0.5696, + "grad_norm": 0.8983210808586486, + "learning_rate": 8.246502866292324e-05, + "loss": 0.5667, + "step": 356 + }, + { + "epoch": 0.5712, + "grad_norm": 0.9795152857862884, + "learning_rate": 8.195488415592238e-05, + "loss": 0.6444, + "step": 357 + }, + { + "epoch": 0.5728, + "grad_norm": 0.873584571853934, + "learning_rate": 8.144522461705067e-05, + "loss": 0.615, + "step": 358 + }, + { + "epoch": 0.5744, + "grad_norm": 0.9549504359740664, + "learning_rate": 8.093606374356759e-05, + "loss": 0.6288, + "step": 359 + }, + { + "epoch": 0.576, + "grad_norm": 0.9188240525915284, + "learning_rate": 8.042741521933071e-05, + "loss": 0.5946, + "step": 360 + }, + { + "epoch": 0.5776, + "grad_norm": 0.8607473920140348, + "learning_rate": 7.991929271442817e-05, + "loss": 0.5552, + "step": 361 + }, + { + "epoch": 0.5792, + "grad_norm": 0.8639807183747521, + "learning_rate": 7.941170988481108e-05, + "loss": 0.5721, + "step": 362 + }, + { + "epoch": 0.5808, + "grad_norm": 0.8850184972345249, + "learning_rate": 7.89046803719267e-05, + "loss": 0.5223, + "step": 363 + }, + { + "epoch": 0.5824, + "grad_norm": 2.1591022434661853, + "learning_rate": 7.839821780235168e-05, + "loss": 0.5993, + "step": 364 + }, + { + "epoch": 0.584, + "grad_norm": 1.1043226783052784, + "learning_rate": 7.789233578742582e-05, + "loss": 0.7933, + "step": 365 + }, + { + "epoch": 0.5856, + "grad_norm": 1.0884233624626425, + "learning_rate": 7.738704792288655e-05, + "loss": 0.7357, + "step": 366 + }, + { + "epoch": 0.5872, + "grad_norm": 1.015113606562981, + "learning_rate": 7.688236778850306e-05, + "loss": 0.6774, + "step": 367 + }, + { + "epoch": 0.5888, + "grad_norm": 1.0502589325922018, + "learning_rate": 7.637830894771175e-05, + "loss": 0.6794, + "step": 368 + }, + { + "epoch": 0.5904, + "grad_norm": 0.9900878921092949, + "learning_rate": 7.587488494725157e-05, + "loss": 0.6068, + "step": 369 + }, + { + "epoch": 0.592, + "grad_norm": 0.9497748309914926, + "learning_rate": 7.537210931679987e-05, + "loss": 0.5922, + "step": 370 + }, + { + "epoch": 0.5936, + "grad_norm": 0.9705843187607024, + "learning_rate": 7.48699955686089e-05, + "loss": 0.6093, + "step": 371 + }, + { + "epoch": 0.5952, + "grad_norm": 1.010373441513078, + "learning_rate": 7.43685571971426e-05, + "loss": 0.6703, + "step": 372 + }, + { + "epoch": 0.5968, + "grad_norm": 2.1607022629679125, + "learning_rate": 7.386780767871397e-05, + "loss": 0.5826, + "step": 373 + }, + { + "epoch": 0.5984, + "grad_norm": 0.9856272909655424, + "learning_rate": 7.336776047112276e-05, + "loss": 0.6254, + "step": 374 + }, + { + "epoch": 0.6, + "grad_norm": 0.9295906722233483, + "learning_rate": 7.286842901329412e-05, + "loss": 0.6555, + "step": 375 + }, + { + "epoch": 0.6016, + "grad_norm": 1.0457233381248927, + "learning_rate": 7.236982672491698e-05, + "loss": 0.6991, + "step": 376 + }, + { + "epoch": 0.6032, + "grad_norm": 1.0108760483023163, + "learning_rate": 7.187196700608373e-05, + "loss": 0.6769, + "step": 377 + }, + { + "epoch": 0.6048, + "grad_norm": 0.9057058186724547, + "learning_rate": 7.137486323692995e-05, + "loss": 0.5315, + "step": 378 + }, + { + "epoch": 0.6064, + "grad_norm": 0.9369047617658811, + "learning_rate": 7.087852877727481e-05, + "loss": 0.6602, + "step": 379 + }, + { + "epoch": 0.608, + "grad_norm": 0.972680202054679, + "learning_rate": 7.038297696626206e-05, + "loss": 0.6024, + "step": 380 + }, + { + "epoch": 0.6096, + "grad_norm": 1.321048930261487, + "learning_rate": 6.988822112200156e-05, + "loss": 0.6481, + "step": 381 + }, + { + "epoch": 0.6112, + "grad_norm": 1.0577345819836654, + "learning_rate": 6.939427454121128e-05, + "loss": 0.6457, + "step": 382 + }, + { + "epoch": 0.6128, + "grad_norm": 0.9218333728889919, + "learning_rate": 6.890115049885994e-05, + "loss": 0.6023, + "step": 383 + }, + { + "epoch": 0.6144, + "grad_norm": 0.9399207896564503, + "learning_rate": 6.84088622478104e-05, + "loss": 0.5672, + "step": 384 + }, + { + "epoch": 0.616, + "grad_norm": 1.1250946584883899, + "learning_rate": 6.791742301846326e-05, + "loss": 0.7217, + "step": 385 + }, + { + "epoch": 0.6176, + "grad_norm": 0.9522625346054029, + "learning_rate": 6.742684601840141e-05, + "loss": 0.6172, + "step": 386 + }, + { + "epoch": 0.6192, + "grad_norm": 0.9922580878539909, + "learning_rate": 6.693714443203507e-05, + "loss": 0.6201, + "step": 387 + }, + { + "epoch": 0.6208, + "grad_norm": 1.0168541498929677, + "learning_rate": 6.644833142024751e-05, + "loss": 0.637, + "step": 388 + }, + { + "epoch": 0.6224, + "grad_norm": 0.9687595220988009, + "learning_rate": 6.59604201200412e-05, + "loss": 0.5511, + "step": 389 + }, + { + "epoch": 0.624, + "grad_norm": 0.98923428378789, + "learning_rate": 6.547342364418481e-05, + "loss": 0.6611, + "step": 390 + }, + { + "epoch": 0.6256, + "grad_norm": 1.0975903332325236, + "learning_rate": 6.498735508086093e-05, + "loss": 0.638, + "step": 391 + }, + { + "epoch": 0.6272, + "grad_norm": 0.9211169122137681, + "learning_rate": 6.450222749331414e-05, + "loss": 0.5502, + "step": 392 + }, + { + "epoch": 0.6288, + "grad_norm": 1.1161164797290022, + "learning_rate": 6.40180539194999e-05, + "loss": 0.695, + "step": 393 + }, + { + "epoch": 0.6304, + "grad_norm": 0.9946376550649852, + "learning_rate": 6.35348473717345e-05, + "loss": 0.6079, + "step": 394 + }, + { + "epoch": 0.632, + "grad_norm": 0.967078877686999, + "learning_rate": 6.305262083634488e-05, + "loss": 0.5904, + "step": 395 + }, + { + "epoch": 0.6336, + "grad_norm": 1.0989047275834596, + "learning_rate": 6.25713872733199e-05, + "loss": 0.7644, + "step": 396 + }, + { + "epoch": 0.6352, + "grad_norm": 1.0749511648643626, + "learning_rate": 6.209115961596208e-05, + "loss": 0.6075, + "step": 397 + }, + { + "epoch": 0.6368, + "grad_norm": 1.0113772216181294, + "learning_rate": 6.161195077053976e-05, + "loss": 0.6055, + "step": 398 + }, + { + "epoch": 0.6384, + "grad_norm": 1.0528699708098506, + "learning_rate": 6.113377361594049e-05, + "loss": 0.5924, + "step": 399 + }, + { + "epoch": 0.64, + "grad_norm": 0.9884474205488997, + "learning_rate": 6.065664100332478e-05, + "loss": 0.5836, + "step": 400 + }, + { + "epoch": 0.6416, + "grad_norm": 1.0817565853558482, + "learning_rate": 6.018056575578075e-05, + "loss": 0.5429, + "step": 401 + }, + { + "epoch": 0.6432, + "grad_norm": 0.9666144235321176, + "learning_rate": 5.970556066797941e-05, + "loss": 0.6413, + "step": 402 + }, + { + "epoch": 0.6448, + "grad_norm": 0.9153559071033491, + "learning_rate": 5.923163850583113e-05, + "loss": 0.659, + "step": 403 + }, + { + "epoch": 0.6464, + "grad_norm": 0.9137471340069504, + "learning_rate": 5.875881200614207e-05, + "loss": 0.6103, + "step": 404 + }, + { + "epoch": 0.648, + "grad_norm": 1.510146019378789, + "learning_rate": 5.828709387627218e-05, + "loss": 0.4828, + "step": 405 + }, + { + "epoch": 0.6496, + "grad_norm": 0.987447720692412, + "learning_rate": 5.781649679379378e-05, + "loss": 0.7018, + "step": 406 + }, + { + "epoch": 0.6512, + "grad_norm": 0.9238724147222424, + "learning_rate": 5.73470334061505e-05, + "loss": 0.5758, + "step": 407 + }, + { + "epoch": 0.6528, + "grad_norm": 0.9389633061963913, + "learning_rate": 5.687871633031754e-05, + "loss": 0.6042, + "step": 408 + }, + { + "epoch": 0.6544, + "grad_norm": 0.9132362305366166, + "learning_rate": 5.6411558152462894e-05, + "loss": 0.5972, + "step": 409 + }, + { + "epoch": 0.656, + "grad_norm": 0.8654296238944921, + "learning_rate": 5.5945571427608526e-05, + "loss": 0.5282, + "step": 410 + }, + { + "epoch": 0.6576, + "grad_norm": 0.8453176077925335, + "learning_rate": 5.54807686792933e-05, + "loss": 0.5726, + "step": 411 + }, + { + "epoch": 0.6592, + "grad_norm": 0.9242326747879537, + "learning_rate": 5.501716239923642e-05, + "loss": 0.6372, + "step": 412 + }, + { + "epoch": 0.6608, + "grad_norm": 0.9308988892533574, + "learning_rate": 5.4554765047001613e-05, + "loss": 0.5738, + "step": 413 + }, + { + "epoch": 0.6624, + "grad_norm": 0.9071409171308088, + "learning_rate": 5.4093589049662175e-05, + "loss": 0.5761, + "step": 414 + }, + { + "epoch": 0.664, + "grad_norm": 1.3028630461587731, + "learning_rate": 5.363364680146725e-05, + "loss": 0.6067, + "step": 415 + }, + { + "epoch": 0.6656, + "grad_norm": 0.9172024043826201, + "learning_rate": 5.31749506635086e-05, + "loss": 0.5679, + "step": 416 + }, + { + "epoch": 0.6672, + "grad_norm": 0.8807958549351818, + "learning_rate": 5.271751296338823e-05, + "loss": 0.542, + "step": 417 + }, + { + "epoch": 0.6688, + "grad_norm": 0.9704249571552533, + "learning_rate": 5.226134599488728e-05, + "loss": 0.5464, + "step": 418 + }, + { + "epoch": 0.6704, + "grad_norm": 1.0969144972578753, + "learning_rate": 5.180646201763577e-05, + "loss": 0.6738, + "step": 419 + }, + { + "epoch": 0.672, + "grad_norm": 0.8696864132878669, + "learning_rate": 5.135287325678271e-05, + "loss": 0.5705, + "step": 420 + }, + { + "epoch": 0.6736, + "grad_norm": 0.9781208222151636, + "learning_rate": 5.090059190266779e-05, + "loss": 0.5582, + "step": 421 + }, + { + "epoch": 0.6752, + "grad_norm": 0.9967778574729614, + "learning_rate": 5.0449630110493836e-05, + "loss": 0.6579, + "step": 422 + }, + { + "epoch": 0.6768, + "grad_norm": 1.0082960039077469, + "learning_rate": 5.000000000000002e-05, + "loss": 0.6184, + "step": 423 + }, + { + "epoch": 0.6784, + "grad_norm": 1.0366166862534494, + "learning_rate": 4.955171365513603e-05, + "loss": 0.5641, + "step": 424 + }, + { + "epoch": 0.68, + "grad_norm": 0.9920315082734081, + "learning_rate": 4.9104783123737566e-05, + "loss": 0.669, + "step": 425 + }, + { + "epoch": 0.6816, + "grad_norm": 0.9956020669208389, + "learning_rate": 4.865922041720239e-05, + "loss": 0.5392, + "step": 426 + }, + { + "epoch": 0.6832, + "grad_norm": 0.8490541008850564, + "learning_rate": 4.821503751016746e-05, + "loss": 0.5029, + "step": 427 + }, + { + "epoch": 0.6848, + "grad_norm": 0.9163974087483752, + "learning_rate": 4.777224634018732e-05, + "loss": 0.531, + "step": 428 + }, + { + "epoch": 0.6864, + "grad_norm": 1.008949676876316, + "learning_rate": 4.733085880741301e-05, + "loss": 0.637, + "step": 429 + }, + { + "epoch": 0.688, + "grad_norm": 0.9030007856605029, + "learning_rate": 4.689088677427249e-05, + "loss": 0.5628, + "step": 430 + }, + { + "epoch": 0.6896, + "grad_norm": 0.9449537478248724, + "learning_rate": 4.645234206515171e-05, + "loss": 0.5636, + "step": 431 + }, + { + "epoch": 0.6912, + "grad_norm": 0.8198257100363766, + "learning_rate": 4.6015236466076747e-05, + "loss": 0.4939, + "step": 432 + }, + { + "epoch": 0.6928, + "grad_norm": 0.8867518601433609, + "learning_rate": 4.5579581724397255e-05, + "loss": 0.5413, + "step": 433 + }, + { + "epoch": 0.6944, + "grad_norm": 1.1818256787206338, + "learning_rate": 4.514538954847064e-05, + "loss": 0.6136, + "step": 434 + }, + { + "epoch": 0.696, + "grad_norm": 0.9452893844358714, + "learning_rate": 4.471267160734731e-05, + "loss": 0.5748, + "step": 435 + }, + { + "epoch": 0.6976, + "grad_norm": 0.9275850629781354, + "learning_rate": 4.428143953045717e-05, + "loss": 0.5189, + "step": 436 + }, + { + "epoch": 0.6992, + "grad_norm": 0.9002710366246102, + "learning_rate": 4.385170490729712e-05, + "loss": 0.597, + "step": 437 + }, + { + "epoch": 0.7008, + "grad_norm": 0.9415059576235015, + "learning_rate": 4.342347928711953e-05, + "loss": 0.6083, + "step": 438 + }, + { + "epoch": 0.7024, + "grad_norm": 0.9647744573314104, + "learning_rate": 4.2996774178621736e-05, + "loss": 0.6556, + "step": 439 + }, + { + "epoch": 0.704, + "grad_norm": 0.8670661153270305, + "learning_rate": 4.257160104963696e-05, + "loss": 0.473, + "step": 440 + }, + { + "epoch": 0.7056, + "grad_norm": 0.9646118936497053, + "learning_rate": 4.2147971326825966e-05, + "loss": 0.5672, + "step": 441 + }, + { + "epoch": 0.7072, + "grad_norm": 0.9095148534965027, + "learning_rate": 4.172589639536991e-05, + "loss": 0.5433, + "step": 442 + }, + { + "epoch": 0.7088, + "grad_norm": 1.062269147102224, + "learning_rate": 4.130538759866457e-05, + "loss": 0.6931, + "step": 443 + }, + { + "epoch": 0.7104, + "grad_norm": 0.9241710738105149, + "learning_rate": 4.088645623801534e-05, + "loss": 0.5148, + "step": 444 + }, + { + "epoch": 0.712, + "grad_norm": 0.9559033451868152, + "learning_rate": 4.046911357233343e-05, + "loss": 0.5564, + "step": 445 + }, + { + "epoch": 0.7136, + "grad_norm": 1.030323125995463, + "learning_rate": 4.00533708178334e-05, + "loss": 0.6195, + "step": 446 + }, + { + "epoch": 0.7152, + "grad_norm": 1.0271604613717926, + "learning_rate": 3.963923914773187e-05, + "loss": 0.7014, + "step": 447 + }, + { + "epoch": 0.7168, + "grad_norm": 0.8942452975398771, + "learning_rate": 3.922672969194686e-05, + "loss": 0.5011, + "step": 448 + }, + { + "epoch": 0.7184, + "grad_norm": 0.8955644246748649, + "learning_rate": 3.8815853536798904e-05, + "loss": 0.5521, + "step": 449 + }, + { + "epoch": 0.72, + "grad_norm": 0.8332928517210115, + "learning_rate": 3.840662172471315e-05, + "loss": 0.5414, + "step": 450 + }, + { + "epoch": 0.7216, + "grad_norm": 1.1169520622611744, + "learning_rate": 3.79990452539225e-05, + "loss": 0.6897, + "step": 451 + }, + { + "epoch": 0.7232, + "grad_norm": 1.3097679094718664, + "learning_rate": 3.759313507817196e-05, + "loss": 0.5497, + "step": 452 + }, + { + "epoch": 0.7248, + "grad_norm": 0.8772376782312741, + "learning_rate": 3.7188902106424416e-05, + "loss": 0.5951, + "step": 453 + }, + { + "epoch": 0.7264, + "grad_norm": 1.1755811538495757, + "learning_rate": 3.678635720256737e-05, + "loss": 0.6902, + "step": 454 + }, + { + "epoch": 0.728, + "grad_norm": 0.9150533628307943, + "learning_rate": 3.638551118512089e-05, + "loss": 0.5854, + "step": 455 + }, + { + "epoch": 0.7296, + "grad_norm": 0.9812405096651364, + "learning_rate": 3.5986374826947066e-05, + "loss": 0.5138, + "step": 456 + }, + { + "epoch": 0.7312, + "grad_norm": 0.9257901458035728, + "learning_rate": 3.558895885496023e-05, + "loss": 0.5882, + "step": 457 + }, + { + "epoch": 0.7328, + "grad_norm": 0.9099467473629124, + "learning_rate": 3.519327394983888e-05, + "loss": 0.551, + "step": 458 + }, + { + "epoch": 0.7344, + "grad_norm": 0.9739503971331055, + "learning_rate": 3.479933074573858e-05, + "loss": 0.5545, + "step": 459 + }, + { + "epoch": 0.736, + "grad_norm": 0.8385781188398105, + "learning_rate": 3.440713983000601e-05, + "loss": 0.4841, + "step": 460 + }, + { + "epoch": 0.7376, + "grad_norm": 1.1359826928085195, + "learning_rate": 3.401671174289469e-05, + "loss": 0.7864, + "step": 461 + }, + { + "epoch": 0.7392, + "grad_norm": 0.9909228203441943, + "learning_rate": 3.362805697728145e-05, + "loss": 0.5488, + "step": 462 + }, + { + "epoch": 0.7408, + "grad_norm": 1.0019540774010631, + "learning_rate": 3.324118597838464e-05, + "loss": 0.6181, + "step": 463 + }, + { + "epoch": 0.7424, + "grad_norm": 0.8385188190911381, + "learning_rate": 3.285610914348332e-05, + "loss": 0.5049, + "step": 464 + }, + { + "epoch": 0.744, + "grad_norm": 0.919376923204666, + "learning_rate": 3.2472836821637744e-05, + "loss": 0.5377, + "step": 465 + }, + { + "epoch": 0.7456, + "grad_norm": 1.0232245476746933, + "learning_rate": 3.209137931341143e-05, + "loss": 0.6487, + "step": 466 + }, + { + "epoch": 0.7472, + "grad_norm": 0.9618340330851003, + "learning_rate": 3.1711746870594086e-05, + "loss": 0.6218, + "step": 467 + }, + { + "epoch": 0.7488, + "grad_norm": 0.9712726064151155, + "learning_rate": 3.1333949695926324e-05, + "loss": 0.5529, + "step": 468 + }, + { + "epoch": 0.7504, + "grad_norm": 0.9771971192171456, + "learning_rate": 3.0957997942825336e-05, + "loss": 0.6552, + "step": 469 + }, + { + "epoch": 0.752, + "grad_norm": 0.9786775722449325, + "learning_rate": 3.058390171511196e-05, + "loss": 0.5603, + "step": 470 + }, + { + "epoch": 0.7536, + "grad_norm": 1.1332505597262252, + "learning_rate": 3.021167106673928e-05, + "loss": 0.6783, + "step": 471 + }, + { + "epoch": 0.7552, + "grad_norm": 0.9836611457205962, + "learning_rate": 2.9841316001522347e-05, + "loss": 0.6031, + "step": 472 + }, + { + "epoch": 0.7568, + "grad_norm": 0.9773706601264938, + "learning_rate": 2.9472846472869298e-05, + "loss": 0.6464, + "step": 473 + }, + { + "epoch": 0.7584, + "grad_norm": 0.8695143545764731, + "learning_rate": 2.9106272383513835e-05, + "loss": 0.5364, + "step": 474 + }, + { + "epoch": 0.76, + "grad_norm": 0.9314585082087594, + "learning_rate": 2.874160358524931e-05, + "loss": 0.6129, + "step": 475 + }, + { + "epoch": 0.7616, + "grad_norm": 0.8352473468581094, + "learning_rate": 2.8378849878663628e-05, + "loss": 0.5302, + "step": 476 + }, + { + "epoch": 0.7632, + "grad_norm": 0.9288707870085203, + "learning_rate": 2.8018021012875994e-05, + "loss": 0.6198, + "step": 477 + }, + { + "epoch": 0.7648, + "grad_norm": 1.054548843821071, + "learning_rate": 2.7659126685275027e-05, + "loss": 0.6421, + "step": 478 + }, + { + "epoch": 0.7664, + "grad_norm": 0.8919318029675155, + "learning_rate": 2.7302176541257986e-05, + "loss": 0.5169, + "step": 479 + }, + { + "epoch": 0.768, + "grad_norm": 1.0066503456444664, + "learning_rate": 2.6947180173971508e-05, + "loss": 0.6089, + "step": 480 + }, + { + "epoch": 0.7696, + "grad_norm": 0.9230978252331297, + "learning_rate": 2.659414712405398e-05, + "loss": 0.5957, + "step": 481 + }, + { + "epoch": 0.7712, + "grad_norm": 0.8789633076103941, + "learning_rate": 2.6243086879379e-05, + "loss": 0.5541, + "step": 482 + }, + { + "epoch": 0.7728, + "grad_norm": 0.8718844378045656, + "learning_rate": 2.5894008874800325e-05, + "loss": 0.5485, + "step": 483 + }, + { + "epoch": 0.7744, + "grad_norm": 0.9335181761420575, + "learning_rate": 2.5546922491898495e-05, + "loss": 0.6404, + "step": 484 + }, + { + "epoch": 0.776, + "grad_norm": 0.9484725144376542, + "learning_rate": 2.5201837058728505e-05, + "loss": 0.59, + "step": 485 + }, + { + "epoch": 0.7776, + "grad_norm": 0.8382977366416045, + "learning_rate": 2.485876184956928e-05, + "loss": 0.5561, + "step": 486 + }, + { + "epoch": 0.7792, + "grad_norm": 0.9355603237904612, + "learning_rate": 2.451770608467432e-05, + "loss": 0.5818, + "step": 487 + }, + { + "epoch": 0.7808, + "grad_norm": 0.961809490288736, + "learning_rate": 2.417867893002387e-05, + "loss": 0.6146, + "step": 488 + }, + { + "epoch": 0.7824, + "grad_norm": 0.8567670714065785, + "learning_rate": 2.3841689497078746e-05, + "loss": 0.5652, + "step": 489 + }, + { + "epoch": 0.784, + "grad_norm": 0.9455553754652066, + "learning_rate": 2.3506746842535242e-05, + "loss": 0.5788, + "step": 490 + }, + { + "epoch": 0.7856, + "grad_norm": 1.0721800816234002, + "learning_rate": 2.3173859968081944e-05, + "loss": 0.5842, + "step": 491 + }, + { + "epoch": 0.7872, + "grad_norm": 0.7675806075541949, + "learning_rate": 2.2843037820157675e-05, + "loss": 0.4302, + "step": 492 + }, + { + "epoch": 0.7888, + "grad_norm": 0.991350128046776, + "learning_rate": 2.251428928971102e-05, + "loss": 0.5715, + "step": 493 + }, + { + "epoch": 0.7904, + "grad_norm": 0.9511054276695102, + "learning_rate": 2.2187623211961562e-05, + "loss": 0.6427, + "step": 494 + }, + { + "epoch": 0.792, + "grad_norm": 1.0426683588308392, + "learning_rate": 2.1863048366162208e-05, + "loss": 0.5714, + "step": 495 + }, + { + "epoch": 0.7936, + "grad_norm": 0.9523954557582163, + "learning_rate": 2.1540573475363402e-05, + "loss": 0.5734, + "step": 496 + }, + { + "epoch": 0.7952, + "grad_norm": 0.7987380143628661, + "learning_rate": 2.1220207206178688e-05, + "loss": 0.5117, + "step": 497 + }, + { + "epoch": 0.7968, + "grad_norm": 0.8573326519139678, + "learning_rate": 2.0901958168551638e-05, + "loss": 0.5222, + "step": 498 + }, + { + "epoch": 0.7984, + "grad_norm": 0.979581786160496, + "learning_rate": 2.058583491552465e-05, + "loss": 0.5971, + "step": 499 + }, + { + "epoch": 0.8, + "grad_norm": 0.778165869500947, + "learning_rate": 2.027184594300898e-05, + "loss": 0.4771, + "step": 500 + }, + { + "epoch": 0.8016, + "grad_norm": 0.8457108669461191, + "learning_rate": 1.995999968955641e-05, + "loss": 0.4731, + "step": 501 + }, + { + "epoch": 0.8032, + "grad_norm": 0.9508003542076502, + "learning_rate": 1.9650304536132426e-05, + "loss": 0.6279, + "step": 502 + }, + { + "epoch": 0.8048, + "grad_norm": 0.9958779714256399, + "learning_rate": 1.9342768805891178e-05, + "loss": 0.6277, + "step": 503 + }, + { + "epoch": 0.8064, + "grad_norm": 0.9862852127069748, + "learning_rate": 1.903740076395151e-05, + "loss": 0.5806, + "step": 504 + }, + { + "epoch": 0.808, + "grad_norm": 0.9397524949225728, + "learning_rate": 1.8734208617174988e-05, + "loss": 0.5331, + "step": 505 + }, + { + "epoch": 0.8096, + "grad_norm": 0.9083043981571588, + "learning_rate": 1.8433200513945337e-05, + "loss": 0.5043, + "step": 506 + }, + { + "epoch": 0.8112, + "grad_norm": 1.0118708577850164, + "learning_rate": 1.8134384543949478e-05, + "loss": 0.5932, + "step": 507 + }, + { + "epoch": 0.8128, + "grad_norm": 1.1251363208694125, + "learning_rate": 1.783776873795994e-05, + "loss": 0.6196, + "step": 508 + }, + { + "epoch": 0.8144, + "grad_norm": 0.8574627852708463, + "learning_rate": 1.754336106761927e-05, + "loss": 0.5193, + "step": 509 + }, + { + "epoch": 0.816, + "grad_norm": 0.7780339043692484, + "learning_rate": 1.7251169445225657e-05, + "loss": 0.4682, + "step": 510 + }, + { + "epoch": 0.8176, + "grad_norm": 1.1964531716023608, + "learning_rate": 1.696120172352025e-05, + "loss": 0.6528, + "step": 511 + }, + { + "epoch": 0.8192, + "grad_norm": 0.9404864780809205, + "learning_rate": 1.6673465695476232e-05, + "loss": 0.6223, + "step": 512 + }, + { + "epoch": 0.8208, + "grad_norm": 1.0226708718743212, + "learning_rate": 1.6387969094089316e-05, + "loss": 0.6927, + "step": 513 + }, + { + "epoch": 0.8224, + "grad_norm": 1.137349776592833, + "learning_rate": 1.6104719592169902e-05, + "loss": 0.6141, + "step": 514 + }, + { + "epoch": 0.824, + "grad_norm": 0.9814117218965409, + "learning_rate": 1.5823724802136865e-05, + "loss": 0.5735, + "step": 515 + }, + { + "epoch": 0.8256, + "grad_norm": 0.8830516289878225, + "learning_rate": 1.5544992275813053e-05, + "loss": 0.5071, + "step": 516 + }, + { + "epoch": 0.8272, + "grad_norm": 0.996486615239273, + "learning_rate": 1.526852950422226e-05, + "loss": 0.5259, + "step": 517 + }, + { + "epoch": 0.8288, + "grad_norm": 1.0330770451297686, + "learning_rate": 1.4994343917387854e-05, + "loss": 0.6197, + "step": 518 + }, + { + "epoch": 0.8304, + "grad_norm": 0.8319899947732018, + "learning_rate": 1.4722442884133214e-05, + "loss": 0.4996, + "step": 519 + }, + { + "epoch": 0.832, + "grad_norm": 0.9250451531628848, + "learning_rate": 1.4452833711883628e-05, + "loss": 0.5247, + "step": 520 + }, + { + "epoch": 0.8336, + "grad_norm": 1.144102148606056, + "learning_rate": 1.4185523646469822e-05, + "loss": 0.7603, + "step": 521 + }, + { + "epoch": 0.8352, + "grad_norm": 0.8191778600556882, + "learning_rate": 1.3920519871933424e-05, + "loss": 0.5248, + "step": 522 + }, + { + "epoch": 0.8368, + "grad_norm": 1.0247859395725494, + "learning_rate": 1.3657829510333654e-05, + "loss": 0.6693, + "step": 523 + }, + { + "epoch": 0.8384, + "grad_norm": 0.9576748722399607, + "learning_rate": 1.339745962155613e-05, + "loss": 0.5266, + "step": 524 + }, + { + "epoch": 0.84, + "grad_norm": 0.9386852671410497, + "learning_rate": 1.3139417203123027e-05, + "loss": 0.5242, + "step": 525 + }, + { + "epoch": 0.8416, + "grad_norm": 1.0008594863534126, + "learning_rate": 1.2883709190004955e-05, + "loss": 0.5434, + "step": 526 + }, + { + "epoch": 0.8432, + "grad_norm": 0.8296819976374453, + "learning_rate": 1.263034245443473e-05, + "loss": 0.4819, + "step": 527 + }, + { + "epoch": 0.8448, + "grad_norm": 1.0366267401553004, + "learning_rate": 1.2379323805722576e-05, + "loss": 0.648, + "step": 528 + }, + { + "epoch": 0.8464, + "grad_norm": 0.9301658256767201, + "learning_rate": 1.2130659990073146e-05, + "loss": 0.5796, + "step": 529 + }, + { + "epoch": 0.848, + "grad_norm": 0.9540408598436331, + "learning_rate": 1.1884357690404158e-05, + "loss": 0.6085, + "step": 530 + }, + { + "epoch": 0.8496, + "grad_norm": 0.9601452062599933, + "learning_rate": 1.1640423526166988e-05, + "loss": 0.6306, + "step": 531 + }, + { + "epoch": 0.8512, + "grad_norm": 1.1064698430304876, + "learning_rate": 1.1398864053168534e-05, + "loss": 0.7302, + "step": 532 + }, + { + "epoch": 0.8528, + "grad_norm": 0.8406468101560786, + "learning_rate": 1.1159685763395111e-05, + "loss": 0.5, + "step": 533 + }, + { + "epoch": 0.8544, + "grad_norm": 1.0896860673883275, + "learning_rate": 1.0922895084838037e-05, + "loss": 0.6729, + "step": 534 + }, + { + "epoch": 0.856, + "grad_norm": 0.912833026472872, + "learning_rate": 1.0688498381320855e-05, + "loss": 0.5126, + "step": 535 + }, + { + "epoch": 0.8576, + "grad_norm": 0.8657286990660678, + "learning_rate": 1.045650195232819e-05, + "loss": 0.4976, + "step": 536 + }, + { + "epoch": 0.8592, + "grad_norm": 0.825689451879938, + "learning_rate": 1.0226912032836611e-05, + "loss": 0.4834, + "step": 537 + }, + { + "epoch": 0.8608, + "grad_norm": 0.9905337253057244, + "learning_rate": 9.999734793146998e-06, + "loss": 0.5336, + "step": 538 + }, + { + "epoch": 0.8624, + "grad_norm": 0.9726342394010897, + "learning_rate": 9.774976338718677e-06, + "loss": 0.6517, + "step": 539 + }, + { + "epoch": 0.864, + "grad_norm": 0.9142314798265335, + "learning_rate": 9.552642710005299e-06, + "loss": 0.6603, + "step": 540 + }, + { + "epoch": 0.8656, + "grad_norm": 0.898642677276061, + "learning_rate": 9.332739882292752e-06, + "loss": 0.4928, + "step": 541 + }, + { + "epoch": 0.8672, + "grad_norm": 0.8916371940098342, + "learning_rate": 9.115273765538202e-06, + "loss": 0.5164, + "step": 542 + }, + { + "epoch": 0.8688, + "grad_norm": 1.4104850715242292, + "learning_rate": 8.900250204211514e-06, + "loss": 0.5929, + "step": 543 + }, + { + "epoch": 0.8704, + "grad_norm": 0.7293795782350329, + "learning_rate": 8.687674977138116e-06, + "loss": 0.4157, + "step": 544 + }, + { + "epoch": 0.872, + "grad_norm": 1.019652420183055, + "learning_rate": 8.47755379734373e-06, + "loss": 0.7263, + "step": 545 + }, + { + "epoch": 0.8736, + "grad_norm": 0.9448381054119659, + "learning_rate": 8.269892311900696e-06, + "loss": 0.586, + "step": 546 + }, + { + "epoch": 0.8752, + "grad_norm": 0.8637348751302155, + "learning_rate": 8.064696101776358e-06, + "loss": 0.4999, + "step": 547 + }, + { + "epoch": 0.8768, + "grad_norm": 0.9690117974376475, + "learning_rate": 7.861970681683051e-06, + "loss": 0.6164, + "step": 548 + }, + { + "epoch": 0.8784, + "grad_norm": 0.9531128366285172, + "learning_rate": 7.661721499929753e-06, + "loss": 0.5583, + "step": 549 + }, + { + "epoch": 0.88, + "grad_norm": 0.9078704210432088, + "learning_rate": 7.463953938275858e-06, + "loss": 0.5536, + "step": 550 + }, + { + "epoch": 0.8816, + "grad_norm": 0.9630577217237317, + "learning_rate": 7.2686733117863784e-06, + "loss": 0.5851, + "step": 551 + }, + { + "epoch": 0.8832, + "grad_norm": 0.9270212590280982, + "learning_rate": 7.07588486868922e-06, + "loss": 0.6402, + "step": 552 + }, + { + "epoch": 0.8848, + "grad_norm": 0.9423601972149899, + "learning_rate": 6.8855937902340576e-06, + "loss": 0.5566, + "step": 553 + }, + { + "epoch": 0.8864, + "grad_norm": 0.9976033030848523, + "learning_rate": 6.6978051905530855e-06, + "loss": 0.5256, + "step": 554 + }, + { + "epoch": 0.888, + "grad_norm": 0.9393026960150982, + "learning_rate": 6.512524116523633e-06, + "loss": 0.6015, + "step": 555 + }, + { + "epoch": 0.8896, + "grad_norm": 0.9274191647891893, + "learning_rate": 6.329755547632499e-06, + "loss": 0.579, + "step": 556 + }, + { + "epoch": 0.8912, + "grad_norm": 0.9942581681622578, + "learning_rate": 6.149504395842087e-06, + "loss": 0.582, + "step": 557 + }, + { + "epoch": 0.8928, + "grad_norm": 0.9681948735153119, + "learning_rate": 5.971775505458444e-06, + "loss": 0.5492, + "step": 558 + }, + { + "epoch": 0.8944, + "grad_norm": 0.9833395857883787, + "learning_rate": 5.7965736530010916e-06, + "loss": 0.6341, + "step": 559 + }, + { + "epoch": 0.896, + "grad_norm": 0.7933212026979115, + "learning_rate": 5.623903547074549e-06, + "loss": 0.5183, + "step": 560 + }, + { + "epoch": 0.8976, + "grad_norm": 1.0599017928831775, + "learning_rate": 5.453769828241872e-06, + "loss": 0.762, + "step": 561 + }, + { + "epoch": 0.8992, + "grad_norm": 0.9256304637576591, + "learning_rate": 5.286177068899989e-06, + "loss": 0.521, + "step": 562 + }, + { + "epoch": 0.9008, + "grad_norm": 0.9585611922630486, + "learning_rate": 5.121129773156663e-06, + "loss": 0.6063, + "step": 563 + }, + { + "epoch": 0.9024, + "grad_norm": 0.8490070058645339, + "learning_rate": 4.95863237670956e-06, + "loss": 0.4427, + "step": 564 + }, + { + "epoch": 0.904, + "grad_norm": 0.891364762865935, + "learning_rate": 4.798689246727006e-06, + "loss": 0.5148, + "step": 565 + }, + { + "epoch": 0.9056, + "grad_norm": 0.9988470856594625, + "learning_rate": 4.641304681730641e-06, + "loss": 0.6633, + "step": 566 + }, + { + "epoch": 0.9072, + "grad_norm": 0.9444281906347085, + "learning_rate": 4.486482911479839e-06, + "loss": 0.5217, + "step": 567 + }, + { + "epoch": 0.9088, + "grad_norm": 0.8752628215788081, + "learning_rate": 4.3342280968580285e-06, + "loss": 0.4908, + "step": 568 + }, + { + "epoch": 0.9104, + "grad_norm": 0.9694387789394738, + "learning_rate": 4.184544329761009e-06, + "loss": 0.6527, + "step": 569 + }, + { + "epoch": 0.912, + "grad_norm": 0.9022866716275811, + "learning_rate": 4.037435632986786e-06, + "loss": 0.5921, + "step": 570 + }, + { + "epoch": 0.9136, + "grad_norm": 0.9205268255297675, + "learning_rate": 3.892905960127546e-06, + "loss": 0.4699, + "step": 571 + }, + { + "epoch": 0.9152, + "grad_norm": 0.8755805675570406, + "learning_rate": 3.750959195463466e-06, + "loss": 0.4811, + "step": 572 + }, + { + "epoch": 0.9168, + "grad_norm": 0.8912675243422706, + "learning_rate": 3.611599153858214e-06, + "loss": 0.5741, + "step": 573 + }, + { + "epoch": 0.9184, + "grad_norm": 0.9689570717903273, + "learning_rate": 3.4748295806564356e-06, + "loss": 0.6094, + "step": 574 + }, + { + "epoch": 0.92, + "grad_norm": 0.8392206648042595, + "learning_rate": 3.3406541515832003e-06, + "loss": 0.5112, + "step": 575 + }, + { + "epoch": 0.9216, + "grad_norm": 0.8072022169653216, + "learning_rate": 3.209076472645112e-06, + "loss": 0.448, + "step": 576 + }, + { + "epoch": 0.9232, + "grad_norm": 0.7790347510000438, + "learning_rate": 3.0801000800333877e-06, + "loss": 0.402, + "step": 577 + }, + { + "epoch": 0.9248, + "grad_norm": 0.8953515136947062, + "learning_rate": 2.9537284400289355e-06, + "loss": 0.5285, + "step": 578 + }, + { + "epoch": 0.9264, + "grad_norm": 0.9809443047886482, + "learning_rate": 2.8299649489090475e-06, + "loss": 0.5989, + "step": 579 + }, + { + "epoch": 0.928, + "grad_norm": 0.8975191336454618, + "learning_rate": 2.708812932856253e-06, + "loss": 0.555, + "step": 580 + }, + { + "epoch": 0.9296, + "grad_norm": 0.9629421299799577, + "learning_rate": 2.590275647868867e-06, + "loss": 0.5406, + "step": 581 + }, + { + "epoch": 0.9312, + "grad_norm": 0.8243589514012051, + "learning_rate": 2.4743562796734622e-06, + "loss": 0.4341, + "step": 582 + }, + { + "epoch": 0.9328, + "grad_norm": 1.479828868077692, + "learning_rate": 2.3610579436393e-06, + "loss": 0.5796, + "step": 583 + }, + { + "epoch": 0.9344, + "grad_norm": 1.1184626925960366, + "learning_rate": 2.250383684694579e-06, + "loss": 0.5626, + "step": 584 + }, + { + "epoch": 0.936, + "grad_norm": 0.7892836016009552, + "learning_rate": 2.1423364772445887e-06, + "loss": 0.4644, + "step": 585 + }, + { + "epoch": 0.9376, + "grad_norm": 0.8453496028010578, + "learning_rate": 2.036919225091827e-06, + "loss": 0.5184, + "step": 586 + }, + { + "epoch": 0.9392, + "grad_norm": 1.415420131082792, + "learning_rate": 1.9341347613579087e-06, + "loss": 0.5357, + "step": 587 + }, + { + "epoch": 0.9408, + "grad_norm": 1.9466599285665418, + "learning_rate": 1.8339858484073935e-06, + "loss": 0.5657, + "step": 588 + }, + { + "epoch": 0.9424, + "grad_norm": 0.9510079260717255, + "learning_rate": 1.7364751777736332e-06, + "loss": 0.53, + "step": 589 + }, + { + "epoch": 0.944, + "grad_norm": 0.8046507506345103, + "learning_rate": 1.6416053700863964e-06, + "loss": 0.4359, + "step": 590 + }, + { + "epoch": 0.9456, + "grad_norm": 0.9085865871187612, + "learning_rate": 1.5493789750014031e-06, + "loss": 0.5194, + "step": 591 + }, + { + "epoch": 0.9472, + "grad_norm": 2.1375597713997694, + "learning_rate": 1.459798471131868e-06, + "loss": 0.5032, + "step": 592 + }, + { + "epoch": 0.9488, + "grad_norm": 0.8858092438131193, + "learning_rate": 1.3728662659818204e-06, + "loss": 0.5091, + "step": 593 + }, + { + "epoch": 0.9504, + "grad_norm": 0.9982469502268863, + "learning_rate": 1.2885846958814673e-06, + "loss": 0.5391, + "step": 594 + }, + { + "epoch": 0.952, + "grad_norm": 0.8523525415208159, + "learning_rate": 1.2069560259243328e-06, + "loss": 0.5074, + "step": 595 + }, + { + "epoch": 0.9536, + "grad_norm": 1.0576780431109767, + "learning_rate": 1.1279824499064396e-06, + "loss": 0.521, + "step": 596 + }, + { + "epoch": 0.9552, + "grad_norm": 1.2215689985898792, + "learning_rate": 1.0516660902673448e-06, + "loss": 0.542, + "step": 597 + }, + { + "epoch": 0.9568, + "grad_norm": 0.9951221792223286, + "learning_rate": 9.780089980330642e-07, + "loss": 0.5588, + "step": 598 + }, + { + "epoch": 0.9584, + "grad_norm": 1.1516393717472841, + "learning_rate": 9.070131527609604e-07, + "loss": 0.7024, + "step": 599 + }, + { + "epoch": 0.96, + "grad_norm": 0.8699478987557641, + "learning_rate": 8.386804624865851e-07, + "loss": 0.5441, + "step": 600 + }, + { + "epoch": 0.9616, + "grad_norm": 0.9761622409616965, + "learning_rate": 7.730127636723539e-07, + "loss": 0.6659, + "step": 601 + }, + { + "epoch": 0.9632, + "grad_norm": 1.0657941819580883, + "learning_rate": 7.100118211581852e-07, + "loss": 0.6802, + "step": 602 + }, + { + "epoch": 0.9648, + "grad_norm": 0.9483310552984877, + "learning_rate": 6.496793281141056e-07, + "loss": 0.5146, + "step": 603 + }, + { + "epoch": 0.9664, + "grad_norm": 2.9508099527683926, + "learning_rate": 5.920169059947411e-07, + "loss": 0.6694, + "step": 604 + }, + { + "epoch": 0.968, + "grad_norm": 1.1203177018025743, + "learning_rate": 5.370261044956971e-07, + "loss": 0.6277, + "step": 605 + }, + { + "epoch": 0.9696, + "grad_norm": 0.8712036353092558, + "learning_rate": 4.847084015119574e-07, + "loss": 0.4921, + "step": 606 + }, + { + "epoch": 0.9712, + "grad_norm": 0.8349081617610802, + "learning_rate": 4.3506520309813947e-07, + "loss": 0.4697, + "step": 607 + }, + { + "epoch": 0.9728, + "grad_norm": 0.9816640206077006, + "learning_rate": 3.8809784343072366e-07, + "loss": 0.6025, + "step": 608 + }, + { + "epoch": 0.9744, + "grad_norm": 0.8137012439481905, + "learning_rate": 3.4380758477219333e-07, + "loss": 0.5452, + "step": 609 + }, + { + "epoch": 0.976, + "grad_norm": 0.8775138259188492, + "learning_rate": 3.0219561743707326e-07, + "loss": 0.5815, + "step": 610 + }, + { + "epoch": 0.9776, + "grad_norm": 0.9510137492394066, + "learning_rate": 2.6326305976001055e-07, + "loss": 0.5334, + "step": 611 + }, + { + "epoch": 0.9792, + "grad_norm": 1.1228685544623338, + "learning_rate": 2.2701095806565432e-07, + "loss": 0.6673, + "step": 612 + }, + { + "epoch": 0.9808, + "grad_norm": 0.9574163289616511, + "learning_rate": 1.9344028664056713e-07, + "loss": 0.608, + "step": 613 + }, + { + "epoch": 0.9824, + "grad_norm": 0.9693634569140042, + "learning_rate": 1.6255194770704586e-07, + "loss": 0.6054, + "step": 614 + }, + { + "epoch": 0.984, + "grad_norm": 0.9760444131755792, + "learning_rate": 1.3434677139885222e-07, + "loss": 0.5478, + "step": 615 + }, + { + "epoch": 0.9856, + "grad_norm": 0.9474416415255731, + "learning_rate": 1.0882551573891953e-07, + "loss": 0.5137, + "step": 616 + }, + { + "epoch": 0.9872, + "grad_norm": 1.0300094506610928, + "learning_rate": 8.598886661895788e-08, + "loss": 0.5334, + "step": 617 + }, + { + "epoch": 0.9888, + "grad_norm": 0.8898281929370233, + "learning_rate": 6.583743778106887e-08, + "loss": 0.5279, + "step": 618 + }, + { + "epoch": 0.9904, + "grad_norm": 1.3279687374109668, + "learning_rate": 4.837177080119215e-08, + "loss": 0.7511, + "step": 619 + }, + { + "epoch": 0.992, + "grad_norm": 0.854279957525338, + "learning_rate": 3.359233507459481e-08, + "loss": 0.5012, + "step": 620 + }, + { + "epoch": 0.9936, + "grad_norm": 0.9686724654578658, + "learning_rate": 2.1499527803214846e-08, + "loss": 0.5938, + "step": 621 + }, + { + "epoch": 0.9952, + "grad_norm": 0.9521684426642905, + "learning_rate": 1.209367398504746e-08, + "loss": 0.6124, + "step": 622 + }, + { + "epoch": 0.9968, + "grad_norm": 0.8766730681498675, + "learning_rate": 5.375026405352035e-09, + "loss": 0.4793, + "step": 623 + }, + { + "epoch": 0.9984, + "grad_norm": 0.8576827239737382, + "learning_rate": 1.3437656298687097e-09, + "loss": 0.4929, + "step": 624 + }, + { + "epoch": 1.0, + "grad_norm": 1.0180614580257639, + "learning_rate": 0.0, + "loss": 0.5727, + "step": 625 + }, + { + "epoch": 1.0, + "step": 625, + "total_flos": 82273390780416.0, + "train_loss": 0.6731263710498809, + "train_runtime": 5195.4773, + "train_samples_per_second": 1.925, + "train_steps_per_second": 0.12 + } + ], + "logging_steps": 1.0, + "max_steps": 625, + "num_input_tokens_seen": 0, + "num_train_epochs": 1, + "save_steps": 500, + "stateful_callbacks": { + "TrainerControl": { + "args": { + "should_epoch_stop": false, + "should_evaluate": false, + "should_log": false, + "should_save": true, + "should_training_stop": false + }, + "attributes": {} + } + }, + "total_flos": 82273390780416.0, + "train_batch_size": 8, + "trial_name": null, + "trial_params": null +} diff --git a/single_dataset/short_caption/VideoGameBunny_v1_1-Llama-3-8B-V-short_caption_dataset_20000_epochs_1_lora/README.md b/single_dataset/short_caption/VideoGameBunny_v1_1-Llama-3-8B-V-short_caption_dataset_20000_epochs_1_lora/README.md new file mode 100644 index 0000000000000000000000000000000000000000..4f291d8d6fdb82fcff274f83bcc52b834347797c --- /dev/null +++ b/single_dataset/short_caption/VideoGameBunny_v1_1-Llama-3-8B-V-short_caption_dataset_20000_epochs_1_lora/README.md @@ -0,0 +1,202 @@ +--- +library_name: peft +base_model: ./weights/Bunny-v1_1-Llama-3-8B-V +--- + +# Model Card for Model ID + + + + + +## Model Details + +### Model Description + + + + + +- **Developed by:** [More Information Needed] +- **Funded by [optional]:** [More Information Needed] +- **Shared by [optional]:** [More Information Needed] +- **Model type:** [More Information Needed] +- **Language(s) (NLP):** [More Information Needed] +- **License:** [More Information Needed] +- **Finetuned from model [optional]:** [More Information Needed] + +### Model Sources [optional] + + + +- **Repository:** [More Information Needed] +- **Paper [optional]:** [More Information Needed] +- **Demo [optional]:** [More Information Needed] + +## Uses + + + +### Direct Use + + + +[More Information Needed] + +### Downstream Use [optional] + + + +[More Information Needed] + +### Out-of-Scope Use + + + +[More Information Needed] + +## Bias, Risks, and Limitations + + + +[More Information Needed] + +### Recommendations + + + +Users (both direct and downstream) should be made aware of the risks, biases and limitations of the model. More information needed for further recommendations. + +## How to Get Started with the Model + +Use the code below to get started with the model. + +[More Information Needed] + +## Training Details + +### Training Data + + + +[More Information Needed] + +### Training Procedure + + + +#### Preprocessing [optional] + +[More Information Needed] + + +#### Training Hyperparameters + +- **Training regime:** [More Information Needed] + +#### Speeds, Sizes, Times [optional] + + + +[More Information Needed] + +## Evaluation + + + +### Testing Data, Factors & Metrics + +#### Testing Data + + + +[More Information Needed] + +#### Factors + + + +[More Information Needed] + +#### Metrics + + + +[More Information Needed] + +### Results + +[More Information Needed] + +#### Summary + + + +## Model Examination [optional] + + + +[More Information Needed] + +## Environmental Impact + + + +Carbon emissions can be estimated using the [Machine Learning Impact calculator](https://mlco2.github.io/impact#compute) presented in [Lacoste et al. (2019)](https://arxiv.org/abs/1910.09700). + +- **Hardware Type:** [More Information Needed] +- **Hours used:** [More Information Needed] +- **Cloud Provider:** [More Information Needed] +- **Compute Region:** [More Information Needed] +- **Carbon Emitted:** [More Information Needed] + +## Technical Specifications [optional] + +### Model Architecture and Objective + +[More Information Needed] + +### Compute Infrastructure + +[More Information Needed] + +#### Hardware + +[More Information Needed] + +#### Software + +[More Information Needed] + +## Citation [optional] + + + +**BibTeX:** + +[More Information Needed] + +**APA:** + +[More Information Needed] + +## Glossary [optional] + + + +[More Information Needed] + +## More Information [optional] + +[More Information Needed] + +## Model Card Authors [optional] + +[More Information Needed] + +## Model Card Contact + +[More Information Needed] +### Framework versions + +- PEFT 0.11.1 \ No newline at end of file diff --git a/single_dataset/short_caption/VideoGameBunny_v1_1-Llama-3-8B-V-short_caption_dataset_20000_epochs_1_lora/adapter_config.json b/single_dataset/short_caption/VideoGameBunny_v1_1-Llama-3-8B-V-short_caption_dataset_20000_epochs_1_lora/adapter_config.json new file mode 100644 index 0000000000000000000000000000000000000000..8e8b6814e3d80a4fc8d72542ebf99a79124e6feb --- /dev/null +++ b/single_dataset/short_caption/VideoGameBunny_v1_1-Llama-3-8B-V-short_caption_dataset_20000_epochs_1_lora/adapter_config.json @@ -0,0 +1,34 @@ +{ + "alpha_pattern": {}, + "auto_mapping": null, + "base_model_name_or_path": "./weights/Bunny-v1_1-Llama-3-8B-V", + "bias": "none", + "fan_in_fan_out": false, + "inference_mode": true, + "init_lora_weights": true, + "layer_replication": null, + "layers_pattern": null, + "layers_to_transform": null, + "loftq_config": {}, + "lora_alpha": 256, + "lora_dropout": 0.1, + "megatron_config": null, + "megatron_core": "megatron.core", + "modules_to_save": null, + "peft_type": "LORA", + "r": 128, + "rank_pattern": {}, + "revision": null, + "target_modules": [ + "k_proj", + "o_proj", + "gate_proj", + "v_proj", + "down_proj", + "up_proj", + "q_proj" + ], + "task_type": "CAUSAL_LM", + "use_dora": false, + "use_rslora": false +} \ No newline at end of file diff --git a/single_dataset/short_caption/VideoGameBunny_v1_1-Llama-3-8B-V-short_caption_dataset_20000_epochs_1_lora/adapter_model.safetensors b/single_dataset/short_caption/VideoGameBunny_v1_1-Llama-3-8B-V-short_caption_dataset_20000_epochs_1_lora/adapter_model.safetensors new file mode 100644 index 0000000000000000000000000000000000000000..4b99e56679a991f138b6408e617c8c486856859a --- /dev/null +++ b/single_dataset/short_caption/VideoGameBunny_v1_1-Llama-3-8B-V-short_caption_dataset_20000_epochs_1_lora/adapter_model.safetensors @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:cabb144a8f73e1386612a4f2e8bdb914dcdc1f1ff59b3ba0cb089fd218695976 +size 671150064 diff --git a/single_dataset/short_caption/VideoGameBunny_v1_1-Llama-3-8B-V-short_caption_dataset_20000_epochs_1_lora/config.json b/single_dataset/short_caption/VideoGameBunny_v1_1-Llama-3-8B-V-short_caption_dataset_20000_epochs_1_lora/config.json new file mode 100644 index 0000000000000000000000000000000000000000..ee7ed86f4ff6f3381a6bb24981934be89aa376e7 --- /dev/null +++ b/single_dataset/short_caption/VideoGameBunny_v1_1-Llama-3-8B-V-short_caption_dataset_20000_epochs_1_lora/config.json @@ -0,0 +1,45 @@ +{ + "_name_or_path": "./weights/Bunny-v1_1-Llama-3-8B-V", + "architectures": [ + "BunnyLlamaForCausalLM" + ], + "attention_bias": false, + "attention_dropout": 0.0, + "auto_map": { + "AutoConfig": "configuration_bunny_llama.BunnyLlamaConfig", + "AutoModelForCausalLM": "modeling_bunny_llama.BunnyLlamaForCausalLM" + }, + "bos_token_id": 128000, + "continuous_training": false, + "eos_token_id": 128001, + "freeze_mm_mlp_adapter": false, + "hidden_act": "silu", + "hidden_size": 4096, + "image_aspect_ratio": "pad", + "initializer_range": 0.02, + "intermediate_size": 14336, + "max_position_embeddings": 8192, + "mm_hidden_size": 3456, + "mm_projector_lr": null, + "mm_projector_type": "mlp2x_gelu", + "mm_vision_tower": "./weights/siglip-so400m-patch14-384", + "model_type": "bunny-llama", + "num_attention_heads": 32, + "num_hidden_layers": 32, + "num_key_value_heads": 8, + "pretraining_tp": 1, + "rms_norm_eps": 1e-05, + "rope_scaling": null, + "rope_theta": 500000.0, + "tie_word_embeddings": false, + "tokenizer_model_max_length": 2048, + "tokenizer_padding_side": "right", + "torch_dtype": "float16", + "transformers_version": "4.41.2", + "tune_mm_mlp_adapter": false, + "unfreeze_vision_tower": true, + "use_cache": true, + "use_mm_proj": true, + "use_s2": true, + "vocab_size": 128256 +} diff --git a/single_dataset/short_caption/VideoGameBunny_v1_1-Llama-3-8B-V-short_caption_dataset_20000_epochs_1_lora/non_lora_trainables.bin b/single_dataset/short_caption/VideoGameBunny_v1_1-Llama-3-8B-V-short_caption_dataset_20000_epochs_1_lora/non_lora_trainables.bin new file mode 100644 index 0000000000000000000000000000000000000000..663527f23602619fab33e6381707423cb7e97433 --- /dev/null +++ b/single_dataset/short_caption/VideoGameBunny_v1_1-Llama-3-8B-V-short_caption_dataset_20000_epochs_1_lora/non_lora_trainables.bin @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:549752e36daf80a072340c08e532c8c2d7715202ab7e88363018d122504a4ffe +size 918507402 diff --git a/single_dataset/short_caption/VideoGameBunny_v1_1-Llama-3-8B-V-short_caption_dataset_20000_epochs_1_lora/trainer_state.json b/single_dataset/short_caption/VideoGameBunny_v1_1-Llama-3-8B-V-short_caption_dataset_20000_epochs_1_lora/trainer_state.json new file mode 100644 index 0000000000000000000000000000000000000000..391140927b14c0dfaa2d377b66159d516ecab6ec --- /dev/null +++ b/single_dataset/short_caption/VideoGameBunny_v1_1-Llama-3-8B-V-short_caption_dataset_20000_epochs_1_lora/trainer_state.json @@ -0,0 +1,8792 @@ +{ + "best_metric": null, + "best_model_checkpoint": null, + "epoch": 1.0, + "eval_steps": 500, + "global_step": 1250, + "is_hyper_param_search": false, + "is_local_process_zero": true, + "is_world_process_zero": true, + "log_history": [ + { + "epoch": 0.0008, + "grad_norm": 4.408342819175507, + "learning_rate": 5.263157894736842e-06, + "loss": 1.4923, + "step": 1 + }, + { + "epoch": 0.0016, + "grad_norm": 4.110439939161045, + "learning_rate": 1.0526315789473684e-05, + "loss": 1.4308, + "step": 2 + }, + { + "epoch": 0.0024, + "grad_norm": 3.627360088009308, + "learning_rate": 1.5789473684210526e-05, + "loss": 1.515, + "step": 3 + }, + { + "epoch": 0.0032, + "grad_norm": 2.509499785965118, + "learning_rate": 2.105263157894737e-05, + "loss": 1.4493, + "step": 4 + }, + { + "epoch": 0.004, + "grad_norm": 2.323968524411629, + "learning_rate": 2.6315789473684212e-05, + "loss": 1.3003, + "step": 5 + }, + { + "epoch": 0.0048, + "grad_norm": 1.995055111232002, + "learning_rate": 3.157894736842105e-05, + "loss": 1.1018, + "step": 6 + }, + { + "epoch": 0.0056, + "grad_norm": 1.889932196942419, + "learning_rate": 3.6842105263157895e-05, + "loss": 1.1819, + "step": 7 + }, + { + "epoch": 0.0064, + "grad_norm": 1.6684082430973282, + "learning_rate": 4.210526315789474e-05, + "loss": 1.02, + "step": 8 + }, + { + "epoch": 0.0072, + "grad_norm": 3.7755827325690263, + "learning_rate": 4.736842105263158e-05, + "loss": 0.9298, + "step": 9 + }, + { + "epoch": 0.008, + "grad_norm": 2.196627684433904, + "learning_rate": 5.2631578947368424e-05, + "loss": 1.0584, + "step": 10 + }, + { + "epoch": 0.0088, + "grad_norm": 1.6824981348422472, + "learning_rate": 5.789473684210527e-05, + "loss": 0.8387, + "step": 11 + }, + { + "epoch": 0.0096, + "grad_norm": 1.6198472340502856, + "learning_rate": 6.31578947368421e-05, + "loss": 0.8099, + "step": 12 + }, + { + "epoch": 0.0104, + "grad_norm": 1.8117757984096798, + "learning_rate": 6.842105263157895e-05, + "loss": 1.1271, + "step": 13 + }, + { + "epoch": 0.0112, + "grad_norm": 1.5445028531856497, + "learning_rate": 7.368421052631579e-05, + "loss": 0.8958, + "step": 14 + }, + { + "epoch": 0.012, + "grad_norm": 1.6869380158754907, + "learning_rate": 7.894736842105263e-05, + "loss": 0.9061, + "step": 15 + }, + { + "epoch": 0.0128, + "grad_norm": 1.4254207766658846, + "learning_rate": 8.421052631578948e-05, + "loss": 0.8401, + "step": 16 + }, + { + "epoch": 0.0136, + "grad_norm": 1.3931299025207386, + "learning_rate": 8.947368421052632e-05, + "loss": 0.8321, + "step": 17 + }, + { + "epoch": 0.0144, + "grad_norm": 1.328820134404943, + "learning_rate": 9.473684210526316e-05, + "loss": 0.8292, + "step": 18 + }, + { + "epoch": 0.0152, + "grad_norm": 1.3628683546991005, + "learning_rate": 0.0001, + "loss": 0.9717, + "step": 19 + }, + { + "epoch": 0.016, + "grad_norm": 1.071482812407722, + "learning_rate": 0.00010526315789473685, + "loss": 0.7228, + "step": 20 + }, + { + "epoch": 0.0168, + "grad_norm": 1.2276194299084946, + "learning_rate": 0.0001105263157894737, + "loss": 0.7846, + "step": 21 + }, + { + "epoch": 0.0176, + "grad_norm": 1.2605068638679624, + "learning_rate": 0.00011578947368421053, + "loss": 0.8337, + "step": 22 + }, + { + "epoch": 0.0184, + "grad_norm": 1.2125163219825974, + "learning_rate": 0.00012105263157894738, + "loss": 0.7455, + "step": 23 + }, + { + "epoch": 0.0192, + "grad_norm": 1.3559412240231647, + "learning_rate": 0.0001263157894736842, + "loss": 0.8704, + "step": 24 + }, + { + "epoch": 0.02, + "grad_norm": 1.1931836827361815, + "learning_rate": 0.00013157894736842108, + "loss": 0.7802, + "step": 25 + }, + { + "epoch": 0.0208, + "grad_norm": 1.2815768897397926, + "learning_rate": 0.0001368421052631579, + "loss": 0.7516, + "step": 26 + }, + { + "epoch": 0.0216, + "grad_norm": 1.2487451213256184, + "learning_rate": 0.00014210526315789474, + "loss": 0.8588, + "step": 27 + }, + { + "epoch": 0.0224, + "grad_norm": 1.3428187117644304, + "learning_rate": 0.00014736842105263158, + "loss": 0.7912, + "step": 28 + }, + { + "epoch": 0.0232, + "grad_norm": 1.5896900378362886, + "learning_rate": 0.00015263157894736845, + "loss": 0.7279, + "step": 29 + }, + { + "epoch": 0.024, + "grad_norm": 1.6758848687903816, + "learning_rate": 0.00015789473684210527, + "loss": 0.8775, + "step": 30 + }, + { + "epoch": 0.0248, + "grad_norm": 1.2914936264462082, + "learning_rate": 0.0001631578947368421, + "loss": 0.8787, + "step": 31 + }, + { + "epoch": 0.0256, + "grad_norm": 1.131260376713681, + "learning_rate": 0.00016842105263157895, + "loss": 0.8037, + "step": 32 + }, + { + "epoch": 0.0264, + "grad_norm": 1.3915706978304456, + "learning_rate": 0.0001736842105263158, + "loss": 0.8949, + "step": 33 + }, + { + "epoch": 0.0272, + "grad_norm": 1.3423957957870956, + "learning_rate": 0.00017894736842105264, + "loss": 1.0454, + "step": 34 + }, + { + "epoch": 0.028, + "grad_norm": 1.1888239761622272, + "learning_rate": 0.00018421052631578948, + "loss": 0.8437, + "step": 35 + }, + { + "epoch": 0.0288, + "grad_norm": 1.2529062114232787, + "learning_rate": 0.00018947368421052632, + "loss": 0.8602, + "step": 36 + }, + { + "epoch": 0.0296, + "grad_norm": 1.4199469178386859, + "learning_rate": 0.00019473684210526317, + "loss": 0.7949, + "step": 37 + }, + { + "epoch": 0.0304, + "grad_norm": 1.414948053728861, + "learning_rate": 0.0002, + "loss": 0.8271, + "step": 38 + }, + { + "epoch": 0.0312, + "grad_norm": 1.1710159773683249, + "learning_rate": 0.00019999966405802826, + "loss": 0.7915, + "step": 39 + }, + { + "epoch": 0.032, + "grad_norm": 1.5657566210361664, + "learning_rate": 0.00019999865623437013, + "loss": 0.8607, + "step": 40 + }, + { + "epoch": 0.0328, + "grad_norm": 1.446707772738992, + "learning_rate": 0.00019999697653579705, + "loss": 0.7905, + "step": 41 + }, + { + "epoch": 0.0336, + "grad_norm": 1.517378154263656, + "learning_rate": 0.00019999462497359466, + "loss": 0.9057, + "step": 42 + }, + { + "epoch": 0.0344, + "grad_norm": 1.2143992013412503, + "learning_rate": 0.0001999916015635627, + "loss": 0.7846, + "step": 43 + }, + { + "epoch": 0.0352, + "grad_norm": 1.1886700399966317, + "learning_rate": 0.00019998790632601496, + "loss": 0.7071, + "step": 44 + }, + { + "epoch": 0.036, + "grad_norm": 1.1456550756916823, + "learning_rate": 0.00019998353928577919, + "loss": 0.8241, + "step": 45 + }, + { + "epoch": 0.0368, + "grad_norm": 1.2086948448352206, + "learning_rate": 0.0001999785004721968, + "loss": 0.8328, + "step": 46 + }, + { + "epoch": 0.0376, + "grad_norm": 1.238059074281379, + "learning_rate": 0.0001999727899191228, + "loss": 0.7725, + "step": 47 + }, + { + "epoch": 0.0384, + "grad_norm": 1.2570874197951665, + "learning_rate": 0.00019996640766492543, + "loss": 0.8732, + "step": 48 + }, + { + "epoch": 0.0392, + "grad_norm": 1.2636827067135736, + "learning_rate": 0.00019995935375248606, + "loss": 0.7409, + "step": 49 + }, + { + "epoch": 0.04, + "grad_norm": 1.354771516961733, + "learning_rate": 0.00019995162822919883, + "loss": 0.8311, + "step": 50 + }, + { + "epoch": 0.0408, + "grad_norm": 1.1547283885905204, + "learning_rate": 0.00019994323114697022, + "loss": 0.706, + "step": 51 + }, + { + "epoch": 0.0416, + "grad_norm": 1.3394959141288645, + "learning_rate": 0.00019993416256221895, + "loss": 0.7398, + "step": 52 + }, + { + "epoch": 0.0424, + "grad_norm": 1.367883852532359, + "learning_rate": 0.0001999244225358753, + "loss": 0.8653, + "step": 53 + }, + { + "epoch": 0.0432, + "grad_norm": 1.3107142541709331, + "learning_rate": 0.00019991401113338104, + "loss": 0.8879, + "step": 54 + }, + { + "epoch": 0.044, + "grad_norm": 1.1805821359949669, + "learning_rate": 0.00019990292842468868, + "loss": 0.7184, + "step": 55 + }, + { + "epoch": 0.0448, + "grad_norm": 1.1968923989864841, + "learning_rate": 0.00019989117448426108, + "loss": 0.7353, + "step": 56 + }, + { + "epoch": 0.0456, + "grad_norm": 1.4422865807870915, + "learning_rate": 0.0001998787493910712, + "loss": 0.849, + "step": 57 + }, + { + "epoch": 0.0464, + "grad_norm": 1.2480605497927955, + "learning_rate": 0.00019986565322860115, + "loss": 0.7776, + "step": 58 + }, + { + "epoch": 0.0472, + "grad_norm": 1.2418802053286733, + "learning_rate": 0.000199851886084842, + "loss": 0.7971, + "step": 59 + }, + { + "epoch": 0.048, + "grad_norm": 1.1927030318890224, + "learning_rate": 0.00019983744805229296, + "loss": 0.8142, + "step": 60 + }, + { + "epoch": 0.0488, + "grad_norm": 1.1873157476816283, + "learning_rate": 0.00019982233922796085, + "loss": 0.7529, + "step": 61 + }, + { + "epoch": 0.0496, + "grad_norm": 1.2902061763218409, + "learning_rate": 0.00019980655971335945, + "loss": 0.8838, + "step": 62 + }, + { + "epoch": 0.0504, + "grad_norm": 1.340897802880216, + "learning_rate": 0.00019979010961450878, + "loss": 0.8265, + "step": 63 + }, + { + "epoch": 0.0512, + "grad_norm": 1.2373723619717891, + "learning_rate": 0.00019977298904193437, + "loss": 0.719, + "step": 64 + }, + { + "epoch": 0.052, + "grad_norm": 1.107760632682364, + "learning_rate": 0.00019975519811066663, + "loss": 0.7628, + "step": 65 + }, + { + "epoch": 0.0528, + "grad_norm": 1.1981904383495332, + "learning_rate": 0.00019973673694024, + "loss": 0.8435, + "step": 66 + }, + { + "epoch": 0.0536, + "grad_norm": 1.1429350930888995, + "learning_rate": 0.0001997176056546921, + "loss": 0.7228, + "step": 67 + }, + { + "epoch": 0.0544, + "grad_norm": 1.435754871570911, + "learning_rate": 0.00019969780438256293, + "loss": 0.7299, + "step": 68 + }, + { + "epoch": 0.0552, + "grad_norm": 1.1865159800313239, + "learning_rate": 0.0001996773332568941, + "loss": 0.8046, + "step": 69 + }, + { + "epoch": 0.056, + "grad_norm": 1.176760552196966, + "learning_rate": 0.0001996561924152278, + "loss": 0.8158, + "step": 70 + }, + { + "epoch": 0.0568, + "grad_norm": 1.10103801941083, + "learning_rate": 0.00019963438199960599, + "loss": 0.7268, + "step": 71 + }, + { + "epoch": 0.0576, + "grad_norm": 1.305214804300615, + "learning_rate": 0.0001996119021565693, + "loss": 0.8605, + "step": 72 + }, + { + "epoch": 0.0584, + "grad_norm": 1.2743862748586496, + "learning_rate": 0.00019958875303715615, + "loss": 0.9368, + "step": 73 + }, + { + "epoch": 0.0592, + "grad_norm": 1.0492622374201535, + "learning_rate": 0.0001995649347969019, + "loss": 0.7463, + "step": 74 + }, + { + "epoch": 0.06, + "grad_norm": 1.1931733147785315, + "learning_rate": 0.0001995404475958373, + "loss": 0.8728, + "step": 75 + }, + { + "epoch": 0.0608, + "grad_norm": 1.0460031573743418, + "learning_rate": 0.00019951529159848805, + "loss": 0.7114, + "step": 76 + }, + { + "epoch": 0.0616, + "grad_norm": 1.1399063887491887, + "learning_rate": 0.0001994894669738732, + "loss": 0.7837, + "step": 77 + }, + { + "epoch": 0.0624, + "grad_norm": 1.0790701153365434, + "learning_rate": 0.00019946297389550433, + "loss": 0.7044, + "step": 78 + }, + { + "epoch": 0.0632, + "grad_norm": 1.1576158790364408, + "learning_rate": 0.0001994358125413841, + "loss": 0.8326, + "step": 79 + }, + { + "epoch": 0.064, + "grad_norm": 1.0646241406248742, + "learning_rate": 0.00019940798309400526, + "loss": 0.6692, + "step": 80 + }, + { + "epoch": 0.0648, + "grad_norm": 1.1308196743214243, + "learning_rate": 0.0001993794857403495, + "loss": 0.6967, + "step": 81 + }, + { + "epoch": 0.0656, + "grad_norm": 1.149997527773717, + "learning_rate": 0.0001993503206718859, + "loss": 0.7963, + "step": 82 + }, + { + "epoch": 0.0664, + "grad_norm": 1.1742855406191646, + "learning_rate": 0.0001993204880845699, + "loss": 0.8313, + "step": 83 + }, + { + "epoch": 0.0672, + "grad_norm": 1.127633976627731, + "learning_rate": 0.00019928998817884182, + "loss": 0.8215, + "step": 84 + }, + { + "epoch": 0.068, + "grad_norm": 1.2741868523423572, + "learning_rate": 0.00019925882115962568, + "loss": 0.9249, + "step": 85 + }, + { + "epoch": 0.0688, + "grad_norm": 1.1549550551753378, + "learning_rate": 0.00019922698723632767, + "loss": 0.7753, + "step": 86 + }, + { + "epoch": 0.0696, + "grad_norm": 1.222430259153762, + "learning_rate": 0.00019919448662283478, + "loss": 0.7563, + "step": 87 + }, + { + "epoch": 0.0704, + "grad_norm": 1.0514952641117037, + "learning_rate": 0.00019916131953751342, + "loss": 0.8027, + "step": 88 + }, + { + "epoch": 0.0712, + "grad_norm": 1.086998228036307, + "learning_rate": 0.00019912748620320794, + "loss": 0.7755, + "step": 89 + }, + { + "epoch": 0.072, + "grad_norm": 1.1630351309565898, + "learning_rate": 0.00019909298684723904, + "loss": 0.7805, + "step": 90 + }, + { + "epoch": 0.0728, + "grad_norm": 1.184830463331376, + "learning_rate": 0.00019905782170140238, + "loss": 0.7548, + "step": 91 + }, + { + "epoch": 0.0736, + "grad_norm": 1.434415169297192, + "learning_rate": 0.00019902199100196697, + "loss": 0.7904, + "step": 92 + }, + { + "epoch": 0.0744, + "grad_norm": 1.2341565822260379, + "learning_rate": 0.00019898549498967343, + "loss": 0.74, + "step": 93 + }, + { + "epoch": 0.0752, + "grad_norm": 1.32604291197053, + "learning_rate": 0.00019894833390973266, + "loss": 0.8844, + "step": 94 + }, + { + "epoch": 0.076, + "grad_norm": 1.1274789532241294, + "learning_rate": 0.000198910508011824, + "loss": 0.8091, + "step": 95 + }, + { + "epoch": 0.0768, + "grad_norm": 1.1333728621491337, + "learning_rate": 0.00019887201755009357, + "loss": 0.7048, + "step": 96 + }, + { + "epoch": 0.0776, + "grad_norm": 1.4217150158455454, + "learning_rate": 0.00019883286278315262, + "loss": 0.8696, + "step": 97 + }, + { + "epoch": 0.0784, + "grad_norm": 1.236434536080269, + "learning_rate": 0.0001987930439740757, + "loss": 0.8657, + "step": 98 + }, + { + "epoch": 0.0792, + "grad_norm": 1.2906232510193867, + "learning_rate": 0.00019875256139039902, + "loss": 0.9025, + "step": 99 + }, + { + "epoch": 0.08, + "grad_norm": 1.174051064286104, + "learning_rate": 0.00019871141530411853, + "loss": 0.7729, + "step": 100 + }, + { + "epoch": 0.0808, + "grad_norm": 1.190405459676803, + "learning_rate": 0.00019866960599168826, + "loss": 0.7598, + "step": 101 + }, + { + "epoch": 0.0816, + "grad_norm": 1.2428373437223619, + "learning_rate": 0.0001986271337340182, + "loss": 0.8066, + "step": 102 + }, + { + "epoch": 0.0824, + "grad_norm": 1.0756769612595862, + "learning_rate": 0.0001985839988164726, + "loss": 0.7582, + "step": 103 + }, + { + "epoch": 0.0832, + "grad_norm": 1.053929056366854, + "learning_rate": 0.00019854020152886814, + "loss": 0.7495, + "step": 104 + }, + { + "epoch": 0.084, + "grad_norm": 1.302085916712659, + "learning_rate": 0.00019849574216547171, + "loss": 0.8239, + "step": 105 + }, + { + "epoch": 0.0848, + "grad_norm": 1.2202083319942585, + "learning_rate": 0.0001984506210249986, + "loss": 0.7572, + "step": 106 + }, + { + "epoch": 0.0856, + "grad_norm": 1.123037646990899, + "learning_rate": 0.00019840483841061058, + "loss": 0.727, + "step": 107 + }, + { + "epoch": 0.0864, + "grad_norm": 1.096235066230464, + "learning_rate": 0.00019835839462991361, + "loss": 0.8576, + "step": 108 + }, + { + "epoch": 0.0872, + "grad_norm": 1.0847203500993057, + "learning_rate": 0.00019831128999495606, + "loss": 0.7677, + "step": 109 + }, + { + "epoch": 0.088, + "grad_norm": 1.1951784729681403, + "learning_rate": 0.00019826352482222638, + "loss": 0.7637, + "step": 110 + }, + { + "epoch": 0.0888, + "grad_norm": 1.3106303412164404, + "learning_rate": 0.0001982150994326511, + "loss": 0.9177, + "step": 111 + }, + { + "epoch": 0.0896, + "grad_norm": 1.01740925515726, + "learning_rate": 0.00019816601415159263, + "loss": 0.7668, + "step": 112 + }, + { + "epoch": 0.0904, + "grad_norm": 1.1165587109703807, + "learning_rate": 0.0001981162693088471, + "loss": 0.6997, + "step": 113 + }, + { + "epoch": 0.0912, + "grad_norm": 1.339872329337332, + "learning_rate": 0.0001980658652386421, + "loss": 0.9134, + "step": 114 + }, + { + "epoch": 0.092, + "grad_norm": 1.155065798032305, + "learning_rate": 0.0001980148022796345, + "loss": 0.7457, + "step": 115 + }, + { + "epoch": 0.0928, + "grad_norm": 1.115420938539364, + "learning_rate": 0.00019796308077490817, + "loss": 0.7114, + "step": 116 + }, + { + "epoch": 0.0936, + "grad_norm": 1.0275049872706479, + "learning_rate": 0.00019791070107197153, + "loss": 0.7621, + "step": 117 + }, + { + "epoch": 0.0944, + "grad_norm": 1.1422631177427438, + "learning_rate": 0.00019785766352275542, + "loss": 0.7088, + "step": 118 + }, + { + "epoch": 0.0952, + "grad_norm": 1.1723839805566112, + "learning_rate": 0.0001978039684836106, + "loss": 0.7436, + "step": 119 + }, + { + "epoch": 0.096, + "grad_norm": 1.0082428312322236, + "learning_rate": 0.00019774961631530545, + "loss": 0.7748, + "step": 120 + }, + { + "epoch": 0.0968, + "grad_norm": 1.0938380826486647, + "learning_rate": 0.0001976946073830234, + "loss": 0.7407, + "step": 121 + }, + { + "epoch": 0.0976, + "grad_norm": 1.1495446964574842, + "learning_rate": 0.00019763894205636072, + "loss": 0.8554, + "step": 122 + }, + { + "epoch": 0.0984, + "grad_norm": 0.9819945252756392, + "learning_rate": 0.00019758262070932375, + "loss": 0.7449, + "step": 123 + }, + { + "epoch": 0.0992, + "grad_norm": 0.9841459680536125, + "learning_rate": 0.00019752564372032657, + "loss": 0.7292, + "step": 124 + }, + { + "epoch": 0.1, + "grad_norm": 1.377259630104572, + "learning_rate": 0.00019746801147218842, + "loss": 0.8627, + "step": 125 + }, + { + "epoch": 0.1008, + "grad_norm": 1.0954099330436704, + "learning_rate": 0.00019740972435213115, + "loss": 0.8581, + "step": 126 + }, + { + "epoch": 0.1016, + "grad_norm": 1.0801564424791268, + "learning_rate": 0.00019735078275177654, + "loss": 0.7146, + "step": 127 + }, + { + "epoch": 0.1024, + "grad_norm": 1.186455419707288, + "learning_rate": 0.00019729118706714375, + "loss": 0.8809, + "step": 128 + }, + { + "epoch": 0.1032, + "grad_norm": 1.1156046251711629, + "learning_rate": 0.00019723093769864663, + "loss": 0.8294, + "step": 129 + }, + { + "epoch": 0.104, + "grad_norm": 1.0552736379204224, + "learning_rate": 0.00019717003505109095, + "loss": 0.7684, + "step": 130 + }, + { + "epoch": 0.1048, + "grad_norm": 1.0881246454501003, + "learning_rate": 0.0001971084795336719, + "loss": 0.7453, + "step": 131 + }, + { + "epoch": 0.1056, + "grad_norm": 1.0023599103674323, + "learning_rate": 0.00019704627155997108, + "loss": 0.77, + "step": 132 + }, + { + "epoch": 0.1064, + "grad_norm": 1.1699658416978167, + "learning_rate": 0.00019698341154795389, + "loss": 0.8204, + "step": 133 + }, + { + "epoch": 0.1072, + "grad_norm": 1.1614645359235414, + "learning_rate": 0.00019691989991996663, + "loss": 0.7786, + "step": 134 + }, + { + "epoch": 0.108, + "grad_norm": 1.097653278367737, + "learning_rate": 0.00019685573710273376, + "loss": 0.6808, + "step": 135 + }, + { + "epoch": 0.1088, + "grad_norm": 1.1734841782207333, + "learning_rate": 0.0001967909235273549, + "loss": 0.8497, + "step": 136 + }, + { + "epoch": 0.1096, + "grad_norm": 1.0942018776385893, + "learning_rate": 0.00019672545962930215, + "loss": 0.6629, + "step": 137 + }, + { + "epoch": 0.1104, + "grad_norm": 1.1238186898701805, + "learning_rate": 0.00019665934584841682, + "loss": 0.725, + "step": 138 + }, + { + "epoch": 0.1112, + "grad_norm": 1.2471987096128463, + "learning_rate": 0.00019659258262890683, + "loss": 0.7983, + "step": 139 + }, + { + "epoch": 0.112, + "grad_norm": 1.22943045949165, + "learning_rate": 0.00019652517041934356, + "loss": 0.8379, + "step": 140 + }, + { + "epoch": 0.1128, + "grad_norm": 0.9922021980387541, + "learning_rate": 0.00019645710967265882, + "loss": 0.7344, + "step": 141 + }, + { + "epoch": 0.1136, + "grad_norm": 1.1142637872966648, + "learning_rate": 0.00019638840084614182, + "loss": 0.7845, + "step": 142 + }, + { + "epoch": 0.1144, + "grad_norm": 1.0269659172627574, + "learning_rate": 0.00019631904440143612, + "loss": 0.7648, + "step": 143 + }, + { + "epoch": 0.1152, + "grad_norm": 1.0338558129722986, + "learning_rate": 0.00019624904080453655, + "loss": 0.7507, + "step": 144 + }, + { + "epoch": 0.116, + "grad_norm": 1.0709258671144004, + "learning_rate": 0.00019617839052578603, + "loss": 0.7821, + "step": 145 + }, + { + "epoch": 0.1168, + "grad_norm": 1.252304631033193, + "learning_rate": 0.00019610709403987246, + "loss": 0.8575, + "step": 146 + }, + { + "epoch": 0.1176, + "grad_norm": 1.0327264932414084, + "learning_rate": 0.0001960351518258255, + "loss": 0.7447, + "step": 147 + }, + { + "epoch": 0.1184, + "grad_norm": 1.2688781875903254, + "learning_rate": 0.00019596256436701324, + "loss": 0.8508, + "step": 148 + }, + { + "epoch": 0.1192, + "grad_norm": 1.1803015987731276, + "learning_rate": 0.00019588933215113926, + "loss": 0.7471, + "step": 149 + }, + { + "epoch": 0.12, + "grad_norm": 1.0899050814056226, + "learning_rate": 0.000195815455670239, + "loss": 0.7357, + "step": 150 + }, + { + "epoch": 0.1208, + "grad_norm": 0.9177192363588716, + "learning_rate": 0.00019574093542067673, + "loss": 0.6452, + "step": 151 + }, + { + "epoch": 0.1216, + "grad_norm": 1.1109819282825883, + "learning_rate": 0.00019566577190314197, + "loss": 0.7391, + "step": 152 + }, + { + "epoch": 0.1224, + "grad_norm": 1.0845202801960454, + "learning_rate": 0.0001955899656226464, + "loss": 0.799, + "step": 153 + }, + { + "epoch": 0.1232, + "grad_norm": 0.9784873646403908, + "learning_rate": 0.0001955135170885202, + "loss": 0.6742, + "step": 154 + }, + { + "epoch": 0.124, + "grad_norm": 1.1502313941088143, + "learning_rate": 0.0001954364268144088, + "loss": 0.7794, + "step": 155 + }, + { + "epoch": 0.1248, + "grad_norm": 1.2651075421089168, + "learning_rate": 0.00019535869531826937, + "loss": 0.8077, + "step": 156 + }, + { + "epoch": 0.1256, + "grad_norm": 1.2660307896771321, + "learning_rate": 0.00019528032312236736, + "loss": 0.9314, + "step": 157 + }, + { + "epoch": 0.1264, + "grad_norm": 1.219453764327284, + "learning_rate": 0.00019520131075327298, + "loss": 0.9102, + "step": 158 + }, + { + "epoch": 0.1272, + "grad_norm": 1.1752038197472823, + "learning_rate": 0.00019512165874185767, + "loss": 0.859, + "step": 159 + }, + { + "epoch": 0.128, + "grad_norm": 1.042717048482976, + "learning_rate": 0.00019504136762329047, + "loss": 0.7813, + "step": 160 + }, + { + "epoch": 0.1288, + "grad_norm": 0.9947703403682241, + "learning_rate": 0.0001949604379370345, + "loss": 0.7639, + "step": 161 + }, + { + "epoch": 0.1296, + "grad_norm": 1.0285566389334777, + "learning_rate": 0.00019487887022684336, + "loss": 0.7345, + "step": 162 + }, + { + "epoch": 0.1304, + "grad_norm": 1.029063636323494, + "learning_rate": 0.00019479666504075736, + "loss": 0.7483, + "step": 163 + }, + { + "epoch": 0.1312, + "grad_norm": 1.0906097120134792, + "learning_rate": 0.00019471382293110003, + "loss": 0.7249, + "step": 164 + }, + { + "epoch": 0.132, + "grad_norm": 1.1334391639310268, + "learning_rate": 0.0001946303444544741, + "loss": 0.7058, + "step": 165 + }, + { + "epoch": 0.1328, + "grad_norm": 0.9833642785833447, + "learning_rate": 0.00019454623017175812, + "loss": 0.7198, + "step": 166 + }, + { + "epoch": 0.1336, + "grad_norm": 1.2967508433454027, + "learning_rate": 0.00019446148064810242, + "loss": 1.0848, + "step": 167 + }, + { + "epoch": 0.1344, + "grad_norm": 1.2014098936434896, + "learning_rate": 0.00019437609645292546, + "loss": 0.7061, + "step": 168 + }, + { + "epoch": 0.1352, + "grad_norm": 0.9760578979869019, + "learning_rate": 0.00019429007815990993, + "loss": 0.6657, + "step": 169 + }, + { + "epoch": 0.136, + "grad_norm": 1.1154382155788707, + "learning_rate": 0.0001942034263469989, + "loss": 0.721, + "step": 170 + }, + { + "epoch": 0.1368, + "grad_norm": 1.1067342540180944, + "learning_rate": 0.00019411614159639204, + "loss": 0.7666, + "step": 171 + }, + { + "epoch": 0.1376, + "grad_norm": 1.1078308318396748, + "learning_rate": 0.00019402822449454153, + "loss": 0.7355, + "step": 172 + }, + { + "epoch": 0.1384, + "grad_norm": 1.0664046934283806, + "learning_rate": 0.00019393967563214833, + "loss": 0.7811, + "step": 173 + }, + { + "epoch": 0.1392, + "grad_norm": 1.1422077472163201, + "learning_rate": 0.00019385049560415794, + "loss": 0.7691, + "step": 174 + }, + { + "epoch": 0.14, + "grad_norm": 1.0985709920564717, + "learning_rate": 0.00019376068500975667, + "loss": 0.731, + "step": 175 + }, + { + "epoch": 0.1408, + "grad_norm": 1.1252405981323013, + "learning_rate": 0.00019367024445236754, + "loss": 0.7998, + "step": 176 + }, + { + "epoch": 0.1416, + "grad_norm": 1.0675403907916068, + "learning_rate": 0.000193579174539646, + "loss": 0.7248, + "step": 177 + }, + { + "epoch": 0.1424, + "grad_norm": 1.05184998619709, + "learning_rate": 0.00019348747588347637, + "loss": 0.7517, + "step": 178 + }, + { + "epoch": 0.1432, + "grad_norm": 1.1147301140311476, + "learning_rate": 0.00019339514909996706, + "loss": 0.7584, + "step": 179 + }, + { + "epoch": 0.144, + "grad_norm": 0.9105532274684514, + "learning_rate": 0.00019330219480944694, + "loss": 0.7178, + "step": 180 + }, + { + "epoch": 0.1448, + "grad_norm": 1.0654400102516637, + "learning_rate": 0.00019320861363646095, + "loss": 0.8339, + "step": 181 + }, + { + "epoch": 0.1456, + "grad_norm": 1.013459177418056, + "learning_rate": 0.00019311440620976597, + "loss": 0.7937, + "step": 182 + }, + { + "epoch": 0.1464, + "grad_norm": 1.0619435864285793, + "learning_rate": 0.00019301957316232658, + "loss": 0.8075, + "step": 183 + }, + { + "epoch": 0.1472, + "grad_norm": 1.0415089705114855, + "learning_rate": 0.0001929241151313108, + "loss": 0.9126, + "step": 184 + }, + { + "epoch": 0.148, + "grad_norm": 0.9923339843313642, + "learning_rate": 0.0001928280327580858, + "loss": 0.6941, + "step": 185 + }, + { + "epoch": 0.1488, + "grad_norm": 1.05939359254116, + "learning_rate": 0.00019273132668821364, + "loss": 0.7379, + "step": 186 + }, + { + "epoch": 0.1496, + "grad_norm": 1.056017771383427, + "learning_rate": 0.00019263399757144683, + "loss": 0.8552, + "step": 187 + }, + { + "epoch": 0.1504, + "grad_norm": 1.1849751523305005, + "learning_rate": 0.00019253604606172417, + "loss": 0.8967, + "step": 188 + }, + { + "epoch": 0.1512, + "grad_norm": 1.0163769370421663, + "learning_rate": 0.000192437472817166, + "loss": 0.6705, + "step": 189 + }, + { + "epoch": 0.152, + "grad_norm": 1.1637448866611821, + "learning_rate": 0.00019233827850007027, + "loss": 0.7875, + "step": 190 + }, + { + "epoch": 0.1528, + "grad_norm": 1.0582185088998888, + "learning_rate": 0.00019223846377690754, + "loss": 0.7379, + "step": 191 + }, + { + "epoch": 0.1536, + "grad_norm": 1.107910568839856, + "learning_rate": 0.00019213802931831696, + "loss": 0.8034, + "step": 192 + }, + { + "epoch": 0.1544, + "grad_norm": 1.193940802100663, + "learning_rate": 0.00019203697579910154, + "loss": 0.9218, + "step": 193 + }, + { + "epoch": 0.1552, + "grad_norm": 1.031544957221749, + "learning_rate": 0.00019193530389822363, + "loss": 0.8096, + "step": 194 + }, + { + "epoch": 0.156, + "grad_norm": 1.0550279337742818, + "learning_rate": 0.00019183301429880043, + "loss": 0.7969, + "step": 195 + }, + { + "epoch": 0.1568, + "grad_norm": 1.339378720092458, + "learning_rate": 0.00019173010768809933, + "loss": 0.7135, + "step": 196 + }, + { + "epoch": 0.1576, + "grad_norm": 1.053673240761701, + "learning_rate": 0.00019162658475753327, + "loss": 0.6952, + "step": 197 + }, + { + "epoch": 0.1584, + "grad_norm": 0.9992381473663097, + "learning_rate": 0.0001915224462026563, + "loss": 0.6662, + "step": 198 + }, + { + "epoch": 0.1592, + "grad_norm": 1.0255698032449383, + "learning_rate": 0.00019141769272315858, + "loss": 0.6567, + "step": 199 + }, + { + "epoch": 0.16, + "grad_norm": 1.133600671042738, + "learning_rate": 0.00019131232502286188, + "loss": 0.7207, + "step": 200 + }, + { + "epoch": 0.1608, + "grad_norm": 1.2134110529027724, + "learning_rate": 0.00019120634380971496, + "loss": 0.7027, + "step": 201 + }, + { + "epoch": 0.1616, + "grad_norm": 1.2053506400272829, + "learning_rate": 0.0001910997497957885, + "loss": 0.7832, + "step": 202 + }, + { + "epoch": 0.1624, + "grad_norm": 1.1102749097485602, + "learning_rate": 0.0001909925436972706, + "loss": 0.7118, + "step": 203 + }, + { + "epoch": 0.1632, + "grad_norm": 1.178902593048484, + "learning_rate": 0.00019088472623446183, + "loss": 0.7437, + "step": 204 + }, + { + "epoch": 0.164, + "grad_norm": 0.9933065180211077, + "learning_rate": 0.00019077629813177036, + "loss": 0.7066, + "step": 205 + }, + { + "epoch": 0.1648, + "grad_norm": 1.1495869058395793, + "learning_rate": 0.00019066726011770726, + "loss": 0.7963, + "step": 206 + }, + { + "epoch": 0.1656, + "grad_norm": 1.1003146005307034, + "learning_rate": 0.00019055761292488142, + "loss": 0.7443, + "step": 207 + }, + { + "epoch": 0.1664, + "grad_norm": 0.9808868668847505, + "learning_rate": 0.0001904473572899947, + "loss": 0.6977, + "step": 208 + }, + { + "epoch": 0.1672, + "grad_norm": 1.1925770685238926, + "learning_rate": 0.00019033649395383702, + "loss": 0.6928, + "step": 209 + }, + { + "epoch": 0.168, + "grad_norm": 0.927693974621165, + "learning_rate": 0.00019022502366128135, + "loss": 0.6247, + "step": 210 + }, + { + "epoch": 0.1688, + "grad_norm": 1.0742454706985174, + "learning_rate": 0.00019011294716127867, + "loss": 0.7723, + "step": 211 + }, + { + "epoch": 0.1696, + "grad_norm": 1.051319474324855, + "learning_rate": 0.00019000026520685302, + "loss": 0.8035, + "step": 212 + }, + { + "epoch": 0.1704, + "grad_norm": 1.1216518209851702, + "learning_rate": 0.0001898869785550963, + "loss": 0.7857, + "step": 213 + }, + { + "epoch": 0.1712, + "grad_norm": 1.0553361616852703, + "learning_rate": 0.0001897730879671634, + "loss": 0.7145, + "step": 214 + }, + { + "epoch": 0.172, + "grad_norm": 1.0715026333857263, + "learning_rate": 0.00018965859420826684, + "loss": 0.7705, + "step": 215 + }, + { + "epoch": 0.1728, + "grad_norm": 1.0211629635492872, + "learning_rate": 0.00018954349804767184, + "loss": 0.7289, + "step": 216 + }, + { + "epoch": 0.1736, + "grad_norm": 1.0409857521666497, + "learning_rate": 0.00018942780025869098, + "loss": 0.7827, + "step": 217 + }, + { + "epoch": 0.1744, + "grad_norm": 0.9309511034517031, + "learning_rate": 0.00018931150161867916, + "loss": 0.7313, + "step": 218 + }, + { + "epoch": 0.1752, + "grad_norm": 0.9576692146265773, + "learning_rate": 0.00018919460290902826, + "loss": 0.6621, + "step": 219 + }, + { + "epoch": 0.176, + "grad_norm": 1.062739137706439, + "learning_rate": 0.00018907710491516199, + "loss": 0.7597, + "step": 220 + }, + { + "epoch": 0.1768, + "grad_norm": 0.9310388910091685, + "learning_rate": 0.0001889590084265304, + "loss": 0.7826, + "step": 221 + }, + { + "epoch": 0.1776, + "grad_norm": 1.1253541815203907, + "learning_rate": 0.0001888403142366049, + "loss": 0.8474, + "step": 222 + }, + { + "epoch": 0.1784, + "grad_norm": 1.1166945029798787, + "learning_rate": 0.0001887210231428727, + "loss": 0.7838, + "step": 223 + }, + { + "epoch": 0.1792, + "grad_norm": 1.1496684622972821, + "learning_rate": 0.00018860113594683148, + "loss": 0.7847, + "step": 224 + }, + { + "epoch": 0.18, + "grad_norm": 1.0678223253517922, + "learning_rate": 0.0001884806534539841, + "loss": 0.8023, + "step": 225 + }, + { + "epoch": 0.1808, + "grad_norm": 1.1793532152413913, + "learning_rate": 0.00018835957647383303, + "loss": 0.794, + "step": 226 + }, + { + "epoch": 0.1816, + "grad_norm": 0.9831007867254152, + "learning_rate": 0.0001882379058198751, + "loss": 0.6924, + "step": 227 + }, + { + "epoch": 0.1824, + "grad_norm": 1.0453689547667198, + "learning_rate": 0.00018811564230959588, + "loss": 0.7247, + "step": 228 + }, + { + "epoch": 0.1832, + "grad_norm": 0.9882385097853617, + "learning_rate": 0.00018799278676446423, + "loss": 0.7262, + "step": 229 + }, + { + "epoch": 0.184, + "grad_norm": 1.080323670807221, + "learning_rate": 0.00018786934000992688, + "loss": 0.6887, + "step": 230 + }, + { + "epoch": 0.1848, + "grad_norm": 0.9501516932213585, + "learning_rate": 0.00018774530287540278, + "loss": 0.7022, + "step": 231 + }, + { + "epoch": 0.1856, + "grad_norm": 1.1400626270113077, + "learning_rate": 0.00018762067619427746, + "loss": 0.8338, + "step": 232 + }, + { + "epoch": 0.1864, + "grad_norm": 1.0003535208374308, + "learning_rate": 0.00018749546080389757, + "loss": 0.7794, + "step": 233 + }, + { + "epoch": 0.1872, + "grad_norm": 1.1702426650805333, + "learning_rate": 0.00018736965754556528, + "loss": 0.8726, + "step": 234 + }, + { + "epoch": 0.188, + "grad_norm": 1.184758961113029, + "learning_rate": 0.00018724326726453244, + "loss": 0.8259, + "step": 235 + }, + { + "epoch": 0.1888, + "grad_norm": 1.112356983214534, + "learning_rate": 0.00018711629080999504, + "loss": 0.8045, + "step": 236 + }, + { + "epoch": 0.1896, + "grad_norm": 0.9831874184738855, + "learning_rate": 0.00018698872903508755, + "loss": 0.6723, + "step": 237 + }, + { + "epoch": 0.1904, + "grad_norm": 1.026630504115062, + "learning_rate": 0.00018686058279687698, + "loss": 0.7538, + "step": 238 + }, + { + "epoch": 0.1912, + "grad_norm": 1.0318358455151204, + "learning_rate": 0.0001867318529563574, + "loss": 0.7098, + "step": 239 + }, + { + "epoch": 0.192, + "grad_norm": 1.0884518238836358, + "learning_rate": 0.00018660254037844388, + "loss": 0.762, + "step": 240 + }, + { + "epoch": 0.1928, + "grad_norm": 1.1808121910364464, + "learning_rate": 0.00018647264593196688, + "loss": 0.8122, + "step": 241 + }, + { + "epoch": 0.1936, + "grad_norm": 1.0831484545402992, + "learning_rate": 0.00018634217048966637, + "loss": 0.7173, + "step": 242 + }, + { + "epoch": 0.1944, + "grad_norm": 1.065442394577919, + "learning_rate": 0.00018621111492818585, + "loss": 0.7634, + "step": 243 + }, + { + "epoch": 0.1952, + "grad_norm": 1.176715798585199, + "learning_rate": 0.0001860794801280666, + "loss": 0.8176, + "step": 244 + }, + { + "epoch": 0.196, + "grad_norm": 0.9816781183969229, + "learning_rate": 0.00018594726697374175, + "loss": 0.6896, + "step": 245 + }, + { + "epoch": 0.1968, + "grad_norm": 1.099486782417909, + "learning_rate": 0.0001858144763535302, + "loss": 0.8123, + "step": 246 + }, + { + "epoch": 0.1976, + "grad_norm": 1.0148183908798116, + "learning_rate": 0.0001856811091596308, + "loss": 0.7322, + "step": 247 + }, + { + "epoch": 0.1984, + "grad_norm": 0.9799797820366946, + "learning_rate": 0.0001855471662881164, + "loss": 0.7213, + "step": 248 + }, + { + "epoch": 0.1992, + "grad_norm": 1.0340760098203616, + "learning_rate": 0.00018541264863892754, + "loss": 0.703, + "step": 249 + }, + { + "epoch": 0.2, + "grad_norm": 1.5978607470498314, + "learning_rate": 0.00018527755711586678, + "loss": 0.6617, + "step": 250 + }, + { + "epoch": 0.2008, + "grad_norm": 1.088949176532953, + "learning_rate": 0.00018514189262659235, + "loss": 0.8486, + "step": 251 + }, + { + "epoch": 0.2016, + "grad_norm": 1.0390771666964085, + "learning_rate": 0.00018500565608261214, + "loss": 0.7702, + "step": 252 + }, + { + "epoch": 0.2024, + "grad_norm": 0.9454467561447718, + "learning_rate": 0.00018486884839927768, + "loss": 0.6682, + "step": 253 + }, + { + "epoch": 0.2032, + "grad_norm": 1.0688599597247057, + "learning_rate": 0.00018473147049577774, + "loss": 0.8487, + "step": 254 + }, + { + "epoch": 0.204, + "grad_norm": 0.8988151969453305, + "learning_rate": 0.0001845935232951325, + "loss": 0.6799, + "step": 255 + }, + { + "epoch": 0.2048, + "grad_norm": 0.9681952414826085, + "learning_rate": 0.00018445500772418697, + "loss": 0.6717, + "step": 256 + }, + { + "epoch": 0.2056, + "grad_norm": 1.1719052599277335, + "learning_rate": 0.00018431592471360503, + "loss": 0.8016, + "step": 257 + }, + { + "epoch": 0.2064, + "grad_norm": 0.954039432416863, + "learning_rate": 0.00018417627519786315, + "loss": 0.6911, + "step": 258 + }, + { + "epoch": 0.2072, + "grad_norm": 1.2131581008392034, + "learning_rate": 0.000184036060115244, + "loss": 0.8597, + "step": 259 + }, + { + "epoch": 0.208, + "grad_norm": 0.9789998124089584, + "learning_rate": 0.00018389528040783012, + "loss": 0.6927, + "step": 260 + }, + { + "epoch": 0.2088, + "grad_norm": 0.9877726243104934, + "learning_rate": 0.00018375393702149787, + "loss": 0.6464, + "step": 261 + }, + { + "epoch": 0.2096, + "grad_norm": 1.0150532508929966, + "learning_rate": 0.00018361203090591071, + "loss": 0.7407, + "step": 262 + }, + { + "epoch": 0.2104, + "grad_norm": 1.0882168566410513, + "learning_rate": 0.00018346956301451304, + "loss": 0.724, + "step": 263 + }, + { + "epoch": 0.2112, + "grad_norm": 0.982861951878359, + "learning_rate": 0.00018332653430452376, + "loss": 0.7284, + "step": 264 + }, + { + "epoch": 0.212, + "grad_norm": 1.0030001487449376, + "learning_rate": 0.00018318294573692985, + "loss": 0.7069, + "step": 265 + }, + { + "epoch": 0.2128, + "grad_norm": 1.0280008372899851, + "learning_rate": 0.00018303879827647975, + "loss": 0.7594, + "step": 266 + }, + { + "epoch": 0.2136, + "grad_norm": 1.0962550866233343, + "learning_rate": 0.0001828940928916772, + "loss": 0.8523, + "step": 267 + }, + { + "epoch": 0.2144, + "grad_norm": 1.0457328345364145, + "learning_rate": 0.00018274883055477436, + "loss": 0.6922, + "step": 268 + }, + { + "epoch": 0.2152, + "grad_norm": 1.4036703444178804, + "learning_rate": 0.00018260301224176558, + "loss": 0.8643, + "step": 269 + }, + { + "epoch": 0.216, + "grad_norm": 0.9554017750179316, + "learning_rate": 0.00018245663893238075, + "loss": 0.697, + "step": 270 + }, + { + "epoch": 0.2168, + "grad_norm": 0.9682042596566531, + "learning_rate": 0.00018230971161007853, + "loss": 0.7175, + "step": 271 + }, + { + "epoch": 0.2176, + "grad_norm": 0.9415174317375208, + "learning_rate": 0.00018216223126204007, + "loss": 0.6513, + "step": 272 + }, + { + "epoch": 0.2184, + "grad_norm": 1.2727514684350927, + "learning_rate": 0.00018201419887916214, + "loss": 0.8116, + "step": 273 + }, + { + "epoch": 0.2192, + "grad_norm": 0.9921199460736623, + "learning_rate": 0.00018186561545605054, + "loss": 0.66, + "step": 274 + }, + { + "epoch": 0.22, + "grad_norm": 1.0137544087850203, + "learning_rate": 0.00018171648199101346, + "loss": 0.7267, + "step": 275 + }, + { + "epoch": 0.2208, + "grad_norm": 0.974035983194552, + "learning_rate": 0.00018156679948605467, + "loss": 0.6646, + "step": 276 + }, + { + "epoch": 0.2216, + "grad_norm": 1.088398496009498, + "learning_rate": 0.00018141656894686689, + "loss": 0.7733, + "step": 277 + }, + { + "epoch": 0.2224, + "grad_norm": 0.9967354379387562, + "learning_rate": 0.00018126579138282503, + "loss": 0.6388, + "step": 278 + }, + { + "epoch": 0.2232, + "grad_norm": 0.9526052437876638, + "learning_rate": 0.00018111446780697929, + "loss": 0.6986, + "step": 279 + }, + { + "epoch": 0.224, + "grad_norm": 1.0289683091704833, + "learning_rate": 0.0001809625992360485, + "loss": 0.7501, + "step": 280 + }, + { + "epoch": 0.2248, + "grad_norm": 0.9462458656763387, + "learning_rate": 0.00018081018669041324, + "loss": 0.7066, + "step": 281 + }, + { + "epoch": 0.2256, + "grad_norm": 1.0437208398768452, + "learning_rate": 0.00018065723119410884, + "loss": 0.7938, + "step": 282 + }, + { + "epoch": 0.2264, + "grad_norm": 1.0343520846176977, + "learning_rate": 0.00018050373377481878, + "loss": 0.7036, + "step": 283 + }, + { + "epoch": 0.2272, + "grad_norm": 0.9577644708974145, + "learning_rate": 0.00018034969546386757, + "loss": 0.6413, + "step": 284 + }, + { + "epoch": 0.228, + "grad_norm": 1.1006775477218305, + "learning_rate": 0.0001801951172962139, + "loss": 0.8025, + "step": 285 + }, + { + "epoch": 0.2288, + "grad_norm": 1.0151788021543564, + "learning_rate": 0.0001800400003104436, + "loss": 0.7577, + "step": 286 + }, + { + "epoch": 0.2296, + "grad_norm": 0.9937451275659185, + "learning_rate": 0.0001798843455487629, + "loss": 0.7137, + "step": 287 + }, + { + "epoch": 0.2304, + "grad_norm": 1.0350026345426953, + "learning_rate": 0.00017972815405699103, + "loss": 0.7823, + "step": 288 + }, + { + "epoch": 0.2312, + "grad_norm": 0.9108555752247096, + "learning_rate": 0.00017957142688455362, + "loss": 0.735, + "step": 289 + }, + { + "epoch": 0.232, + "grad_norm": 1.0668111431297043, + "learning_rate": 0.00017941416508447536, + "loss": 0.7714, + "step": 290 + }, + { + "epoch": 0.2328, + "grad_norm": 0.912021860911979, + "learning_rate": 0.00017925636971337304, + "loss": 0.6831, + "step": 291 + }, + { + "epoch": 0.2336, + "grad_norm": 0.9905284496676833, + "learning_rate": 0.0001790980418314484, + "loss": 0.7561, + "step": 292 + }, + { + "epoch": 0.2344, + "grad_norm": 1.1232141052409121, + "learning_rate": 0.00017893918250248104, + "loss": 0.77, + "step": 293 + }, + { + "epoch": 0.2352, + "grad_norm": 0.9931298258308252, + "learning_rate": 0.00017877979279382135, + "loss": 0.7114, + "step": 294 + }, + { + "epoch": 0.236, + "grad_norm": 1.1089171732957042, + "learning_rate": 0.00017861987377638312, + "loss": 0.7296, + "step": 295 + }, + { + "epoch": 0.2368, + "grad_norm": 1.1286852814468344, + "learning_rate": 0.0001784594265246366, + "loss": 0.7465, + "step": 296 + }, + { + "epoch": 0.2376, + "grad_norm": 1.1190139468756013, + "learning_rate": 0.0001782984521166011, + "loss": 0.6236, + "step": 297 + }, + { + "epoch": 0.2384, + "grad_norm": 1.1483830524013459, + "learning_rate": 0.0001781369516338378, + "loss": 0.7324, + "step": 298 + }, + { + "epoch": 0.2392, + "grad_norm": 1.2212551541061174, + "learning_rate": 0.00017797492616144256, + "loss": 0.8115, + "step": 299 + }, + { + "epoch": 0.24, + "grad_norm": 1.090068717757821, + "learning_rate": 0.00017781237678803847, + "loss": 0.7191, + "step": 300 + }, + { + "epoch": 0.2408, + "grad_norm": 1.1085791446066866, + "learning_rate": 0.00017764930460576866, + "loss": 0.8004, + "step": 301 + }, + { + "epoch": 0.2416, + "grad_norm": 1.0113359039302352, + "learning_rate": 0.000177485710710289, + "loss": 0.6248, + "step": 302 + }, + { + "epoch": 0.2424, + "grad_norm": 1.0964367246355113, + "learning_rate": 0.00017732159620076053, + "loss": 0.7408, + "step": 303 + }, + { + "epoch": 0.2432, + "grad_norm": 1.0301462814934836, + "learning_rate": 0.00017715696217984235, + "loss": 0.7247, + "step": 304 + }, + { + "epoch": 0.244, + "grad_norm": 1.0739566603530641, + "learning_rate": 0.00017699180975368396, + "loss": 0.7033, + "step": 305 + }, + { + "epoch": 0.2448, + "grad_norm": 0.8807320293892692, + "learning_rate": 0.00017682614003191807, + "loss": 0.6746, + "step": 306 + }, + { + "epoch": 0.2456, + "grad_norm": 0.9793213144526057, + "learning_rate": 0.00017665995412765285, + "loss": 0.7779, + "step": 307 + }, + { + "epoch": 0.2464, + "grad_norm": 0.9415126280388529, + "learning_rate": 0.00017649325315746478, + "loss": 0.6607, + "step": 308 + }, + { + "epoch": 0.2472, + "grad_norm": 0.9909061915628322, + "learning_rate": 0.00017632603824139085, + "loss": 0.714, + "step": 309 + }, + { + "epoch": 0.248, + "grad_norm": 1.00513563363099, + "learning_rate": 0.0001761583105029213, + "loss": 0.756, + "step": 310 + }, + { + "epoch": 0.2488, + "grad_norm": 1.1245792806635877, + "learning_rate": 0.0001759900710689918, + "loss": 0.7553, + "step": 311 + }, + { + "epoch": 0.2496, + "grad_norm": 1.1950916131345375, + "learning_rate": 0.00017582132106997616, + "loss": 0.8932, + "step": 312 + }, + { + "epoch": 0.2504, + "grad_norm": 0.9932996847458989, + "learning_rate": 0.00017565206163967846, + "loss": 0.6877, + "step": 313 + }, + { + "epoch": 0.2512, + "grad_norm": 1.248713647763005, + "learning_rate": 0.00017548229391532572, + "loss": 0.9275, + "step": 314 + }, + { + "epoch": 0.252, + "grad_norm": 1.2215744887666666, + "learning_rate": 0.00017531201903755994, + "loss": 0.8051, + "step": 315 + }, + { + "epoch": 0.2528, + "grad_norm": 1.0512964840297354, + "learning_rate": 0.00017514123815043074, + "loss": 0.6843, + "step": 316 + }, + { + "epoch": 0.2536, + "grad_norm": 0.9928744107754482, + "learning_rate": 0.00017496995240138744, + "loss": 0.6953, + "step": 317 + }, + { + "epoch": 0.2544, + "grad_norm": 1.105947283350789, + "learning_rate": 0.00017479816294127152, + "loss": 0.7055, + "step": 318 + }, + { + "epoch": 0.2552, + "grad_norm": 0.9883961136937218, + "learning_rate": 0.00017462587092430875, + "loss": 0.6444, + "step": 319 + }, + { + "epoch": 0.256, + "grad_norm": 1.0207447755812524, + "learning_rate": 0.0001744530775081015, + "loss": 0.7367, + "step": 320 + }, + { + "epoch": 0.2568, + "grad_norm": 1.0094331506812593, + "learning_rate": 0.00017427978385362112, + "loss": 0.8237, + "step": 321 + }, + { + "epoch": 0.2576, + "grad_norm": 1.0138108530230707, + "learning_rate": 0.0001741059911251997, + "loss": 0.703, + "step": 322 + }, + { + "epoch": 0.2584, + "grad_norm": 0.928129814825029, + "learning_rate": 0.0001739317004905227, + "loss": 0.6776, + "step": 323 + }, + { + "epoch": 0.2592, + "grad_norm": 1.0288030369711199, + "learning_rate": 0.000173756913120621, + "loss": 0.7732, + "step": 324 + }, + { + "epoch": 0.26, + "grad_norm": 1.0640848613747005, + "learning_rate": 0.00017358163018986282, + "loss": 0.777, + "step": 325 + }, + { + "epoch": 0.2608, + "grad_norm": 1.0528999133328119, + "learning_rate": 0.00017340585287594604, + "loss": 0.8037, + "step": 326 + }, + { + "epoch": 0.2616, + "grad_norm": 0.9653114458714556, + "learning_rate": 0.00017322958235989016, + "loss": 0.6779, + "step": 327 + }, + { + "epoch": 0.2624, + "grad_norm": 1.0262546590838455, + "learning_rate": 0.0001730528198260285, + "loss": 0.7933, + "step": 328 + }, + { + "epoch": 0.2632, + "grad_norm": 0.9164158735787065, + "learning_rate": 0.00017287556646200018, + "loss": 0.6451, + "step": 329 + }, + { + "epoch": 0.264, + "grad_norm": 1.066701177172031, + "learning_rate": 0.00017269782345874203, + "loss": 0.7566, + "step": 330 + }, + { + "epoch": 0.2648, + "grad_norm": 0.9439582191068443, + "learning_rate": 0.00017251959201048083, + "loss": 0.6703, + "step": 331 + }, + { + "epoch": 0.2656, + "grad_norm": 0.9459146283947549, + "learning_rate": 0.00017234087331472497, + "loss": 0.6672, + "step": 332 + }, + { + "epoch": 0.2664, + "grad_norm": 0.9981667462424716, + "learning_rate": 0.00017216166857225674, + "loss": 0.6923, + "step": 333 + }, + { + "epoch": 0.2672, + "grad_norm": 1.0875381794581624, + "learning_rate": 0.00017198197898712404, + "loss": 0.7673, + "step": 334 + }, + { + "epoch": 0.268, + "grad_norm": 0.8762075795160603, + "learning_rate": 0.00017180180576663228, + "loss": 0.5637, + "step": 335 + }, + { + "epoch": 0.2688, + "grad_norm": 1.0604507702776595, + "learning_rate": 0.00017162115012133643, + "loss": 0.7326, + "step": 336 + }, + { + "epoch": 0.2696, + "grad_norm": 0.9143594094269025, + "learning_rate": 0.00017144001326503273, + "loss": 0.6037, + "step": 337 + }, + { + "epoch": 0.2704, + "grad_norm": 1.0636858738692445, + "learning_rate": 0.00017125839641475072, + "loss": 0.7581, + "step": 338 + }, + { + "epoch": 0.2712, + "grad_norm": 0.9318846711345694, + "learning_rate": 0.00017107630079074478, + "loss": 0.6136, + "step": 339 + }, + { + "epoch": 0.272, + "grad_norm": 1.0384488393986446, + "learning_rate": 0.00017089372761648616, + "loss": 0.729, + "step": 340 + }, + { + "epoch": 0.2728, + "grad_norm": 1.0477460175382347, + "learning_rate": 0.00017071067811865476, + "loss": 0.8111, + "step": 341 + }, + { + "epoch": 0.2736, + "grad_norm": 1.105830244673777, + "learning_rate": 0.00017052715352713075, + "loss": 0.7597, + "step": 342 + }, + { + "epoch": 0.2744, + "grad_norm": 0.9834120376978399, + "learning_rate": 0.00017034315507498635, + "loss": 0.6362, + "step": 343 + }, + { + "epoch": 0.2752, + "grad_norm": 1.098162845193706, + "learning_rate": 0.00017015868399847768, + "loss": 0.725, + "step": 344 + }, + { + "epoch": 0.276, + "grad_norm": 1.012436854456873, + "learning_rate": 0.00016997374153703625, + "loss": 0.7046, + "step": 345 + }, + { + "epoch": 0.2768, + "grad_norm": 0.9760021946383537, + "learning_rate": 0.00016978832893326074, + "loss": 0.6925, + "step": 346 + }, + { + "epoch": 0.2776, + "grad_norm": 0.9382328441194895, + "learning_rate": 0.00016960244743290868, + "loss": 0.6759, + "step": 347 + }, + { + "epoch": 0.2784, + "grad_norm": 0.9760909397548315, + "learning_rate": 0.00016941609828488807, + "loss": 0.7327, + "step": 348 + }, + { + "epoch": 0.2792, + "grad_norm": 1.1133709538310124, + "learning_rate": 0.00016922928274124886, + "loss": 0.697, + "step": 349 + }, + { + "epoch": 0.28, + "grad_norm": 0.933158293206893, + "learning_rate": 0.0001690420020571747, + "loss": 0.6591, + "step": 350 + }, + { + "epoch": 0.2808, + "grad_norm": 0.8610989939996339, + "learning_rate": 0.00016885425749097444, + "loss": 0.5716, + "step": 351 + }, + { + "epoch": 0.2816, + "grad_norm": 0.9861142706518471, + "learning_rate": 0.0001686660503040737, + "loss": 0.6708, + "step": 352 + }, + { + "epoch": 0.2824, + "grad_norm": 1.0257131017794683, + "learning_rate": 0.00016847738176100632, + "loss": 0.7431, + "step": 353 + }, + { + "epoch": 0.2832, + "grad_norm": 1.0198752411317464, + "learning_rate": 0.00016828825312940592, + "loss": 0.7411, + "step": 354 + }, + { + "epoch": 0.284, + "grad_norm": 1.0409890244815698, + "learning_rate": 0.0001680986656799975, + "loss": 0.7328, + "step": 355 + }, + { + "epoch": 0.2848, + "grad_norm": 0.988010259209258, + "learning_rate": 0.0001679086206865886, + "loss": 0.6229, + "step": 356 + }, + { + "epoch": 0.2856, + "grad_norm": 1.1042524286135829, + "learning_rate": 0.00016771811942606108, + "loss": 0.6528, + "step": 357 + }, + { + "epoch": 0.2864, + "grad_norm": 1.167309119109961, + "learning_rate": 0.00016752716317836229, + "loss": 0.6666, + "step": 358 + }, + { + "epoch": 0.2872, + "grad_norm": 1.3041398533142197, + "learning_rate": 0.00016733575322649657, + "loss": 0.6798, + "step": 359 + }, + { + "epoch": 0.288, + "grad_norm": 1.0865400113033143, + "learning_rate": 0.0001671438908565167, + "loss": 0.7536, + "step": 360 + }, + { + "epoch": 0.2888, + "grad_norm": 0.8555479713966215, + "learning_rate": 0.00016695157735751513, + "loss": 0.5758, + "step": 361 + }, + { + "epoch": 0.2896, + "grad_norm": 1.0568143499880747, + "learning_rate": 0.00016675881402161536, + "loss": 0.7081, + "step": 362 + }, + { + "epoch": 0.2904, + "grad_norm": 1.0162822170661614, + "learning_rate": 0.0001665656021439633, + "loss": 0.6428, + "step": 363 + }, + { + "epoch": 0.2912, + "grad_norm": 1.0359036973974367, + "learning_rate": 0.0001663719430227186, + "loss": 0.7316, + "step": 364 + }, + { + "epoch": 0.292, + "grad_norm": 0.9560884895362259, + "learning_rate": 0.00016617783795904565, + "loss": 0.653, + "step": 365 + }, + { + "epoch": 0.2928, + "grad_norm": 1.1544322689744408, + "learning_rate": 0.00016598328825710533, + "loss": 0.7334, + "step": 366 + }, + { + "epoch": 0.2936, + "grad_norm": 0.953809912936542, + "learning_rate": 0.00016578829522404583, + "loss": 0.6703, + "step": 367 + }, + { + "epoch": 0.2944, + "grad_norm": 1.0444013863707968, + "learning_rate": 0.000165592860169994, + "loss": 0.6968, + "step": 368 + }, + { + "epoch": 0.2952, + "grad_norm": 0.9447882428261958, + "learning_rate": 0.00016539698440804661, + "loss": 0.68, + "step": 369 + }, + { + "epoch": 0.296, + "grad_norm": 1.0831435406919236, + "learning_rate": 0.00016520066925426144, + "loss": 0.6961, + "step": 370 + }, + { + "epoch": 0.2968, + "grad_norm": 1.1363438282806402, + "learning_rate": 0.0001650039160276485, + "loss": 0.7148, + "step": 371 + }, + { + "epoch": 0.2976, + "grad_norm": 1.0696012560299293, + "learning_rate": 0.0001648067260501611, + "loss": 0.7761, + "step": 372 + }, + { + "epoch": 0.2984, + "grad_norm": 0.9333342558350548, + "learning_rate": 0.0001646091006466871, + "loss": 0.6054, + "step": 373 + }, + { + "epoch": 0.2992, + "grad_norm": 0.9033082485068732, + "learning_rate": 0.0001644110411450398, + "loss": 0.6315, + "step": 374 + }, + { + "epoch": 0.3, + "grad_norm": 1.0930337368883383, + "learning_rate": 0.00016421254887594917, + "loss": 0.7495, + "step": 375 + }, + { + "epoch": 0.3008, + "grad_norm": 1.081838167819764, + "learning_rate": 0.00016401362517305296, + "loss": 0.7293, + "step": 376 + }, + { + "epoch": 0.3016, + "grad_norm": 0.9923722852152731, + "learning_rate": 0.00016381427137288754, + "loss": 0.6889, + "step": 377 + }, + { + "epoch": 0.3024, + "grad_norm": 0.9196648489839756, + "learning_rate": 0.00016361448881487914, + "loss": 0.6371, + "step": 378 + }, + { + "epoch": 0.3032, + "grad_norm": 1.2048809375107268, + "learning_rate": 0.0001634142788413346, + "loss": 0.8533, + "step": 379 + }, + { + "epoch": 0.304, + "grad_norm": 0.9673963449218785, + "learning_rate": 0.00016321364279743266, + "loss": 0.6977, + "step": 380 + }, + { + "epoch": 0.3048, + "grad_norm": 0.9703411130239993, + "learning_rate": 0.00016301258203121462, + "loss": 0.7172, + "step": 381 + }, + { + "epoch": 0.3056, + "grad_norm": 0.9373760822707912, + "learning_rate": 0.0001628110978935756, + "loss": 0.7073, + "step": 382 + }, + { + "epoch": 0.3064, + "grad_norm": 0.994147007240893, + "learning_rate": 0.00016260919173825508, + "loss": 0.7076, + "step": 383 + }, + { + "epoch": 0.3072, + "grad_norm": 1.0746978834189227, + "learning_rate": 0.00016240686492182804, + "loss": 0.7495, + "step": 384 + }, + { + "epoch": 0.308, + "grad_norm": 1.052331627756332, + "learning_rate": 0.00016220411880369601, + "loss": 0.6775, + "step": 385 + }, + { + "epoch": 0.3088, + "grad_norm": 1.0672546588497585, + "learning_rate": 0.00016200095474607753, + "loss": 0.7034, + "step": 386 + }, + { + "epoch": 0.3096, + "grad_norm": 1.215223998177791, + "learning_rate": 0.00016179737411399926, + "loss": 0.7603, + "step": 387 + }, + { + "epoch": 0.3104, + "grad_norm": 0.9440701837765714, + "learning_rate": 0.00016159337827528685, + "loss": 0.6243, + "step": 388 + }, + { + "epoch": 0.3112, + "grad_norm": 0.9438729501972922, + "learning_rate": 0.00016138896860055555, + "loss": 0.6566, + "step": 389 + }, + { + "epoch": 0.312, + "grad_norm": 1.0654804345994373, + "learning_rate": 0.0001611841464632011, + "loss": 0.7174, + "step": 390 + }, + { + "epoch": 0.3128, + "grad_norm": 1.1763623753804215, + "learning_rate": 0.00016097891323939062, + "loss": 0.7637, + "step": 391 + }, + { + "epoch": 0.3136, + "grad_norm": 1.0741594802453314, + "learning_rate": 0.0001607732703080532, + "loss": 0.7241, + "step": 392 + }, + { + "epoch": 0.3144, + "grad_norm": 0.9675989080151941, + "learning_rate": 0.00016056721905087056, + "loss": 0.7439, + "step": 393 + }, + { + "epoch": 0.3152, + "grad_norm": 0.9636647510459591, + "learning_rate": 0.00016036076085226814, + "loss": 0.6479, + "step": 394 + }, + { + "epoch": 0.316, + "grad_norm": 1.0087841611490806, + "learning_rate": 0.00016015389709940538, + "loss": 0.7383, + "step": 395 + }, + { + "epoch": 0.3168, + "grad_norm": 0.9818867802966742, + "learning_rate": 0.0001599466291821666, + "loss": 0.6549, + "step": 396 + }, + { + "epoch": 0.3176, + "grad_norm": 1.0355078040973305, + "learning_rate": 0.0001597389584931517, + "loss": 0.7541, + "step": 397 + }, + { + "epoch": 0.3184, + "grad_norm": 0.9802743835120142, + "learning_rate": 0.0001595308864276666, + "loss": 0.6655, + "step": 398 + }, + { + "epoch": 0.3192, + "grad_norm": 0.971194464938782, + "learning_rate": 0.0001593224143837142, + "loss": 0.7107, + "step": 399 + }, + { + "epoch": 0.32, + "grad_norm": 1.1220475607440976, + "learning_rate": 0.0001591135437619847, + "loss": 0.6531, + "step": 400 + }, + { + "epoch": 0.3208, + "grad_norm": 1.0605304536118139, + "learning_rate": 0.00015890427596584617, + "loss": 0.6269, + "step": 401 + }, + { + "epoch": 0.3216, + "grad_norm": 1.2204345511388361, + "learning_rate": 0.0001586946124013354, + "loss": 0.7026, + "step": 402 + }, + { + "epoch": 0.3224, + "grad_norm": 0.9738217416134146, + "learning_rate": 0.00015848455447714822, + "loss": 0.5849, + "step": 403 + }, + { + "epoch": 0.3232, + "grad_norm": 0.9441774928753534, + "learning_rate": 0.0001582741036046301, + "loss": 0.5969, + "step": 404 + }, + { + "epoch": 0.324, + "grad_norm": 1.12389339746799, + "learning_rate": 0.00015806326119776663, + "loss": 0.7606, + "step": 405 + }, + { + "epoch": 0.3248, + "grad_norm": 0.9304351293542036, + "learning_rate": 0.00015785202867317407, + "loss": 0.639, + "step": 406 + }, + { + "epoch": 0.3256, + "grad_norm": 0.9565046885589732, + "learning_rate": 0.00015764040745008988, + "loss": 0.6884, + "step": 407 + }, + { + "epoch": 0.3264, + "grad_norm": 1.205634227332287, + "learning_rate": 0.00015742839895036305, + "loss": 0.8178, + "step": 408 + }, + { + "epoch": 0.3272, + "grad_norm": 1.0188207226357398, + "learning_rate": 0.00015721600459844468, + "loss": 0.7172, + "step": 409 + }, + { + "epoch": 0.328, + "grad_norm": 1.0540021733325038, + "learning_rate": 0.00015700322582137827, + "loss": 0.6787, + "step": 410 + }, + { + "epoch": 0.3288, + "grad_norm": 0.9948112159703907, + "learning_rate": 0.00015679006404879033, + "loss": 0.6498, + "step": 411 + }, + { + "epoch": 0.3296, + "grad_norm": 0.8836323531769209, + "learning_rate": 0.0001565765207128805, + "loss": 0.6339, + "step": 412 + }, + { + "epoch": 0.3304, + "grad_norm": 0.9576548443778085, + "learning_rate": 0.00015636259724841222, + "loss": 0.6895, + "step": 413 + }, + { + "epoch": 0.3312, + "grad_norm": 1.1102043397230095, + "learning_rate": 0.0001561482950927029, + "loss": 0.7384, + "step": 414 + }, + { + "epoch": 0.332, + "grad_norm": 1.1351565388435783, + "learning_rate": 0.00015593361568561428, + "loss": 0.7097, + "step": 415 + }, + { + "epoch": 0.3328, + "grad_norm": 1.146994910537703, + "learning_rate": 0.00015571856046954285, + "loss": 0.7317, + "step": 416 + }, + { + "epoch": 0.3336, + "grad_norm": 0.9918991649557216, + "learning_rate": 0.0001555031308894101, + "loss": 0.6544, + "step": 417 + }, + { + "epoch": 0.3344, + "grad_norm": 1.198648658812263, + "learning_rate": 0.00015528732839265272, + "loss": 0.8527, + "step": 418 + }, + { + "epoch": 0.3352, + "grad_norm": 0.9626362344342084, + "learning_rate": 0.0001550711544292131, + "loss": 0.6238, + "step": 419 + }, + { + "epoch": 0.336, + "grad_norm": 0.9499452817998587, + "learning_rate": 0.0001548546104515294, + "loss": 0.6527, + "step": 420 + }, + { + "epoch": 0.3368, + "grad_norm": 1.0658101866983538, + "learning_rate": 0.00015463769791452574, + "loss": 0.8236, + "step": 421 + }, + { + "epoch": 0.3376, + "grad_norm": 0.8084895765521102, + "learning_rate": 0.00015442041827560274, + "loss": 0.5902, + "step": 422 + }, + { + "epoch": 0.3384, + "grad_norm": 0.9690490833614454, + "learning_rate": 0.00015420277299462736, + "loss": 0.7191, + "step": 423 + }, + { + "epoch": 0.3392, + "grad_norm": 1.008697111330277, + "learning_rate": 0.00015398476353392323, + "loss": 0.7917, + "step": 424 + }, + { + "epoch": 0.34, + "grad_norm": 0.9646953454012074, + "learning_rate": 0.00015376639135826107, + "loss": 0.6793, + "step": 425 + }, + { + "epoch": 0.3408, + "grad_norm": 0.9653443273926454, + "learning_rate": 0.00015354765793484834, + "loss": 0.6591, + "step": 426 + }, + { + "epoch": 0.3416, + "grad_norm": 0.9179052874207917, + "learning_rate": 0.00015332856473331978, + "loss": 0.6613, + "step": 427 + }, + { + "epoch": 0.3424, + "grad_norm": 1.0562144373472704, + "learning_rate": 0.00015310911322572753, + "loss": 0.6906, + "step": 428 + }, + { + "epoch": 0.3432, + "grad_norm": 1.0950071511884267, + "learning_rate": 0.00015288930488653094, + "loss": 0.7611, + "step": 429 + }, + { + "epoch": 0.344, + "grad_norm": 1.0185041599966833, + "learning_rate": 0.000152669141192587, + "loss": 0.7104, + "step": 430 + }, + { + "epoch": 0.3448, + "grad_norm": 1.0044446718738724, + "learning_rate": 0.0001524486236231402, + "loss": 0.6764, + "step": 431 + }, + { + "epoch": 0.3456, + "grad_norm": 0.9120289651783, + "learning_rate": 0.00015222775365981273, + "loss": 0.6452, + "step": 432 + }, + { + "epoch": 0.3464, + "grad_norm": 0.9606647124420223, + "learning_rate": 0.00015200653278659432, + "loss": 0.671, + "step": 433 + }, + { + "epoch": 0.3472, + "grad_norm": 1.0245320681964516, + "learning_rate": 0.00015178496248983254, + "loss": 0.654, + "step": 434 + }, + { + "epoch": 0.348, + "grad_norm": 1.6576495465222114, + "learning_rate": 0.00015156304425822267, + "loss": 0.847, + "step": 435 + }, + { + "epoch": 0.3488, + "grad_norm": 0.9865908610386174, + "learning_rate": 0.00015134077958279765, + "loss": 0.6993, + "step": 436 + }, + { + "epoch": 0.3496, + "grad_norm": 1.1295837213180702, + "learning_rate": 0.00015111816995691809, + "loss": 0.7228, + "step": 437 + }, + { + "epoch": 0.3504, + "grad_norm": 0.948216062157724, + "learning_rate": 0.00015089521687626243, + "loss": 0.6986, + "step": 438 + }, + { + "epoch": 0.3512, + "grad_norm": 1.0487720133192266, + "learning_rate": 0.00015067192183881658, + "loss": 0.6659, + "step": 439 + }, + { + "epoch": 0.352, + "grad_norm": 1.0532693653317589, + "learning_rate": 0.000150448286344864, + "loss": 0.6925, + "step": 440 + }, + { + "epoch": 0.3528, + "grad_norm": 1.1645235432270866, + "learning_rate": 0.00015022431189697568, + "loss": 0.7907, + "step": 441 + }, + { + "epoch": 0.3536, + "grad_norm": 1.0657274676274178, + "learning_rate": 0.00015000000000000001, + "loss": 0.7278, + "step": 442 + }, + { + "epoch": 0.3544, + "grad_norm": 0.9943155432977927, + "learning_rate": 0.0001497753521610526, + "loss": 0.6269, + "step": 443 + }, + { + "epoch": 0.3552, + "grad_norm": 1.1107776604547468, + "learning_rate": 0.00014955036988950618, + "loss": 0.7161, + "step": 444 + }, + { + "epoch": 0.356, + "grad_norm": 0.9734125226125735, + "learning_rate": 0.00014932505469698052, + "loss": 0.6816, + "step": 445 + }, + { + "epoch": 0.3568, + "grad_norm": 0.9992083479961862, + "learning_rate": 0.00014909940809733222, + "loss": 0.6643, + "step": 446 + }, + { + "epoch": 0.3576, + "grad_norm": 1.0558325256562415, + "learning_rate": 0.0001488734316066446, + "loss": 0.7538, + "step": 447 + }, + { + "epoch": 0.3584, + "grad_norm": 1.0821258580451227, + "learning_rate": 0.00014864712674321734, + "loss": 0.7827, + "step": 448 + }, + { + "epoch": 0.3592, + "grad_norm": 1.0243746286607833, + "learning_rate": 0.0001484204950275565, + "loss": 0.7297, + "step": 449 + }, + { + "epoch": 0.36, + "grad_norm": 0.9717240585903687, + "learning_rate": 0.00014819353798236427, + "loss": 0.624, + "step": 450 + }, + { + "epoch": 0.3608, + "grad_norm": 1.0315755463193517, + "learning_rate": 0.00014796625713252848, + "loss": 0.72, + "step": 451 + }, + { + "epoch": 0.3616, + "grad_norm": 0.9457788209819127, + "learning_rate": 0.00014773865400511272, + "loss": 0.6104, + "step": 452 + }, + { + "epoch": 0.3624, + "grad_norm": 0.8622451847398616, + "learning_rate": 0.00014751073012934587, + "loss": 0.5773, + "step": 453 + }, + { + "epoch": 0.3632, + "grad_norm": 0.9170339040279909, + "learning_rate": 0.00014728248703661182, + "loss": 0.6697, + "step": 454 + }, + { + "epoch": 0.364, + "grad_norm": 0.9309659564189371, + "learning_rate": 0.0001470539262604393, + "loss": 0.6061, + "step": 455 + }, + { + "epoch": 0.3648, + "grad_norm": 1.0139811893861588, + "learning_rate": 0.00014682504933649144, + "loss": 0.6063, + "step": 456 + }, + { + "epoch": 0.3656, + "grad_norm": 0.9565559184221922, + "learning_rate": 0.00014659585780255556, + "loss": 0.5972, + "step": 457 + }, + { + "epoch": 0.3664, + "grad_norm": 1.5548048389718108, + "learning_rate": 0.00014636635319853275, + "loss": 0.5764, + "step": 458 + }, + { + "epoch": 0.3672, + "grad_norm": 1.0799981251001614, + "learning_rate": 0.0001461365370664276, + "loss": 0.6595, + "step": 459 + }, + { + "epoch": 0.368, + "grad_norm": 1.049770262215874, + "learning_rate": 0.00014590641095033787, + "loss": 0.7363, + "step": 460 + }, + { + "epoch": 0.3688, + "grad_norm": 1.0325936012889165, + "learning_rate": 0.00014567597639644387, + "loss": 0.6885, + "step": 461 + }, + { + "epoch": 0.3696, + "grad_norm": 0.8625772450786451, + "learning_rate": 0.00014544523495299842, + "loss": 0.6287, + "step": 462 + }, + { + "epoch": 0.3704, + "grad_norm": 0.9112822988052758, + "learning_rate": 0.00014521418817031628, + "loss": 0.5691, + "step": 463 + }, + { + "epoch": 0.3712, + "grad_norm": 1.0632388018056593, + "learning_rate": 0.0001449828376007636, + "loss": 0.6636, + "step": 464 + }, + { + "epoch": 0.372, + "grad_norm": 1.029535711215482, + "learning_rate": 0.00014475118479874774, + "loss": 0.6906, + "step": 465 + }, + { + "epoch": 0.3728, + "grad_norm": 0.9591614110692922, + "learning_rate": 0.0001445192313207067, + "loss": 0.6314, + "step": 466 + }, + { + "epoch": 0.3736, + "grad_norm": 0.8949330694021034, + "learning_rate": 0.0001442869787250987, + "loss": 0.5899, + "step": 467 + }, + { + "epoch": 0.3744, + "grad_norm": 0.9153773857789488, + "learning_rate": 0.0001440544285723915, + "loss": 0.6051, + "step": 468 + }, + { + "epoch": 0.3752, + "grad_norm": 1.0849278669728846, + "learning_rate": 0.00014382158242505234, + "loss": 0.7544, + "step": 469 + }, + { + "epoch": 0.376, + "grad_norm": 1.142813464120941, + "learning_rate": 0.00014358844184753712, + "loss": 0.7313, + "step": 470 + }, + { + "epoch": 0.3768, + "grad_norm": 1.001543961930351, + "learning_rate": 0.00014335500840627986, + "loss": 0.6699, + "step": 471 + }, + { + "epoch": 0.3776, + "grad_norm": 1.0425623116876044, + "learning_rate": 0.00014312128366968243, + "loss": 0.5833, + "step": 472 + }, + { + "epoch": 0.3784, + "grad_norm": 1.050040552652407, + "learning_rate": 0.0001428872692081038, + "loss": 0.6643, + "step": 473 + }, + { + "epoch": 0.3792, + "grad_norm": 1.1424081378005362, + "learning_rate": 0.00014265296659384956, + "loss": 0.7225, + "step": 474 + }, + { + "epoch": 0.38, + "grad_norm": 1.1312337168049746, + "learning_rate": 0.00014241837740116132, + "loss": 0.6858, + "step": 475 + }, + { + "epoch": 0.3808, + "grad_norm": 1.0679202158442578, + "learning_rate": 0.00014218350320620624, + "loss": 0.6776, + "step": 476 + }, + { + "epoch": 0.3816, + "grad_norm": 1.067151691225152, + "learning_rate": 0.00014194834558706632, + "loss": 0.6363, + "step": 477 + }, + { + "epoch": 0.3824, + "grad_norm": 0.9431028592814902, + "learning_rate": 0.0001417129061237278, + "loss": 0.6697, + "step": 478 + }, + { + "epoch": 0.3832, + "grad_norm": 0.9549602521283611, + "learning_rate": 0.0001414771863980707, + "loss": 0.7064, + "step": 479 + }, + { + "epoch": 0.384, + "grad_norm": 0.9529789295668323, + "learning_rate": 0.00014124118799385796, + "loss": 0.7105, + "step": 480 + }, + { + "epoch": 0.3848, + "grad_norm": 1.0965618097118262, + "learning_rate": 0.00014100491249672498, + "loss": 0.7246, + "step": 481 + }, + { + "epoch": 0.3856, + "grad_norm": 0.9811034508234098, + "learning_rate": 0.00014076836149416887, + "loss": 0.7697, + "step": 482 + }, + { + "epoch": 0.3864, + "grad_norm": 0.956274166666771, + "learning_rate": 0.0001405315365755379, + "loss": 0.6784, + "step": 483 + }, + { + "epoch": 0.3872, + "grad_norm": 0.9798468248476327, + "learning_rate": 0.0001402944393320206, + "loss": 0.6797, + "step": 484 + }, + { + "epoch": 0.388, + "grad_norm": 1.0375968663501707, + "learning_rate": 0.00014005707135663527, + "loss": 0.6382, + "step": 485 + }, + { + "epoch": 0.3888, + "grad_norm": 0.9662694880416806, + "learning_rate": 0.00013981943424421932, + "loss": 0.6941, + "step": 486 + }, + { + "epoch": 0.3896, + "grad_norm": 1.0410310257756477, + "learning_rate": 0.00013958152959141825, + "loss": 0.701, + "step": 487 + }, + { + "epoch": 0.3904, + "grad_norm": 1.069424086283332, + "learning_rate": 0.00013934335899667527, + "loss": 0.8443, + "step": 488 + }, + { + "epoch": 0.3912, + "grad_norm": 0.973476345898565, + "learning_rate": 0.00013910492406022033, + "loss": 0.644, + "step": 489 + }, + { + "epoch": 0.392, + "grad_norm": 0.9690990691047512, + "learning_rate": 0.00013886622638405952, + "loss": 0.7045, + "step": 490 + }, + { + "epoch": 0.3928, + "grad_norm": 0.878145219403551, + "learning_rate": 0.0001386272675719642, + "loss": 0.6742, + "step": 491 + }, + { + "epoch": 0.3936, + "grad_norm": 0.9063044481911675, + "learning_rate": 0.00013838804922946027, + "loss": 0.641, + "step": 492 + }, + { + "epoch": 0.3944, + "grad_norm": 1.001203019256862, + "learning_rate": 0.00013814857296381728, + "loss": 0.7444, + "step": 493 + }, + { + "epoch": 0.3952, + "grad_norm": 0.9110717667370313, + "learning_rate": 0.00013790884038403795, + "loss": 0.6417, + "step": 494 + }, + { + "epoch": 0.396, + "grad_norm": 0.8802273740541856, + "learning_rate": 0.00013766885310084688, + "loss": 0.6553, + "step": 495 + }, + { + "epoch": 0.3968, + "grad_norm": 1.0995829480405193, + "learning_rate": 0.00013742861272668012, + "loss": 0.769, + "step": 496 + }, + { + "epoch": 0.3976, + "grad_norm": 1.0338274485826533, + "learning_rate": 0.00013718812087567414, + "loss": 0.6832, + "step": 497 + }, + { + "epoch": 0.3984, + "grad_norm": 1.0007253517744736, + "learning_rate": 0.00013694737916365517, + "loss": 0.7447, + "step": 498 + }, + { + "epoch": 0.3992, + "grad_norm": 0.971200354777103, + "learning_rate": 0.000136706389208128, + "loss": 0.6645, + "step": 499 + }, + { + "epoch": 0.4, + "grad_norm": 1.1064379377231044, + "learning_rate": 0.00013646515262826552, + "loss": 0.7071, + "step": 500 + }, + { + "epoch": 0.4008, + "grad_norm": 1.0282297754784526, + "learning_rate": 0.00013622367104489756, + "loss": 0.7565, + "step": 501 + }, + { + "epoch": 0.4016, + "grad_norm": 1.0170495081001916, + "learning_rate": 0.0001359819460805001, + "loss": 0.6728, + "step": 502 + }, + { + "epoch": 0.4024, + "grad_norm": 0.981573108226441, + "learning_rate": 0.0001357399793591844, + "loss": 0.6613, + "step": 503 + }, + { + "epoch": 0.4032, + "grad_norm": 0.9612999332629785, + "learning_rate": 0.0001354977725066859, + "loss": 0.685, + "step": 504 + }, + { + "epoch": 0.404, + "grad_norm": 1.1495625174354223, + "learning_rate": 0.00013525532715035366, + "loss": 0.7439, + "step": 505 + }, + { + "epoch": 0.4048, + "grad_norm": 1.0625147958003944, + "learning_rate": 0.00013501264491913906, + "loss": 0.7942, + "step": 506 + }, + { + "epoch": 0.4056, + "grad_norm": 0.8656809536333867, + "learning_rate": 0.00013476972744358507, + "loss": 0.62, + "step": 507 + }, + { + "epoch": 0.4064, + "grad_norm": 1.0814638376729038, + "learning_rate": 0.0001345265763558152, + "loss": 0.6849, + "step": 508 + }, + { + "epoch": 0.4072, + "grad_norm": 1.0136592426883342, + "learning_rate": 0.00013428319328952253, + "loss": 0.7686, + "step": 509 + }, + { + "epoch": 0.408, + "grad_norm": 1.0653979211052498, + "learning_rate": 0.00013403957987995882, + "loss": 0.7356, + "step": 510 + }, + { + "epoch": 0.4088, + "grad_norm": 1.5524672487774989, + "learning_rate": 0.0001337957377639235, + "loss": 0.7126, + "step": 511 + }, + { + "epoch": 0.4096, + "grad_norm": 1.2407551782976203, + "learning_rate": 0.0001335516685797525, + "loss": 0.7939, + "step": 512 + }, + { + "epoch": 0.4104, + "grad_norm": 0.9613723023205167, + "learning_rate": 0.0001333073739673076, + "loss": 0.6928, + "step": 513 + }, + { + "epoch": 0.4112, + "grad_norm": 1.0449124354890031, + "learning_rate": 0.00013306285556796495, + "loss": 0.7082, + "step": 514 + }, + { + "epoch": 0.412, + "grad_norm": 0.9340571369355848, + "learning_rate": 0.0001328181150246045, + "loss": 0.6169, + "step": 515 + }, + { + "epoch": 0.4128, + "grad_norm": 0.9700227307466612, + "learning_rate": 0.00013257315398159864, + "loss": 0.6793, + "step": 516 + }, + { + "epoch": 0.4136, + "grad_norm": 1.0780444177733026, + "learning_rate": 0.00013232797408480127, + "loss": 0.6701, + "step": 517 + }, + { + "epoch": 0.4144, + "grad_norm": 0.9821818841705071, + "learning_rate": 0.00013208257698153677, + "loss": 0.643, + "step": 518 + }, + { + "epoch": 0.4152, + "grad_norm": 0.9814401515031396, + "learning_rate": 0.00013183696432058888, + "loss": 0.6788, + "step": 519 + }, + { + "epoch": 0.416, + "grad_norm": 1.0926726449940525, + "learning_rate": 0.00013159113775218964, + "loss": 0.7499, + "step": 520 + }, + { + "epoch": 0.4168, + "grad_norm": 0.9902456615356826, + "learning_rate": 0.00013134509892800822, + "loss": 0.5531, + "step": 521 + }, + { + "epoch": 0.4176, + "grad_norm": 0.9813485221150713, + "learning_rate": 0.00013109884950114007, + "loss": 0.6782, + "step": 522 + }, + { + "epoch": 0.4184, + "grad_norm": 1.0354255517380966, + "learning_rate": 0.00013085239112609547, + "loss": 0.7147, + "step": 523 + }, + { + "epoch": 0.4192, + "grad_norm": 1.0282080376059664, + "learning_rate": 0.00013060572545878875, + "loss": 0.7793, + "step": 524 + }, + { + "epoch": 0.42, + "grad_norm": 0.9748715142365796, + "learning_rate": 0.00013035885415652685, + "loss": 0.6833, + "step": 525 + }, + { + "epoch": 0.4208, + "grad_norm": 1.040452534646341, + "learning_rate": 0.00013011177887799845, + "loss": 0.7646, + "step": 526 + }, + { + "epoch": 0.4216, + "grad_norm": 0.9523916277181167, + "learning_rate": 0.00012986450128326266, + "loss": 0.7967, + "step": 527 + }, + { + "epoch": 0.4224, + "grad_norm": 1.0329630631418163, + "learning_rate": 0.00012961702303373795, + "loss": 0.7531, + "step": 528 + }, + { + "epoch": 0.4232, + "grad_norm": 1.1285527698971287, + "learning_rate": 0.00012936934579219094, + "loss": 0.7493, + "step": 529 + }, + { + "epoch": 0.424, + "grad_norm": 0.9190585160356773, + "learning_rate": 0.00012912147122272523, + "loss": 0.6446, + "step": 530 + }, + { + "epoch": 0.4248, + "grad_norm": 0.9082981214968774, + "learning_rate": 0.00012887340099077024, + "loss": 0.5956, + "step": 531 + }, + { + "epoch": 0.4256, + "grad_norm": 0.9444245855283278, + "learning_rate": 0.00012862513676307008, + "loss": 0.654, + "step": 532 + }, + { + "epoch": 0.4264, + "grad_norm": 1.0689970782226674, + "learning_rate": 0.0001283766802076722, + "loss": 0.7062, + "step": 533 + }, + { + "epoch": 0.4272, + "grad_norm": 1.0546696322924753, + "learning_rate": 0.00012812803299391628, + "loss": 0.683, + "step": 534 + }, + { + "epoch": 0.428, + "grad_norm": 0.9971647907901832, + "learning_rate": 0.00012787919679242306, + "loss": 0.6792, + "step": 535 + }, + { + "epoch": 0.4288, + "grad_norm": 0.9892976751183552, + "learning_rate": 0.00012763017327508305, + "loss": 0.6402, + "step": 536 + }, + { + "epoch": 0.4296, + "grad_norm": 0.8936332990317126, + "learning_rate": 0.00012738096411504522, + "loss": 0.6351, + "step": 537 + }, + { + "epoch": 0.4304, + "grad_norm": 0.925522606737151, + "learning_rate": 0.0001271315709867059, + "loss": 0.5876, + "step": 538 + }, + { + "epoch": 0.4312, + "grad_norm": 0.992246902537022, + "learning_rate": 0.00012688199556569753, + "loss": 0.6892, + "step": 539 + }, + { + "epoch": 0.432, + "grad_norm": 1.0045201876739385, + "learning_rate": 0.00012663223952887723, + "loss": 0.6797, + "step": 540 + }, + { + "epoch": 0.4328, + "grad_norm": 0.90811240999659, + "learning_rate": 0.0001263823045543158, + "loss": 0.6378, + "step": 541 + }, + { + "epoch": 0.4336, + "grad_norm": 0.9385694690761545, + "learning_rate": 0.00012613219232128608, + "loss": 0.616, + "step": 542 + }, + { + "epoch": 0.4344, + "grad_norm": 0.7917103910922637, + "learning_rate": 0.00012588190451025207, + "loss": 0.5023, + "step": 543 + }, + { + "epoch": 0.4352, + "grad_norm": 0.870431902242613, + "learning_rate": 0.00012563144280285741, + "loss": 0.6534, + "step": 544 + }, + { + "epoch": 0.436, + "grad_norm": 0.8823717609207367, + "learning_rate": 0.00012538080888191408, + "loss": 0.6739, + "step": 545 + }, + { + "epoch": 0.4368, + "grad_norm": 0.8265835296319374, + "learning_rate": 0.00012513000443139112, + "loss": 0.6082, + "step": 546 + }, + { + "epoch": 0.4376, + "grad_norm": 0.9405103526779204, + "learning_rate": 0.00012487903113640337, + "loss": 0.6384, + "step": 547 + }, + { + "epoch": 0.4384, + "grad_norm": 1.024733649321451, + "learning_rate": 0.00012462789068320017, + "loss": 0.7039, + "step": 548 + }, + { + "epoch": 0.4392, + "grad_norm": 0.9268780097595519, + "learning_rate": 0.00012437658475915377, + "loss": 0.6524, + "step": 549 + }, + { + "epoch": 0.44, + "grad_norm": 0.8781618247231748, + "learning_rate": 0.00012412511505274844, + "loss": 0.5848, + "step": 550 + }, + { + "epoch": 0.4408, + "grad_norm": 0.9290248266795664, + "learning_rate": 0.00012387348325356874, + "loss": 0.712, + "step": 551 + }, + { + "epoch": 0.4416, + "grad_norm": 0.9193603267840847, + "learning_rate": 0.00012362169105228826, + "loss": 0.6238, + "step": 552 + }, + { + "epoch": 0.4424, + "grad_norm": 0.9482222228787937, + "learning_rate": 0.00012336974014065844, + "loss": 0.6282, + "step": 553 + }, + { + "epoch": 0.4432, + "grad_norm": 0.8862451690628104, + "learning_rate": 0.000123117632211497, + "loss": 0.6242, + "step": 554 + }, + { + "epoch": 0.444, + "grad_norm": 1.1310323019857198, + "learning_rate": 0.00012286536895867654, + "loss": 0.6774, + "step": 555 + }, + { + "epoch": 0.4448, + "grad_norm": 1.2015874953589551, + "learning_rate": 0.00012261295207711346, + "loss": 0.798, + "step": 556 + }, + { + "epoch": 0.4456, + "grad_norm": 1.0066583588082008, + "learning_rate": 0.00012236038326275626, + "loss": 0.5747, + "step": 557 + }, + { + "epoch": 0.4464, + "grad_norm": 1.2487930533796985, + "learning_rate": 0.0001221076642125742, + "loss": 0.7472, + "step": 558 + }, + { + "epoch": 0.4472, + "grad_norm": 1.0197015980249613, + "learning_rate": 0.00012185479662454595, + "loss": 0.7325, + "step": 559 + }, + { + "epoch": 0.448, + "grad_norm": 0.958331423601618, + "learning_rate": 0.00012160178219764837, + "loss": 0.6506, + "step": 560 + }, + { + "epoch": 0.4488, + "grad_norm": 1.0849774317040684, + "learning_rate": 0.00012134862263184467, + "loss": 0.7727, + "step": 561 + }, + { + "epoch": 0.4496, + "grad_norm": 0.9955829303680328, + "learning_rate": 0.00012109531962807332, + "loss": 0.6899, + "step": 562 + }, + { + "epoch": 0.4504, + "grad_norm": 1.0325170766691616, + "learning_rate": 0.00012084187488823657, + "loss": 0.7209, + "step": 563 + }, + { + "epoch": 0.4512, + "grad_norm": 0.9962515122153125, + "learning_rate": 0.00012058829011518896, + "loss": 0.6045, + "step": 564 + }, + { + "epoch": 0.452, + "grad_norm": 0.9424554738981582, + "learning_rate": 0.00012033456701272576, + "loss": 0.6261, + "step": 565 + }, + { + "epoch": 0.4528, + "grad_norm": 0.9008228342261408, + "learning_rate": 0.00012008070728557186, + "loss": 0.6281, + "step": 566 + }, + { + "epoch": 0.4536, + "grad_norm": 0.9795315779902153, + "learning_rate": 0.00011982671263936995, + "loss": 0.669, + "step": 567 + }, + { + "epoch": 0.4544, + "grad_norm": 0.9181733443905542, + "learning_rate": 0.00011957258478066931, + "loss": 0.6354, + "step": 568 + }, + { + "epoch": 0.4552, + "grad_norm": 0.9964838418025049, + "learning_rate": 0.00011931832541691418, + "loss": 0.606, + "step": 569 + }, + { + "epoch": 0.456, + "grad_norm": 0.9713730027331944, + "learning_rate": 0.00011906393625643244, + "loss": 0.6361, + "step": 570 + }, + { + "epoch": 0.4568, + "grad_norm": 1.0127955889148383, + "learning_rate": 0.00011880941900842397, + "loss": 0.661, + "step": 571 + }, + { + "epoch": 0.4576, + "grad_norm": 0.8965977593232317, + "learning_rate": 0.00011855477538294935, + "loss": 0.5803, + "step": 572 + }, + { + "epoch": 0.4584, + "grad_norm": 0.9555255268081055, + "learning_rate": 0.00011830000709091815, + "loss": 0.67, + "step": 573 + }, + { + "epoch": 0.4592, + "grad_norm": 1.429931874393292, + "learning_rate": 0.00011804511584407763, + "loss": 0.8253, + "step": 574 + }, + { + "epoch": 0.46, + "grad_norm": 0.9080529225486037, + "learning_rate": 0.0001177901033550012, + "loss": 0.5789, + "step": 575 + }, + { + "epoch": 0.4608, + "grad_norm": 0.9561747406233209, + "learning_rate": 0.00011753497133707679, + "loss": 0.6863, + "step": 576 + }, + { + "epoch": 0.4616, + "grad_norm": 0.8795515505847675, + "learning_rate": 0.00011727972150449544, + "loss": 0.6141, + "step": 577 + }, + { + "epoch": 0.4624, + "grad_norm": 1.1153897316373964, + "learning_rate": 0.00011702435557223987, + "loss": 0.7158, + "step": 578 + }, + { + "epoch": 0.4632, + "grad_norm": 1.0239509952378902, + "learning_rate": 0.00011676887525607271, + "loss": 0.6895, + "step": 579 + }, + { + "epoch": 0.464, + "grad_norm": 1.0255634523001187, + "learning_rate": 0.00011651328227252517, + "loss": 0.7033, + "step": 580 + }, + { + "epoch": 0.4648, + "grad_norm": 1.0779694730191423, + "learning_rate": 0.00011625757833888551, + "loss": 0.7482, + "step": 581 + }, + { + "epoch": 0.4656, + "grad_norm": 0.9033055174636703, + "learning_rate": 0.00011600176517318741, + "loss": 0.5817, + "step": 582 + }, + { + "epoch": 0.4664, + "grad_norm": 0.9234135553682853, + "learning_rate": 0.0001157458444941984, + "loss": 0.6561, + "step": 583 + }, + { + "epoch": 0.4672, + "grad_norm": 1.0930306682285496, + "learning_rate": 0.00011548981802140848, + "loss": 0.7402, + "step": 584 + }, + { + "epoch": 0.468, + "grad_norm": 1.0891783333532241, + "learning_rate": 0.00011523368747501839, + "loss": 0.838, + "step": 585 + }, + { + "epoch": 0.4688, + "grad_norm": 0.9063121785903229, + "learning_rate": 0.00011497745457592816, + "loss": 0.5588, + "step": 586 + }, + { + "epoch": 0.4696, + "grad_norm": 0.8756515092653024, + "learning_rate": 0.00011472112104572547, + "loss": 0.5871, + "step": 587 + }, + { + "epoch": 0.4704, + "grad_norm": 0.9052639166165494, + "learning_rate": 0.00011446468860667421, + "loss": 0.628, + "step": 588 + }, + { + "epoch": 0.4712, + "grad_norm": 0.9403490201939707, + "learning_rate": 0.0001142081589817027, + "loss": 0.6514, + "step": 589 + }, + { + "epoch": 0.472, + "grad_norm": 1.0477290724502009, + "learning_rate": 0.00011395153389439233, + "loss": 0.5836, + "step": 590 + }, + { + "epoch": 0.4728, + "grad_norm": 0.9100473365039066, + "learning_rate": 0.00011369481506896582, + "loss": 0.6018, + "step": 591 + }, + { + "epoch": 0.4736, + "grad_norm": 0.9016002157479855, + "learning_rate": 0.00011343800423027582, + "loss": 0.5607, + "step": 592 + }, + { + "epoch": 0.4744, + "grad_norm": 0.9850404704925205, + "learning_rate": 0.00011318110310379301, + "loss": 0.6702, + "step": 593 + }, + { + "epoch": 0.4752, + "grad_norm": 0.9749236728270172, + "learning_rate": 0.0001129241134155949, + "loss": 0.5877, + "step": 594 + }, + { + "epoch": 0.476, + "grad_norm": 0.9399585491083582, + "learning_rate": 0.00011266703689235394, + "loss": 0.6119, + "step": 595 + }, + { + "epoch": 0.4768, + "grad_norm": 1.0947518739913256, + "learning_rate": 0.00011240987526132594, + "loss": 0.8437, + "step": 596 + }, + { + "epoch": 0.4776, + "grad_norm": 0.8320013335953632, + "learning_rate": 0.00011215263025033869, + "loss": 0.6518, + "step": 597 + }, + { + "epoch": 0.4784, + "grad_norm": 0.9468846118303392, + "learning_rate": 0.00011189530358778005, + "loss": 0.6035, + "step": 598 + }, + { + "epoch": 0.4792, + "grad_norm": 0.9159399662224321, + "learning_rate": 0.00011163789700258655, + "loss": 0.6246, + "step": 599 + }, + { + "epoch": 0.48, + "grad_norm": 0.9676794539813114, + "learning_rate": 0.00011138041222423177, + "loss": 0.5911, + "step": 600 + }, + { + "epoch": 0.4808, + "grad_norm": 0.9252959644110134, + "learning_rate": 0.00011112285098271451, + "loss": 0.6548, + "step": 601 + }, + { + "epoch": 0.4816, + "grad_norm": 0.8685234570929422, + "learning_rate": 0.00011086521500854745, + "loss": 0.5444, + "step": 602 + }, + { + "epoch": 0.4824, + "grad_norm": 0.999888802188266, + "learning_rate": 0.00011060750603274535, + "loss": 0.6613, + "step": 603 + }, + { + "epoch": 0.4832, + "grad_norm": 0.9258630185529607, + "learning_rate": 0.00011034972578681338, + "loss": 0.7428, + "step": 604 + }, + { + "epoch": 0.484, + "grad_norm": 0.8978351873148683, + "learning_rate": 0.00011009187600273566, + "loss": 0.5685, + "step": 605 + }, + { + "epoch": 0.4848, + "grad_norm": 0.8867834326441457, + "learning_rate": 0.00010983395841296348, + "loss": 0.5201, + "step": 606 + }, + { + "epoch": 0.4856, + "grad_norm": 0.8236581805329797, + "learning_rate": 0.00010957597475040373, + "loss": 0.5255, + "step": 607 + }, + { + "epoch": 0.4864, + "grad_norm": 0.9372515761391587, + "learning_rate": 0.00010931792674840718, + "loss": 0.6118, + "step": 608 + }, + { + "epoch": 0.4872, + "grad_norm": 0.8860205977781878, + "learning_rate": 0.00010905981614075693, + "loss": 0.6414, + "step": 609 + }, + { + "epoch": 0.488, + "grad_norm": 1.0121532447470982, + "learning_rate": 0.00010880164466165674, + "loss": 0.7489, + "step": 610 + }, + { + "epoch": 0.4888, + "grad_norm": 0.95363120804436, + "learning_rate": 0.00010854341404571928, + "loss": 0.6106, + "step": 611 + }, + { + "epoch": 0.4896, + "grad_norm": 0.9254678867628731, + "learning_rate": 0.00010828512602795462, + "loss": 0.5573, + "step": 612 + }, + { + "epoch": 0.4904, + "grad_norm": 0.9674567730705677, + "learning_rate": 0.00010802678234375851, + "loss": 0.6123, + "step": 613 + }, + { + "epoch": 0.4912, + "grad_norm": 0.929134356753628, + "learning_rate": 0.00010776838472890065, + "loss": 0.6086, + "step": 614 + }, + { + "epoch": 0.492, + "grad_norm": 1.0062273044589374, + "learning_rate": 0.0001075099349195131, + "loss": 0.7352, + "step": 615 + }, + { + "epoch": 0.4928, + "grad_norm": 0.856612433286552, + "learning_rate": 0.00010725143465207867, + "loss": 0.5335, + "step": 616 + }, + { + "epoch": 0.4936, + "grad_norm": 0.9013543780447304, + "learning_rate": 0.00010699288566341914, + "loss": 0.5392, + "step": 617 + }, + { + "epoch": 0.4944, + "grad_norm": 0.9990058368463038, + "learning_rate": 0.00010673428969068364, + "loss": 0.7409, + "step": 618 + }, + { + "epoch": 0.4952, + "grad_norm": 0.9993198048944072, + "learning_rate": 0.000106475648471337, + "loss": 0.6626, + "step": 619 + }, + { + "epoch": 0.496, + "grad_norm": 0.8561173649083798, + "learning_rate": 0.00010621696374314807, + "loss": 0.6048, + "step": 620 + }, + { + "epoch": 0.4968, + "grad_norm": 0.8483751866663903, + "learning_rate": 0.00010595823724417795, + "loss": 0.6365, + "step": 621 + }, + { + "epoch": 0.4976, + "grad_norm": 0.9456170963528224, + "learning_rate": 0.00010569947071276847, + "loss": 0.5824, + "step": 622 + }, + { + "epoch": 0.4984, + "grad_norm": 0.9046667406999749, + "learning_rate": 0.00010544066588753044, + "loss": 0.5922, + "step": 623 + }, + { + "epoch": 0.4992, + "grad_norm": 0.8750038332233276, + "learning_rate": 0.00010518182450733186, + "loss": 0.567, + "step": 624 + }, + { + "epoch": 0.5, + "grad_norm": 0.8584138388306187, + "learning_rate": 0.00010492294831128641, + "loss": 0.5831, + "step": 625 + }, + { + "epoch": 0.5008, + "grad_norm": 1.0390547317146437, + "learning_rate": 0.00010466403903874176, + "loss": 0.6902, + "step": 626 + }, + { + "epoch": 0.5016, + "grad_norm": 0.9994915776953593, + "learning_rate": 0.00010440509842926767, + "loss": 0.6477, + "step": 627 + }, + { + "epoch": 0.5024, + "grad_norm": 1.0770129680588285, + "learning_rate": 0.00010414612822264455, + "loss": 0.5568, + "step": 628 + }, + { + "epoch": 0.5032, + "grad_norm": 1.0941378840305556, + "learning_rate": 0.00010388713015885161, + "loss": 0.6017, + "step": 629 + }, + { + "epoch": 0.504, + "grad_norm": 1.048632007629162, + "learning_rate": 0.00010362810597805526, + "loss": 0.622, + "step": 630 + }, + { + "epoch": 0.5048, + "grad_norm": 0.9955624104751872, + "learning_rate": 0.00010336905742059742, + "loss": 0.6559, + "step": 631 + }, + { + "epoch": 0.5056, + "grad_norm": 1.0259937315294585, + "learning_rate": 0.0001031099862269837, + "loss": 0.7529, + "step": 632 + }, + { + "epoch": 0.5064, + "grad_norm": 1.0051406872664894, + "learning_rate": 0.0001028508941378719, + "loss": 0.6572, + "step": 633 + }, + { + "epoch": 0.5072, + "grad_norm": 0.8437661707095883, + "learning_rate": 0.00010259178289406011, + "loss": 0.5758, + "step": 634 + }, + { + "epoch": 0.508, + "grad_norm": 0.9068740628413372, + "learning_rate": 0.00010233265423647523, + "loss": 0.5819, + "step": 635 + }, + { + "epoch": 0.5088, + "grad_norm": 1.0071286102184436, + "learning_rate": 0.00010207350990616107, + "loss": 0.6693, + "step": 636 + }, + { + "epoch": 0.5096, + "grad_norm": 0.8846472732604066, + "learning_rate": 0.00010181435164426676, + "loss": 0.536, + "step": 637 + }, + { + "epoch": 0.5104, + "grad_norm": 1.1775969933001442, + "learning_rate": 0.0001015551811920351, + "loss": 0.7716, + "step": 638 + }, + { + "epoch": 0.5112, + "grad_norm": 0.8841109595513913, + "learning_rate": 0.00010129600029079072, + "loss": 0.591, + "step": 639 + }, + { + "epoch": 0.512, + "grad_norm": 0.845641593117227, + "learning_rate": 0.00010103681068192845, + "loss": 0.587, + "step": 640 + }, + { + "epoch": 0.5128, + "grad_norm": 0.9093340586120714, + "learning_rate": 0.00010077761410690172, + "loss": 0.6042, + "step": 641 + }, + { + "epoch": 0.5136, + "grad_norm": 0.9160365891428205, + "learning_rate": 0.00010051841230721065, + "loss": 0.6342, + "step": 642 + }, + { + "epoch": 0.5144, + "grad_norm": 0.8473297542447655, + "learning_rate": 0.00010025920702439051, + "loss": 0.5884, + "step": 643 + }, + { + "epoch": 0.5152, + "grad_norm": 0.8494577589658796, + "learning_rate": 0.0001, + "loss": 0.5613, + "step": 644 + }, + { + "epoch": 0.516, + "grad_norm": 0.8867037444326773, + "learning_rate": 9.97407929756095e-05, + "loss": 0.6235, + "step": 645 + }, + { + "epoch": 0.5168, + "grad_norm": 0.990154130055683, + "learning_rate": 9.948158769278939e-05, + "loss": 0.5907, + "step": 646 + }, + { + "epoch": 0.5176, + "grad_norm": 1.0263388140013465, + "learning_rate": 9.92223858930983e-05, + "loss": 0.6316, + "step": 647 + }, + { + "epoch": 0.5184, + "grad_norm": 1.1271972521050353, + "learning_rate": 9.896318931807155e-05, + "loss": 0.5525, + "step": 648 + }, + { + "epoch": 0.5192, + "grad_norm": 1.0837906964394204, + "learning_rate": 9.870399970920932e-05, + "loss": 0.6587, + "step": 649 + }, + { + "epoch": 0.52, + "grad_norm": 1.0043765323916656, + "learning_rate": 9.844481880796491e-05, + "loss": 0.6208, + "step": 650 + }, + { + "epoch": 0.5208, + "grad_norm": 0.8366245514174349, + "learning_rate": 9.818564835573323e-05, + "loss": 0.4771, + "step": 651 + }, + { + "epoch": 0.5216, + "grad_norm": 0.918646383048879, + "learning_rate": 9.792649009383899e-05, + "loss": 0.5758, + "step": 652 + }, + { + "epoch": 0.5224, + "grad_norm": 0.9064290959922201, + "learning_rate": 9.766734576352478e-05, + "loss": 0.5214, + "step": 653 + }, + { + "epoch": 0.5232, + "grad_norm": 1.004774070235783, + "learning_rate": 9.740821710593989e-05, + "loss": 0.6805, + "step": 654 + }, + { + "epoch": 0.524, + "grad_norm": 0.9282402680467122, + "learning_rate": 9.714910586212816e-05, + "loss": 0.5931, + "step": 655 + }, + { + "epoch": 0.5248, + "grad_norm": 1.0751875949022203, + "learning_rate": 9.689001377301633e-05, + "loss": 0.6883, + "step": 656 + }, + { + "epoch": 0.5256, + "grad_norm": 0.9337831027390505, + "learning_rate": 9.663094257940258e-05, + "loss": 0.5999, + "step": 657 + }, + { + "epoch": 0.5264, + "grad_norm": 0.8738135695779778, + "learning_rate": 9.637189402194476e-05, + "loss": 0.6364, + "step": 658 + }, + { + "epoch": 0.5272, + "grad_norm": 0.8801911032071335, + "learning_rate": 9.611286984114841e-05, + "loss": 0.5952, + "step": 659 + }, + { + "epoch": 0.528, + "grad_norm": 0.9823176815317364, + "learning_rate": 9.585387177735547e-05, + "loss": 0.6753, + "step": 660 + }, + { + "epoch": 0.5288, + "grad_norm": 0.8879311198249467, + "learning_rate": 9.559490157073236e-05, + "loss": 0.5486, + "step": 661 + }, + { + "epoch": 0.5296, + "grad_norm": 0.8535088345660301, + "learning_rate": 9.533596096125825e-05, + "loss": 0.5567, + "step": 662 + }, + { + "epoch": 0.5304, + "grad_norm": 0.9407793728029339, + "learning_rate": 9.507705168871358e-05, + "loss": 0.674, + "step": 663 + }, + { + "epoch": 0.5312, + "grad_norm": 0.916554270033362, + "learning_rate": 9.481817549266817e-05, + "loss": 0.6477, + "step": 664 + }, + { + "epoch": 0.532, + "grad_norm": 0.9613974681796721, + "learning_rate": 9.455933411246958e-05, + "loss": 0.593, + "step": 665 + }, + { + "epoch": 0.5328, + "grad_norm": 1.1380334060060602, + "learning_rate": 9.430052928723153e-05, + "loss": 0.5568, + "step": 666 + }, + { + "epoch": 0.5336, + "grad_norm": 0.9110819234222967, + "learning_rate": 9.404176275582208e-05, + "loss": 0.6018, + "step": 667 + }, + { + "epoch": 0.5344, + "grad_norm": 0.9152424635634194, + "learning_rate": 9.378303625685195e-05, + "loss": 0.6375, + "step": 668 + }, + { + "epoch": 0.5352, + "grad_norm": 0.8918539267738597, + "learning_rate": 9.352435152866298e-05, + "loss": 0.5792, + "step": 669 + }, + { + "epoch": 0.536, + "grad_norm": 0.9025975880738885, + "learning_rate": 9.326571030931637e-05, + "loss": 0.6599, + "step": 670 + }, + { + "epoch": 0.5368, + "grad_norm": 0.840404349020553, + "learning_rate": 9.300711433658087e-05, + "loss": 0.5668, + "step": 671 + }, + { + "epoch": 0.5376, + "grad_norm": 0.9320459313831757, + "learning_rate": 9.274856534792138e-05, + "loss": 0.5902, + "step": 672 + }, + { + "epoch": 0.5384, + "grad_norm": 1.050896801767698, + "learning_rate": 9.249006508048694e-05, + "loss": 0.7424, + "step": 673 + }, + { + "epoch": 0.5392, + "grad_norm": 0.8374864129093207, + "learning_rate": 9.223161527109937e-05, + "loss": 0.4816, + "step": 674 + }, + { + "epoch": 0.54, + "grad_norm": 1.0697389056145188, + "learning_rate": 9.197321765624152e-05, + "loss": 0.6934, + "step": 675 + }, + { + "epoch": 0.5408, + "grad_norm": 0.874226541591921, + "learning_rate": 9.171487397204539e-05, + "loss": 0.5467, + "step": 676 + }, + { + "epoch": 0.5416, + "grad_norm": 0.9217989977358017, + "learning_rate": 9.145658595428074e-05, + "loss": 0.6351, + "step": 677 + }, + { + "epoch": 0.5424, + "grad_norm": 1.107297223197541, + "learning_rate": 9.119835533834331e-05, + "loss": 0.7024, + "step": 678 + }, + { + "epoch": 0.5432, + "grad_norm": 1.0106730056854827, + "learning_rate": 9.09401838592431e-05, + "loss": 0.5998, + "step": 679 + }, + { + "epoch": 0.544, + "grad_norm": 0.8501042546831452, + "learning_rate": 9.068207325159284e-05, + "loss": 0.5953, + "step": 680 + }, + { + "epoch": 0.5448, + "grad_norm": 0.9052200769081522, + "learning_rate": 9.04240252495963e-05, + "loss": 0.5746, + "step": 681 + }, + { + "epoch": 0.5456, + "grad_norm": 0.8894211309263867, + "learning_rate": 9.016604158703654e-05, + "loss": 0.5901, + "step": 682 + }, + { + "epoch": 0.5464, + "grad_norm": 0.922607707471488, + "learning_rate": 8.990812399726435e-05, + "loss": 0.6184, + "step": 683 + }, + { + "epoch": 0.5472, + "grad_norm": 0.8862217002560381, + "learning_rate": 8.965027421318665e-05, + "loss": 0.612, + "step": 684 + }, + { + "epoch": 0.548, + "grad_norm": 0.9911155918338997, + "learning_rate": 8.939249396725467e-05, + "loss": 0.611, + "step": 685 + }, + { + "epoch": 0.5488, + "grad_norm": 0.7657420492665408, + "learning_rate": 8.913478499145254e-05, + "loss": 0.5286, + "step": 686 + }, + { + "epoch": 0.5496, + "grad_norm": 0.8997367303235172, + "learning_rate": 8.887714901728551e-05, + "loss": 0.577, + "step": 687 + }, + { + "epoch": 0.5504, + "grad_norm": 0.8319139015288592, + "learning_rate": 8.861958777576827e-05, + "loss": 0.5347, + "step": 688 + }, + { + "epoch": 0.5512, + "grad_norm": 0.9457280659526005, + "learning_rate": 8.836210299741346e-05, + "loss": 0.6461, + "step": 689 + }, + { + "epoch": 0.552, + "grad_norm": 0.9271953572324194, + "learning_rate": 8.810469641222001e-05, + "loss": 0.6099, + "step": 690 + }, + { + "epoch": 0.5528, + "grad_norm": 0.9260935366306379, + "learning_rate": 8.784736974966135e-05, + "loss": 0.5267, + "step": 691 + }, + { + "epoch": 0.5536, + "grad_norm": 0.9528620793950472, + "learning_rate": 8.759012473867407e-05, + "loss": 0.5478, + "step": 692 + }, + { + "epoch": 0.5544, + "grad_norm": 1.0044560988345654, + "learning_rate": 8.733296310764611e-05, + "loss": 0.7119, + "step": 693 + }, + { + "epoch": 0.5552, + "grad_norm": 0.8135108163191106, + "learning_rate": 8.707588658440511e-05, + "loss": 0.5216, + "step": 694 + }, + { + "epoch": 0.556, + "grad_norm": 0.8980222891717288, + "learning_rate": 8.6818896896207e-05, + "loss": 0.576, + "step": 695 + }, + { + "epoch": 0.5568, + "grad_norm": 1.0477335377395407, + "learning_rate": 8.656199576972423e-05, + "loss": 0.6906, + "step": 696 + }, + { + "epoch": 0.5576, + "grad_norm": 1.0547626667854606, + "learning_rate": 8.63051849310342e-05, + "loss": 0.656, + "step": 697 + }, + { + "epoch": 0.5584, + "grad_norm": 0.8798438501542369, + "learning_rate": 8.604846610560771e-05, + "loss": 0.5354, + "step": 698 + }, + { + "epoch": 0.5592, + "grad_norm": 1.0117863162265472, + "learning_rate": 8.579184101829734e-05, + "loss": 0.6, + "step": 699 + }, + { + "epoch": 0.56, + "grad_norm": 0.895018639948916, + "learning_rate": 8.553531139332582e-05, + "loss": 0.6008, + "step": 700 + }, + { + "epoch": 0.5608, + "grad_norm": 0.9580031145377335, + "learning_rate": 8.527887895427454e-05, + "loss": 0.6302, + "step": 701 + }, + { + "epoch": 0.5616, + "grad_norm": 0.9021281611904125, + "learning_rate": 8.502254542407186e-05, + "loss": 0.6171, + "step": 702 + }, + { + "epoch": 0.5624, + "grad_norm": 0.9785185862912812, + "learning_rate": 8.476631252498162e-05, + "loss": 0.6775, + "step": 703 + }, + { + "epoch": 0.5632, + "grad_norm": 0.843433757105957, + "learning_rate": 8.451018197859153e-05, + "loss": 0.5161, + "step": 704 + }, + { + "epoch": 0.564, + "grad_norm": 0.9059987007232777, + "learning_rate": 8.425415550580162e-05, + "loss": 0.5804, + "step": 705 + }, + { + "epoch": 0.5648, + "grad_norm": 0.9850582512568193, + "learning_rate": 8.399823482681262e-05, + "loss": 0.6, + "step": 706 + }, + { + "epoch": 0.5656, + "grad_norm": 0.8949371449296837, + "learning_rate": 8.374242166111448e-05, + "loss": 0.5907, + "step": 707 + }, + { + "epoch": 0.5664, + "grad_norm": 0.7637689981060631, + "learning_rate": 8.348671772747487e-05, + "loss": 0.5015, + "step": 708 + }, + { + "epoch": 0.5672, + "grad_norm": 0.9173008428956848, + "learning_rate": 8.323112474392731e-05, + "loss": 0.5879, + "step": 709 + }, + { + "epoch": 0.568, + "grad_norm": 0.9345140097350884, + "learning_rate": 8.297564442776014e-05, + "loss": 0.6243, + "step": 710 + }, + { + "epoch": 0.5688, + "grad_norm": 0.946836398596421, + "learning_rate": 8.272027849550457e-05, + "loss": 0.6282, + "step": 711 + }, + { + "epoch": 0.5696, + "grad_norm": 0.8998445459421743, + "learning_rate": 8.246502866292324e-05, + "loss": 0.5566, + "step": 712 + }, + { + "epoch": 0.5704, + "grad_norm": 0.9022030377922716, + "learning_rate": 8.220989664499878e-05, + "loss": 0.5598, + "step": 713 + }, + { + "epoch": 0.5712, + "grad_norm": 0.9471302548173, + "learning_rate": 8.195488415592238e-05, + "loss": 0.5697, + "step": 714 + }, + { + "epoch": 0.572, + "grad_norm": 0.9315863652553878, + "learning_rate": 8.169999290908188e-05, + "loss": 0.6164, + "step": 715 + }, + { + "epoch": 0.5728, + "grad_norm": 1.0700682801215697, + "learning_rate": 8.144522461705067e-05, + "loss": 0.6168, + "step": 716 + }, + { + "epoch": 0.5736, + "grad_norm": 1.0606049920115181, + "learning_rate": 8.119058099157604e-05, + "loss": 0.64, + "step": 717 + }, + { + "epoch": 0.5744, + "grad_norm": 0.8285105075180479, + "learning_rate": 8.093606374356759e-05, + "loss": 0.5487, + "step": 718 + }, + { + "epoch": 0.5752, + "grad_norm": 0.8752142319351934, + "learning_rate": 8.068167458308582e-05, + "loss": 0.5347, + "step": 719 + }, + { + "epoch": 0.576, + "grad_norm": 0.9651658763970603, + "learning_rate": 8.042741521933071e-05, + "loss": 0.4671, + "step": 720 + }, + { + "epoch": 0.5768, + "grad_norm": 0.9657917378841041, + "learning_rate": 8.017328736063006e-05, + "loss": 0.6231, + "step": 721 + }, + { + "epoch": 0.5776, + "grad_norm": 0.9435597035462718, + "learning_rate": 7.991929271442817e-05, + "loss": 0.6741, + "step": 722 + }, + { + "epoch": 0.5784, + "grad_norm": 0.8761344875784453, + "learning_rate": 7.966543298727425e-05, + "loss": 0.4783, + "step": 723 + }, + { + "epoch": 0.5792, + "grad_norm": 1.0357740548151377, + "learning_rate": 7.941170988481108e-05, + "loss": 0.6813, + "step": 724 + }, + { + "epoch": 0.58, + "grad_norm": 0.8533213245126312, + "learning_rate": 7.915812511176347e-05, + "loss": 0.5665, + "step": 725 + }, + { + "epoch": 0.5808, + "grad_norm": 0.8780721704318892, + "learning_rate": 7.89046803719267e-05, + "loss": 0.5007, + "step": 726 + }, + { + "epoch": 0.5816, + "grad_norm": 0.9449061063034562, + "learning_rate": 7.865137736815535e-05, + "loss": 0.6092, + "step": 727 + }, + { + "epoch": 0.5824, + "grad_norm": 0.9716800570034508, + "learning_rate": 7.839821780235168e-05, + "loss": 0.6404, + "step": 728 + }, + { + "epoch": 0.5832, + "grad_norm": 0.8939295826894426, + "learning_rate": 7.814520337545406e-05, + "loss": 0.6126, + "step": 729 + }, + { + "epoch": 0.584, + "grad_norm": 0.9111497105991285, + "learning_rate": 7.789233578742582e-05, + "loss": 0.545, + "step": 730 + }, + { + "epoch": 0.5848, + "grad_norm": 0.9845780649515906, + "learning_rate": 7.763961673724379e-05, + "loss": 0.6521, + "step": 731 + }, + { + "epoch": 0.5856, + "grad_norm": 1.060729966646695, + "learning_rate": 7.738704792288655e-05, + "loss": 0.6937, + "step": 732 + }, + { + "epoch": 0.5864, + "grad_norm": 1.1137329565177583, + "learning_rate": 7.713463104132345e-05, + "loss": 0.6442, + "step": 733 + }, + { + "epoch": 0.5872, + "grad_norm": 0.9795416162775744, + "learning_rate": 7.688236778850306e-05, + "loss": 0.6429, + "step": 734 + }, + { + "epoch": 0.588, + "grad_norm": 0.9101660654997451, + "learning_rate": 7.663025985934158e-05, + "loss": 0.518, + "step": 735 + }, + { + "epoch": 0.5888, + "grad_norm": 1.029837582749723, + "learning_rate": 7.637830894771175e-05, + "loss": 0.5742, + "step": 736 + }, + { + "epoch": 0.5896, + "grad_norm": 0.8829773480744334, + "learning_rate": 7.61265167464313e-05, + "loss": 0.5556, + "step": 737 + }, + { + "epoch": 0.5904, + "grad_norm": 0.9135755633321601, + "learning_rate": 7.587488494725157e-05, + "loss": 0.5405, + "step": 738 + }, + { + "epoch": 0.5912, + "grad_norm": 0.9503980034000901, + "learning_rate": 7.562341524084623e-05, + "loss": 0.5779, + "step": 739 + }, + { + "epoch": 0.592, + "grad_norm": 1.0232350032188056, + "learning_rate": 7.537210931679987e-05, + "loss": 0.624, + "step": 740 + }, + { + "epoch": 0.5928, + "grad_norm": 1.0241056751401028, + "learning_rate": 7.512096886359664e-05, + "loss": 0.5954, + "step": 741 + }, + { + "epoch": 0.5936, + "grad_norm": 0.9520941462332658, + "learning_rate": 7.48699955686089e-05, + "loss": 0.5899, + "step": 742 + }, + { + "epoch": 0.5944, + "grad_norm": 1.1016754977518257, + "learning_rate": 7.461919111808595e-05, + "loss": 0.6661, + "step": 743 + }, + { + "epoch": 0.5952, + "grad_norm": 1.1279637263444664, + "learning_rate": 7.43685571971426e-05, + "loss": 0.8222, + "step": 744 + }, + { + "epoch": 0.596, + "grad_norm": 1.022109420265956, + "learning_rate": 7.411809548974792e-05, + "loss": 0.5703, + "step": 745 + }, + { + "epoch": 0.5968, + "grad_norm": 1.0138360617776458, + "learning_rate": 7.386780767871397e-05, + "loss": 0.6399, + "step": 746 + }, + { + "epoch": 0.5976, + "grad_norm": 0.9042528295182145, + "learning_rate": 7.361769544568425e-05, + "loss": 0.5849, + "step": 747 + }, + { + "epoch": 0.5984, + "grad_norm": 0.9197630436567867, + "learning_rate": 7.336776047112276e-05, + "loss": 0.5542, + "step": 748 + }, + { + "epoch": 0.5992, + "grad_norm": 0.8072283928049032, + "learning_rate": 7.311800443430251e-05, + "loss": 0.4602, + "step": 749 + }, + { + "epoch": 0.6, + "grad_norm": 0.9247334266158035, + "learning_rate": 7.286842901329412e-05, + "loss": 0.6019, + "step": 750 + }, + { + "epoch": 0.6008, + "grad_norm": 0.8401822746345465, + "learning_rate": 7.26190358849548e-05, + "loss": 0.5639, + "step": 751 + }, + { + "epoch": 0.6016, + "grad_norm": 0.9648310078029783, + "learning_rate": 7.236982672491698e-05, + "loss": 0.6491, + "step": 752 + }, + { + "epoch": 0.6024, + "grad_norm": 0.9603080004347097, + "learning_rate": 7.212080320757695e-05, + "loss": 0.6523, + "step": 753 + }, + { + "epoch": 0.6032, + "grad_norm": 0.9347051369082151, + "learning_rate": 7.187196700608373e-05, + "loss": 0.596, + "step": 754 + }, + { + "epoch": 0.604, + "grad_norm": 0.9923423082065466, + "learning_rate": 7.162331979232783e-05, + "loss": 0.611, + "step": 755 + }, + { + "epoch": 0.6048, + "grad_norm": 0.9035717517346512, + "learning_rate": 7.137486323692995e-05, + "loss": 0.6214, + "step": 756 + }, + { + "epoch": 0.6056, + "grad_norm": 0.9806422813540163, + "learning_rate": 7.112659900922976e-05, + "loss": 0.6559, + "step": 757 + }, + { + "epoch": 0.6064, + "grad_norm": 0.8358362273716006, + "learning_rate": 7.087852877727481e-05, + "loss": 0.5557, + "step": 758 + }, + { + "epoch": 0.6072, + "grad_norm": 0.9762546938398565, + "learning_rate": 7.06306542078091e-05, + "loss": 0.7442, + "step": 759 + }, + { + "epoch": 0.608, + "grad_norm": 0.8457968285899077, + "learning_rate": 7.038297696626206e-05, + "loss": 0.6362, + "step": 760 + }, + { + "epoch": 0.6088, + "grad_norm": 0.9867394684229117, + "learning_rate": 7.013549871673736e-05, + "loss": 0.6626, + "step": 761 + }, + { + "epoch": 0.6096, + "grad_norm": 0.9804817061905197, + "learning_rate": 6.988822112200156e-05, + "loss": 0.5673, + "step": 762 + }, + { + "epoch": 0.6104, + "grad_norm": 0.8686750808478814, + "learning_rate": 6.964114584347316e-05, + "loss": 0.5164, + "step": 763 + }, + { + "epoch": 0.6112, + "grad_norm": 1.0739535691613076, + "learning_rate": 6.939427454121128e-05, + "loss": 0.7, + "step": 764 + }, + { + "epoch": 0.612, + "grad_norm": 0.974910792051444, + "learning_rate": 6.914760887390452e-05, + "loss": 0.5915, + "step": 765 + }, + { + "epoch": 0.6128, + "grad_norm": 0.9274313889916219, + "learning_rate": 6.890115049885994e-05, + "loss": 0.5763, + "step": 766 + }, + { + "epoch": 0.6136, + "grad_norm": 1.1337383279689464, + "learning_rate": 6.865490107199181e-05, + "loss": 0.6324, + "step": 767 + }, + { + "epoch": 0.6144, + "grad_norm": 0.8722391993884088, + "learning_rate": 6.84088622478104e-05, + "loss": 0.5442, + "step": 768 + }, + { + "epoch": 0.6152, + "grad_norm": 0.9008678609241617, + "learning_rate": 6.816303567941112e-05, + "loss": 0.5032, + "step": 769 + }, + { + "epoch": 0.616, + "grad_norm": 0.8786081409518408, + "learning_rate": 6.791742301846326e-05, + "loss": 0.5224, + "step": 770 + }, + { + "epoch": 0.6168, + "grad_norm": 0.8782753024763686, + "learning_rate": 6.767202591519875e-05, + "loss": 0.5512, + "step": 771 + }, + { + "epoch": 0.6176, + "grad_norm": 0.9485361902342623, + "learning_rate": 6.742684601840141e-05, + "loss": 0.468, + "step": 772 + }, + { + "epoch": 0.6184, + "grad_norm": 0.9980592476102427, + "learning_rate": 6.718188497539554e-05, + "loss": 0.6164, + "step": 773 + }, + { + "epoch": 0.6192, + "grad_norm": 0.890415820394609, + "learning_rate": 6.693714443203507e-05, + "loss": 0.582, + "step": 774 + }, + { + "epoch": 0.62, + "grad_norm": 1.0930709924661823, + "learning_rate": 6.669262603269246e-05, + "loss": 0.6374, + "step": 775 + }, + { + "epoch": 0.6208, + "grad_norm": 0.8602461742143718, + "learning_rate": 6.644833142024751e-05, + "loss": 0.5258, + "step": 776 + }, + { + "epoch": 0.6216, + "grad_norm": 0.9049447130718785, + "learning_rate": 6.620426223607654e-05, + "loss": 0.615, + "step": 777 + }, + { + "epoch": 0.6224, + "grad_norm": 0.8478137147670415, + "learning_rate": 6.59604201200412e-05, + "loss": 0.5523, + "step": 778 + }, + { + "epoch": 0.6232, + "grad_norm": 1.0428062444573134, + "learning_rate": 6.571680671047749e-05, + "loss": 0.6102, + "step": 779 + }, + { + "epoch": 0.624, + "grad_norm": 0.9060803547799261, + "learning_rate": 6.547342364418481e-05, + "loss": 0.5192, + "step": 780 + }, + { + "epoch": 0.6248, + "grad_norm": 0.8671851045560147, + "learning_rate": 6.523027255641493e-05, + "loss": 0.5236, + "step": 781 + }, + { + "epoch": 0.6256, + "grad_norm": 0.9030872650059336, + "learning_rate": 6.498735508086093e-05, + "loss": 0.5564, + "step": 782 + }, + { + "epoch": 0.6264, + "grad_norm": 0.8891999761941229, + "learning_rate": 6.474467284964634e-05, + "loss": 0.4803, + "step": 783 + }, + { + "epoch": 0.6272, + "grad_norm": 1.2175211199100915, + "learning_rate": 6.450222749331414e-05, + "loss": 0.606, + "step": 784 + }, + { + "epoch": 0.628, + "grad_norm": 0.9223325697404515, + "learning_rate": 6.426002064081565e-05, + "loss": 0.6114, + "step": 785 + }, + { + "epoch": 0.6288, + "grad_norm": 1.1518593501565233, + "learning_rate": 6.40180539194999e-05, + "loss": 0.5726, + "step": 786 + }, + { + "epoch": 0.6296, + "grad_norm": 1.00007730967823, + "learning_rate": 6.377632895510248e-05, + "loss": 0.6566, + "step": 787 + }, + { + "epoch": 0.6304, + "grad_norm": 0.9727693231297316, + "learning_rate": 6.35348473717345e-05, + "loss": 0.5765, + "step": 788 + }, + { + "epoch": 0.6312, + "grad_norm": 0.9033874622473923, + "learning_rate": 6.329361079187199e-05, + "loss": 0.6042, + "step": 789 + }, + { + "epoch": 0.632, + "grad_norm": 0.9458238352865332, + "learning_rate": 6.305262083634488e-05, + "loss": 0.5683, + "step": 790 + }, + { + "epoch": 0.6328, + "grad_norm": 0.8562750584506673, + "learning_rate": 6.281187912432587e-05, + "loss": 0.5072, + "step": 791 + }, + { + "epoch": 0.6336, + "grad_norm": 0.9794396912129939, + "learning_rate": 6.25713872733199e-05, + "loss": 0.6185, + "step": 792 + }, + { + "epoch": 0.6344, + "grad_norm": 0.8850179136959749, + "learning_rate": 6.233114689915316e-05, + "loss": 0.5479, + "step": 793 + }, + { + "epoch": 0.6352, + "grad_norm": 1.179241015400573, + "learning_rate": 6.209115961596208e-05, + "loss": 0.781, + "step": 794 + }, + { + "epoch": 0.636, + "grad_norm": 0.9272477221458543, + "learning_rate": 6.18514270361827e-05, + "loss": 0.5702, + "step": 795 + }, + { + "epoch": 0.6368, + "grad_norm": 0.9009021563821138, + "learning_rate": 6.161195077053976e-05, + "loss": 0.561, + "step": 796 + }, + { + "epoch": 0.6376, + "grad_norm": 0.8581708176413048, + "learning_rate": 6.13727324280358e-05, + "loss": 0.5186, + "step": 797 + }, + { + "epoch": 0.6384, + "grad_norm": 0.8723976354383418, + "learning_rate": 6.113377361594049e-05, + "loss": 0.5318, + "step": 798 + }, + { + "epoch": 0.6392, + "grad_norm": 0.7431457385938055, + "learning_rate": 6.08950759397797e-05, + "loss": 0.4181, + "step": 799 + }, + { + "epoch": 0.64, + "grad_norm": 0.8837427101900246, + "learning_rate": 6.065664100332478e-05, + "loss": 0.5868, + "step": 800 + }, + { + "epoch": 0.6408, + "grad_norm": 0.8579725182366548, + "learning_rate": 6.0418470408581774e-05, + "loss": 0.5344, + "step": 801 + }, + { + "epoch": 0.6416, + "grad_norm": 0.9071593023553052, + "learning_rate": 6.018056575578075e-05, + "loss": 0.5674, + "step": 802 + }, + { + "epoch": 0.6424, + "grad_norm": 0.8253343827701816, + "learning_rate": 5.9942928643364724e-05, + "loss": 0.5168, + "step": 803 + }, + { + "epoch": 0.6432, + "grad_norm": 0.8347321820495228, + "learning_rate": 5.970556066797941e-05, + "loss": 0.4898, + "step": 804 + }, + { + "epoch": 0.644, + "grad_norm": 0.8580588005870989, + "learning_rate": 5.946846342446214e-05, + "loss": 0.6004, + "step": 805 + }, + { + "epoch": 0.6448, + "grad_norm": 0.9068280468472921, + "learning_rate": 5.923163850583113e-05, + "loss": 0.6366, + "step": 806 + }, + { + "epoch": 0.6456, + "grad_norm": 0.8725133877344808, + "learning_rate": 5.899508750327501e-05, + "loss": 0.5617, + "step": 807 + }, + { + "epoch": 0.6464, + "grad_norm": 1.1267908594103957, + "learning_rate": 5.875881200614207e-05, + "loss": 0.665, + "step": 808 + }, + { + "epoch": 0.6472, + "grad_norm": 0.9734516620486875, + "learning_rate": 5.8522813601929324e-05, + "loss": 0.6219, + "step": 809 + }, + { + "epoch": 0.648, + "grad_norm": 0.9183237827735163, + "learning_rate": 5.828709387627218e-05, + "loss": 0.5942, + "step": 810 + }, + { + "epoch": 0.6488, + "grad_norm": 0.9320508265907027, + "learning_rate": 5.80516544129337e-05, + "loss": 0.6137, + "step": 811 + }, + { + "epoch": 0.6496, + "grad_norm": 0.8649942159597835, + "learning_rate": 5.781649679379378e-05, + "loss": 0.5499, + "step": 812 + }, + { + "epoch": 0.6504, + "grad_norm": 0.8939880978861915, + "learning_rate": 5.758162259883867e-05, + "loss": 0.5474, + "step": 813 + }, + { + "epoch": 0.6512, + "grad_norm": 0.7997400615731897, + "learning_rate": 5.73470334061505e-05, + "loss": 0.5049, + "step": 814 + }, + { + "epoch": 0.652, + "grad_norm": 0.897826974097443, + "learning_rate": 5.7112730791896207e-05, + "loss": 0.5178, + "step": 815 + }, + { + "epoch": 0.6528, + "grad_norm": 0.9137494525351595, + "learning_rate": 5.687871633031754e-05, + "loss": 0.5506, + "step": 816 + }, + { + "epoch": 0.6536, + "grad_norm": 0.8968322331004761, + "learning_rate": 5.664499159372017e-05, + "loss": 0.5621, + "step": 817 + }, + { + "epoch": 0.6544, + "grad_norm": 0.8935469934960261, + "learning_rate": 5.6411558152462894e-05, + "loss": 0.5297, + "step": 818 + }, + { + "epoch": 0.6552, + "grad_norm": 0.9802607196989127, + "learning_rate": 5.617841757494762e-05, + "loss": 0.609, + "step": 819 + }, + { + "epoch": 0.656, + "grad_norm": 0.9244342711411846, + "learning_rate": 5.5945571427608526e-05, + "loss": 0.6323, + "step": 820 + }, + { + "epoch": 0.6568, + "grad_norm": 0.9921448739093076, + "learning_rate": 5.5713021274901335e-05, + "loss": 0.6659, + "step": 821 + }, + { + "epoch": 0.6576, + "grad_norm": 0.8082608559518543, + "learning_rate": 5.54807686792933e-05, + "loss": 0.4815, + "step": 822 + }, + { + "epoch": 0.6584, + "grad_norm": 0.8853742615880639, + "learning_rate": 5.524881520125229e-05, + "loss": 0.5958, + "step": 823 + }, + { + "epoch": 0.6592, + "grad_norm": 0.9318274124318456, + "learning_rate": 5.501716239923642e-05, + "loss": 0.5057, + "step": 824 + }, + { + "epoch": 0.66, + "grad_norm": 0.8440535946971125, + "learning_rate": 5.4785811829683764e-05, + "loss": 0.5626, + "step": 825 + }, + { + "epoch": 0.6608, + "grad_norm": 1.0839068704747066, + "learning_rate": 5.4554765047001613e-05, + "loss": 0.6763, + "step": 826 + }, + { + "epoch": 0.6616, + "grad_norm": 0.9503143284762061, + "learning_rate": 5.432402360355615e-05, + "loss": 0.5612, + "step": 827 + }, + { + "epoch": 0.6624, + "grad_norm": 0.8985459420279938, + "learning_rate": 5.4093589049662175e-05, + "loss": 0.6194, + "step": 828 + }, + { + "epoch": 0.6632, + "grad_norm": 0.9588031538158126, + "learning_rate": 5.386346293357242e-05, + "loss": 0.5763, + "step": 829 + }, + { + "epoch": 0.664, + "grad_norm": 0.9903749264660486, + "learning_rate": 5.363364680146725e-05, + "loss": 0.5527, + "step": 830 + }, + { + "epoch": 0.6648, + "grad_norm": 1.0545286776848906, + "learning_rate": 5.3404142197444506e-05, + "loss": 0.629, + "step": 831 + }, + { + "epoch": 0.6656, + "grad_norm": 0.9187204254406824, + "learning_rate": 5.31749506635086e-05, + "loss": 0.5336, + "step": 832 + }, + { + "epoch": 0.6664, + "grad_norm": 0.9074774869812791, + "learning_rate": 5.2946073739560706e-05, + "loss": 0.5807, + "step": 833 + }, + { + "epoch": 0.6672, + "grad_norm": 0.9354272316349254, + "learning_rate": 5.271751296338823e-05, + "loss": 0.603, + "step": 834 + }, + { + "epoch": 0.668, + "grad_norm": 0.8621968567745608, + "learning_rate": 5.248926987065417e-05, + "loss": 0.5029, + "step": 835 + }, + { + "epoch": 0.6688, + "grad_norm": 1.0503686217853245, + "learning_rate": 5.226134599488728e-05, + "loss": 0.6687, + "step": 836 + }, + { + "epoch": 0.6696, + "grad_norm": 0.8087437735035757, + "learning_rate": 5.203374286747158e-05, + "loss": 0.5049, + "step": 837 + }, + { + "epoch": 0.6704, + "grad_norm": 1.0286699396273984, + "learning_rate": 5.180646201763577e-05, + "loss": 0.7108, + "step": 838 + }, + { + "epoch": 0.6712, + "grad_norm": 1.1222870059539025, + "learning_rate": 5.15795049724435e-05, + "loss": 0.5778, + "step": 839 + }, + { + "epoch": 0.672, + "grad_norm": 0.8682346562913431, + "learning_rate": 5.135287325678271e-05, + "loss": 0.5769, + "step": 840 + }, + { + "epoch": 0.6728, + "grad_norm": 0.8970792816397353, + "learning_rate": 5.112656839335543e-05, + "loss": 0.5973, + "step": 841 + }, + { + "epoch": 0.6736, + "grad_norm": 0.8758641896709833, + "learning_rate": 5.090059190266779e-05, + "loss": 0.541, + "step": 842 + }, + { + "epoch": 0.6744, + "grad_norm": 0.8193765480762364, + "learning_rate": 5.0674945303019526e-05, + "loss": 0.4439, + "step": 843 + }, + { + "epoch": 0.6752, + "grad_norm": 0.9642302230497448, + "learning_rate": 5.0449630110493836e-05, + "loss": 0.6231, + "step": 844 + }, + { + "epoch": 0.676, + "grad_norm": 1.202530953279566, + "learning_rate": 5.022464783894744e-05, + "loss": 0.7005, + "step": 845 + }, + { + "epoch": 0.6768, + "grad_norm": 0.8186776036576376, + "learning_rate": 5.000000000000002e-05, + "loss": 0.5625, + "step": 846 + }, + { + "epoch": 0.6776, + "grad_norm": 0.8509909294818085, + "learning_rate": 4.977568810302432e-05, + "loss": 0.5207, + "step": 847 + }, + { + "epoch": 0.6784, + "grad_norm": 0.8630572024442712, + "learning_rate": 4.955171365513603e-05, + "loss": 0.5531, + "step": 848 + }, + { + "epoch": 0.6792, + "grad_norm": 0.9776376629053878, + "learning_rate": 4.9328078161183464e-05, + "loss": 0.6213, + "step": 849 + }, + { + "epoch": 0.68, + "grad_norm": 0.8327183472419162, + "learning_rate": 4.9104783123737566e-05, + "loss": 0.5478, + "step": 850 + }, + { + "epoch": 0.6808, + "grad_norm": 0.9629843450121315, + "learning_rate": 4.88818300430819e-05, + "loss": 0.6228, + "step": 851 + }, + { + "epoch": 0.6816, + "grad_norm": 0.9428958922382763, + "learning_rate": 4.865922041720239e-05, + "loss": 0.6306, + "step": 852 + }, + { + "epoch": 0.6824, + "grad_norm": 0.8766919189818014, + "learning_rate": 4.843695574177737e-05, + "loss": 0.5252, + "step": 853 + }, + { + "epoch": 0.6832, + "grad_norm": 0.9607715277505974, + "learning_rate": 4.821503751016746e-05, + "loss": 0.5826, + "step": 854 + }, + { + "epoch": 0.684, + "grad_norm": 0.8811357710516505, + "learning_rate": 4.7993467213405706e-05, + "loss": 0.558, + "step": 855 + }, + { + "epoch": 0.6848, + "grad_norm": 0.8633162238313675, + "learning_rate": 4.777224634018732e-05, + "loss": 0.6029, + "step": 856 + }, + { + "epoch": 0.6856, + "grad_norm": 0.7826786876638742, + "learning_rate": 4.755137637685979e-05, + "loss": 0.4493, + "step": 857 + }, + { + "epoch": 0.6864, + "grad_norm": 1.1234018311636582, + "learning_rate": 4.733085880741301e-05, + "loss": 0.5906, + "step": 858 + }, + { + "epoch": 0.6872, + "grad_norm": 0.8133381724130317, + "learning_rate": 4.7110695113469085e-05, + "loss": 0.4849, + "step": 859 + }, + { + "epoch": 0.688, + "grad_norm": 0.8811594930337688, + "learning_rate": 4.689088677427249e-05, + "loss": 0.5501, + "step": 860 + }, + { + "epoch": 0.6888, + "grad_norm": 0.9625523729130937, + "learning_rate": 4.6671435266680216e-05, + "loss": 0.602, + "step": 861 + }, + { + "epoch": 0.6896, + "grad_norm": 1.7004748721610694, + "learning_rate": 4.645234206515171e-05, + "loss": 0.7473, + "step": 862 + }, + { + "epoch": 0.6904, + "grad_norm": 0.9115527765393678, + "learning_rate": 4.623360864173893e-05, + "loss": 0.5729, + "step": 863 + }, + { + "epoch": 0.6912, + "grad_norm": 0.8633236144748906, + "learning_rate": 4.6015236466076747e-05, + "loss": 0.5243, + "step": 864 + }, + { + "epoch": 0.692, + "grad_norm": 0.861669670026954, + "learning_rate": 4.579722700537268e-05, + "loss": 0.4855, + "step": 865 + }, + { + "epoch": 0.6928, + "grad_norm": 1.0431346413694014, + "learning_rate": 4.5579581724397255e-05, + "loss": 0.648, + "step": 866 + }, + { + "epoch": 0.6936, + "grad_norm": 0.918967588419062, + "learning_rate": 4.5362302085474254e-05, + "loss": 0.5108, + "step": 867 + }, + { + "epoch": 0.6944, + "grad_norm": 1.010661153516246, + "learning_rate": 4.514538954847064e-05, + "loss": 0.5907, + "step": 868 + }, + { + "epoch": 0.6952, + "grad_norm": 0.8450664077888524, + "learning_rate": 4.492884557078688e-05, + "loss": 0.4896, + "step": 869 + }, + { + "epoch": 0.696, + "grad_norm": 0.9923031127222196, + "learning_rate": 4.471267160734731e-05, + "loss": 0.6135, + "step": 870 + }, + { + "epoch": 0.6968, + "grad_norm": 0.9288854200687368, + "learning_rate": 4.449686911058992e-05, + "loss": 0.5613, + "step": 871 + }, + { + "epoch": 0.6976, + "grad_norm": 0.9522743141061653, + "learning_rate": 4.428143953045717e-05, + "loss": 0.4701, + "step": 872 + }, + { + "epoch": 0.6984, + "grad_norm": 1.0094695453286113, + "learning_rate": 4.406638431438576e-05, + "loss": 0.58, + "step": 873 + }, + { + "epoch": 0.6992, + "grad_norm": 0.970218865338825, + "learning_rate": 4.385170490729712e-05, + "loss": 0.5926, + "step": 874 + }, + { + "epoch": 0.7, + "grad_norm": 0.790855904118202, + "learning_rate": 4.36374027515878e-05, + "loss": 0.4653, + "step": 875 + }, + { + "epoch": 0.7008, + "grad_norm": 0.8422660876340917, + "learning_rate": 4.342347928711953e-05, + "loss": 0.4801, + "step": 876 + }, + { + "epoch": 0.7016, + "grad_norm": 1.0014831126614723, + "learning_rate": 4.320993595120969e-05, + "loss": 0.4986, + "step": 877 + }, + { + "epoch": 0.7024, + "grad_norm": 0.9900852217587683, + "learning_rate": 4.2996774178621736e-05, + "loss": 0.5694, + "step": 878 + }, + { + "epoch": 0.7032, + "grad_norm": 0.9417817304764583, + "learning_rate": 4.278399540155536e-05, + "loss": 0.5908, + "step": 879 + }, + { + "epoch": 0.704, + "grad_norm": 0.9192656513426533, + "learning_rate": 4.257160104963696e-05, + "loss": 0.561, + "step": 880 + }, + { + "epoch": 0.7048, + "grad_norm": 0.9390210428837307, + "learning_rate": 4.2359592549910145e-05, + "loss": 0.5637, + "step": 881 + }, + { + "epoch": 0.7056, + "grad_norm": 0.8023042426447257, + "learning_rate": 4.2147971326825966e-05, + "loss": 0.5082, + "step": 882 + }, + { + "epoch": 0.7064, + "grad_norm": 0.994269753573064, + "learning_rate": 4.193673880223339e-05, + "loss": 0.5859, + "step": 883 + }, + { + "epoch": 0.7072, + "grad_norm": 0.9581278006089728, + "learning_rate": 4.172589639536991e-05, + "loss": 0.6368, + "step": 884 + }, + { + "epoch": 0.708, + "grad_norm": 1.157025545060076, + "learning_rate": 4.1515445522851784e-05, + "loss": 0.731, + "step": 885 + }, + { + "epoch": 0.7088, + "grad_norm": 0.9026081678125137, + "learning_rate": 4.130538759866457e-05, + "loss": 0.574, + "step": 886 + }, + { + "epoch": 0.7096, + "grad_norm": 0.8511295626883457, + "learning_rate": 4.109572403415386e-05, + "loss": 0.5233, + "step": 887 + }, + { + "epoch": 0.7104, + "grad_norm": 0.9605202807866595, + "learning_rate": 4.088645623801534e-05, + "loss": 0.5892, + "step": 888 + }, + { + "epoch": 0.7112, + "grad_norm": 0.9941106462464011, + "learning_rate": 4.0677585616285774e-05, + "loss": 0.6674, + "step": 889 + }, + { + "epoch": 0.712, + "grad_norm": 0.958307096804603, + "learning_rate": 4.046911357233343e-05, + "loss": 0.5351, + "step": 890 + }, + { + "epoch": 0.7128, + "grad_norm": 0.8848714279388494, + "learning_rate": 4.026104150684835e-05, + "loss": 0.6071, + "step": 891 + }, + { + "epoch": 0.7136, + "grad_norm": 0.9124009475249037, + "learning_rate": 4.00533708178334e-05, + "loss": 0.4971, + "step": 892 + }, + { + "epoch": 0.7144, + "grad_norm": 0.8170349079531632, + "learning_rate": 3.984610290059467e-05, + "loss": 0.4853, + "step": 893 + }, + { + "epoch": 0.7152, + "grad_norm": 0.9168135684559577, + "learning_rate": 3.963923914773187e-05, + "loss": 0.5256, + "step": 894 + }, + { + "epoch": 0.716, + "grad_norm": 0.8958868424097945, + "learning_rate": 3.943278094912946e-05, + "loss": 0.595, + "step": 895 + }, + { + "epoch": 0.7168, + "grad_norm": 1.0322136965161413, + "learning_rate": 3.922672969194686e-05, + "loss": 0.625, + "step": 896 + }, + { + "epoch": 0.7176, + "grad_norm": 0.8686088201111917, + "learning_rate": 3.902108676060937e-05, + "loss": 0.5213, + "step": 897 + }, + { + "epoch": 0.7184, + "grad_norm": 1.397943370653368, + "learning_rate": 3.8815853536798904e-05, + "loss": 0.7105, + "step": 898 + }, + { + "epoch": 0.7192, + "grad_norm": 0.849284154839858, + "learning_rate": 3.861103139944449e-05, + "loss": 0.471, + "step": 899 + }, + { + "epoch": 0.72, + "grad_norm": 0.9651700194122222, + "learning_rate": 3.840662172471315e-05, + "loss": 0.5713, + "step": 900 + }, + { + "epoch": 0.7208, + "grad_norm": 0.8461510681085123, + "learning_rate": 3.820262588600074e-05, + "loss": 0.526, + "step": 901 + }, + { + "epoch": 0.7216, + "grad_norm": 0.8872125095643726, + "learning_rate": 3.79990452539225e-05, + "loss": 0.5451, + "step": 902 + }, + { + "epoch": 0.7224, + "grad_norm": 0.7725367210449859, + "learning_rate": 3.7795881196303995e-05, + "loss": 0.5168, + "step": 903 + }, + { + "epoch": 0.7232, + "grad_norm": 0.9731750885863505, + "learning_rate": 3.759313507817196e-05, + "loss": 0.5905, + "step": 904 + }, + { + "epoch": 0.724, + "grad_norm": 0.8575890240241985, + "learning_rate": 3.739080826174498e-05, + "loss": 0.5402, + "step": 905 + }, + { + "epoch": 0.7248, + "grad_norm": 0.9310603758600113, + "learning_rate": 3.7188902106424416e-05, + "loss": 0.5713, + "step": 906 + }, + { + "epoch": 0.7256, + "grad_norm": 0.9486072925333384, + "learning_rate": 3.6987417968785366e-05, + "loss": 0.5536, + "step": 907 + }, + { + "epoch": 0.7264, + "grad_norm": 0.8796289974349658, + "learning_rate": 3.678635720256737e-05, + "loss": 0.473, + "step": 908 + }, + { + "epoch": 0.7272, + "grad_norm": 0.904832093813838, + "learning_rate": 3.658572115866541e-05, + "loss": 0.5644, + "step": 909 + }, + { + "epoch": 0.728, + "grad_norm": 0.9959967053012221, + "learning_rate": 3.638551118512089e-05, + "loss": 0.6113, + "step": 910 + }, + { + "epoch": 0.7288, + "grad_norm": 1.0035413750150226, + "learning_rate": 3.618572862711247e-05, + "loss": 0.5597, + "step": 911 + }, + { + "epoch": 0.7296, + "grad_norm": 0.7522128103248988, + "learning_rate": 3.5986374826947066e-05, + "loss": 0.3973, + "step": 912 + }, + { + "epoch": 0.7304, + "grad_norm": 0.9559089106196158, + "learning_rate": 3.578745112405083e-05, + "loss": 0.6254, + "step": 913 + }, + { + "epoch": 0.7312, + "grad_norm": 0.9082540088018486, + "learning_rate": 3.558895885496023e-05, + "loss": 0.5922, + "step": 914 + }, + { + "epoch": 0.732, + "grad_norm": 0.9209653753383338, + "learning_rate": 3.539089935331294e-05, + "loss": 0.5191, + "step": 915 + }, + { + "epoch": 0.7328, + "grad_norm": 0.9753363807773336, + "learning_rate": 3.519327394983888e-05, + "loss": 0.6246, + "step": 916 + }, + { + "epoch": 0.7336, + "grad_norm": 0.9128765043266641, + "learning_rate": 3.4996083972351515e-05, + "loss": 0.5684, + "step": 917 + }, + { + "epoch": 0.7344, + "grad_norm": 0.9795183066919234, + "learning_rate": 3.479933074573858e-05, + "loss": 0.627, + "step": 918 + }, + { + "epoch": 0.7352, + "grad_norm": 0.9370464016613507, + "learning_rate": 3.4603015591953395e-05, + "loss": 0.5785, + "step": 919 + }, + { + "epoch": 0.736, + "grad_norm": 0.8661804985123592, + "learning_rate": 3.440713983000601e-05, + "loss": 0.4774, + "step": 920 + }, + { + "epoch": 0.7368, + "grad_norm": 0.9275349767601161, + "learning_rate": 3.421170477595419e-05, + "loss": 0.5361, + "step": 921 + }, + { + "epoch": 0.7376, + "grad_norm": 0.9776163224048302, + "learning_rate": 3.401671174289469e-05, + "loss": 0.6501, + "step": 922 + }, + { + "epoch": 0.7384, + "grad_norm": 0.9684150776770701, + "learning_rate": 3.3822162040954354e-05, + "loss": 0.6016, + "step": 923 + }, + { + "epoch": 0.7392, + "grad_norm": 0.8536286255083416, + "learning_rate": 3.362805697728145e-05, + "loss": 0.4827, + "step": 924 + }, + { + "epoch": 0.74, + "grad_norm": 0.8488705178074374, + "learning_rate": 3.34343978560367e-05, + "loss": 0.5068, + "step": 925 + }, + { + "epoch": 0.7408, + "grad_norm": 0.9653390506845506, + "learning_rate": 3.324118597838464e-05, + "loss": 0.5196, + "step": 926 + }, + { + "epoch": 0.7416, + "grad_norm": 0.8436991002384736, + "learning_rate": 3.3048422642484886e-05, + "loss": 0.4656, + "step": 927 + }, + { + "epoch": 0.7424, + "grad_norm": 0.8749823412974033, + "learning_rate": 3.285610914348332e-05, + "loss": 0.5662, + "step": 928 + }, + { + "epoch": 0.7432, + "grad_norm": 0.8985882406835242, + "learning_rate": 3.266424677350346e-05, + "loss": 0.5482, + "step": 929 + }, + { + "epoch": 0.744, + "grad_norm": 0.9630164324990867, + "learning_rate": 3.2472836821637744e-05, + "loss": 0.5153, + "step": 930 + }, + { + "epoch": 0.7448, + "grad_norm": 0.7646616715144958, + "learning_rate": 3.228188057393895e-05, + "loss": 0.4989, + "step": 931 + }, + { + "epoch": 0.7456, + "grad_norm": 0.9902478452601741, + "learning_rate": 3.209137931341143e-05, + "loss": 0.586, + "step": 932 + }, + { + "epoch": 0.7464, + "grad_norm": 0.7945357744020473, + "learning_rate": 3.190133432000252e-05, + "loss": 0.49, + "step": 933 + }, + { + "epoch": 0.7472, + "grad_norm": 0.9437580839050794, + "learning_rate": 3.1711746870594086e-05, + "loss": 0.5464, + "step": 934 + }, + { + "epoch": 0.748, + "grad_norm": 0.9000794532132054, + "learning_rate": 3.1522618238993725e-05, + "loss": 0.5479, + "step": 935 + }, + { + "epoch": 0.7488, + "grad_norm": 0.8248876467754059, + "learning_rate": 3.1333949695926324e-05, + "loss": 0.4422, + "step": 936 + }, + { + "epoch": 0.7496, + "grad_norm": 1.0134703940851104, + "learning_rate": 3.114574250902558e-05, + "loss": 0.5729, + "step": 937 + }, + { + "epoch": 0.7504, + "grad_norm": 0.8799946140730117, + "learning_rate": 3.0957997942825336e-05, + "loss": 0.5678, + "step": 938 + }, + { + "epoch": 0.7512, + "grad_norm": 0.8065366798841783, + "learning_rate": 3.077071725875116e-05, + "loss": 0.4753, + "step": 939 + }, + { + "epoch": 0.752, + "grad_norm": 0.9475335105937116, + "learning_rate": 3.058390171511196e-05, + "loss": 0.6077, + "step": 940 + }, + { + "epoch": 0.7528, + "grad_norm": 0.7754114695147104, + "learning_rate": 3.0397552567091337e-05, + "loss": 0.4338, + "step": 941 + }, + { + "epoch": 0.7536, + "grad_norm": 0.8374808007484986, + "learning_rate": 3.021167106673928e-05, + "loss": 0.4497, + "step": 942 + }, + { + "epoch": 0.7544, + "grad_norm": 0.8323538409441401, + "learning_rate": 3.0026258462963787e-05, + "loss": 0.5003, + "step": 943 + }, + { + "epoch": 0.7552, + "grad_norm": 0.8992981567160874, + "learning_rate": 2.9841316001522347e-05, + "loss": 0.4931, + "step": 944 + }, + { + "epoch": 0.756, + "grad_norm": 0.9523785259083702, + "learning_rate": 2.9656844925013637e-05, + "loss": 0.545, + "step": 945 + }, + { + "epoch": 0.7568, + "grad_norm": 0.9028532965536356, + "learning_rate": 2.9472846472869298e-05, + "loss": 0.5643, + "step": 946 + }, + { + "epoch": 0.7576, + "grad_norm": 0.8329661181095044, + "learning_rate": 2.9289321881345254e-05, + "loss": 0.5065, + "step": 947 + }, + { + "epoch": 0.7584, + "grad_norm": 0.9045489579885994, + "learning_rate": 2.9106272383513835e-05, + "loss": 0.5754, + "step": 948 + }, + { + "epoch": 0.7592, + "grad_norm": 0.8122236201667463, + "learning_rate": 2.8923699209255284e-05, + "loss": 0.4596, + "step": 949 + }, + { + "epoch": 0.76, + "grad_norm": 0.9458700607662385, + "learning_rate": 2.874160358524931e-05, + "loss": 0.5422, + "step": 950 + }, + { + "epoch": 0.7608, + "grad_norm": 0.9400045938705994, + "learning_rate": 2.8559986734967282e-05, + "loss": 0.5691, + "step": 951 + }, + { + "epoch": 0.7616, + "grad_norm": 0.950119666470634, + "learning_rate": 2.8378849878663628e-05, + "loss": 0.6084, + "step": 952 + }, + { + "epoch": 0.7624, + "grad_norm": 0.8266852543982652, + "learning_rate": 2.819819423336775e-05, + "loss": 0.5058, + "step": 953 + }, + { + "epoch": 0.7632, + "grad_norm": 0.9171482494675368, + "learning_rate": 2.8018021012875994e-05, + "loss": 0.5265, + "step": 954 + }, + { + "epoch": 0.764, + "grad_norm": 0.9129964408049166, + "learning_rate": 2.7838331427743282e-05, + "loss": 0.4856, + "step": 955 + }, + { + "epoch": 0.7648, + "grad_norm": 0.970371319237002, + "learning_rate": 2.7659126685275027e-05, + "loss": 0.5786, + "step": 956 + }, + { + "epoch": 0.7656, + "grad_norm": 0.938108920869477, + "learning_rate": 2.7480407989519198e-05, + "loss": 0.5396, + "step": 957 + }, + { + "epoch": 0.7664, + "grad_norm": 0.8324834459051568, + "learning_rate": 2.7302176541257986e-05, + "loss": 0.5321, + "step": 958 + }, + { + "epoch": 0.7672, + "grad_norm": 0.9720401613921223, + "learning_rate": 2.712443353799984e-05, + "loss": 0.6282, + "step": 959 + }, + { + "epoch": 0.768, + "grad_norm": 0.9211477035330666, + "learning_rate": 2.6947180173971508e-05, + "loss": 0.5049, + "step": 960 + }, + { + "epoch": 0.7688, + "grad_norm": 0.9114536236330752, + "learning_rate": 2.677041764010988e-05, + "loss": 0.5723, + "step": 961 + }, + { + "epoch": 0.7696, + "grad_norm": 0.869206743166981, + "learning_rate": 2.659414712405398e-05, + "loss": 0.5251, + "step": 962 + }, + { + "epoch": 0.7704, + "grad_norm": 0.9131960209724165, + "learning_rate": 2.6418369810137188e-05, + "loss": 0.5253, + "step": 963 + }, + { + "epoch": 0.7712, + "grad_norm": 1.0325033567001431, + "learning_rate": 2.6243086879379e-05, + "loss": 0.5184, + "step": 964 + }, + { + "epoch": 0.772, + "grad_norm": 1.0711695560219818, + "learning_rate": 2.6068299509477266e-05, + "loss": 0.4953, + "step": 965 + }, + { + "epoch": 0.7728, + "grad_norm": 0.79993043009904, + "learning_rate": 2.5894008874800325e-05, + "loss": 0.4594, + "step": 966 + }, + { + "epoch": 0.7736, + "grad_norm": 0.9076938969046555, + "learning_rate": 2.5720216146378917e-05, + "loss": 0.5813, + "step": 967 + }, + { + "epoch": 0.7744, + "grad_norm": 1.1954518554339275, + "learning_rate": 2.5546922491898495e-05, + "loss": 0.6984, + "step": 968 + }, + { + "epoch": 0.7752, + "grad_norm": 0.9553192337296545, + "learning_rate": 2.5374129075691265e-05, + "loss": 0.645, + "step": 969 + }, + { + "epoch": 0.776, + "grad_norm": 0.8824602104943458, + "learning_rate": 2.5201837058728505e-05, + "loss": 0.5727, + "step": 970 + }, + { + "epoch": 0.7768, + "grad_norm": 1.1307467443908437, + "learning_rate": 2.503004759861258e-05, + "loss": 0.7457, + "step": 971 + }, + { + "epoch": 0.7776, + "grad_norm": 0.8714489356501227, + "learning_rate": 2.485876184956928e-05, + "loss": 0.5797, + "step": 972 + }, + { + "epoch": 0.7784, + "grad_norm": 0.8727769203474613, + "learning_rate": 2.4687980962440072e-05, + "loss": 0.5305, + "step": 973 + }, + { + "epoch": 0.7792, + "grad_norm": 1.012849393772381, + "learning_rate": 2.451770608467432e-05, + "loss": 0.6181, + "step": 974 + }, + { + "epoch": 0.78, + "grad_norm": 0.9295677357269199, + "learning_rate": 2.4347938360321566e-05, + "loss": 0.5982, + "step": 975 + }, + { + "epoch": 0.7808, + "grad_norm": 0.96562031519255, + "learning_rate": 2.417867893002387e-05, + "loss": 0.4983, + "step": 976 + }, + { + "epoch": 0.7816, + "grad_norm": 0.8730122804461445, + "learning_rate": 2.400992893100822e-05, + "loss": 0.5385, + "step": 977 + }, + { + "epoch": 0.7824, + "grad_norm": 0.9789972079391175, + "learning_rate": 2.3841689497078746e-05, + "loss": 0.5423, + "step": 978 + }, + { + "epoch": 0.7832, + "grad_norm": 1.010970133723809, + "learning_rate": 2.3673961758609152e-05, + "loss": 0.5722, + "step": 979 + }, + { + "epoch": 0.784, + "grad_norm": 0.9170646823952904, + "learning_rate": 2.3506746842535242e-05, + "loss": 0.5465, + "step": 980 + }, + { + "epoch": 0.7848, + "grad_norm": 0.9973204900340932, + "learning_rate": 2.334004587234717e-05, + "loss": 0.6262, + "step": 981 + }, + { + "epoch": 0.7856, + "grad_norm": 0.8259919276621404, + "learning_rate": 2.3173859968081944e-05, + "loss": 0.4667, + "step": 982 + }, + { + "epoch": 0.7864, + "grad_norm": 0.8937375710756645, + "learning_rate": 2.300819024631603e-05, + "loss": 0.5263, + "step": 983 + }, + { + "epoch": 0.7872, + "grad_norm": 0.8914255511382069, + "learning_rate": 2.2843037820157675e-05, + "loss": 0.5328, + "step": 984 + }, + { + "epoch": 0.788, + "grad_norm": 0.964531191188055, + "learning_rate": 2.26784037992395e-05, + "loss": 0.6234, + "step": 985 + }, + { + "epoch": 0.7888, + "grad_norm": 1.016152490851934, + "learning_rate": 2.251428928971102e-05, + "loss": 0.5976, + "step": 986 + }, + { + "epoch": 0.7896, + "grad_norm": 0.9408720151506826, + "learning_rate": 2.2350695394231345e-05, + "loss": 0.5926, + "step": 987 + }, + { + "epoch": 0.7904, + "grad_norm": 0.8491116729945938, + "learning_rate": 2.2187623211961562e-05, + "loss": 0.5053, + "step": 988 + }, + { + "epoch": 0.7912, + "grad_norm": 0.8801389581367419, + "learning_rate": 2.2025073838557454e-05, + "loss": 0.4839, + "step": 989 + }, + { + "epoch": 0.792, + "grad_norm": 0.9189757810625665, + "learning_rate": 2.1863048366162208e-05, + "loss": 0.5776, + "step": 990 + }, + { + "epoch": 0.7928, + "grad_norm": 0.8679856769713041, + "learning_rate": 2.1701547883398922e-05, + "loss": 0.5109, + "step": 991 + }, + { + "epoch": 0.7936, + "grad_norm": 0.8653361527703157, + "learning_rate": 2.1540573475363402e-05, + "loss": 0.4714, + "step": 992 + }, + { + "epoch": 0.7944, + "grad_norm": 1.2154684681960608, + "learning_rate": 2.138012622361689e-05, + "loss": 0.5409, + "step": 993 + }, + { + "epoch": 0.7952, + "grad_norm": 0.8117989635671922, + "learning_rate": 2.1220207206178688e-05, + "loss": 0.4881, + "step": 994 + }, + { + "epoch": 0.796, + "grad_norm": 0.8933660238435036, + "learning_rate": 2.106081749751897e-05, + "loss": 0.4991, + "step": 995 + }, + { + "epoch": 0.7968, + "grad_norm": 0.8805551533380775, + "learning_rate": 2.0901958168551638e-05, + "loss": 0.556, + "step": 996 + }, + { + "epoch": 0.7976, + "grad_norm": 0.9076112378541764, + "learning_rate": 2.0743630286627002e-05, + "loss": 0.5162, + "step": 997 + }, + { + "epoch": 0.7984, + "grad_norm": 0.7783752852786534, + "learning_rate": 2.058583491552465e-05, + "loss": 0.4355, + "step": 998 + }, + { + "epoch": 0.7992, + "grad_norm": 0.9681238878174612, + "learning_rate": 2.0428573115446392e-05, + "loss": 0.6387, + "step": 999 + }, + { + "epoch": 0.8, + "grad_norm": 0.986067456670194, + "learning_rate": 2.027184594300898e-05, + "loss": 0.6401, + "step": 1000 + }, + { + "epoch": 0.8008, + "grad_norm": 1.1616887354158458, + "learning_rate": 2.011565445123711e-05, + "loss": 0.6503, + "step": 1001 + }, + { + "epoch": 0.8016, + "grad_norm": 0.8356171170071415, + "learning_rate": 1.995999968955641e-05, + "loss": 0.46, + "step": 1002 + }, + { + "epoch": 0.8024, + "grad_norm": 0.8234938113671183, + "learning_rate": 1.980488270378612e-05, + "loss": 0.4529, + "step": 1003 + }, + { + "epoch": 0.8032, + "grad_norm": 0.9021931268838739, + "learning_rate": 1.9650304536132426e-05, + "loss": 0.619, + "step": 1004 + }, + { + "epoch": 0.804, + "grad_norm": 0.8805150998519309, + "learning_rate": 1.9496266225181248e-05, + "loss": 0.5648, + "step": 1005 + }, + { + "epoch": 0.8048, + "grad_norm": 0.8766414205095304, + "learning_rate": 1.9342768805891178e-05, + "loss": 0.4507, + "step": 1006 + }, + { + "epoch": 0.8056, + "grad_norm": 0.8963944790833713, + "learning_rate": 1.918981330958678e-05, + "loss": 0.5129, + "step": 1007 + }, + { + "epoch": 0.8064, + "grad_norm": 0.9674164869308384, + "learning_rate": 1.903740076395151e-05, + "loss": 0.6082, + "step": 1008 + }, + { + "epoch": 0.8072, + "grad_norm": 0.9387142620377266, + "learning_rate": 1.8885532193020704e-05, + "loss": 0.5818, + "step": 1009 + }, + { + "epoch": 0.808, + "grad_norm": 1.0476174430955054, + "learning_rate": 1.8734208617174988e-05, + "loss": 0.5424, + "step": 1010 + }, + { + "epoch": 0.8088, + "grad_norm": 0.778305736612384, + "learning_rate": 1.8583431053133127e-05, + "loss": 0.4626, + "step": 1011 + }, + { + "epoch": 0.8096, + "grad_norm": 0.8066049464477373, + "learning_rate": 1.8433200513945337e-05, + "loss": 0.484, + "step": 1012 + }, + { + "epoch": 0.8104, + "grad_norm": 0.7361159517063306, + "learning_rate": 1.8283518008986567e-05, + "loss": 0.4208, + "step": 1013 + }, + { + "epoch": 0.8112, + "grad_norm": 0.7861135477198569, + "learning_rate": 1.8134384543949478e-05, + "loss": 0.4816, + "step": 1014 + }, + { + "epoch": 0.812, + "grad_norm": 1.1101606366000765, + "learning_rate": 1.7985801120837865e-05, + "loss": 0.8143, + "step": 1015 + }, + { + "epoch": 0.8128, + "grad_norm": 0.8571834353244482, + "learning_rate": 1.783776873795994e-05, + "loss": 0.4639, + "step": 1016 + }, + { + "epoch": 0.8136, + "grad_norm": 0.8812710706618156, + "learning_rate": 1.7690288389921493e-05, + "loss": 0.5343, + "step": 1017 + }, + { + "epoch": 0.8144, + "grad_norm": 0.9011119020017496, + "learning_rate": 1.754336106761927e-05, + "loss": 0.5465, + "step": 1018 + }, + { + "epoch": 0.8152, + "grad_norm": 0.7150605871766601, + "learning_rate": 1.739698775823442e-05, + "loss": 0.3887, + "step": 1019 + }, + { + "epoch": 0.816, + "grad_norm": 0.934933015144909, + "learning_rate": 1.7251169445225657e-05, + "loss": 0.5632, + "step": 1020 + }, + { + "epoch": 0.8168, + "grad_norm": 0.814265512837061, + "learning_rate": 1.7105907108322816e-05, + "loss": 0.513, + "step": 1021 + }, + { + "epoch": 0.8176, + "grad_norm": 0.8898108833293623, + "learning_rate": 1.696120172352025e-05, + "loss": 0.4953, + "step": 1022 + }, + { + "epoch": 0.8184, + "grad_norm": 0.7878727211367245, + "learning_rate": 1.6817054263070174e-05, + "loss": 0.4097, + "step": 1023 + }, + { + "epoch": 0.8192, + "grad_norm": 0.8262094779399805, + "learning_rate": 1.6673465695476232e-05, + "loss": 0.5031, + "step": 1024 + }, + { + "epoch": 0.82, + "grad_norm": 0.8602059461423265, + "learning_rate": 1.6530436985486996e-05, + "loss": 0.5065, + "step": 1025 + }, + { + "epoch": 0.8208, + "grad_norm": 0.9102039847676353, + "learning_rate": 1.6387969094089316e-05, + "loss": 0.5793, + "step": 1026 + }, + { + "epoch": 0.8216, + "grad_norm": 0.8593913559563465, + "learning_rate": 1.6246062978502164e-05, + "loss": 0.5173, + "step": 1027 + }, + { + "epoch": 0.8224, + "grad_norm": 0.9612889504391398, + "learning_rate": 1.6104719592169902e-05, + "loss": 0.5509, + "step": 1028 + }, + { + "epoch": 0.8232, + "grad_norm": 0.8199863353032216, + "learning_rate": 1.5963939884756042e-05, + "loss": 0.4669, + "step": 1029 + }, + { + "epoch": 0.824, + "grad_norm": 0.8683319349314995, + "learning_rate": 1.5823724802136865e-05, + "loss": 0.4832, + "step": 1030 + }, + { + "epoch": 0.8248, + "grad_norm": 0.8853586682877709, + "learning_rate": 1.5684075286394985e-05, + "loss": 0.5853, + "step": 1031 + }, + { + "epoch": 0.8256, + "grad_norm": 0.870653065272409, + "learning_rate": 1.5544992275813053e-05, + "loss": 0.5657, + "step": 1032 + }, + { + "epoch": 0.8264, + "grad_norm": 1.0360095669560587, + "learning_rate": 1.5406476704867524e-05, + "loss": 0.5844, + "step": 1033 + }, + { + "epoch": 0.8272, + "grad_norm": 0.9127764534167723, + "learning_rate": 1.526852950422226e-05, + "loss": 0.5158, + "step": 1034 + }, + { + "epoch": 0.828, + "grad_norm": 0.8324285302416667, + "learning_rate": 1.5131151600722337e-05, + "loss": 0.4904, + "step": 1035 + }, + { + "epoch": 0.8288, + "grad_norm": 0.9126213682993818, + "learning_rate": 1.4994343917387854e-05, + "loss": 0.5612, + "step": 1036 + }, + { + "epoch": 0.8296, + "grad_norm": 0.9377897981379643, + "learning_rate": 1.485810737340767e-05, + "loss": 0.5324, + "step": 1037 + }, + { + "epoch": 0.8304, + "grad_norm": 0.9267564340290367, + "learning_rate": 1.4722442884133214e-05, + "loss": 0.54, + "step": 1038 + }, + { + "epoch": 0.8312, + "grad_norm": 0.8885418872921662, + "learning_rate": 1.4587351361072454e-05, + "loss": 0.5605, + "step": 1039 + }, + { + "epoch": 0.832, + "grad_norm": 0.9220925719860372, + "learning_rate": 1.4452833711883628e-05, + "loss": 0.5891, + "step": 1040 + }, + { + "epoch": 0.8328, + "grad_norm": 0.8917359520594882, + "learning_rate": 1.4318890840369182e-05, + "loss": 0.5343, + "step": 1041 + }, + { + "epoch": 0.8336, + "grad_norm": 0.856374299549514, + "learning_rate": 1.4185523646469822e-05, + "loss": 0.4947, + "step": 1042 + }, + { + "epoch": 0.8344, + "grad_norm": 0.8472421331309948, + "learning_rate": 1.4052733026258281e-05, + "loss": 0.4742, + "step": 1043 + }, + { + "epoch": 0.8352, + "grad_norm": 0.9059799615295063, + "learning_rate": 1.3920519871933424e-05, + "loss": 0.5649, + "step": 1044 + }, + { + "epoch": 0.836, + "grad_norm": 0.903380145241749, + "learning_rate": 1.3788885071814172e-05, + "loss": 0.5267, + "step": 1045 + }, + { + "epoch": 0.8368, + "grad_norm": 0.838159360823987, + "learning_rate": 1.3657829510333654e-05, + "loss": 0.4785, + "step": 1046 + }, + { + "epoch": 0.8376, + "grad_norm": 0.95426650020417, + "learning_rate": 1.3527354068033139e-05, + "loss": 0.6238, + "step": 1047 + }, + { + "epoch": 0.8384, + "grad_norm": 0.9835303491327146, + "learning_rate": 1.339745962155613e-05, + "loss": 0.5337, + "step": 1048 + }, + { + "epoch": 0.8392, + "grad_norm": 0.9085320670809434, + "learning_rate": 1.326814704364262e-05, + "loss": 0.5523, + "step": 1049 + }, + { + "epoch": 0.84, + "grad_norm": 0.9402108229444758, + "learning_rate": 1.3139417203123027e-05, + "loss": 0.5777, + "step": 1050 + }, + { + "epoch": 0.8408, + "grad_norm": 0.8033901405681682, + "learning_rate": 1.3011270964912459e-05, + "loss": 0.4331, + "step": 1051 + }, + { + "epoch": 0.8416, + "grad_norm": 0.8483623278184083, + "learning_rate": 1.2883709190004955e-05, + "loss": 0.4641, + "step": 1052 + }, + { + "epoch": 0.8424, + "grad_norm": 0.8382782289402538, + "learning_rate": 1.275673273546758e-05, + "loss": 0.5495, + "step": 1053 + }, + { + "epoch": 0.8432, + "grad_norm": 1.0653563377438948, + "learning_rate": 1.263034245443473e-05, + "loss": 0.6996, + "step": 1054 + }, + { + "epoch": 0.844, + "grad_norm": 0.8526435347765418, + "learning_rate": 1.2504539196102439e-05, + "loss": 0.4201, + "step": 1055 + }, + { + "epoch": 0.8448, + "grad_norm": 0.9173974064653941, + "learning_rate": 1.2379323805722576e-05, + "loss": 0.5011, + "step": 1056 + }, + { + "epoch": 0.8456, + "grad_norm": 0.9118765944555586, + "learning_rate": 1.2254697124597237e-05, + "loss": 0.5349, + "step": 1057 + }, + { + "epoch": 0.8464, + "grad_norm": 0.8523726711246717, + "learning_rate": 1.2130659990073146e-05, + "loss": 0.4843, + "step": 1058 + }, + { + "epoch": 0.8472, + "grad_norm": 0.8310938443735266, + "learning_rate": 1.2007213235535786e-05, + "loss": 0.4752, + "step": 1059 + }, + { + "epoch": 0.848, + "grad_norm": 0.8160240253137404, + "learning_rate": 1.1884357690404158e-05, + "loss": 0.4935, + "step": 1060 + }, + { + "epoch": 0.8488, + "grad_norm": 0.9133439238402501, + "learning_rate": 1.176209418012495e-05, + "loss": 0.5186, + "step": 1061 + }, + { + "epoch": 0.8496, + "grad_norm": 1.2257899135753538, + "learning_rate": 1.1640423526166988e-05, + "loss": 0.7849, + "step": 1062 + }, + { + "epoch": 0.8504, + "grad_norm": 0.8941487901942106, + "learning_rate": 1.1519346546015907e-05, + "loss": 0.511, + "step": 1063 + }, + { + "epoch": 0.8512, + "grad_norm": 0.8927617816033743, + "learning_rate": 1.1398864053168534e-05, + "loss": 0.5277, + "step": 1064 + }, + { + "epoch": 0.852, + "grad_norm": 0.8569616628261307, + "learning_rate": 1.1278976857127311e-05, + "loss": 0.4934, + "step": 1065 + }, + { + "epoch": 0.8528, + "grad_norm": 0.8623307013532433, + "learning_rate": 1.1159685763395111e-05, + "loss": 0.4872, + "step": 1066 + }, + { + "epoch": 0.8536, + "grad_norm": 0.8339219712284653, + "learning_rate": 1.1040991573469629e-05, + "loss": 0.4717, + "step": 1067 + }, + { + "epoch": 0.8544, + "grad_norm": 0.8919463963392258, + "learning_rate": 1.0922895084838037e-05, + "loss": 0.5271, + "step": 1068 + }, + { + "epoch": 0.8552, + "grad_norm": 1.038351660842284, + "learning_rate": 1.0805397090971737e-05, + "loss": 0.5991, + "step": 1069 + }, + { + "epoch": 0.856, + "grad_norm": 0.9113680210072598, + "learning_rate": 1.0688498381320855e-05, + "loss": 0.5128, + "step": 1070 + }, + { + "epoch": 0.8568, + "grad_norm": 0.9405512171624507, + "learning_rate": 1.057219974130903e-05, + "loss": 0.581, + "step": 1071 + }, + { + "epoch": 0.8576, + "grad_norm": 1.0139304340195732, + "learning_rate": 1.045650195232819e-05, + "loss": 0.5738, + "step": 1072 + }, + { + "epoch": 0.8584, + "grad_norm": 0.8329899919177766, + "learning_rate": 1.0341405791733183e-05, + "loss": 0.4126, + "step": 1073 + }, + { + "epoch": 0.8592, + "grad_norm": 0.8290302795588016, + "learning_rate": 1.0226912032836611e-05, + "loss": 0.524, + "step": 1074 + }, + { + "epoch": 0.86, + "grad_norm": 0.831940504501025, + "learning_rate": 1.0113021444903726e-05, + "loss": 0.5042, + "step": 1075 + }, + { + "epoch": 0.8608, + "grad_norm": 0.7654111092715857, + "learning_rate": 9.999734793146998e-06, + "loss": 0.3579, + "step": 1076 + }, + { + "epoch": 0.8616, + "grad_norm": 0.8185108283565805, + "learning_rate": 9.887052838721322e-06, + "loss": 0.5127, + "step": 1077 + }, + { + "epoch": 0.8624, + "grad_norm": 1.2175561725900603, + "learning_rate": 9.774976338718677e-06, + "loss": 0.4866, + "step": 1078 + }, + { + "epoch": 0.8632, + "grad_norm": 0.9256230720577836, + "learning_rate": 9.663506046162985e-06, + "loss": 0.575, + "step": 1079 + }, + { + "epoch": 0.864, + "grad_norm": 0.8594777460193576, + "learning_rate": 9.552642710005299e-06, + "loss": 0.5353, + "step": 1080 + }, + { + "epoch": 0.8648, + "grad_norm": 1.0304680086366294, + "learning_rate": 9.44238707511862e-06, + "loss": 0.551, + "step": 1081 + }, + { + "epoch": 0.8656, + "grad_norm": 0.9025342564934193, + "learning_rate": 9.332739882292752e-06, + "loss": 0.4693, + "step": 1082 + }, + { + "epoch": 0.8664, + "grad_norm": 0.9341684848617188, + "learning_rate": 9.22370186822965e-06, + "loss": 0.4824, + "step": 1083 + }, + { + "epoch": 0.8672, + "grad_norm": 0.9461860263023557, + "learning_rate": 9.115273765538202e-06, + "loss": 0.5283, + "step": 1084 + }, + { + "epoch": 0.868, + "grad_norm": 1.2350611078400762, + "learning_rate": 9.0074563027294e-06, + "loss": 0.5681, + "step": 1085 + }, + { + "epoch": 0.8688, + "grad_norm": 0.7474778193812881, + "learning_rate": 8.900250204211514e-06, + "loss": 0.3756, + "step": 1086 + }, + { + "epoch": 0.8696, + "grad_norm": 0.9464908649387499, + "learning_rate": 8.79365619028507e-06, + "loss": 0.6277, + "step": 1087 + }, + { + "epoch": 0.8704, + "grad_norm": 0.9416513453191021, + "learning_rate": 8.687674977138116e-06, + "loss": 0.4862, + "step": 1088 + }, + { + "epoch": 0.8712, + "grad_norm": 0.8908592829855512, + "learning_rate": 8.582307276841462e-06, + "loss": 0.4729, + "step": 1089 + }, + { + "epoch": 0.872, + "grad_norm": 0.8856623091540741, + "learning_rate": 8.47755379734373e-06, + "loss": 0.4909, + "step": 1090 + }, + { + "epoch": 0.8728, + "grad_norm": 0.9979093631514518, + "learning_rate": 8.37341524246672e-06, + "loss": 0.6267, + "step": 1091 + }, + { + "epoch": 0.8736, + "grad_norm": 0.9396388164146111, + "learning_rate": 8.269892311900696e-06, + "loss": 0.4944, + "step": 1092 + }, + { + "epoch": 0.8744, + "grad_norm": 0.9349642997890655, + "learning_rate": 8.166985701199582e-06, + "loss": 0.5823, + "step": 1093 + }, + { + "epoch": 0.8752, + "grad_norm": 0.9302060251593333, + "learning_rate": 8.064696101776358e-06, + "loss": 0.5565, + "step": 1094 + }, + { + "epoch": 0.876, + "grad_norm": 1.0068653034108261, + "learning_rate": 7.963024200898462e-06, + "loss": 0.6292, + "step": 1095 + }, + { + "epoch": 0.8768, + "grad_norm": 0.8308900215815783, + "learning_rate": 7.861970681683051e-06, + "loss": 0.558, + "step": 1096 + }, + { + "epoch": 0.8776, + "grad_norm": 0.8291497760442449, + "learning_rate": 7.761536223092458e-06, + "loss": 0.4724, + "step": 1097 + }, + { + "epoch": 0.8784, + "grad_norm": 0.837140049991436, + "learning_rate": 7.661721499929753e-06, + "loss": 0.4343, + "step": 1098 + }, + { + "epoch": 0.8792, + "grad_norm": 0.8206278559411827, + "learning_rate": 7.562527182833978e-06, + "loss": 0.522, + "step": 1099 + }, + { + "epoch": 0.88, + "grad_norm": 0.768654648097681, + "learning_rate": 7.463953938275858e-06, + "loss": 0.3869, + "step": 1100 + }, + { + "epoch": 0.8808, + "grad_norm": 0.9008291992190718, + "learning_rate": 7.366002428553153e-06, + "loss": 0.4763, + "step": 1101 + }, + { + "epoch": 0.8816, + "grad_norm": 1.004138999783533, + "learning_rate": 7.2686733117863784e-06, + "loss": 0.5653, + "step": 1102 + }, + { + "epoch": 0.8824, + "grad_norm": 0.8866055795866293, + "learning_rate": 7.171967241914224e-06, + "loss": 0.5081, + "step": 1103 + }, + { + "epoch": 0.8832, + "grad_norm": 0.8712416614852868, + "learning_rate": 7.07588486868922e-06, + "loss": 0.5173, + "step": 1104 + }, + { + "epoch": 0.884, + "grad_norm": 0.8352905774940463, + "learning_rate": 6.980426837673437e-06, + "loss": 0.5085, + "step": 1105 + }, + { + "epoch": 0.8848, + "grad_norm": 0.8664854906477405, + "learning_rate": 6.8855937902340576e-06, + "loss": 0.5379, + "step": 1106 + }, + { + "epoch": 0.8856, + "grad_norm": 0.8545854637602966, + "learning_rate": 6.791386363539065e-06, + "loss": 0.4901, + "step": 1107 + }, + { + "epoch": 0.8864, + "grad_norm": 0.8792443966755817, + "learning_rate": 6.6978051905530855e-06, + "loss": 0.4969, + "step": 1108 + }, + { + "epoch": 0.8872, + "grad_norm": 0.8275160725349503, + "learning_rate": 6.604850900032955e-06, + "loss": 0.5005, + "step": 1109 + }, + { + "epoch": 0.888, + "grad_norm": 0.9555667842829724, + "learning_rate": 6.512524116523633e-06, + "loss": 0.5709, + "step": 1110 + }, + { + "epoch": 0.8888, + "grad_norm": 0.8893416025085475, + "learning_rate": 6.420825460353974e-06, + "loss": 0.5108, + "step": 1111 + }, + { + "epoch": 0.8896, + "grad_norm": 0.9198927290446861, + "learning_rate": 6.329755547632499e-06, + "loss": 0.4771, + "step": 1112 + }, + { + "epoch": 0.8904, + "grad_norm": 0.9765388094416342, + "learning_rate": 6.239314990243339e-06, + "loss": 0.5807, + "step": 1113 + }, + { + "epoch": 0.8912, + "grad_norm": 0.9779341822780092, + "learning_rate": 6.149504395842087e-06, + "loss": 0.6186, + "step": 1114 + }, + { + "epoch": 0.892, + "grad_norm": 1.0534004761522051, + "learning_rate": 6.0603243678516995e-06, + "loss": 0.5374, + "step": 1115 + }, + { + "epoch": 0.8928, + "grad_norm": 0.845348850963641, + "learning_rate": 5.971775505458444e-06, + "loss": 0.4122, + "step": 1116 + }, + { + "epoch": 0.8936, + "grad_norm": 1.104942550352276, + "learning_rate": 5.883858403607967e-06, + "loss": 0.4983, + "step": 1117 + }, + { + "epoch": 0.8944, + "grad_norm": 1.0475255602464242, + "learning_rate": 5.7965736530010916e-06, + "loss": 0.5859, + "step": 1118 + }, + { + "epoch": 0.8952, + "grad_norm": 0.9522886093773041, + "learning_rate": 5.7099218400900716e-06, + "loss": 0.4994, + "step": 1119 + }, + { + "epoch": 0.896, + "grad_norm": 0.7025830954078448, + "learning_rate": 5.623903547074549e-06, + "loss": 0.3753, + "step": 1120 + }, + { + "epoch": 0.8968, + "grad_norm": 0.8969366965782625, + "learning_rate": 5.538519351897575e-06, + "loss": 0.4291, + "step": 1121 + }, + { + "epoch": 0.8976, + "grad_norm": 0.9794017411293363, + "learning_rate": 5.453769828241872e-06, + "loss": 0.5851, + "step": 1122 + }, + { + "epoch": 0.8984, + "grad_norm": 0.8854550965461073, + "learning_rate": 5.369655545525909e-06, + "loss": 0.5149, + "step": 1123 + }, + { + "epoch": 0.8992, + "grad_norm": 0.823691828844835, + "learning_rate": 5.286177068899989e-06, + "loss": 0.4734, + "step": 1124 + }, + { + "epoch": 0.9, + "grad_norm": 0.7818438263006319, + "learning_rate": 5.2033349592426335e-06, + "loss": 0.4146, + "step": 1125 + }, + { + "epoch": 0.9008, + "grad_norm": 0.9426024371050443, + "learning_rate": 5.121129773156663e-06, + "loss": 0.4801, + "step": 1126 + }, + { + "epoch": 0.9016, + "grad_norm": 0.8749080560869444, + "learning_rate": 5.039562062965508e-06, + "loss": 0.4793, + "step": 1127 + }, + { + "epoch": 0.9024, + "grad_norm": 0.8676696472413886, + "learning_rate": 4.95863237670956e-06, + "loss": 0.4645, + "step": 1128 + }, + { + "epoch": 0.9032, + "grad_norm": 0.8960028684930892, + "learning_rate": 4.87834125814235e-06, + "loss": 0.4813, + "step": 1129 + }, + { + "epoch": 0.904, + "grad_norm": 0.841782397347136, + "learning_rate": 4.798689246727006e-06, + "loss": 0.469, + "step": 1130 + }, + { + "epoch": 0.9048, + "grad_norm": 0.933131225290578, + "learning_rate": 4.719676877632639e-06, + "loss": 0.5261, + "step": 1131 + }, + { + "epoch": 0.9056, + "grad_norm": 0.8684117605795351, + "learning_rate": 4.641304681730641e-06, + "loss": 0.4353, + "step": 1132 + }, + { + "epoch": 0.9064, + "grad_norm": 1.0568779961110095, + "learning_rate": 4.563573185591219e-06, + "loss": 0.5826, + "step": 1133 + }, + { + "epoch": 0.9072, + "grad_norm": 0.9697568087755433, + "learning_rate": 4.486482911479839e-06, + "loss": 0.5972, + "step": 1134 + }, + { + "epoch": 0.908, + "grad_norm": 0.8828340754449046, + "learning_rate": 4.4100343773536225e-06, + "loss": 0.4705, + "step": 1135 + }, + { + "epoch": 0.9088, + "grad_norm": 0.9568532376050941, + "learning_rate": 4.3342280968580285e-06, + "loss": 0.5999, + "step": 1136 + }, + { + "epoch": 0.9096, + "grad_norm": 0.8968564158876486, + "learning_rate": 4.259064579323302e-06, + "loss": 0.494, + "step": 1137 + }, + { + "epoch": 0.9104, + "grad_norm": 0.9443142557280755, + "learning_rate": 4.184544329761009e-06, + "loss": 0.6001, + "step": 1138 + }, + { + "epoch": 0.9112, + "grad_norm": 0.9579343285914728, + "learning_rate": 4.1106678488607495e-06, + "loss": 0.5305, + "step": 1139 + }, + { + "epoch": 0.912, + "grad_norm": 0.8775488211919319, + "learning_rate": 4.037435632986786e-06, + "loss": 0.507, + "step": 1140 + }, + { + "epoch": 0.9128, + "grad_norm": 0.9473459034568387, + "learning_rate": 3.964848174174541e-06, + "loss": 0.5673, + "step": 1141 + }, + { + "epoch": 0.9136, + "grad_norm": 0.8299701680297237, + "learning_rate": 3.892905960127546e-06, + "loss": 0.4673, + "step": 1142 + }, + { + "epoch": 0.9144, + "grad_norm": 0.868648290322588, + "learning_rate": 3.821609474213983e-06, + "loss": 0.5002, + "step": 1143 + }, + { + "epoch": 0.9152, + "grad_norm": 0.7270428842077594, + "learning_rate": 3.750959195463466e-06, + "loss": 0.4236, + "step": 1144 + }, + { + "epoch": 0.916, + "grad_norm": 0.8074750039881929, + "learning_rate": 3.6809555985639068e-06, + "loss": 0.4342, + "step": 1145 + }, + { + "epoch": 0.9168, + "grad_norm": 0.9026392457184746, + "learning_rate": 3.611599153858214e-06, + "loss": 0.4882, + "step": 1146 + }, + { + "epoch": 0.9176, + "grad_norm": 0.9283783800439218, + "learning_rate": 3.5428903273411863e-06, + "loss": 0.4729, + "step": 1147 + }, + { + "epoch": 0.9184, + "grad_norm": 1.0460686665759698, + "learning_rate": 3.4748295806564356e-06, + "loss": 0.6621, + "step": 1148 + }, + { + "epoch": 0.9192, + "grad_norm": 0.9080809068007054, + "learning_rate": 3.40741737109318e-06, + "loss": 0.4621, + "step": 1149 + }, + { + "epoch": 0.92, + "grad_norm": 0.8947614683641865, + "learning_rate": 3.3406541515832003e-06, + "loss": 0.5121, + "step": 1150 + }, + { + "epoch": 0.9208, + "grad_norm": 0.8655363229396491, + "learning_rate": 3.2745403706978872e-06, + "loss": 0.4584, + "step": 1151 + }, + { + "epoch": 0.9216, + "grad_norm": 0.8968007468514912, + "learning_rate": 3.209076472645112e-06, + "loss": 0.5095, + "step": 1152 + }, + { + "epoch": 0.9224, + "grad_norm": 0.9905380754651123, + "learning_rate": 3.1442628972662704e-06, + "loss": 0.5721, + "step": 1153 + }, + { + "epoch": 0.9232, + "grad_norm": 0.9825103431981693, + "learning_rate": 3.0801000800333877e-06, + "loss": 0.5568, + "step": 1154 + }, + { + "epoch": 0.924, + "grad_norm": 0.9732696070377225, + "learning_rate": 3.0165884520461316e-06, + "loss": 0.5385, + "step": 1155 + }, + { + "epoch": 0.9248, + "grad_norm": 0.8911017162323968, + "learning_rate": 2.9537284400289355e-06, + "loss": 0.486, + "step": 1156 + }, + { + "epoch": 0.9256, + "grad_norm": 0.8415838751932674, + "learning_rate": 2.8915204663281013e-06, + "loss": 0.4727, + "step": 1157 + }, + { + "epoch": 0.9264, + "grad_norm": 0.9298508591052215, + "learning_rate": 2.8299649489090475e-06, + "loss": 0.4488, + "step": 1158 + }, + { + "epoch": 0.9272, + "grad_norm": 0.951180288226257, + "learning_rate": 2.7690623013533976e-06, + "loss": 0.6119, + "step": 1159 + }, + { + "epoch": 0.928, + "grad_norm": 0.9594481967706358, + "learning_rate": 2.708812932856253e-06, + "loss": 0.522, + "step": 1160 + }, + { + "epoch": 0.9288, + "grad_norm": 0.8216569948534782, + "learning_rate": 2.649217248223468e-06, + "loss": 0.4426, + "step": 1161 + }, + { + "epoch": 0.9296, + "grad_norm": 0.8764995815135124, + "learning_rate": 2.590275647868867e-06, + "loss": 0.4885, + "step": 1162 + }, + { + "epoch": 0.9304, + "grad_norm": 0.8669171775122492, + "learning_rate": 2.5319885278115906e-06, + "loss": 0.4792, + "step": 1163 + }, + { + "epoch": 0.9312, + "grad_norm": 1.0636096340620473, + "learning_rate": 2.4743562796734622e-06, + "loss": 0.5597, + "step": 1164 + }, + { + "epoch": 0.932, + "grad_norm": 0.9666529676562569, + "learning_rate": 2.4173792906762804e-06, + "loss": 0.5385, + "step": 1165 + }, + { + "epoch": 0.9328, + "grad_norm": 0.975179351024621, + "learning_rate": 2.3610579436393e-06, + "loss": 0.5263, + "step": 1166 + }, + { + "epoch": 0.9336, + "grad_norm": 0.9728368420681078, + "learning_rate": 2.3053926169765984e-06, + "loss": 0.4715, + "step": 1167 + }, + { + "epoch": 0.9344, + "grad_norm": 0.8307393168249513, + "learning_rate": 2.250383684694579e-06, + "loss": 0.4706, + "step": 1168 + }, + { + "epoch": 0.9352, + "grad_norm": 0.933506651180675, + "learning_rate": 2.1960315163894075e-06, + "loss": 0.487, + "step": 1169 + }, + { + "epoch": 0.936, + "grad_norm": 0.9003621002918134, + "learning_rate": 2.1423364772445887e-06, + "loss": 0.5217, + "step": 1170 + }, + { + "epoch": 0.9368, + "grad_norm": 0.9934422138613694, + "learning_rate": 2.0892989280284823e-06, + "loss": 0.5806, + "step": 1171 + }, + { + "epoch": 0.9376, + "grad_norm": 1.1353093338542166, + "learning_rate": 2.036919225091827e-06, + "loss": 0.5983, + "step": 1172 + }, + { + "epoch": 0.9384, + "grad_norm": 1.0046597135773063, + "learning_rate": 1.9851977203654835e-06, + "loss": 0.5806, + "step": 1173 + }, + { + "epoch": 0.9392, + "grad_norm": 0.9049654952924316, + "learning_rate": 1.9341347613579087e-06, + "loss": 0.4912, + "step": 1174 + }, + { + "epoch": 0.94, + "grad_norm": 0.9797372066358172, + "learning_rate": 1.8837306911529184e-06, + "loss": 0.5539, + "step": 1175 + }, + { + "epoch": 0.9408, + "grad_norm": 0.8191030872242712, + "learning_rate": 1.8339858484073935e-06, + "loss": 0.4482, + "step": 1176 + }, + { + "epoch": 0.9416, + "grad_norm": 1.0361003460873626, + "learning_rate": 1.7849005673489127e-06, + "loss": 0.5914, + "step": 1177 + }, + { + "epoch": 0.9424, + "grad_norm": 0.9631844884063604, + "learning_rate": 1.7364751777736332e-06, + "loss": 0.5133, + "step": 1178 + }, + { + "epoch": 0.9432, + "grad_norm": 1.0839124306252106, + "learning_rate": 1.6887100050439587e-06, + "loss": 0.5904, + "step": 1179 + }, + { + "epoch": 0.944, + "grad_norm": 0.9087621324174773, + "learning_rate": 1.6416053700863964e-06, + "loss": 0.4718, + "step": 1180 + }, + { + "epoch": 0.9448, + "grad_norm": 0.8537357892822648, + "learning_rate": 1.595161589389449e-06, + "loss": 0.4538, + "step": 1181 + }, + { + "epoch": 0.9456, + "grad_norm": 0.8868391059398126, + "learning_rate": 1.5493789750014031e-06, + "loss": 0.4907, + "step": 1182 + }, + { + "epoch": 0.9464, + "grad_norm": 0.9849941050555328, + "learning_rate": 1.5042578345283108e-06, + "loss": 0.536, + "step": 1183 + }, + { + "epoch": 0.9472, + "grad_norm": 0.8566136171382083, + "learning_rate": 1.459798471131868e-06, + "loss": 0.5163, + "step": 1184 + }, + { + "epoch": 0.948, + "grad_norm": 0.9598154308582632, + "learning_rate": 1.4160011835273934e-06, + "loss": 0.5125, + "step": 1185 + }, + { + "epoch": 0.9488, + "grad_norm": 1.1024945081202426, + "learning_rate": 1.3728662659818204e-06, + "loss": 0.5302, + "step": 1186 + }, + { + "epoch": 0.9496, + "grad_norm": 0.9417202792081554, + "learning_rate": 1.3303940083117527e-06, + "loss": 0.5601, + "step": 1187 + }, + { + "epoch": 0.9504, + "grad_norm": 0.8954980066354414, + "learning_rate": 1.2885846958814673e-06, + "loss": 0.5416, + "step": 1188 + }, + { + "epoch": 0.9512, + "grad_norm": 0.999982721740313, + "learning_rate": 1.2474386096010039e-06, + "loss": 0.5495, + "step": 1189 + }, + { + "epoch": 0.952, + "grad_norm": 1.0111231909200546, + "learning_rate": 1.2069560259243328e-06, + "loss": 0.5273, + "step": 1190 + }, + { + "epoch": 0.9528, + "grad_norm": 0.9274679387425223, + "learning_rate": 1.1671372168474138e-06, + "loss": 0.5113, + "step": 1191 + }, + { + "epoch": 0.9536, + "grad_norm": 0.9598365132710229, + "learning_rate": 1.1279824499064396e-06, + "loss": 0.4457, + "step": 1192 + }, + { + "epoch": 0.9544, + "grad_norm": 1.0479412201817522, + "learning_rate": 1.089491988176017e-06, + "loss": 0.6291, + "step": 1193 + }, + { + "epoch": 0.9552, + "grad_norm": 0.9306949568961659, + "learning_rate": 1.0516660902673448e-06, + "loss": 0.5439, + "step": 1194 + }, + { + "epoch": 0.956, + "grad_norm": 0.856405806668651, + "learning_rate": 1.014505010326583e-06, + "loss": 0.4288, + "step": 1195 + }, + { + "epoch": 0.9568, + "grad_norm": 0.9681397336561196, + "learning_rate": 9.780089980330642e-07, + "loss": 0.477, + "step": 1196 + }, + { + "epoch": 0.9576, + "grad_norm": 0.9783902754901848, + "learning_rate": 9.421782985976068e-07, + "loss": 0.5747, + "step": 1197 + }, + { + "epoch": 0.9584, + "grad_norm": 0.7718073981075178, + "learning_rate": 9.070131527609604e-07, + "loss": 0.3934, + "step": 1198 + }, + { + "epoch": 0.9592, + "grad_norm": 0.9365851525522183, + "learning_rate": 8.725137967920738e-07, + "loss": 0.5427, + "step": 1199 + }, + { + "epoch": 0.96, + "grad_norm": 0.8946986907015603, + "learning_rate": 8.386804624865851e-07, + "loss": 0.4729, + "step": 1200 + }, + { + "epoch": 0.9608, + "grad_norm": 0.9712126805966191, + "learning_rate": 8.055133771652345e-07, + "loss": 0.5563, + "step": 1201 + }, + { + "epoch": 0.9616, + "grad_norm": 0.9914126134977888, + "learning_rate": 7.730127636723539e-07, + "loss": 0.5616, + "step": 1202 + }, + { + "epoch": 0.9624, + "grad_norm": 0.9915353472321419, + "learning_rate": 7.411788403743237e-07, + "loss": 0.6006, + "step": 1203 + }, + { + "epoch": 0.9632, + "grad_norm": 1.021875288392754, + "learning_rate": 7.100118211581852e-07, + "loss": 0.5085, + "step": 1204 + }, + { + "epoch": 0.964, + "grad_norm": 0.9575275995276004, + "learning_rate": 6.7951191543012e-07, + "loss": 0.5485, + "step": 1205 + }, + { + "epoch": 0.9648, + "grad_norm": 0.9815525486849324, + "learning_rate": 6.496793281141056e-07, + "loss": 0.524, + "step": 1206 + }, + { + "epoch": 0.9656, + "grad_norm": 0.8472755247959326, + "learning_rate": 6.205142596505176e-07, + "loss": 0.5045, + "step": 1207 + }, + { + "epoch": 0.9664, + "grad_norm": 0.9110161852107633, + "learning_rate": 5.920169059947411e-07, + "loss": 0.4986, + "step": 1208 + }, + { + "epoch": 0.9672, + "grad_norm": 1.1735288736114748, + "learning_rate": 5.64187458615939e-07, + "loss": 0.7224, + "step": 1209 + }, + { + "epoch": 0.968, + "grad_norm": 0.987447662445318, + "learning_rate": 5.370261044956971e-07, + "loss": 0.4973, + "step": 1210 + }, + { + "epoch": 0.9688, + "grad_norm": 1.0488609031326457, + "learning_rate": 5.105330261267916e-07, + "loss": 0.4996, + "step": 1211 + }, + { + "epoch": 0.9696, + "grad_norm": 1.014047500109056, + "learning_rate": 4.847084015119574e-07, + "loss": 0.5138, + "step": 1212 + }, + { + "epoch": 0.9704, + "grad_norm": 0.9588783339157702, + "learning_rate": 4.5955240416271084e-07, + "loss": 0.5101, + "step": 1213 + }, + { + "epoch": 0.9712, + "grad_norm": 1.035740452465745, + "learning_rate": 4.3506520309813947e-07, + "loss": 0.5993, + "step": 1214 + }, + { + "epoch": 0.972, + "grad_norm": 1.1102270687547424, + "learning_rate": 4.112469628438365e-07, + "loss": 0.6304, + "step": 1215 + }, + { + "epoch": 0.9728, + "grad_norm": 0.7831216938523164, + "learning_rate": 3.8809784343072366e-07, + "loss": 0.4387, + "step": 1216 + }, + { + "epoch": 0.9736, + "grad_norm": 0.9117675424261213, + "learning_rate": 3.6561800039403016e-07, + "loss": 0.5163, + "step": 1217 + }, + { + "epoch": 0.9744, + "grad_norm": 1.1329774437267308, + "learning_rate": 3.4380758477219333e-07, + "loss": 0.7174, + "step": 1218 + }, + { + "epoch": 0.9752, + "grad_norm": 1.067893713503416, + "learning_rate": 3.2266674310589273e-07, + "loss": 0.6084, + "step": 1219 + }, + { + "epoch": 0.976, + "grad_norm": 0.9006368268680338, + "learning_rate": 3.0219561743707326e-07, + "loss": 0.5008, + "step": 1220 + }, + { + "epoch": 0.9768, + "grad_norm": 0.9708569171881641, + "learning_rate": 2.8239434530792365e-07, + "loss": 0.5353, + "step": 1221 + }, + { + "epoch": 0.9776, + "grad_norm": 0.8938517118108713, + "learning_rate": 2.6326305976001055e-07, + "loss": 0.4912, + "step": 1222 + }, + { + "epoch": 0.9784, + "grad_norm": 0.9424813688217818, + "learning_rate": 2.448018893333681e-07, + "loss": 0.5371, + "step": 1223 + }, + { + "epoch": 0.9792, + "grad_norm": 0.9506715977030524, + "learning_rate": 2.2701095806565432e-07, + "loss": 0.5418, + "step": 1224 + }, + { + "epoch": 0.98, + "grad_norm": 1.0195314451194224, + "learning_rate": 2.098903854912515e-07, + "loss": 0.5502, + "step": 1225 + }, + { + "epoch": 0.9808, + "grad_norm": 0.9360909013448883, + "learning_rate": 1.9344028664056713e-07, + "loss": 0.4926, + "step": 1226 + }, + { + "epoch": 0.9816, + "grad_norm": 0.8925123180992532, + "learning_rate": 1.7766077203915655e-07, + "loss": 0.5075, + "step": 1227 + }, + { + "epoch": 0.9824, + "grad_norm": 0.9373120885129952, + "learning_rate": 1.6255194770704586e-07, + "loss": 0.5522, + "step": 1228 + }, + { + "epoch": 0.9832, + "grad_norm": 0.9045565462595742, + "learning_rate": 1.481139151579991e-07, + "loss": 0.4455, + "step": 1229 + }, + { + "epoch": 0.984, + "grad_norm": 0.9496806607012137, + "learning_rate": 1.3434677139885222e-07, + "loss": 0.4764, + "step": 1230 + }, + { + "epoch": 0.9848, + "grad_norm": 1.2070971555248793, + "learning_rate": 1.2125060892881346e-07, + "loss": 0.5305, + "step": 1231 + }, + { + "epoch": 0.9856, + "grad_norm": 0.9884453096239335, + "learning_rate": 1.0882551573891953e-07, + "loss": 0.5564, + "step": 1232 + }, + { + "epoch": 0.9864, + "grad_norm": 0.925262765183813, + "learning_rate": 9.707157531134713e-08, + "loss": 0.5431, + "step": 1233 + }, + { + "epoch": 0.9872, + "grad_norm": 0.8650475451010552, + "learning_rate": 8.598886661895788e-08, + "loss": 0.4824, + "step": 1234 + }, + { + "epoch": 0.988, + "grad_norm": 0.9375027024745487, + "learning_rate": 7.557746412468758e-08, + "loss": 0.5334, + "step": 1235 + }, + { + "epoch": 0.9888, + "grad_norm": 1.0258138994156334, + "learning_rate": 6.583743778106887e-08, + "loss": 0.566, + "step": 1236 + }, + { + "epoch": 0.9896, + "grad_norm": 1.1348295457470585, + "learning_rate": 5.6768853029787184e-08, + "loss": 0.6353, + "step": 1237 + }, + { + "epoch": 0.9904, + "grad_norm": 0.8099381398300214, + "learning_rate": 4.837177080119215e-08, + "loss": 0.5236, + "step": 1238 + }, + { + "epoch": 0.9912, + "grad_norm": 1.1335763733398434, + "learning_rate": 4.064624751394242e-08, + "loss": 0.6208, + "step": 1239 + }, + { + "epoch": 0.992, + "grad_norm": 0.8417059804666327, + "learning_rate": 3.359233507459481e-08, + "loss": 0.4808, + "step": 1240 + }, + { + "epoch": 0.9928, + "grad_norm": 0.8740279745856687, + "learning_rate": 2.7210080877237976e-08, + "loss": 0.4519, + "step": 1241 + }, + { + "epoch": 0.9936, + "grad_norm": 0.9025682701839401, + "learning_rate": 2.1499527803214846e-08, + "loss": 0.4474, + "step": 1242 + }, + { + "epoch": 0.9944, + "grad_norm": 0.9356977884254927, + "learning_rate": 1.646071422083395e-08, + "loss": 0.4785, + "step": 1243 + }, + { + "epoch": 0.9952, + "grad_norm": 0.7786241335150642, + "learning_rate": 1.209367398504746e-08, + "loss": 0.4236, + "step": 1244 + }, + { + "epoch": 0.996, + "grad_norm": 0.904185452995062, + "learning_rate": 8.398436437317969e-09, + "loss": 0.5729, + "step": 1245 + }, + { + "epoch": 0.9968, + "grad_norm": 1.01166300380065, + "learning_rate": 5.375026405352035e-09, + "loss": 0.42, + "step": 1246 + }, + { + "epoch": 0.9976, + "grad_norm": 0.8484776609022155, + "learning_rate": 3.023464202944748e-09, + "loss": 0.4598, + "step": 1247 + }, + { + "epoch": 0.9984, + "grad_norm": 0.8555465620019524, + "learning_rate": 1.3437656298687097e-09, + "loss": 0.515, + "step": 1248 + }, + { + "epoch": 0.9992, + "grad_norm": 0.9853659491986522, + "learning_rate": 3.3594197175190745e-10, + "loss": 0.5119, + "step": 1249 + }, + { + "epoch": 1.0, + "grad_norm": 0.8368195146029167, + "learning_rate": 0.0, + "loss": 0.4741, + "step": 1250 + }, + { + "epoch": 1.0, + "step": 1250, + "total_flos": 165935961923584.0, + "train_loss": 0.6441711540222168, + "train_runtime": 10384.4138, + "train_samples_per_second": 1.926, + "train_steps_per_second": 0.12 + } + ], + "logging_steps": 1.0, + "max_steps": 1250, + "num_input_tokens_seen": 0, + "num_train_epochs": 1, + "save_steps": 500, + "stateful_callbacks": { + "TrainerControl": { + "args": { + "should_epoch_stop": false, + "should_evaluate": false, + "should_log": false, + "should_save": true, + "should_training_stop": false + }, + "attributes": {} + } + }, + "total_flos": 165935961923584.0, + "train_batch_size": 8, + "trial_name": null, + "trial_params": null +} diff --git a/single_dataset/short_caption/VideoGameBunny_v1_1-Llama-3-8B-V-short_caption_dataset_2000_epochs_1_lora/README.md b/single_dataset/short_caption/VideoGameBunny_v1_1-Llama-3-8B-V-short_caption_dataset_2000_epochs_1_lora/README.md new file mode 100644 index 0000000000000000000000000000000000000000..4f291d8d6fdb82fcff274f83bcc52b834347797c --- /dev/null +++ b/single_dataset/short_caption/VideoGameBunny_v1_1-Llama-3-8B-V-short_caption_dataset_2000_epochs_1_lora/README.md @@ -0,0 +1,202 @@ +--- +library_name: peft +base_model: ./weights/Bunny-v1_1-Llama-3-8B-V +--- + +# Model Card for Model ID + + + + + +## Model Details + +### Model Description + + + + + +- **Developed by:** [More Information Needed] +- **Funded by [optional]:** [More Information Needed] +- **Shared by [optional]:** [More Information Needed] +- **Model type:** [More Information Needed] +- **Language(s) (NLP):** [More Information Needed] +- **License:** [More Information Needed] +- **Finetuned from model [optional]:** [More Information Needed] + +### Model Sources [optional] + + + +- **Repository:** [More Information Needed] +- **Paper [optional]:** [More Information Needed] +- **Demo [optional]:** [More Information Needed] + +## Uses + + + +### Direct Use + + + +[More Information Needed] + +### Downstream Use [optional] + + + +[More Information Needed] + +### Out-of-Scope Use + + + +[More Information Needed] + +## Bias, Risks, and Limitations + + + +[More Information Needed] + +### Recommendations + + + +Users (both direct and downstream) should be made aware of the risks, biases and limitations of the model. More information needed for further recommendations. + +## How to Get Started with the Model + +Use the code below to get started with the model. + +[More Information Needed] + +## Training Details + +### Training Data + + + +[More Information Needed] + +### Training Procedure + + + +#### Preprocessing [optional] + +[More Information Needed] + + +#### Training Hyperparameters + +- **Training regime:** [More Information Needed] + +#### Speeds, Sizes, Times [optional] + + + +[More Information Needed] + +## Evaluation + + + +### Testing Data, Factors & Metrics + +#### Testing Data + + + +[More Information Needed] + +#### Factors + + + +[More Information Needed] + +#### Metrics + + + +[More Information Needed] + +### Results + +[More Information Needed] + +#### Summary + + + +## Model Examination [optional] + + + +[More Information Needed] + +## Environmental Impact + + + +Carbon emissions can be estimated using the [Machine Learning Impact calculator](https://mlco2.github.io/impact#compute) presented in [Lacoste et al. (2019)](https://arxiv.org/abs/1910.09700). + +- **Hardware Type:** [More Information Needed] +- **Hours used:** [More Information Needed] +- **Cloud Provider:** [More Information Needed] +- **Compute Region:** [More Information Needed] +- **Carbon Emitted:** [More Information Needed] + +## Technical Specifications [optional] + +### Model Architecture and Objective + +[More Information Needed] + +### Compute Infrastructure + +[More Information Needed] + +#### Hardware + +[More Information Needed] + +#### Software + +[More Information Needed] + +## Citation [optional] + + + +**BibTeX:** + +[More Information Needed] + +**APA:** + +[More Information Needed] + +## Glossary [optional] + + + +[More Information Needed] + +## More Information [optional] + +[More Information Needed] + +## Model Card Authors [optional] + +[More Information Needed] + +## Model Card Contact + +[More Information Needed] +### Framework versions + +- PEFT 0.11.1 \ No newline at end of file diff --git a/single_dataset/short_caption/VideoGameBunny_v1_1-Llama-3-8B-V-short_caption_dataset_2000_epochs_1_lora/adapter_config.json b/single_dataset/short_caption/VideoGameBunny_v1_1-Llama-3-8B-V-short_caption_dataset_2000_epochs_1_lora/adapter_config.json new file mode 100644 index 0000000000000000000000000000000000000000..ca2de9b5033ee8244cdadf276aafa1f753a8b6f5 --- /dev/null +++ b/single_dataset/short_caption/VideoGameBunny_v1_1-Llama-3-8B-V-short_caption_dataset_2000_epochs_1_lora/adapter_config.json @@ -0,0 +1,34 @@ +{ + "alpha_pattern": {}, + "auto_mapping": null, + "base_model_name_or_path": "./weights/Bunny-v1_1-Llama-3-8B-V", + "bias": "none", + "fan_in_fan_out": false, + "inference_mode": true, + "init_lora_weights": true, + "layer_replication": null, + "layers_pattern": null, + "layers_to_transform": null, + "loftq_config": {}, + "lora_alpha": 256, + "lora_dropout": 0.1, + "megatron_config": null, + "megatron_core": "megatron.core", + "modules_to_save": null, + "peft_type": "LORA", + "r": 128, + "rank_pattern": {}, + "revision": null, + "target_modules": [ + "v_proj", + "o_proj", + "k_proj", + "q_proj", + "gate_proj", + "up_proj", + "down_proj" + ], + "task_type": "CAUSAL_LM", + "use_dora": false, + "use_rslora": false +} \ No newline at end of file diff --git a/single_dataset/short_caption/VideoGameBunny_v1_1-Llama-3-8B-V-short_caption_dataset_2000_epochs_1_lora/adapter_model.safetensors b/single_dataset/short_caption/VideoGameBunny_v1_1-Llama-3-8B-V-short_caption_dataset_2000_epochs_1_lora/adapter_model.safetensors new file mode 100644 index 0000000000000000000000000000000000000000..3845c8b530bfcf80fe75196425957db77bed50fc --- /dev/null +++ b/single_dataset/short_caption/VideoGameBunny_v1_1-Llama-3-8B-V-short_caption_dataset_2000_epochs_1_lora/adapter_model.safetensors @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:bfb0e40d889acd8f4b66ac3dd43335936c109f7b563dc264e53aa62df3f03b4e +size 671150064 diff --git a/single_dataset/short_caption/VideoGameBunny_v1_1-Llama-3-8B-V-short_caption_dataset_2000_epochs_1_lora/config.json b/single_dataset/short_caption/VideoGameBunny_v1_1-Llama-3-8B-V-short_caption_dataset_2000_epochs_1_lora/config.json new file mode 100644 index 0000000000000000000000000000000000000000..ee7ed86f4ff6f3381a6bb24981934be89aa376e7 --- /dev/null +++ b/single_dataset/short_caption/VideoGameBunny_v1_1-Llama-3-8B-V-short_caption_dataset_2000_epochs_1_lora/config.json @@ -0,0 +1,45 @@ +{ + "_name_or_path": "./weights/Bunny-v1_1-Llama-3-8B-V", + "architectures": [ + "BunnyLlamaForCausalLM" + ], + "attention_bias": false, + "attention_dropout": 0.0, + "auto_map": { + "AutoConfig": "configuration_bunny_llama.BunnyLlamaConfig", + "AutoModelForCausalLM": "modeling_bunny_llama.BunnyLlamaForCausalLM" + }, + "bos_token_id": 128000, + "continuous_training": false, + "eos_token_id": 128001, + "freeze_mm_mlp_adapter": false, + "hidden_act": "silu", + "hidden_size": 4096, + "image_aspect_ratio": "pad", + "initializer_range": 0.02, + "intermediate_size": 14336, + "max_position_embeddings": 8192, + "mm_hidden_size": 3456, + "mm_projector_lr": null, + "mm_projector_type": "mlp2x_gelu", + "mm_vision_tower": "./weights/siglip-so400m-patch14-384", + "model_type": "bunny-llama", + "num_attention_heads": 32, + "num_hidden_layers": 32, + "num_key_value_heads": 8, + "pretraining_tp": 1, + "rms_norm_eps": 1e-05, + "rope_scaling": null, + "rope_theta": 500000.0, + "tie_word_embeddings": false, + "tokenizer_model_max_length": 2048, + "tokenizer_padding_side": "right", + "torch_dtype": "float16", + "transformers_version": "4.41.2", + "tune_mm_mlp_adapter": false, + "unfreeze_vision_tower": true, + "use_cache": true, + "use_mm_proj": true, + "use_s2": true, + "vocab_size": 128256 +} diff --git a/single_dataset/short_caption/VideoGameBunny_v1_1-Llama-3-8B-V-short_caption_dataset_2000_epochs_1_lora/non_lora_trainables.bin b/single_dataset/short_caption/VideoGameBunny_v1_1-Llama-3-8B-V-short_caption_dataset_2000_epochs_1_lora/non_lora_trainables.bin new file mode 100644 index 0000000000000000000000000000000000000000..aab918516733b2284e4bb7680bb2a5ae6091d811 --- /dev/null +++ b/single_dataset/short_caption/VideoGameBunny_v1_1-Llama-3-8B-V-short_caption_dataset_2000_epochs_1_lora/non_lora_trainables.bin @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:80fd6743c026c9eca2d229e104ed0a0e0541174078c4f04e1cd24aeeb01bb209 +size 918507402 diff --git a/single_dataset/short_caption/VideoGameBunny_v1_1-Llama-3-8B-V-short_caption_dataset_2000_epochs_1_lora/trainer_state.json b/single_dataset/short_caption/VideoGameBunny_v1_1-Llama-3-8B-V-short_caption_dataset_2000_epochs_1_lora/trainer_state.json new file mode 100644 index 0000000000000000000000000000000000000000..364f874fe4df900e5b5711dbb0efc161f99683ac --- /dev/null +++ b/single_dataset/short_caption/VideoGameBunny_v1_1-Llama-3-8B-V-short_caption_dataset_2000_epochs_1_lora/trainer_state.json @@ -0,0 +1,917 @@ +{ + "best_metric": null, + "best_model_checkpoint": null, + "epoch": 1.0, + "eval_steps": 500, + "global_step": 125, + "is_hyper_param_search": false, + "is_local_process_zero": true, + "is_world_process_zero": true, + "log_history": [ + { + "epoch": 0.008, + "grad_norm": 4.619897607575414, + "learning_rate": 5e-05, + "loss": 1.4867, + "step": 1 + }, + { + "epoch": 0.016, + "grad_norm": 4.886720880463927, + "learning_rate": 0.0001, + "loss": 1.5103, + "step": 2 + }, + { + "epoch": 0.024, + "grad_norm": 2.0826440735967027, + "learning_rate": 0.00015000000000000001, + "loss": 1.3063, + "step": 3 + }, + { + "epoch": 0.032, + "grad_norm": 1.7883162219744941, + "learning_rate": 0.0002, + "loss": 1.0129, + "step": 4 + }, + { + "epoch": 0.04, + "grad_norm": 2.424535127131405, + "learning_rate": 0.00019996629653035126, + "loss": 1.0793, + "step": 5 + }, + { + "epoch": 0.048, + "grad_norm": 2.6477025684213955, + "learning_rate": 0.00019986520883988232, + "loss": 0.9522, + "step": 6 + }, + { + "epoch": 0.056, + "grad_norm": 2.383868971479716, + "learning_rate": 0.00019969680506871137, + "loss": 1.0021, + "step": 7 + }, + { + "epoch": 0.064, + "grad_norm": 1.6100909214247756, + "learning_rate": 0.00019946119873266613, + "loss": 0.8346, + "step": 8 + }, + { + "epoch": 0.072, + "grad_norm": 1.3081346305473396, + "learning_rate": 0.00019915854864676664, + "loss": 0.8299, + "step": 9 + }, + { + "epoch": 0.08, + "grad_norm": 1.7441102778418505, + "learning_rate": 0.00019878905881817252, + "loss": 1.0063, + "step": 10 + }, + { + "epoch": 0.088, + "grad_norm": 1.2701284480164619, + "learning_rate": 0.00019835297830866826, + "loss": 0.8531, + "step": 11 + }, + { + "epoch": 0.096, + "grad_norm": 1.736304860214383, + "learning_rate": 0.00019785060106677818, + "loss": 0.8597, + "step": 12 + }, + { + "epoch": 0.104, + "grad_norm": 1.4341392622612212, + "learning_rate": 0.00019728226572962473, + "loss": 0.7976, + "step": 13 + }, + { + "epoch": 0.112, + "grad_norm": 1.3844496084613491, + "learning_rate": 0.0001966483553946637, + "loss": 0.9177, + "step": 14 + }, + { + "epoch": 0.12, + "grad_norm": 1.4694767893133263, + "learning_rate": 0.00019594929736144976, + "loss": 0.885, + "step": 15 + }, + { + "epoch": 0.128, + "grad_norm": 1.1939356484363937, + "learning_rate": 0.00019518556284360696, + "loss": 0.873, + "step": 16 + }, + { + "epoch": 0.136, + "grad_norm": 1.4401907008611985, + "learning_rate": 0.0001943576666511982, + "loss": 0.9079, + "step": 17 + }, + { + "epoch": 0.144, + "grad_norm": 1.3988894591269279, + "learning_rate": 0.0001934661668437073, + "loss": 0.9131, + "step": 18 + }, + { + "epoch": 0.152, + "grad_norm": 1.2454329527509385, + "learning_rate": 0.0001925116643538684, + "loss": 0.8773, + "step": 19 + }, + { + "epoch": 0.16, + "grad_norm": 1.255418146967074, + "learning_rate": 0.00019149480258259533, + "loss": 0.8735, + "step": 20 + }, + { + "epoch": 0.168, + "grad_norm": 1.1560074136431318, + "learning_rate": 0.00019041626696528503, + "loss": 0.7798, + "step": 21 + }, + { + "epoch": 0.176, + "grad_norm": 1.291681536643461, + "learning_rate": 0.0001892767845097864, + "loss": 0.8684, + "step": 22 + }, + { + "epoch": 0.184, + "grad_norm": 1.3674424850401692, + "learning_rate": 0.00018807712330634642, + "loss": 0.7092, + "step": 23 + }, + { + "epoch": 0.192, + "grad_norm": 1.2999036173310123, + "learning_rate": 0.0001868180920098644, + "loss": 0.7637, + "step": 24 + }, + { + "epoch": 0.2, + "grad_norm": 1.2391085483231103, + "learning_rate": 0.00018550053929480202, + "loss": 0.7795, + "step": 25 + }, + { + "epoch": 0.208, + "grad_norm": 1.5082519625603912, + "learning_rate": 0.00018412535328311814, + "loss": 0.8217, + "step": 26 + }, + { + "epoch": 0.216, + "grad_norm": 1.3850556089113835, + "learning_rate": 0.0001826934609456129, + "loss": 0.818, + "step": 27 + }, + { + "epoch": 0.224, + "grad_norm": 1.1935139075969265, + "learning_rate": 0.00018120582747708502, + "loss": 0.7651, + "step": 28 + }, + { + "epoch": 0.232, + "grad_norm": 1.1508390722152042, + "learning_rate": 0.0001796634556457236, + "loss": 0.8159, + "step": 29 + }, + { + "epoch": 0.24, + "grad_norm": 1.2770011363549434, + "learning_rate": 0.0001780673851171728, + "loss": 0.8071, + "step": 30 + }, + { + "epoch": 0.248, + "grad_norm": 1.068067124408631, + "learning_rate": 0.00017641869175372493, + "loss": 0.7454, + "step": 31 + }, + { + "epoch": 0.256, + "grad_norm": 1.3051298462722027, + "learning_rate": 0.00017471848688911464, + "loss": 0.8419, + "step": 32 + }, + { + "epoch": 0.264, + "grad_norm": 0.9798996910610092, + "learning_rate": 0.000172967916579403, + "loss": 0.7307, + "step": 33 + }, + { + "epoch": 0.272, + "grad_norm": 1.142833058756903, + "learning_rate": 0.00017116816083045602, + "loss": 0.7414, + "step": 34 + }, + { + "epoch": 0.28, + "grad_norm": 1.145518711497527, + "learning_rate": 0.0001693204328025389, + "loss": 0.8124, + "step": 35 + }, + { + "epoch": 0.288, + "grad_norm": 1.0614283304633985, + "learning_rate": 0.00016742597799256182, + "loss": 0.7862, + "step": 36 + }, + { + "epoch": 0.296, + "grad_norm": 1.094667484989183, + "learning_rate": 0.00016548607339452853, + "loss": 0.7924, + "step": 37 + }, + { + "epoch": 0.304, + "grad_norm": 1.1491934862922923, + "learning_rate": 0.00016350202663875386, + "loss": 0.8699, + "step": 38 + }, + { + "epoch": 0.312, + "grad_norm": 1.1268283363130298, + "learning_rate": 0.0001614751751104301, + "loss": 0.7633, + "step": 39 + }, + { + "epoch": 0.32, + "grad_norm": 1.043542678445131, + "learning_rate": 0.00015940688504813662, + "loss": 0.7574, + "step": 40 + }, + { + "epoch": 0.328, + "grad_norm": 1.092441023312135, + "learning_rate": 0.00015729855062290022, + "loss": 0.7096, + "step": 41 + }, + { + "epoch": 0.336, + "grad_norm": 1.1601589532727679, + "learning_rate": 0.00015515159299842707, + "loss": 0.8354, + "step": 42 + }, + { + "epoch": 0.344, + "grad_norm": 1.1353600206932608, + "learning_rate": 0.00015296745937313987, + "loss": 0.7407, + "step": 43 + }, + { + "epoch": 0.352, + "grad_norm": 1.0503363091620095, + "learning_rate": 0.00015074762200466556, + "loss": 0.7131, + "step": 44 + }, + { + "epoch": 0.36, + "grad_norm": 1.1275262381443367, + "learning_rate": 0.00014849357721743168, + "loss": 0.7309, + "step": 45 + }, + { + "epoch": 0.368, + "grad_norm": 1.0900400445912137, + "learning_rate": 0.00014620684439403962, + "loss": 0.7705, + "step": 46 + }, + { + "epoch": 0.376, + "grad_norm": 1.0836608138977326, + "learning_rate": 0.0001438889649510956, + "loss": 0.7584, + "step": 47 + }, + { + "epoch": 0.384, + "grad_norm": 1.0680998853596113, + "learning_rate": 0.00014154150130018866, + "loss": 0.7439, + "step": 48 + }, + { + "epoch": 0.392, + "grad_norm": 1.0990957659179, + "learning_rate": 0.00013916603579471705, + "loss": 0.7257, + "step": 49 + }, + { + "epoch": 0.4, + "grad_norm": 1.0225040423604024, + "learning_rate": 0.000136764169663272, + "loss": 0.6854, + "step": 50 + }, + { + "epoch": 0.408, + "grad_norm": 1.1031078059651571, + "learning_rate": 0.00013433752193029886, + "loss": 0.7601, + "step": 51 + }, + { + "epoch": 0.416, + "grad_norm": 1.0970372956505092, + "learning_rate": 0.00013188772832476188, + "loss": 0.7407, + "step": 52 + }, + { + "epoch": 0.424, + "grad_norm": 1.2912652706234626, + "learning_rate": 0.00012941644017754964, + "loss": 0.8306, + "step": 53 + }, + { + "epoch": 0.432, + "grad_norm": 1.0303770662924296, + "learning_rate": 0.00012692532330836346, + "loss": 0.7283, + "step": 54 + }, + { + "epoch": 0.44, + "grad_norm": 1.1838484089311536, + "learning_rate": 0.00012441605690283915, + "loss": 0.7869, + "step": 55 + }, + { + "epoch": 0.448, + "grad_norm": 1.1996508231205614, + "learning_rate": 0.0001218903323806595, + "loss": 0.8304, + "step": 56 + }, + { + "epoch": 0.456, + "grad_norm": 0.9973023772472317, + "learning_rate": 0.00011934985225541998, + "loss": 0.7435, + "step": 57 + }, + { + "epoch": 0.464, + "grad_norm": 1.1394855232486916, + "learning_rate": 0.00011679632898701649, + "loss": 0.8035, + "step": 58 + }, + { + "epoch": 0.472, + "grad_norm": 1.0023205132345805, + "learning_rate": 0.00011423148382732853, + "loss": 0.6697, + "step": 59 + }, + { + "epoch": 0.48, + "grad_norm": 0.9649329991466065, + "learning_rate": 0.00011165704565997593, + "loss": 0.7329, + "step": 60 + }, + { + "epoch": 0.488, + "grad_norm": 0.9120142833559911, + "learning_rate": 0.00010907474983493144, + "loss": 0.6681, + "step": 61 + }, + { + "epoch": 0.496, + "grad_norm": 1.0448187897096388, + "learning_rate": 0.0001064863369987743, + "loss": 0.7906, + "step": 62 + }, + { + "epoch": 0.504, + "grad_norm": 0.9139072472140527, + "learning_rate": 0.00010389355192137377, + "loss": 0.6626, + "step": 63 + }, + { + "epoch": 0.512, + "grad_norm": 1.1574772800929347, + "learning_rate": 0.0001012981423197931, + "loss": 0.8664, + "step": 64 + }, + { + "epoch": 0.52, + "grad_norm": 1.0220362084048196, + "learning_rate": 9.870185768020693e-05, + "loss": 0.7479, + "step": 65 + }, + { + "epoch": 0.528, + "grad_norm": 1.199094525413627, + "learning_rate": 9.610644807862625e-05, + "loss": 0.8719, + "step": 66 + }, + { + "epoch": 0.536, + "grad_norm": 1.000008577603346, + "learning_rate": 9.35136630012257e-05, + "loss": 0.7416, + "step": 67 + }, + { + "epoch": 0.544, + "grad_norm": 1.060537149329608, + "learning_rate": 9.092525016506858e-05, + "loss": 0.7707, + "step": 68 + }, + { + "epoch": 0.552, + "grad_norm": 1.1371266027976432, + "learning_rate": 8.83429543400241e-05, + "loss": 0.769, + "step": 69 + }, + { + "epoch": 0.56, + "grad_norm": 1.1350763178425805, + "learning_rate": 8.57685161726715e-05, + "loss": 0.7817, + "step": 70 + }, + { + "epoch": 0.568, + "grad_norm": 1.0218141814745094, + "learning_rate": 8.320367101298351e-05, + "loss": 0.737, + "step": 71 + }, + { + "epoch": 0.576, + "grad_norm": 0.9202080674794947, + "learning_rate": 8.065014774458003e-05, + "loss": 0.6632, + "step": 72 + }, + { + "epoch": 0.584, + "grad_norm": 0.9296699980814602, + "learning_rate": 7.810966761934053e-05, + "loss": 0.6548, + "step": 73 + }, + { + "epoch": 0.592, + "grad_norm": 1.0335725615091327, + "learning_rate": 7.558394309716088e-05, + "loss": 0.6948, + "step": 74 + }, + { + "epoch": 0.6, + "grad_norm": 1.141947553762206, + "learning_rate": 7.307467669163655e-05, + "loss": 0.7382, + "step": 75 + }, + { + "epoch": 0.608, + "grad_norm": 1.4100203818688208, + "learning_rate": 7.058355982245037e-05, + "loss": 0.8474, + "step": 76 + }, + { + "epoch": 0.616, + "grad_norm": 1.0163008682743984, + "learning_rate": 6.811227167523815e-05, + "loss": 0.6651, + "step": 77 + }, + { + "epoch": 0.624, + "grad_norm": 1.221529976174952, + "learning_rate": 6.566247806970119e-05, + "loss": 0.8131, + "step": 78 + }, + { + "epoch": 0.632, + "grad_norm": 0.9618984836374378, + "learning_rate": 6.323583033672799e-05, + "loss": 0.65, + "step": 79 + }, + { + "epoch": 0.64, + "grad_norm": 1.0996286151580774, + "learning_rate": 6.083396420528298e-05, + "loss": 0.8285, + "step": 80 + }, + { + "epoch": 0.648, + "grad_norm": 0.9268846013697055, + "learning_rate": 5.845849869981137e-05, + "loss": 0.6435, + "step": 81 + }, + { + "epoch": 0.656, + "grad_norm": 1.027211457290476, + "learning_rate": 5.611103504890444e-05, + "loss": 0.745, + "step": 82 + }, + { + "epoch": 0.664, + "grad_norm": 1.082529517207295, + "learning_rate": 5.379315560596038e-05, + "loss": 0.7019, + "step": 83 + }, + { + "epoch": 0.672, + "grad_norm": 1.0135468128062057, + "learning_rate": 5.1506422782568345e-05, + "loss": 0.7226, + "step": 84 + }, + { + "epoch": 0.68, + "grad_norm": 0.97001184986676, + "learning_rate": 4.9252377995334444e-05, + "loss": 0.8256, + "step": 85 + }, + { + "epoch": 0.688, + "grad_norm": 0.9740270101491069, + "learning_rate": 4.703254062686017e-05, + "loss": 0.6412, + "step": 86 + }, + { + "epoch": 0.696, + "grad_norm": 1.0372568205011428, + "learning_rate": 4.484840700157295e-05, + "loss": 0.6513, + "step": 87 + }, + { + "epoch": 0.704, + "grad_norm": 1.1221621988691768, + "learning_rate": 4.270144937709981e-05, + "loss": 0.6919, + "step": 88 + }, + { + "epoch": 0.712, + "grad_norm": 1.0375063434620733, + "learning_rate": 4.059311495186338e-05, + "loss": 0.7526, + "step": 89 + }, + { + "epoch": 0.72, + "grad_norm": 1.0174249016341808, + "learning_rate": 3.852482488956992e-05, + "loss": 0.6827, + "step": 90 + }, + { + "epoch": 0.728, + "grad_norm": 0.9085677543812799, + "learning_rate": 3.649797336124615e-05, + "loss": 0.6304, + "step": 91 + }, + { + "epoch": 0.736, + "grad_norm": 0.9208505244485531, + "learning_rate": 3.45139266054715e-05, + "loss": 0.7146, + "step": 92 + }, + { + "epoch": 0.744, + "grad_norm": 1.0697826972171396, + "learning_rate": 3.257402200743821e-05, + "loss": 0.9217, + "step": 93 + }, + { + "epoch": 0.752, + "grad_norm": 0.968128643452508, + "learning_rate": 3.0679567197461134e-05, + "loss": 0.6088, + "step": 94 + }, + { + "epoch": 0.76, + "grad_norm": 1.0269110861575053, + "learning_rate": 2.8831839169543996e-05, + "loss": 0.7421, + "step": 95 + }, + { + "epoch": 0.768, + "grad_norm": 1.0747124471237177, + "learning_rate": 2.7032083420597e-05, + "loss": 0.7495, + "step": 96 + }, + { + "epoch": 0.776, + "grad_norm": 1.023035458249502, + "learning_rate": 2.528151311088537e-05, + "loss": 0.7403, + "step": 97 + }, + { + "epoch": 0.784, + "grad_norm": 0.8812897998547033, + "learning_rate": 2.3581308246275103e-05, + "loss": 0.7222, + "step": 98 + }, + { + "epoch": 0.792, + "grad_norm": 1.0027482226792357, + "learning_rate": 2.1932614882827197e-05, + "loss": 0.6957, + "step": 99 + }, + { + "epoch": 0.8, + "grad_norm": 1.0890324102494235, + "learning_rate": 2.03365443542764e-05, + "loss": 0.758, + "step": 100 + }, + { + "epoch": 0.808, + "grad_norm": 1.1577158487469124, + "learning_rate": 1.879417252291502e-05, + "loss": 0.6337, + "step": 101 + }, + { + "epoch": 0.816, + "grad_norm": 0.9448443729492736, + "learning_rate": 1.730653905438714e-05, + "loss": 0.6452, + "step": 102 + }, + { + "epoch": 0.824, + "grad_norm": 0.9259142261290009, + "learning_rate": 1.587464671688187e-05, + "loss": 0.6566, + "step": 103 + }, + { + "epoch": 0.832, + "grad_norm": 1.0542917132855005, + "learning_rate": 1.4499460705197998e-05, + "loss": 0.7358, + "step": 104 + }, + { + "epoch": 0.84, + "grad_norm": 1.091275350594823, + "learning_rate": 1.3181907990135622e-05, + "loss": 0.693, + "step": 105 + }, + { + "epoch": 0.848, + "grad_norm": 0.9786531655966347, + "learning_rate": 1.1922876693653585e-05, + "loss": 0.6642, + "step": 106 + }, + { + "epoch": 0.856, + "grad_norm": 1.1278517232346554, + "learning_rate": 1.0723215490213634e-05, + "loss": 0.7668, + "step": 107 + }, + { + "epoch": 0.864, + "grad_norm": 1.0005845602335413, + "learning_rate": 9.583733034714981e-06, + "loss": 0.7357, + "step": 108 + }, + { + "epoch": 0.872, + "grad_norm": 0.8841528934669991, + "learning_rate": 8.505197417404687e-06, + "loss": 0.5717, + "step": 109 + }, + { + "epoch": 0.88, + "grad_norm": 1.3521091334370579, + "learning_rate": 7.488335646131628e-06, + "loss": 0.6707, + "step": 110 + }, + { + "epoch": 0.888, + "grad_norm": 0.8846307307259047, + "learning_rate": 6.533833156292679e-06, + "loss": 0.5618, + "step": 111 + }, + { + "epoch": 0.896, + "grad_norm": 1.0703643421078397, + "learning_rate": 5.6423333488018095e-06, + "loss": 0.7374, + "step": 112 + }, + { + "epoch": 0.904, + "grad_norm": 0.9746740913726185, + "learning_rate": 4.8144371563930476e-06, + "loss": 0.6961, + "step": 113 + }, + { + "epoch": 0.912, + "grad_norm": 0.9414487770868957, + "learning_rate": 4.050702638550275e-06, + "loss": 0.7156, + "step": 114 + }, + { + "epoch": 0.92, + "grad_norm": 0.9304329120448038, + "learning_rate": 3.3516446053363015e-06, + "loss": 0.6689, + "step": 115 + }, + { + "epoch": 0.928, + "grad_norm": 1.0025867854859485, + "learning_rate": 2.717734270375272e-06, + "loss": 0.6602, + "step": 116 + }, + { + "epoch": 0.936, + "grad_norm": 0.9348458350132222, + "learning_rate": 2.1493989332218468e-06, + "loss": 0.6133, + "step": 117 + }, + { + "epoch": 0.944, + "grad_norm": 1.0271198817911413, + "learning_rate": 1.6470216913317626e-06, + "loss": 0.6659, + "step": 118 + }, + { + "epoch": 0.952, + "grad_norm": 1.048864398380103, + "learning_rate": 1.2109411818274852e-06, + "loss": 0.6276, + "step": 119 + }, + { + "epoch": 0.96, + "grad_norm": 1.101718976429846, + "learning_rate": 8.41451353233369e-07, + "loss": 0.8493, + "step": 120 + }, + { + "epoch": 0.968, + "grad_norm": 0.9521334208542822, + "learning_rate": 5.388012673338661e-07, + "loss": 0.6474, + "step": 121 + }, + { + "epoch": 0.976, + "grad_norm": 0.9276337231350391, + "learning_rate": 3.0319493128866396e-07, + "loss": 0.6515, + "step": 122 + }, + { + "epoch": 0.984, + "grad_norm": 0.9611591344997523, + "learning_rate": 1.3479116011769767e-07, + "loss": 0.6451, + "step": 123 + }, + { + "epoch": 0.992, + "grad_norm": 0.8927926010295639, + "learning_rate": 3.370346964876036e-08, + "loss": 0.7052, + "step": 124 + }, + { + "epoch": 1.0, + "grad_norm": 0.9124525841463956, + "learning_rate": 0.0, + "loss": 0.5924, + "step": 125 + }, + { + "epoch": 1.0, + "step": 125, + "total_flos": 16496717570048.0, + "train_loss": 0.7736291184425353, + "train_runtime": 1040.5743, + "train_samples_per_second": 1.922, + "train_steps_per_second": 0.12 + } + ], + "logging_steps": 1.0, + "max_steps": 125, + "num_input_tokens_seen": 0, + "num_train_epochs": 1, + "save_steps": 500, + "stateful_callbacks": { + "TrainerControl": { + "args": { + "should_epoch_stop": false, + "should_evaluate": false, + "should_log": false, + "should_save": false, + "should_training_stop": false + }, + "attributes": {} + } + }, + "total_flos": 16496717570048.0, + "train_batch_size": 8, + "trial_name": null, + "trial_params": null +} diff --git a/single_dataset/short_caption/VideoGameBunny_v1_1-Llama-3-8B-V-short_caption_dataset_2000_epochs_2_lora/README.md b/single_dataset/short_caption/VideoGameBunny_v1_1-Llama-3-8B-V-short_caption_dataset_2000_epochs_2_lora/README.md new file mode 100644 index 0000000000000000000000000000000000000000..4f291d8d6fdb82fcff274f83bcc52b834347797c --- /dev/null +++ b/single_dataset/short_caption/VideoGameBunny_v1_1-Llama-3-8B-V-short_caption_dataset_2000_epochs_2_lora/README.md @@ -0,0 +1,202 @@ +--- +library_name: peft +base_model: ./weights/Bunny-v1_1-Llama-3-8B-V +--- + +# Model Card for Model ID + + + + + +## Model Details + +### Model Description + + + + + +- **Developed by:** [More Information Needed] +- **Funded by [optional]:** [More Information Needed] +- **Shared by [optional]:** [More Information Needed] +- **Model type:** [More Information Needed] +- **Language(s) (NLP):** [More Information Needed] +- **License:** [More Information Needed] +- **Finetuned from model [optional]:** [More Information Needed] + +### Model Sources [optional] + + + +- **Repository:** [More Information Needed] +- **Paper [optional]:** [More Information Needed] +- **Demo [optional]:** [More Information Needed] + +## Uses + + + +### Direct Use + + + +[More Information Needed] + +### Downstream Use [optional] + + + +[More Information Needed] + +### Out-of-Scope Use + + + +[More Information Needed] + +## Bias, Risks, and Limitations + + + +[More Information Needed] + +### Recommendations + + + +Users (both direct and downstream) should be made aware of the risks, biases and limitations of the model. More information needed for further recommendations. + +## How to Get Started with the Model + +Use the code below to get started with the model. + +[More Information Needed] + +## Training Details + +### Training Data + + + +[More Information Needed] + +### Training Procedure + + + +#### Preprocessing [optional] + +[More Information Needed] + + +#### Training Hyperparameters + +- **Training regime:** [More Information Needed] + +#### Speeds, Sizes, Times [optional] + + + +[More Information Needed] + +## Evaluation + + + +### Testing Data, Factors & Metrics + +#### Testing Data + + + +[More Information Needed] + +#### Factors + + + +[More Information Needed] + +#### Metrics + + + +[More Information Needed] + +### Results + +[More Information Needed] + +#### Summary + + + +## Model Examination [optional] + + + +[More Information Needed] + +## Environmental Impact + + + +Carbon emissions can be estimated using the [Machine Learning Impact calculator](https://mlco2.github.io/impact#compute) presented in [Lacoste et al. (2019)](https://arxiv.org/abs/1910.09700). + +- **Hardware Type:** [More Information Needed] +- **Hours used:** [More Information Needed] +- **Cloud Provider:** [More Information Needed] +- **Compute Region:** [More Information Needed] +- **Carbon Emitted:** [More Information Needed] + +## Technical Specifications [optional] + +### Model Architecture and Objective + +[More Information Needed] + +### Compute Infrastructure + +[More Information Needed] + +#### Hardware + +[More Information Needed] + +#### Software + +[More Information Needed] + +## Citation [optional] + + + +**BibTeX:** + +[More Information Needed] + +**APA:** + +[More Information Needed] + +## Glossary [optional] + + + +[More Information Needed] + +## More Information [optional] + +[More Information Needed] + +## Model Card Authors [optional] + +[More Information Needed] + +## Model Card Contact + +[More Information Needed] +### Framework versions + +- PEFT 0.11.1 \ No newline at end of file diff --git a/single_dataset/short_caption/VideoGameBunny_v1_1-Llama-3-8B-V-short_caption_dataset_2000_epochs_2_lora/adapter_config.json b/single_dataset/short_caption/VideoGameBunny_v1_1-Llama-3-8B-V-short_caption_dataset_2000_epochs_2_lora/adapter_config.json new file mode 100644 index 0000000000000000000000000000000000000000..347647564608ef24ccb20498950dac46c08d01cf --- /dev/null +++ b/single_dataset/short_caption/VideoGameBunny_v1_1-Llama-3-8B-V-short_caption_dataset_2000_epochs_2_lora/adapter_config.json @@ -0,0 +1,34 @@ +{ + "alpha_pattern": {}, + "auto_mapping": null, + "base_model_name_or_path": "./weights/Bunny-v1_1-Llama-3-8B-V", + "bias": "none", + "fan_in_fan_out": false, + "inference_mode": true, + "init_lora_weights": true, + "layer_replication": null, + "layers_pattern": null, + "layers_to_transform": null, + "loftq_config": {}, + "lora_alpha": 256, + "lora_dropout": 0.1, + "megatron_config": null, + "megatron_core": "megatron.core", + "modules_to_save": null, + "peft_type": "LORA", + "r": 128, + "rank_pattern": {}, + "revision": null, + "target_modules": [ + "v_proj", + "up_proj", + "q_proj", + "k_proj", + "o_proj", + "gate_proj", + "down_proj" + ], + "task_type": "CAUSAL_LM", + "use_dora": false, + "use_rslora": false +} \ No newline at end of file diff --git a/single_dataset/short_caption/VideoGameBunny_v1_1-Llama-3-8B-V-short_caption_dataset_2000_epochs_2_lora/adapter_model.safetensors b/single_dataset/short_caption/VideoGameBunny_v1_1-Llama-3-8B-V-short_caption_dataset_2000_epochs_2_lora/adapter_model.safetensors new file mode 100644 index 0000000000000000000000000000000000000000..702fc23be7e5a1d7e9d91980a80968c974b1cff0 --- /dev/null +++ b/single_dataset/short_caption/VideoGameBunny_v1_1-Llama-3-8B-V-short_caption_dataset_2000_epochs_2_lora/adapter_model.safetensors @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:a35f458620f5a287a03c955f6acb741a079e64971521b1dcbaf8c8a942f08430 +size 671150064 diff --git a/single_dataset/short_caption/VideoGameBunny_v1_1-Llama-3-8B-V-short_caption_dataset_2000_epochs_2_lora/config.json b/single_dataset/short_caption/VideoGameBunny_v1_1-Llama-3-8B-V-short_caption_dataset_2000_epochs_2_lora/config.json new file mode 100644 index 0000000000000000000000000000000000000000..ee7ed86f4ff6f3381a6bb24981934be89aa376e7 --- /dev/null +++ b/single_dataset/short_caption/VideoGameBunny_v1_1-Llama-3-8B-V-short_caption_dataset_2000_epochs_2_lora/config.json @@ -0,0 +1,45 @@ +{ + "_name_or_path": "./weights/Bunny-v1_1-Llama-3-8B-V", + "architectures": [ + "BunnyLlamaForCausalLM" + ], + "attention_bias": false, + "attention_dropout": 0.0, + "auto_map": { + "AutoConfig": "configuration_bunny_llama.BunnyLlamaConfig", + "AutoModelForCausalLM": "modeling_bunny_llama.BunnyLlamaForCausalLM" + }, + "bos_token_id": 128000, + "continuous_training": false, + "eos_token_id": 128001, + "freeze_mm_mlp_adapter": false, + "hidden_act": "silu", + "hidden_size": 4096, + "image_aspect_ratio": "pad", + "initializer_range": 0.02, + "intermediate_size": 14336, + "max_position_embeddings": 8192, + "mm_hidden_size": 3456, + "mm_projector_lr": null, + "mm_projector_type": "mlp2x_gelu", + "mm_vision_tower": "./weights/siglip-so400m-patch14-384", + "model_type": "bunny-llama", + "num_attention_heads": 32, + "num_hidden_layers": 32, + "num_key_value_heads": 8, + "pretraining_tp": 1, + "rms_norm_eps": 1e-05, + "rope_scaling": null, + "rope_theta": 500000.0, + "tie_word_embeddings": false, + "tokenizer_model_max_length": 2048, + "tokenizer_padding_side": "right", + "torch_dtype": "float16", + "transformers_version": "4.41.2", + "tune_mm_mlp_adapter": false, + "unfreeze_vision_tower": true, + "use_cache": true, + "use_mm_proj": true, + "use_s2": true, + "vocab_size": 128256 +} diff --git a/single_dataset/short_caption/VideoGameBunny_v1_1-Llama-3-8B-V-short_caption_dataset_2000_epochs_2_lora/non_lora_trainables.bin b/single_dataset/short_caption/VideoGameBunny_v1_1-Llama-3-8B-V-short_caption_dataset_2000_epochs_2_lora/non_lora_trainables.bin new file mode 100644 index 0000000000000000000000000000000000000000..b0ea064a30190425bfde06794dccea4671c42329 --- /dev/null +++ b/single_dataset/short_caption/VideoGameBunny_v1_1-Llama-3-8B-V-short_caption_dataset_2000_epochs_2_lora/non_lora_trainables.bin @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:8cf0e54bb1000e80eb51dae2ebfed904d13a5dd56cdef499b7980b2999dd6532 +size 918507402 diff --git a/single_dataset/short_caption/VideoGameBunny_v1_1-Llama-3-8B-V-short_caption_dataset_2000_epochs_2_lora/trainer_state.json b/single_dataset/short_caption/VideoGameBunny_v1_1-Llama-3-8B-V-short_caption_dataset_2000_epochs_2_lora/trainer_state.json new file mode 100644 index 0000000000000000000000000000000000000000..a06b06d0464b81930b9129812fe5d7b88f0152bb --- /dev/null +++ b/single_dataset/short_caption/VideoGameBunny_v1_1-Llama-3-8B-V-short_caption_dataset_2000_epochs_2_lora/trainer_state.json @@ -0,0 +1,1792 @@ +{ + "best_metric": null, + "best_model_checkpoint": null, + "epoch": 2.0, + "eval_steps": 500, + "global_step": 250, + "is_hyper_param_search": false, + "is_local_process_zero": true, + "is_world_process_zero": true, + "log_history": [ + { + "epoch": 0.008, + "grad_norm": 4.674744838529102, + "learning_rate": 2.5e-05, + "loss": 1.4867, + "step": 1 + }, + { + "epoch": 0.016, + "grad_norm": 4.9256407021066675, + "learning_rate": 5e-05, + "loss": 1.5103, + "step": 2 + }, + { + "epoch": 0.024, + "grad_norm": 2.5677944230432193, + "learning_rate": 7.500000000000001e-05, + "loss": 1.4188, + "step": 3 + }, + { + "epoch": 0.032, + "grad_norm": 2.2529029118150716, + "learning_rate": 0.0001, + "loss": 1.1377, + "step": 4 + }, + { + "epoch": 0.04, + "grad_norm": 2.047964065232966, + "learning_rate": 0.000125, + "loss": 1.0446, + "step": 5 + }, + { + "epoch": 0.048, + "grad_norm": 1.7555120313603354, + "learning_rate": 0.00015000000000000001, + "loss": 0.9269, + "step": 6 + }, + { + "epoch": 0.056, + "grad_norm": 2.134531403067591, + "learning_rate": 0.000175, + "loss": 1.0221, + "step": 7 + }, + { + "epoch": 0.064, + "grad_norm": 1.665748281051102, + "learning_rate": 0.0002, + "loss": 0.8459, + "step": 8 + }, + { + "epoch": 0.072, + "grad_norm": 1.4759575846757216, + "learning_rate": 0.0001999915737775817, + "loss": 0.7934, + "step": 9 + }, + { + "epoch": 0.08, + "grad_norm": 1.6304033303566767, + "learning_rate": 0.00019996629653035126, + "loss": 0.9748, + "step": 10 + }, + { + "epoch": 0.088, + "grad_norm": 1.3110558409884459, + "learning_rate": 0.00019992417251814282, + "loss": 0.8587, + "step": 11 + }, + { + "epoch": 0.096, + "grad_norm": 1.7201492275174837, + "learning_rate": 0.00019986520883988232, + "loss": 0.847, + "step": 12 + }, + { + "epoch": 0.104, + "grad_norm": 1.5758552031912056, + "learning_rate": 0.0001997894154323911, + "loss": 0.7981, + "step": 13 + }, + { + "epoch": 0.112, + "grad_norm": 1.433500573442223, + "learning_rate": 0.00019969680506871137, + "loss": 0.9366, + "step": 14 + }, + { + "epoch": 0.12, + "grad_norm": 1.4217505783095623, + "learning_rate": 0.0001995873933559535, + "loss": 0.8733, + "step": 15 + }, + { + "epoch": 0.128, + "grad_norm": 1.2003920444262448, + "learning_rate": 0.00019946119873266613, + "loss": 0.8634, + "step": 16 + }, + { + "epoch": 0.136, + "grad_norm": 1.6665877835121163, + "learning_rate": 0.0001993182424657285, + "loss": 0.9127, + "step": 17 + }, + { + "epoch": 0.144, + "grad_norm": 1.341015054535535, + "learning_rate": 0.00019915854864676664, + "loss": 0.91, + "step": 18 + }, + { + "epoch": 0.152, + "grad_norm": 1.2273257645062086, + "learning_rate": 0.0001989821441880933, + "loss": 0.8859, + "step": 19 + }, + { + "epoch": 0.16, + "grad_norm": 1.2894468326209563, + "learning_rate": 0.00019878905881817252, + "loss": 0.869, + "step": 20 + }, + { + "epoch": 0.168, + "grad_norm": 1.1469297491500698, + "learning_rate": 0.0001985793250766098, + "loss": 0.7831, + "step": 21 + }, + { + "epoch": 0.176, + "grad_norm": 1.2871033572214328, + "learning_rate": 0.00019835297830866826, + "loss": 0.8706, + "step": 22 + }, + { + "epoch": 0.184, + "grad_norm": 1.470258548075832, + "learning_rate": 0.00019811005665931205, + "loss": 0.7019, + "step": 23 + }, + { + "epoch": 0.192, + "grad_norm": 1.2810881750965715, + "learning_rate": 0.00019785060106677818, + "loss": 0.7606, + "step": 24 + }, + { + "epoch": 0.2, + "grad_norm": 1.4072344720956307, + "learning_rate": 0.0001975746552556772, + "loss": 0.7905, + "step": 25 + }, + { + "epoch": 0.208, + "grad_norm": 1.7180156821685193, + "learning_rate": 0.00019728226572962473, + "loss": 0.8396, + "step": 26 + }, + { + "epoch": 0.216, + "grad_norm": 1.367099869066617, + "learning_rate": 0.0001969734817634044, + "loss": 0.8305, + "step": 27 + }, + { + "epoch": 0.224, + "grad_norm": 1.2888961724874664, + "learning_rate": 0.0001966483553946637, + "loss": 0.7905, + "step": 28 + }, + { + "epoch": 0.232, + "grad_norm": 1.1765669030550359, + "learning_rate": 0.00019630694141514464, + "loss": 0.8319, + "step": 29 + }, + { + "epoch": 0.24, + "grad_norm": 1.1841530286472177, + "learning_rate": 0.00019594929736144976, + "loss": 0.8116, + "step": 30 + }, + { + "epoch": 0.248, + "grad_norm": 1.0679725826738797, + "learning_rate": 0.0001955754835053459, + "loss": 0.7553, + "step": 31 + }, + { + "epoch": 0.256, + "grad_norm": 1.2219448462839118, + "learning_rate": 0.00019518556284360696, + "loss": 0.8355, + "step": 32 + }, + { + "epoch": 0.264, + "grad_norm": 0.9711904551107217, + "learning_rate": 0.0001947796010873974, + "loss": 0.7263, + "step": 33 + }, + { + "epoch": 0.272, + "grad_norm": 1.1075790710725923, + "learning_rate": 0.0001943576666511982, + "loss": 0.7483, + "step": 34 + }, + { + "epoch": 0.28, + "grad_norm": 1.278006549263634, + "learning_rate": 0.0001939198306412775, + "loss": 0.8164, + "step": 35 + }, + { + "epoch": 0.288, + "grad_norm": 1.1536165558422753, + "learning_rate": 0.0001934661668437073, + "loss": 0.7936, + "step": 36 + }, + { + "epoch": 0.296, + "grad_norm": 1.1865654471702436, + "learning_rate": 0.0001929967517119289, + "loss": 0.8048, + "step": 37 + }, + { + "epoch": 0.304, + "grad_norm": 1.2343222074360047, + "learning_rate": 0.0001925116643538684, + "loss": 0.8808, + "step": 38 + }, + { + "epoch": 0.312, + "grad_norm": 1.247526068607731, + "learning_rate": 0.0001920109865186052, + "loss": 0.7913, + "step": 39 + }, + { + "epoch": 0.32, + "grad_norm": 1.1852742286804527, + "learning_rate": 0.00019149480258259533, + "loss": 0.7706, + "step": 40 + }, + { + "epoch": 0.328, + "grad_norm": 1.1325327330263366, + "learning_rate": 0.00019096319953545185, + "loss": 0.7215, + "step": 41 + }, + { + "epoch": 0.336, + "grad_norm": 1.3582324850496053, + "learning_rate": 0.00019041626696528503, + "loss": 0.8489, + "step": 42 + }, + { + "epoch": 0.344, + "grad_norm": 1.2041545764445587, + "learning_rate": 0.00018985409704360456, + "loss": 0.7603, + "step": 43 + }, + { + "epoch": 0.352, + "grad_norm": 1.038114948366513, + "learning_rate": 0.0001892767845097864, + "loss": 0.7286, + "step": 44 + }, + { + "epoch": 0.36, + "grad_norm": 1.1182206003231439, + "learning_rate": 0.00018868442665510678, + "loss": 0.7306, + "step": 45 + }, + { + "epoch": 0.368, + "grad_norm": 1.0303890087751462, + "learning_rate": 0.00018807712330634642, + "loss": 0.7666, + "step": 46 + }, + { + "epoch": 0.376, + "grad_norm": 1.1911443097100909, + "learning_rate": 0.00018745497680896722, + "loss": 0.7796, + "step": 47 + }, + { + "epoch": 0.384, + "grad_norm": 1.2016413454646881, + "learning_rate": 0.0001868180920098644, + "loss": 0.7697, + "step": 48 + }, + { + "epoch": 0.392, + "grad_norm": 1.1822272055096672, + "learning_rate": 0.0001861665762396974, + "loss": 0.736, + "step": 49 + }, + { + "epoch": 0.4, + "grad_norm": 1.0983349187588152, + "learning_rate": 0.00018550053929480202, + "loss": 0.6999, + "step": 50 + }, + { + "epoch": 0.408, + "grad_norm": 1.845199079764097, + "learning_rate": 0.00018482009341868697, + "loss": 0.8069, + "step": 51 + }, + { + "epoch": 0.416, + "grad_norm": 1.276303709369214, + "learning_rate": 0.00018412535328311814, + "loss": 0.7438, + "step": 52 + }, + { + "epoch": 0.424, + "grad_norm": 1.3944781099073964, + "learning_rate": 0.00018341643596879367, + "loss": 0.8595, + "step": 53 + }, + { + "epoch": 0.432, + "grad_norm": 1.0632222354851693, + "learning_rate": 0.0001826934609456129, + "loss": 0.7425, + "step": 54 + }, + { + "epoch": 0.44, + "grad_norm": 1.2263074444516069, + "learning_rate": 0.00018195655005254273, + "loss": 0.8127, + "step": 55 + }, + { + "epoch": 0.448, + "grad_norm": 1.2149536699256438, + "learning_rate": 0.00018120582747708502, + "loss": 0.8457, + "step": 56 + }, + { + "epoch": 0.456, + "grad_norm": 1.0138171981919357, + "learning_rate": 0.00018044141973434758, + "loss": 0.768, + "step": 57 + }, + { + "epoch": 0.464, + "grad_norm": 1.2061520303740656, + "learning_rate": 0.0001796634556457236, + "loss": 0.8194, + "step": 58 + }, + { + "epoch": 0.472, + "grad_norm": 1.0823588512569589, + "learning_rate": 0.00017887206631718203, + "loss": 0.6896, + "step": 59 + }, + { + "epoch": 0.48, + "grad_norm": 0.9904390215800949, + "learning_rate": 0.0001780673851171728, + "loss": 0.7558, + "step": 60 + }, + { + "epoch": 0.488, + "grad_norm": 1.0142633742867952, + "learning_rate": 0.00017724954765415137, + "loss": 0.7038, + "step": 61 + }, + { + "epoch": 0.496, + "grad_norm": 1.1916449607132265, + "learning_rate": 0.00017641869175372493, + "loss": 0.8259, + "step": 62 + }, + { + "epoch": 0.504, + "grad_norm": 1.0649736202924283, + "learning_rate": 0.00017557495743542585, + "loss": 0.7195, + "step": 63 + }, + { + "epoch": 0.512, + "grad_norm": 1.2212720973523508, + "learning_rate": 0.00017471848688911464, + "loss": 0.8927, + "step": 64 + }, + { + "epoch": 0.52, + "grad_norm": 1.1324306359339695, + "learning_rate": 0.00017384942445101772, + "loss": 0.774, + "step": 65 + }, + { + "epoch": 0.528, + "grad_norm": 1.2347954497560891, + "learning_rate": 0.000172967916579403, + "loss": 0.9011, + "step": 66 + }, + { + "epoch": 0.536, + "grad_norm": 1.0730716533928852, + "learning_rate": 0.00017207411182989832, + "loss": 0.7699, + "step": 67 + }, + { + "epoch": 0.544, + "grad_norm": 1.3567009841762714, + "learning_rate": 0.00017116816083045602, + "loss": 0.8084, + "step": 68 + }, + { + "epoch": 0.552, + "grad_norm": 1.222724537355913, + "learning_rate": 0.00017025021625596853, + "loss": 0.8189, + "step": 69 + }, + { + "epoch": 0.56, + "grad_norm": 1.3196168838771813, + "learning_rate": 0.0001693204328025389, + "loss": 0.8392, + "step": 70 + }, + { + "epoch": 0.568, + "grad_norm": 1.1216400372050441, + "learning_rate": 0.0001683789671614107, + "loss": 0.78, + "step": 71 + }, + { + "epoch": 0.576, + "grad_norm": 0.9974595631972965, + "learning_rate": 0.00016742597799256182, + "loss": 0.6931, + "step": 72 + }, + { + "epoch": 0.584, + "grad_norm": 1.043035145267162, + "learning_rate": 0.00016646162589796615, + "loss": 0.7029, + "step": 73 + }, + { + "epoch": 0.592, + "grad_norm": 1.1202125958554328, + "learning_rate": 0.00016548607339452853, + "loss": 0.7526, + "step": 74 + }, + { + "epoch": 0.6, + "grad_norm": 1.124759105227289, + "learning_rate": 0.00016449948488669639, + "loss": 0.8025, + "step": 75 + }, + { + "epoch": 0.608, + "grad_norm": 1.1386547403182927, + "learning_rate": 0.00016350202663875386, + "loss": 0.8874, + "step": 76 + }, + { + "epoch": 0.616, + "grad_norm": 0.9988928534548326, + "learning_rate": 0.00016249386674680184, + "loss": 0.7016, + "step": 77 + }, + { + "epoch": 0.624, + "grad_norm": 1.2940115313890033, + "learning_rate": 0.0001614751751104301, + "loss": 0.8591, + "step": 78 + }, + { + "epoch": 0.632, + "grad_norm": 1.0315854341464359, + "learning_rate": 0.00016044612340408466, + "loss": 0.6896, + "step": 79 + }, + { + "epoch": 0.64, + "grad_norm": 1.109559558392284, + "learning_rate": 0.00015940688504813662, + "loss": 0.8844, + "step": 80 + }, + { + "epoch": 0.648, + "grad_norm": 1.115080148205073, + "learning_rate": 0.00015835763517965673, + "loss": 0.6882, + "step": 81 + }, + { + "epoch": 0.656, + "grad_norm": 1.1027828326759102, + "learning_rate": 0.00015729855062290022, + "loss": 0.8085, + "step": 82 + }, + { + "epoch": 0.664, + "grad_norm": 1.1463616078622434, + "learning_rate": 0.0001562298098595078, + "loss": 0.7364, + "step": 83 + }, + { + "epoch": 0.672, + "grad_norm": 1.1244532000255234, + "learning_rate": 0.00015515159299842707, + "loss": 0.781, + "step": 84 + }, + { + "epoch": 0.68, + "grad_norm": 1.011475799895498, + "learning_rate": 0.00015406408174555976, + "loss": 0.8474, + "step": 85 + }, + { + "epoch": 0.688, + "grad_norm": 1.0240992575029322, + "learning_rate": 0.00015296745937313987, + "loss": 0.7006, + "step": 86 + }, + { + "epoch": 0.696, + "grad_norm": 1.0456985935846164, + "learning_rate": 0.00015186191068884775, + "loss": 0.7034, + "step": 87 + }, + { + "epoch": 0.704, + "grad_norm": 1.038941847745798, + "learning_rate": 0.00015074762200466556, + "loss": 0.7024, + "step": 88 + }, + { + "epoch": 0.712, + "grad_norm": 1.0874554195537363, + "learning_rate": 0.00014962478110547918, + "loss": 0.7905, + "step": 89 + }, + { + "epoch": 0.72, + "grad_norm": 1.0707901639706023, + "learning_rate": 0.00014849357721743168, + "loss": 0.7269, + "step": 90 + }, + { + "epoch": 0.728, + "grad_norm": 0.9715185980838664, + "learning_rate": 0.0001473542009760343, + "loss": 0.6711, + "step": 91 + }, + { + "epoch": 0.736, + "grad_norm": 0.9913081098330498, + "learning_rate": 0.00014620684439403962, + "loss": 0.7484, + "step": 92 + }, + { + "epoch": 0.744, + "grad_norm": 1.1232772391486232, + "learning_rate": 0.0001450517008290827, + "loss": 0.9143, + "step": 93 + }, + { + "epoch": 0.752, + "grad_norm": 1.0817504121616406, + "learning_rate": 0.0001438889649510956, + "loss": 0.6649, + "step": 94 + }, + { + "epoch": 0.76, + "grad_norm": 1.0723926535058825, + "learning_rate": 0.00014271883270950073, + "loss": 0.7912, + "step": 95 + }, + { + "epoch": 0.768, + "grad_norm": 1.0967417138149718, + "learning_rate": 0.00014154150130018866, + "loss": 0.7739, + "step": 96 + }, + { + "epoch": 0.776, + "grad_norm": 1.1162816331090357, + "learning_rate": 0.00014035716913228568, + "loss": 0.7969, + "step": 97 + }, + { + "epoch": 0.784, + "grad_norm": 1.0409425412997686, + "learning_rate": 0.00013916603579471705, + "loss": 0.7703, + "step": 98 + }, + { + "epoch": 0.792, + "grad_norm": 1.1944482073063658, + "learning_rate": 0.0001379683020225714, + "loss": 0.7718, + "step": 99 + }, + { + "epoch": 0.8, + "grad_norm": 1.1102093067375531, + "learning_rate": 0.000136764169663272, + "loss": 0.8045, + "step": 100 + }, + { + "epoch": 0.808, + "grad_norm": 1.2340044538914852, + "learning_rate": 0.00013555384164256048, + "loss": 0.6847, + "step": 101 + }, + { + "epoch": 0.816, + "grad_norm": 1.0344458272008699, + "learning_rate": 0.00013433752193029886, + "loss": 0.6817, + "step": 102 + }, + { + "epoch": 0.824, + "grad_norm": 1.0626120506094445, + "learning_rate": 0.00013311541550609565, + "loss": 0.7149, + "step": 103 + }, + { + "epoch": 0.832, + "grad_norm": 1.1218393616590492, + "learning_rate": 0.00013188772832476188, + "loss": 0.7871, + "step": 104 + }, + { + "epoch": 0.84, + "grad_norm": 1.1197044459523418, + "learning_rate": 0.00013065466728160252, + "loss": 0.7283, + "step": 105 + }, + { + "epoch": 0.848, + "grad_norm": 1.0062988279593645, + "learning_rate": 0.00012941644017754964, + "loss": 0.7224, + "step": 106 + }, + { + "epoch": 0.856, + "grad_norm": 1.1044521804118692, + "learning_rate": 0.00012817325568414297, + "loss": 0.8117, + "step": 107 + }, + { + "epoch": 0.864, + "grad_norm": 1.0391694322668483, + "learning_rate": 0.00012692532330836346, + "loss": 0.7874, + "step": 108 + }, + { + "epoch": 0.872, + "grad_norm": 0.9011177202476485, + "learning_rate": 0.00012567285335732633, + "loss": 0.6216, + "step": 109 + }, + { + "epoch": 0.88, + "grad_norm": 1.033983706569211, + "learning_rate": 0.00012441605690283915, + "loss": 0.7362, + "step": 110 + }, + { + "epoch": 0.888, + "grad_norm": 1.0371409551989839, + "learning_rate": 0.00012315514574583113, + "loss": 0.6164, + "step": 111 + }, + { + "epoch": 0.896, + "grad_norm": 1.1031708450635234, + "learning_rate": 0.0001218903323806595, + "loss": 0.7865, + "step": 112 + }, + { + "epoch": 0.904, + "grad_norm": 1.0278240722304612, + "learning_rate": 0.00012062182995929882, + "loss": 0.7084, + "step": 113 + }, + { + "epoch": 0.912, + "grad_norm": 1.0344356319609227, + "learning_rate": 0.00011934985225541998, + "loss": 0.7945, + "step": 114 + }, + { + "epoch": 0.92, + "grad_norm": 0.9526235778560432, + "learning_rate": 0.0001180746136283638, + "loss": 0.7011, + "step": 115 + }, + { + "epoch": 0.928, + "grad_norm": 0.9870993105336784, + "learning_rate": 0.00011679632898701649, + "loss": 0.689, + "step": 116 + }, + { + "epoch": 0.936, + "grad_norm": 1.0630454299198857, + "learning_rate": 0.00011551521375359206, + "loss": 0.6496, + "step": 117 + }, + { + "epoch": 0.944, + "grad_norm": 1.0109882559812657, + "learning_rate": 0.00011423148382732853, + "loss": 0.6951, + "step": 118 + }, + { + "epoch": 0.952, + "grad_norm": 1.1715016413417012, + "learning_rate": 0.00011294535554810354, + "loss": 0.6781, + "step": 119 + }, + { + "epoch": 0.96, + "grad_norm": 1.2052720292878927, + "learning_rate": 0.00011165704565997593, + "loss": 0.8652, + "step": 120 + }, + { + "epoch": 0.968, + "grad_norm": 0.9966996386903084, + "learning_rate": 0.00011036677127465889, + "loss": 0.6521, + "step": 121 + }, + { + "epoch": 0.976, + "grad_norm": 0.9951818796352864, + "learning_rate": 0.00010907474983493144, + "loss": 0.7029, + "step": 122 + }, + { + "epoch": 0.984, + "grad_norm": 0.9368117058127309, + "learning_rate": 0.00010778119907799398, + "loss": 0.6698, + "step": 123 + }, + { + "epoch": 0.992, + "grad_norm": 0.9318100336553307, + "learning_rate": 0.0001064863369987743, + "loss": 0.7318, + "step": 124 + }, + { + "epoch": 1.0, + "grad_norm": 1.0983885346285733, + "learning_rate": 0.00010519038181318999, + "loss": 0.6213, + "step": 125 + }, + { + "epoch": 1.008, + "grad_norm": 0.7638747532597451, + "learning_rate": 0.00010389355192137377, + "loss": 0.3842, + "step": 126 + }, + { + "epoch": 1.016, + "grad_norm": 0.7793981616830645, + "learning_rate": 0.00010259606587086783, + "loss": 0.4023, + "step": 127 + }, + { + "epoch": 1.024, + "grad_norm": 0.6663909865377676, + "learning_rate": 0.0001012981423197931, + "loss": 0.3208, + "step": 128 + }, + { + "epoch": 1.032, + "grad_norm": 0.8427465859741629, + "learning_rate": 0.0001, + "loss": 0.3975, + "step": 129 + }, + { + "epoch": 1.04, + "grad_norm": 0.7866238132594401, + "learning_rate": 9.870185768020693e-05, + "loss": 0.3673, + "step": 130 + }, + { + "epoch": 1.048, + "grad_norm": 0.8616426327296077, + "learning_rate": 9.740393412913219e-05, + "loss": 0.3719, + "step": 131 + }, + { + "epoch": 1.056, + "grad_norm": 0.9456553559591211, + "learning_rate": 9.610644807862625e-05, + "loss": 0.3657, + "step": 132 + }, + { + "epoch": 1.064, + "grad_norm": 0.9086242195831249, + "learning_rate": 9.480961818681004e-05, + "loss": 0.3225, + "step": 133 + }, + { + "epoch": 1.072, + "grad_norm": 1.0747694362727902, + "learning_rate": 9.35136630012257e-05, + "loss": 0.3502, + "step": 134 + }, + { + "epoch": 1.08, + "grad_norm": 1.0596031466336995, + "learning_rate": 9.221880092200601e-05, + "loss": 0.3133, + "step": 135 + }, + { + "epoch": 1.088, + "grad_norm": 1.1388934802519084, + "learning_rate": 9.092525016506858e-05, + "loss": 0.3266, + "step": 136 + }, + { + "epoch": 1.096, + "grad_norm": 1.1907787848999825, + "learning_rate": 8.963322872534114e-05, + "loss": 0.38, + "step": 137 + }, + { + "epoch": 1.104, + "grad_norm": 1.1833920892982426, + "learning_rate": 8.83429543400241e-05, + "loss": 0.3654, + "step": 138 + }, + { + "epoch": 1.112, + "grad_norm": 0.932565927788482, + "learning_rate": 8.705464445189647e-05, + "loss": 0.3348, + "step": 139 + }, + { + "epoch": 1.12, + "grad_norm": 1.056948343348815, + "learning_rate": 8.57685161726715e-05, + "loss": 0.3325, + "step": 140 + }, + { + "epoch": 1.1280000000000001, + "grad_norm": 1.1116858198987372, + "learning_rate": 8.448478624640797e-05, + "loss": 0.3399, + "step": 141 + }, + { + "epoch": 1.1360000000000001, + "grad_norm": 0.9487153714016062, + "learning_rate": 8.320367101298351e-05, + "loss": 0.2648, + "step": 142 + }, + { + "epoch": 1.144, + "grad_norm": 0.987507461623801, + "learning_rate": 8.192538637163621e-05, + "loss": 0.2793, + "step": 143 + }, + { + "epoch": 1.152, + "grad_norm": 1.0489433838473108, + "learning_rate": 8.065014774458003e-05, + "loss": 0.3665, + "step": 144 + }, + { + "epoch": 1.16, + "grad_norm": 1.0030902972919105, + "learning_rate": 7.93781700407012e-05, + "loss": 0.3552, + "step": 145 + }, + { + "epoch": 1.168, + "grad_norm": 0.8904106592566092, + "learning_rate": 7.810966761934053e-05, + "loss": 0.2711, + "step": 146 + }, + { + "epoch": 1.176, + "grad_norm": 0.9706926884829528, + "learning_rate": 7.684485425416888e-05, + "loss": 0.3373, + "step": 147 + }, + { + "epoch": 1.184, + "grad_norm": 0.9389335945233177, + "learning_rate": 7.558394309716088e-05, + "loss": 0.3055, + "step": 148 + }, + { + "epoch": 1.192, + "grad_norm": 0.9872080557866764, + "learning_rate": 7.432714664267373e-05, + "loss": 0.3001, + "step": 149 + }, + { + "epoch": 1.2, + "grad_norm": 0.9094267656601626, + "learning_rate": 7.307467669163655e-05, + "loss": 0.2747, + "step": 150 + }, + { + "epoch": 1.208, + "grad_norm": 1.5104293946560028, + "learning_rate": 7.182674431585704e-05, + "loss": 0.4515, + "step": 151 + }, + { + "epoch": 1.216, + "grad_norm": 0.9503009794649055, + "learning_rate": 7.058355982245037e-05, + "loss": 0.3233, + "step": 152 + }, + { + "epoch": 1.224, + "grad_norm": 0.8142713969471524, + "learning_rate": 6.934533271839752e-05, + "loss": 0.2838, + "step": 153 + }, + { + "epoch": 1.232, + "grad_norm": 0.8935891149236612, + "learning_rate": 6.811227167523815e-05, + "loss": 0.267, + "step": 154 + }, + { + "epoch": 1.24, + "grad_norm": 1.2671981198957942, + "learning_rate": 6.688458449390437e-05, + "loss": 0.3911, + "step": 155 + }, + { + "epoch": 1.248, + "grad_norm": 0.9857137845834811, + "learning_rate": 6.566247806970119e-05, + "loss": 0.3482, + "step": 156 + }, + { + "epoch": 1.256, + "grad_norm": 0.8992426112369553, + "learning_rate": 6.444615835743955e-05, + "loss": 0.276, + "step": 157 + }, + { + "epoch": 1.264, + "grad_norm": 0.8690897102819473, + "learning_rate": 6.323583033672799e-05, + "loss": 0.2812, + "step": 158 + }, + { + "epoch": 1.272, + "grad_norm": 1.0592691387703361, + "learning_rate": 6.203169797742861e-05, + "loss": 0.3428, + "step": 159 + }, + { + "epoch": 1.28, + "grad_norm": 1.1066763115172018, + "learning_rate": 6.083396420528298e-05, + "loss": 0.3572, + "step": 160 + }, + { + "epoch": 1.288, + "grad_norm": 1.01859697464143, + "learning_rate": 5.964283086771435e-05, + "loss": 0.3142, + "step": 161 + }, + { + "epoch": 1.296, + "grad_norm": 1.1019021725049405, + "learning_rate": 5.845849869981137e-05, + "loss": 0.4041, + "step": 162 + }, + { + "epoch": 1.304, + "grad_norm": 0.9615876490141757, + "learning_rate": 5.728116729049928e-05, + "loss": 0.3092, + "step": 163 + }, + { + "epoch": 1.312, + "grad_norm": 0.9397205153627258, + "learning_rate": 5.611103504890444e-05, + "loss": 0.3022, + "step": 164 + }, + { + "epoch": 1.32, + "grad_norm": 1.1842470031836703, + "learning_rate": 5.4948299170917325e-05, + "loss": 0.3963, + "step": 165 + }, + { + "epoch": 1.328, + "grad_norm": 1.029661373530674, + "learning_rate": 5.379315560596038e-05, + "loss": 0.33, + "step": 166 + }, + { + "epoch": 1.336, + "grad_norm": 0.8341974846050139, + "learning_rate": 5.26457990239657e-05, + "loss": 0.2993, + "step": 167 + }, + { + "epoch": 1.3439999999999999, + "grad_norm": 0.9528163807665674, + "learning_rate": 5.1506422782568345e-05, + "loss": 0.331, + "step": 168 + }, + { + "epoch": 1.3519999999999999, + "grad_norm": 0.8127656540497246, + "learning_rate": 5.0375218894520834e-05, + "loss": 0.27, + "step": 169 + }, + { + "epoch": 1.3599999999999999, + "grad_norm": 0.815368869193228, + "learning_rate": 4.9252377995334444e-05, + "loss": 0.3123, + "step": 170 + }, + { + "epoch": 1.3679999999999999, + "grad_norm": 0.9341793171721062, + "learning_rate": 4.813808931115228e-05, + "loss": 0.3241, + "step": 171 + }, + { + "epoch": 1.376, + "grad_norm": 0.8406613326246486, + "learning_rate": 4.703254062686017e-05, + "loss": 0.2474, + "step": 172 + }, + { + "epoch": 1.384, + "grad_norm": 0.9014918983900241, + "learning_rate": 4.593591825444028e-05, + "loss": 0.2941, + "step": 173 + }, + { + "epoch": 1.392, + "grad_norm": 1.052110080300373, + "learning_rate": 4.484840700157295e-05, + "loss": 0.3315, + "step": 174 + }, + { + "epoch": 1.4, + "grad_norm": 0.943872054660999, + "learning_rate": 4.377019014049223e-05, + "loss": 0.3199, + "step": 175 + }, + { + "epoch": 1.408, + "grad_norm": 1.0067114411249958, + "learning_rate": 4.270144937709981e-05, + "loss": 0.3588, + "step": 176 + }, + { + "epoch": 1.416, + "grad_norm": 0.8629665177436001, + "learning_rate": 4.164236482034327e-05, + "loss": 0.2606, + "step": 177 + }, + { + "epoch": 1.424, + "grad_norm": 0.8343441015042404, + "learning_rate": 4.059311495186338e-05, + "loss": 0.255, + "step": 178 + }, + { + "epoch": 1.432, + "grad_norm": 1.048342053248487, + "learning_rate": 3.9553876595915375e-05, + "loss": 0.3247, + "step": 179 + }, + { + "epoch": 1.44, + "grad_norm": 0.9714346890332698, + "learning_rate": 3.852482488956992e-05, + "loss": 0.2662, + "step": 180 + }, + { + "epoch": 1.448, + "grad_norm": 0.9744546187297268, + "learning_rate": 3.750613325319817e-05, + "loss": 0.2609, + "step": 181 + }, + { + "epoch": 1.456, + "grad_norm": 0.992170075556931, + "learning_rate": 3.649797336124615e-05, + "loss": 0.2967, + "step": 182 + }, + { + "epoch": 1.464, + "grad_norm": 0.9207170612761674, + "learning_rate": 3.550051511330361e-05, + "loss": 0.2764, + "step": 183 + }, + { + "epoch": 1.472, + "grad_norm": 0.987847549650391, + "learning_rate": 3.45139266054715e-05, + "loss": 0.2718, + "step": 184 + }, + { + "epoch": 1.48, + "grad_norm": 0.9576257344728829, + "learning_rate": 3.3538374102033866e-05, + "loss": 0.3137, + "step": 185 + }, + { + "epoch": 1.488, + "grad_norm": 0.8814189665237778, + "learning_rate": 3.257402200743821e-05, + "loss": 0.2281, + "step": 186 + }, + { + "epoch": 1.496, + "grad_norm": 0.9917324983590853, + "learning_rate": 3.1621032838589305e-05, + "loss": 0.2718, + "step": 187 + }, + { + "epoch": 1.504, + "grad_norm": 0.9523929333349113, + "learning_rate": 3.0679567197461134e-05, + "loss": 0.3222, + "step": 188 + }, + { + "epoch": 1.512, + "grad_norm": 1.3965148113166408, + "learning_rate": 2.974978374403147e-05, + "loss": 0.3282, + "step": 189 + }, + { + "epoch": 1.52, + "grad_norm": 1.0212998172537535, + "learning_rate": 2.8831839169543996e-05, + "loss": 0.3197, + "step": 190 + }, + { + "epoch": 1.528, + "grad_norm": 0.9320938412848552, + "learning_rate": 2.7925888170101665e-05, + "loss": 0.2568, + "step": 191 + }, + { + "epoch": 1.536, + "grad_norm": 1.0038773999662451, + "learning_rate": 2.7032083420597e-05, + "loss": 0.3048, + "step": 192 + }, + { + "epoch": 1.544, + "grad_norm": 1.067071275673017, + "learning_rate": 2.6150575548982292e-05, + "loss": 0.2752, + "step": 193 + }, + { + "epoch": 1.552, + "grad_norm": 1.106070124000284, + "learning_rate": 2.528151311088537e-05, + "loss": 0.3095, + "step": 194 + }, + { + "epoch": 1.56, + "grad_norm": 1.011696313071652, + "learning_rate": 2.4425042564574184e-05, + "loss": 0.2762, + "step": 195 + }, + { + "epoch": 1.568, + "grad_norm": 0.9236563136547, + "learning_rate": 2.3581308246275103e-05, + "loss": 0.2892, + "step": 196 + }, + { + "epoch": 1.576, + "grad_norm": 1.0255063848944046, + "learning_rate": 2.2750452345848682e-05, + "loss": 0.3077, + "step": 197 + }, + { + "epoch": 1.584, + "grad_norm": 0.9782082431449886, + "learning_rate": 2.1932614882827197e-05, + "loss": 0.2852, + "step": 198 + }, + { + "epoch": 1.592, + "grad_norm": 0.8326643844084437, + "learning_rate": 2.112793368281799e-05, + "loss": 0.2342, + "step": 199 + }, + { + "epoch": 1.6, + "grad_norm": 1.5900666578407399, + "learning_rate": 2.03365443542764e-05, + "loss": 0.3175, + "step": 200 + }, + { + "epoch": 1.608, + "grad_norm": 1.0190552209754948, + "learning_rate": 1.9558580265652448e-05, + "loss": 0.3258, + "step": 201 + }, + { + "epoch": 1.616, + "grad_norm": 0.8932354211021565, + "learning_rate": 1.879417252291502e-05, + "loss": 0.2457, + "step": 202 + }, + { + "epoch": 1.624, + "grad_norm": 1.0784167744092246, + "learning_rate": 1.804344994745727e-05, + "loss": 0.3324, + "step": 203 + }, + { + "epoch": 1.6320000000000001, + "grad_norm": 1.2975319869241422, + "learning_rate": 1.730653905438714e-05, + "loss": 0.3326, + "step": 204 + }, + { + "epoch": 1.6400000000000001, + "grad_norm": 1.0722622793694678, + "learning_rate": 1.6583564031206357e-05, + "loss": 0.2942, + "step": 205 + }, + { + "epoch": 1.6480000000000001, + "grad_norm": 0.9268908762977555, + "learning_rate": 1.587464671688187e-05, + "loss": 0.2885, + "step": 206 + }, + { + "epoch": 1.6560000000000001, + "grad_norm": 0.975540228005638, + "learning_rate": 1.5179906581313064e-05, + "loss": 0.3475, + "step": 207 + }, + { + "epoch": 1.6640000000000001, + "grad_norm": 1.2947356002550057, + "learning_rate": 1.4499460705197998e-05, + "loss": 0.3367, + "step": 208 + }, + { + "epoch": 1.6720000000000002, + "grad_norm": 0.9969017308710312, + "learning_rate": 1.3833423760302611e-05, + "loss": 0.2799, + "step": 209 + }, + { + "epoch": 1.6800000000000002, + "grad_norm": 1.105832602470686, + "learning_rate": 1.3181907990135622e-05, + "loss": 0.3545, + "step": 210 + }, + { + "epoch": 1.688, + "grad_norm": 1.008270175667529, + "learning_rate": 1.2545023191032801e-05, + "loss": 0.3062, + "step": 211 + }, + { + "epoch": 1.696, + "grad_norm": 1.047066942797903, + "learning_rate": 1.1922876693653585e-05, + "loss": 0.3305, + "step": 212 + }, + { + "epoch": 1.704, + "grad_norm": 1.084646274515253, + "learning_rate": 1.131557334489326e-05, + "loss": 0.3136, + "step": 213 + }, + { + "epoch": 1.712, + "grad_norm": 0.9947790374688665, + "learning_rate": 1.0723215490213634e-05, + "loss": 0.3364, + "step": 214 + }, + { + "epoch": 1.72, + "grad_norm": 0.8940746388885817, + "learning_rate": 1.0145902956395447e-05, + "loss": 0.3075, + "step": 215 + }, + { + "epoch": 1.728, + "grad_norm": 0.8710247100820544, + "learning_rate": 9.583733034714981e-06, + "loss": 0.2787, + "step": 216 + }, + { + "epoch": 1.736, + "grad_norm": 0.8808640303308677, + "learning_rate": 9.036800464548157e-06, + "loss": 0.2776, + "step": 217 + }, + { + "epoch": 1.744, + "grad_norm": 0.9007324930455055, + "learning_rate": 8.505197417404687e-06, + "loss": 0.2931, + "step": 218 + }, + { + "epoch": 1.752, + "grad_norm": 1.3360245358851073, + "learning_rate": 7.989013481394814e-06, + "loss": 0.3635, + "step": 219 + }, + { + "epoch": 1.76, + "grad_norm": 0.8662064286590857, + "learning_rate": 7.488335646131628e-06, + "loss": 0.2964, + "step": 220 + }, + { + "epoch": 1.768, + "grad_norm": 1.0915529365558145, + "learning_rate": 7.003248288071118e-06, + "loss": 0.3648, + "step": 221 + }, + { + "epoch": 1.776, + "grad_norm": 0.9202789725816855, + "learning_rate": 6.533833156292679e-06, + "loss": 0.2857, + "step": 222 + }, + { + "epoch": 1.784, + "grad_norm": 0.9318116552878593, + "learning_rate": 6.08016935872251e-06, + "loss": 0.2843, + "step": 223 + }, + { + "epoch": 1.792, + "grad_norm": 0.8494920756416293, + "learning_rate": 5.6423333488018095e-06, + "loss": 0.2712, + "step": 224 + }, + { + "epoch": 1.8, + "grad_norm": 0.8797892336415777, + "learning_rate": 5.22039891260262e-06, + "loss": 0.2812, + "step": 225 + }, + { + "epoch": 1.808, + "grad_norm": 0.8760823018514744, + "learning_rate": 4.8144371563930476e-06, + "loss": 0.2645, + "step": 226 + }, + { + "epoch": 1.8159999999999998, + "grad_norm": 0.9572331290852502, + "learning_rate": 4.424516494654118e-06, + "loss": 0.2643, + "step": 227 + }, + { + "epoch": 1.8239999999999998, + "grad_norm": 1.0584565777951243, + "learning_rate": 4.050702638550275e-06, + "loss": 0.2744, + "step": 228 + }, + { + "epoch": 1.8319999999999999, + "grad_norm": 0.9829527904030954, + "learning_rate": 3.693058584855369e-06, + "loss": 0.3071, + "step": 229 + }, + { + "epoch": 1.8399999999999999, + "grad_norm": 1.0086221717790416, + "learning_rate": 3.3516446053363015e-06, + "loss": 0.3151, + "step": 230 + }, + { + "epoch": 1.8479999999999999, + "grad_norm": 0.9037186453744863, + "learning_rate": 3.026518236595621e-06, + "loss": 0.2567, + "step": 231 + }, + { + "epoch": 1.8559999999999999, + "grad_norm": 1.0019997239355016, + "learning_rate": 2.717734270375272e-06, + "loss": 0.3082, + "step": 232 + }, + { + "epoch": 1.8639999999999999, + "grad_norm": 0.882894828622119, + "learning_rate": 2.4253447443228106e-06, + "loss": 0.2677, + "step": 233 + }, + { + "epoch": 1.8719999999999999, + "grad_norm": 0.8996565449715206, + "learning_rate": 2.1493989332218468e-06, + "loss": 0.2796, + "step": 234 + }, + { + "epoch": 1.88, + "grad_norm": 0.9089579248416909, + "learning_rate": 1.8899433406879608e-06, + "loss": 0.2515, + "step": 235 + }, + { + "epoch": 1.888, + "grad_norm": 0.9685412253870014, + "learning_rate": 1.6470216913317626e-06, + "loss": 0.2778, + "step": 236 + }, + { + "epoch": 1.896, + "grad_norm": 0.8996738035578177, + "learning_rate": 1.4206749233902084e-06, + "loss": 0.2506, + "step": 237 + }, + { + "epoch": 1.904, + "grad_norm": 0.9475958329418473, + "learning_rate": 1.2109411818274852e-06, + "loss": 0.2694, + "step": 238 + }, + { + "epoch": 1.912, + "grad_norm": 0.8885136392291223, + "learning_rate": 1.0178558119067315e-06, + "loss": 0.3436, + "step": 239 + }, + { + "epoch": 1.92, + "grad_norm": 0.8280334345400172, + "learning_rate": 8.41451353233369e-07, + "loss": 0.2436, + "step": 240 + }, + { + "epoch": 1.928, + "grad_norm": 1.1115438568448008, + "learning_rate": 6.817575342714988e-07, + "loss": 0.2731, + "step": 241 + }, + { + "epoch": 1.936, + "grad_norm": 0.8211660429326406, + "learning_rate": 5.388012673338661e-07, + "loss": 0.2471, + "step": 242 + }, + { + "epoch": 1.944, + "grad_norm": 0.8498742119952564, + "learning_rate": 4.126066440464982e-07, + "loss": 0.2561, + "step": 243 + }, + { + "epoch": 1.952, + "grad_norm": 1.0168713887533058, + "learning_rate": 3.0319493128866396e-07, + "loss": 0.2906, + "step": 244 + }, + { + "epoch": 1.96, + "grad_norm": 0.9691850112649262, + "learning_rate": 2.1058456760891798e-07, + "loss": 0.3031, + "step": 245 + }, + { + "epoch": 1.968, + "grad_norm": 0.9259385068622227, + "learning_rate": 1.3479116011769767e-07, + "loss": 0.2394, + "step": 246 + }, + { + "epoch": 1.976, + "grad_norm": 0.9391835901797269, + "learning_rate": 7.582748185719358e-08, + "loss": 0.2998, + "step": 247 + }, + { + "epoch": 1.984, + "grad_norm": 1.0287335651544172, + "learning_rate": 3.370346964876036e-08, + "loss": 0.2866, + "step": 248 + }, + { + "epoch": 1.992, + "grad_norm": 1.0000330802426152, + "learning_rate": 8.426222418311814e-09, + "loss": 0.3155, + "step": 249 + }, + { + "epoch": 2.0, + "grad_norm": 1.0246789859149088, + "learning_rate": 0.0, + "loss": 0.3072, + "step": 250 + }, + { + "epoch": 2.0, + "step": 250, + "total_flos": 32940892651520.0, + "train_loss": 0.5541693149805069, + "train_runtime": 2079.3998, + "train_samples_per_second": 1.924, + "train_steps_per_second": 0.12 + } + ], + "logging_steps": 1.0, + "max_steps": 250, + "num_input_tokens_seen": 0, + "num_train_epochs": 2, + "save_steps": 500, + "stateful_callbacks": { + "TrainerControl": { + "args": { + "should_epoch_stop": false, + "should_evaluate": false, + "should_log": false, + "should_save": false, + "should_training_stop": false + }, + "attributes": {} + } + }, + "total_flos": 32940892651520.0, + "train_batch_size": 8, + "trial_name": null, + "trial_params": null +} diff --git a/single_dataset/short_caption/VideoGameBunny_v1_1-Llama-3-8B-V-short_caption_dataset_5000_epochs_1_lora/README.md b/single_dataset/short_caption/VideoGameBunny_v1_1-Llama-3-8B-V-short_caption_dataset_5000_epochs_1_lora/README.md new file mode 100644 index 0000000000000000000000000000000000000000..4f291d8d6fdb82fcff274f83bcc52b834347797c --- /dev/null +++ b/single_dataset/short_caption/VideoGameBunny_v1_1-Llama-3-8B-V-short_caption_dataset_5000_epochs_1_lora/README.md @@ -0,0 +1,202 @@ +--- +library_name: peft +base_model: ./weights/Bunny-v1_1-Llama-3-8B-V +--- + +# Model Card for Model ID + + + + + +## Model Details + +### Model Description + + + + + +- **Developed by:** [More Information Needed] +- **Funded by [optional]:** [More Information Needed] +- **Shared by [optional]:** [More Information Needed] +- **Model type:** [More Information Needed] +- **Language(s) (NLP):** [More Information Needed] +- **License:** [More Information Needed] +- **Finetuned from model [optional]:** [More Information Needed] + +### Model Sources [optional] + + + +- **Repository:** [More Information Needed] +- **Paper [optional]:** [More Information Needed] +- **Demo [optional]:** [More Information Needed] + +## Uses + + + +### Direct Use + + + +[More Information Needed] + +### Downstream Use [optional] + + + +[More Information Needed] + +### Out-of-Scope Use + + + +[More Information Needed] + +## Bias, Risks, and Limitations + + + +[More Information Needed] + +### Recommendations + + + +Users (both direct and downstream) should be made aware of the risks, biases and limitations of the model. More information needed for further recommendations. + +## How to Get Started with the Model + +Use the code below to get started with the model. + +[More Information Needed] + +## Training Details + +### Training Data + + + +[More Information Needed] + +### Training Procedure + + + +#### Preprocessing [optional] + +[More Information Needed] + + +#### Training Hyperparameters + +- **Training regime:** [More Information Needed] + +#### Speeds, Sizes, Times [optional] + + + +[More Information Needed] + +## Evaluation + + + +### Testing Data, Factors & Metrics + +#### Testing Data + + + +[More Information Needed] + +#### Factors + + + +[More Information Needed] + +#### Metrics + + + +[More Information Needed] + +### Results + +[More Information Needed] + +#### Summary + + + +## Model Examination [optional] + + + +[More Information Needed] + +## Environmental Impact + + + +Carbon emissions can be estimated using the [Machine Learning Impact calculator](https://mlco2.github.io/impact#compute) presented in [Lacoste et al. (2019)](https://arxiv.org/abs/1910.09700). + +- **Hardware Type:** [More Information Needed] +- **Hours used:** [More Information Needed] +- **Cloud Provider:** [More Information Needed] +- **Compute Region:** [More Information Needed] +- **Carbon Emitted:** [More Information Needed] + +## Technical Specifications [optional] + +### Model Architecture and Objective + +[More Information Needed] + +### Compute Infrastructure + +[More Information Needed] + +#### Hardware + +[More Information Needed] + +#### Software + +[More Information Needed] + +## Citation [optional] + + + +**BibTeX:** + +[More Information Needed] + +**APA:** + +[More Information Needed] + +## Glossary [optional] + + + +[More Information Needed] + +## More Information [optional] + +[More Information Needed] + +## Model Card Authors [optional] + +[More Information Needed] + +## Model Card Contact + +[More Information Needed] +### Framework versions + +- PEFT 0.11.1 \ No newline at end of file diff --git a/single_dataset/short_caption/VideoGameBunny_v1_1-Llama-3-8B-V-short_caption_dataset_5000_epochs_1_lora/adapter_config.json b/single_dataset/short_caption/VideoGameBunny_v1_1-Llama-3-8B-V-short_caption_dataset_5000_epochs_1_lora/adapter_config.json new file mode 100644 index 0000000000000000000000000000000000000000..8a88111f02050b3f7366d4ed8a16f98ab5418837 --- /dev/null +++ b/single_dataset/short_caption/VideoGameBunny_v1_1-Llama-3-8B-V-short_caption_dataset_5000_epochs_1_lora/adapter_config.json @@ -0,0 +1,34 @@ +{ + "alpha_pattern": {}, + "auto_mapping": null, + "base_model_name_or_path": "./weights/Bunny-v1_1-Llama-3-8B-V", + "bias": "none", + "fan_in_fan_out": false, + "inference_mode": true, + "init_lora_weights": true, + "layer_replication": null, + "layers_pattern": null, + "layers_to_transform": null, + "loftq_config": {}, + "lora_alpha": 256, + "lora_dropout": 0.1, + "megatron_config": null, + "megatron_core": "megatron.core", + "modules_to_save": null, + "peft_type": "LORA", + "r": 128, + "rank_pattern": {}, + "revision": null, + "target_modules": [ + "up_proj", + "k_proj", + "down_proj", + "gate_proj", + "v_proj", + "q_proj", + "o_proj" + ], + "task_type": "CAUSAL_LM", + "use_dora": false, + "use_rslora": false +} \ No newline at end of file diff --git a/single_dataset/short_caption/VideoGameBunny_v1_1-Llama-3-8B-V-short_caption_dataset_5000_epochs_1_lora/adapter_model.safetensors b/single_dataset/short_caption/VideoGameBunny_v1_1-Llama-3-8B-V-short_caption_dataset_5000_epochs_1_lora/adapter_model.safetensors new file mode 100644 index 0000000000000000000000000000000000000000..ae265615db8c3b07f89e7c4a0db61fc6a2a02412 --- /dev/null +++ b/single_dataset/short_caption/VideoGameBunny_v1_1-Llama-3-8B-V-short_caption_dataset_5000_epochs_1_lora/adapter_model.safetensors @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:4375ea8b2c76c461b0cf4df969503d14eef4bc3ada7e63ff213d15486fda9e6e +size 671150064 diff --git a/single_dataset/short_caption/VideoGameBunny_v1_1-Llama-3-8B-V-short_caption_dataset_5000_epochs_1_lora/config.json b/single_dataset/short_caption/VideoGameBunny_v1_1-Llama-3-8B-V-short_caption_dataset_5000_epochs_1_lora/config.json new file mode 100644 index 0000000000000000000000000000000000000000..ee7ed86f4ff6f3381a6bb24981934be89aa376e7 --- /dev/null +++ b/single_dataset/short_caption/VideoGameBunny_v1_1-Llama-3-8B-V-short_caption_dataset_5000_epochs_1_lora/config.json @@ -0,0 +1,45 @@ +{ + "_name_or_path": "./weights/Bunny-v1_1-Llama-3-8B-V", + "architectures": [ + "BunnyLlamaForCausalLM" + ], + "attention_bias": false, + "attention_dropout": 0.0, + "auto_map": { + "AutoConfig": "configuration_bunny_llama.BunnyLlamaConfig", + "AutoModelForCausalLM": "modeling_bunny_llama.BunnyLlamaForCausalLM" + }, + "bos_token_id": 128000, + "continuous_training": false, + "eos_token_id": 128001, + "freeze_mm_mlp_adapter": false, + "hidden_act": "silu", + "hidden_size": 4096, + "image_aspect_ratio": "pad", + "initializer_range": 0.02, + "intermediate_size": 14336, + "max_position_embeddings": 8192, + "mm_hidden_size": 3456, + "mm_projector_lr": null, + "mm_projector_type": "mlp2x_gelu", + "mm_vision_tower": "./weights/siglip-so400m-patch14-384", + "model_type": "bunny-llama", + "num_attention_heads": 32, + "num_hidden_layers": 32, + "num_key_value_heads": 8, + "pretraining_tp": 1, + "rms_norm_eps": 1e-05, + "rope_scaling": null, + "rope_theta": 500000.0, + "tie_word_embeddings": false, + "tokenizer_model_max_length": 2048, + "tokenizer_padding_side": "right", + "torch_dtype": "float16", + "transformers_version": "4.41.2", + "tune_mm_mlp_adapter": false, + "unfreeze_vision_tower": true, + "use_cache": true, + "use_mm_proj": true, + "use_s2": true, + "vocab_size": 128256 +} diff --git a/single_dataset/short_caption/VideoGameBunny_v1_1-Llama-3-8B-V-short_caption_dataset_5000_epochs_1_lora/non_lora_trainables.bin b/single_dataset/short_caption/VideoGameBunny_v1_1-Llama-3-8B-V-short_caption_dataset_5000_epochs_1_lora/non_lora_trainables.bin new file mode 100644 index 0000000000000000000000000000000000000000..508fe400ff01b922df556ef0dc69306574677869 --- /dev/null +++ b/single_dataset/short_caption/VideoGameBunny_v1_1-Llama-3-8B-V-short_caption_dataset_5000_epochs_1_lora/non_lora_trainables.bin @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:3a1255eca8de6f9224f95a6ba5909cfd0ebefaf1d5ab548019e86fb7d0f3121b +size 918507402 diff --git a/single_dataset/short_caption/VideoGameBunny_v1_1-Llama-3-8B-V-short_caption_dataset_5000_epochs_1_lora/trainer_state.json b/single_dataset/short_caption/VideoGameBunny_v1_1-Llama-3-8B-V-short_caption_dataset_5000_epochs_1_lora/trainer_state.json new file mode 100644 index 0000000000000000000000000000000000000000..64ba627bbd9831927d2eb27f733c50e194d890e9 --- /dev/null +++ b/single_dataset/short_caption/VideoGameBunny_v1_1-Llama-3-8B-V-short_caption_dataset_5000_epochs_1_lora/trainer_state.json @@ -0,0 +1,2226 @@ +{ + "best_metric": null, + "best_model_checkpoint": null, + "epoch": 0.9984, + "eval_steps": 500, + "global_step": 312, + "is_hyper_param_search": false, + "is_local_process_zero": true, + "is_world_process_zero": true, + "log_history": [ + { + "epoch": 0.0032, + "grad_norm": 3.8494427382984977, + "learning_rate": 2e-05, + "loss": 1.5206, + "step": 1 + }, + { + "epoch": 0.0064, + "grad_norm": 4.452605449564811, + "learning_rate": 4e-05, + "loss": 1.5109, + "step": 2 + }, + { + "epoch": 0.0096, + "grad_norm": 2.2827512458595445, + "learning_rate": 6e-05, + "loss": 1.3696, + "step": 3 + }, + { + "epoch": 0.0128, + "grad_norm": 1.9005110908633476, + "learning_rate": 8e-05, + "loss": 1.1741, + "step": 4 + }, + { + "epoch": 0.016, + "grad_norm": 1.4910329120321058, + "learning_rate": 0.0001, + "loss": 0.9923, + "step": 5 + }, + { + "epoch": 0.0192, + "grad_norm": 1.88035086322046, + "learning_rate": 0.00012, + "loss": 0.9127, + "step": 6 + }, + { + "epoch": 0.0224, + "grad_norm": 2.042098191915081, + "learning_rate": 0.00014, + "loss": 0.9648, + "step": 7 + }, + { + "epoch": 0.0256, + "grad_norm": 1.7233770983581165, + "learning_rate": 0.00016, + "loss": 0.9461, + "step": 8 + }, + { + "epoch": 0.0288, + "grad_norm": 1.449197633443149, + "learning_rate": 0.00018, + "loss": 0.836, + "step": 9 + }, + { + "epoch": 0.032, + "grad_norm": 1.2900359918295659, + "learning_rate": 0.0002, + "loss": 0.876, + "step": 10 + }, + { + "epoch": 0.0352, + "grad_norm": 1.4852099226013014, + "learning_rate": 0.00019999458931878073, + "loss": 0.8284, + "step": 11 + }, + { + "epoch": 0.0384, + "grad_norm": 1.3889814325040792, + "learning_rate": 0.0001999783578606323, + "loss": 0.8337, + "step": 12 + }, + { + "epoch": 0.0416, + "grad_norm": 1.5064544417143808, + "learning_rate": 0.00019995130738201966, + "loss": 0.9325, + "step": 13 + }, + { + "epoch": 0.0448, + "grad_norm": 1.3764770861409006, + "learning_rate": 0.0001999134408101731, + "loss": 0.7912, + "step": 14 + }, + { + "epoch": 0.048, + "grad_norm": 1.407685959814556, + "learning_rate": 0.00019986476224277165, + "loss": 0.8235, + "step": 15 + }, + { + "epoch": 0.0512, + "grad_norm": 1.419892646905105, + "learning_rate": 0.00019980527694749952, + "loss": 0.9167, + "step": 16 + }, + { + "epoch": 0.0544, + "grad_norm": 1.2542459367172696, + "learning_rate": 0.00019973499136147606, + "loss": 0.7894, + "step": 17 + }, + { + "epoch": 0.0576, + "grad_norm": 1.5053470324097624, + "learning_rate": 0.0001996539130905593, + "loss": 0.9018, + "step": 18 + }, + { + "epoch": 0.0608, + "grad_norm": 1.4266643857980994, + "learning_rate": 0.0001995620509085228, + "loss": 0.8374, + "step": 19 + }, + { + "epoch": 0.064, + "grad_norm": 1.4513379128350314, + "learning_rate": 0.00019945941475610623, + "loss": 0.9682, + "step": 20 + }, + { + "epoch": 0.0672, + "grad_norm": 1.437867162646637, + "learning_rate": 0.0001993460157399396, + "loss": 0.802, + "step": 21 + }, + { + "epoch": 0.0704, + "grad_norm": 1.2646231736967548, + "learning_rate": 0.0001992218661313415, + "loss": 0.7591, + "step": 22 + }, + { + "epoch": 0.0736, + "grad_norm": 1.2760649437008011, + "learning_rate": 0.00019908697936499103, + "loss": 0.8405, + "step": 23 + }, + { + "epoch": 0.0768, + "grad_norm": 1.3244564743716525, + "learning_rate": 0.00019894137003747403, + "loss": 0.8056, + "step": 24 + }, + { + "epoch": 0.08, + "grad_norm": 1.3781336157488413, + "learning_rate": 0.00019878505390570362, + "loss": 0.7843, + "step": 25 + }, + { + "epoch": 0.0832, + "grad_norm": 1.2349066076612103, + "learning_rate": 0.00019861804788521493, + "loss": 0.8032, + "step": 26 + }, + { + "epoch": 0.0864, + "grad_norm": 1.4471220778165244, + "learning_rate": 0.00019844037004833473, + "loss": 0.8885, + "step": 27 + }, + { + "epoch": 0.0896, + "grad_norm": 1.1706339840369515, + "learning_rate": 0.00019825203962222572, + "loss": 0.8026, + "step": 28 + }, + { + "epoch": 0.0928, + "grad_norm": 1.2842245444466585, + "learning_rate": 0.0001980530769868059, + "loss": 0.7821, + "step": 29 + }, + { + "epoch": 0.096, + "grad_norm": 1.1574449208863158, + "learning_rate": 0.00019784350367254322, + "loss": 0.7997, + "step": 30 + }, + { + "epoch": 0.0992, + "grad_norm": 1.4393641501848962, + "learning_rate": 0.0001976233423581255, + "loss": 0.8374, + "step": 31 + }, + { + "epoch": 0.1024, + "grad_norm": 1.1022637704133273, + "learning_rate": 0.0001973926168680066, + "loss": 0.691, + "step": 32 + }, + { + "epoch": 0.1056, + "grad_norm": 1.2390416489286467, + "learning_rate": 0.00019715135216982798, + "loss": 0.8228, + "step": 33 + }, + { + "epoch": 0.1088, + "grad_norm": 1.2723065693593631, + "learning_rate": 0.0001968995743717171, + "loss": 0.9256, + "step": 34 + }, + { + "epoch": 0.112, + "grad_norm": 1.3320103811934423, + "learning_rate": 0.00019663731071946206, + "loss": 0.9903, + "step": 35 + }, + { + "epoch": 0.1152, + "grad_norm": 1.1082008197953372, + "learning_rate": 0.00019636458959356316, + "loss": 0.7964, + "step": 36 + }, + { + "epoch": 0.1184, + "grad_norm": 1.198571527552164, + "learning_rate": 0.0001960814405061619, + "loss": 0.788, + "step": 37 + }, + { + "epoch": 0.1216, + "grad_norm": 1.0499439896228493, + "learning_rate": 0.00019578789409784727, + "loss": 0.6915, + "step": 38 + }, + { + "epoch": 0.1248, + "grad_norm": 1.2787423292037434, + "learning_rate": 0.00019548398213434007, + "loss": 0.8774, + "step": 39 + }, + { + "epoch": 0.128, + "grad_norm": 1.1042952212305208, + "learning_rate": 0.00019516973750305532, + "loss": 0.7451, + "step": 40 + }, + { + "epoch": 0.1312, + "grad_norm": 1.1149949790665101, + "learning_rate": 0.00019484519420954354, + "loss": 0.7537, + "step": 41 + }, + { + "epoch": 0.1344, + "grad_norm": 1.237976361978822, + "learning_rate": 0.00019451038737381077, + "loss": 0.8782, + "step": 42 + }, + { + "epoch": 0.1376, + "grad_norm": 1.3004540665627777, + "learning_rate": 0.00019416535322651818, + "loss": 0.8582, + "step": 43 + }, + { + "epoch": 0.1408, + "grad_norm": 1.1000708833200137, + "learning_rate": 0.00019381012910506146, + "loss": 0.7909, + "step": 44 + }, + { + "epoch": 0.144, + "grad_norm": 1.1899967921766514, + "learning_rate": 0.00019344475344953012, + "loss": 0.8335, + "step": 45 + }, + { + "epoch": 0.1472, + "grad_norm": 1.2054207735640328, + "learning_rate": 0.00019306926579854821, + "loss": 0.8692, + "step": 46 + }, + { + "epoch": 0.1504, + "grad_norm": 1.1050589236248065, + "learning_rate": 0.00019268370678499533, + "loss": 0.7355, + "step": 47 + }, + { + "epoch": 0.1536, + "grad_norm": 1.0872751210429186, + "learning_rate": 0.0001922881181316097, + "loss": 0.7336, + "step": 48 + }, + { + "epoch": 0.1568, + "grad_norm": 1.1511449947711034, + "learning_rate": 0.00019188254264647337, + "loss": 0.7859, + "step": 49 + }, + { + "epoch": 0.16, + "grad_norm": 1.1307539265986024, + "learning_rate": 0.0001914670242183795, + "loss": 0.7765, + "step": 50 + }, + { + "epoch": 0.1632, + "grad_norm": 1.351090330595727, + "learning_rate": 0.0001910416078120832, + "loss": 0.7897, + "step": 51 + }, + { + "epoch": 0.1664, + "grad_norm": 1.2975204595562546, + "learning_rate": 0.0001906063394634356, + "loss": 0.7415, + "step": 52 + }, + { + "epoch": 0.1696, + "grad_norm": 1.1163670705147442, + "learning_rate": 0.00019016126627440237, + "loss": 0.7743, + "step": 53 + }, + { + "epoch": 0.1728, + "grad_norm": 1.2755707068662836, + "learning_rate": 0.00018970643640796642, + "loss": 0.7988, + "step": 54 + }, + { + "epoch": 0.176, + "grad_norm": 1.2702567625694303, + "learning_rate": 0.000189241899082916, + "loss": 0.8539, + "step": 55 + }, + { + "epoch": 0.1792, + "grad_norm": 1.1163092203414966, + "learning_rate": 0.00018876770456851877, + "loss": 0.7748, + "step": 56 + }, + { + "epoch": 0.1824, + "grad_norm": 1.0293590185248311, + "learning_rate": 0.0001882839041790818, + "loss": 0.7143, + "step": 57 + }, + { + "epoch": 0.1856, + "grad_norm": 1.4409398453864177, + "learning_rate": 0.00018779055026839868, + "loss": 1.0833, + "step": 58 + }, + { + "epoch": 0.1888, + "grad_norm": 1.1706574644576464, + "learning_rate": 0.00018728769622408423, + "loss": 0.8348, + "step": 59 + }, + { + "epoch": 0.192, + "grad_norm": 1.135414367069039, + "learning_rate": 0.00018677539646179707, + "loss": 0.8813, + "step": 60 + }, + { + "epoch": 0.1952, + "grad_norm": 1.079296304758334, + "learning_rate": 0.00018625370641935129, + "loss": 0.8135, + "step": 61 + }, + { + "epoch": 0.1984, + "grad_norm": 1.1043937803678643, + "learning_rate": 0.00018572268255071718, + "loss": 0.8187, + "step": 62 + }, + { + "epoch": 0.2016, + "grad_norm": 2.1270436164977844, + "learning_rate": 0.00018518238231991218, + "loss": 0.66, + "step": 63 + }, + { + "epoch": 0.2048, + "grad_norm": 1.219994157328429, + "learning_rate": 0.00018463286419478255, + "loss": 0.8427, + "step": 64 + }, + { + "epoch": 0.208, + "grad_norm": 1.2125200585844553, + "learning_rate": 0.00018407418764067627, + "loss": 0.8415, + "step": 65 + }, + { + "epoch": 0.2112, + "grad_norm": 1.1001073708321165, + "learning_rate": 0.00018350641311400812, + "loss": 0.6918, + "step": 66 + }, + { + "epoch": 0.2144, + "grad_norm": 1.194540673785202, + "learning_rate": 0.0001829296020557174, + "loss": 0.8302, + "step": 67 + }, + { + "epoch": 0.2176, + "grad_norm": 1.1654655298759733, + "learning_rate": 0.00018234381688461942, + "loss": 0.7777, + "step": 68 + }, + { + "epoch": 0.2208, + "grad_norm": 1.0960669927086184, + "learning_rate": 0.0001817491209906506, + "loss": 0.7228, + "step": 69 + }, + { + "epoch": 0.224, + "grad_norm": 1.3072226786043357, + "learning_rate": 0.00018114557872800905, + "loss": 0.745, + "step": 70 + }, + { + "epoch": 0.2272, + "grad_norm": 1.0376194288323604, + "learning_rate": 0.00018053325540819045, + "loss": 0.7159, + "step": 71 + }, + { + "epoch": 0.2304, + "grad_norm": 1.1329808368309582, + "learning_rate": 0.0001799122172929206, + "loss": 0.793, + "step": 72 + }, + { + "epoch": 0.2336, + "grad_norm": 1.1858370493394028, + "learning_rate": 0.00017928253158698473, + "loss": 0.7923, + "step": 73 + }, + { + "epoch": 0.2368, + "grad_norm": 1.1937400981515296, + "learning_rate": 0.0001786442664309554, + "loss": 0.9004, + "step": 74 + }, + { + "epoch": 0.24, + "grad_norm": 1.1762178456447314, + "learning_rate": 0.0001779974908938184, + "loss": 0.768, + "step": 75 + }, + { + "epoch": 0.2432, + "grad_norm": 1.207471247637441, + "learning_rate": 0.0001773422749654988, + "loss": 0.8273, + "step": 76 + }, + { + "epoch": 0.2464, + "grad_norm": 1.169346276818301, + "learning_rate": 0.00017667868954928694, + "loss": 0.8023, + "step": 77 + }, + { + "epoch": 0.2496, + "grad_norm": 0.9837697261481548, + "learning_rate": 0.00017600680645416583, + "loss": 0.7058, + "step": 78 + }, + { + "epoch": 0.2528, + "grad_norm": 1.1967491234936953, + "learning_rate": 0.00017532669838704035, + "loss": 0.7353, + "step": 79 + }, + { + "epoch": 0.256, + "grad_norm": 1.059535525619828, + "learning_rate": 0.00017463843894486937, + "loss": 0.7919, + "step": 80 + }, + { + "epoch": 0.2592, + "grad_norm": 1.0669095062974419, + "learning_rate": 0.0001739421026067017, + "loss": 0.701, + "step": 81 + }, + { + "epoch": 0.2624, + "grad_norm": 0.9427635478779778, + "learning_rate": 0.00017323776472561627, + "loss": 0.5928, + "step": 82 + }, + { + "epoch": 0.2656, + "grad_norm": 1.0750208025455008, + "learning_rate": 0.00017252550152056795, + "loss": 0.7733, + "step": 83 + }, + { + "epoch": 0.2688, + "grad_norm": 1.0277897136233543, + "learning_rate": 0.0001718053900681397, + "loss": 0.7672, + "step": 84 + }, + { + "epoch": 0.272, + "grad_norm": 1.0599838033481415, + "learning_rate": 0.00017107750829420176, + "loss": 0.7549, + "step": 85 + }, + { + "epoch": 0.2752, + "grad_norm": 1.1699655604824146, + "learning_rate": 0.00017034193496547902, + "loss": 0.8371, + "step": 86 + }, + { + "epoch": 0.2784, + "grad_norm": 0.9071262053759483, + "learning_rate": 0.00016959874968102735, + "loss": 0.6607, + "step": 87 + }, + { + "epoch": 0.2816, + "grad_norm": 1.0435436605605914, + "learning_rate": 0.00016884803286362, + "loss": 0.7598, + "step": 88 + }, + { + "epoch": 0.2848, + "grad_norm": 0.9531556376560218, + "learning_rate": 0.00016808986575104465, + "loss": 0.6814, + "step": 89 + }, + { + "epoch": 0.288, + "grad_norm": 1.0526100850960158, + "learning_rate": 0.00016732433038731242, + "loss": 0.7612, + "step": 90 + }, + { + "epoch": 0.2912, + "grad_norm": 1.0884276174032113, + "learning_rate": 0.0001665515096137797, + "loss": 0.8, + "step": 91 + }, + { + "epoch": 0.2944, + "grad_norm": 1.0135538049398993, + "learning_rate": 0.00016577148706018328, + "loss": 0.7234, + "step": 92 + }, + { + "epoch": 0.2976, + "grad_norm": 1.053611711654421, + "learning_rate": 0.00016498434713559088, + "loss": 0.7816, + "step": 93 + }, + { + "epoch": 0.3008, + "grad_norm": 1.0192932237511134, + "learning_rate": 0.00016419017501926656, + "loss": 0.6904, + "step": 94 + }, + { + "epoch": 0.304, + "grad_norm": 1.2401048947247897, + "learning_rate": 0.0001633890566514535, + "loss": 0.8299, + "step": 95 + }, + { + "epoch": 0.3072, + "grad_norm": 3.083913619287634, + "learning_rate": 0.00016258107872407375, + "loss": 0.7016, + "step": 96 + }, + { + "epoch": 0.3104, + "grad_norm": 1.1696294945463654, + "learning_rate": 0.0001617663286713474, + "loss": 0.7838, + "step": 97 + }, + { + "epoch": 0.3136, + "grad_norm": 1.1109483737471186, + "learning_rate": 0.00016094489466033043, + "loss": 0.731, + "step": 98 + }, + { + "epoch": 0.3168, + "grad_norm": 1.059952420722134, + "learning_rate": 0.00016011686558137448, + "loss": 0.7391, + "step": 99 + }, + { + "epoch": 0.32, + "grad_norm": 0.9655078300817889, + "learning_rate": 0.0001592823310385073, + "loss": 0.6989, + "step": 100 + }, + { + "epoch": 0.3232, + "grad_norm": 1.0540407412485533, + "learning_rate": 0.0001584413813397364, + "loss": 0.797, + "step": 101 + }, + { + "epoch": 0.3264, + "grad_norm": 1.0698672980989976, + "learning_rate": 0.00015759410748727662, + "loss": 0.7769, + "step": 102 + }, + { + "epoch": 0.3296, + "grad_norm": 1.112817505418569, + "learning_rate": 0.00015674060116770236, + "loss": 0.8024, + "step": 103 + }, + { + "epoch": 0.3328, + "grad_norm": 1.072425620863512, + "learning_rate": 0.00015588095474202595, + "loss": 0.7873, + "step": 104 + }, + { + "epoch": 0.336, + "grad_norm": 1.056488530423442, + "learning_rate": 0.00015501526123570277, + "loss": 0.6963, + "step": 105 + }, + { + "epoch": 0.3392, + "grad_norm": 0.9885609012700043, + "learning_rate": 0.00015414361432856475, + "loss": 0.6829, + "step": 106 + }, + { + "epoch": 0.3424, + "grad_norm": 1.0493733552659685, + "learning_rate": 0.0001532661083446829, + "loss": 0.6784, + "step": 107 + }, + { + "epoch": 0.3456, + "grad_norm": 1.230162801686097, + "learning_rate": 0.00015238283824216015, + "loss": 0.736, + "step": 108 + }, + { + "epoch": 0.3488, + "grad_norm": 1.0120333723067219, + "learning_rate": 0.00015149389960285558, + "loss": 0.6967, + "step": 109 + }, + { + "epoch": 0.352, + "grad_norm": 0.9661599057340882, + "learning_rate": 0.00015059938862204127, + "loss": 0.6997, + "step": 110 + }, + { + "epoch": 0.3552, + "grad_norm": 1.058761827010355, + "learning_rate": 0.00014969940209799248, + "loss": 0.7332, + "step": 111 + }, + { + "epoch": 0.3584, + "grad_norm": 1.1232621172663235, + "learning_rate": 0.00014879403742151283, + "loss": 0.6784, + "step": 112 + }, + { + "epoch": 0.3616, + "grad_norm": 1.2271609398143184, + "learning_rate": 0.00014788339256539544, + "loss": 0.8297, + "step": 113 + }, + { + "epoch": 0.3648, + "grad_norm": 1.0165493420787164, + "learning_rate": 0.0001469675660738206, + "loss": 0.7142, + "step": 114 + }, + { + "epoch": 0.368, + "grad_norm": 0.858774180611481, + "learning_rate": 0.00014604665705169237, + "loss": 0.6662, + "step": 115 + }, + { + "epoch": 0.3712, + "grad_norm": 1.0431358133949369, + "learning_rate": 0.00014512076515391375, + "loss": 0.7564, + "step": 116 + }, + { + "epoch": 0.3744, + "grad_norm": 1.1210394667165413, + "learning_rate": 0.00014418999057460276, + "loss": 0.7709, + "step": 117 + }, + { + "epoch": 0.3776, + "grad_norm": 1.1097745247659903, + "learning_rate": 0.0001432544340362501, + "loss": 0.8047, + "step": 118 + }, + { + "epoch": 0.3808, + "grad_norm": 1.0081865656866769, + "learning_rate": 0.00014231419677881966, + "loss": 0.6768, + "step": 119 + }, + { + "epoch": 0.384, + "grad_norm": 0.9203647511384035, + "learning_rate": 0.00014136938054879283, + "loss": 0.6705, + "step": 120 + }, + { + "epoch": 0.3872, + "grad_norm": 0.994380717363222, + "learning_rate": 0.00014042008758815818, + "loss": 0.6535, + "step": 121 + }, + { + "epoch": 0.3904, + "grad_norm": 0.9326117765014191, + "learning_rate": 0.00013946642062334766, + "loss": 0.6857, + "step": 122 + }, + { + "epoch": 0.3936, + "grad_norm": 1.0065656737007167, + "learning_rate": 0.00013850848285411994, + "loss": 0.6634, + "step": 123 + }, + { + "epoch": 0.3968, + "grad_norm": 1.260262573325505, + "learning_rate": 0.000137546377942393, + "loss": 0.7611, + "step": 124 + }, + { + "epoch": 0.4, + "grad_norm": 1.1440674766124106, + "learning_rate": 0.00013658021000102636, + "loss": 0.7246, + "step": 125 + }, + { + "epoch": 0.4032, + "grad_norm": 1.1768333911491151, + "learning_rate": 0.00013561008358255468, + "loss": 0.7046, + "step": 126 + }, + { + "epoch": 0.4064, + "grad_norm": 1.1989107764043931, + "learning_rate": 0.00013463610366787392, + "loss": 0.7431, + "step": 127 + }, + { + "epoch": 0.4096, + "grad_norm": 1.0665368519092593, + "learning_rate": 0.00013365837565488064, + "loss": 0.7049, + "step": 128 + }, + { + "epoch": 0.4128, + "grad_norm": 1.0963729026160156, + "learning_rate": 0.0001326770053470668, + "loss": 0.737, + "step": 129 + }, + { + "epoch": 0.416, + "grad_norm": 1.1174308562968942, + "learning_rate": 0.0001316920989420703, + "loss": 0.821, + "step": 130 + }, + { + "epoch": 0.4192, + "grad_norm": 1.1514901189931928, + "learning_rate": 0.00013070376302018287, + "loss": 0.7987, + "step": 131 + }, + { + "epoch": 0.4224, + "grad_norm": 1.1622485278068753, + "learning_rate": 0.00012971210453281674, + "loss": 0.7098, + "step": 132 + }, + { + "epoch": 0.4256, + "grad_norm": 1.0955540600948737, + "learning_rate": 0.000128717230790931, + "loss": 0.6526, + "step": 133 + }, + { + "epoch": 0.4288, + "grad_norm": 1.0457876787355673, + "learning_rate": 0.00012771924945341906, + "loss": 0.7341, + "step": 134 + }, + { + "epoch": 0.432, + "grad_norm": 0.9877483882289841, + "learning_rate": 0.00012671826851545851, + "loss": 0.6878, + "step": 135 + }, + { + "epoch": 0.4352, + "grad_norm": 1.1182496551780474, + "learning_rate": 0.0001257143962968246, + "loss": 0.7395, + "step": 136 + }, + { + "epoch": 0.4384, + "grad_norm": 1.0452123422985957, + "learning_rate": 0.00012470774143016853, + "loss": 0.6148, + "step": 137 + }, + { + "epoch": 0.4416, + "grad_norm": 1.0767028709731223, + "learning_rate": 0.00012369841284926188, + "loss": 0.7519, + "step": 138 + }, + { + "epoch": 0.4448, + "grad_norm": 1.1318399991187422, + "learning_rate": 0.00012268651977720866, + "loss": 0.7297, + "step": 139 + }, + { + "epoch": 0.448, + "grad_norm": 2.753142555925865, + "learning_rate": 0.00012167217171462566, + "loss": 0.6695, + "step": 140 + }, + { + "epoch": 0.4512, + "grad_norm": 1.0276528201896862, + "learning_rate": 0.0001206554784277931, + "loss": 0.6798, + "step": 141 + }, + { + "epoch": 0.4544, + "grad_norm": 1.0831750543575958, + "learning_rate": 0.00011963654993677645, + "loss": 0.7158, + "step": 142 + }, + { + "epoch": 0.4576, + "grad_norm": 1.002228184109368, + "learning_rate": 0.00011861549650352069, + "loss": 0.7234, + "step": 143 + }, + { + "epoch": 0.4608, + "grad_norm": 1.0616413597198417, + "learning_rate": 0.00011759242861991855, + "loss": 0.7657, + "step": 144 + }, + { + "epoch": 0.464, + "grad_norm": 1.0898849037474774, + "learning_rate": 0.00011656745699585371, + "loss": 0.7213, + "step": 145 + }, + { + "epoch": 0.4672, + "grad_norm": 1.0783972954908356, + "learning_rate": 0.00011554069254722051, + "loss": 0.7423, + "step": 146 + }, + { + "epoch": 0.4704, + "grad_norm": 0.8991306321972456, + "learning_rate": 0.00011451224638392129, + "loss": 0.5727, + "step": 147 + }, + { + "epoch": 0.4736, + "grad_norm": 1.0158886111527536, + "learning_rate": 0.00011348222979784289, + "loss": 0.6369, + "step": 148 + }, + { + "epoch": 0.4768, + "grad_norm": 0.9976456938631907, + "learning_rate": 0.00011245075425081328, + "loss": 0.7267, + "step": 149 + }, + { + "epoch": 0.48, + "grad_norm": 1.105515497623268, + "learning_rate": 0.00011141793136253986, + "loss": 0.8043, + "step": 150 + }, + { + "epoch": 0.4832, + "grad_norm": 0.947098123978281, + "learning_rate": 0.0001103838728985307, + "loss": 0.6139, + "step": 151 + }, + { + "epoch": 0.4864, + "grad_norm": 0.9067278980666293, + "learning_rate": 0.000109348690758, + "loss": 0.6185, + "step": 152 + }, + { + "epoch": 0.4896, + "grad_norm": 0.9798295874045432, + "learning_rate": 0.00010831249696175918, + "loss": 0.647, + "step": 153 + }, + { + "epoch": 0.4928, + "grad_norm": 0.9494198844289646, + "learning_rate": 0.0001072754036400944, + "loss": 0.6033, + "step": 154 + }, + { + "epoch": 0.496, + "grad_norm": 0.9658328400277845, + "learning_rate": 0.00010623752302063283, + "loss": 0.6627, + "step": 155 + }, + { + "epoch": 0.4992, + "grad_norm": 1.0409284666705174, + "learning_rate": 0.00010519896741619803, + "loss": 0.6509, + "step": 156 + }, + { + "epoch": 0.5024, + "grad_norm": 1.0605420636323952, + "learning_rate": 0.00010415984921265609, + "loss": 0.7419, + "step": 157 + }, + { + "epoch": 0.5056, + "grad_norm": 0.9931102063466744, + "learning_rate": 0.00010312028085675391, + "loss": 0.6379, + "step": 158 + }, + { + "epoch": 0.5088, + "grad_norm": 1.0758646517408224, + "learning_rate": 0.00010208037484395114, + "loss": 0.7947, + "step": 159 + }, + { + "epoch": 0.512, + "grad_norm": 1.0607905274771352, + "learning_rate": 0.00010104024370624644, + "loss": 0.7912, + "step": 160 + }, + { + "epoch": 0.5152, + "grad_norm": 1.0749429491379305, + "learning_rate": 0.0001, + "loss": 0.6665, + "step": 161 + }, + { + "epoch": 0.5184, + "grad_norm": 1.0274907638555637, + "learning_rate": 9.895975629375359e-05, + "loss": 0.7552, + "step": 162 + }, + { + "epoch": 0.5216, + "grad_norm": 1.0269983555780358, + "learning_rate": 9.791962515604887e-05, + "loss": 0.7295, + "step": 163 + }, + { + "epoch": 0.5248, + "grad_norm": 1.0172077288402206, + "learning_rate": 9.687971914324607e-05, + "loss": 0.6356, + "step": 164 + }, + { + "epoch": 0.528, + "grad_norm": 0.9488263777741074, + "learning_rate": 9.584015078734395e-05, + "loss": 0.6731, + "step": 165 + }, + { + "epoch": 0.5312, + "grad_norm": 0.9280411234896112, + "learning_rate": 9.480103258380198e-05, + "loss": 0.6565, + "step": 166 + }, + { + "epoch": 0.5344, + "grad_norm": 1.0865275056210568, + "learning_rate": 9.376247697936719e-05, + "loss": 0.7646, + "step": 167 + }, + { + "epoch": 0.5376, + "grad_norm": 0.9676804385852837, + "learning_rate": 9.272459635990562e-05, + "loss": 0.6872, + "step": 168 + }, + { + "epoch": 0.5408, + "grad_norm": 0.9715346008017007, + "learning_rate": 9.168750303824084e-05, + "loss": 0.6078, + "step": 169 + }, + { + "epoch": 0.544, + "grad_norm": 1.040258403190395, + "learning_rate": 9.065130924199998e-05, + "loss": 0.7776, + "step": 170 + }, + { + "epoch": 0.5472, + "grad_norm": 1.3488643904138622, + "learning_rate": 8.961612710146934e-05, + "loss": 0.962, + "step": 171 + }, + { + "epoch": 0.5504, + "grad_norm": 1.0101420289903322, + "learning_rate": 8.858206863746018e-05, + "loss": 0.6972, + "step": 172 + }, + { + "epoch": 0.5536, + "grad_norm": 1.0487242372617993, + "learning_rate": 8.754924574918675e-05, + "loss": 0.6946, + "step": 173 + }, + { + "epoch": 0.5568, + "grad_norm": 0.9580188922300577, + "learning_rate": 8.651777020215712e-05, + "loss": 0.6509, + "step": 174 + }, + { + "epoch": 0.56, + "grad_norm": 0.9729345979989085, + "learning_rate": 8.548775361607872e-05, + "loss": 0.6879, + "step": 175 + }, + { + "epoch": 0.5632, + "grad_norm": 0.9019650238052146, + "learning_rate": 8.445930745277953e-05, + "loss": 0.5875, + "step": 176 + }, + { + "epoch": 0.5664, + "grad_norm": 1.0697398651008763, + "learning_rate": 8.343254300414628e-05, + "loss": 0.7303, + "step": 177 + }, + { + "epoch": 0.5696, + "grad_norm": 0.9617653135515002, + "learning_rate": 8.240757138008149e-05, + "loss": 0.6529, + "step": 178 + }, + { + "epoch": 0.5728, + "grad_norm": 1.188799963147903, + "learning_rate": 8.138450349647936e-05, + "loss": 0.743, + "step": 179 + }, + { + "epoch": 0.576, + "grad_norm": 1.1346778862709697, + "learning_rate": 8.036345006322359e-05, + "loss": 0.7093, + "step": 180 + }, + { + "epoch": 0.5792, + "grad_norm": 0.9423687094682542, + "learning_rate": 7.934452157220694e-05, + "loss": 0.6348, + "step": 181 + }, + { + "epoch": 0.5824, + "grad_norm": 0.972756373128702, + "learning_rate": 7.832782828537437e-05, + "loss": 0.6293, + "step": 182 + }, + { + "epoch": 0.5856, + "grad_norm": 0.9423401625896834, + "learning_rate": 7.731348022279134e-05, + "loss": 0.6321, + "step": 183 + }, + { + "epoch": 0.5888, + "grad_norm": 0.8782322342238923, + "learning_rate": 7.630158715073813e-05, + "loss": 0.5851, + "step": 184 + }, + { + "epoch": 0.592, + "grad_norm": 0.950614076585, + "learning_rate": 7.52922585698315e-05, + "loss": 0.7006, + "step": 185 + }, + { + "epoch": 0.5952, + "grad_norm": 1.0217147617291982, + "learning_rate": 7.428560370317542e-05, + "loss": 0.7017, + "step": 186 + }, + { + "epoch": 0.5984, + "grad_norm": 1.0530995450285776, + "learning_rate": 7.328173148454151e-05, + "loss": 0.7241, + "step": 187 + }, + { + "epoch": 0.6016, + "grad_norm": 0.9777035014050306, + "learning_rate": 7.228075054658096e-05, + "loss": 0.6389, + "step": 188 + }, + { + "epoch": 0.6048, + "grad_norm": 0.9143916182392459, + "learning_rate": 7.1282769209069e-05, + "loss": 0.6462, + "step": 189 + }, + { + "epoch": 0.608, + "grad_norm": 0.8374075204133037, + "learning_rate": 7.028789546718326e-05, + "loss": 0.5759, + "step": 190 + }, + { + "epoch": 0.6112, + "grad_norm": 1.0167524821100296, + "learning_rate": 6.929623697981718e-05, + "loss": 0.7093, + "step": 191 + }, + { + "epoch": 0.6144, + "grad_norm": 1.1410797162196067, + "learning_rate": 6.830790105792973e-05, + "loss": 0.6992, + "step": 192 + }, + { + "epoch": 0.6176, + "grad_norm": 0.9723149428461653, + "learning_rate": 6.732299465293322e-05, + "loss": 0.699, + "step": 193 + }, + { + "epoch": 0.6208, + "grad_norm": 0.9347966751909983, + "learning_rate": 6.63416243451194e-05, + "loss": 0.6143, + "step": 194 + }, + { + "epoch": 0.624, + "grad_norm": 0.9459430624777844, + "learning_rate": 6.536389633212609e-05, + "loss": 0.5846, + "step": 195 + }, + { + "epoch": 0.6272, + "grad_norm": 1.006635068656351, + "learning_rate": 6.43899164174453e-05, + "loss": 0.6846, + "step": 196 + }, + { + "epoch": 0.6304, + "grad_norm": 1.0868346637391972, + "learning_rate": 6.341978999897365e-05, + "loss": 0.7714, + "step": 197 + }, + { + "epoch": 0.6336, + "grad_norm": 1.0001561725192702, + "learning_rate": 6.245362205760704e-05, + "loss": 0.5994, + "step": 198 + }, + { + "epoch": 0.6368, + "grad_norm": 0.9630366831759528, + "learning_rate": 6.149151714588009e-05, + "loss": 0.6332, + "step": 199 + }, + { + "epoch": 0.64, + "grad_norm": 0.9990161510199028, + "learning_rate": 6.053357937665237e-05, + "loss": 0.5778, + "step": 200 + }, + { + "epoch": 0.6432, + "grad_norm": 1.0270025834274374, + "learning_rate": 5.957991241184184e-05, + "loss": 0.7001, + "step": 201 + }, + { + "epoch": 0.6464, + "grad_norm": 0.9797614392174078, + "learning_rate": 5.863061945120719e-05, + "loss": 0.552, + "step": 202 + }, + { + "epoch": 0.6496, + "grad_norm": 0.8856936306713705, + "learning_rate": 5.768580322118034e-05, + "loss": 0.5838, + "step": 203 + }, + { + "epoch": 0.6528, + "grad_norm": 0.9824172956948665, + "learning_rate": 5.6745565963749925e-05, + "loss": 0.6261, + "step": 204 + }, + { + "epoch": 0.656, + "grad_norm": 0.9484220610341958, + "learning_rate": 5.5810009425397294e-05, + "loss": 0.6504, + "step": 205 + }, + { + "epoch": 0.6592, + "grad_norm": 0.9594657155142843, + "learning_rate": 5.487923484608629e-05, + "loss": 0.7055, + "step": 206 + }, + { + "epoch": 0.6624, + "grad_norm": 0.972240224378151, + "learning_rate": 5.395334294830765e-05, + "loss": 0.7094, + "step": 207 + }, + { + "epoch": 0.6656, + "grad_norm": 0.8488682826787697, + "learning_rate": 5.3032433926179395e-05, + "loss": 0.5163, + "step": 208 + }, + { + "epoch": 0.6688, + "grad_norm": 1.0290117142316402, + "learning_rate": 5.211660743460458e-05, + "loss": 0.6177, + "step": 209 + }, + { + "epoch": 0.672, + "grad_norm": 0.9570819786081641, + "learning_rate": 5.1205962578487155e-05, + "loss": 0.7241, + "step": 210 + }, + { + "epoch": 0.6752, + "grad_norm": 0.9374896149693231, + "learning_rate": 5.030059790200756e-05, + "loss": 0.6133, + "step": 211 + }, + { + "epoch": 0.6784, + "grad_norm": 0.9213351808810384, + "learning_rate": 4.940061137795876e-05, + "loss": 0.6202, + "step": 212 + }, + { + "epoch": 0.6816, + "grad_norm": 0.9280034382542609, + "learning_rate": 4.850610039714444e-05, + "loss": 0.6877, + "step": 213 + }, + { + "epoch": 0.6848, + "grad_norm": 0.860741711039809, + "learning_rate": 4.761716175783989e-05, + "loss": 0.5419, + "step": 214 + }, + { + "epoch": 0.688, + "grad_norm": 0.827205240779305, + "learning_rate": 4.673389165531714e-05, + "loss": 0.532, + "step": 215 + }, + { + "epoch": 0.6912, + "grad_norm": 0.9609370356067223, + "learning_rate": 4.585638567143529e-05, + "loss": 0.5443, + "step": 216 + }, + { + "epoch": 0.6944, + "grad_norm": 1.054834291417378, + "learning_rate": 4.498473876429726e-05, + "loss": 0.6386, + "step": 217 + }, + { + "epoch": 0.6976, + "grad_norm": 0.8127636855002158, + "learning_rate": 4.411904525797408e-05, + "loss": 0.5077, + "step": 218 + }, + { + "epoch": 0.7008, + "grad_norm": 1.2734576666636381, + "learning_rate": 4.325939883229766e-05, + "loss": 0.6817, + "step": 219 + }, + { + "epoch": 0.704, + "grad_norm": 0.9137566241882511, + "learning_rate": 4.240589251272342e-05, + "loss": 0.6076, + "step": 220 + }, + { + "epoch": 0.7072, + "grad_norm": 0.943976185714207, + "learning_rate": 4.155861866026364e-05, + "loss": 0.6374, + "step": 221 + }, + { + "epoch": 0.7104, + "grad_norm": 0.9240301677365691, + "learning_rate": 4.071766896149273e-05, + "loss": 0.6086, + "step": 222 + }, + { + "epoch": 0.7136, + "grad_norm": 0.9608932693214731, + "learning_rate": 3.988313441862553e-05, + "loss": 0.5893, + "step": 223 + }, + { + "epoch": 0.7168, + "grad_norm": 0.9109456902151709, + "learning_rate": 3.9055105339669595e-05, + "loss": 0.5603, + "step": 224 + }, + { + "epoch": 0.72, + "grad_norm": 1.054931654692335, + "learning_rate": 3.823367132865265e-05, + "loss": 0.6465, + "step": 225 + }, + { + "epoch": 0.7232, + "grad_norm": 1.0060225278548545, + "learning_rate": 3.741892127592625e-05, + "loss": 0.6534, + "step": 226 + }, + { + "epoch": 0.7264, + "grad_norm": 0.9301145221677807, + "learning_rate": 3.6610943348546526e-05, + "loss": 0.7283, + "step": 227 + }, + { + "epoch": 0.7296, + "grad_norm": 0.9425878421394381, + "learning_rate": 3.580982498073344e-05, + "loss": 0.5308, + "step": 228 + }, + { + "epoch": 0.7328, + "grad_norm": 0.968421456129294, + "learning_rate": 3.501565286440914e-05, + "loss": 0.6099, + "step": 229 + }, + { + "epoch": 0.736, + "grad_norm": 0.9190664904011367, + "learning_rate": 3.422851293981676e-05, + "loss": 0.664, + "step": 230 + }, + { + "epoch": 0.7392, + "grad_norm": 0.9544868911319989, + "learning_rate": 3.3448490386220355e-05, + "loss": 0.6063, + "step": 231 + }, + { + "epoch": 0.7424, + "grad_norm": 0.9538656246451748, + "learning_rate": 3.2675669612687565e-05, + "loss": 0.5914, + "step": 232 + }, + { + "epoch": 0.7456, + "grad_norm": 5.864599498064909, + "learning_rate": 3.191013424895536e-05, + "loss": 0.5884, + "step": 233 + }, + { + "epoch": 0.7488, + "grad_norm": 0.924638663719702, + "learning_rate": 3.115196713638e-05, + "loss": 0.5661, + "step": 234 + }, + { + "epoch": 0.752, + "grad_norm": 1.077927200421442, + "learning_rate": 3.040125031897264e-05, + "loss": 0.7104, + "step": 235 + }, + { + "epoch": 0.7552, + "grad_norm": 0.9126341698953319, + "learning_rate": 2.9658065034520978e-05, + "loss": 0.5792, + "step": 236 + }, + { + "epoch": 0.7584, + "grad_norm": 0.9167560621645063, + "learning_rate": 2.892249170579826e-05, + "loss": 0.6753, + "step": 237 + }, + { + "epoch": 0.7616, + "grad_norm": 0.8934477881360058, + "learning_rate": 2.8194609931860316e-05, + "loss": 0.5331, + "step": 238 + }, + { + "epoch": 0.7648, + "grad_norm": 1.0911526373889162, + "learning_rate": 2.7474498479432087e-05, + "loss": 0.5562, + "step": 239 + }, + { + "epoch": 0.768, + "grad_norm": 1.0372617919862839, + "learning_rate": 2.6762235274383772e-05, + "loss": 0.6061, + "step": 240 + }, + { + "epoch": 0.7712, + "grad_norm": 1.874872012362247, + "learning_rate": 2.6057897393298324e-05, + "loss": 0.6462, + "step": 241 + }, + { + "epoch": 0.7744, + "grad_norm": 1.002221190667952, + "learning_rate": 2.536156105513062e-05, + "loss": 0.5881, + "step": 242 + }, + { + "epoch": 0.7776, + "grad_norm": 0.964781181057697, + "learning_rate": 2.4673301612959654e-05, + "loss": 0.5915, + "step": 243 + }, + { + "epoch": 0.7808, + "grad_norm": 0.9627190399764589, + "learning_rate": 2.399319354583418e-05, + "loss": 0.6914, + "step": 244 + }, + { + "epoch": 0.784, + "grad_norm": 1.017981397065761, + "learning_rate": 2.3321310450713062e-05, + "loss": 0.706, + "step": 245 + }, + { + "epoch": 0.7872, + "grad_norm": 1.012164734208218, + "learning_rate": 2.265772503450122e-05, + "loss": 0.6131, + "step": 246 + }, + { + "epoch": 0.7904, + "grad_norm": 0.8897086839324428, + "learning_rate": 2.2002509106181624e-05, + "loss": 0.5776, + "step": 247 + }, + { + "epoch": 0.7936, + "grad_norm": 1.0174871061711546, + "learning_rate": 2.1355733569044635e-05, + "loss": 0.7699, + "step": 248 + }, + { + "epoch": 0.7968, + "grad_norm": 0.9409135368695652, + "learning_rate": 2.0717468413015283e-05, + "loss": 0.56, + "step": 249 + }, + { + "epoch": 0.8, + "grad_norm": 0.9684884037872186, + "learning_rate": 2.008778270707944e-05, + "loss": 0.5933, + "step": 250 + }, + { + "epoch": 0.8032, + "grad_norm": 0.8685648466866711, + "learning_rate": 1.946674459180955e-05, + "loss": 0.5562, + "step": 251 + }, + { + "epoch": 0.8064, + "grad_norm": 1.0087704494348668, + "learning_rate": 1.8854421271990964e-05, + "loss": 0.6087, + "step": 252 + }, + { + "epoch": 0.8096, + "grad_norm": 0.8961312204544799, + "learning_rate": 1.8250879009349398e-05, + "loss": 0.6149, + "step": 253 + }, + { + "epoch": 0.8128, + "grad_norm": 0.9365625630552173, + "learning_rate": 1.7656183115380577e-05, + "loss": 0.6084, + "step": 254 + }, + { + "epoch": 0.816, + "grad_norm": 0.9702115890508202, + "learning_rate": 1.707039794428259e-05, + "loss": 0.6264, + "step": 255 + }, + { + "epoch": 0.8192, + "grad_norm": 1.1242270439776727, + "learning_rate": 1.649358688599191e-05, + "loss": 0.697, + "step": 256 + }, + { + "epoch": 0.8224, + "grad_norm": 0.9099287195259446, + "learning_rate": 1.5925812359323745e-05, + "loss": 0.5553, + "step": 257 + }, + { + "epoch": 0.8256, + "grad_norm": 1.0095969089859835, + "learning_rate": 1.5367135805217458e-05, + "loss": 0.7007, + "step": 258 + }, + { + "epoch": 0.8288, + "grad_norm": 1.159962294378398, + "learning_rate": 1.4817617680087825e-05, + "loss": 0.6413, + "step": 259 + }, + { + "epoch": 0.832, + "grad_norm": 0.86927957380752, + "learning_rate": 1.4277317449282834e-05, + "loss": 0.5238, + "step": 260 + }, + { + "epoch": 0.8352, + "grad_norm": 0.9693712197844089, + "learning_rate": 1.3746293580648717e-05, + "loss": 0.6028, + "step": 261 + }, + { + "epoch": 0.8384, + "grad_norm": 0.9843252719096152, + "learning_rate": 1.3224603538202929e-05, + "loss": 0.615, + "step": 262 + }, + { + "epoch": 0.8416, + "grad_norm": 0.8901672647222229, + "learning_rate": 1.2712303775915802e-05, + "loss": 0.5222, + "step": 263 + }, + { + "epoch": 0.8448, + "grad_norm": 0.9651282196979104, + "learning_rate": 1.220944973160133e-05, + "loss": 0.6034, + "step": 264 + }, + { + "epoch": 0.848, + "grad_norm": 0.8990217271577625, + "learning_rate": 1.1716095820918216e-05, + "loss": 0.5897, + "step": 265 + }, + { + "epoch": 0.8512, + "grad_norm": 0.9311848638173101, + "learning_rate": 1.1232295431481222e-05, + "loss": 0.6126, + "step": 266 + }, + { + "epoch": 0.8544, + "grad_norm": 0.917504640800634, + "learning_rate": 1.0758100917083991e-05, + "loss": 0.588, + "step": 267 + }, + { + "epoch": 0.8576, + "grad_norm": 0.9633897678979783, + "learning_rate": 1.0293563592033595e-05, + "loss": 0.6615, + "step": 268 + }, + { + "epoch": 0.8608, + "grad_norm": 1.00795740870038, + "learning_rate": 9.838733725597615e-06, + "loss": 0.5764, + "step": 269 + }, + { + "epoch": 0.864, + "grad_norm": 0.8508045166113912, + "learning_rate": 9.393660536564408e-06, + "loss": 0.5191, + "step": 270 + }, + { + "epoch": 0.8672, + "grad_norm": 1.047343602543878, + "learning_rate": 8.958392187916841e-06, + "loss": 0.6884, + "step": 271 + }, + { + "epoch": 0.8704, + "grad_norm": 0.8558631508335752, + "learning_rate": 8.532975781620512e-06, + "loss": 0.5093, + "step": 272 + }, + { + "epoch": 0.8736, + "grad_norm": 0.9472568352001958, + "learning_rate": 8.117457353526625e-06, + "loss": 0.6343, + "step": 273 + }, + { + "epoch": 0.8768, + "grad_norm": 0.9440682841311664, + "learning_rate": 7.711881868390291e-06, + "loss": 0.7094, + "step": 274 + }, + { + "epoch": 0.88, + "grad_norm": 1.0202250496053065, + "learning_rate": 7.3162932150046885e-06, + "loss": 0.6833, + "step": 275 + }, + { + "epoch": 0.8832, + "grad_norm": 0.9520439938838217, + "learning_rate": 6.930734201451816e-06, + "loss": 0.6359, + "step": 276 + }, + { + "epoch": 0.8864, + "grad_norm": 1.094476040858008, + "learning_rate": 6.555246550469907e-06, + "loss": 0.6435, + "step": 277 + }, + { + "epoch": 0.8896, + "grad_norm": 1.0048329225203338, + "learning_rate": 6.189870894938587e-06, + "loss": 0.6153, + "step": 278 + }, + { + "epoch": 0.8928, + "grad_norm": 0.9456091943259975, + "learning_rate": 5.834646773481811e-06, + "loss": 0.6306, + "step": 279 + }, + { + "epoch": 0.896, + "grad_norm": 1.4770809130718134, + "learning_rate": 5.489612626189245e-06, + "loss": 0.9741, + "step": 280 + }, + { + "epoch": 0.8992, + "grad_norm": 0.974923751267628, + "learning_rate": 5.154805790456485e-06, + "loss": 0.6106, + "step": 281 + }, + { + "epoch": 0.9024, + "grad_norm": 0.9459621761058625, + "learning_rate": 4.830262496944693e-06, + "loss": 0.5571, + "step": 282 + }, + { + "epoch": 0.9056, + "grad_norm": 0.8957426369513457, + "learning_rate": 4.516017865659949e-06, + "loss": 0.5007, + "step": 283 + }, + { + "epoch": 0.9088, + "grad_norm": 0.9364274853309743, + "learning_rate": 4.21210590215273e-06, + "loss": 0.5761, + "step": 284 + }, + { + "epoch": 0.912, + "grad_norm": 0.9165602428300078, + "learning_rate": 3.918559493838114e-06, + "loss": 0.5077, + "step": 285 + }, + { + "epoch": 0.9152, + "grad_norm": 0.9834989066154466, + "learning_rate": 3.6354104064368566e-06, + "loss": 0.6162, + "step": 286 + }, + { + "epoch": 0.9184, + "grad_norm": 1.1292517278477647, + "learning_rate": 3.3626892805379562e-06, + "loss": 0.6428, + "step": 287 + }, + { + "epoch": 0.9216, + "grad_norm": 0.9337817865286436, + "learning_rate": 3.100425628282899e-06, + "loss": 0.5469, + "step": 288 + }, + { + "epoch": 0.9248, + "grad_norm": 0.9912221780816013, + "learning_rate": 2.848647830172024e-06, + "loss": 0.5816, + "step": 289 + }, + { + "epoch": 0.928, + "grad_norm": 0.9106967657243581, + "learning_rate": 2.607383131993424e-06, + "loss": 0.5833, + "step": 290 + }, + { + "epoch": 0.9312, + "grad_norm": 1.07092147689949, + "learning_rate": 2.3766576418745022e-06, + "loss": 0.7294, + "step": 291 + }, + { + "epoch": 0.9344, + "grad_norm": 0.9219668633777998, + "learning_rate": 2.1564963274568027e-06, + "loss": 0.6053, + "step": 292 + }, + { + "epoch": 0.9376, + "grad_norm": 1.7589766769110666, + "learning_rate": 1.9469230131940907e-06, + "loss": 0.7696, + "step": 293 + }, + { + "epoch": 0.9408, + "grad_norm": 0.8972215579228963, + "learning_rate": 1.7479603777742938e-06, + "loss": 0.5035, + "step": 294 + }, + { + "epoch": 0.944, + "grad_norm": 0.9047492331034355, + "learning_rate": 1.559629951665298e-06, + "loss": 0.5227, + "step": 295 + }, + { + "epoch": 0.9472, + "grad_norm": 0.9475427562480715, + "learning_rate": 1.3819521147851123e-06, + "loss": 0.5762, + "step": 296 + }, + { + "epoch": 0.9504, + "grad_norm": 0.9811638817803725, + "learning_rate": 1.2149460942964098e-06, + "loss": 0.5815, + "step": 297 + }, + { + "epoch": 0.9536, + "grad_norm": 1.0020873846705007, + "learning_rate": 1.05862996252597e-06, + "loss": 0.538, + "step": 298 + }, + { + "epoch": 0.9568, + "grad_norm": 1.1067755275823017, + "learning_rate": 9.130206350089765e-07, + "loss": 0.6765, + "step": 299 + }, + { + "epoch": 0.96, + "grad_norm": 0.8715046732367977, + "learning_rate": 7.781338686584927e-07, + "loss": 0.4914, + "step": 300 + }, + { + "epoch": 0.9632, + "grad_norm": 1.0634576890292167, + "learning_rate": 6.539842600603918e-07, + "loss": 0.6705, + "step": 301 + }, + { + "epoch": 0.9664, + "grad_norm": 0.8549848401831835, + "learning_rate": 5.405852438937764e-07, + "loss": 0.5987, + "step": 302 + }, + { + "epoch": 0.9696, + "grad_norm": 1.012508590543694, + "learning_rate": 4.3794909147720773e-07, + "loss": 0.659, + "step": 303 + }, + { + "epoch": 0.9728, + "grad_norm": 0.9693468696989235, + "learning_rate": 3.4608690944071263e-07, + "loss": 0.6589, + "step": 304 + }, + { + "epoch": 0.976, + "grad_norm": 0.9666706891053034, + "learning_rate": 2.6500863852395584e-07, + "loss": 0.6538, + "step": 305 + }, + { + "epoch": 0.9792, + "grad_norm": 0.8992506419411962, + "learning_rate": 1.947230525005006e-07, + "loss": 0.5815, + "step": 306 + }, + { + "epoch": 0.9824, + "grad_norm": 0.9571595303459484, + "learning_rate": 1.3523775722834587e-07, + "loss": 0.6566, + "step": 307 + }, + { + "epoch": 0.9856, + "grad_norm": 0.9173568643648345, + "learning_rate": 8.655918982689581e-08, + "loss": 0.6204, + "step": 308 + }, + { + "epoch": 0.9888, + "grad_norm": 1.0748041207141892, + "learning_rate": 4.8692617980350406e-08, + "loss": 0.7146, + "step": 309 + }, + { + "epoch": 0.992, + "grad_norm": 0.9754095477814513, + "learning_rate": 2.164213936770576e-08, + "loss": 0.679, + "step": 310 + }, + { + "epoch": 0.9952, + "grad_norm": 0.90539763646193, + "learning_rate": 5.410681219286673e-09, + "loss": 0.5196, + "step": 311 + }, + { + "epoch": 0.9984, + "grad_norm": 1.0640839815841943, + "learning_rate": 0.0, + "loss": 0.6511, + "step": 312 + }, + { + "epoch": 0.9984, + "step": 312, + "total_flos": 41706149216256.0, + "train_loss": 0.7100516487008486, + "train_runtime": 2587.0165, + "train_samples_per_second": 1.933, + "train_steps_per_second": 0.121 + } + ], + "logging_steps": 1.0, + "max_steps": 312, + "num_input_tokens_seen": 0, + "num_train_epochs": 1, + "save_steps": 500, + "stateful_callbacks": { + "TrainerControl": { + "args": { + "should_epoch_stop": false, + "should_evaluate": false, + "should_log": false, + "should_save": false, + "should_training_stop": false + }, + "attributes": {} + } + }, + "total_flos": 41706149216256.0, + "train_batch_size": 8, + "trial_name": null, + "trial_params": null +}